{ "best_global_step": 32694, "best_metric": 0.1661907583475113, "best_model_checkpoint": "saves_multiple/prefix-tuning/llama-3-8b-instruct/train_multirc_123_1764892185/checkpoint-32694", "epoch": 20.0, "eval_steps": 10898, "global_step": 108980, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009175995595522114, "grad_norm": 176.12417602539062, "learning_rate": 3.670398238208846e-09, "loss": 13.3004, "num_input_tokens_seen": 9920, "step": 5 }, { "epoch": 0.0018351991191044228, "grad_norm": 149.58544921875, "learning_rate": 8.258396035969903e-09, "loss": 13.4257, "num_input_tokens_seen": 22240, "step": 10 }, { "epoch": 0.0027527986786566342, "grad_norm": 155.7015380859375, "learning_rate": 1.284639383373096e-08, "loss": 13.2007, "num_input_tokens_seen": 33056, "step": 15 }, { "epoch": 0.0036703982382088455, "grad_norm": 151.31521606445312, "learning_rate": 1.7434391631492018e-08, "loss": 13.272, "num_input_tokens_seen": 44096, "step": 20 }, { "epoch": 0.004587997797761057, "grad_norm": 136.21530151367188, "learning_rate": 2.2022389429253076e-08, "loss": 13.1993, "num_input_tokens_seen": 54688, "step": 25 }, { "epoch": 0.0055055973573132685, "grad_norm": 162.4971466064453, "learning_rate": 2.6610387227014133e-08, "loss": 13.4241, "num_input_tokens_seen": 65376, "step": 30 }, { "epoch": 0.00642319691686548, "grad_norm": 137.74008178710938, "learning_rate": 3.119838502477519e-08, "loss": 13.3434, "num_input_tokens_seen": 75776, "step": 35 }, { "epoch": 0.007340796476417691, "grad_norm": 140.79441833496094, "learning_rate": 3.578638282253625e-08, "loss": 13.2915, "num_input_tokens_seen": 85952, "step": 40 }, { "epoch": 0.008258396035969904, "grad_norm": 145.3714599609375, "learning_rate": 4.0374380620297305e-08, "loss": 13.1416, "num_input_tokens_seen": 96256, "step": 45 }, { "epoch": 0.009175995595522114, "grad_norm": 142.95407104492188, "learning_rate": 4.496237841805836e-08, "loss": 13.2331, "num_input_tokens_seen": 107232, "step": 50 }, { "epoch": 0.010093595155074325, "grad_norm": 139.3367462158203, "learning_rate": 4.9550376215819427e-08, "loss": 13.2505, "num_input_tokens_seen": 117312, "step": 55 }, { "epoch": 0.011011194714626537, "grad_norm": 163.2412872314453, "learning_rate": 5.413837401358047e-08, "loss": 12.9177, "num_input_tokens_seen": 127200, "step": 60 }, { "epoch": 0.011928794274178749, "grad_norm": 147.5352325439453, "learning_rate": 5.8726371811341535e-08, "loss": 13.1667, "num_input_tokens_seen": 136064, "step": 65 }, { "epoch": 0.01284639383373096, "grad_norm": 144.78529357910156, "learning_rate": 6.331436960910259e-08, "loss": 12.7246, "num_input_tokens_seen": 146560, "step": 70 }, { "epoch": 0.013763993393283172, "grad_norm": 143.59286499023438, "learning_rate": 6.790236740686365e-08, "loss": 12.6791, "num_input_tokens_seen": 157536, "step": 75 }, { "epoch": 0.014681592952835382, "grad_norm": 151.33897399902344, "learning_rate": 7.249036520462471e-08, "loss": 12.852, "num_input_tokens_seen": 168640, "step": 80 }, { "epoch": 0.015599192512387594, "grad_norm": 139.82102966308594, "learning_rate": 7.707836300238576e-08, "loss": 12.7036, "num_input_tokens_seen": 180256, "step": 85 }, { "epoch": 0.016516792071939807, "grad_norm": 146.79981994628906, "learning_rate": 8.166636080014682e-08, "loss": 12.4999, "num_input_tokens_seen": 190592, "step": 90 }, { "epoch": 0.017434391631492015, "grad_norm": 140.09629821777344, "learning_rate": 8.625435859790789e-08, "loss": 12.5588, "num_input_tokens_seen": 200736, "step": 95 }, { "epoch": 0.018351991191044227, "grad_norm": 137.1687774658203, "learning_rate": 9.084235639566895e-08, "loss": 12.6001, "num_input_tokens_seen": 212416, "step": 100 }, { "epoch": 0.01926959075059644, "grad_norm": 147.43995666503906, "learning_rate": 9.543035419342998e-08, "loss": 12.3119, "num_input_tokens_seen": 223680, "step": 105 }, { "epoch": 0.02018719031014865, "grad_norm": 156.97621154785156, "learning_rate": 1.0001835199119105e-07, "loss": 12.0581, "num_input_tokens_seen": 233856, "step": 110 }, { "epoch": 0.021104789869700862, "grad_norm": 151.010009765625, "learning_rate": 1.0460634978895211e-07, "loss": 12.3105, "num_input_tokens_seen": 246208, "step": 115 }, { "epoch": 0.022022389429253074, "grad_norm": 148.3113250732422, "learning_rate": 1.0919434758671317e-07, "loss": 11.8405, "num_input_tokens_seen": 257088, "step": 120 }, { "epoch": 0.022939988988805286, "grad_norm": 138.19691467285156, "learning_rate": 1.1378234538447422e-07, "loss": 12.1078, "num_input_tokens_seen": 267840, "step": 125 }, { "epoch": 0.023857588548357497, "grad_norm": 144.21209716796875, "learning_rate": 1.1837034318223528e-07, "loss": 11.8592, "num_input_tokens_seen": 278272, "step": 130 }, { "epoch": 0.02477518810790971, "grad_norm": 155.81027221679688, "learning_rate": 1.2295834097999634e-07, "loss": 11.4807, "num_input_tokens_seen": 289888, "step": 135 }, { "epoch": 0.02569278766746192, "grad_norm": 145.5962677001953, "learning_rate": 1.275463387777574e-07, "loss": 11.547, "num_input_tokens_seen": 300672, "step": 140 }, { "epoch": 0.026610387227014132, "grad_norm": 145.35533142089844, "learning_rate": 1.3213433657551845e-07, "loss": 11.482, "num_input_tokens_seen": 311712, "step": 145 }, { "epoch": 0.027527986786566344, "grad_norm": 131.8557891845703, "learning_rate": 1.3672233437327952e-07, "loss": 11.3052, "num_input_tokens_seen": 321472, "step": 150 }, { "epoch": 0.028445586346118552, "grad_norm": 142.22547912597656, "learning_rate": 1.4131033217104057e-07, "loss": 10.9453, "num_input_tokens_seen": 330592, "step": 155 }, { "epoch": 0.029363185905670764, "grad_norm": 174.64593505859375, "learning_rate": 1.4589832996880164e-07, "loss": 11.0998, "num_input_tokens_seen": 341696, "step": 160 }, { "epoch": 0.030280785465222976, "grad_norm": 136.49879455566406, "learning_rate": 1.5048632776656268e-07, "loss": 10.8481, "num_input_tokens_seen": 351904, "step": 165 }, { "epoch": 0.031198385024775187, "grad_norm": 151.18092346191406, "learning_rate": 1.5507432556432373e-07, "loss": 10.5924, "num_input_tokens_seen": 361312, "step": 170 }, { "epoch": 0.0321159845843274, "grad_norm": 149.25592041015625, "learning_rate": 1.596623233620848e-07, "loss": 10.5468, "num_input_tokens_seen": 372896, "step": 175 }, { "epoch": 0.033033584143879614, "grad_norm": 146.56292724609375, "learning_rate": 1.6425032115984587e-07, "loss": 10.0587, "num_input_tokens_seen": 382016, "step": 180 }, { "epoch": 0.03395118370343182, "grad_norm": 141.68133544921875, "learning_rate": 1.688383189576069e-07, "loss": 10.2492, "num_input_tokens_seen": 392992, "step": 185 }, { "epoch": 0.03486878326298403, "grad_norm": 129.59718322753906, "learning_rate": 1.7342631675536798e-07, "loss": 9.8038, "num_input_tokens_seen": 404192, "step": 190 }, { "epoch": 0.035786382822536246, "grad_norm": 147.91970825195312, "learning_rate": 1.7801431455312903e-07, "loss": 9.4725, "num_input_tokens_seen": 414560, "step": 195 }, { "epoch": 0.036703982382088454, "grad_norm": 140.78805541992188, "learning_rate": 1.8260231235089007e-07, "loss": 9.5267, "num_input_tokens_seen": 424384, "step": 200 }, { "epoch": 0.03762158194164067, "grad_norm": 133.37109375, "learning_rate": 1.8719031014865114e-07, "loss": 9.5287, "num_input_tokens_seen": 435904, "step": 205 }, { "epoch": 0.03853918150119288, "grad_norm": 128.65992736816406, "learning_rate": 1.9177830794641219e-07, "loss": 9.2903, "num_input_tokens_seen": 447456, "step": 210 }, { "epoch": 0.03945678106074509, "grad_norm": 148.74261474609375, "learning_rate": 1.9636630574417326e-07, "loss": 8.9764, "num_input_tokens_seen": 458880, "step": 215 }, { "epoch": 0.0403743806202973, "grad_norm": 128.00390625, "learning_rate": 2.009543035419343e-07, "loss": 8.4692, "num_input_tokens_seen": 469280, "step": 220 }, { "epoch": 0.041291980179849516, "grad_norm": 129.0933837890625, "learning_rate": 2.0554230133969537e-07, "loss": 8.3735, "num_input_tokens_seen": 481024, "step": 225 }, { "epoch": 0.042209579739401724, "grad_norm": 140.8704376220703, "learning_rate": 2.1013029913745642e-07, "loss": 8.1893, "num_input_tokens_seen": 492096, "step": 230 }, { "epoch": 0.04312717929895394, "grad_norm": 132.97242736816406, "learning_rate": 2.1471829693521751e-07, "loss": 7.9145, "num_input_tokens_seen": 503040, "step": 235 }, { "epoch": 0.04404477885850615, "grad_norm": 129.19073486328125, "learning_rate": 2.1930629473297853e-07, "loss": 7.8652, "num_input_tokens_seen": 514144, "step": 240 }, { "epoch": 0.044962378418058356, "grad_norm": 131.2498779296875, "learning_rate": 2.2389429253073963e-07, "loss": 7.7418, "num_input_tokens_seen": 524928, "step": 245 }, { "epoch": 0.04587997797761057, "grad_norm": 122.8309326171875, "learning_rate": 2.2848229032850067e-07, "loss": 7.2644, "num_input_tokens_seen": 535808, "step": 250 }, { "epoch": 0.04679757753716278, "grad_norm": 119.38818359375, "learning_rate": 2.330702881262617e-07, "loss": 6.9732, "num_input_tokens_seen": 546848, "step": 255 }, { "epoch": 0.047715177096714995, "grad_norm": 119.60884857177734, "learning_rate": 2.376582859240228e-07, "loss": 6.6634, "num_input_tokens_seen": 558368, "step": 260 }, { "epoch": 0.0486327766562672, "grad_norm": 125.43573760986328, "learning_rate": 2.4224628372178383e-07, "loss": 6.5445, "num_input_tokens_seen": 568640, "step": 265 }, { "epoch": 0.04955037621581942, "grad_norm": 120.30415344238281, "learning_rate": 2.468342815195449e-07, "loss": 6.3498, "num_input_tokens_seen": 579552, "step": 270 }, { "epoch": 0.050467975775371626, "grad_norm": 109.29427337646484, "learning_rate": 2.514222793173059e-07, "loss": 6.3226, "num_input_tokens_seen": 591200, "step": 275 }, { "epoch": 0.05138557533492384, "grad_norm": 117.44766998291016, "learning_rate": 2.56010277115067e-07, "loss": 5.9712, "num_input_tokens_seen": 601696, "step": 280 }, { "epoch": 0.05230317489447605, "grad_norm": 113.87281036376953, "learning_rate": 2.6059827491282806e-07, "loss": 5.7575, "num_input_tokens_seen": 614336, "step": 285 }, { "epoch": 0.053220774454028265, "grad_norm": 99.93218231201172, "learning_rate": 2.651862727105891e-07, "loss": 5.3814, "num_input_tokens_seen": 624992, "step": 290 }, { "epoch": 0.05413837401358047, "grad_norm": 102.88504028320312, "learning_rate": 2.6977427050835015e-07, "loss": 5.1861, "num_input_tokens_seen": 635328, "step": 295 }, { "epoch": 0.05505597357313269, "grad_norm": 91.67082214355469, "learning_rate": 2.7436226830611125e-07, "loss": 5.0452, "num_input_tokens_seen": 645760, "step": 300 }, { "epoch": 0.055973573132684896, "grad_norm": 90.75875854492188, "learning_rate": 2.789502661038723e-07, "loss": 4.7944, "num_input_tokens_seen": 656192, "step": 305 }, { "epoch": 0.056891172692237105, "grad_norm": 98.55912780761719, "learning_rate": 2.835382639016334e-07, "loss": 4.4773, "num_input_tokens_seen": 667008, "step": 310 }, { "epoch": 0.05780877225178932, "grad_norm": 83.1011962890625, "learning_rate": 2.8812626169939443e-07, "loss": 4.381, "num_input_tokens_seen": 677600, "step": 315 }, { "epoch": 0.05872637181134153, "grad_norm": 86.61333465576172, "learning_rate": 2.927142594971554e-07, "loss": 3.9697, "num_input_tokens_seen": 687840, "step": 320 }, { "epoch": 0.05964397137089374, "grad_norm": 84.20294952392578, "learning_rate": 2.973022572949165e-07, "loss": 3.8879, "num_input_tokens_seen": 698944, "step": 325 }, { "epoch": 0.06056157093044595, "grad_norm": 72.00992584228516, "learning_rate": 3.0189025509267756e-07, "loss": 3.8187, "num_input_tokens_seen": 710560, "step": 330 }, { "epoch": 0.06147917048999817, "grad_norm": 113.80841064453125, "learning_rate": 3.0647825289043866e-07, "loss": 3.8169, "num_input_tokens_seen": 722752, "step": 335 }, { "epoch": 0.062396770049550375, "grad_norm": 78.21851348876953, "learning_rate": 3.110662506881997e-07, "loss": 3.3397, "num_input_tokens_seen": 734048, "step": 340 }, { "epoch": 0.06331436960910258, "grad_norm": 74.15465545654297, "learning_rate": 3.1565424848596075e-07, "loss": 3.2113, "num_input_tokens_seen": 745344, "step": 345 }, { "epoch": 0.0642319691686548, "grad_norm": 81.93450164794922, "learning_rate": 3.202422462837218e-07, "loss": 3.1351, "num_input_tokens_seen": 757376, "step": 350 }, { "epoch": 0.06514956872820701, "grad_norm": 67.46986389160156, "learning_rate": 3.248302440814829e-07, "loss": 2.7601, "num_input_tokens_seen": 767680, "step": 355 }, { "epoch": 0.06606716828775923, "grad_norm": 76.61467742919922, "learning_rate": 3.2941824187924394e-07, "loss": 2.7434, "num_input_tokens_seen": 778784, "step": 360 }, { "epoch": 0.06698476784731143, "grad_norm": 66.31843566894531, "learning_rate": 3.34006239677005e-07, "loss": 2.5841, "num_input_tokens_seen": 789664, "step": 365 }, { "epoch": 0.06790236740686365, "grad_norm": 79.79821014404297, "learning_rate": 3.38594237474766e-07, "loss": 2.463, "num_input_tokens_seen": 801472, "step": 370 }, { "epoch": 0.06881996696641586, "grad_norm": 62.253902435302734, "learning_rate": 3.431822352725271e-07, "loss": 2.081, "num_input_tokens_seen": 812608, "step": 375 }, { "epoch": 0.06973756652596806, "grad_norm": 65.85543823242188, "learning_rate": 3.477702330702881e-07, "loss": 2.0463, "num_input_tokens_seen": 823456, "step": 380 }, { "epoch": 0.07065516608552028, "grad_norm": 63.21333312988281, "learning_rate": 3.523582308680492e-07, "loss": 1.7915, "num_input_tokens_seen": 833856, "step": 385 }, { "epoch": 0.07157276564507249, "grad_norm": 57.983680725097656, "learning_rate": 3.5694622866581025e-07, "loss": 1.7979, "num_input_tokens_seen": 844736, "step": 390 }, { "epoch": 0.07249036520462471, "grad_norm": 46.910194396972656, "learning_rate": 3.6153422646357135e-07, "loss": 1.5475, "num_input_tokens_seen": 856064, "step": 395 }, { "epoch": 0.07340796476417691, "grad_norm": 50.760311126708984, "learning_rate": 3.6612222426133234e-07, "loss": 1.4669, "num_input_tokens_seen": 867712, "step": 400 }, { "epoch": 0.07432556432372912, "grad_norm": 56.45383071899414, "learning_rate": 3.7071022205909344e-07, "loss": 1.4751, "num_input_tokens_seen": 877312, "step": 405 }, { "epoch": 0.07524316388328134, "grad_norm": 51.101905822753906, "learning_rate": 3.752982198568545e-07, "loss": 1.2695, "num_input_tokens_seen": 889024, "step": 410 }, { "epoch": 0.07616076344283355, "grad_norm": 51.52912139892578, "learning_rate": 3.798862176546156e-07, "loss": 1.2389, "num_input_tokens_seen": 900128, "step": 415 }, { "epoch": 0.07707836300238576, "grad_norm": 47.26045608520508, "learning_rate": 3.8447421545237657e-07, "loss": 1.0926, "num_input_tokens_seen": 910784, "step": 420 }, { "epoch": 0.07799596256193797, "grad_norm": 36.513587951660156, "learning_rate": 3.8906221325013767e-07, "loss": 0.9316, "num_input_tokens_seen": 921248, "step": 425 }, { "epoch": 0.07891356212149019, "grad_norm": 44.548038482666016, "learning_rate": 3.9365021104789877e-07, "loss": 0.8832, "num_input_tokens_seen": 931648, "step": 430 }, { "epoch": 0.07983116168104239, "grad_norm": 39.68247985839844, "learning_rate": 3.982382088456598e-07, "loss": 0.7983, "num_input_tokens_seen": 942976, "step": 435 }, { "epoch": 0.0807487612405946, "grad_norm": 40.80929183959961, "learning_rate": 4.0282620664342085e-07, "loss": 0.7166, "num_input_tokens_seen": 954048, "step": 440 }, { "epoch": 0.08166636080014682, "grad_norm": 25.177453994750977, "learning_rate": 4.074142044411819e-07, "loss": 0.685, "num_input_tokens_seen": 965728, "step": 445 }, { "epoch": 0.08258396035969903, "grad_norm": 30.59040069580078, "learning_rate": 4.12002202238943e-07, "loss": 0.6437, "num_input_tokens_seen": 976032, "step": 450 }, { "epoch": 0.08350155991925123, "grad_norm": 30.785703659057617, "learning_rate": 4.16590200036704e-07, "loss": 0.5995, "num_input_tokens_seen": 986720, "step": 455 }, { "epoch": 0.08441915947880345, "grad_norm": 42.36625671386719, "learning_rate": 4.211781978344651e-07, "loss": 0.5352, "num_input_tokens_seen": 998080, "step": 460 }, { "epoch": 0.08533675903835566, "grad_norm": 23.79561996459961, "learning_rate": 4.2576619563222613e-07, "loss": 0.5429, "num_input_tokens_seen": 1008960, "step": 465 }, { "epoch": 0.08625435859790788, "grad_norm": 24.684093475341797, "learning_rate": 4.303541934299872e-07, "loss": 0.4624, "num_input_tokens_seen": 1020384, "step": 470 }, { "epoch": 0.08717195815746008, "grad_norm": 19.758867263793945, "learning_rate": 4.349421912277482e-07, "loss": 0.4791, "num_input_tokens_seen": 1032160, "step": 475 }, { "epoch": 0.0880895577170123, "grad_norm": 42.74286651611328, "learning_rate": 4.395301890255093e-07, "loss": 0.5546, "num_input_tokens_seen": 1043456, "step": 480 }, { "epoch": 0.08900715727656451, "grad_norm": 57.90437316894531, "learning_rate": 4.4411818682327036e-07, "loss": 0.4724, "num_input_tokens_seen": 1054208, "step": 485 }, { "epoch": 0.08992475683611671, "grad_norm": 18.5361270904541, "learning_rate": 4.4870618462103145e-07, "loss": 0.4656, "num_input_tokens_seen": 1064256, "step": 490 }, { "epoch": 0.09084235639566893, "grad_norm": 41.3071403503418, "learning_rate": 4.5329418241879245e-07, "loss": 0.4736, "num_input_tokens_seen": 1074944, "step": 495 }, { "epoch": 0.09175995595522114, "grad_norm": 36.265750885009766, "learning_rate": 4.5788218021655354e-07, "loss": 0.4527, "num_input_tokens_seen": 1085792, "step": 500 }, { "epoch": 0.09267755551477336, "grad_norm": 24.569799423217773, "learning_rate": 4.624701780143146e-07, "loss": 0.4222, "num_input_tokens_seen": 1096544, "step": 505 }, { "epoch": 0.09359515507432556, "grad_norm": 20.417301177978516, "learning_rate": 4.6705817581207563e-07, "loss": 0.4734, "num_input_tokens_seen": 1108224, "step": 510 }, { "epoch": 0.09451275463387777, "grad_norm": 50.67850112915039, "learning_rate": 4.716461736098367e-07, "loss": 0.4536, "num_input_tokens_seen": 1119200, "step": 515 }, { "epoch": 0.09543035419342999, "grad_norm": 50.938636779785156, "learning_rate": 4.7623417140759777e-07, "loss": 0.4341, "num_input_tokens_seen": 1129760, "step": 520 }, { "epoch": 0.0963479537529822, "grad_norm": 59.87480926513672, "learning_rate": 4.808221692053589e-07, "loss": 0.4299, "num_input_tokens_seen": 1139136, "step": 525 }, { "epoch": 0.0972655533125344, "grad_norm": 31.616134643554688, "learning_rate": 4.854101670031198e-07, "loss": 0.4259, "num_input_tokens_seen": 1149056, "step": 530 }, { "epoch": 0.09818315287208662, "grad_norm": 27.071155548095703, "learning_rate": 4.89998164800881e-07, "loss": 0.4035, "num_input_tokens_seen": 1160064, "step": 535 }, { "epoch": 0.09910075243163884, "grad_norm": 18.034282684326172, "learning_rate": 4.94586162598642e-07, "loss": 0.4217, "num_input_tokens_seen": 1171392, "step": 540 }, { "epoch": 0.10001835199119104, "grad_norm": 26.04720687866211, "learning_rate": 4.99174160396403e-07, "loss": 0.3884, "num_input_tokens_seen": 1182208, "step": 545 }, { "epoch": 0.10093595155074325, "grad_norm": 22.86056900024414, "learning_rate": 5.037621581941641e-07, "loss": 0.388, "num_input_tokens_seen": 1193472, "step": 550 }, { "epoch": 0.10185355111029547, "grad_norm": 25.04741096496582, "learning_rate": 5.083501559919251e-07, "loss": 0.3893, "num_input_tokens_seen": 1205056, "step": 555 }, { "epoch": 0.10277115066984768, "grad_norm": 29.908802032470703, "learning_rate": 5.129381537896863e-07, "loss": 0.3984, "num_input_tokens_seen": 1215104, "step": 560 }, { "epoch": 0.10368875022939988, "grad_norm": 67.56624603271484, "learning_rate": 5.175261515874472e-07, "loss": 0.4382, "num_input_tokens_seen": 1227648, "step": 565 }, { "epoch": 0.1046063497889521, "grad_norm": 54.32497787475586, "learning_rate": 5.221141493852084e-07, "loss": 0.397, "num_input_tokens_seen": 1238720, "step": 570 }, { "epoch": 0.10552394934850431, "grad_norm": 39.518463134765625, "learning_rate": 5.267021471829694e-07, "loss": 0.3891, "num_input_tokens_seen": 1249824, "step": 575 }, { "epoch": 0.10644154890805653, "grad_norm": 31.200660705566406, "learning_rate": 5.312901449807305e-07, "loss": 0.4302, "num_input_tokens_seen": 1260384, "step": 580 }, { "epoch": 0.10735914846760873, "grad_norm": 71.09517669677734, "learning_rate": 5.358781427784915e-07, "loss": 0.4093, "num_input_tokens_seen": 1271520, "step": 585 }, { "epoch": 0.10827674802716095, "grad_norm": 40.881134033203125, "learning_rate": 5.404661405762526e-07, "loss": 0.3883, "num_input_tokens_seen": 1282784, "step": 590 }, { "epoch": 0.10919434758671316, "grad_norm": 26.01938247680664, "learning_rate": 5.450541383740136e-07, "loss": 0.403, "num_input_tokens_seen": 1292096, "step": 595 }, { "epoch": 0.11011194714626538, "grad_norm": 41.499752044677734, "learning_rate": 5.496421361717747e-07, "loss": 0.3893, "num_input_tokens_seen": 1302176, "step": 600 }, { "epoch": 0.11102954670581758, "grad_norm": 32.576663970947266, "learning_rate": 5.542301339695357e-07, "loss": 0.4011, "num_input_tokens_seen": 1312384, "step": 605 }, { "epoch": 0.11194714626536979, "grad_norm": 14.616876602172852, "learning_rate": 5.588181317672968e-07, "loss": 0.3762, "num_input_tokens_seen": 1321632, "step": 610 }, { "epoch": 0.11286474582492201, "grad_norm": 16.55613136291504, "learning_rate": 5.634061295650579e-07, "loss": 0.4215, "num_input_tokens_seen": 1332064, "step": 615 }, { "epoch": 0.11378234538447421, "grad_norm": 67.97463989257812, "learning_rate": 5.679941273628189e-07, "loss": 0.3964, "num_input_tokens_seen": 1342784, "step": 620 }, { "epoch": 0.11469994494402642, "grad_norm": 24.146373748779297, "learning_rate": 5.7258212516058e-07, "loss": 0.3739, "num_input_tokens_seen": 1353120, "step": 625 }, { "epoch": 0.11561754450357864, "grad_norm": 27.42155647277832, "learning_rate": 5.77170122958341e-07, "loss": 0.4057, "num_input_tokens_seen": 1363488, "step": 630 }, { "epoch": 0.11653514406313085, "grad_norm": 25.129901885986328, "learning_rate": 5.817581207561022e-07, "loss": 0.3555, "num_input_tokens_seen": 1373632, "step": 635 }, { "epoch": 0.11745274362268306, "grad_norm": 28.849761962890625, "learning_rate": 5.863461185538631e-07, "loss": 0.3422, "num_input_tokens_seen": 1384960, "step": 640 }, { "epoch": 0.11837034318223527, "grad_norm": 21.881757736206055, "learning_rate": 5.909341163516241e-07, "loss": 0.4893, "num_input_tokens_seen": 1396352, "step": 645 }, { "epoch": 0.11928794274178749, "grad_norm": 23.194643020629883, "learning_rate": 5.955221141493853e-07, "loss": 0.3736, "num_input_tokens_seen": 1406912, "step": 650 }, { "epoch": 0.1202055423013397, "grad_norm": 17.60102081298828, "learning_rate": 6.001101119471463e-07, "loss": 0.3543, "num_input_tokens_seen": 1418080, "step": 655 }, { "epoch": 0.1211231418608919, "grad_norm": 24.16057014465332, "learning_rate": 6.046981097449074e-07, "loss": 0.3894, "num_input_tokens_seen": 1428928, "step": 660 }, { "epoch": 0.12204074142044412, "grad_norm": 19.913328170776367, "learning_rate": 6.092861075426684e-07, "loss": 0.3549, "num_input_tokens_seen": 1440480, "step": 665 }, { "epoch": 0.12295834097999633, "grad_norm": 11.951531410217285, "learning_rate": 6.138741053404295e-07, "loss": 0.3583, "num_input_tokens_seen": 1450848, "step": 670 }, { "epoch": 0.12387594053954853, "grad_norm": 44.001747131347656, "learning_rate": 6.184621031381906e-07, "loss": 0.354, "num_input_tokens_seen": 1462976, "step": 675 }, { "epoch": 0.12479354009910075, "grad_norm": 16.506664276123047, "learning_rate": 6.230501009359516e-07, "loss": 0.4, "num_input_tokens_seen": 1474080, "step": 680 }, { "epoch": 0.12571113965865297, "grad_norm": 28.3294677734375, "learning_rate": 6.276380987337126e-07, "loss": 0.3406, "num_input_tokens_seen": 1485952, "step": 685 }, { "epoch": 0.12662873921820517, "grad_norm": 51.582054138183594, "learning_rate": 6.322260965314738e-07, "loss": 0.3827, "num_input_tokens_seen": 1496096, "step": 690 }, { "epoch": 0.1275463387777574, "grad_norm": 42.946205139160156, "learning_rate": 6.368140943292348e-07, "loss": 0.3642, "num_input_tokens_seen": 1508448, "step": 695 }, { "epoch": 0.1284639383373096, "grad_norm": 65.52490997314453, "learning_rate": 6.414020921269958e-07, "loss": 0.4257, "num_input_tokens_seen": 1518912, "step": 700 }, { "epoch": 0.1293815378968618, "grad_norm": 54.52393341064453, "learning_rate": 6.459900899247569e-07, "loss": 0.381, "num_input_tokens_seen": 1531008, "step": 705 }, { "epoch": 0.13029913745641403, "grad_norm": 27.02136993408203, "learning_rate": 6.505780877225179e-07, "loss": 0.3627, "num_input_tokens_seen": 1541664, "step": 710 }, { "epoch": 0.13121673701596623, "grad_norm": 33.73469161987305, "learning_rate": 6.551660855202791e-07, "loss": 0.3864, "num_input_tokens_seen": 1553312, "step": 715 }, { "epoch": 0.13213433657551846, "grad_norm": 18.377470016479492, "learning_rate": 6.5975408331804e-07, "loss": 0.3236, "num_input_tokens_seen": 1564000, "step": 720 }, { "epoch": 0.13305193613507066, "grad_norm": 30.337505340576172, "learning_rate": 6.643420811158011e-07, "loss": 0.3553, "num_input_tokens_seen": 1575488, "step": 725 }, { "epoch": 0.13396953569462286, "grad_norm": 20.581436157226562, "learning_rate": 6.689300789135622e-07, "loss": 0.4052, "num_input_tokens_seen": 1587328, "step": 730 }, { "epoch": 0.1348871352541751, "grad_norm": 28.832651138305664, "learning_rate": 6.735180767113233e-07, "loss": 0.3672, "num_input_tokens_seen": 1598944, "step": 735 }, { "epoch": 0.1358047348137273, "grad_norm": 47.10135269165039, "learning_rate": 6.781060745090842e-07, "loss": 0.3837, "num_input_tokens_seen": 1609216, "step": 740 }, { "epoch": 0.1367223343732795, "grad_norm": 19.21710777282715, "learning_rate": 6.826940723068453e-07, "loss": 0.3463, "num_input_tokens_seen": 1618144, "step": 745 }, { "epoch": 0.13763993393283172, "grad_norm": 60.532596588134766, "learning_rate": 6.872820701046064e-07, "loss": 0.3467, "num_input_tokens_seen": 1629216, "step": 750 }, { "epoch": 0.13855753349238392, "grad_norm": 15.81717586517334, "learning_rate": 6.918700679023675e-07, "loss": 0.3385, "num_input_tokens_seen": 1640864, "step": 755 }, { "epoch": 0.13947513305193612, "grad_norm": 41.48883819580078, "learning_rate": 6.964580657001285e-07, "loss": 0.3583, "num_input_tokens_seen": 1652416, "step": 760 }, { "epoch": 0.14039273261148835, "grad_norm": 21.91533660888672, "learning_rate": 7.010460634978895e-07, "loss": 0.3463, "num_input_tokens_seen": 1664160, "step": 765 }, { "epoch": 0.14131033217104055, "grad_norm": 28.269317626953125, "learning_rate": 7.056340612956507e-07, "loss": 0.3529, "num_input_tokens_seen": 1675520, "step": 770 }, { "epoch": 0.14222793173059278, "grad_norm": 46.61476516723633, "learning_rate": 7.102220590934117e-07, "loss": 0.384, "num_input_tokens_seen": 1687136, "step": 775 }, { "epoch": 0.14314553129014498, "grad_norm": 22.487075805664062, "learning_rate": 7.148100568911727e-07, "loss": 0.3568, "num_input_tokens_seen": 1697728, "step": 780 }, { "epoch": 0.14406313084969719, "grad_norm": 30.197853088378906, "learning_rate": 7.193980546889338e-07, "loss": 0.3493, "num_input_tokens_seen": 1708576, "step": 785 }, { "epoch": 0.14498073040924941, "grad_norm": 21.87785530090332, "learning_rate": 7.239860524866948e-07, "loss": 0.3498, "num_input_tokens_seen": 1719968, "step": 790 }, { "epoch": 0.14589832996880162, "grad_norm": 19.408946990966797, "learning_rate": 7.28574050284456e-07, "loss": 0.3373, "num_input_tokens_seen": 1731872, "step": 795 }, { "epoch": 0.14681592952835382, "grad_norm": 19.759323120117188, "learning_rate": 7.33162048082217e-07, "loss": 0.3597, "num_input_tokens_seen": 1743328, "step": 800 }, { "epoch": 0.14773352908790605, "grad_norm": 19.043310165405273, "learning_rate": 7.37750045879978e-07, "loss": 0.3216, "num_input_tokens_seen": 1753792, "step": 805 }, { "epoch": 0.14865112864745825, "grad_norm": 29.48263931274414, "learning_rate": 7.423380436777391e-07, "loss": 0.3772, "num_input_tokens_seen": 1763200, "step": 810 }, { "epoch": 0.14956872820701045, "grad_norm": 43.140296936035156, "learning_rate": 7.469260414755002e-07, "loss": 0.3709, "num_input_tokens_seen": 1773952, "step": 815 }, { "epoch": 0.15048632776656268, "grad_norm": 19.689550399780273, "learning_rate": 7.515140392732611e-07, "loss": 0.3298, "num_input_tokens_seen": 1784832, "step": 820 }, { "epoch": 0.15140392732611488, "grad_norm": 60.50889205932617, "learning_rate": 7.561020370710223e-07, "loss": 0.3845, "num_input_tokens_seen": 1795296, "step": 825 }, { "epoch": 0.1523215268856671, "grad_norm": 34.60541915893555, "learning_rate": 7.606900348687833e-07, "loss": 0.403, "num_input_tokens_seen": 1806368, "step": 830 }, { "epoch": 0.1532391264452193, "grad_norm": 18.883148193359375, "learning_rate": 7.652780326665444e-07, "loss": 0.3394, "num_input_tokens_seen": 1817440, "step": 835 }, { "epoch": 0.1541567260047715, "grad_norm": 23.265886306762695, "learning_rate": 7.698660304643055e-07, "loss": 0.3148, "num_input_tokens_seen": 1829088, "step": 840 }, { "epoch": 0.15507432556432374, "grad_norm": 20.250999450683594, "learning_rate": 7.744540282620664e-07, "loss": 0.327, "num_input_tokens_seen": 1840544, "step": 845 }, { "epoch": 0.15599192512387594, "grad_norm": 23.785612106323242, "learning_rate": 7.790420260598276e-07, "loss": 0.3548, "num_input_tokens_seen": 1851456, "step": 850 }, { "epoch": 0.15690952468342814, "grad_norm": 23.363327026367188, "learning_rate": 7.836300238575886e-07, "loss": 0.3865, "num_input_tokens_seen": 1862720, "step": 855 }, { "epoch": 0.15782712424298037, "grad_norm": 32.47359085083008, "learning_rate": 7.882180216553497e-07, "loss": 0.3879, "num_input_tokens_seen": 1873344, "step": 860 }, { "epoch": 0.15874472380253257, "grad_norm": 27.36910057067871, "learning_rate": 7.928060194531108e-07, "loss": 0.2989, "num_input_tokens_seen": 1883328, "step": 865 }, { "epoch": 0.15966232336208477, "grad_norm": 44.53080368041992, "learning_rate": 7.973940172508718e-07, "loss": 0.4243, "num_input_tokens_seen": 1893440, "step": 870 }, { "epoch": 0.160579922921637, "grad_norm": 41.24163818359375, "learning_rate": 8.019820150486328e-07, "loss": 0.3527, "num_input_tokens_seen": 1904320, "step": 875 }, { "epoch": 0.1614975224811892, "grad_norm": 57.03498077392578, "learning_rate": 8.06570012846394e-07, "loss": 0.3414, "num_input_tokens_seen": 1916448, "step": 880 }, { "epoch": 0.16241512204074143, "grad_norm": 38.21156311035156, "learning_rate": 8.111580106441549e-07, "loss": 0.3817, "num_input_tokens_seen": 1928160, "step": 885 }, { "epoch": 0.16333272160029363, "grad_norm": 28.32093620300293, "learning_rate": 8.157460084419159e-07, "loss": 0.3609, "num_input_tokens_seen": 1936960, "step": 890 }, { "epoch": 0.16425032115984584, "grad_norm": 22.859773635864258, "learning_rate": 8.203340062396771e-07, "loss": 0.3433, "num_input_tokens_seen": 1947488, "step": 895 }, { "epoch": 0.16516792071939806, "grad_norm": 26.573986053466797, "learning_rate": 8.249220040374381e-07, "loss": 0.3332, "num_input_tokens_seen": 1959168, "step": 900 }, { "epoch": 0.16608552027895027, "grad_norm": 28.150470733642578, "learning_rate": 8.295100018351993e-07, "loss": 0.4029, "num_input_tokens_seen": 1969600, "step": 905 }, { "epoch": 0.16700311983850247, "grad_norm": 31.961820602416992, "learning_rate": 8.340979996329602e-07, "loss": 0.3699, "num_input_tokens_seen": 1980352, "step": 910 }, { "epoch": 0.1679207193980547, "grad_norm": 24.871601104736328, "learning_rate": 8.386859974307213e-07, "loss": 0.3745, "num_input_tokens_seen": 1990624, "step": 915 }, { "epoch": 0.1688383189576069, "grad_norm": 41.92853927612305, "learning_rate": 8.432739952284824e-07, "loss": 0.3729, "num_input_tokens_seen": 2001696, "step": 920 }, { "epoch": 0.1697559185171591, "grad_norm": 26.23346519470215, "learning_rate": 8.478619930262435e-07, "loss": 0.3376, "num_input_tokens_seen": 2013536, "step": 925 }, { "epoch": 0.17067351807671133, "grad_norm": 20.219900131225586, "learning_rate": 8.524499908240044e-07, "loss": 0.3388, "num_input_tokens_seen": 2023392, "step": 930 }, { "epoch": 0.17159111763626353, "grad_norm": 27.094310760498047, "learning_rate": 8.570379886217656e-07, "loss": 0.2901, "num_input_tokens_seen": 2034816, "step": 935 }, { "epoch": 0.17250871719581576, "grad_norm": 13.786638259887695, "learning_rate": 8.616259864195266e-07, "loss": 0.3586, "num_input_tokens_seen": 2046720, "step": 940 }, { "epoch": 0.17342631675536796, "grad_norm": 27.686315536499023, "learning_rate": 8.662139842172875e-07, "loss": 0.4587, "num_input_tokens_seen": 2057312, "step": 945 }, { "epoch": 0.17434391631492016, "grad_norm": 20.311187744140625, "learning_rate": 8.708019820150487e-07, "loss": 0.386, "num_input_tokens_seen": 2066912, "step": 950 }, { "epoch": 0.1752615158744724, "grad_norm": 36.61603546142578, "learning_rate": 8.753899798128097e-07, "loss": 0.3362, "num_input_tokens_seen": 2078624, "step": 955 }, { "epoch": 0.1761791154340246, "grad_norm": 31.341224670410156, "learning_rate": 8.799779776105709e-07, "loss": 0.3444, "num_input_tokens_seen": 2087360, "step": 960 }, { "epoch": 0.1770967149935768, "grad_norm": 23.092859268188477, "learning_rate": 8.845659754083319e-07, "loss": 0.3928, "num_input_tokens_seen": 2097728, "step": 965 }, { "epoch": 0.17801431455312902, "grad_norm": 38.281944274902344, "learning_rate": 8.891539732060929e-07, "loss": 0.375, "num_input_tokens_seen": 2108448, "step": 970 }, { "epoch": 0.17893191411268122, "grad_norm": 22.986848831176758, "learning_rate": 8.93741971003854e-07, "loss": 0.368, "num_input_tokens_seen": 2119136, "step": 975 }, { "epoch": 0.17984951367223342, "grad_norm": 14.866080284118652, "learning_rate": 8.983299688016151e-07, "loss": 0.3799, "num_input_tokens_seen": 2129600, "step": 980 }, { "epoch": 0.18076711323178565, "grad_norm": 24.407257080078125, "learning_rate": 9.029179665993761e-07, "loss": 0.3224, "num_input_tokens_seen": 2140672, "step": 985 }, { "epoch": 0.18168471279133785, "grad_norm": 35.84478759765625, "learning_rate": 9.075059643971372e-07, "loss": 0.3322, "num_input_tokens_seen": 2151136, "step": 990 }, { "epoch": 0.18260231235089008, "grad_norm": 35.83539581298828, "learning_rate": 9.120939621948982e-07, "loss": 0.3877, "num_input_tokens_seen": 2162176, "step": 995 }, { "epoch": 0.18351991191044228, "grad_norm": 23.053102493286133, "learning_rate": 9.166819599926592e-07, "loss": 0.376, "num_input_tokens_seen": 2172384, "step": 1000 }, { "epoch": 0.1844375114699945, "grad_norm": 39.202396392822266, "learning_rate": 9.212699577904204e-07, "loss": 0.3386, "num_input_tokens_seen": 2181888, "step": 1005 }, { "epoch": 0.18535511102954672, "grad_norm": 30.949514389038086, "learning_rate": 9.258579555881813e-07, "loss": 0.3863, "num_input_tokens_seen": 2191904, "step": 1010 }, { "epoch": 0.18627271058909892, "grad_norm": 39.323184967041016, "learning_rate": 9.304459533859425e-07, "loss": 0.3727, "num_input_tokens_seen": 2202752, "step": 1015 }, { "epoch": 0.18719031014865112, "grad_norm": 29.57746696472168, "learning_rate": 9.350339511837035e-07, "loss": 0.3485, "num_input_tokens_seen": 2212736, "step": 1020 }, { "epoch": 0.18810790970820335, "grad_norm": 18.274776458740234, "learning_rate": 9.396219489814646e-07, "loss": 0.3262, "num_input_tokens_seen": 2223328, "step": 1025 }, { "epoch": 0.18902550926775555, "grad_norm": 28.089702606201172, "learning_rate": 9.442099467792257e-07, "loss": 0.3302, "num_input_tokens_seen": 2233504, "step": 1030 }, { "epoch": 0.18994310882730775, "grad_norm": 19.159934997558594, "learning_rate": 9.487979445769866e-07, "loss": 0.3864, "num_input_tokens_seen": 2243968, "step": 1035 }, { "epoch": 0.19086070838685998, "grad_norm": 20.819501876831055, "learning_rate": 9.533859423747477e-07, "loss": 0.314, "num_input_tokens_seen": 2253888, "step": 1040 }, { "epoch": 0.19177830794641218, "grad_norm": 19.883283615112305, "learning_rate": 9.579739401725087e-07, "loss": 0.3534, "num_input_tokens_seen": 2264544, "step": 1045 }, { "epoch": 0.1926959075059644, "grad_norm": 18.13475799560547, "learning_rate": 9.625619379702699e-07, "loss": 0.3383, "num_input_tokens_seen": 2276128, "step": 1050 }, { "epoch": 0.1936135070655166, "grad_norm": 14.896899223327637, "learning_rate": 9.671499357680308e-07, "loss": 0.3311, "num_input_tokens_seen": 2287456, "step": 1055 }, { "epoch": 0.1945311066250688, "grad_norm": 29.27265167236328, "learning_rate": 9.71737933565792e-07, "loss": 0.3458, "num_input_tokens_seen": 2297088, "step": 1060 }, { "epoch": 0.19544870618462104, "grad_norm": 48.173770904541016, "learning_rate": 9.76325931363553e-07, "loss": 0.3332, "num_input_tokens_seen": 2307360, "step": 1065 }, { "epoch": 0.19636630574417324, "grad_norm": 18.28656578063965, "learning_rate": 9.80913929161314e-07, "loss": 0.3697, "num_input_tokens_seen": 2317344, "step": 1070 }, { "epoch": 0.19728390530372544, "grad_norm": 28.433557510375977, "learning_rate": 9.855019269590752e-07, "loss": 0.3567, "num_input_tokens_seen": 2326880, "step": 1075 }, { "epoch": 0.19820150486327767, "grad_norm": 40.97999572753906, "learning_rate": 9.900899247568362e-07, "loss": 0.3895, "num_input_tokens_seen": 2337152, "step": 1080 }, { "epoch": 0.19911910442282987, "grad_norm": 29.067129135131836, "learning_rate": 9.946779225545973e-07, "loss": 0.4054, "num_input_tokens_seen": 2347936, "step": 1085 }, { "epoch": 0.20003670398238207, "grad_norm": 14.278656005859375, "learning_rate": 9.992659203523582e-07, "loss": 0.2652, "num_input_tokens_seen": 2358816, "step": 1090 }, { "epoch": 0.2009543035419343, "grad_norm": 29.619524002075195, "learning_rate": 1.0038539181501194e-06, "loss": 0.388, "num_input_tokens_seen": 2369824, "step": 1095 }, { "epoch": 0.2018719031014865, "grad_norm": 20.459291458129883, "learning_rate": 1.0084419159478805e-06, "loss": 0.3402, "num_input_tokens_seen": 2380960, "step": 1100 }, { "epoch": 0.20278950266103873, "grad_norm": 15.5150146484375, "learning_rate": 1.0130299137456415e-06, "loss": 0.3368, "num_input_tokens_seen": 2391968, "step": 1105 }, { "epoch": 0.20370710222059094, "grad_norm": 48.870975494384766, "learning_rate": 1.0176179115434024e-06, "loss": 0.3738, "num_input_tokens_seen": 2402560, "step": 1110 }, { "epoch": 0.20462470178014314, "grad_norm": 25.977943420410156, "learning_rate": 1.0222059093411636e-06, "loss": 0.394, "num_input_tokens_seen": 2413760, "step": 1115 }, { "epoch": 0.20554230133969537, "grad_norm": 15.66594123840332, "learning_rate": 1.0267939071389247e-06, "loss": 0.3428, "num_input_tokens_seen": 2423616, "step": 1120 }, { "epoch": 0.20645990089924757, "grad_norm": 20.541345596313477, "learning_rate": 1.0313819049366859e-06, "loss": 0.3541, "num_input_tokens_seen": 2434912, "step": 1125 }, { "epoch": 0.20737750045879977, "grad_norm": 15.837491035461426, "learning_rate": 1.0359699027344468e-06, "loss": 0.347, "num_input_tokens_seen": 2445088, "step": 1130 }, { "epoch": 0.208295100018352, "grad_norm": 33.97455596923828, "learning_rate": 1.0405579005322077e-06, "loss": 0.3514, "num_input_tokens_seen": 2456992, "step": 1135 }, { "epoch": 0.2092126995779042, "grad_norm": 48.42661666870117, "learning_rate": 1.0451458983299689e-06, "loss": 0.4176, "num_input_tokens_seen": 2468384, "step": 1140 }, { "epoch": 0.21013029913745643, "grad_norm": 18.483016967773438, "learning_rate": 1.04973389612773e-06, "loss": 0.3536, "num_input_tokens_seen": 2479392, "step": 1145 }, { "epoch": 0.21104789869700863, "grad_norm": 19.577802658081055, "learning_rate": 1.054321893925491e-06, "loss": 0.3747, "num_input_tokens_seen": 2490784, "step": 1150 }, { "epoch": 0.21196549825656083, "grad_norm": 22.311777114868164, "learning_rate": 1.0589098917232521e-06, "loss": 0.3174, "num_input_tokens_seen": 2500256, "step": 1155 }, { "epoch": 0.21288309781611306, "grad_norm": 24.936874389648438, "learning_rate": 1.063497889521013e-06, "loss": 0.3497, "num_input_tokens_seen": 2510784, "step": 1160 }, { "epoch": 0.21380069737566526, "grad_norm": 34.493446350097656, "learning_rate": 1.068085887318774e-06, "loss": 0.3984, "num_input_tokens_seen": 2521056, "step": 1165 }, { "epoch": 0.21471829693521746, "grad_norm": 20.37646484375, "learning_rate": 1.0726738851165352e-06, "loss": 0.3761, "num_input_tokens_seen": 2531072, "step": 1170 }, { "epoch": 0.2156358964947697, "grad_norm": 17.86088752746582, "learning_rate": 1.0772618829142963e-06, "loss": 0.33, "num_input_tokens_seen": 2543200, "step": 1175 }, { "epoch": 0.2165534960543219, "grad_norm": 46.60181427001953, "learning_rate": 1.0818498807120575e-06, "loss": 0.3469, "num_input_tokens_seen": 2554304, "step": 1180 }, { "epoch": 0.2174710956138741, "grad_norm": 36.19442367553711, "learning_rate": 1.0864378785098184e-06, "loss": 0.3596, "num_input_tokens_seen": 2564800, "step": 1185 }, { "epoch": 0.21838869517342632, "grad_norm": 22.674135208129883, "learning_rate": 1.0910258763075793e-06, "loss": 0.4008, "num_input_tokens_seen": 2574944, "step": 1190 }, { "epoch": 0.21930629473297852, "grad_norm": 29.06866455078125, "learning_rate": 1.0956138741053405e-06, "loss": 0.3412, "num_input_tokens_seen": 2586880, "step": 1195 }, { "epoch": 0.22022389429253075, "grad_norm": 18.705951690673828, "learning_rate": 1.1002018719031016e-06, "loss": 0.3739, "num_input_tokens_seen": 2596032, "step": 1200 }, { "epoch": 0.22114149385208295, "grad_norm": 22.83848762512207, "learning_rate": 1.1047898697008626e-06, "loss": 0.3063, "num_input_tokens_seen": 2606752, "step": 1205 }, { "epoch": 0.22205909341163516, "grad_norm": 18.233924865722656, "learning_rate": 1.1093778674986237e-06, "loss": 0.3275, "num_input_tokens_seen": 2616480, "step": 1210 }, { "epoch": 0.22297669297118738, "grad_norm": 18.7579288482666, "learning_rate": 1.1139658652963847e-06, "loss": 0.3189, "num_input_tokens_seen": 2627552, "step": 1215 }, { "epoch": 0.22389429253073959, "grad_norm": 27.411096572875977, "learning_rate": 1.1185538630941458e-06, "loss": 0.3419, "num_input_tokens_seen": 2638400, "step": 1220 }, { "epoch": 0.2248118920902918, "grad_norm": 29.710773468017578, "learning_rate": 1.123141860891907e-06, "loss": 0.2767, "num_input_tokens_seen": 2648352, "step": 1225 }, { "epoch": 0.22572949164984402, "grad_norm": 15.832015037536621, "learning_rate": 1.127729858689668e-06, "loss": 0.3707, "num_input_tokens_seen": 2660352, "step": 1230 }, { "epoch": 0.22664709120939622, "grad_norm": 65.7623062133789, "learning_rate": 1.132317856487429e-06, "loss": 0.3328, "num_input_tokens_seen": 2672704, "step": 1235 }, { "epoch": 0.22756469076894842, "grad_norm": 38.189876556396484, "learning_rate": 1.13690585428519e-06, "loss": 0.4145, "num_input_tokens_seen": 2683200, "step": 1240 }, { "epoch": 0.22848229032850065, "grad_norm": 33.167236328125, "learning_rate": 1.1414938520829511e-06, "loss": 0.3098, "num_input_tokens_seen": 2694176, "step": 1245 }, { "epoch": 0.22939988988805285, "grad_norm": 35.59111785888672, "learning_rate": 1.1460818498807123e-06, "loss": 0.4129, "num_input_tokens_seen": 2705120, "step": 1250 }, { "epoch": 0.23031748944760508, "grad_norm": 26.585905075073242, "learning_rate": 1.1506698476784732e-06, "loss": 0.3476, "num_input_tokens_seen": 2714816, "step": 1255 }, { "epoch": 0.23123508900715728, "grad_norm": 18.496864318847656, "learning_rate": 1.1552578454762342e-06, "loss": 0.3143, "num_input_tokens_seen": 2725888, "step": 1260 }, { "epoch": 0.23215268856670948, "grad_norm": 40.45967102050781, "learning_rate": 1.1598458432739953e-06, "loss": 0.3589, "num_input_tokens_seen": 2737376, "step": 1265 }, { "epoch": 0.2330702881262617, "grad_norm": 28.759916305541992, "learning_rate": 1.1644338410717565e-06, "loss": 0.3331, "num_input_tokens_seen": 2747968, "step": 1270 }, { "epoch": 0.2339878876858139, "grad_norm": 27.078947067260742, "learning_rate": 1.1690218388695174e-06, "loss": 0.4056, "num_input_tokens_seen": 2757696, "step": 1275 }, { "epoch": 0.2349054872453661, "grad_norm": 26.06607437133789, "learning_rate": 1.1736098366672786e-06, "loss": 0.342, "num_input_tokens_seen": 2768192, "step": 1280 }, { "epoch": 0.23582308680491834, "grad_norm": 19.096284866333008, "learning_rate": 1.1781978344650395e-06, "loss": 0.3293, "num_input_tokens_seen": 2778880, "step": 1285 }, { "epoch": 0.23674068636447054, "grad_norm": 22.48682403564453, "learning_rate": 1.1827858322628006e-06, "loss": 0.3842, "num_input_tokens_seen": 2789920, "step": 1290 }, { "epoch": 0.23765828592402274, "grad_norm": 28.37664031982422, "learning_rate": 1.1873738300605616e-06, "loss": 0.328, "num_input_tokens_seen": 2802176, "step": 1295 }, { "epoch": 0.23857588548357497, "grad_norm": 38.66581344604492, "learning_rate": 1.1919618278583227e-06, "loss": 0.3476, "num_input_tokens_seen": 2813184, "step": 1300 }, { "epoch": 0.23949348504312717, "grad_norm": 12.908601760864258, "learning_rate": 1.1965498256560839e-06, "loss": 0.3299, "num_input_tokens_seen": 2824224, "step": 1305 }, { "epoch": 0.2404110846026794, "grad_norm": 26.573768615722656, "learning_rate": 1.2011378234538448e-06, "loss": 0.3197, "num_input_tokens_seen": 2834496, "step": 1310 }, { "epoch": 0.2413286841622316, "grad_norm": 25.535717010498047, "learning_rate": 1.2057258212516058e-06, "loss": 0.3642, "num_input_tokens_seen": 2844256, "step": 1315 }, { "epoch": 0.2422462837217838, "grad_norm": 32.846405029296875, "learning_rate": 1.210313819049367e-06, "loss": 0.3469, "num_input_tokens_seen": 2855968, "step": 1320 }, { "epoch": 0.24316388328133604, "grad_norm": 26.295787811279297, "learning_rate": 1.214901816847128e-06, "loss": 0.3071, "num_input_tokens_seen": 2867488, "step": 1325 }, { "epoch": 0.24408148284088824, "grad_norm": 32.23371887207031, "learning_rate": 1.2194898146448892e-06, "loss": 0.3371, "num_input_tokens_seen": 2878368, "step": 1330 }, { "epoch": 0.24499908240044044, "grad_norm": 17.96919059753418, "learning_rate": 1.2240778124426501e-06, "loss": 0.3489, "num_input_tokens_seen": 2890816, "step": 1335 }, { "epoch": 0.24591668195999267, "grad_norm": 28.115230560302734, "learning_rate": 1.228665810240411e-06, "loss": 0.3058, "num_input_tokens_seen": 2900960, "step": 1340 }, { "epoch": 0.24683428151954487, "grad_norm": 17.18935775756836, "learning_rate": 1.2332538080381722e-06, "loss": 0.4061, "num_input_tokens_seen": 2911744, "step": 1345 }, { "epoch": 0.24775188107909707, "grad_norm": 29.43348503112793, "learning_rate": 1.2378418058359334e-06, "loss": 0.2892, "num_input_tokens_seen": 2922464, "step": 1350 }, { "epoch": 0.2486694806386493, "grad_norm": 19.993070602416992, "learning_rate": 1.2424298036336943e-06, "loss": 0.3312, "num_input_tokens_seen": 2933280, "step": 1355 }, { "epoch": 0.2495870801982015, "grad_norm": 53.0883903503418, "learning_rate": 1.2470178014314555e-06, "loss": 0.3157, "num_input_tokens_seen": 2943488, "step": 1360 }, { "epoch": 0.2505046797577537, "grad_norm": 46.959007263183594, "learning_rate": 1.2516057992292166e-06, "loss": 0.3701, "num_input_tokens_seen": 2954560, "step": 1365 }, { "epoch": 0.25142227931730593, "grad_norm": 48.92950439453125, "learning_rate": 1.2561937970269776e-06, "loss": 0.404, "num_input_tokens_seen": 2965472, "step": 1370 }, { "epoch": 0.25233987887685816, "grad_norm": 29.513980865478516, "learning_rate": 1.2607817948247387e-06, "loss": 0.338, "num_input_tokens_seen": 2977632, "step": 1375 }, { "epoch": 0.25325747843641033, "grad_norm": 56.63252639770508, "learning_rate": 1.2653697926224999e-06, "loss": 0.3897, "num_input_tokens_seen": 2987456, "step": 1380 }, { "epoch": 0.25417507799596256, "grad_norm": 22.747718811035156, "learning_rate": 1.2699577904202606e-06, "loss": 0.3605, "num_input_tokens_seen": 2998784, "step": 1385 }, { "epoch": 0.2550926775555148, "grad_norm": 12.598017692565918, "learning_rate": 1.2745457882180217e-06, "loss": 0.3049, "num_input_tokens_seen": 3010560, "step": 1390 }, { "epoch": 0.25601027711506696, "grad_norm": 22.557186126708984, "learning_rate": 1.2791337860157829e-06, "loss": 0.3486, "num_input_tokens_seen": 3021312, "step": 1395 }, { "epoch": 0.2569278766746192, "grad_norm": 18.108224868774414, "learning_rate": 1.2837217838135438e-06, "loss": 0.3002, "num_input_tokens_seen": 3031488, "step": 1400 }, { "epoch": 0.2578454762341714, "grad_norm": 40.86549758911133, "learning_rate": 1.288309781611305e-06, "loss": 0.3778, "num_input_tokens_seen": 3042816, "step": 1405 }, { "epoch": 0.2587630757937236, "grad_norm": 19.606258392333984, "learning_rate": 1.2928977794090661e-06, "loss": 0.35, "num_input_tokens_seen": 3053536, "step": 1410 }, { "epoch": 0.2596806753532758, "grad_norm": 25.052797317504883, "learning_rate": 1.2974857772068269e-06, "loss": 0.3196, "num_input_tokens_seen": 3064896, "step": 1415 }, { "epoch": 0.26059827491282805, "grad_norm": 20.901853561401367, "learning_rate": 1.302073775004588e-06, "loss": 0.4114, "num_input_tokens_seen": 3076288, "step": 1420 }, { "epoch": 0.2615158744723802, "grad_norm": 45.97092056274414, "learning_rate": 1.3066617728023492e-06, "loss": 0.3599, "num_input_tokens_seen": 3086880, "step": 1425 }, { "epoch": 0.26243347403193246, "grad_norm": 39.58267593383789, "learning_rate": 1.31124977060011e-06, "loss": 0.4136, "num_input_tokens_seen": 3098912, "step": 1430 }, { "epoch": 0.2633510735914847, "grad_norm": 12.154407501220703, "learning_rate": 1.3158377683978712e-06, "loss": 0.3371, "num_input_tokens_seen": 3108768, "step": 1435 }, { "epoch": 0.2642686731510369, "grad_norm": 30.313716888427734, "learning_rate": 1.3204257661956324e-06, "loss": 0.3313, "num_input_tokens_seen": 3119648, "step": 1440 }, { "epoch": 0.2651862727105891, "grad_norm": 20.368709564208984, "learning_rate": 1.3250137639933933e-06, "loss": 0.3224, "num_input_tokens_seen": 3130560, "step": 1445 }, { "epoch": 0.2661038722701413, "grad_norm": 18.3480167388916, "learning_rate": 1.3296017617911545e-06, "loss": 0.3669, "num_input_tokens_seen": 3142432, "step": 1450 }, { "epoch": 0.26702147182969355, "grad_norm": 14.722963333129883, "learning_rate": 1.3341897595889156e-06, "loss": 0.3628, "num_input_tokens_seen": 3152768, "step": 1455 }, { "epoch": 0.2679390713892457, "grad_norm": 22.652523040771484, "learning_rate": 1.3387777573866768e-06, "loss": 0.366, "num_input_tokens_seen": 3163456, "step": 1460 }, { "epoch": 0.26885667094879795, "grad_norm": 32.7216911315918, "learning_rate": 1.3433657551844375e-06, "loss": 0.3462, "num_input_tokens_seen": 3175072, "step": 1465 }, { "epoch": 0.2697742705083502, "grad_norm": 18.085693359375, "learning_rate": 1.3479537529821987e-06, "loss": 0.3256, "num_input_tokens_seen": 3186432, "step": 1470 }, { "epoch": 0.27069187006790235, "grad_norm": 23.175506591796875, "learning_rate": 1.3525417507799598e-06, "loss": 0.3026, "num_input_tokens_seen": 3196480, "step": 1475 }, { "epoch": 0.2716094696274546, "grad_norm": 41.800655364990234, "learning_rate": 1.3571297485777207e-06, "loss": 0.3058, "num_input_tokens_seen": 3206464, "step": 1480 }, { "epoch": 0.2725270691870068, "grad_norm": 61.670135498046875, "learning_rate": 1.3617177463754819e-06, "loss": 0.4676, "num_input_tokens_seen": 3217856, "step": 1485 }, { "epoch": 0.273444668746559, "grad_norm": 12.82122802734375, "learning_rate": 1.366305744173243e-06, "loss": 0.3761, "num_input_tokens_seen": 3229152, "step": 1490 }, { "epoch": 0.2743622683061112, "grad_norm": 30.57657241821289, "learning_rate": 1.370893741971004e-06, "loss": 0.3424, "num_input_tokens_seen": 3240608, "step": 1495 }, { "epoch": 0.27527986786566344, "grad_norm": 22.23455047607422, "learning_rate": 1.3754817397687651e-06, "loss": 0.3578, "num_input_tokens_seen": 3251552, "step": 1500 }, { "epoch": 0.2761974674252156, "grad_norm": 28.928815841674805, "learning_rate": 1.3800697375665263e-06, "loss": 0.3675, "num_input_tokens_seen": 3262560, "step": 1505 }, { "epoch": 0.27711506698476784, "grad_norm": 51.02064895629883, "learning_rate": 1.384657735364287e-06, "loss": 0.3651, "num_input_tokens_seen": 3273344, "step": 1510 }, { "epoch": 0.2780326665443201, "grad_norm": 13.214334487915039, "learning_rate": 1.3892457331620482e-06, "loss": 0.3587, "num_input_tokens_seen": 3284544, "step": 1515 }, { "epoch": 0.27895026610387225, "grad_norm": 19.248647689819336, "learning_rate": 1.3938337309598093e-06, "loss": 0.3448, "num_input_tokens_seen": 3294208, "step": 1520 }, { "epoch": 0.2798678656634245, "grad_norm": 18.37225914001465, "learning_rate": 1.3984217287575702e-06, "loss": 0.3816, "num_input_tokens_seen": 3304320, "step": 1525 }, { "epoch": 0.2807854652229767, "grad_norm": 20.205299377441406, "learning_rate": 1.4030097265553314e-06, "loss": 0.3767, "num_input_tokens_seen": 3314976, "step": 1530 }, { "epoch": 0.2817030647825289, "grad_norm": 23.23419189453125, "learning_rate": 1.4075977243530925e-06, "loss": 0.3614, "num_input_tokens_seen": 3325664, "step": 1535 }, { "epoch": 0.2826206643420811, "grad_norm": 23.49896812438965, "learning_rate": 1.4121857221508533e-06, "loss": 0.2957, "num_input_tokens_seen": 3335488, "step": 1540 }, { "epoch": 0.28353826390163334, "grad_norm": 24.300168991088867, "learning_rate": 1.4167737199486144e-06, "loss": 0.3769, "num_input_tokens_seen": 3345312, "step": 1545 }, { "epoch": 0.28445586346118557, "grad_norm": 28.494091033935547, "learning_rate": 1.4213617177463756e-06, "loss": 0.3749, "num_input_tokens_seen": 3356384, "step": 1550 }, { "epoch": 0.28537346302073774, "grad_norm": 20.176258087158203, "learning_rate": 1.4259497155441365e-06, "loss": 0.3458, "num_input_tokens_seen": 3367488, "step": 1555 }, { "epoch": 0.28629106258028997, "grad_norm": 18.334033966064453, "learning_rate": 1.4305377133418977e-06, "loss": 0.3542, "num_input_tokens_seen": 3378176, "step": 1560 }, { "epoch": 0.2872086621398422, "grad_norm": 32.779136657714844, "learning_rate": 1.4351257111396588e-06, "loss": 0.3966, "num_input_tokens_seen": 3389760, "step": 1565 }, { "epoch": 0.28812626169939437, "grad_norm": 31.810001373291016, "learning_rate": 1.43971370893742e-06, "loss": 0.3024, "num_input_tokens_seen": 3399488, "step": 1570 }, { "epoch": 0.2890438612589466, "grad_norm": 20.837778091430664, "learning_rate": 1.444301706735181e-06, "loss": 0.3854, "num_input_tokens_seen": 3409152, "step": 1575 }, { "epoch": 0.28996146081849883, "grad_norm": 24.165246963500977, "learning_rate": 1.448889704532942e-06, "loss": 0.3589, "num_input_tokens_seen": 3421440, "step": 1580 }, { "epoch": 0.290879060378051, "grad_norm": 34.92793655395508, "learning_rate": 1.4534777023307032e-06, "loss": 0.3685, "num_input_tokens_seen": 3431520, "step": 1585 }, { "epoch": 0.29179665993760323, "grad_norm": 49.91215896606445, "learning_rate": 1.458065700128464e-06, "loss": 0.3308, "num_input_tokens_seen": 3442368, "step": 1590 }, { "epoch": 0.29271425949715546, "grad_norm": 31.815053939819336, "learning_rate": 1.462653697926225e-06, "loss": 0.2972, "num_input_tokens_seen": 3452352, "step": 1595 }, { "epoch": 0.29363185905670763, "grad_norm": 22.004966735839844, "learning_rate": 1.4672416957239862e-06, "loss": 0.3186, "num_input_tokens_seen": 3462816, "step": 1600 }, { "epoch": 0.29454945861625986, "grad_norm": 23.614683151245117, "learning_rate": 1.4718296935217472e-06, "loss": 0.3143, "num_input_tokens_seen": 3474208, "step": 1605 }, { "epoch": 0.2954670581758121, "grad_norm": 41.117279052734375, "learning_rate": 1.4764176913195083e-06, "loss": 0.4058, "num_input_tokens_seen": 3486048, "step": 1610 }, { "epoch": 0.29638465773536427, "grad_norm": 40.03440856933594, "learning_rate": 1.4810056891172695e-06, "loss": 0.3378, "num_input_tokens_seen": 3496576, "step": 1615 }, { "epoch": 0.2973022572949165, "grad_norm": 33.475425720214844, "learning_rate": 1.4855936869150304e-06, "loss": 0.3779, "num_input_tokens_seen": 3509504, "step": 1620 }, { "epoch": 0.2982198568544687, "grad_norm": 29.68485450744629, "learning_rate": 1.4901816847127916e-06, "loss": 0.3603, "num_input_tokens_seen": 3520768, "step": 1625 }, { "epoch": 0.2991374564140209, "grad_norm": 43.126399993896484, "learning_rate": 1.4947696825105527e-06, "loss": 0.3423, "num_input_tokens_seen": 3530848, "step": 1630 }, { "epoch": 0.3000550559735731, "grad_norm": 27.162519454956055, "learning_rate": 1.4993576803083134e-06, "loss": 0.3888, "num_input_tokens_seen": 3541216, "step": 1635 }, { "epoch": 0.30097265553312536, "grad_norm": 14.783987998962402, "learning_rate": 1.5039456781060746e-06, "loss": 0.3402, "num_input_tokens_seen": 3552416, "step": 1640 }, { "epoch": 0.30189025509267753, "grad_norm": 25.876216888427734, "learning_rate": 1.5085336759038357e-06, "loss": 0.3481, "num_input_tokens_seen": 3563424, "step": 1645 }, { "epoch": 0.30280785465222976, "grad_norm": 23.44689178466797, "learning_rate": 1.5131216737015967e-06, "loss": 0.3378, "num_input_tokens_seen": 3574112, "step": 1650 }, { "epoch": 0.303725454211782, "grad_norm": 18.203136444091797, "learning_rate": 1.5177096714993578e-06, "loss": 0.3787, "num_input_tokens_seen": 3584512, "step": 1655 }, { "epoch": 0.3046430537713342, "grad_norm": 23.520353317260742, "learning_rate": 1.522297669297119e-06, "loss": 0.3611, "num_input_tokens_seen": 3595008, "step": 1660 }, { "epoch": 0.3055606533308864, "grad_norm": 14.333218574523926, "learning_rate": 1.5268856670948797e-06, "loss": 0.3598, "num_input_tokens_seen": 3605632, "step": 1665 }, { "epoch": 0.3064782528904386, "grad_norm": 26.54912757873535, "learning_rate": 1.5314736648926408e-06, "loss": 0.3067, "num_input_tokens_seen": 3614496, "step": 1670 }, { "epoch": 0.30739585244999085, "grad_norm": 24.96587562561035, "learning_rate": 1.536061662690402e-06, "loss": 0.3774, "num_input_tokens_seen": 3624896, "step": 1675 }, { "epoch": 0.308313452009543, "grad_norm": 15.059660911560059, "learning_rate": 1.5406496604881631e-06, "loss": 0.3386, "num_input_tokens_seen": 3635520, "step": 1680 }, { "epoch": 0.30923105156909525, "grad_norm": 34.3145751953125, "learning_rate": 1.545237658285924e-06, "loss": 0.3634, "num_input_tokens_seen": 3646176, "step": 1685 }, { "epoch": 0.3101486511286475, "grad_norm": 29.522546768188477, "learning_rate": 1.5498256560836852e-06, "loss": 0.4058, "num_input_tokens_seen": 3658176, "step": 1690 }, { "epoch": 0.31106625068819965, "grad_norm": 23.07151222229004, "learning_rate": 1.5544136538814464e-06, "loss": 0.3173, "num_input_tokens_seen": 3669024, "step": 1695 }, { "epoch": 0.3119838502477519, "grad_norm": 28.232572555541992, "learning_rate": 1.5590016516792073e-06, "loss": 0.3711, "num_input_tokens_seen": 3680800, "step": 1700 }, { "epoch": 0.3129014498073041, "grad_norm": 16.959163665771484, "learning_rate": 1.5635896494769685e-06, "loss": 0.3235, "num_input_tokens_seen": 3691808, "step": 1705 }, { "epoch": 0.3138190493668563, "grad_norm": 14.966397285461426, "learning_rate": 1.5681776472747296e-06, "loss": 0.334, "num_input_tokens_seen": 3701728, "step": 1710 }, { "epoch": 0.3147366489264085, "grad_norm": 38.39533996582031, "learning_rate": 1.5727656450724903e-06, "loss": 0.38, "num_input_tokens_seen": 3712096, "step": 1715 }, { "epoch": 0.31565424848596074, "grad_norm": 15.93130111694336, "learning_rate": 1.5773536428702515e-06, "loss": 0.3564, "num_input_tokens_seen": 3723616, "step": 1720 }, { "epoch": 0.3165718480455129, "grad_norm": 16.0859375, "learning_rate": 1.5819416406680126e-06, "loss": 0.3636, "num_input_tokens_seen": 3734208, "step": 1725 }, { "epoch": 0.31748944760506514, "grad_norm": 12.038972854614258, "learning_rate": 1.5865296384657736e-06, "loss": 0.3444, "num_input_tokens_seen": 3745888, "step": 1730 }, { "epoch": 0.3184070471646174, "grad_norm": 13.231365203857422, "learning_rate": 1.5911176362635347e-06, "loss": 0.3182, "num_input_tokens_seen": 3756352, "step": 1735 }, { "epoch": 0.31932464672416955, "grad_norm": 16.205284118652344, "learning_rate": 1.5957056340612959e-06, "loss": 0.3736, "num_input_tokens_seen": 3766848, "step": 1740 }, { "epoch": 0.3202422462837218, "grad_norm": 31.519166946411133, "learning_rate": 1.6002936318590568e-06, "loss": 0.2876, "num_input_tokens_seen": 3777376, "step": 1745 }, { "epoch": 0.321159845843274, "grad_norm": 16.65972137451172, "learning_rate": 1.604881629656818e-06, "loss": 0.3624, "num_input_tokens_seen": 3788160, "step": 1750 }, { "epoch": 0.3220774454028262, "grad_norm": 37.269447326660156, "learning_rate": 1.609469627454579e-06, "loss": 0.3249, "num_input_tokens_seen": 3799232, "step": 1755 }, { "epoch": 0.3229950449623784, "grad_norm": 26.817527770996094, "learning_rate": 1.6140576252523399e-06, "loss": 0.2672, "num_input_tokens_seen": 3808224, "step": 1760 }, { "epoch": 0.32391264452193064, "grad_norm": 30.63141441345215, "learning_rate": 1.618645623050101e-06, "loss": 0.3996, "num_input_tokens_seen": 3818720, "step": 1765 }, { "epoch": 0.32483024408148287, "grad_norm": 14.31575870513916, "learning_rate": 1.6232336208478622e-06, "loss": 0.3466, "num_input_tokens_seen": 3830400, "step": 1770 }, { "epoch": 0.32574784364103504, "grad_norm": 41.35475158691406, "learning_rate": 1.627821618645623e-06, "loss": 0.3463, "num_input_tokens_seen": 3840704, "step": 1775 }, { "epoch": 0.32666544320058727, "grad_norm": 19.959510803222656, "learning_rate": 1.6324096164433842e-06, "loss": 0.2544, "num_input_tokens_seen": 3852160, "step": 1780 }, { "epoch": 0.3275830427601395, "grad_norm": 31.970430374145508, "learning_rate": 1.6369976142411454e-06, "loss": 0.3108, "num_input_tokens_seen": 3862080, "step": 1785 }, { "epoch": 0.32850064231969167, "grad_norm": 34.211997985839844, "learning_rate": 1.6415856120389065e-06, "loss": 0.5643, "num_input_tokens_seen": 3873984, "step": 1790 }, { "epoch": 0.3294182418792439, "grad_norm": 59.97713088989258, "learning_rate": 1.6461736098366673e-06, "loss": 0.4194, "num_input_tokens_seen": 3883008, "step": 1795 }, { "epoch": 0.33033584143879613, "grad_norm": 50.20813751220703, "learning_rate": 1.6507616076344284e-06, "loss": 0.4001, "num_input_tokens_seen": 3894624, "step": 1800 }, { "epoch": 0.3312534409983483, "grad_norm": 35.115779876708984, "learning_rate": 1.6553496054321896e-06, "loss": 0.3917, "num_input_tokens_seen": 3906752, "step": 1805 }, { "epoch": 0.33217104055790053, "grad_norm": 24.942920684814453, "learning_rate": 1.6599376032299505e-06, "loss": 0.3802, "num_input_tokens_seen": 3917376, "step": 1810 }, { "epoch": 0.33308864011745276, "grad_norm": 12.763558387756348, "learning_rate": 1.6645256010277117e-06, "loss": 0.2921, "num_input_tokens_seen": 3927904, "step": 1815 }, { "epoch": 0.33400623967700493, "grad_norm": 23.30872344970703, "learning_rate": 1.6691135988254728e-06, "loss": 0.3692, "num_input_tokens_seen": 3939264, "step": 1820 }, { "epoch": 0.33492383923655716, "grad_norm": 10.293805122375488, "learning_rate": 1.6737015966232337e-06, "loss": 0.3397, "num_input_tokens_seen": 3950112, "step": 1825 }, { "epoch": 0.3358414387961094, "grad_norm": 15.758155822753906, "learning_rate": 1.678289594420995e-06, "loss": 0.3263, "num_input_tokens_seen": 3961856, "step": 1830 }, { "epoch": 0.33675903835566157, "grad_norm": 15.078376770019531, "learning_rate": 1.682877592218756e-06, "loss": 0.3948, "num_input_tokens_seen": 3973024, "step": 1835 }, { "epoch": 0.3376766379152138, "grad_norm": 17.85489273071289, "learning_rate": 1.6874655900165168e-06, "loss": 0.4133, "num_input_tokens_seen": 3983040, "step": 1840 }, { "epoch": 0.338594237474766, "grad_norm": 28.6861629486084, "learning_rate": 1.692053587814278e-06, "loss": 0.3803, "num_input_tokens_seen": 3994144, "step": 1845 }, { "epoch": 0.3395118370343182, "grad_norm": 18.19024085998535, "learning_rate": 1.696641585612039e-06, "loss": 0.3416, "num_input_tokens_seen": 4004928, "step": 1850 }, { "epoch": 0.3404294365938704, "grad_norm": 34.57956314086914, "learning_rate": 1.7012295834098e-06, "loss": 0.3252, "num_input_tokens_seen": 4016096, "step": 1855 }, { "epoch": 0.34134703615342266, "grad_norm": 24.081514358520508, "learning_rate": 1.7058175812075612e-06, "loss": 0.3069, "num_input_tokens_seen": 4027328, "step": 1860 }, { "epoch": 0.3422646357129749, "grad_norm": 30.553749084472656, "learning_rate": 1.7104055790053223e-06, "loss": 0.3382, "num_input_tokens_seen": 4038208, "step": 1865 }, { "epoch": 0.34318223527252706, "grad_norm": 42.91640090942383, "learning_rate": 1.7149935768030832e-06, "loss": 0.2999, "num_input_tokens_seen": 4048992, "step": 1870 }, { "epoch": 0.3440998348320793, "grad_norm": 25.001562118530273, "learning_rate": 1.7195815746008444e-06, "loss": 0.3158, "num_input_tokens_seen": 4060544, "step": 1875 }, { "epoch": 0.3450174343916315, "grad_norm": 24.657819747924805, "learning_rate": 1.7241695723986053e-06, "loss": 0.3, "num_input_tokens_seen": 4070464, "step": 1880 }, { "epoch": 0.3459350339511837, "grad_norm": 23.783891677856445, "learning_rate": 1.7287575701963665e-06, "loss": 0.3039, "num_input_tokens_seen": 4081952, "step": 1885 }, { "epoch": 0.3468526335107359, "grad_norm": 23.96781349182129, "learning_rate": 1.7333455679941274e-06, "loss": 0.3316, "num_input_tokens_seen": 4093888, "step": 1890 }, { "epoch": 0.34777023307028815, "grad_norm": 28.50166130065918, "learning_rate": 1.7379335657918886e-06, "loss": 0.367, "num_input_tokens_seen": 4104736, "step": 1895 }, { "epoch": 0.3486878326298403, "grad_norm": 34.70766067504883, "learning_rate": 1.7425215635896497e-06, "loss": 0.354, "num_input_tokens_seen": 4113280, "step": 1900 }, { "epoch": 0.34960543218939255, "grad_norm": 19.15537452697754, "learning_rate": 1.7471095613874107e-06, "loss": 0.3666, "num_input_tokens_seen": 4124960, "step": 1905 }, { "epoch": 0.3505230317489448, "grad_norm": 25.291818618774414, "learning_rate": 1.7516975591851718e-06, "loss": 0.3355, "num_input_tokens_seen": 4134880, "step": 1910 }, { "epoch": 0.35144063130849695, "grad_norm": 48.128204345703125, "learning_rate": 1.756285556982933e-06, "loss": 0.3907, "num_input_tokens_seen": 4145792, "step": 1915 }, { "epoch": 0.3523582308680492, "grad_norm": 18.250368118286133, "learning_rate": 1.7608735547806937e-06, "loss": 0.356, "num_input_tokens_seen": 4156096, "step": 1920 }, { "epoch": 0.3532758304276014, "grad_norm": 21.307209014892578, "learning_rate": 1.7654615525784548e-06, "loss": 0.3305, "num_input_tokens_seen": 4166944, "step": 1925 }, { "epoch": 0.3541934299871536, "grad_norm": 10.095202445983887, "learning_rate": 1.770049550376216e-06, "loss": 0.3409, "num_input_tokens_seen": 4177376, "step": 1930 }, { "epoch": 0.3551110295467058, "grad_norm": 9.953170776367188, "learning_rate": 1.774637548173977e-06, "loss": 0.2876, "num_input_tokens_seen": 4188832, "step": 1935 }, { "epoch": 0.35602862910625804, "grad_norm": 17.228891372680664, "learning_rate": 1.779225545971738e-06, "loss": 0.4705, "num_input_tokens_seen": 4200320, "step": 1940 }, { "epoch": 0.3569462286658102, "grad_norm": 21.636768341064453, "learning_rate": 1.7838135437694992e-06, "loss": 0.3138, "num_input_tokens_seen": 4211104, "step": 1945 }, { "epoch": 0.35786382822536245, "grad_norm": 15.478200912475586, "learning_rate": 1.7884015415672602e-06, "loss": 0.3767, "num_input_tokens_seen": 4221152, "step": 1950 }, { "epoch": 0.3587814277849147, "grad_norm": 12.92282485961914, "learning_rate": 1.7929895393650213e-06, "loss": 0.3348, "num_input_tokens_seen": 4232832, "step": 1955 }, { "epoch": 0.35969902734446685, "grad_norm": 19.252328872680664, "learning_rate": 1.7975775371627825e-06, "loss": 0.3491, "num_input_tokens_seen": 4243008, "step": 1960 }, { "epoch": 0.3606166269040191, "grad_norm": 21.98760986328125, "learning_rate": 1.8021655349605432e-06, "loss": 0.3305, "num_input_tokens_seen": 4254976, "step": 1965 }, { "epoch": 0.3615342264635713, "grad_norm": 19.88874053955078, "learning_rate": 1.8067535327583043e-06, "loss": 0.3464, "num_input_tokens_seen": 4267296, "step": 1970 }, { "epoch": 0.36245182602312354, "grad_norm": 20.4709529876709, "learning_rate": 1.8113415305560655e-06, "loss": 0.3622, "num_input_tokens_seen": 4277248, "step": 1975 }, { "epoch": 0.3633694255826757, "grad_norm": 38.79010009765625, "learning_rate": 1.8159295283538264e-06, "loss": 0.3343, "num_input_tokens_seen": 4288096, "step": 1980 }, { "epoch": 0.36428702514222794, "grad_norm": 16.15134620666504, "learning_rate": 1.8205175261515876e-06, "loss": 0.3751, "num_input_tokens_seen": 4298336, "step": 1985 }, { "epoch": 0.36520462470178017, "grad_norm": 15.220520973205566, "learning_rate": 1.8251055239493487e-06, "loss": 0.3557, "num_input_tokens_seen": 4308448, "step": 1990 }, { "epoch": 0.36612222426133234, "grad_norm": 23.43718910217285, "learning_rate": 1.8296935217471099e-06, "loss": 0.3304, "num_input_tokens_seen": 4318944, "step": 1995 }, { "epoch": 0.36703982382088457, "grad_norm": 29.028108596801758, "learning_rate": 1.8342815195448708e-06, "loss": 0.2743, "num_input_tokens_seen": 4329248, "step": 2000 }, { "epoch": 0.3679574233804368, "grad_norm": 9.34978199005127, "learning_rate": 1.8388695173426318e-06, "loss": 0.3289, "num_input_tokens_seen": 4340448, "step": 2005 }, { "epoch": 0.368875022939989, "grad_norm": 13.145711898803711, "learning_rate": 1.843457515140393e-06, "loss": 0.3391, "num_input_tokens_seen": 4350816, "step": 2010 }, { "epoch": 0.3697926224995412, "grad_norm": 16.52374267578125, "learning_rate": 1.8480455129381538e-06, "loss": 0.3843, "num_input_tokens_seen": 4361472, "step": 2015 }, { "epoch": 0.37071022205909343, "grad_norm": 21.18210220336914, "learning_rate": 1.852633510735915e-06, "loss": 0.3621, "num_input_tokens_seen": 4370400, "step": 2020 }, { "epoch": 0.3716278216186456, "grad_norm": 18.781463623046875, "learning_rate": 1.8572215085336761e-06, "loss": 0.3896, "num_input_tokens_seen": 4381792, "step": 2025 }, { "epoch": 0.37254542117819783, "grad_norm": 16.144580841064453, "learning_rate": 1.861809506331437e-06, "loss": 0.442, "num_input_tokens_seen": 4391968, "step": 2030 }, { "epoch": 0.37346302073775006, "grad_norm": 15.583085060119629, "learning_rate": 1.8663975041291982e-06, "loss": 0.3482, "num_input_tokens_seen": 4404032, "step": 2035 }, { "epoch": 0.37438062029730224, "grad_norm": 21.333152770996094, "learning_rate": 1.8709855019269594e-06, "loss": 0.3649, "num_input_tokens_seen": 4415712, "step": 2040 }, { "epoch": 0.37529821985685446, "grad_norm": 21.1905574798584, "learning_rate": 1.8755734997247201e-06, "loss": 0.3247, "num_input_tokens_seen": 4427072, "step": 2045 }, { "epoch": 0.3762158194164067, "grad_norm": 14.53261661529541, "learning_rate": 1.8801614975224813e-06, "loss": 0.3232, "num_input_tokens_seen": 4437024, "step": 2050 }, { "epoch": 0.37713341897595887, "grad_norm": 12.815985679626465, "learning_rate": 1.8847494953202424e-06, "loss": 0.3253, "num_input_tokens_seen": 4447744, "step": 2055 }, { "epoch": 0.3780510185355111, "grad_norm": 21.530540466308594, "learning_rate": 1.8893374931180034e-06, "loss": 0.2868, "num_input_tokens_seen": 4458368, "step": 2060 }, { "epoch": 0.3789686180950633, "grad_norm": 22.8850040435791, "learning_rate": 1.8939254909157645e-06, "loss": 0.2803, "num_input_tokens_seen": 4468608, "step": 2065 }, { "epoch": 0.3798862176546155, "grad_norm": 10.203630447387695, "learning_rate": 1.8985134887135257e-06, "loss": 0.2397, "num_input_tokens_seen": 4478784, "step": 2070 }, { "epoch": 0.38080381721416773, "grad_norm": 13.245610237121582, "learning_rate": 1.9031014865112866e-06, "loss": 0.5553, "num_input_tokens_seen": 4490752, "step": 2075 }, { "epoch": 0.38172141677371996, "grad_norm": 20.71548080444336, "learning_rate": 1.9076894843090475e-06, "loss": 0.3586, "num_input_tokens_seen": 4500928, "step": 2080 }, { "epoch": 0.3826390163332722, "grad_norm": 15.951150894165039, "learning_rate": 1.912277482106809e-06, "loss": 0.3561, "num_input_tokens_seen": 4512704, "step": 2085 }, { "epoch": 0.38355661589282436, "grad_norm": 15.701671600341797, "learning_rate": 1.91686547990457e-06, "loss": 0.3542, "num_input_tokens_seen": 4522976, "step": 2090 }, { "epoch": 0.3844742154523766, "grad_norm": 10.740577697753906, "learning_rate": 1.9214534777023308e-06, "loss": 0.3574, "num_input_tokens_seen": 4532960, "step": 2095 }, { "epoch": 0.3853918150119288, "grad_norm": 29.39570426940918, "learning_rate": 1.926041475500092e-06, "loss": 0.3829, "num_input_tokens_seen": 4543456, "step": 2100 }, { "epoch": 0.386309414571481, "grad_norm": 14.867191314697266, "learning_rate": 1.930629473297853e-06, "loss": 0.3384, "num_input_tokens_seen": 4554688, "step": 2105 }, { "epoch": 0.3872270141310332, "grad_norm": 14.389782905578613, "learning_rate": 1.935217471095614e-06, "loss": 0.2867, "num_input_tokens_seen": 4565408, "step": 2110 }, { "epoch": 0.38814461369058545, "grad_norm": 17.04846954345703, "learning_rate": 1.939805468893375e-06, "loss": 0.3472, "num_input_tokens_seen": 4576864, "step": 2115 }, { "epoch": 0.3890622132501376, "grad_norm": 25.077592849731445, "learning_rate": 1.9443934666911363e-06, "loss": 0.2798, "num_input_tokens_seen": 4588768, "step": 2120 }, { "epoch": 0.38997981280968985, "grad_norm": 18.371305465698242, "learning_rate": 1.9489814644888972e-06, "loss": 0.3918, "num_input_tokens_seen": 4599936, "step": 2125 }, { "epoch": 0.3908974123692421, "grad_norm": 38.488319396972656, "learning_rate": 1.953569462286658e-06, "loss": 0.399, "num_input_tokens_seen": 4610816, "step": 2130 }, { "epoch": 0.39181501192879425, "grad_norm": 18.133649826049805, "learning_rate": 1.9581574600844195e-06, "loss": 0.3665, "num_input_tokens_seen": 4622080, "step": 2135 }, { "epoch": 0.3927326114883465, "grad_norm": 20.087421417236328, "learning_rate": 1.9627454578821805e-06, "loss": 0.4042, "num_input_tokens_seen": 4632352, "step": 2140 }, { "epoch": 0.3936502110478987, "grad_norm": 12.790421485900879, "learning_rate": 1.9673334556799414e-06, "loss": 0.2726, "num_input_tokens_seen": 4642464, "step": 2145 }, { "epoch": 0.3945678106074509, "grad_norm": 38.21997833251953, "learning_rate": 1.9719214534777028e-06, "loss": 0.3466, "num_input_tokens_seen": 4652736, "step": 2150 }, { "epoch": 0.3954854101670031, "grad_norm": 17.677906036376953, "learning_rate": 1.9765094512754633e-06, "loss": 0.3672, "num_input_tokens_seen": 4663232, "step": 2155 }, { "epoch": 0.39640300972655534, "grad_norm": 18.826597213745117, "learning_rate": 1.9810974490732247e-06, "loss": 0.3732, "num_input_tokens_seen": 4672064, "step": 2160 }, { "epoch": 0.3973206092861075, "grad_norm": 12.077119827270508, "learning_rate": 1.9856854468709856e-06, "loss": 0.27, "num_input_tokens_seen": 4682304, "step": 2165 }, { "epoch": 0.39823820884565975, "grad_norm": 22.5463809967041, "learning_rate": 1.9902734446687465e-06, "loss": 0.2907, "num_input_tokens_seen": 4691392, "step": 2170 }, { "epoch": 0.399155808405212, "grad_norm": 12.97579574584961, "learning_rate": 1.994861442466508e-06, "loss": 0.345, "num_input_tokens_seen": 4702560, "step": 2175 }, { "epoch": 0.40007340796476415, "grad_norm": 42.80659866333008, "learning_rate": 1.999449440264269e-06, "loss": 0.3447, "num_input_tokens_seen": 4713632, "step": 2180 }, { "epoch": 0.4009910075243164, "grad_norm": 14.432217597961426, "learning_rate": 2.0040374380620298e-06, "loss": 0.2935, "num_input_tokens_seen": 4725280, "step": 2185 }, { "epoch": 0.4019086070838686, "grad_norm": 26.0948543548584, "learning_rate": 2.0086254358597907e-06, "loss": 0.3623, "num_input_tokens_seen": 4735680, "step": 2190 }, { "epoch": 0.40282620664342084, "grad_norm": 62.13546371459961, "learning_rate": 2.013213433657552e-06, "loss": 0.3252, "num_input_tokens_seen": 4747264, "step": 2195 }, { "epoch": 0.403743806202973, "grad_norm": 52.56909942626953, "learning_rate": 2.017801431455313e-06, "loss": 0.3322, "num_input_tokens_seen": 4758656, "step": 2200 }, { "epoch": 0.40466140576252524, "grad_norm": 24.27638053894043, "learning_rate": 2.022389429253074e-06, "loss": 0.3365, "num_input_tokens_seen": 4769856, "step": 2205 }, { "epoch": 0.40557900532207747, "grad_norm": 22.110824584960938, "learning_rate": 2.0269774270508353e-06, "loss": 0.3379, "num_input_tokens_seen": 4782016, "step": 2210 }, { "epoch": 0.40649660488162964, "grad_norm": 41.843013763427734, "learning_rate": 2.0315654248485962e-06, "loss": 0.3334, "num_input_tokens_seen": 4792672, "step": 2215 }, { "epoch": 0.40741420444118187, "grad_norm": 17.280746459960938, "learning_rate": 2.036153422646357e-06, "loss": 0.3086, "num_input_tokens_seen": 4803584, "step": 2220 }, { "epoch": 0.4083318040007341, "grad_norm": 31.99681282043457, "learning_rate": 2.0407414204441185e-06, "loss": 0.2913, "num_input_tokens_seen": 4814976, "step": 2225 }, { "epoch": 0.4092494035602863, "grad_norm": 39.6531982421875, "learning_rate": 2.0453294182418795e-06, "loss": 0.3611, "num_input_tokens_seen": 4825952, "step": 2230 }, { "epoch": 0.4101670031198385, "grad_norm": 19.013500213623047, "learning_rate": 2.0499174160396404e-06, "loss": 0.286, "num_input_tokens_seen": 4836352, "step": 2235 }, { "epoch": 0.41108460267939073, "grad_norm": 32.873748779296875, "learning_rate": 2.0545054138374014e-06, "loss": 0.4571, "num_input_tokens_seen": 4846464, "step": 2240 }, { "epoch": 0.4120022022389429, "grad_norm": 19.834381103515625, "learning_rate": 2.0590934116351627e-06, "loss": 0.3019, "num_input_tokens_seen": 4857856, "step": 2245 }, { "epoch": 0.41291980179849513, "grad_norm": 10.947006225585938, "learning_rate": 2.0636814094329237e-06, "loss": 0.2669, "num_input_tokens_seen": 4870048, "step": 2250 }, { "epoch": 0.41383740135804736, "grad_norm": 15.145331382751465, "learning_rate": 2.0682694072306846e-06, "loss": 0.2607, "num_input_tokens_seen": 4882048, "step": 2255 }, { "epoch": 0.41475500091759954, "grad_norm": 52.131446838378906, "learning_rate": 2.072857405028446e-06, "loss": 0.3504, "num_input_tokens_seen": 4892032, "step": 2260 }, { "epoch": 0.41567260047715177, "grad_norm": 59.001705169677734, "learning_rate": 2.077445402826207e-06, "loss": 0.371, "num_input_tokens_seen": 4902304, "step": 2265 }, { "epoch": 0.416590200036704, "grad_norm": 11.686222076416016, "learning_rate": 2.082033400623968e-06, "loss": 0.3628, "num_input_tokens_seen": 4913792, "step": 2270 }, { "epoch": 0.41750779959625617, "grad_norm": 30.50171661376953, "learning_rate": 2.086621398421729e-06, "loss": 0.3033, "num_input_tokens_seen": 4924864, "step": 2275 }, { "epoch": 0.4184253991558084, "grad_norm": 25.3564510345459, "learning_rate": 2.0912093962194897e-06, "loss": 0.3084, "num_input_tokens_seen": 4933984, "step": 2280 }, { "epoch": 0.4193429987153606, "grad_norm": 72.54115295410156, "learning_rate": 2.095797394017251e-06, "loss": 0.3002, "num_input_tokens_seen": 4945280, "step": 2285 }, { "epoch": 0.42026059827491286, "grad_norm": 66.41421508789062, "learning_rate": 2.100385391815012e-06, "loss": 0.5386, "num_input_tokens_seen": 4955968, "step": 2290 }, { "epoch": 0.42117819783446503, "grad_norm": 21.600488662719727, "learning_rate": 2.104973389612773e-06, "loss": 0.2656, "num_input_tokens_seen": 4966240, "step": 2295 }, { "epoch": 0.42209579739401726, "grad_norm": 34.943084716796875, "learning_rate": 2.1095613874105343e-06, "loss": 0.3348, "num_input_tokens_seen": 4976832, "step": 2300 }, { "epoch": 0.4230133969535695, "grad_norm": 13.734845161437988, "learning_rate": 2.1141493852082953e-06, "loss": 0.2717, "num_input_tokens_seen": 4987552, "step": 2305 }, { "epoch": 0.42393099651312166, "grad_norm": 31.541105270385742, "learning_rate": 2.118737383006056e-06, "loss": 0.3696, "num_input_tokens_seen": 4998624, "step": 2310 }, { "epoch": 0.4248485960726739, "grad_norm": 55.503108978271484, "learning_rate": 2.123325380803817e-06, "loss": 0.3427, "num_input_tokens_seen": 5009408, "step": 2315 }, { "epoch": 0.4257661956322261, "grad_norm": 45.116920471191406, "learning_rate": 2.1279133786015785e-06, "loss": 0.4607, "num_input_tokens_seen": 5021408, "step": 2320 }, { "epoch": 0.4266837951917783, "grad_norm": 28.64314842224121, "learning_rate": 2.1325013763993394e-06, "loss": 0.4281, "num_input_tokens_seen": 5031232, "step": 2325 }, { "epoch": 0.4276013947513305, "grad_norm": 33.45762634277344, "learning_rate": 2.1370893741971004e-06, "loss": 0.3071, "num_input_tokens_seen": 5040960, "step": 2330 }, { "epoch": 0.42851899431088275, "grad_norm": 16.122987747192383, "learning_rate": 2.1416773719948617e-06, "loss": 0.3696, "num_input_tokens_seen": 5050976, "step": 2335 }, { "epoch": 0.4294365938704349, "grad_norm": 28.398012161254883, "learning_rate": 2.1462653697926227e-06, "loss": 0.4308, "num_input_tokens_seen": 5062400, "step": 2340 }, { "epoch": 0.43035419342998715, "grad_norm": 13.147467613220215, "learning_rate": 2.1508533675903836e-06, "loss": 0.3224, "num_input_tokens_seen": 5073632, "step": 2345 }, { "epoch": 0.4312717929895394, "grad_norm": 23.264829635620117, "learning_rate": 2.155441365388145e-06, "loss": 0.3774, "num_input_tokens_seen": 5085888, "step": 2350 }, { "epoch": 0.43218939254909156, "grad_norm": 14.988195419311523, "learning_rate": 2.160029363185906e-06, "loss": 0.3529, "num_input_tokens_seen": 5097728, "step": 2355 }, { "epoch": 0.4331069921086438, "grad_norm": 13.541486740112305, "learning_rate": 2.164617360983667e-06, "loss": 0.3349, "num_input_tokens_seen": 5108192, "step": 2360 }, { "epoch": 0.434024591668196, "grad_norm": 10.838065147399902, "learning_rate": 2.1692053587814278e-06, "loss": 0.341, "num_input_tokens_seen": 5118944, "step": 2365 }, { "epoch": 0.4349421912277482, "grad_norm": 20.491609573364258, "learning_rate": 2.173793356579189e-06, "loss": 0.3114, "num_input_tokens_seen": 5129760, "step": 2370 }, { "epoch": 0.4358597907873004, "grad_norm": 18.70850944519043, "learning_rate": 2.17838135437695e-06, "loss": 0.3774, "num_input_tokens_seen": 5140768, "step": 2375 }, { "epoch": 0.43677739034685265, "grad_norm": 13.892935752868652, "learning_rate": 2.182969352174711e-06, "loss": 0.34, "num_input_tokens_seen": 5151456, "step": 2380 }, { "epoch": 0.4376949899064048, "grad_norm": 14.682526588439941, "learning_rate": 2.1875573499724724e-06, "loss": 0.3209, "num_input_tokens_seen": 5162464, "step": 2385 }, { "epoch": 0.43861258946595705, "grad_norm": 25.333389282226562, "learning_rate": 2.1921453477702333e-06, "loss": 0.2747, "num_input_tokens_seen": 5173792, "step": 2390 }, { "epoch": 0.4395301890255093, "grad_norm": 36.814453125, "learning_rate": 2.1967333455679943e-06, "loss": 0.3592, "num_input_tokens_seen": 5184672, "step": 2395 }, { "epoch": 0.4404477885850615, "grad_norm": 25.035964965820312, "learning_rate": 2.2013213433657556e-06, "loss": 0.4069, "num_input_tokens_seen": 5195424, "step": 2400 }, { "epoch": 0.4413653881446137, "grad_norm": 42.439151763916016, "learning_rate": 2.205909341163516e-06, "loss": 0.3537, "num_input_tokens_seen": 5206080, "step": 2405 }, { "epoch": 0.4422829877041659, "grad_norm": 10.186593055725098, "learning_rate": 2.2104973389612775e-06, "loss": 0.3274, "num_input_tokens_seen": 5217376, "step": 2410 }, { "epoch": 0.44320058726371814, "grad_norm": 13.264947891235352, "learning_rate": 2.2150853367590384e-06, "loss": 0.3359, "num_input_tokens_seen": 5226592, "step": 2415 }, { "epoch": 0.4441181868232703, "grad_norm": 16.207971572875977, "learning_rate": 2.2196733345568e-06, "loss": 0.377, "num_input_tokens_seen": 5238144, "step": 2420 }, { "epoch": 0.44503578638282254, "grad_norm": 14.965044021606445, "learning_rate": 2.2242613323545607e-06, "loss": 0.3803, "num_input_tokens_seen": 5248960, "step": 2425 }, { "epoch": 0.44595338594237477, "grad_norm": 31.636411666870117, "learning_rate": 2.2288493301523217e-06, "loss": 0.3472, "num_input_tokens_seen": 5259936, "step": 2430 }, { "epoch": 0.44687098550192694, "grad_norm": 19.776578903198242, "learning_rate": 2.233437327950083e-06, "loss": 0.3448, "num_input_tokens_seen": 5270592, "step": 2435 }, { "epoch": 0.44778858506147917, "grad_norm": 16.03999137878418, "learning_rate": 2.2380253257478436e-06, "loss": 0.3905, "num_input_tokens_seen": 5281696, "step": 2440 }, { "epoch": 0.4487061846210314, "grad_norm": 11.321558952331543, "learning_rate": 2.242613323545605e-06, "loss": 0.3384, "num_input_tokens_seen": 5291040, "step": 2445 }, { "epoch": 0.4496237841805836, "grad_norm": 10.74166202545166, "learning_rate": 2.247201321343366e-06, "loss": 0.3677, "num_input_tokens_seen": 5301856, "step": 2450 }, { "epoch": 0.4505413837401358, "grad_norm": 12.443467140197754, "learning_rate": 2.251789319141127e-06, "loss": 0.3765, "num_input_tokens_seen": 5312736, "step": 2455 }, { "epoch": 0.45145898329968803, "grad_norm": 8.64746379852295, "learning_rate": 2.256377316938888e-06, "loss": 0.3467, "num_input_tokens_seen": 5322912, "step": 2460 }, { "epoch": 0.4523765828592402, "grad_norm": 10.759008407592773, "learning_rate": 2.260965314736649e-06, "loss": 0.2771, "num_input_tokens_seen": 5334976, "step": 2465 }, { "epoch": 0.45329418241879244, "grad_norm": 20.0767822265625, "learning_rate": 2.26555331253441e-06, "loss": 0.3698, "num_input_tokens_seen": 5346976, "step": 2470 }, { "epoch": 0.45421178197834466, "grad_norm": 15.1443510055542, "learning_rate": 2.2701413103321714e-06, "loss": 0.2986, "num_input_tokens_seen": 5357280, "step": 2475 }, { "epoch": 0.45512938153789684, "grad_norm": 11.803468704223633, "learning_rate": 2.2747293081299323e-06, "loss": 0.374, "num_input_tokens_seen": 5368896, "step": 2480 }, { "epoch": 0.45604698109744907, "grad_norm": 11.64734935760498, "learning_rate": 2.2793173059276933e-06, "loss": 0.3006, "num_input_tokens_seen": 5379168, "step": 2485 }, { "epoch": 0.4569645806570013, "grad_norm": 7.02736759185791, "learning_rate": 2.283905303725454e-06, "loss": 0.3772, "num_input_tokens_seen": 5389280, "step": 2490 }, { "epoch": 0.45788218021655347, "grad_norm": 12.659947395324707, "learning_rate": 2.2884933015232156e-06, "loss": 0.288, "num_input_tokens_seen": 5399744, "step": 2495 }, { "epoch": 0.4587997797761057, "grad_norm": 12.67811107635498, "learning_rate": 2.2930812993209765e-06, "loss": 0.297, "num_input_tokens_seen": 5411808, "step": 2500 }, { "epoch": 0.4597173793356579, "grad_norm": 6.190281867980957, "learning_rate": 2.2976692971187374e-06, "loss": 0.2761, "num_input_tokens_seen": 5422656, "step": 2505 }, { "epoch": 0.46063497889521016, "grad_norm": 22.281784057617188, "learning_rate": 2.302257294916499e-06, "loss": 0.3571, "num_input_tokens_seen": 5433280, "step": 2510 }, { "epoch": 0.46155257845476233, "grad_norm": 14.114054679870605, "learning_rate": 2.3068452927142597e-06, "loss": 0.3435, "num_input_tokens_seen": 5444384, "step": 2515 }, { "epoch": 0.46247017801431456, "grad_norm": 31.001028060913086, "learning_rate": 2.3114332905120207e-06, "loss": 0.3796, "num_input_tokens_seen": 5455232, "step": 2520 }, { "epoch": 0.4633877775738668, "grad_norm": 21.429967880249023, "learning_rate": 2.316021288309782e-06, "loss": 0.2756, "num_input_tokens_seen": 5466144, "step": 2525 }, { "epoch": 0.46430537713341896, "grad_norm": 12.01675796508789, "learning_rate": 2.320609286107543e-06, "loss": 0.2961, "num_input_tokens_seen": 5477376, "step": 2530 }, { "epoch": 0.4652229766929712, "grad_norm": 35.5133056640625, "learning_rate": 2.325197283905304e-06, "loss": 0.4144, "num_input_tokens_seen": 5487872, "step": 2535 }, { "epoch": 0.4661405762525234, "grad_norm": 12.051313400268555, "learning_rate": 2.329785281703065e-06, "loss": 0.3983, "num_input_tokens_seen": 5499520, "step": 2540 }, { "epoch": 0.4670581758120756, "grad_norm": 15.986292839050293, "learning_rate": 2.3343732795008262e-06, "loss": 0.3614, "num_input_tokens_seen": 5508640, "step": 2545 }, { "epoch": 0.4679757753716278, "grad_norm": 22.094200134277344, "learning_rate": 2.338961277298587e-06, "loss": 0.2949, "num_input_tokens_seen": 5520960, "step": 2550 }, { "epoch": 0.46889337493118005, "grad_norm": 21.626354217529297, "learning_rate": 2.343549275096348e-06, "loss": 0.3571, "num_input_tokens_seen": 5532288, "step": 2555 }, { "epoch": 0.4698109744907322, "grad_norm": 13.100361824035645, "learning_rate": 2.3481372728941095e-06, "loss": 0.3304, "num_input_tokens_seen": 5542848, "step": 2560 }, { "epoch": 0.47072857405028445, "grad_norm": 16.395301818847656, "learning_rate": 2.35272527069187e-06, "loss": 0.3324, "num_input_tokens_seen": 5554176, "step": 2565 }, { "epoch": 0.4716461736098367, "grad_norm": 10.798255920410156, "learning_rate": 2.3573132684896313e-06, "loss": 0.3583, "num_input_tokens_seen": 5565504, "step": 2570 }, { "epoch": 0.47256377316938886, "grad_norm": 13.864266395568848, "learning_rate": 2.3619012662873923e-06, "loss": 0.4266, "num_input_tokens_seen": 5576448, "step": 2575 }, { "epoch": 0.4734813727289411, "grad_norm": 34.09209442138672, "learning_rate": 2.3664892640851532e-06, "loss": 0.3697, "num_input_tokens_seen": 5588192, "step": 2580 }, { "epoch": 0.4743989722884933, "grad_norm": 18.519466400146484, "learning_rate": 2.3710772618829146e-06, "loss": 0.3142, "num_input_tokens_seen": 5599296, "step": 2585 }, { "epoch": 0.4753165718480455, "grad_norm": 13.568312644958496, "learning_rate": 2.3756652596806755e-06, "loss": 0.3238, "num_input_tokens_seen": 5609216, "step": 2590 }, { "epoch": 0.4762341714075977, "grad_norm": 17.642166137695312, "learning_rate": 2.3802532574784365e-06, "loss": 0.3316, "num_input_tokens_seen": 5619584, "step": 2595 }, { "epoch": 0.47715177096714995, "grad_norm": 11.466231346130371, "learning_rate": 2.384841255276198e-06, "loss": 0.3704, "num_input_tokens_seen": 5630240, "step": 2600 }, { "epoch": 0.4780693705267021, "grad_norm": 9.644320487976074, "learning_rate": 2.3894292530739588e-06, "loss": 0.3556, "num_input_tokens_seen": 5640448, "step": 2605 }, { "epoch": 0.47898697008625435, "grad_norm": 8.798616409301758, "learning_rate": 2.3940172508717197e-06, "loss": 0.2832, "num_input_tokens_seen": 5651488, "step": 2610 }, { "epoch": 0.4799045696458066, "grad_norm": 13.748780250549316, "learning_rate": 2.3986052486694806e-06, "loss": 0.3609, "num_input_tokens_seen": 5663168, "step": 2615 }, { "epoch": 0.4808221692053588, "grad_norm": 24.371397018432617, "learning_rate": 2.403193246467242e-06, "loss": 0.3428, "num_input_tokens_seen": 5674048, "step": 2620 }, { "epoch": 0.481739768764911, "grad_norm": 10.55804443359375, "learning_rate": 2.407781244265003e-06, "loss": 0.335, "num_input_tokens_seen": 5684416, "step": 2625 }, { "epoch": 0.4826573683244632, "grad_norm": 24.682985305786133, "learning_rate": 2.412369242062764e-06, "loss": 0.3973, "num_input_tokens_seen": 5694368, "step": 2630 }, { "epoch": 0.48357496788401544, "grad_norm": 9.33574390411377, "learning_rate": 2.4169572398605252e-06, "loss": 0.3416, "num_input_tokens_seen": 5705696, "step": 2635 }, { "epoch": 0.4844925674435676, "grad_norm": 17.60628318786621, "learning_rate": 2.421545237658286e-06, "loss": 0.3828, "num_input_tokens_seen": 5716896, "step": 2640 }, { "epoch": 0.48541016700311984, "grad_norm": 27.03196144104004, "learning_rate": 2.426133235456047e-06, "loss": 0.3378, "num_input_tokens_seen": 5727040, "step": 2645 }, { "epoch": 0.48632776656267207, "grad_norm": 31.21512794494629, "learning_rate": 2.4307212332538085e-06, "loss": 0.3376, "num_input_tokens_seen": 5737024, "step": 2650 }, { "epoch": 0.48724536612222424, "grad_norm": 17.44399642944336, "learning_rate": 2.4353092310515694e-06, "loss": 0.3597, "num_input_tokens_seen": 5746880, "step": 2655 }, { "epoch": 0.4881629656817765, "grad_norm": 19.438091278076172, "learning_rate": 2.4398972288493303e-06, "loss": 0.3029, "num_input_tokens_seen": 5757152, "step": 2660 }, { "epoch": 0.4890805652413287, "grad_norm": 10.604873657226562, "learning_rate": 2.4444852266470913e-06, "loss": 0.2894, "num_input_tokens_seen": 5766080, "step": 2665 }, { "epoch": 0.4899981648008809, "grad_norm": 10.514728546142578, "learning_rate": 2.4490732244448526e-06, "loss": 0.3569, "num_input_tokens_seen": 5776320, "step": 2670 }, { "epoch": 0.4909157643604331, "grad_norm": 29.228309631347656, "learning_rate": 2.4536612222426136e-06, "loss": 0.3531, "num_input_tokens_seen": 5786400, "step": 2675 }, { "epoch": 0.49183336391998533, "grad_norm": 13.989030838012695, "learning_rate": 2.4582492200403745e-06, "loss": 0.3733, "num_input_tokens_seen": 5796864, "step": 2680 }, { "epoch": 0.4927509634795375, "grad_norm": 14.571187973022461, "learning_rate": 2.462837217838136e-06, "loss": 0.3692, "num_input_tokens_seen": 5807712, "step": 2685 }, { "epoch": 0.49366856303908974, "grad_norm": 20.11689567565918, "learning_rate": 2.4674252156358964e-06, "loss": 0.3717, "num_input_tokens_seen": 5818368, "step": 2690 }, { "epoch": 0.49458616259864197, "grad_norm": 10.952680587768555, "learning_rate": 2.4720132134336578e-06, "loss": 0.379, "num_input_tokens_seen": 5828096, "step": 2695 }, { "epoch": 0.49550376215819414, "grad_norm": 10.239851951599121, "learning_rate": 2.4766012112314187e-06, "loss": 0.3322, "num_input_tokens_seen": 5839264, "step": 2700 }, { "epoch": 0.49642136171774637, "grad_norm": 16.004390716552734, "learning_rate": 2.4811892090291796e-06, "loss": 0.3457, "num_input_tokens_seen": 5848736, "step": 2705 }, { "epoch": 0.4973389612772986, "grad_norm": 15.28675651550293, "learning_rate": 2.485777206826941e-06, "loss": 0.3246, "num_input_tokens_seen": 5858656, "step": 2710 }, { "epoch": 0.4982565608368508, "grad_norm": 10.6226167678833, "learning_rate": 2.490365204624702e-06, "loss": 0.3076, "num_input_tokens_seen": 5867584, "step": 2715 }, { "epoch": 0.499174160396403, "grad_norm": 12.465950965881348, "learning_rate": 2.494953202422463e-06, "loss": 0.3556, "num_input_tokens_seen": 5878752, "step": 2720 }, { "epoch": 0.5000917599559552, "grad_norm": 17.49136734008789, "learning_rate": 2.4995412002202242e-06, "loss": 0.3673, "num_input_tokens_seen": 5888736, "step": 2725 }, { "epoch": 0.5010093595155074, "grad_norm": 20.007110595703125, "learning_rate": 2.5041291980179848e-06, "loss": 0.2815, "num_input_tokens_seen": 5900128, "step": 2730 }, { "epoch": 0.5019269590750597, "grad_norm": 8.207905769348145, "learning_rate": 2.5087171958157465e-06, "loss": 0.2629, "num_input_tokens_seen": 5910048, "step": 2735 }, { "epoch": 0.5028445586346119, "grad_norm": 18.07990837097168, "learning_rate": 2.513305193613507e-06, "loss": 0.3276, "num_input_tokens_seen": 5920192, "step": 2740 }, { "epoch": 0.503762158194164, "grad_norm": 39.230224609375, "learning_rate": 2.517893191411268e-06, "loss": 0.3756, "num_input_tokens_seen": 5931744, "step": 2745 }, { "epoch": 0.5046797577537163, "grad_norm": 11.458648681640625, "learning_rate": 2.5224811892090294e-06, "loss": 0.4166, "num_input_tokens_seen": 5941280, "step": 2750 }, { "epoch": 0.5055973573132685, "grad_norm": 10.667207717895508, "learning_rate": 2.5270691870067903e-06, "loss": 0.3212, "num_input_tokens_seen": 5951648, "step": 2755 }, { "epoch": 0.5065149568728207, "grad_norm": 12.912055969238281, "learning_rate": 2.5316571848045512e-06, "loss": 0.3674, "num_input_tokens_seen": 5962272, "step": 2760 }, { "epoch": 0.507432556432373, "grad_norm": 19.60918426513672, "learning_rate": 2.5362451826023126e-06, "loss": 0.3175, "num_input_tokens_seen": 5973568, "step": 2765 }, { "epoch": 0.5083501559919251, "grad_norm": 28.642000198364258, "learning_rate": 2.5408331804000735e-06, "loss": 0.2577, "num_input_tokens_seen": 5985280, "step": 2770 }, { "epoch": 0.5092677555514773, "grad_norm": 10.37392520904541, "learning_rate": 2.5454211781978345e-06, "loss": 0.3028, "num_input_tokens_seen": 5996896, "step": 2775 }, { "epoch": 0.5101853551110296, "grad_norm": 28.611724853515625, "learning_rate": 2.550009175995596e-06, "loss": 0.3164, "num_input_tokens_seen": 6007744, "step": 2780 }, { "epoch": 0.5111029546705818, "grad_norm": 47.573299407958984, "learning_rate": 2.5545971737933568e-06, "loss": 0.4229, "num_input_tokens_seen": 6018016, "step": 2785 }, { "epoch": 0.5120205542301339, "grad_norm": 36.9384765625, "learning_rate": 2.559185171591118e-06, "loss": 0.3203, "num_input_tokens_seen": 6028608, "step": 2790 }, { "epoch": 0.5129381537896862, "grad_norm": 20.989164352416992, "learning_rate": 2.563773169388879e-06, "loss": 0.365, "num_input_tokens_seen": 6039392, "step": 2795 }, { "epoch": 0.5138557533492384, "grad_norm": 10.160348892211914, "learning_rate": 2.56836116718664e-06, "loss": 0.2917, "num_input_tokens_seen": 6050048, "step": 2800 }, { "epoch": 0.5147733529087906, "grad_norm": 10.238409996032715, "learning_rate": 2.5729491649844014e-06, "loss": 0.3677, "num_input_tokens_seen": 6060480, "step": 2805 }, { "epoch": 0.5156909524683428, "grad_norm": 9.51674747467041, "learning_rate": 2.5775371627821623e-06, "loss": 0.3725, "num_input_tokens_seen": 6071616, "step": 2810 }, { "epoch": 0.516608552027895, "grad_norm": 18.048267364501953, "learning_rate": 2.582125160579923e-06, "loss": 0.3576, "num_input_tokens_seen": 6082432, "step": 2815 }, { "epoch": 0.5175261515874472, "grad_norm": 13.649455070495605, "learning_rate": 2.5867131583776846e-06, "loss": 0.3236, "num_input_tokens_seen": 6092832, "step": 2820 }, { "epoch": 0.5184437511469995, "grad_norm": 8.265779495239258, "learning_rate": 2.591301156175445e-06, "loss": 0.3282, "num_input_tokens_seen": 6102496, "step": 2825 }, { "epoch": 0.5193613507065516, "grad_norm": 19.045047760009766, "learning_rate": 2.595889153973206e-06, "loss": 0.3105, "num_input_tokens_seen": 6112768, "step": 2830 }, { "epoch": 0.5202789502661038, "grad_norm": 12.51884651184082, "learning_rate": 2.6004771517709674e-06, "loss": 0.4141, "num_input_tokens_seen": 6123904, "step": 2835 }, { "epoch": 0.5211965498256561, "grad_norm": 7.348196029663086, "learning_rate": 2.6050651495687284e-06, "loss": 0.2935, "num_input_tokens_seen": 6135872, "step": 2840 }, { "epoch": 0.5221141493852083, "grad_norm": 12.223257064819336, "learning_rate": 2.6096531473664893e-06, "loss": 0.3916, "num_input_tokens_seen": 6146944, "step": 2845 }, { "epoch": 0.5230317489447605, "grad_norm": 6.460174083709717, "learning_rate": 2.6142411451642507e-06, "loss": 0.2753, "num_input_tokens_seen": 6158176, "step": 2850 }, { "epoch": 0.5239493485043127, "grad_norm": 8.617239952087402, "learning_rate": 2.6188291429620116e-06, "loss": 0.3302, "num_input_tokens_seen": 6170176, "step": 2855 }, { "epoch": 0.5248669480638649, "grad_norm": 22.53301429748535, "learning_rate": 2.6234171407597725e-06, "loss": 0.4179, "num_input_tokens_seen": 6180800, "step": 2860 }, { "epoch": 0.5257845476234171, "grad_norm": 22.75333595275879, "learning_rate": 2.628005138557534e-06, "loss": 0.3097, "num_input_tokens_seen": 6192608, "step": 2865 }, { "epoch": 0.5267021471829694, "grad_norm": 17.472150802612305, "learning_rate": 2.632593136355295e-06, "loss": 0.3205, "num_input_tokens_seen": 6205216, "step": 2870 }, { "epoch": 0.5276197467425215, "grad_norm": 14.976883888244629, "learning_rate": 2.6371811341530558e-06, "loss": 0.2579, "num_input_tokens_seen": 6215744, "step": 2875 }, { "epoch": 0.5285373463020738, "grad_norm": 20.682342529296875, "learning_rate": 2.641769131950817e-06, "loss": 0.3831, "num_input_tokens_seen": 6225216, "step": 2880 }, { "epoch": 0.529454945861626, "grad_norm": 19.081037521362305, "learning_rate": 2.646357129748578e-06, "loss": 0.3103, "num_input_tokens_seen": 6236096, "step": 2885 }, { "epoch": 0.5303725454211782, "grad_norm": 26.38585090637207, "learning_rate": 2.650945127546339e-06, "loss": 0.2674, "num_input_tokens_seen": 6245376, "step": 2890 }, { "epoch": 0.5312901449807305, "grad_norm": 46.055355072021484, "learning_rate": 2.6555331253441004e-06, "loss": 0.3211, "num_input_tokens_seen": 6255040, "step": 2895 }, { "epoch": 0.5322077445402826, "grad_norm": 68.3750228881836, "learning_rate": 2.6601211231418613e-06, "loss": 0.4435, "num_input_tokens_seen": 6266176, "step": 2900 }, { "epoch": 0.5331253440998348, "grad_norm": 6.9537177085876465, "learning_rate": 2.664709120939622e-06, "loss": 0.2832, "num_input_tokens_seen": 6278240, "step": 2905 }, { "epoch": 0.5340429436593871, "grad_norm": 12.542890548706055, "learning_rate": 2.669297118737383e-06, "loss": 0.3471, "num_input_tokens_seen": 6288320, "step": 2910 }, { "epoch": 0.5349605432189393, "grad_norm": 7.967313289642334, "learning_rate": 2.673885116535144e-06, "loss": 0.3031, "num_input_tokens_seen": 6298112, "step": 2915 }, { "epoch": 0.5358781427784914, "grad_norm": 10.589349746704102, "learning_rate": 2.678473114332905e-06, "loss": 0.3526, "num_input_tokens_seen": 6309472, "step": 2920 }, { "epoch": 0.5367957423380437, "grad_norm": 13.252424240112305, "learning_rate": 2.6830611121306664e-06, "loss": 0.3878, "num_input_tokens_seen": 6320512, "step": 2925 }, { "epoch": 0.5377133418975959, "grad_norm": 7.691258430480957, "learning_rate": 2.6876491099284274e-06, "loss": 0.2816, "num_input_tokens_seen": 6329216, "step": 2930 }, { "epoch": 0.5386309414571481, "grad_norm": 12.05805492401123, "learning_rate": 2.6922371077261883e-06, "loss": 0.2777, "num_input_tokens_seen": 6340032, "step": 2935 }, { "epoch": 0.5395485410167004, "grad_norm": 8.057910919189453, "learning_rate": 2.6968251055239497e-06, "loss": 0.3383, "num_input_tokens_seen": 6350528, "step": 2940 }, { "epoch": 0.5404661405762525, "grad_norm": 8.134148597717285, "learning_rate": 2.7014131033217106e-06, "loss": 0.3709, "num_input_tokens_seen": 6361504, "step": 2945 }, { "epoch": 0.5413837401358047, "grad_norm": 10.054030418395996, "learning_rate": 2.7060011011194715e-06, "loss": 0.3693, "num_input_tokens_seen": 6371520, "step": 2950 }, { "epoch": 0.542301339695357, "grad_norm": 13.328201293945312, "learning_rate": 2.710589098917233e-06, "loss": 0.3309, "num_input_tokens_seen": 6382144, "step": 2955 }, { "epoch": 0.5432189392549092, "grad_norm": 50.12836456298828, "learning_rate": 2.715177096714994e-06, "loss": 0.4406, "num_input_tokens_seen": 6393568, "step": 2960 }, { "epoch": 0.5441365388144613, "grad_norm": 12.17877197265625, "learning_rate": 2.7197650945127548e-06, "loss": 0.309, "num_input_tokens_seen": 6404416, "step": 2965 }, { "epoch": 0.5450541383740136, "grad_norm": 6.032882213592529, "learning_rate": 2.724353092310516e-06, "loss": 0.2785, "num_input_tokens_seen": 6415168, "step": 2970 }, { "epoch": 0.5459717379335658, "grad_norm": 10.911568641662598, "learning_rate": 2.728941090108277e-06, "loss": 0.4521, "num_input_tokens_seen": 6424480, "step": 2975 }, { "epoch": 0.546889337493118, "grad_norm": 9.105517387390137, "learning_rate": 2.7335290879060376e-06, "loss": 0.2987, "num_input_tokens_seen": 6435360, "step": 2980 }, { "epoch": 0.5478069370526703, "grad_norm": 9.835237503051758, "learning_rate": 2.7381170857037994e-06, "loss": 0.3887, "num_input_tokens_seen": 6447008, "step": 2985 }, { "epoch": 0.5487245366122224, "grad_norm": 9.877107620239258, "learning_rate": 2.74270508350156e-06, "loss": 0.3497, "num_input_tokens_seen": 6456224, "step": 2990 }, { "epoch": 0.5496421361717746, "grad_norm": 6.3946213722229, "learning_rate": 2.747293081299321e-06, "loss": 0.3983, "num_input_tokens_seen": 6467264, "step": 2995 }, { "epoch": 0.5505597357313269, "grad_norm": 5.885002136230469, "learning_rate": 2.751881079097082e-06, "loss": 0.3519, "num_input_tokens_seen": 6477216, "step": 3000 }, { "epoch": 0.5514773352908791, "grad_norm": 9.784916877746582, "learning_rate": 2.756469076894843e-06, "loss": 0.261, "num_input_tokens_seen": 6487520, "step": 3005 }, { "epoch": 0.5523949348504312, "grad_norm": 10.53094482421875, "learning_rate": 2.7610570746926045e-06, "loss": 0.3541, "num_input_tokens_seen": 6498016, "step": 3010 }, { "epoch": 0.5533125344099835, "grad_norm": 9.909029006958008, "learning_rate": 2.7656450724903654e-06, "loss": 0.356, "num_input_tokens_seen": 6508448, "step": 3015 }, { "epoch": 0.5542301339695357, "grad_norm": 15.705235481262207, "learning_rate": 2.7702330702881264e-06, "loss": 0.3915, "num_input_tokens_seen": 6519744, "step": 3020 }, { "epoch": 0.5551477335290879, "grad_norm": 26.645750045776367, "learning_rate": 2.7748210680858877e-06, "loss": 0.3137, "num_input_tokens_seen": 6530624, "step": 3025 }, { "epoch": 0.5560653330886401, "grad_norm": 4.8282389640808105, "learning_rate": 2.7794090658836487e-06, "loss": 0.3444, "num_input_tokens_seen": 6541344, "step": 3030 }, { "epoch": 0.5569829326481923, "grad_norm": 10.760416030883789, "learning_rate": 2.7839970636814096e-06, "loss": 0.297, "num_input_tokens_seen": 6552256, "step": 3035 }, { "epoch": 0.5579005322077445, "grad_norm": 30.0593318939209, "learning_rate": 2.788585061479171e-06, "loss": 0.4432, "num_input_tokens_seen": 6562304, "step": 3040 }, { "epoch": 0.5588181317672968, "grad_norm": 11.595783233642578, "learning_rate": 2.793173059276932e-06, "loss": 0.3883, "num_input_tokens_seen": 6572640, "step": 3045 }, { "epoch": 0.559735731326849, "grad_norm": 9.40044116973877, "learning_rate": 2.797761057074693e-06, "loss": 0.3406, "num_input_tokens_seen": 6583840, "step": 3050 }, { "epoch": 0.5606533308864011, "grad_norm": 13.187131881713867, "learning_rate": 2.8023490548724542e-06, "loss": 0.3275, "num_input_tokens_seen": 6594656, "step": 3055 }, { "epoch": 0.5615709304459534, "grad_norm": 29.797651290893555, "learning_rate": 2.806937052670215e-06, "loss": 0.3621, "num_input_tokens_seen": 6605568, "step": 3060 }, { "epoch": 0.5624885300055056, "grad_norm": 14.425203323364258, "learning_rate": 2.8115250504679757e-06, "loss": 0.3033, "num_input_tokens_seen": 6617184, "step": 3065 }, { "epoch": 0.5634061295650578, "grad_norm": 21.540559768676758, "learning_rate": 2.8161130482657375e-06, "loss": 0.3246, "num_input_tokens_seen": 6628992, "step": 3070 }, { "epoch": 0.56432372912461, "grad_norm": 10.703221321105957, "learning_rate": 2.820701046063498e-06, "loss": 0.3557, "num_input_tokens_seen": 6638720, "step": 3075 }, { "epoch": 0.5652413286841622, "grad_norm": 19.37401580810547, "learning_rate": 2.825289043861259e-06, "loss": 0.3218, "num_input_tokens_seen": 6649504, "step": 3080 }, { "epoch": 0.5661589282437144, "grad_norm": 10.871063232421875, "learning_rate": 2.8298770416590203e-06, "loss": 0.286, "num_input_tokens_seen": 6660704, "step": 3085 }, { "epoch": 0.5670765278032667, "grad_norm": 12.523116111755371, "learning_rate": 2.834465039456781e-06, "loss": 0.3621, "num_input_tokens_seen": 6672704, "step": 3090 }, { "epoch": 0.5679941273628188, "grad_norm": 15.544677734375, "learning_rate": 2.839053037254542e-06, "loss": 0.3929, "num_input_tokens_seen": 6684928, "step": 3095 }, { "epoch": 0.5689117269223711, "grad_norm": 30.32935905456543, "learning_rate": 2.8436410350523035e-06, "loss": 0.3503, "num_input_tokens_seen": 6695008, "step": 3100 }, { "epoch": 0.5698293264819233, "grad_norm": 30.898849487304688, "learning_rate": 2.8482290328500644e-06, "loss": 0.368, "num_input_tokens_seen": 6705312, "step": 3105 }, { "epoch": 0.5707469260414755, "grad_norm": 23.623760223388672, "learning_rate": 2.8528170306478254e-06, "loss": 0.3972, "num_input_tokens_seen": 6715136, "step": 3110 }, { "epoch": 0.5716645256010278, "grad_norm": 10.526803970336914, "learning_rate": 2.8574050284455867e-06, "loss": 0.3713, "num_input_tokens_seen": 6726368, "step": 3115 }, { "epoch": 0.5725821251605799, "grad_norm": 12.57529354095459, "learning_rate": 2.8619930262433477e-06, "loss": 0.3121, "num_input_tokens_seen": 6735968, "step": 3120 }, { "epoch": 0.5734997247201321, "grad_norm": 16.504013061523438, "learning_rate": 2.8665810240411086e-06, "loss": 0.2923, "num_input_tokens_seen": 6746080, "step": 3125 }, { "epoch": 0.5744173242796844, "grad_norm": 22.475446701049805, "learning_rate": 2.87116902183887e-06, "loss": 0.3081, "num_input_tokens_seen": 6757760, "step": 3130 }, { "epoch": 0.5753349238392366, "grad_norm": 8.243173599243164, "learning_rate": 2.875757019636631e-06, "loss": 0.4196, "num_input_tokens_seen": 6769088, "step": 3135 }, { "epoch": 0.5762525233987887, "grad_norm": 20.046785354614258, "learning_rate": 2.880345017434392e-06, "loss": 0.3705, "num_input_tokens_seen": 6779616, "step": 3140 }, { "epoch": 0.577170122958341, "grad_norm": 18.945396423339844, "learning_rate": 2.8849330152321532e-06, "loss": 0.3565, "num_input_tokens_seen": 6788992, "step": 3145 }, { "epoch": 0.5780877225178932, "grad_norm": 47.9402961730957, "learning_rate": 2.889521013029914e-06, "loss": 0.3642, "num_input_tokens_seen": 6800096, "step": 3150 }, { "epoch": 0.5790053220774454, "grad_norm": 13.380125045776367, "learning_rate": 2.8941090108276747e-06, "loss": 0.3734, "num_input_tokens_seen": 6810976, "step": 3155 }, { "epoch": 0.5799229216369977, "grad_norm": 10.422333717346191, "learning_rate": 2.898697008625436e-06, "loss": 0.3348, "num_input_tokens_seen": 6820896, "step": 3160 }, { "epoch": 0.5808405211965498, "grad_norm": 12.385754585266113, "learning_rate": 2.903285006423197e-06, "loss": 0.3501, "num_input_tokens_seen": 6831584, "step": 3165 }, { "epoch": 0.581758120756102, "grad_norm": 10.643510818481445, "learning_rate": 2.907873004220958e-06, "loss": 0.3362, "num_input_tokens_seen": 6842816, "step": 3170 }, { "epoch": 0.5826757203156543, "grad_norm": 15.434459686279297, "learning_rate": 2.9124610020187193e-06, "loss": 0.3412, "num_input_tokens_seen": 6854400, "step": 3175 }, { "epoch": 0.5835933198752065, "grad_norm": 23.092313766479492, "learning_rate": 2.91704899981648e-06, "loss": 0.3429, "num_input_tokens_seen": 6865216, "step": 3180 }, { "epoch": 0.5845109194347586, "grad_norm": 10.492303848266602, "learning_rate": 2.921636997614241e-06, "loss": 0.3018, "num_input_tokens_seen": 6877024, "step": 3185 }, { "epoch": 0.5854285189943109, "grad_norm": 10.276147842407227, "learning_rate": 2.9262249954120025e-06, "loss": 0.3765, "num_input_tokens_seen": 6887840, "step": 3190 }, { "epoch": 0.5863461185538631, "grad_norm": 10.034799575805664, "learning_rate": 2.9308129932097634e-06, "loss": 0.3287, "num_input_tokens_seen": 6898816, "step": 3195 }, { "epoch": 0.5872637181134153, "grad_norm": 19.591983795166016, "learning_rate": 2.9354009910075244e-06, "loss": 0.382, "num_input_tokens_seen": 6910944, "step": 3200 }, { "epoch": 0.5881813176729676, "grad_norm": 4.566303730010986, "learning_rate": 2.9399889888052857e-06, "loss": 0.3655, "num_input_tokens_seen": 6921152, "step": 3205 }, { "epoch": 0.5890989172325197, "grad_norm": 11.850485801696777, "learning_rate": 2.9445769866030467e-06, "loss": 0.3764, "num_input_tokens_seen": 6931872, "step": 3210 }, { "epoch": 0.5900165167920719, "grad_norm": 5.508541584014893, "learning_rate": 2.9491649844008076e-06, "loss": 0.2919, "num_input_tokens_seen": 6942560, "step": 3215 }, { "epoch": 0.5909341163516242, "grad_norm": 6.8920392990112305, "learning_rate": 2.953752982198569e-06, "loss": 0.2994, "num_input_tokens_seen": 6953472, "step": 3220 }, { "epoch": 0.5918517159111764, "grad_norm": 12.238359451293945, "learning_rate": 2.95834097999633e-06, "loss": 0.3612, "num_input_tokens_seen": 6963904, "step": 3225 }, { "epoch": 0.5927693154707285, "grad_norm": 17.125423431396484, "learning_rate": 2.9629289777940913e-06, "loss": 0.3757, "num_input_tokens_seen": 6975904, "step": 3230 }, { "epoch": 0.5936869150302808, "grad_norm": 11.53808879852295, "learning_rate": 2.9675169755918522e-06, "loss": 0.2926, "num_input_tokens_seen": 6986656, "step": 3235 }, { "epoch": 0.594604514589833, "grad_norm": 10.954652786254883, "learning_rate": 2.9721049733896127e-06, "loss": 0.3531, "num_input_tokens_seen": 6996608, "step": 3240 }, { "epoch": 0.5955221141493852, "grad_norm": 21.360454559326172, "learning_rate": 2.9766929711873745e-06, "loss": 0.336, "num_input_tokens_seen": 7007648, "step": 3245 }, { "epoch": 0.5964397137089374, "grad_norm": 15.253193855285645, "learning_rate": 2.981280968985135e-06, "loss": 0.3388, "num_input_tokens_seen": 7018592, "step": 3250 }, { "epoch": 0.5973573132684896, "grad_norm": 7.478594779968262, "learning_rate": 2.985868966782896e-06, "loss": 0.3402, "num_input_tokens_seen": 7029376, "step": 3255 }, { "epoch": 0.5982749128280418, "grad_norm": 7.206541538238525, "learning_rate": 2.9904569645806573e-06, "loss": 0.3414, "num_input_tokens_seen": 7040384, "step": 3260 }, { "epoch": 0.5991925123875941, "grad_norm": 17.856462478637695, "learning_rate": 2.9950449623784183e-06, "loss": 0.3829, "num_input_tokens_seen": 7051072, "step": 3265 }, { "epoch": 0.6001101119471463, "grad_norm": 7.6076273918151855, "learning_rate": 2.9996329601761792e-06, "loss": 0.3305, "num_input_tokens_seen": 7061824, "step": 3270 }, { "epoch": 0.6010277115066984, "grad_norm": 6.9956817626953125, "learning_rate": 3.0042209579739406e-06, "loss": 0.3484, "num_input_tokens_seen": 7072864, "step": 3275 }, { "epoch": 0.6019453110662507, "grad_norm": 8.87632942199707, "learning_rate": 3.0088089557717015e-06, "loss": 0.3935, "num_input_tokens_seen": 7083968, "step": 3280 }, { "epoch": 0.6028629106258029, "grad_norm": 5.800905227661133, "learning_rate": 3.0133969535694625e-06, "loss": 0.3549, "num_input_tokens_seen": 7094272, "step": 3285 }, { "epoch": 0.6037805101853551, "grad_norm": 16.600725173950195, "learning_rate": 3.017984951367224e-06, "loss": 0.3527, "num_input_tokens_seen": 7105600, "step": 3290 }, { "epoch": 0.6046981097449073, "grad_norm": 9.861617088317871, "learning_rate": 3.0225729491649848e-06, "loss": 0.324, "num_input_tokens_seen": 7115424, "step": 3295 }, { "epoch": 0.6056157093044595, "grad_norm": 11.21810531616211, "learning_rate": 3.0271609469627457e-06, "loss": 0.2903, "num_input_tokens_seen": 7125728, "step": 3300 }, { "epoch": 0.6065333088640118, "grad_norm": 22.553329467773438, "learning_rate": 3.031748944760507e-06, "loss": 0.3572, "num_input_tokens_seen": 7136928, "step": 3305 }, { "epoch": 0.607450908423564, "grad_norm": 11.628507614135742, "learning_rate": 3.036336942558268e-06, "loss": 0.3673, "num_input_tokens_seen": 7146880, "step": 3310 }, { "epoch": 0.6083685079831161, "grad_norm": 5.109407424926758, "learning_rate": 3.0409249403560285e-06, "loss": 0.34, "num_input_tokens_seen": 7158304, "step": 3315 }, { "epoch": 0.6092861075426684, "grad_norm": 7.3805646896362305, "learning_rate": 3.0455129381537903e-06, "loss": 0.3378, "num_input_tokens_seen": 7169728, "step": 3320 }, { "epoch": 0.6102037071022206, "grad_norm": 5.853867053985596, "learning_rate": 3.050100935951551e-06, "loss": 0.3326, "num_input_tokens_seen": 7180576, "step": 3325 }, { "epoch": 0.6111213066617728, "grad_norm": 24.165937423706055, "learning_rate": 3.0546889337493117e-06, "loss": 0.3438, "num_input_tokens_seen": 7190944, "step": 3330 }, { "epoch": 0.6120389062213251, "grad_norm": 23.390417098999023, "learning_rate": 3.059276931547073e-06, "loss": 0.3176, "num_input_tokens_seen": 7202400, "step": 3335 }, { "epoch": 0.6129565057808772, "grad_norm": 11.873044967651367, "learning_rate": 3.063864929344834e-06, "loss": 0.3037, "num_input_tokens_seen": 7214016, "step": 3340 }, { "epoch": 0.6138741053404294, "grad_norm": 14.28446102142334, "learning_rate": 3.068452927142595e-06, "loss": 0.3663, "num_input_tokens_seen": 7224320, "step": 3345 }, { "epoch": 0.6147917048999817, "grad_norm": 50.26692199707031, "learning_rate": 3.0730409249403563e-06, "loss": 0.4482, "num_input_tokens_seen": 7235584, "step": 3350 }, { "epoch": 0.6157093044595339, "grad_norm": 14.039661407470703, "learning_rate": 3.0776289227381173e-06, "loss": 0.3469, "num_input_tokens_seen": 7246976, "step": 3355 }, { "epoch": 0.616626904019086, "grad_norm": 19.763500213623047, "learning_rate": 3.0822169205358782e-06, "loss": 0.3994, "num_input_tokens_seen": 7258208, "step": 3360 }, { "epoch": 0.6175445035786383, "grad_norm": 5.944614410400391, "learning_rate": 3.0868049183336396e-06, "loss": 0.3599, "num_input_tokens_seen": 7269344, "step": 3365 }, { "epoch": 0.6184621031381905, "grad_norm": 7.324315547943115, "learning_rate": 3.0913929161314005e-06, "loss": 0.3359, "num_input_tokens_seen": 7280448, "step": 3370 }, { "epoch": 0.6193797026977427, "grad_norm": 12.683117866516113, "learning_rate": 3.0959809139291615e-06, "loss": 0.3905, "num_input_tokens_seen": 7291872, "step": 3375 }, { "epoch": 0.620297302257295, "grad_norm": 7.223912715911865, "learning_rate": 3.100568911726923e-06, "loss": 0.3155, "num_input_tokens_seen": 7302496, "step": 3380 }, { "epoch": 0.6212149018168471, "grad_norm": 17.984403610229492, "learning_rate": 3.1051569095246838e-06, "loss": 0.3576, "num_input_tokens_seen": 7313024, "step": 3385 }, { "epoch": 0.6221325013763993, "grad_norm": 11.878480911254883, "learning_rate": 3.1097449073224447e-06, "loss": 0.3605, "num_input_tokens_seen": 7324768, "step": 3390 }, { "epoch": 0.6230501009359516, "grad_norm": 12.02227783203125, "learning_rate": 3.114332905120206e-06, "loss": 0.3671, "num_input_tokens_seen": 7335488, "step": 3395 }, { "epoch": 0.6239677004955038, "grad_norm": 5.356926441192627, "learning_rate": 3.1189209029179666e-06, "loss": 0.3871, "num_input_tokens_seen": 7346144, "step": 3400 }, { "epoch": 0.6248853000550559, "grad_norm": 5.36794376373291, "learning_rate": 3.1235089007157275e-06, "loss": 0.3849, "num_input_tokens_seen": 7357824, "step": 3405 }, { "epoch": 0.6258028996146082, "grad_norm": 14.09557056427002, "learning_rate": 3.128096898513489e-06, "loss": 0.3459, "num_input_tokens_seen": 7369056, "step": 3410 }, { "epoch": 0.6267204991741604, "grad_norm": 9.643715858459473, "learning_rate": 3.13268489631125e-06, "loss": 0.3414, "num_input_tokens_seen": 7378496, "step": 3415 }, { "epoch": 0.6276380987337126, "grad_norm": 8.92561149597168, "learning_rate": 3.1372728941090108e-06, "loss": 0.326, "num_input_tokens_seen": 7389088, "step": 3420 }, { "epoch": 0.6285556982932649, "grad_norm": 7.440300941467285, "learning_rate": 3.141860891906772e-06, "loss": 0.3017, "num_input_tokens_seen": 7399904, "step": 3425 }, { "epoch": 0.629473297852817, "grad_norm": 13.274091720581055, "learning_rate": 3.146448889704533e-06, "loss": 0.3708, "num_input_tokens_seen": 7411072, "step": 3430 }, { "epoch": 0.6303908974123692, "grad_norm": 18.825361251831055, "learning_rate": 3.1510368875022944e-06, "loss": 0.407, "num_input_tokens_seen": 7422432, "step": 3435 }, { "epoch": 0.6313084969719215, "grad_norm": 23.324567794799805, "learning_rate": 3.1556248853000554e-06, "loss": 0.3447, "num_input_tokens_seen": 7432672, "step": 3440 }, { "epoch": 0.6322260965314737, "grad_norm": 9.605603218078613, "learning_rate": 3.1602128830978163e-06, "loss": 0.3279, "num_input_tokens_seen": 7444384, "step": 3445 }, { "epoch": 0.6331436960910258, "grad_norm": 5.169836044311523, "learning_rate": 3.1648008808955777e-06, "loss": 0.2882, "num_input_tokens_seen": 7455328, "step": 3450 }, { "epoch": 0.6340612956505781, "grad_norm": 12.339884757995605, "learning_rate": 3.1693888786933386e-06, "loss": 0.3862, "num_input_tokens_seen": 7466656, "step": 3455 }, { "epoch": 0.6349788952101303, "grad_norm": 19.44207000732422, "learning_rate": 3.1739768764910995e-06, "loss": 0.2952, "num_input_tokens_seen": 7476992, "step": 3460 }, { "epoch": 0.6358964947696825, "grad_norm": 10.201376914978027, "learning_rate": 3.178564874288861e-06, "loss": 0.3915, "num_input_tokens_seen": 7487520, "step": 3465 }, { "epoch": 0.6368140943292347, "grad_norm": 8.68386459350586, "learning_rate": 3.183152872086622e-06, "loss": 0.3134, "num_input_tokens_seen": 7498624, "step": 3470 }, { "epoch": 0.6377316938887869, "grad_norm": 62.84284973144531, "learning_rate": 3.1877408698843828e-06, "loss": 0.3416, "num_input_tokens_seen": 7509440, "step": 3475 }, { "epoch": 0.6386492934483391, "grad_norm": 6.370108604431152, "learning_rate": 3.192328867682144e-06, "loss": 0.4268, "num_input_tokens_seen": 7520480, "step": 3480 }, { "epoch": 0.6395668930078914, "grad_norm": 13.801281929016113, "learning_rate": 3.196916865479905e-06, "loss": 0.2998, "num_input_tokens_seen": 7532096, "step": 3485 }, { "epoch": 0.6404844925674436, "grad_norm": 35.171329498291016, "learning_rate": 3.2015048632776656e-06, "loss": 0.457, "num_input_tokens_seen": 7542240, "step": 3490 }, { "epoch": 0.6414020921269957, "grad_norm": 11.297736167907715, "learning_rate": 3.2060928610754274e-06, "loss": 0.3083, "num_input_tokens_seen": 7552160, "step": 3495 }, { "epoch": 0.642319691686548, "grad_norm": 9.632706642150879, "learning_rate": 3.210680858873188e-06, "loss": 0.2367, "num_input_tokens_seen": 7563200, "step": 3500 }, { "epoch": 0.6432372912461002, "grad_norm": 3.481294631958008, "learning_rate": 3.215268856670949e-06, "loss": 0.2936, "num_input_tokens_seen": 7573664, "step": 3505 }, { "epoch": 0.6441548908056524, "grad_norm": 4.253149032592773, "learning_rate": 3.21985685446871e-06, "loss": 0.2815, "num_input_tokens_seen": 7583488, "step": 3510 }, { "epoch": 0.6450724903652046, "grad_norm": 15.570107460021973, "learning_rate": 3.224444852266471e-06, "loss": 0.3532, "num_input_tokens_seen": 7594880, "step": 3515 }, { "epoch": 0.6459900899247568, "grad_norm": 12.386420249938965, "learning_rate": 3.229032850064232e-06, "loss": 0.4104, "num_input_tokens_seen": 7605152, "step": 3520 }, { "epoch": 0.6469076894843091, "grad_norm": 56.24921417236328, "learning_rate": 3.2336208478619934e-06, "loss": 0.3764, "num_input_tokens_seen": 7616672, "step": 3525 }, { "epoch": 0.6478252890438613, "grad_norm": 12.257161140441895, "learning_rate": 3.2382088456597544e-06, "loss": 0.2895, "num_input_tokens_seen": 7626944, "step": 3530 }, { "epoch": 0.6487428886034134, "grad_norm": 33.46105194091797, "learning_rate": 3.2427968434575153e-06, "loss": 0.3955, "num_input_tokens_seen": 7637920, "step": 3535 }, { "epoch": 0.6496604881629657, "grad_norm": 5.272995948791504, "learning_rate": 3.2473848412552767e-06, "loss": 0.4082, "num_input_tokens_seen": 7647456, "step": 3540 }, { "epoch": 0.6505780877225179, "grad_norm": 5.49667501449585, "learning_rate": 3.2519728390530376e-06, "loss": 0.2825, "num_input_tokens_seen": 7658176, "step": 3545 }, { "epoch": 0.6514956872820701, "grad_norm": 4.965662956237793, "learning_rate": 3.2565608368507985e-06, "loss": 0.3525, "num_input_tokens_seen": 7668448, "step": 3550 }, { "epoch": 0.6524132868416224, "grad_norm": 7.673798561096191, "learning_rate": 3.26114883464856e-06, "loss": 0.371, "num_input_tokens_seen": 7679008, "step": 3555 }, { "epoch": 0.6533308864011745, "grad_norm": 14.380219459533691, "learning_rate": 3.265736832446321e-06, "loss": 0.2392, "num_input_tokens_seen": 7689696, "step": 3560 }, { "epoch": 0.6542484859607267, "grad_norm": 18.36910057067871, "learning_rate": 3.2703248302440814e-06, "loss": 0.4955, "num_input_tokens_seen": 7699968, "step": 3565 }, { "epoch": 0.655166085520279, "grad_norm": 7.956936359405518, "learning_rate": 3.274912828041843e-06, "loss": 0.3736, "num_input_tokens_seen": 7710976, "step": 3570 }, { "epoch": 0.6560836850798312, "grad_norm": 16.563827514648438, "learning_rate": 3.2795008258396037e-06, "loss": 0.2786, "num_input_tokens_seen": 7720640, "step": 3575 }, { "epoch": 0.6570012846393833, "grad_norm": 8.59093952178955, "learning_rate": 3.2840888236373646e-06, "loss": 0.4479, "num_input_tokens_seen": 7730816, "step": 3580 }, { "epoch": 0.6579188841989356, "grad_norm": 9.112879753112793, "learning_rate": 3.288676821435126e-06, "loss": 0.2917, "num_input_tokens_seen": 7741696, "step": 3585 }, { "epoch": 0.6588364837584878, "grad_norm": 16.07282066345215, "learning_rate": 3.293264819232887e-06, "loss": 0.4147, "num_input_tokens_seen": 7753216, "step": 3590 }, { "epoch": 0.65975408331804, "grad_norm": 14.574568748474121, "learning_rate": 3.297852817030648e-06, "loss": 0.369, "num_input_tokens_seen": 7763136, "step": 3595 }, { "epoch": 0.6606716828775923, "grad_norm": 11.914022445678711, "learning_rate": 3.302440814828409e-06, "loss": 0.2913, "num_input_tokens_seen": 7773664, "step": 3600 }, { "epoch": 0.6615892824371444, "grad_norm": 27.24892807006836, "learning_rate": 3.30702881262617e-06, "loss": 0.3354, "num_input_tokens_seen": 7782592, "step": 3605 }, { "epoch": 0.6625068819966966, "grad_norm": 5.2525835037231445, "learning_rate": 3.311616810423931e-06, "loss": 0.3516, "num_input_tokens_seen": 7792608, "step": 3610 }, { "epoch": 0.6634244815562489, "grad_norm": 17.23410987854004, "learning_rate": 3.3162048082216924e-06, "loss": 0.3447, "num_input_tokens_seen": 7803392, "step": 3615 }, { "epoch": 0.6643420811158011, "grad_norm": 6.510613918304443, "learning_rate": 3.3207928060194534e-06, "loss": 0.3595, "num_input_tokens_seen": 7815232, "step": 3620 }, { "epoch": 0.6652596806753532, "grad_norm": 10.129454612731934, "learning_rate": 3.3253808038172143e-06, "loss": 0.3139, "num_input_tokens_seen": 7825952, "step": 3625 }, { "epoch": 0.6661772802349055, "grad_norm": 7.695369243621826, "learning_rate": 3.3299688016149757e-06, "loss": 0.3942, "num_input_tokens_seen": 7838240, "step": 3630 }, { "epoch": 0.6670948797944577, "grad_norm": 4.481592178344727, "learning_rate": 3.3345567994127366e-06, "loss": 0.3077, "num_input_tokens_seen": 7849408, "step": 3635 }, { "epoch": 0.6680124793540099, "grad_norm": 13.401658058166504, "learning_rate": 3.3391447972104975e-06, "loss": 0.2975, "num_input_tokens_seen": 7860064, "step": 3640 }, { "epoch": 0.6689300789135622, "grad_norm": 67.05248260498047, "learning_rate": 3.343732795008259e-06, "loss": 0.4397, "num_input_tokens_seen": 7870624, "step": 3645 }, { "epoch": 0.6698476784731143, "grad_norm": 12.6394681930542, "learning_rate": 3.3483207928060194e-06, "loss": 0.3372, "num_input_tokens_seen": 7880672, "step": 3650 }, { "epoch": 0.6707652780326665, "grad_norm": 13.858123779296875, "learning_rate": 3.352908790603781e-06, "loss": 0.3225, "num_input_tokens_seen": 7890816, "step": 3655 }, { "epoch": 0.6716828775922188, "grad_norm": 8.40493392944336, "learning_rate": 3.3574967884015417e-06, "loss": 0.3199, "num_input_tokens_seen": 7900608, "step": 3660 }, { "epoch": 0.672600477151771, "grad_norm": 5.418062686920166, "learning_rate": 3.3620847861993027e-06, "loss": 0.2936, "num_input_tokens_seen": 7911360, "step": 3665 }, { "epoch": 0.6735180767113231, "grad_norm": 7.795927047729492, "learning_rate": 3.366672783997064e-06, "loss": 0.3624, "num_input_tokens_seen": 7922176, "step": 3670 }, { "epoch": 0.6744356762708754, "grad_norm": 26.342300415039062, "learning_rate": 3.371260781794825e-06, "loss": 0.3003, "num_input_tokens_seen": 7933504, "step": 3675 }, { "epoch": 0.6753532758304276, "grad_norm": 8.749773979187012, "learning_rate": 3.375848779592586e-06, "loss": 0.3485, "num_input_tokens_seen": 7943584, "step": 3680 }, { "epoch": 0.6762708753899798, "grad_norm": 8.522345542907715, "learning_rate": 3.3804367773903473e-06, "loss": 0.3553, "num_input_tokens_seen": 7954144, "step": 3685 }, { "epoch": 0.677188474949532, "grad_norm": 13.875625610351562, "learning_rate": 3.385024775188108e-06, "loss": 0.2694, "num_input_tokens_seen": 7965696, "step": 3690 }, { "epoch": 0.6781060745090842, "grad_norm": 7.5518317222595215, "learning_rate": 3.389612772985869e-06, "loss": 0.359, "num_input_tokens_seen": 7976960, "step": 3695 }, { "epoch": 0.6790236740686364, "grad_norm": 4.878243446350098, "learning_rate": 3.3942007707836305e-06, "loss": 0.3062, "num_input_tokens_seen": 7988320, "step": 3700 }, { "epoch": 0.6799412736281887, "grad_norm": 8.9259614944458, "learning_rate": 3.3987887685813914e-06, "loss": 0.3235, "num_input_tokens_seen": 7997696, "step": 3705 }, { "epoch": 0.6808588731877409, "grad_norm": 4.3998894691467285, "learning_rate": 3.4033767663791524e-06, "loss": 0.3051, "num_input_tokens_seen": 8009760, "step": 3710 }, { "epoch": 0.681776472747293, "grad_norm": 12.25780963897705, "learning_rate": 3.4079647641769137e-06, "loss": 0.3951, "num_input_tokens_seen": 8021344, "step": 3715 }, { "epoch": 0.6826940723068453, "grad_norm": 53.0804443359375, "learning_rate": 3.4125527619746747e-06, "loss": 0.4122, "num_input_tokens_seen": 8031232, "step": 3720 }, { "epoch": 0.6836116718663975, "grad_norm": 6.481688499450684, "learning_rate": 3.4171407597724356e-06, "loss": 0.3452, "num_input_tokens_seen": 8042272, "step": 3725 }, { "epoch": 0.6845292714259498, "grad_norm": 11.176430702209473, "learning_rate": 3.421728757570197e-06, "loss": 0.3286, "num_input_tokens_seen": 8053952, "step": 3730 }, { "epoch": 0.6854468709855019, "grad_norm": 21.40456199645996, "learning_rate": 3.426316755367958e-06, "loss": 0.3189, "num_input_tokens_seen": 8064608, "step": 3735 }, { "epoch": 0.6863644705450541, "grad_norm": 36.797935485839844, "learning_rate": 3.4309047531657184e-06, "loss": 0.5375, "num_input_tokens_seen": 8076416, "step": 3740 }, { "epoch": 0.6872820701046064, "grad_norm": 13.537101745605469, "learning_rate": 3.4354927509634802e-06, "loss": 0.3461, "num_input_tokens_seen": 8087776, "step": 3745 }, { "epoch": 0.6881996696641586, "grad_norm": 7.871702194213867, "learning_rate": 3.4400807487612407e-06, "loss": 0.2482, "num_input_tokens_seen": 8097824, "step": 3750 }, { "epoch": 0.6891172692237107, "grad_norm": 15.743928909301758, "learning_rate": 3.4446687465590017e-06, "loss": 0.4151, "num_input_tokens_seen": 8109184, "step": 3755 }, { "epoch": 0.690034868783263, "grad_norm": 3.5473568439483643, "learning_rate": 3.449256744356763e-06, "loss": 0.4189, "num_input_tokens_seen": 8121536, "step": 3760 }, { "epoch": 0.6909524683428152, "grad_norm": 7.783815860748291, "learning_rate": 3.453844742154524e-06, "loss": 0.3378, "num_input_tokens_seen": 8132352, "step": 3765 }, { "epoch": 0.6918700679023674, "grad_norm": 4.0930047035217285, "learning_rate": 3.458432739952285e-06, "loss": 0.323, "num_input_tokens_seen": 8143904, "step": 3770 }, { "epoch": 0.6927876674619197, "grad_norm": 21.042234420776367, "learning_rate": 3.4630207377500463e-06, "loss": 0.3571, "num_input_tokens_seen": 8154368, "step": 3775 }, { "epoch": 0.6937052670214718, "grad_norm": 5.876950740814209, "learning_rate": 3.467608735547807e-06, "loss": 0.2938, "num_input_tokens_seen": 8163584, "step": 3780 }, { "epoch": 0.694622866581024, "grad_norm": 7.09097957611084, "learning_rate": 3.472196733345568e-06, "loss": 0.4219, "num_input_tokens_seen": 8173728, "step": 3785 }, { "epoch": 0.6955404661405763, "grad_norm": 12.544437408447266, "learning_rate": 3.4767847311433295e-06, "loss": 0.3404, "num_input_tokens_seen": 8184160, "step": 3790 }, { "epoch": 0.6964580657001285, "grad_norm": 14.281872749328613, "learning_rate": 3.4813727289410904e-06, "loss": 0.2919, "num_input_tokens_seen": 8194976, "step": 3795 }, { "epoch": 0.6973756652596806, "grad_norm": 14.995004653930664, "learning_rate": 3.4859607267388514e-06, "loss": 0.3341, "num_input_tokens_seen": 8206528, "step": 3800 }, { "epoch": 0.6982932648192329, "grad_norm": 18.282285690307617, "learning_rate": 3.4905487245366127e-06, "loss": 0.3173, "num_input_tokens_seen": 8215712, "step": 3805 }, { "epoch": 0.6992108643787851, "grad_norm": 4.910135269165039, "learning_rate": 3.4951367223343737e-06, "loss": 0.3164, "num_input_tokens_seen": 8226336, "step": 3810 }, { "epoch": 0.7001284639383373, "grad_norm": 6.760330677032471, "learning_rate": 3.499724720132134e-06, "loss": 0.3076, "num_input_tokens_seen": 8237792, "step": 3815 }, { "epoch": 0.7010460634978896, "grad_norm": 12.28083610534668, "learning_rate": 3.504312717929896e-06, "loss": 0.2599, "num_input_tokens_seen": 8248640, "step": 3820 }, { "epoch": 0.7019636630574417, "grad_norm": 5.652581214904785, "learning_rate": 3.5089007157276565e-06, "loss": 0.237, "num_input_tokens_seen": 8259264, "step": 3825 }, { "epoch": 0.7028812626169939, "grad_norm": 19.241134643554688, "learning_rate": 3.5134887135254174e-06, "loss": 0.3171, "num_input_tokens_seen": 8271040, "step": 3830 }, { "epoch": 0.7037988621765462, "grad_norm": 7.0034379959106445, "learning_rate": 3.518076711323179e-06, "loss": 0.2716, "num_input_tokens_seen": 8281920, "step": 3835 }, { "epoch": 0.7047164617360984, "grad_norm": 25.60723114013672, "learning_rate": 3.5226647091209397e-06, "loss": 0.3945, "num_input_tokens_seen": 8292512, "step": 3840 }, { "epoch": 0.7056340612956505, "grad_norm": 7.343397617340088, "learning_rate": 3.5272527069187007e-06, "loss": 0.6963, "num_input_tokens_seen": 8302784, "step": 3845 }, { "epoch": 0.7065516608552028, "grad_norm": 14.580601692199707, "learning_rate": 3.531840704716462e-06, "loss": 0.3281, "num_input_tokens_seen": 8314208, "step": 3850 }, { "epoch": 0.707469260414755, "grad_norm": 6.917762279510498, "learning_rate": 3.536428702514223e-06, "loss": 0.3048, "num_input_tokens_seen": 8324448, "step": 3855 }, { "epoch": 0.7083868599743072, "grad_norm": 35.13231658935547, "learning_rate": 3.5410167003119843e-06, "loss": 0.3507, "num_input_tokens_seen": 8334048, "step": 3860 }, { "epoch": 0.7093044595338595, "grad_norm": 24.921194076538086, "learning_rate": 3.5456046981097453e-06, "loss": 0.3379, "num_input_tokens_seen": 8344384, "step": 3865 }, { "epoch": 0.7102220590934116, "grad_norm": 10.373514175415039, "learning_rate": 3.5501926959075062e-06, "loss": 0.2001, "num_input_tokens_seen": 8354784, "step": 3870 }, { "epoch": 0.7111396586529638, "grad_norm": 4.369505405426025, "learning_rate": 3.5547806937052676e-06, "loss": 0.3605, "num_input_tokens_seen": 8365184, "step": 3875 }, { "epoch": 0.7120572582125161, "grad_norm": 13.928075790405273, "learning_rate": 3.5593686915030285e-06, "loss": 0.3251, "num_input_tokens_seen": 8375936, "step": 3880 }, { "epoch": 0.7129748577720683, "grad_norm": 8.676788330078125, "learning_rate": 3.5639566893007895e-06, "loss": 0.3858, "num_input_tokens_seen": 8387488, "step": 3885 }, { "epoch": 0.7138924573316204, "grad_norm": 11.827277183532715, "learning_rate": 3.568544687098551e-06, "loss": 0.3785, "num_input_tokens_seen": 8397952, "step": 3890 }, { "epoch": 0.7148100568911727, "grad_norm": 5.201151371002197, "learning_rate": 3.5731326848963118e-06, "loss": 0.4019, "num_input_tokens_seen": 8409696, "step": 3895 }, { "epoch": 0.7157276564507249, "grad_norm": 6.575876235961914, "learning_rate": 3.5777206826940723e-06, "loss": 0.315, "num_input_tokens_seen": 8420608, "step": 3900 }, { "epoch": 0.7166452560102771, "grad_norm": 12.60274600982666, "learning_rate": 3.582308680491834e-06, "loss": 0.3348, "num_input_tokens_seen": 8432448, "step": 3905 }, { "epoch": 0.7175628555698293, "grad_norm": 14.900016784667969, "learning_rate": 3.5868966782895946e-06, "loss": 0.2922, "num_input_tokens_seen": 8441952, "step": 3910 }, { "epoch": 0.7184804551293815, "grad_norm": 15.422626495361328, "learning_rate": 3.5914846760873555e-06, "loss": 0.3558, "num_input_tokens_seen": 8451840, "step": 3915 }, { "epoch": 0.7193980546889337, "grad_norm": 7.175503730773926, "learning_rate": 3.596072673885117e-06, "loss": 0.3803, "num_input_tokens_seen": 8462400, "step": 3920 }, { "epoch": 0.720315654248486, "grad_norm": 5.893672466278076, "learning_rate": 3.600660671682878e-06, "loss": 0.3036, "num_input_tokens_seen": 8472992, "step": 3925 }, { "epoch": 0.7212332538080382, "grad_norm": 47.65228271484375, "learning_rate": 3.6052486694806387e-06, "loss": 0.3431, "num_input_tokens_seen": 8483616, "step": 3930 }, { "epoch": 0.7221508533675903, "grad_norm": 5.916916847229004, "learning_rate": 3.6098366672784e-06, "loss": 0.3483, "num_input_tokens_seen": 8494144, "step": 3935 }, { "epoch": 0.7230684529271426, "grad_norm": 10.376957893371582, "learning_rate": 3.614424665076161e-06, "loss": 0.5009, "num_input_tokens_seen": 8505152, "step": 3940 }, { "epoch": 0.7239860524866948, "grad_norm": 8.85330581665039, "learning_rate": 3.619012662873922e-06, "loss": 0.3252, "num_input_tokens_seen": 8516480, "step": 3945 }, { "epoch": 0.7249036520462471, "grad_norm": 7.468721866607666, "learning_rate": 3.6236006606716833e-06, "loss": 0.354, "num_input_tokens_seen": 8527744, "step": 3950 }, { "epoch": 0.7258212516057992, "grad_norm": 9.994597434997559, "learning_rate": 3.6281886584694443e-06, "loss": 0.3239, "num_input_tokens_seen": 8539072, "step": 3955 }, { "epoch": 0.7267388511653514, "grad_norm": 44.8820686340332, "learning_rate": 3.6327766562672052e-06, "loss": 0.4249, "num_input_tokens_seen": 8549824, "step": 3960 }, { "epoch": 0.7276564507249037, "grad_norm": 5.0782151222229, "learning_rate": 3.6373646540649666e-06, "loss": 0.2694, "num_input_tokens_seen": 8561280, "step": 3965 }, { "epoch": 0.7285740502844559, "grad_norm": 6.402698993682861, "learning_rate": 3.6419526518627275e-06, "loss": 0.2725, "num_input_tokens_seen": 8571648, "step": 3970 }, { "epoch": 0.729491649844008, "grad_norm": 8.650310516357422, "learning_rate": 3.6465406496604885e-06, "loss": 0.3692, "num_input_tokens_seen": 8582400, "step": 3975 }, { "epoch": 0.7304092494035603, "grad_norm": 4.276065826416016, "learning_rate": 3.65112864745825e-06, "loss": 0.2903, "num_input_tokens_seen": 8592064, "step": 3980 }, { "epoch": 0.7313268489631125, "grad_norm": 42.80140686035156, "learning_rate": 3.6557166452560108e-06, "loss": 0.3531, "num_input_tokens_seen": 8602880, "step": 3985 }, { "epoch": 0.7322444485226647, "grad_norm": 27.114030838012695, "learning_rate": 3.6603046430537713e-06, "loss": 0.4363, "num_input_tokens_seen": 8614656, "step": 3990 }, { "epoch": 0.733162048082217, "grad_norm": 4.705941677093506, "learning_rate": 3.6648926408515326e-06, "loss": 0.3039, "num_input_tokens_seen": 8626304, "step": 3995 }, { "epoch": 0.7340796476417691, "grad_norm": 8.490904808044434, "learning_rate": 3.6694806386492936e-06, "loss": 0.31, "num_input_tokens_seen": 8636096, "step": 4000 }, { "epoch": 0.7349972472013213, "grad_norm": 6.077770709991455, "learning_rate": 3.6740686364470545e-06, "loss": 0.2858, "num_input_tokens_seen": 8647232, "step": 4005 }, { "epoch": 0.7359148467608736, "grad_norm": 17.42539405822754, "learning_rate": 3.678656634244816e-06, "loss": 0.4506, "num_input_tokens_seen": 8657088, "step": 4010 }, { "epoch": 0.7368324463204258, "grad_norm": 3.268547534942627, "learning_rate": 3.683244632042577e-06, "loss": 0.3249, "num_input_tokens_seen": 8667712, "step": 4015 }, { "epoch": 0.737750045879978, "grad_norm": 7.13576078414917, "learning_rate": 3.6878326298403378e-06, "loss": 0.3107, "num_input_tokens_seen": 8679296, "step": 4020 }, { "epoch": 0.7386676454395302, "grad_norm": 4.219841480255127, "learning_rate": 3.692420627638099e-06, "loss": 0.5316, "num_input_tokens_seen": 8689728, "step": 4025 }, { "epoch": 0.7395852449990824, "grad_norm": 3.9835033416748047, "learning_rate": 3.69700862543586e-06, "loss": 0.3627, "num_input_tokens_seen": 8700320, "step": 4030 }, { "epoch": 0.7405028445586346, "grad_norm": 5.129510402679443, "learning_rate": 3.701596623233621e-06, "loss": 0.3406, "num_input_tokens_seen": 8711680, "step": 4035 }, { "epoch": 0.7414204441181869, "grad_norm": 6.267297744750977, "learning_rate": 3.7061846210313824e-06, "loss": 0.3575, "num_input_tokens_seen": 8723168, "step": 4040 }, { "epoch": 0.742338043677739, "grad_norm": 5.122734546661377, "learning_rate": 3.7107726188291433e-06, "loss": 0.3078, "num_input_tokens_seen": 8733568, "step": 4045 }, { "epoch": 0.7432556432372912, "grad_norm": 12.294049263000488, "learning_rate": 3.7153606166269042e-06, "loss": 0.3902, "num_input_tokens_seen": 8744672, "step": 4050 }, { "epoch": 0.7441732427968435, "grad_norm": 13.82618522644043, "learning_rate": 3.7199486144246656e-06, "loss": 0.318, "num_input_tokens_seen": 8754656, "step": 4055 }, { "epoch": 0.7450908423563957, "grad_norm": 7.800384521484375, "learning_rate": 3.7245366122224265e-06, "loss": 0.3088, "num_input_tokens_seen": 8764096, "step": 4060 }, { "epoch": 0.7460084419159478, "grad_norm": 8.217744827270508, "learning_rate": 3.729124610020187e-06, "loss": 0.29, "num_input_tokens_seen": 8773664, "step": 4065 }, { "epoch": 0.7469260414755001, "grad_norm": 9.172096252441406, "learning_rate": 3.733712607817949e-06, "loss": 0.3251, "num_input_tokens_seen": 8785376, "step": 4070 }, { "epoch": 0.7478436410350523, "grad_norm": 11.504594802856445, "learning_rate": 3.7383006056157093e-06, "loss": 0.2568, "num_input_tokens_seen": 8796128, "step": 4075 }, { "epoch": 0.7487612405946045, "grad_norm": 14.79405403137207, "learning_rate": 3.742888603413471e-06, "loss": 0.3719, "num_input_tokens_seen": 8806688, "step": 4080 }, { "epoch": 0.7496788401541568, "grad_norm": 27.13628578186035, "learning_rate": 3.7474766012112316e-06, "loss": 0.2915, "num_input_tokens_seen": 8817696, "step": 4085 }, { "epoch": 0.7505964397137089, "grad_norm": 8.300667762756348, "learning_rate": 3.7520645990089926e-06, "loss": 0.3175, "num_input_tokens_seen": 8826816, "step": 4090 }, { "epoch": 0.7515140392732611, "grad_norm": 4.577517509460449, "learning_rate": 3.756652596806754e-06, "loss": 0.3729, "num_input_tokens_seen": 8837568, "step": 4095 }, { "epoch": 0.7524316388328134, "grad_norm": 18.622425079345703, "learning_rate": 3.761240594604515e-06, "loss": 0.3398, "num_input_tokens_seen": 8848576, "step": 4100 }, { "epoch": 0.7533492383923656, "grad_norm": 15.198208808898926, "learning_rate": 3.765828592402276e-06, "loss": 0.3219, "num_input_tokens_seen": 8859264, "step": 4105 }, { "epoch": 0.7542668379519177, "grad_norm": 27.101232528686523, "learning_rate": 3.770416590200037e-06, "loss": 0.3165, "num_input_tokens_seen": 8869824, "step": 4110 }, { "epoch": 0.75518443751147, "grad_norm": 18.167205810546875, "learning_rate": 3.775004587997798e-06, "loss": 0.3603, "num_input_tokens_seen": 8880416, "step": 4115 }, { "epoch": 0.7561020370710222, "grad_norm": 33.87724304199219, "learning_rate": 3.779592585795559e-06, "loss": 0.3729, "num_input_tokens_seen": 8890976, "step": 4120 }, { "epoch": 0.7570196366305744, "grad_norm": 5.579159736633301, "learning_rate": 3.7841805835933204e-06, "loss": 0.3339, "num_input_tokens_seen": 8901600, "step": 4125 }, { "epoch": 0.7579372361901267, "grad_norm": 6.831249237060547, "learning_rate": 3.7887685813910814e-06, "loss": 0.3617, "num_input_tokens_seen": 8912160, "step": 4130 }, { "epoch": 0.7588548357496788, "grad_norm": 4.272757530212402, "learning_rate": 3.7933565791888423e-06, "loss": 0.2941, "num_input_tokens_seen": 8922880, "step": 4135 }, { "epoch": 0.759772435309231, "grad_norm": 6.417923927307129, "learning_rate": 3.7979445769866037e-06, "loss": 0.3775, "num_input_tokens_seen": 8932768, "step": 4140 }, { "epoch": 0.7606900348687833, "grad_norm": 7.67933464050293, "learning_rate": 3.8025325747843646e-06, "loss": 0.3728, "num_input_tokens_seen": 8944288, "step": 4145 }, { "epoch": 0.7616076344283355, "grad_norm": 23.740388870239258, "learning_rate": 3.807120572582125e-06, "loss": 0.3836, "num_input_tokens_seen": 8956032, "step": 4150 }, { "epoch": 0.7625252339878877, "grad_norm": 21.05784034729004, "learning_rate": 3.811708570379887e-06, "loss": 0.4763, "num_input_tokens_seen": 8966400, "step": 4155 }, { "epoch": 0.7634428335474399, "grad_norm": 3.6248903274536133, "learning_rate": 3.816296568177648e-06, "loss": 0.3678, "num_input_tokens_seen": 8978208, "step": 4160 }, { "epoch": 0.7643604331069921, "grad_norm": 6.400071620941162, "learning_rate": 3.820884565975408e-06, "loss": 0.3567, "num_input_tokens_seen": 8988416, "step": 4165 }, { "epoch": 0.7652780326665444, "grad_norm": 4.875039100646973, "learning_rate": 3.82547256377317e-06, "loss": 0.3259, "num_input_tokens_seen": 9001088, "step": 4170 }, { "epoch": 0.7661956322260965, "grad_norm": 6.073320388793945, "learning_rate": 3.830060561570931e-06, "loss": 0.3079, "num_input_tokens_seen": 9012928, "step": 4175 }, { "epoch": 0.7671132317856487, "grad_norm": 15.109795570373535, "learning_rate": 3.834648559368692e-06, "loss": 0.4055, "num_input_tokens_seen": 9023104, "step": 4180 }, { "epoch": 0.768030831345201, "grad_norm": 8.270118713378906, "learning_rate": 3.839236557166453e-06, "loss": 0.3495, "num_input_tokens_seen": 9034560, "step": 4185 }, { "epoch": 0.7689484309047532, "grad_norm": 24.577180862426758, "learning_rate": 3.843824554964214e-06, "loss": 0.378, "num_input_tokens_seen": 9045184, "step": 4190 }, { "epoch": 0.7698660304643054, "grad_norm": 7.5135393142700195, "learning_rate": 3.848412552761975e-06, "loss": 0.3515, "num_input_tokens_seen": 9054880, "step": 4195 }, { "epoch": 0.7707836300238576, "grad_norm": 8.580896377563477, "learning_rate": 3.853000550559736e-06, "loss": 0.3712, "num_input_tokens_seen": 9065952, "step": 4200 }, { "epoch": 0.7717012295834098, "grad_norm": 5.250425815582275, "learning_rate": 3.857588548357497e-06, "loss": 0.3811, "num_input_tokens_seen": 9076032, "step": 4205 }, { "epoch": 0.772618829142962, "grad_norm": 19.0824031829834, "learning_rate": 3.862176546155258e-06, "loss": 0.3804, "num_input_tokens_seen": 9085600, "step": 4210 }, { "epoch": 0.7735364287025143, "grad_norm": 6.126400947570801, "learning_rate": 3.8667645439530194e-06, "loss": 0.3759, "num_input_tokens_seen": 9096576, "step": 4215 }, { "epoch": 0.7744540282620664, "grad_norm": 7.128438949584961, "learning_rate": 3.87135254175078e-06, "loss": 0.291, "num_input_tokens_seen": 9107936, "step": 4220 }, { "epoch": 0.7753716278216186, "grad_norm": 3.8804006576538086, "learning_rate": 3.875940539548541e-06, "loss": 0.4073, "num_input_tokens_seen": 9117888, "step": 4225 }, { "epoch": 0.7762892273811709, "grad_norm": 7.903203010559082, "learning_rate": 3.880528537346303e-06, "loss": 0.3107, "num_input_tokens_seen": 9128512, "step": 4230 }, { "epoch": 0.7772068269407231, "grad_norm": 3.9677064418792725, "learning_rate": 3.885116535144063e-06, "loss": 0.3565, "num_input_tokens_seen": 9139136, "step": 4235 }, { "epoch": 0.7781244265002752, "grad_norm": 6.359309196472168, "learning_rate": 3.8897045329418245e-06, "loss": 0.3282, "num_input_tokens_seen": 9150368, "step": 4240 }, { "epoch": 0.7790420260598275, "grad_norm": 3.394665002822876, "learning_rate": 3.894292530739586e-06, "loss": 0.3046, "num_input_tokens_seen": 9160448, "step": 4245 }, { "epoch": 0.7799596256193797, "grad_norm": 7.3269758224487305, "learning_rate": 3.898880528537346e-06, "loss": 0.351, "num_input_tokens_seen": 9171904, "step": 4250 }, { "epoch": 0.7808772251789319, "grad_norm": 12.39365005493164, "learning_rate": 3.903468526335108e-06, "loss": 0.3882, "num_input_tokens_seen": 9183776, "step": 4255 }, { "epoch": 0.7817948247384842, "grad_norm": 21.144319534301758, "learning_rate": 3.908056524132869e-06, "loss": 0.2954, "num_input_tokens_seen": 9195616, "step": 4260 }, { "epoch": 0.7827124242980363, "grad_norm": 16.970354080200195, "learning_rate": 3.91264452193063e-06, "loss": 0.3802, "num_input_tokens_seen": 9206272, "step": 4265 }, { "epoch": 0.7836300238575885, "grad_norm": 11.283385276794434, "learning_rate": 3.91723251972839e-06, "loss": 0.3604, "num_input_tokens_seen": 9218080, "step": 4270 }, { "epoch": 0.7845476234171408, "grad_norm": 8.841992378234863, "learning_rate": 3.921820517526152e-06, "loss": 0.2742, "num_input_tokens_seen": 9228928, "step": 4275 }, { "epoch": 0.785465222976693, "grad_norm": 4.267895698547363, "learning_rate": 3.926408515323913e-06, "loss": 0.3424, "num_input_tokens_seen": 9240416, "step": 4280 }, { "epoch": 0.7863828225362451, "grad_norm": 3.703181505203247, "learning_rate": 3.930996513121674e-06, "loss": 0.3676, "num_input_tokens_seen": 9252544, "step": 4285 }, { "epoch": 0.7873004220957974, "grad_norm": 4.319753170013428, "learning_rate": 3.935584510919435e-06, "loss": 0.3675, "num_input_tokens_seen": 9264352, "step": 4290 }, { "epoch": 0.7882180216553496, "grad_norm": 8.07114028930664, "learning_rate": 3.940172508717196e-06, "loss": 0.3027, "num_input_tokens_seen": 9274528, "step": 4295 }, { "epoch": 0.7891356212149018, "grad_norm": 6.614454746246338, "learning_rate": 3.9447605065149575e-06, "loss": 0.2876, "num_input_tokens_seen": 9285152, "step": 4300 }, { "epoch": 0.7900532207744541, "grad_norm": 6.06585168838501, "learning_rate": 3.949348504312718e-06, "loss": 0.3151, "num_input_tokens_seen": 9296000, "step": 4305 }, { "epoch": 0.7909708203340062, "grad_norm": 3.123687505722046, "learning_rate": 3.953936502110479e-06, "loss": 0.2835, "num_input_tokens_seen": 9305280, "step": 4310 }, { "epoch": 0.7918884198935584, "grad_norm": 23.117961883544922, "learning_rate": 3.958524499908241e-06, "loss": 0.3006, "num_input_tokens_seen": 9316352, "step": 4315 }, { "epoch": 0.7928060194531107, "grad_norm": 7.690202236175537, "learning_rate": 3.963112497706001e-06, "loss": 0.3756, "num_input_tokens_seen": 9325024, "step": 4320 }, { "epoch": 0.7937236190126629, "grad_norm": 7.6006855964660645, "learning_rate": 3.967700495503763e-06, "loss": 0.3368, "num_input_tokens_seen": 9335424, "step": 4325 }, { "epoch": 0.794641218572215, "grad_norm": 51.03190994262695, "learning_rate": 3.972288493301524e-06, "loss": 0.5051, "num_input_tokens_seen": 9345056, "step": 4330 }, { "epoch": 0.7955588181317673, "grad_norm": 4.037737846374512, "learning_rate": 3.9768764910992845e-06, "loss": 0.3005, "num_input_tokens_seen": 9356640, "step": 4335 }, { "epoch": 0.7964764176913195, "grad_norm": 4.896265029907227, "learning_rate": 3.981464488897046e-06, "loss": 0.2978, "num_input_tokens_seen": 9367136, "step": 4340 }, { "epoch": 0.7973940172508717, "grad_norm": 8.185891151428223, "learning_rate": 3.986052486694807e-06, "loss": 0.4233, "num_input_tokens_seen": 9376832, "step": 4345 }, { "epoch": 0.798311616810424, "grad_norm": 6.068218231201172, "learning_rate": 3.990640484492568e-06, "loss": 0.3621, "num_input_tokens_seen": 9389024, "step": 4350 }, { "epoch": 0.7992292163699761, "grad_norm": 3.6544101238250732, "learning_rate": 3.995228482290329e-06, "loss": 0.3485, "num_input_tokens_seen": 9398304, "step": 4355 }, { "epoch": 0.8001468159295283, "grad_norm": 6.698964595794678, "learning_rate": 3.9998164800880904e-06, "loss": 0.3387, "num_input_tokens_seen": 9409792, "step": 4360 }, { "epoch": 0.8010644154890806, "grad_norm": 10.612879753112793, "learning_rate": 4.004404477885851e-06, "loss": 0.3345, "num_input_tokens_seen": 9422176, "step": 4365 }, { "epoch": 0.8019820150486328, "grad_norm": 8.620397567749023, "learning_rate": 4.0089924756836115e-06, "loss": 0.3486, "num_input_tokens_seen": 9433344, "step": 4370 }, { "epoch": 0.802899614608185, "grad_norm": 5.498469352722168, "learning_rate": 4.013580473481373e-06, "loss": 0.3204, "num_input_tokens_seen": 9444192, "step": 4375 }, { "epoch": 0.8038172141677372, "grad_norm": 2.6585190296173096, "learning_rate": 4.018168471279134e-06, "loss": 0.3107, "num_input_tokens_seen": 9455392, "step": 4380 }, { "epoch": 0.8047348137272894, "grad_norm": 29.309762954711914, "learning_rate": 4.022756469076895e-06, "loss": 0.4081, "num_input_tokens_seen": 9465824, "step": 4385 }, { "epoch": 0.8056524132868417, "grad_norm": 3.3510119915008545, "learning_rate": 4.027344466874656e-06, "loss": 0.2498, "num_input_tokens_seen": 9475552, "step": 4390 }, { "epoch": 0.8065700128463938, "grad_norm": 9.001788139343262, "learning_rate": 4.0319324646724174e-06, "loss": 0.417, "num_input_tokens_seen": 9486400, "step": 4395 }, { "epoch": 0.807487612405946, "grad_norm": 3.749915361404419, "learning_rate": 4.036520462470178e-06, "loss": 0.323, "num_input_tokens_seen": 9498624, "step": 4400 }, { "epoch": 0.8084052119654983, "grad_norm": 4.114740371704102, "learning_rate": 4.041108460267939e-06, "loss": 0.365, "num_input_tokens_seen": 9507808, "step": 4405 }, { "epoch": 0.8093228115250505, "grad_norm": 6.978715419769287, "learning_rate": 4.045696458065701e-06, "loss": 0.3431, "num_input_tokens_seen": 9518144, "step": 4410 }, { "epoch": 0.8102404110846027, "grad_norm": 4.134138107299805, "learning_rate": 4.050284455863461e-06, "loss": 0.2957, "num_input_tokens_seen": 9528032, "step": 4415 }, { "epoch": 0.8111580106441549, "grad_norm": 29.19188117980957, "learning_rate": 4.0548724536612226e-06, "loss": 0.2954, "num_input_tokens_seen": 9539616, "step": 4420 }, { "epoch": 0.8120756102037071, "grad_norm": 9.111091613769531, "learning_rate": 4.059460451458984e-06, "loss": 0.2869, "num_input_tokens_seen": 9550848, "step": 4425 }, { "epoch": 0.8129932097632593, "grad_norm": 9.947832107543945, "learning_rate": 4.0640484492567444e-06, "loss": 0.5094, "num_input_tokens_seen": 9561216, "step": 4430 }, { "epoch": 0.8139108093228116, "grad_norm": 10.18228530883789, "learning_rate": 4.068636447054506e-06, "loss": 0.4266, "num_input_tokens_seen": 9571648, "step": 4435 }, { "epoch": 0.8148284088823637, "grad_norm": 4.857194900512695, "learning_rate": 4.073224444852267e-06, "loss": 0.3248, "num_input_tokens_seen": 9582976, "step": 4440 }, { "epoch": 0.8157460084419159, "grad_norm": 3.9223716259002686, "learning_rate": 4.077812442650028e-06, "loss": 0.3257, "num_input_tokens_seen": 9593344, "step": 4445 }, { "epoch": 0.8166636080014682, "grad_norm": 4.954355716705322, "learning_rate": 4.082400440447789e-06, "loss": 0.2938, "num_input_tokens_seen": 9603616, "step": 4450 }, { "epoch": 0.8175812075610204, "grad_norm": 9.097742080688477, "learning_rate": 4.0869884382455495e-06, "loss": 0.3695, "num_input_tokens_seen": 9614752, "step": 4455 }, { "epoch": 0.8184988071205725, "grad_norm": 7.573348045349121, "learning_rate": 4.091576436043311e-06, "loss": 0.2814, "num_input_tokens_seen": 9626336, "step": 4460 }, { "epoch": 0.8194164066801248, "grad_norm": 8.900527954101562, "learning_rate": 4.096164433841072e-06, "loss": 0.3382, "num_input_tokens_seen": 9637184, "step": 4465 }, { "epoch": 0.820334006239677, "grad_norm": 6.611092567443848, "learning_rate": 4.100752431638833e-06, "loss": 0.4969, "num_input_tokens_seen": 9647360, "step": 4470 }, { "epoch": 0.8212516057992292, "grad_norm": 11.19074535369873, "learning_rate": 4.105340429436594e-06, "loss": 0.2555, "num_input_tokens_seen": 9659008, "step": 4475 }, { "epoch": 0.8221692053587815, "grad_norm": 2.9096412658691406, "learning_rate": 4.1099284272343555e-06, "loss": 0.2517, "num_input_tokens_seen": 9669440, "step": 4480 }, { "epoch": 0.8230868049183336, "grad_norm": 38.35871124267578, "learning_rate": 4.114516425032116e-06, "loss": 0.3798, "num_input_tokens_seen": 9680864, "step": 4485 }, { "epoch": 0.8240044044778858, "grad_norm": 4.283586025238037, "learning_rate": 4.119104422829877e-06, "loss": 0.2059, "num_input_tokens_seen": 9691296, "step": 4490 }, { "epoch": 0.8249220040374381, "grad_norm": 28.815425872802734, "learning_rate": 4.123692420627639e-06, "loss": 0.4104, "num_input_tokens_seen": 9702304, "step": 4495 }, { "epoch": 0.8258396035969903, "grad_norm": 8.16897964477539, "learning_rate": 4.128280418425399e-06, "loss": 0.4559, "num_input_tokens_seen": 9711264, "step": 4500 }, { "epoch": 0.8267572031565424, "grad_norm": 6.95845890045166, "learning_rate": 4.132868416223161e-06, "loss": 0.3429, "num_input_tokens_seen": 9721888, "step": 4505 }, { "epoch": 0.8276748027160947, "grad_norm": 6.684139728546143, "learning_rate": 4.137456414020922e-06, "loss": 0.3808, "num_input_tokens_seen": 9731488, "step": 4510 }, { "epoch": 0.8285924022756469, "grad_norm": 5.121516227722168, "learning_rate": 4.1420444118186825e-06, "loss": 0.3585, "num_input_tokens_seen": 9741888, "step": 4515 }, { "epoch": 0.8295100018351991, "grad_norm": 7.8997979164123535, "learning_rate": 4.146632409616444e-06, "loss": 0.3412, "num_input_tokens_seen": 9753472, "step": 4520 }, { "epoch": 0.8304276013947514, "grad_norm": 7.197716236114502, "learning_rate": 4.151220407414205e-06, "loss": 0.2545, "num_input_tokens_seen": 9763904, "step": 4525 }, { "epoch": 0.8313452009543035, "grad_norm": 6.008565425872803, "learning_rate": 4.155808405211966e-06, "loss": 0.3361, "num_input_tokens_seen": 9774272, "step": 4530 }, { "epoch": 0.8322628005138557, "grad_norm": 11.504437446594238, "learning_rate": 4.160396403009727e-06, "loss": 0.4295, "num_input_tokens_seen": 9785984, "step": 4535 }, { "epoch": 0.833180400073408, "grad_norm": 7.926251411437988, "learning_rate": 4.164984400807488e-06, "loss": 0.3411, "num_input_tokens_seen": 9797248, "step": 4540 }, { "epoch": 0.8340979996329602, "grad_norm": 3.2920081615448, "learning_rate": 4.169572398605249e-06, "loss": 0.3429, "num_input_tokens_seen": 9806976, "step": 4545 }, { "epoch": 0.8350155991925123, "grad_norm": 23.299325942993164, "learning_rate": 4.17416039640301e-06, "loss": 0.3729, "num_input_tokens_seen": 9817536, "step": 4550 }, { "epoch": 0.8359331987520646, "grad_norm": 3.5161168575286865, "learning_rate": 4.178748394200771e-06, "loss": 0.3416, "num_input_tokens_seen": 9828608, "step": 4555 }, { "epoch": 0.8368507983116168, "grad_norm": 3.553406238555908, "learning_rate": 4.183336391998532e-06, "loss": 0.3806, "num_input_tokens_seen": 9838816, "step": 4560 }, { "epoch": 0.837768397871169, "grad_norm": 8.631230354309082, "learning_rate": 4.187924389796294e-06, "loss": 0.332, "num_input_tokens_seen": 9849056, "step": 4565 }, { "epoch": 0.8386859974307213, "grad_norm": 4.861538410186768, "learning_rate": 4.192512387594054e-06, "loss": 0.3577, "num_input_tokens_seen": 9859360, "step": 4570 }, { "epoch": 0.8396035969902734, "grad_norm": 6.06955099105835, "learning_rate": 4.1971003853918155e-06, "loss": 0.3753, "num_input_tokens_seen": 9868928, "step": 4575 }, { "epoch": 0.8405211965498257, "grad_norm": 7.584432601928711, "learning_rate": 4.201688383189577e-06, "loss": 0.3218, "num_input_tokens_seen": 9880288, "step": 4580 }, { "epoch": 0.8414387961093779, "grad_norm": 5.6067681312561035, "learning_rate": 4.206276380987337e-06, "loss": 0.3597, "num_input_tokens_seen": 9890752, "step": 4585 }, { "epoch": 0.8423563956689301, "grad_norm": 10.64186954498291, "learning_rate": 4.210864378785099e-06, "loss": 0.3316, "num_input_tokens_seen": 9901920, "step": 4590 }, { "epoch": 0.8432739952284823, "grad_norm": 5.945583343505859, "learning_rate": 4.21545237658286e-06, "loss": 0.344, "num_input_tokens_seen": 9912192, "step": 4595 }, { "epoch": 0.8441915947880345, "grad_norm": 4.091275691986084, "learning_rate": 4.2200403743806206e-06, "loss": 0.3432, "num_input_tokens_seen": 9923552, "step": 4600 }, { "epoch": 0.8451091943475867, "grad_norm": 6.743246078491211, "learning_rate": 4.224628372178382e-06, "loss": 0.342, "num_input_tokens_seen": 9934112, "step": 4605 }, { "epoch": 0.846026793907139, "grad_norm": 12.007132530212402, "learning_rate": 4.229216369976143e-06, "loss": 0.3841, "num_input_tokens_seen": 9945216, "step": 4610 }, { "epoch": 0.8469443934666911, "grad_norm": 4.722589015960693, "learning_rate": 4.233804367773904e-06, "loss": 0.3115, "num_input_tokens_seen": 9955648, "step": 4615 }, { "epoch": 0.8478619930262433, "grad_norm": 8.8184175491333, "learning_rate": 4.238392365571664e-06, "loss": 0.3138, "num_input_tokens_seen": 9965696, "step": 4620 }, { "epoch": 0.8487795925857956, "grad_norm": 7.008704662322998, "learning_rate": 4.242980363369426e-06, "loss": 0.2452, "num_input_tokens_seen": 9975840, "step": 4625 }, { "epoch": 0.8496971921453478, "grad_norm": 8.056933403015137, "learning_rate": 4.247568361167187e-06, "loss": 0.3077, "num_input_tokens_seen": 9986496, "step": 4630 }, { "epoch": 0.8506147917049, "grad_norm": 7.312727451324463, "learning_rate": 4.2521563589649476e-06, "loss": 0.3623, "num_input_tokens_seen": 9998048, "step": 4635 }, { "epoch": 0.8515323912644522, "grad_norm": 5.588328838348389, "learning_rate": 4.256744356762709e-06, "loss": 0.3434, "num_input_tokens_seen": 10009408, "step": 4640 }, { "epoch": 0.8524499908240044, "grad_norm": 6.134771347045898, "learning_rate": 4.26133235456047e-06, "loss": 0.3119, "num_input_tokens_seen": 10020736, "step": 4645 }, { "epoch": 0.8533675903835566, "grad_norm": 13.710342407226562, "learning_rate": 4.265920352358231e-06, "loss": 0.3474, "num_input_tokens_seen": 10031392, "step": 4650 }, { "epoch": 0.8542851899431089, "grad_norm": 17.41887855529785, "learning_rate": 4.270508350155992e-06, "loss": 0.3898, "num_input_tokens_seen": 10042400, "step": 4655 }, { "epoch": 0.855202789502661, "grad_norm": 7.557412147521973, "learning_rate": 4.2750963479537535e-06, "loss": 0.3127, "num_input_tokens_seen": 10054528, "step": 4660 }, { "epoch": 0.8561203890622132, "grad_norm": 8.354846000671387, "learning_rate": 4.279684345751514e-06, "loss": 0.3359, "num_input_tokens_seen": 10065920, "step": 4665 }, { "epoch": 0.8570379886217655, "grad_norm": 9.380339622497559, "learning_rate": 4.284272343549275e-06, "loss": 0.3808, "num_input_tokens_seen": 10074880, "step": 4670 }, { "epoch": 0.8579555881813177, "grad_norm": 7.848635196685791, "learning_rate": 4.288860341347037e-06, "loss": 0.4336, "num_input_tokens_seen": 10085600, "step": 4675 }, { "epoch": 0.8588731877408698, "grad_norm": 13.935070991516113, "learning_rate": 4.293448339144797e-06, "loss": 0.3315, "num_input_tokens_seen": 10096736, "step": 4680 }, { "epoch": 0.8597907873004221, "grad_norm": 6.503190517425537, "learning_rate": 4.298036336942559e-06, "loss": 0.3296, "num_input_tokens_seen": 10106528, "step": 4685 }, { "epoch": 0.8607083868599743, "grad_norm": 3.442423105239868, "learning_rate": 4.30262433474032e-06, "loss": 0.2823, "num_input_tokens_seen": 10116224, "step": 4690 }, { "epoch": 0.8616259864195265, "grad_norm": 4.856221675872803, "learning_rate": 4.3072123325380805e-06, "loss": 0.3847, "num_input_tokens_seen": 10126848, "step": 4695 }, { "epoch": 0.8625435859790788, "grad_norm": 19.047466278076172, "learning_rate": 4.311800330335842e-06, "loss": 0.3958, "num_input_tokens_seen": 10137600, "step": 4700 }, { "epoch": 0.8634611855386309, "grad_norm": 5.021126747131348, "learning_rate": 4.316388328133602e-06, "loss": 0.3544, "num_input_tokens_seen": 10148832, "step": 4705 }, { "epoch": 0.8643787850981831, "grad_norm": 3.8118317127227783, "learning_rate": 4.320976325931365e-06, "loss": 0.3286, "num_input_tokens_seen": 10159744, "step": 4710 }, { "epoch": 0.8652963846577354, "grad_norm": 4.177282810211182, "learning_rate": 4.325564323729125e-06, "loss": 0.3076, "num_input_tokens_seen": 10171392, "step": 4715 }, { "epoch": 0.8662139842172876, "grad_norm": 5.293072700500488, "learning_rate": 4.330152321526886e-06, "loss": 0.3111, "num_input_tokens_seen": 10180672, "step": 4720 }, { "epoch": 0.8671315837768397, "grad_norm": 9.297017097473145, "learning_rate": 4.334740319324647e-06, "loss": 0.349, "num_input_tokens_seen": 10190208, "step": 4725 }, { "epoch": 0.868049183336392, "grad_norm": 7.010962009429932, "learning_rate": 4.339328317122408e-06, "loss": 0.2739, "num_input_tokens_seen": 10202272, "step": 4730 }, { "epoch": 0.8689667828959442, "grad_norm": 5.3798828125, "learning_rate": 4.343916314920169e-06, "loss": 0.2981, "num_input_tokens_seen": 10210944, "step": 4735 }, { "epoch": 0.8698843824554964, "grad_norm": 7.700274467468262, "learning_rate": 4.34850431271793e-06, "loss": 0.349, "num_input_tokens_seen": 10222048, "step": 4740 }, { "epoch": 0.8708019820150487, "grad_norm": 14.15079116821289, "learning_rate": 4.353092310515692e-06, "loss": 0.2953, "num_input_tokens_seen": 10232992, "step": 4745 }, { "epoch": 0.8717195815746008, "grad_norm": 9.48437213897705, "learning_rate": 4.357680308313452e-06, "loss": 0.3044, "num_input_tokens_seen": 10243872, "step": 4750 }, { "epoch": 0.872637181134153, "grad_norm": 15.26181411743164, "learning_rate": 4.3622683061112135e-06, "loss": 0.3414, "num_input_tokens_seen": 10256064, "step": 4755 }, { "epoch": 0.8735547806937053, "grad_norm": 6.302229881286621, "learning_rate": 4.366856303908975e-06, "loss": 0.5086, "num_input_tokens_seen": 10266752, "step": 4760 }, { "epoch": 0.8744723802532575, "grad_norm": 9.95093822479248, "learning_rate": 4.371444301706735e-06, "loss": 0.3484, "num_input_tokens_seen": 10276352, "step": 4765 }, { "epoch": 0.8753899798128096, "grad_norm": 6.275716304779053, "learning_rate": 4.376032299504497e-06, "loss": 0.3528, "num_input_tokens_seen": 10287456, "step": 4770 }, { "epoch": 0.8763075793723619, "grad_norm": 5.66068696975708, "learning_rate": 4.380620297302258e-06, "loss": 0.332, "num_input_tokens_seen": 10299200, "step": 4775 }, { "epoch": 0.8772251789319141, "grad_norm": 31.780536651611328, "learning_rate": 4.385208295100019e-06, "loss": 0.4095, "num_input_tokens_seen": 10309664, "step": 4780 }, { "epoch": 0.8781427784914663, "grad_norm": 3.2973756790161133, "learning_rate": 4.38979629289778e-06, "loss": 0.309, "num_input_tokens_seen": 10321824, "step": 4785 }, { "epoch": 0.8790603780510186, "grad_norm": 7.985776424407959, "learning_rate": 4.3943842906955405e-06, "loss": 0.33, "num_input_tokens_seen": 10333152, "step": 4790 }, { "epoch": 0.8799779776105707, "grad_norm": 3.8098647594451904, "learning_rate": 4.398972288493302e-06, "loss": 0.3257, "num_input_tokens_seen": 10342688, "step": 4795 }, { "epoch": 0.880895577170123, "grad_norm": 3.7230772972106934, "learning_rate": 4.403560286291063e-06, "loss": 0.3168, "num_input_tokens_seen": 10353920, "step": 4800 }, { "epoch": 0.8818131767296752, "grad_norm": 6.986894130706787, "learning_rate": 4.408148284088824e-06, "loss": 0.348, "num_input_tokens_seen": 10363520, "step": 4805 }, { "epoch": 0.8827307762892274, "grad_norm": 5.014360427856445, "learning_rate": 4.412736281886585e-06, "loss": 0.3198, "num_input_tokens_seen": 10374848, "step": 4810 }, { "epoch": 0.8836483758487796, "grad_norm": 31.816316604614258, "learning_rate": 4.417324279684346e-06, "loss": 0.4736, "num_input_tokens_seen": 10386112, "step": 4815 }, { "epoch": 0.8845659754083318, "grad_norm": 7.878712177276611, "learning_rate": 4.421912277482107e-06, "loss": 0.3084, "num_input_tokens_seen": 10396128, "step": 4820 }, { "epoch": 0.885483574967884, "grad_norm": 2.7707035541534424, "learning_rate": 4.426500275279868e-06, "loss": 0.4871, "num_input_tokens_seen": 10406720, "step": 4825 }, { "epoch": 0.8864011745274363, "grad_norm": 5.104531288146973, "learning_rate": 4.43108827307763e-06, "loss": 0.3164, "num_input_tokens_seen": 10418176, "step": 4830 }, { "epoch": 0.8873187740869884, "grad_norm": 6.675735950469971, "learning_rate": 4.43567627087539e-06, "loss": 0.3384, "num_input_tokens_seen": 10429120, "step": 4835 }, { "epoch": 0.8882363736465406, "grad_norm": 4.780701637268066, "learning_rate": 4.4402642686731515e-06, "loss": 0.3382, "num_input_tokens_seen": 10440448, "step": 4840 }, { "epoch": 0.8891539732060929, "grad_norm": 2.371126174926758, "learning_rate": 4.444852266470913e-06, "loss": 0.2953, "num_input_tokens_seen": 10451424, "step": 4845 }, { "epoch": 0.8900715727656451, "grad_norm": 7.819759368896484, "learning_rate": 4.449440264268673e-06, "loss": 0.2806, "num_input_tokens_seen": 10461600, "step": 4850 }, { "epoch": 0.8909891723251973, "grad_norm": 7.258191108703613, "learning_rate": 4.454028262066434e-06, "loss": 0.2283, "num_input_tokens_seen": 10471392, "step": 4855 }, { "epoch": 0.8919067718847495, "grad_norm": 5.798925876617432, "learning_rate": 4.458616259864196e-06, "loss": 0.4688, "num_input_tokens_seen": 10482144, "step": 4860 }, { "epoch": 0.8928243714443017, "grad_norm": 5.479024887084961, "learning_rate": 4.463204257661957e-06, "loss": 0.3605, "num_input_tokens_seen": 10492608, "step": 4865 }, { "epoch": 0.8937419710038539, "grad_norm": 4.029538154602051, "learning_rate": 4.467792255459717e-06, "loss": 0.3418, "num_input_tokens_seen": 10503424, "step": 4870 }, { "epoch": 0.8946595705634062, "grad_norm": 4.497775554656982, "learning_rate": 4.4723802532574785e-06, "loss": 0.3493, "num_input_tokens_seen": 10514208, "step": 4875 }, { "epoch": 0.8955771701229583, "grad_norm": 7.725506782531738, "learning_rate": 4.47696825105524e-06, "loss": 0.2996, "num_input_tokens_seen": 10525760, "step": 4880 }, { "epoch": 0.8964947696825105, "grad_norm": 11.844635009765625, "learning_rate": 4.481556248853e-06, "loss": 0.4183, "num_input_tokens_seen": 10536928, "step": 4885 }, { "epoch": 0.8974123692420628, "grad_norm": 3.2434535026550293, "learning_rate": 4.486144246650762e-06, "loss": 0.3424, "num_input_tokens_seen": 10546752, "step": 4890 }, { "epoch": 0.898329968801615, "grad_norm": 6.63881254196167, "learning_rate": 4.490732244448523e-06, "loss": 0.4729, "num_input_tokens_seen": 10558720, "step": 4895 }, { "epoch": 0.8992475683611671, "grad_norm": 2.9684271812438965, "learning_rate": 4.495320242246284e-06, "loss": 0.3112, "num_input_tokens_seen": 10569504, "step": 4900 }, { "epoch": 0.9001651679207194, "grad_norm": 3.5674965381622314, "learning_rate": 4.499908240044045e-06, "loss": 0.3229, "num_input_tokens_seen": 10580160, "step": 4905 }, { "epoch": 0.9010827674802716, "grad_norm": 4.940062046051025, "learning_rate": 4.504496237841806e-06, "loss": 0.2974, "num_input_tokens_seen": 10590496, "step": 4910 }, { "epoch": 0.9020003670398238, "grad_norm": 2.511751413345337, "learning_rate": 4.509084235639567e-06, "loss": 0.3679, "num_input_tokens_seen": 10601696, "step": 4915 }, { "epoch": 0.9029179665993761, "grad_norm": 17.81734275817871, "learning_rate": 4.513672233437328e-06, "loss": 0.456, "num_input_tokens_seen": 10611296, "step": 4920 }, { "epoch": 0.9038355661589282, "grad_norm": 22.31389617919922, "learning_rate": 4.51826023123509e-06, "loss": 0.3569, "num_input_tokens_seen": 10621888, "step": 4925 }, { "epoch": 0.9047531657184804, "grad_norm": 3.637860059738159, "learning_rate": 4.522848229032851e-06, "loss": 0.3868, "num_input_tokens_seen": 10632672, "step": 4930 }, { "epoch": 0.9056707652780327, "grad_norm": 3.0025339126586914, "learning_rate": 4.5274362268306115e-06, "loss": 0.3175, "num_input_tokens_seen": 10643040, "step": 4935 }, { "epoch": 0.9065883648375849, "grad_norm": 13.123212814331055, "learning_rate": 4.532024224628373e-06, "loss": 0.3716, "num_input_tokens_seen": 10652768, "step": 4940 }, { "epoch": 0.907505964397137, "grad_norm": 7.168600559234619, "learning_rate": 4.536612222426134e-06, "loss": 0.3368, "num_input_tokens_seen": 10660960, "step": 4945 }, { "epoch": 0.9084235639566893, "grad_norm": 5.8466339111328125, "learning_rate": 4.541200220223895e-06, "loss": 0.251, "num_input_tokens_seen": 10671808, "step": 4950 }, { "epoch": 0.9093411635162415, "grad_norm": 4.611461639404297, "learning_rate": 4.545788218021655e-06, "loss": 0.2734, "num_input_tokens_seen": 10682976, "step": 4955 }, { "epoch": 0.9102587630757937, "grad_norm": 3.1025941371917725, "learning_rate": 4.550376215819417e-06, "loss": 0.2734, "num_input_tokens_seen": 10693600, "step": 4960 }, { "epoch": 0.911176362635346, "grad_norm": 3.3952274322509766, "learning_rate": 4.554964213617178e-06, "loss": 0.4625, "num_input_tokens_seen": 10704576, "step": 4965 }, { "epoch": 0.9120939621948981, "grad_norm": 5.62730073928833, "learning_rate": 4.5595522114149385e-06, "loss": 0.3758, "num_input_tokens_seen": 10715776, "step": 4970 }, { "epoch": 0.9130115617544503, "grad_norm": 5.365262508392334, "learning_rate": 4.5641402092127e-06, "loss": 0.3581, "num_input_tokens_seen": 10726304, "step": 4975 }, { "epoch": 0.9139291613140026, "grad_norm": 6.637637615203857, "learning_rate": 4.568728207010461e-06, "loss": 0.3581, "num_input_tokens_seen": 10737344, "step": 4980 }, { "epoch": 0.9148467608735548, "grad_norm": 6.392209529876709, "learning_rate": 4.573316204808222e-06, "loss": 0.3055, "num_input_tokens_seen": 10748832, "step": 4985 }, { "epoch": 0.9157643604331069, "grad_norm": 3.183100700378418, "learning_rate": 4.577904202605983e-06, "loss": 0.3142, "num_input_tokens_seen": 10759840, "step": 4990 }, { "epoch": 0.9166819599926592, "grad_norm": 7.272416591644287, "learning_rate": 4.5824922004037444e-06, "loss": 0.393, "num_input_tokens_seen": 10769248, "step": 4995 }, { "epoch": 0.9175995595522114, "grad_norm": 4.1121721267700195, "learning_rate": 4.587080198201505e-06, "loss": 0.3681, "num_input_tokens_seen": 10779360, "step": 5000 }, { "epoch": 0.9185171591117637, "grad_norm": 10.9462890625, "learning_rate": 4.591668195999266e-06, "loss": 0.3413, "num_input_tokens_seen": 10789152, "step": 5005 }, { "epoch": 0.9194347586713159, "grad_norm": 3.2778611183166504, "learning_rate": 4.596256193797028e-06, "loss": 0.292, "num_input_tokens_seen": 10799904, "step": 5010 }, { "epoch": 0.920352358230868, "grad_norm": 3.864288568496704, "learning_rate": 4.600844191594788e-06, "loss": 0.3942, "num_input_tokens_seen": 10811264, "step": 5015 }, { "epoch": 0.9212699577904203, "grad_norm": 6.820008277893066, "learning_rate": 4.6054321893925496e-06, "loss": 0.3405, "num_input_tokens_seen": 10824128, "step": 5020 }, { "epoch": 0.9221875573499725, "grad_norm": 3.178846597671509, "learning_rate": 4.610020187190311e-06, "loss": 0.294, "num_input_tokens_seen": 10835872, "step": 5025 }, { "epoch": 0.9231051569095247, "grad_norm": 5.784050941467285, "learning_rate": 4.6146081849880714e-06, "loss": 0.3028, "num_input_tokens_seen": 10847168, "step": 5030 }, { "epoch": 0.924022756469077, "grad_norm": 4.369187355041504, "learning_rate": 4.619196182785833e-06, "loss": 0.3679, "num_input_tokens_seen": 10858464, "step": 5035 }, { "epoch": 0.9249403560286291, "grad_norm": 5.336948871612549, "learning_rate": 4.623784180583593e-06, "loss": 0.4166, "num_input_tokens_seen": 10868704, "step": 5040 }, { "epoch": 0.9258579555881813, "grad_norm": 9.299680709838867, "learning_rate": 4.628372178381355e-06, "loss": 0.4104, "num_input_tokens_seen": 10880288, "step": 5045 }, { "epoch": 0.9267755551477336, "grad_norm": 9.069634437561035, "learning_rate": 4.632960176179116e-06, "loss": 0.3198, "num_input_tokens_seen": 10891904, "step": 5050 }, { "epoch": 0.9276931547072857, "grad_norm": 5.1396355628967285, "learning_rate": 4.6375481739768765e-06, "loss": 0.2708, "num_input_tokens_seen": 10902720, "step": 5055 }, { "epoch": 0.9286107542668379, "grad_norm": 2.9107608795166016, "learning_rate": 4.642136171774638e-06, "loss": 0.3132, "num_input_tokens_seen": 10913472, "step": 5060 }, { "epoch": 0.9295283538263902, "grad_norm": 2.236595392227173, "learning_rate": 4.646724169572399e-06, "loss": 0.2331, "num_input_tokens_seen": 10923552, "step": 5065 }, { "epoch": 0.9304459533859424, "grad_norm": 8.485769271850586, "learning_rate": 4.65131216737016e-06, "loss": 0.4086, "num_input_tokens_seen": 10933568, "step": 5070 }, { "epoch": 0.9313635529454946, "grad_norm": 3.0210280418395996, "learning_rate": 4.655900165167921e-06, "loss": 0.258, "num_input_tokens_seen": 10945376, "step": 5075 }, { "epoch": 0.9322811525050468, "grad_norm": 3.083833694458008, "learning_rate": 4.6604881629656825e-06, "loss": 0.2855, "num_input_tokens_seen": 10955520, "step": 5080 }, { "epoch": 0.933198752064599, "grad_norm": 14.540199279785156, "learning_rate": 4.665076160763443e-06, "loss": 0.2353, "num_input_tokens_seen": 10965920, "step": 5085 }, { "epoch": 0.9341163516241512, "grad_norm": 22.04912757873535, "learning_rate": 4.669664158561204e-06, "loss": 0.5594, "num_input_tokens_seen": 10977568, "step": 5090 }, { "epoch": 0.9350339511837035, "grad_norm": 15.490151405334473, "learning_rate": 4.674252156358966e-06, "loss": 0.4167, "num_input_tokens_seen": 10988960, "step": 5095 }, { "epoch": 0.9359515507432556, "grad_norm": 3.560131311416626, "learning_rate": 4.678840154156726e-06, "loss": 0.2902, "num_input_tokens_seen": 11000512, "step": 5100 }, { "epoch": 0.9368691503028078, "grad_norm": 11.9209623336792, "learning_rate": 4.683428151954487e-06, "loss": 0.3914, "num_input_tokens_seen": 11011744, "step": 5105 }, { "epoch": 0.9377867498623601, "grad_norm": 4.558887004852295, "learning_rate": 4.688016149752249e-06, "loss": 0.4057, "num_input_tokens_seen": 11022048, "step": 5110 }, { "epoch": 0.9387043494219123, "grad_norm": 5.785874366760254, "learning_rate": 4.6926041475500095e-06, "loss": 0.3982, "num_input_tokens_seen": 11033152, "step": 5115 }, { "epoch": 0.9396219489814644, "grad_norm": 5.34645414352417, "learning_rate": 4.69719214534777e-06, "loss": 0.4071, "num_input_tokens_seen": 11043904, "step": 5120 }, { "epoch": 0.9405395485410167, "grad_norm": 14.088774681091309, "learning_rate": 4.701780143145531e-06, "loss": 0.3084, "num_input_tokens_seen": 11054400, "step": 5125 }, { "epoch": 0.9414571481005689, "grad_norm": 23.58002281188965, "learning_rate": 4.706368140943293e-06, "loss": 0.321, "num_input_tokens_seen": 11065184, "step": 5130 }, { "epoch": 0.9423747476601211, "grad_norm": 14.467004776000977, "learning_rate": 4.710956138741054e-06, "loss": 0.2961, "num_input_tokens_seen": 11075328, "step": 5135 }, { "epoch": 0.9432923472196734, "grad_norm": 11.888556480407715, "learning_rate": 4.715544136538815e-06, "loss": 0.298, "num_input_tokens_seen": 11085728, "step": 5140 }, { "epoch": 0.9442099467792255, "grad_norm": 19.76478385925293, "learning_rate": 4.720132134336576e-06, "loss": 0.3414, "num_input_tokens_seen": 11096128, "step": 5145 }, { "epoch": 0.9451275463387777, "grad_norm": 7.93342924118042, "learning_rate": 4.724720132134337e-06, "loss": 0.344, "num_input_tokens_seen": 11106336, "step": 5150 }, { "epoch": 0.94604514589833, "grad_norm": 6.712163925170898, "learning_rate": 4.729308129932098e-06, "loss": 0.3114, "num_input_tokens_seen": 11116384, "step": 5155 }, { "epoch": 0.9469627454578822, "grad_norm": 22.848485946655273, "learning_rate": 4.733896127729859e-06, "loss": 0.403, "num_input_tokens_seen": 11126976, "step": 5160 }, { "epoch": 0.9478803450174343, "grad_norm": 6.810908794403076, "learning_rate": 4.7384841255276206e-06, "loss": 0.3182, "num_input_tokens_seen": 11138944, "step": 5165 }, { "epoch": 0.9487979445769866, "grad_norm": 4.629920959472656, "learning_rate": 4.743072123325381e-06, "loss": 0.2904, "num_input_tokens_seen": 11150752, "step": 5170 }, { "epoch": 0.9497155441365388, "grad_norm": 7.995797634124756, "learning_rate": 4.7476601211231424e-06, "loss": 0.3899, "num_input_tokens_seen": 11161504, "step": 5175 }, { "epoch": 0.950633143696091, "grad_norm": 6.81344747543335, "learning_rate": 4.752248118920904e-06, "loss": 0.3076, "num_input_tokens_seen": 11172000, "step": 5180 }, { "epoch": 0.9515507432556433, "grad_norm": 9.632420539855957, "learning_rate": 4.756836116718664e-06, "loss": 0.3024, "num_input_tokens_seen": 11182688, "step": 5185 }, { "epoch": 0.9524683428151954, "grad_norm": 9.159616470336914, "learning_rate": 4.761424114516426e-06, "loss": 0.3403, "num_input_tokens_seen": 11194208, "step": 5190 }, { "epoch": 0.9533859423747476, "grad_norm": 3.831713914871216, "learning_rate": 4.766012112314187e-06, "loss": 0.249, "num_input_tokens_seen": 11205760, "step": 5195 }, { "epoch": 0.9543035419342999, "grad_norm": 5.9626946449279785, "learning_rate": 4.7706001101119476e-06, "loss": 0.4224, "num_input_tokens_seen": 11217056, "step": 5200 }, { "epoch": 0.9552211414938521, "grad_norm": 4.577091693878174, "learning_rate": 4.775188107909708e-06, "loss": 0.4383, "num_input_tokens_seen": 11228288, "step": 5205 }, { "epoch": 0.9561387410534042, "grad_norm": 6.437540531158447, "learning_rate": 4.7797761057074694e-06, "loss": 0.3332, "num_input_tokens_seen": 11239040, "step": 5210 }, { "epoch": 0.9570563406129565, "grad_norm": 3.1330435276031494, "learning_rate": 4.784364103505231e-06, "loss": 0.402, "num_input_tokens_seen": 11248736, "step": 5215 }, { "epoch": 0.9579739401725087, "grad_norm": 7.144501686096191, "learning_rate": 4.788952101302991e-06, "loss": 0.4207, "num_input_tokens_seen": 11260064, "step": 5220 }, { "epoch": 0.958891539732061, "grad_norm": 9.30153751373291, "learning_rate": 4.793540099100753e-06, "loss": 0.2769, "num_input_tokens_seen": 11271744, "step": 5225 }, { "epoch": 0.9598091392916132, "grad_norm": 4.7390570640563965, "learning_rate": 4.798128096898514e-06, "loss": 0.298, "num_input_tokens_seen": 11283200, "step": 5230 }, { "epoch": 0.9607267388511653, "grad_norm": 3.537508487701416, "learning_rate": 4.8027160946962746e-06, "loss": 0.344, "num_input_tokens_seen": 11293664, "step": 5235 }, { "epoch": 0.9616443384107176, "grad_norm": 4.3372931480407715, "learning_rate": 4.807304092494036e-06, "loss": 0.3039, "num_input_tokens_seen": 11303968, "step": 5240 }, { "epoch": 0.9625619379702698, "grad_norm": 8.630131721496582, "learning_rate": 4.811892090291797e-06, "loss": 0.3724, "num_input_tokens_seen": 11315008, "step": 5245 }, { "epoch": 0.963479537529822, "grad_norm": 4.069986820220947, "learning_rate": 4.816480088089558e-06, "loss": 0.3399, "num_input_tokens_seen": 11325984, "step": 5250 }, { "epoch": 0.9643971370893742, "grad_norm": 4.182072162628174, "learning_rate": 4.821068085887319e-06, "loss": 0.3415, "num_input_tokens_seen": 11337312, "step": 5255 }, { "epoch": 0.9653147366489264, "grad_norm": 6.605531215667725, "learning_rate": 4.8256560836850805e-06, "loss": 0.2583, "num_input_tokens_seen": 11348128, "step": 5260 }, { "epoch": 0.9662323362084786, "grad_norm": 5.719473838806152, "learning_rate": 4.830244081482841e-06, "loss": 0.3788, "num_input_tokens_seen": 11359392, "step": 5265 }, { "epoch": 0.9671499357680309, "grad_norm": 3.1258034706115723, "learning_rate": 4.834832079280602e-06, "loss": 0.2639, "num_input_tokens_seen": 11370368, "step": 5270 }, { "epoch": 0.968067535327583, "grad_norm": 4.010629653930664, "learning_rate": 4.839420077078364e-06, "loss": 0.2997, "num_input_tokens_seen": 11380704, "step": 5275 }, { "epoch": 0.9689851348871352, "grad_norm": 30.093952178955078, "learning_rate": 4.844008074876124e-06, "loss": 0.3188, "num_input_tokens_seen": 11391296, "step": 5280 }, { "epoch": 0.9699027344466875, "grad_norm": 3.647089719772339, "learning_rate": 4.848596072673886e-06, "loss": 0.4245, "num_input_tokens_seen": 11402336, "step": 5285 }, { "epoch": 0.9708203340062397, "grad_norm": 6.920048236846924, "learning_rate": 4.853184070471646e-06, "loss": 0.335, "num_input_tokens_seen": 11413440, "step": 5290 }, { "epoch": 0.9717379335657919, "grad_norm": 5.5688252449035645, "learning_rate": 4.8577720682694075e-06, "loss": 0.426, "num_input_tokens_seen": 11423776, "step": 5295 }, { "epoch": 0.9726555331253441, "grad_norm": 7.470386981964111, "learning_rate": 4.862360066067169e-06, "loss": 0.3239, "num_input_tokens_seen": 11435008, "step": 5300 }, { "epoch": 0.9735731326848963, "grad_norm": 11.506468772888184, "learning_rate": 4.866948063864929e-06, "loss": 0.3802, "num_input_tokens_seen": 11446016, "step": 5305 }, { "epoch": 0.9744907322444485, "grad_norm": 5.980489253997803, "learning_rate": 4.871536061662691e-06, "loss": 0.3107, "num_input_tokens_seen": 11456864, "step": 5310 }, { "epoch": 0.9754083318040008, "grad_norm": 8.086063385009766, "learning_rate": 4.876124059460452e-06, "loss": 0.2624, "num_input_tokens_seen": 11467648, "step": 5315 }, { "epoch": 0.976325931363553, "grad_norm": 8.364421844482422, "learning_rate": 4.880712057258213e-06, "loss": 0.3962, "num_input_tokens_seen": 11478272, "step": 5320 }, { "epoch": 0.9772435309231051, "grad_norm": 18.820144653320312, "learning_rate": 4.885300055055974e-06, "loss": 0.3551, "num_input_tokens_seen": 11488768, "step": 5325 }, { "epoch": 0.9781611304826574, "grad_norm": 2.831843137741089, "learning_rate": 4.889888052853735e-06, "loss": 0.4016, "num_input_tokens_seen": 11499488, "step": 5330 }, { "epoch": 0.9790787300422096, "grad_norm": 12.40536880493164, "learning_rate": 4.894476050651496e-06, "loss": 0.2997, "num_input_tokens_seen": 11509408, "step": 5335 }, { "epoch": 0.9799963296017618, "grad_norm": 3.6247284412384033, "learning_rate": 4.899064048449257e-06, "loss": 0.2675, "num_input_tokens_seen": 11521664, "step": 5340 }, { "epoch": 0.980913929161314, "grad_norm": 22.156604766845703, "learning_rate": 4.903652046247019e-06, "loss": 0.4407, "num_input_tokens_seen": 11532160, "step": 5345 }, { "epoch": 0.9818315287208662, "grad_norm": 2.729459762573242, "learning_rate": 4.908240044044779e-06, "loss": 0.3539, "num_input_tokens_seen": 11542912, "step": 5350 }, { "epoch": 0.9827491282804184, "grad_norm": 46.1008186340332, "learning_rate": 4.9128280418425405e-06, "loss": 0.3538, "num_input_tokens_seen": 11553504, "step": 5355 }, { "epoch": 0.9836667278399707, "grad_norm": 10.851150512695312, "learning_rate": 4.917416039640302e-06, "loss": 0.3486, "num_input_tokens_seen": 11564000, "step": 5360 }, { "epoch": 0.9845843273995228, "grad_norm": 2.9071414470672607, "learning_rate": 4.922004037438062e-06, "loss": 0.3323, "num_input_tokens_seen": 11574368, "step": 5365 }, { "epoch": 0.985501926959075, "grad_norm": 29.86637306213379, "learning_rate": 4.926592035235824e-06, "loss": 0.3498, "num_input_tokens_seen": 11585280, "step": 5370 }, { "epoch": 0.9864195265186273, "grad_norm": 2.8211536407470703, "learning_rate": 4.931180033033584e-06, "loss": 0.3475, "num_input_tokens_seen": 11596448, "step": 5375 }, { "epoch": 0.9873371260781795, "grad_norm": 5.135778903961182, "learning_rate": 4.935768030831346e-06, "loss": 0.4419, "num_input_tokens_seen": 11608416, "step": 5380 }, { "epoch": 0.9882547256377316, "grad_norm": 4.0511040687561035, "learning_rate": 4.940356028629107e-06, "loss": 0.3148, "num_input_tokens_seen": 11620128, "step": 5385 }, { "epoch": 0.9891723251972839, "grad_norm": 2.387033700942993, "learning_rate": 4.9449440264268675e-06, "loss": 0.3684, "num_input_tokens_seen": 11630400, "step": 5390 }, { "epoch": 0.9900899247568361, "grad_norm": 4.134677410125732, "learning_rate": 4.949532024224629e-06, "loss": 0.3346, "num_input_tokens_seen": 11640704, "step": 5395 }, { "epoch": 0.9910075243163883, "grad_norm": 4.7312726974487305, "learning_rate": 4.95412002202239e-06, "loss": 0.3622, "num_input_tokens_seen": 11651264, "step": 5400 }, { "epoch": 0.9919251238759406, "grad_norm": 4.452801704406738, "learning_rate": 4.958708019820151e-06, "loss": 0.2759, "num_input_tokens_seen": 11661632, "step": 5405 }, { "epoch": 0.9928427234354927, "grad_norm": 15.194635391235352, "learning_rate": 4.963296017617912e-06, "loss": 0.3179, "num_input_tokens_seen": 11672864, "step": 5410 }, { "epoch": 0.9937603229950449, "grad_norm": 17.987194061279297, "learning_rate": 4.967884015415673e-06, "loss": 0.2231, "num_input_tokens_seen": 11683232, "step": 5415 }, { "epoch": 0.9946779225545972, "grad_norm": 26.202844619750977, "learning_rate": 4.972472013213434e-06, "loss": 0.3574, "num_input_tokens_seen": 11694624, "step": 5420 }, { "epoch": 0.9955955221141494, "grad_norm": 12.100301742553711, "learning_rate": 4.977060011011195e-06, "loss": 0.5091, "num_input_tokens_seen": 11704320, "step": 5425 }, { "epoch": 0.9965131216737017, "grad_norm": 5.212721347808838, "learning_rate": 4.981648008808957e-06, "loss": 0.4005, "num_input_tokens_seen": 11714912, "step": 5430 }, { "epoch": 0.9974307212332538, "grad_norm": 16.004682540893555, "learning_rate": 4.986236006606717e-06, "loss": 0.2309, "num_input_tokens_seen": 11725888, "step": 5435 }, { "epoch": 0.998348320792806, "grad_norm": 22.96844482421875, "learning_rate": 4.9908240044044785e-06, "loss": 0.3848, "num_input_tokens_seen": 11738208, "step": 5440 }, { "epoch": 0.9992659203523583, "grad_norm": 7.538995742797852, "learning_rate": 4.99541200220224e-06, "loss": 0.6248, "num_input_tokens_seen": 11748672, "step": 5445 }, { "epoch": 1.0001835199119105, "grad_norm": 9.712811470031738, "learning_rate": 5e-06, "loss": 0.3464, "num_input_tokens_seen": 11756848, "step": 5450 }, { "epoch": 1.0011011194714627, "grad_norm": 4.594287872314453, "learning_rate": 5.004587997797762e-06, "loss": 0.3848, "num_input_tokens_seen": 11768432, "step": 5455 }, { "epoch": 1.0020187190310148, "grad_norm": 2.7443158626556396, "learning_rate": 5.009175995595522e-06, "loss": 0.3366, "num_input_tokens_seen": 11778576, "step": 5460 }, { "epoch": 1.002936318590567, "grad_norm": 5.263712406158447, "learning_rate": 5.013763993393284e-06, "loss": 0.3677, "num_input_tokens_seen": 11789648, "step": 5465 }, { "epoch": 1.0038539181501194, "grad_norm": 4.4701762199401855, "learning_rate": 5.018351991191045e-06, "loss": 0.3646, "num_input_tokens_seen": 11801264, "step": 5470 }, { "epoch": 1.0047715177096714, "grad_norm": 2.9402008056640625, "learning_rate": 5.0229399889888055e-06, "loss": 0.3023, "num_input_tokens_seen": 11811120, "step": 5475 }, { "epoch": 1.0056891172692237, "grad_norm": 5.405763626098633, "learning_rate": 5.027527986786567e-06, "loss": 0.275, "num_input_tokens_seen": 11821008, "step": 5480 }, { "epoch": 1.006606716828776, "grad_norm": 4.3140153884887695, "learning_rate": 5.032115984584328e-06, "loss": 0.3053, "num_input_tokens_seen": 11832656, "step": 5485 }, { "epoch": 1.007524316388328, "grad_norm": 5.219191074371338, "learning_rate": 5.036703982382089e-06, "loss": 0.3651, "num_input_tokens_seen": 11844048, "step": 5490 }, { "epoch": 1.0084419159478804, "grad_norm": 7.553913593292236, "learning_rate": 5.04129198017985e-06, "loss": 0.299, "num_input_tokens_seen": 11855248, "step": 5495 }, { "epoch": 1.0093595155074326, "grad_norm": 6.725157260894775, "learning_rate": 5.0458799779776115e-06, "loss": 0.2904, "num_input_tokens_seen": 11867120, "step": 5500 }, { "epoch": 1.0102771150669847, "grad_norm": 7.307336807250977, "learning_rate": 5.050467975775372e-06, "loss": 0.3001, "num_input_tokens_seen": 11878288, "step": 5505 }, { "epoch": 1.011194714626537, "grad_norm": 6.261562824249268, "learning_rate": 5.055055973573133e-06, "loss": 0.3569, "num_input_tokens_seen": 11889680, "step": 5510 }, { "epoch": 1.0121123141860893, "grad_norm": 7.186318874359131, "learning_rate": 5.059643971370895e-06, "loss": 0.3462, "num_input_tokens_seen": 11899088, "step": 5515 }, { "epoch": 1.0130299137456413, "grad_norm": 3.2267632484436035, "learning_rate": 5.064231969168654e-06, "loss": 0.3081, "num_input_tokens_seen": 11910576, "step": 5520 }, { "epoch": 1.0139475133051936, "grad_norm": 3.1321306228637695, "learning_rate": 5.068819966966417e-06, "loss": 0.2412, "num_input_tokens_seen": 11921968, "step": 5525 }, { "epoch": 1.014865112864746, "grad_norm": 6.698932647705078, "learning_rate": 5.073407964764178e-06, "loss": 0.2784, "num_input_tokens_seen": 11933616, "step": 5530 }, { "epoch": 1.015782712424298, "grad_norm": 7.71200704574585, "learning_rate": 5.077995962561938e-06, "loss": 0.2809, "num_input_tokens_seen": 11944432, "step": 5535 }, { "epoch": 1.0167003119838502, "grad_norm": 6.362753391265869, "learning_rate": 5.082583960359699e-06, "loss": 0.4497, "num_input_tokens_seen": 11955504, "step": 5540 }, { "epoch": 1.0176179115434025, "grad_norm": 6.594398021697998, "learning_rate": 5.087171958157461e-06, "loss": 0.2397, "num_input_tokens_seen": 11966288, "step": 5545 }, { "epoch": 1.0185355111029546, "grad_norm": 11.771859169006348, "learning_rate": 5.091759955955221e-06, "loss": 0.3399, "num_input_tokens_seen": 11977488, "step": 5550 }, { "epoch": 1.0194531106625069, "grad_norm": 32.4345588684082, "learning_rate": 5.096347953752982e-06, "loss": 0.4747, "num_input_tokens_seen": 11987664, "step": 5555 }, { "epoch": 1.0203707102220592, "grad_norm": 13.868666648864746, "learning_rate": 5.100935951550744e-06, "loss": 0.3066, "num_input_tokens_seen": 11998160, "step": 5560 }, { "epoch": 1.0212883097816112, "grad_norm": 10.537569046020508, "learning_rate": 5.105523949348504e-06, "loss": 0.4613, "num_input_tokens_seen": 12008496, "step": 5565 }, { "epoch": 1.0222059093411635, "grad_norm": 13.469151496887207, "learning_rate": 5.1101119471462655e-06, "loss": 0.3189, "num_input_tokens_seen": 12019248, "step": 5570 }, { "epoch": 1.0231235089007158, "grad_norm": 6.963699817657471, "learning_rate": 5.114699944944027e-06, "loss": 0.2904, "num_input_tokens_seen": 12029680, "step": 5575 }, { "epoch": 1.0240411084602679, "grad_norm": 8.339214324951172, "learning_rate": 5.119287942741787e-06, "loss": 0.35, "num_input_tokens_seen": 12039088, "step": 5580 }, { "epoch": 1.0249587080198201, "grad_norm": 4.833394527435303, "learning_rate": 5.123875940539549e-06, "loss": 0.3126, "num_input_tokens_seen": 12048240, "step": 5585 }, { "epoch": 1.0258763075793724, "grad_norm": 4.407170295715332, "learning_rate": 5.12846393833731e-06, "loss": 0.3285, "num_input_tokens_seen": 12059120, "step": 5590 }, { "epoch": 1.0267939071389245, "grad_norm": 9.224454879760742, "learning_rate": 5.133051936135071e-06, "loss": 0.3638, "num_input_tokens_seen": 12069392, "step": 5595 }, { "epoch": 1.0277115066984768, "grad_norm": 16.996309280395508, "learning_rate": 5.137639933932832e-06, "loss": 0.319, "num_input_tokens_seen": 12079408, "step": 5600 }, { "epoch": 1.028629106258029, "grad_norm": 38.651363372802734, "learning_rate": 5.142227931730593e-06, "loss": 0.3527, "num_input_tokens_seen": 12089712, "step": 5605 }, { "epoch": 1.0295467058175811, "grad_norm": 6.306747913360596, "learning_rate": 5.146815929528354e-06, "loss": 0.3688, "num_input_tokens_seen": 12099792, "step": 5610 }, { "epoch": 1.0304643053771334, "grad_norm": 7.787338733673096, "learning_rate": 5.151403927326115e-06, "loss": 0.3133, "num_input_tokens_seen": 12110736, "step": 5615 }, { "epoch": 1.0313819049366857, "grad_norm": 4.923642158508301, "learning_rate": 5.1559919251238765e-06, "loss": 0.359, "num_input_tokens_seen": 12121424, "step": 5620 }, { "epoch": 1.0322995044962378, "grad_norm": 8.118623733520508, "learning_rate": 5.160579922921637e-06, "loss": 0.2884, "num_input_tokens_seen": 12131920, "step": 5625 }, { "epoch": 1.03321710405579, "grad_norm": 5.689245223999023, "learning_rate": 5.165167920719398e-06, "loss": 0.3482, "num_input_tokens_seen": 12143664, "step": 5630 }, { "epoch": 1.0341347036153423, "grad_norm": 9.827619552612305, "learning_rate": 5.16975591851716e-06, "loss": 0.3837, "num_input_tokens_seen": 12152912, "step": 5635 }, { "epoch": 1.0350523031748944, "grad_norm": 12.054234504699707, "learning_rate": 5.17434391631492e-06, "loss": 0.3453, "num_input_tokens_seen": 12163792, "step": 5640 }, { "epoch": 1.0359699027344467, "grad_norm": 4.804598808288574, "learning_rate": 5.178931914112682e-06, "loss": 0.3238, "num_input_tokens_seen": 12173552, "step": 5645 }, { "epoch": 1.036887502293999, "grad_norm": 13.170909881591797, "learning_rate": 5.183519911910443e-06, "loss": 0.3345, "num_input_tokens_seen": 12184688, "step": 5650 }, { "epoch": 1.037805101853551, "grad_norm": 10.329895973205566, "learning_rate": 5.1881079097082035e-06, "loss": 0.3322, "num_input_tokens_seen": 12195632, "step": 5655 }, { "epoch": 1.0387227014131033, "grad_norm": 9.036709785461426, "learning_rate": 5.192695907505965e-06, "loss": 0.3542, "num_input_tokens_seen": 12206288, "step": 5660 }, { "epoch": 1.0396403009726556, "grad_norm": 7.823295593261719, "learning_rate": 5.197283905303726e-06, "loss": 0.3156, "num_input_tokens_seen": 12217456, "step": 5665 }, { "epoch": 1.0405579005322076, "grad_norm": 5.619918346405029, "learning_rate": 5.201871903101488e-06, "loss": 0.2892, "num_input_tokens_seen": 12227664, "step": 5670 }, { "epoch": 1.04147550009176, "grad_norm": 5.133744239807129, "learning_rate": 5.206459900899248e-06, "loss": 0.2858, "num_input_tokens_seen": 12237424, "step": 5675 }, { "epoch": 1.0423930996513122, "grad_norm": 6.442492961883545, "learning_rate": 5.2110478986970095e-06, "loss": 0.2689, "num_input_tokens_seen": 12249616, "step": 5680 }, { "epoch": 1.0433106992108643, "grad_norm": 10.765819549560547, "learning_rate": 5.215635896494771e-06, "loss": 0.3287, "num_input_tokens_seen": 12260144, "step": 5685 }, { "epoch": 1.0442282987704166, "grad_norm": 7.604118824005127, "learning_rate": 5.220223894292531e-06, "loss": 0.3805, "num_input_tokens_seen": 12270096, "step": 5690 }, { "epoch": 1.0451458983299688, "grad_norm": 21.106258392333984, "learning_rate": 5.224811892090293e-06, "loss": 0.3639, "num_input_tokens_seen": 12280848, "step": 5695 }, { "epoch": 1.046063497889521, "grad_norm": 4.686093807220459, "learning_rate": 5.229399889888054e-06, "loss": 0.3757, "num_input_tokens_seen": 12290992, "step": 5700 }, { "epoch": 1.0469810974490732, "grad_norm": 7.268383502960205, "learning_rate": 5.233987887685814e-06, "loss": 0.3172, "num_input_tokens_seen": 12301168, "step": 5705 }, { "epoch": 1.0478986970086255, "grad_norm": 9.046043395996094, "learning_rate": 5.238575885483575e-06, "loss": 0.2868, "num_input_tokens_seen": 12312752, "step": 5710 }, { "epoch": 1.0488162965681775, "grad_norm": 4.245380401611328, "learning_rate": 5.243163883281337e-06, "loss": 0.3518, "num_input_tokens_seen": 12323344, "step": 5715 }, { "epoch": 1.0497338961277298, "grad_norm": 5.245821952819824, "learning_rate": 5.247751881079097e-06, "loss": 0.284, "num_input_tokens_seen": 12334832, "step": 5720 }, { "epoch": 1.050651495687282, "grad_norm": 3.958211660385132, "learning_rate": 5.252339878876858e-06, "loss": 0.4577, "num_input_tokens_seen": 12344144, "step": 5725 }, { "epoch": 1.0515690952468342, "grad_norm": 4.095719337463379, "learning_rate": 5.25692787667462e-06, "loss": 0.2203, "num_input_tokens_seen": 12354256, "step": 5730 }, { "epoch": 1.0524866948063865, "grad_norm": 25.439315795898438, "learning_rate": 5.26151587447238e-06, "loss": 0.5046, "num_input_tokens_seen": 12364720, "step": 5735 }, { "epoch": 1.0534042943659387, "grad_norm": 26.612253189086914, "learning_rate": 5.266103872270142e-06, "loss": 0.3143, "num_input_tokens_seen": 12375888, "step": 5740 }, { "epoch": 1.0543218939254908, "grad_norm": 4.13484525680542, "learning_rate": 5.270691870067903e-06, "loss": 0.414, "num_input_tokens_seen": 12387312, "step": 5745 }, { "epoch": 1.055239493485043, "grad_norm": 2.9707324504852295, "learning_rate": 5.2752798678656635e-06, "loss": 0.288, "num_input_tokens_seen": 12398352, "step": 5750 }, { "epoch": 1.0561570930445954, "grad_norm": 5.936627388000488, "learning_rate": 5.279867865663425e-06, "loss": 0.3679, "num_input_tokens_seen": 12409200, "step": 5755 }, { "epoch": 1.0570746926041474, "grad_norm": 3.0236735343933105, "learning_rate": 5.284455863461186e-06, "loss": 0.3244, "num_input_tokens_seen": 12420816, "step": 5760 }, { "epoch": 1.0579922921636997, "grad_norm": 5.273994445800781, "learning_rate": 5.289043861258947e-06, "loss": 0.3117, "num_input_tokens_seen": 12431536, "step": 5765 }, { "epoch": 1.058909891723252, "grad_norm": 2.62522292137146, "learning_rate": 5.293631859056708e-06, "loss": 0.3602, "num_input_tokens_seen": 12441360, "step": 5770 }, { "epoch": 1.0598274912828043, "grad_norm": 8.22956657409668, "learning_rate": 5.2982198568544694e-06, "loss": 0.2867, "num_input_tokens_seen": 12452656, "step": 5775 }, { "epoch": 1.0607450908423564, "grad_norm": 23.372886657714844, "learning_rate": 5.30280785465223e-06, "loss": 0.3823, "num_input_tokens_seen": 12462896, "step": 5780 }, { "epoch": 1.0616626904019086, "grad_norm": 8.274856567382812, "learning_rate": 5.307395852449991e-06, "loss": 0.3711, "num_input_tokens_seen": 12474736, "step": 5785 }, { "epoch": 1.062580289961461, "grad_norm": 2.082820415496826, "learning_rate": 5.311983850247753e-06, "loss": 0.3974, "num_input_tokens_seen": 12485808, "step": 5790 }, { "epoch": 1.063497889521013, "grad_norm": 4.2627153396606445, "learning_rate": 5.316571848045513e-06, "loss": 0.3117, "num_input_tokens_seen": 12496304, "step": 5795 }, { "epoch": 1.0644154890805653, "grad_norm": 4.697381496429443, "learning_rate": 5.3211598458432746e-06, "loss": 0.3383, "num_input_tokens_seen": 12506800, "step": 5800 }, { "epoch": 1.0653330886401176, "grad_norm": 28.165395736694336, "learning_rate": 5.325747843641036e-06, "loss": 0.396, "num_input_tokens_seen": 12519152, "step": 5805 }, { "epoch": 1.0662506881996696, "grad_norm": 3.385200023651123, "learning_rate": 5.3303358414387964e-06, "loss": 0.3056, "num_input_tokens_seen": 12529552, "step": 5810 }, { "epoch": 1.067168287759222, "grad_norm": 3.451173782348633, "learning_rate": 5.334923839236558e-06, "loss": 0.3131, "num_input_tokens_seen": 12540976, "step": 5815 }, { "epoch": 1.0680858873187742, "grad_norm": 5.181934356689453, "learning_rate": 5.339511837034319e-06, "loss": 0.3123, "num_input_tokens_seen": 12551824, "step": 5820 }, { "epoch": 1.0690034868783262, "grad_norm": 8.18843936920166, "learning_rate": 5.34409983483208e-06, "loss": 0.2902, "num_input_tokens_seen": 12563728, "step": 5825 }, { "epoch": 1.0699210864378785, "grad_norm": 3.050506114959717, "learning_rate": 5.348687832629841e-06, "loss": 0.3324, "num_input_tokens_seen": 12574032, "step": 5830 }, { "epoch": 1.0708386859974308, "grad_norm": 7.0938401222229, "learning_rate": 5.353275830427602e-06, "loss": 0.317, "num_input_tokens_seen": 12585072, "step": 5835 }, { "epoch": 1.0717562855569829, "grad_norm": 6.477105617523193, "learning_rate": 5.357863828225363e-06, "loss": 0.3565, "num_input_tokens_seen": 12595696, "step": 5840 }, { "epoch": 1.0726738851165352, "grad_norm": 4.800605773925781, "learning_rate": 5.362451826023124e-06, "loss": 0.38, "num_input_tokens_seen": 12606160, "step": 5845 }, { "epoch": 1.0735914846760874, "grad_norm": 4.101863384246826, "learning_rate": 5.367039823820886e-06, "loss": 0.3044, "num_input_tokens_seen": 12616816, "step": 5850 }, { "epoch": 1.0745090842356395, "grad_norm": 3.762946128845215, "learning_rate": 5.371627821618645e-06, "loss": 0.2407, "num_input_tokens_seen": 12627888, "step": 5855 }, { "epoch": 1.0754266837951918, "grad_norm": 3.210580825805664, "learning_rate": 5.3762158194164075e-06, "loss": 0.4363, "num_input_tokens_seen": 12638128, "step": 5860 }, { "epoch": 1.076344283354744, "grad_norm": 25.493789672851562, "learning_rate": 5.380803817214169e-06, "loss": 0.3824, "num_input_tokens_seen": 12649808, "step": 5865 }, { "epoch": 1.0772618829142961, "grad_norm": 6.148460865020752, "learning_rate": 5.3853918150119285e-06, "loss": 0.2761, "num_input_tokens_seen": 12660592, "step": 5870 }, { "epoch": 1.0781794824738484, "grad_norm": 9.605459213256836, "learning_rate": 5.38997981280969e-06, "loss": 0.214, "num_input_tokens_seen": 12670384, "step": 5875 }, { "epoch": 1.0790970820334007, "grad_norm": 9.21951675415039, "learning_rate": 5.394567810607452e-06, "loss": 0.3394, "num_input_tokens_seen": 12681072, "step": 5880 }, { "epoch": 1.0800146815929528, "grad_norm": 21.21697235107422, "learning_rate": 5.399155808405212e-06, "loss": 0.4489, "num_input_tokens_seen": 12692464, "step": 5885 }, { "epoch": 1.080932281152505, "grad_norm": 9.08956527709961, "learning_rate": 5.403743806202973e-06, "loss": 0.3263, "num_input_tokens_seen": 12702544, "step": 5890 }, { "epoch": 1.0818498807120573, "grad_norm": 21.584779739379883, "learning_rate": 5.4083318040007345e-06, "loss": 0.3472, "num_input_tokens_seen": 12712688, "step": 5895 }, { "epoch": 1.0827674802716094, "grad_norm": 16.70962142944336, "learning_rate": 5.412919801798495e-06, "loss": 0.3855, "num_input_tokens_seen": 12722608, "step": 5900 }, { "epoch": 1.0836850798311617, "grad_norm": 31.514965057373047, "learning_rate": 5.417507799596256e-06, "loss": 0.4217, "num_input_tokens_seen": 12734224, "step": 5905 }, { "epoch": 1.084602679390714, "grad_norm": 12.142326354980469, "learning_rate": 5.422095797394018e-06, "loss": 0.4251, "num_input_tokens_seen": 12744752, "step": 5910 }, { "epoch": 1.085520278950266, "grad_norm": 13.603802680969238, "learning_rate": 5.426683795191778e-06, "loss": 0.3126, "num_input_tokens_seen": 12754864, "step": 5915 }, { "epoch": 1.0864378785098183, "grad_norm": 30.390588760375977, "learning_rate": 5.43127179298954e-06, "loss": 0.3345, "num_input_tokens_seen": 12763728, "step": 5920 }, { "epoch": 1.0873554780693706, "grad_norm": 39.26765823364258, "learning_rate": 5.435859790787301e-06, "loss": 0.4237, "num_input_tokens_seen": 12774160, "step": 5925 }, { "epoch": 1.0882730776289227, "grad_norm": 38.999839782714844, "learning_rate": 5.4404477885850615e-06, "loss": 0.3525, "num_input_tokens_seen": 12785232, "step": 5930 }, { "epoch": 1.089190677188475, "grad_norm": 19.91054916381836, "learning_rate": 5.445035786382823e-06, "loss": 0.4275, "num_input_tokens_seen": 12795152, "step": 5935 }, { "epoch": 1.0901082767480272, "grad_norm": 17.914731979370117, "learning_rate": 5.449623784180584e-06, "loss": 0.3362, "num_input_tokens_seen": 12806608, "step": 5940 }, { "epoch": 1.0910258763075793, "grad_norm": 27.663368225097656, "learning_rate": 5.454211781978345e-06, "loss": 0.3302, "num_input_tokens_seen": 12816848, "step": 5945 }, { "epoch": 1.0919434758671316, "grad_norm": 31.538532257080078, "learning_rate": 5.458799779776106e-06, "loss": 0.4111, "num_input_tokens_seen": 12827440, "step": 5950 }, { "epoch": 1.0928610754266839, "grad_norm": 7.78417444229126, "learning_rate": 5.4633877775738675e-06, "loss": 0.29, "num_input_tokens_seen": 12839216, "step": 5955 }, { "epoch": 1.093778674986236, "grad_norm": 6.898991107940674, "learning_rate": 5.467975775371628e-06, "loss": 0.354, "num_input_tokens_seen": 12849712, "step": 5960 }, { "epoch": 1.0946962745457882, "grad_norm": 14.002665519714355, "learning_rate": 5.472563773169389e-06, "loss": 0.3317, "num_input_tokens_seen": 12859856, "step": 5965 }, { "epoch": 1.0956138741053405, "grad_norm": 9.531506538391113, "learning_rate": 5.477151770967151e-06, "loss": 0.3587, "num_input_tokens_seen": 12871248, "step": 5970 }, { "epoch": 1.0965314736648926, "grad_norm": 10.294178009033203, "learning_rate": 5.481739768764911e-06, "loss": 0.3663, "num_input_tokens_seen": 12882480, "step": 5975 }, { "epoch": 1.0974490732244448, "grad_norm": 7.147150993347168, "learning_rate": 5.4863277665626726e-06, "loss": 0.3449, "num_input_tokens_seen": 12893648, "step": 5980 }, { "epoch": 1.0983666727839971, "grad_norm": 9.140804290771484, "learning_rate": 5.490915764360434e-06, "loss": 0.3409, "num_input_tokens_seen": 12905456, "step": 5985 }, { "epoch": 1.0992842723435492, "grad_norm": 19.126216888427734, "learning_rate": 5.4955037621581945e-06, "loss": 0.4041, "num_input_tokens_seen": 12915760, "step": 5990 }, { "epoch": 1.1002018719031015, "grad_norm": 8.499013900756836, "learning_rate": 5.500091759955956e-06, "loss": 0.2918, "num_input_tokens_seen": 12926576, "step": 5995 }, { "epoch": 1.1011194714626538, "grad_norm": 6.761445045471191, "learning_rate": 5.504679757753717e-06, "loss": 0.3949, "num_input_tokens_seen": 12937232, "step": 6000 }, { "epoch": 1.1020370710222058, "grad_norm": 9.87544059753418, "learning_rate": 5.509267755551478e-06, "loss": 0.4031, "num_input_tokens_seen": 12947856, "step": 6005 }, { "epoch": 1.1029546705817581, "grad_norm": 8.158553123474121, "learning_rate": 5.513855753349239e-06, "loss": 0.3554, "num_input_tokens_seen": 12958576, "step": 6010 }, { "epoch": 1.1038722701413104, "grad_norm": 2.650043487548828, "learning_rate": 5.518443751147e-06, "loss": 0.3518, "num_input_tokens_seen": 12968944, "step": 6015 }, { "epoch": 1.1047898697008625, "grad_norm": 6.867227554321289, "learning_rate": 5.52303174894476e-06, "loss": 0.3458, "num_input_tokens_seen": 12980176, "step": 6020 }, { "epoch": 1.1057074692604147, "grad_norm": 6.061088562011719, "learning_rate": 5.527619746742522e-06, "loss": 0.3617, "num_input_tokens_seen": 12990320, "step": 6025 }, { "epoch": 1.106625068819967, "grad_norm": 3.312884569168091, "learning_rate": 5.532207744540284e-06, "loss": 0.3236, "num_input_tokens_seen": 13001680, "step": 6030 }, { "epoch": 1.107542668379519, "grad_norm": 5.200764179229736, "learning_rate": 5.536795742338043e-06, "loss": 0.3528, "num_input_tokens_seen": 13011536, "step": 6035 }, { "epoch": 1.1084602679390714, "grad_norm": 13.976358413696289, "learning_rate": 5.541383740135805e-06, "loss": 0.4272, "num_input_tokens_seen": 13022384, "step": 6040 }, { "epoch": 1.1093778674986237, "grad_norm": 6.402804374694824, "learning_rate": 5.545971737933567e-06, "loss": 0.3754, "num_input_tokens_seen": 13031792, "step": 6045 }, { "epoch": 1.1102954670581757, "grad_norm": 10.706258773803711, "learning_rate": 5.5505597357313266e-06, "loss": 0.338, "num_input_tokens_seen": 13043568, "step": 6050 }, { "epoch": 1.111213066617728, "grad_norm": 7.0284576416015625, "learning_rate": 5.555147733529088e-06, "loss": 0.3219, "num_input_tokens_seen": 13053104, "step": 6055 }, { "epoch": 1.1121306661772803, "grad_norm": 2.5699679851531982, "learning_rate": 5.559735731326849e-06, "loss": 0.312, "num_input_tokens_seen": 13065008, "step": 6060 }, { "epoch": 1.1130482657368324, "grad_norm": 4.749087333679199, "learning_rate": 5.56432372912461e-06, "loss": 0.243, "num_input_tokens_seen": 13075440, "step": 6065 }, { "epoch": 1.1139658652963846, "grad_norm": 4.873486042022705, "learning_rate": 5.568911726922371e-06, "loss": 0.4203, "num_input_tokens_seen": 13088176, "step": 6070 }, { "epoch": 1.114883464855937, "grad_norm": 9.652217864990234, "learning_rate": 5.5734997247201325e-06, "loss": 0.3914, "num_input_tokens_seen": 13099312, "step": 6075 }, { "epoch": 1.115801064415489, "grad_norm": 7.75630521774292, "learning_rate": 5.578087722517893e-06, "loss": 0.3654, "num_input_tokens_seen": 13108048, "step": 6080 }, { "epoch": 1.1167186639750413, "grad_norm": 12.183680534362793, "learning_rate": 5.582675720315654e-06, "loss": 0.3053, "num_input_tokens_seen": 13117456, "step": 6085 }, { "epoch": 1.1176362635345936, "grad_norm": 5.060271739959717, "learning_rate": 5.587263718113416e-06, "loss": 0.3071, "num_input_tokens_seen": 13128048, "step": 6090 }, { "epoch": 1.1185538630941456, "grad_norm": 14.808307647705078, "learning_rate": 5.591851715911177e-06, "loss": 0.275, "num_input_tokens_seen": 13139248, "step": 6095 }, { "epoch": 1.119471462653698, "grad_norm": 2.4776782989501953, "learning_rate": 5.596439713708938e-06, "loss": 0.4436, "num_input_tokens_seen": 13149936, "step": 6100 }, { "epoch": 1.1203890622132502, "grad_norm": 7.455094337463379, "learning_rate": 5.601027711506699e-06, "loss": 0.4599, "num_input_tokens_seen": 13159536, "step": 6105 }, { "epoch": 1.1213066617728022, "grad_norm": 11.232617378234863, "learning_rate": 5.60561570930446e-06, "loss": 0.3827, "num_input_tokens_seen": 13169424, "step": 6110 }, { "epoch": 1.1222242613323545, "grad_norm": 2.970417022705078, "learning_rate": 5.610203707102221e-06, "loss": 0.3434, "num_input_tokens_seen": 13180528, "step": 6115 }, { "epoch": 1.1231418608919068, "grad_norm": 4.6149797439575195, "learning_rate": 5.614791704899982e-06, "loss": 0.3103, "num_input_tokens_seen": 13191344, "step": 6120 }, { "epoch": 1.1240594604514589, "grad_norm": 7.983474254608154, "learning_rate": 5.619379702697744e-06, "loss": 0.3466, "num_input_tokens_seen": 13202128, "step": 6125 }, { "epoch": 1.1249770600110112, "grad_norm": 2.565215826034546, "learning_rate": 5.623967700495504e-06, "loss": 0.3378, "num_input_tokens_seen": 13213264, "step": 6130 }, { "epoch": 1.1258946595705634, "grad_norm": 1.9771592617034912, "learning_rate": 5.6285556982932655e-06, "loss": 0.3222, "num_input_tokens_seen": 13223696, "step": 6135 }, { "epoch": 1.1268122591301157, "grad_norm": 2.2125117778778076, "learning_rate": 5.633143696091027e-06, "loss": 0.3446, "num_input_tokens_seen": 13234224, "step": 6140 }, { "epoch": 1.1277298586896678, "grad_norm": 4.878102779388428, "learning_rate": 5.637731693888787e-06, "loss": 0.3268, "num_input_tokens_seen": 13244560, "step": 6145 }, { "epoch": 1.12864745824922, "grad_norm": 4.069942951202393, "learning_rate": 5.642319691686549e-06, "loss": 0.2755, "num_input_tokens_seen": 13254608, "step": 6150 }, { "epoch": 1.1295650578087724, "grad_norm": 3.3426127433776855, "learning_rate": 5.64690768948431e-06, "loss": 0.3154, "num_input_tokens_seen": 13265072, "step": 6155 }, { "epoch": 1.1304826573683244, "grad_norm": 6.239987850189209, "learning_rate": 5.651495687282071e-06, "loss": 0.2397, "num_input_tokens_seen": 13276912, "step": 6160 }, { "epoch": 1.1314002569278767, "grad_norm": 2.2132718563079834, "learning_rate": 5.656083685079832e-06, "loss": 0.2823, "num_input_tokens_seen": 13287152, "step": 6165 }, { "epoch": 1.132317856487429, "grad_norm": 11.632454872131348, "learning_rate": 5.660671682877593e-06, "loss": 0.3887, "num_input_tokens_seen": 13298672, "step": 6170 }, { "epoch": 1.133235456046981, "grad_norm": 26.863637924194336, "learning_rate": 5.665259680675354e-06, "loss": 0.4986, "num_input_tokens_seen": 13309680, "step": 6175 }, { "epoch": 1.1341530556065333, "grad_norm": 3.2949841022491455, "learning_rate": 5.669847678473115e-06, "loss": 0.334, "num_input_tokens_seen": 13322000, "step": 6180 }, { "epoch": 1.1350706551660856, "grad_norm": 4.198614120483398, "learning_rate": 5.6744356762708766e-06, "loss": 0.3298, "num_input_tokens_seen": 13333520, "step": 6185 }, { "epoch": 1.1359882547256377, "grad_norm": 3.306199550628662, "learning_rate": 5.679023674068636e-06, "loss": 0.3353, "num_input_tokens_seen": 13344880, "step": 6190 }, { "epoch": 1.13690585428519, "grad_norm": 2.833726644515991, "learning_rate": 5.6836116718663984e-06, "loss": 0.3354, "num_input_tokens_seen": 13355440, "step": 6195 }, { "epoch": 1.1378234538447423, "grad_norm": 3.3127174377441406, "learning_rate": 5.68819966966416e-06, "loss": 0.3263, "num_input_tokens_seen": 13366864, "step": 6200 }, { "epoch": 1.1387410534042943, "grad_norm": 4.324026107788086, "learning_rate": 5.6927876674619195e-06, "loss": 0.389, "num_input_tokens_seen": 13377808, "step": 6205 }, { "epoch": 1.1396586529638466, "grad_norm": 3.4535019397735596, "learning_rate": 5.697375665259681e-06, "loss": 0.3876, "num_input_tokens_seen": 13389392, "step": 6210 }, { "epoch": 1.140576252523399, "grad_norm": 12.197086334228516, "learning_rate": 5.701963663057443e-06, "loss": 0.2591, "num_input_tokens_seen": 13400048, "step": 6215 }, { "epoch": 1.141493852082951, "grad_norm": 3.9591434001922607, "learning_rate": 5.706551660855203e-06, "loss": 0.362, "num_input_tokens_seen": 13411632, "step": 6220 }, { "epoch": 1.1424114516425032, "grad_norm": 3.265242099761963, "learning_rate": 5.711139658652964e-06, "loss": 0.322, "num_input_tokens_seen": 13421520, "step": 6225 }, { "epoch": 1.1433290512020555, "grad_norm": 27.12226676940918, "learning_rate": 5.715727656450725e-06, "loss": 0.3318, "num_input_tokens_seen": 13432496, "step": 6230 }, { "epoch": 1.1442466507616076, "grad_norm": 4.684857368469238, "learning_rate": 5.720315654248486e-06, "loss": 0.2904, "num_input_tokens_seen": 13443248, "step": 6235 }, { "epoch": 1.1451642503211599, "grad_norm": 6.552073955535889, "learning_rate": 5.724903652046247e-06, "loss": 0.5128, "num_input_tokens_seen": 13453680, "step": 6240 }, { "epoch": 1.1460818498807122, "grad_norm": 4.71095085144043, "learning_rate": 5.729491649844009e-06, "loss": 0.2875, "num_input_tokens_seen": 13464560, "step": 6245 }, { "epoch": 1.1469994494402642, "grad_norm": 5.967587471008301, "learning_rate": 5.734079647641769e-06, "loss": 0.3575, "num_input_tokens_seen": 13475760, "step": 6250 }, { "epoch": 1.1479170489998165, "grad_norm": 14.496994972229004, "learning_rate": 5.7386676454395305e-06, "loss": 0.3729, "num_input_tokens_seen": 13486704, "step": 6255 }, { "epoch": 1.1488346485593688, "grad_norm": 3.1032755374908447, "learning_rate": 5.743255643237292e-06, "loss": 0.2733, "num_input_tokens_seen": 13498352, "step": 6260 }, { "epoch": 1.1497522481189208, "grad_norm": 2.763385057449341, "learning_rate": 5.747843641035052e-06, "loss": 0.2782, "num_input_tokens_seen": 13509936, "step": 6265 }, { "epoch": 1.1506698476784731, "grad_norm": 6.963400363922119, "learning_rate": 5.752431638832814e-06, "loss": 0.3744, "num_input_tokens_seen": 13521872, "step": 6270 }, { "epoch": 1.1515874472380254, "grad_norm": 4.247408390045166, "learning_rate": 5.757019636630575e-06, "loss": 0.3598, "num_input_tokens_seen": 13532080, "step": 6275 }, { "epoch": 1.1525050467975775, "grad_norm": 10.218422889709473, "learning_rate": 5.761607634428336e-06, "loss": 0.2725, "num_input_tokens_seen": 13542256, "step": 6280 }, { "epoch": 1.1534226463571298, "grad_norm": 7.547605037689209, "learning_rate": 5.766195632226097e-06, "loss": 0.3277, "num_input_tokens_seen": 13553072, "step": 6285 }, { "epoch": 1.154340245916682, "grad_norm": 3.5398917198181152, "learning_rate": 5.770783630023858e-06, "loss": 0.2595, "num_input_tokens_seen": 13564304, "step": 6290 }, { "epoch": 1.1552578454762341, "grad_norm": 3.454064130783081, "learning_rate": 5.775371627821619e-06, "loss": 0.2782, "num_input_tokens_seen": 13574608, "step": 6295 }, { "epoch": 1.1561754450357864, "grad_norm": 9.103105545043945, "learning_rate": 5.77995962561938e-06, "loss": 0.4862, "num_input_tokens_seen": 13585712, "step": 6300 }, { "epoch": 1.1570930445953387, "grad_norm": 4.7738037109375, "learning_rate": 5.784547623417142e-06, "loss": 0.3377, "num_input_tokens_seen": 13596752, "step": 6305 }, { "epoch": 1.1580106441548907, "grad_norm": 3.979224443435669, "learning_rate": 5.789135621214902e-06, "loss": 0.3349, "num_input_tokens_seen": 13607344, "step": 6310 }, { "epoch": 1.158928243714443, "grad_norm": 7.7431230545043945, "learning_rate": 5.7937236190126635e-06, "loss": 0.2812, "num_input_tokens_seen": 13619280, "step": 6315 }, { "epoch": 1.1598458432739953, "grad_norm": 3.859441041946411, "learning_rate": 5.798311616810425e-06, "loss": 0.3559, "num_input_tokens_seen": 13629424, "step": 6320 }, { "epoch": 1.1607634428335474, "grad_norm": 3.588792085647583, "learning_rate": 5.802899614608185e-06, "loss": 0.3204, "num_input_tokens_seen": 13639952, "step": 6325 }, { "epoch": 1.1616810423930997, "grad_norm": 7.942054271697998, "learning_rate": 5.807487612405947e-06, "loss": 0.3416, "num_input_tokens_seen": 13651056, "step": 6330 }, { "epoch": 1.162598641952652, "grad_norm": 2.8641748428344727, "learning_rate": 5.812075610203708e-06, "loss": 0.269, "num_input_tokens_seen": 13661616, "step": 6335 }, { "epoch": 1.163516241512204, "grad_norm": 1.9537339210510254, "learning_rate": 5.816663608001469e-06, "loss": 0.2792, "num_input_tokens_seen": 13671472, "step": 6340 }, { "epoch": 1.1644338410717563, "grad_norm": 10.906365394592285, "learning_rate": 5.82125160579923e-06, "loss": 0.3302, "num_input_tokens_seen": 13681488, "step": 6345 }, { "epoch": 1.1653514406313086, "grad_norm": 19.168453216552734, "learning_rate": 5.825839603596991e-06, "loss": 0.4607, "num_input_tokens_seen": 13693040, "step": 6350 }, { "epoch": 1.1662690401908606, "grad_norm": 5.539305686950684, "learning_rate": 5.830427601394751e-06, "loss": 0.2875, "num_input_tokens_seen": 13702704, "step": 6355 }, { "epoch": 1.167186639750413, "grad_norm": 25.080585479736328, "learning_rate": 5.835015599192513e-06, "loss": 0.43, "num_input_tokens_seen": 13714128, "step": 6360 }, { "epoch": 1.1681042393099652, "grad_norm": 6.155920505523682, "learning_rate": 5.8396035969902746e-06, "loss": 0.3366, "num_input_tokens_seen": 13724592, "step": 6365 }, { "epoch": 1.1690218388695173, "grad_norm": 17.84935760498047, "learning_rate": 5.844191594788034e-06, "loss": 0.5455, "num_input_tokens_seen": 13735728, "step": 6370 }, { "epoch": 1.1699394384290696, "grad_norm": 8.240729331970215, "learning_rate": 5.848779592585796e-06, "loss": 0.3689, "num_input_tokens_seen": 13747536, "step": 6375 }, { "epoch": 1.1708570379886218, "grad_norm": 7.718134880065918, "learning_rate": 5.853367590383558e-06, "loss": 0.2405, "num_input_tokens_seen": 13757712, "step": 6380 }, { "epoch": 1.171774637548174, "grad_norm": 10.70333480834961, "learning_rate": 5.8579555881813175e-06, "loss": 0.4621, "num_input_tokens_seen": 13769168, "step": 6385 }, { "epoch": 1.1726922371077262, "grad_norm": 10.35879135131836, "learning_rate": 5.862543585979079e-06, "loss": 0.425, "num_input_tokens_seen": 13780048, "step": 6390 }, { "epoch": 1.1736098366672785, "grad_norm": 3.084073066711426, "learning_rate": 5.86713158377684e-06, "loss": 0.3891, "num_input_tokens_seen": 13789872, "step": 6395 }, { "epoch": 1.1745274362268305, "grad_norm": 5.202464580535889, "learning_rate": 5.871719581574601e-06, "loss": 0.3169, "num_input_tokens_seen": 13800976, "step": 6400 }, { "epoch": 1.1754450357863828, "grad_norm": 2.9653592109680176, "learning_rate": 5.876307579372362e-06, "loss": 0.2886, "num_input_tokens_seen": 13813904, "step": 6405 }, { "epoch": 1.176362635345935, "grad_norm": 2.7201318740844727, "learning_rate": 5.8808955771701234e-06, "loss": 0.3145, "num_input_tokens_seen": 13826064, "step": 6410 }, { "epoch": 1.1772802349054872, "grad_norm": 3.1190412044525146, "learning_rate": 5.885483574967884e-06, "loss": 0.3218, "num_input_tokens_seen": 13836592, "step": 6415 }, { "epoch": 1.1781978344650395, "grad_norm": 5.857068061828613, "learning_rate": 5.890071572765645e-06, "loss": 0.3973, "num_input_tokens_seen": 13846992, "step": 6420 }, { "epoch": 1.1791154340245917, "grad_norm": 4.301835060119629, "learning_rate": 5.894659570563407e-06, "loss": 0.306, "num_input_tokens_seen": 13857296, "step": 6425 }, { "epoch": 1.1800330335841438, "grad_norm": 4.542324066162109, "learning_rate": 5.899247568361167e-06, "loss": 0.3575, "num_input_tokens_seen": 13867312, "step": 6430 }, { "epoch": 1.180950633143696, "grad_norm": 3.070835828781128, "learning_rate": 5.9038355661589285e-06, "loss": 0.3643, "num_input_tokens_seen": 13878800, "step": 6435 }, { "epoch": 1.1818682327032484, "grad_norm": 4.712692737579346, "learning_rate": 5.90842356395669e-06, "loss": 0.272, "num_input_tokens_seen": 13888976, "step": 6440 }, { "epoch": 1.1827858322628004, "grad_norm": 4.812789440155029, "learning_rate": 5.9130115617544504e-06, "loss": 0.22, "num_input_tokens_seen": 13900752, "step": 6445 }, { "epoch": 1.1837034318223527, "grad_norm": 2.943662643432617, "learning_rate": 5.917599559552212e-06, "loss": 0.3493, "num_input_tokens_seen": 13912944, "step": 6450 }, { "epoch": 1.184621031381905, "grad_norm": 2.3025591373443604, "learning_rate": 5.922187557349973e-06, "loss": 0.4125, "num_input_tokens_seen": 13923344, "step": 6455 }, { "epoch": 1.185538630941457, "grad_norm": 3.8769357204437256, "learning_rate": 5.926775555147734e-06, "loss": 0.311, "num_input_tokens_seen": 13934352, "step": 6460 }, { "epoch": 1.1864562305010093, "grad_norm": 3.080439567565918, "learning_rate": 5.931363552945495e-06, "loss": 0.3312, "num_input_tokens_seen": 13944816, "step": 6465 }, { "epoch": 1.1873738300605616, "grad_norm": 15.56381893157959, "learning_rate": 5.935951550743256e-06, "loss": 0.3851, "num_input_tokens_seen": 13955920, "step": 6470 }, { "epoch": 1.1882914296201137, "grad_norm": 9.38681411743164, "learning_rate": 5.940539548541017e-06, "loss": 0.3659, "num_input_tokens_seen": 13967344, "step": 6475 }, { "epoch": 1.189209029179666, "grad_norm": 19.81049156188965, "learning_rate": 5.945127546338778e-06, "loss": 0.4474, "num_input_tokens_seen": 13976912, "step": 6480 }, { "epoch": 1.1901266287392183, "grad_norm": 2.1353585720062256, "learning_rate": 5.94971554413654e-06, "loss": 0.3081, "num_input_tokens_seen": 13988752, "step": 6485 }, { "epoch": 1.1910442282987703, "grad_norm": 3.476954221725464, "learning_rate": 5.9543035419343e-06, "loss": 0.3065, "num_input_tokens_seen": 13999600, "step": 6490 }, { "epoch": 1.1919618278583226, "grad_norm": 3.670180320739746, "learning_rate": 5.9588915397320615e-06, "loss": 0.2896, "num_input_tokens_seen": 14010960, "step": 6495 }, { "epoch": 1.192879427417875, "grad_norm": 3.590404987335205, "learning_rate": 5.963479537529823e-06, "loss": 0.3458, "num_input_tokens_seen": 14022416, "step": 6500 }, { "epoch": 1.193797026977427, "grad_norm": 2.457503080368042, "learning_rate": 5.968067535327583e-06, "loss": 0.273, "num_input_tokens_seen": 14033104, "step": 6505 }, { "epoch": 1.1947146265369792, "grad_norm": 2.7911243438720703, "learning_rate": 5.972655533125345e-06, "loss": 0.4065, "num_input_tokens_seen": 14044304, "step": 6510 }, { "epoch": 1.1956322260965315, "grad_norm": 1.8294439315795898, "learning_rate": 5.977243530923106e-06, "loss": 0.2853, "num_input_tokens_seen": 14055312, "step": 6515 }, { "epoch": 1.1965498256560836, "grad_norm": 9.105364799499512, "learning_rate": 5.9818315287208675e-06, "loss": 0.4388, "num_input_tokens_seen": 14066032, "step": 6520 }, { "epoch": 1.1974674252156359, "grad_norm": 5.045330047607422, "learning_rate": 5.986419526518628e-06, "loss": 0.2721, "num_input_tokens_seen": 14076624, "step": 6525 }, { "epoch": 1.1983850247751882, "grad_norm": 3.7381134033203125, "learning_rate": 5.991007524316389e-06, "loss": 0.3828, "num_input_tokens_seen": 14086448, "step": 6530 }, { "epoch": 1.1993026243347402, "grad_norm": 3.127309560775757, "learning_rate": 5.995595522114151e-06, "loss": 0.376, "num_input_tokens_seen": 14098800, "step": 6535 }, { "epoch": 1.2002202238942925, "grad_norm": 4.7150373458862305, "learning_rate": 6.00018351991191e-06, "loss": 0.3394, "num_input_tokens_seen": 14109840, "step": 6540 }, { "epoch": 1.2011378234538448, "grad_norm": 3.0592567920684814, "learning_rate": 6.004771517709672e-06, "loss": 0.3217, "num_input_tokens_seen": 14120464, "step": 6545 }, { "epoch": 1.2020554230133969, "grad_norm": 2.388061761856079, "learning_rate": 6.009359515507434e-06, "loss": 0.3277, "num_input_tokens_seen": 14131248, "step": 6550 }, { "epoch": 1.2029730225729491, "grad_norm": 1.5549585819244385, "learning_rate": 6.013947513305194e-06, "loss": 0.3359, "num_input_tokens_seen": 14142224, "step": 6555 }, { "epoch": 1.2038906221325014, "grad_norm": 15.269634246826172, "learning_rate": 6.018535511102955e-06, "loss": 0.3713, "num_input_tokens_seen": 14152688, "step": 6560 }, { "epoch": 1.2048082216920535, "grad_norm": 3.3737828731536865, "learning_rate": 6.023123508900716e-06, "loss": 0.3315, "num_input_tokens_seen": 14163728, "step": 6565 }, { "epoch": 1.2057258212516058, "grad_norm": 3.306211471557617, "learning_rate": 6.027711506698477e-06, "loss": 0.3176, "num_input_tokens_seen": 14174000, "step": 6570 }, { "epoch": 1.206643420811158, "grad_norm": 1.51575767993927, "learning_rate": 6.032299504496238e-06, "loss": 0.3193, "num_input_tokens_seen": 14184304, "step": 6575 }, { "epoch": 1.2075610203707101, "grad_norm": 3.2660319805145264, "learning_rate": 6.0368875022939996e-06, "loss": 0.2677, "num_input_tokens_seen": 14194448, "step": 6580 }, { "epoch": 1.2084786199302624, "grad_norm": 2.616157293319702, "learning_rate": 6.04147550009176e-06, "loss": 0.3379, "num_input_tokens_seen": 14203088, "step": 6585 }, { "epoch": 1.2093962194898147, "grad_norm": 4.346057415008545, "learning_rate": 6.0460634978895214e-06, "loss": 0.3386, "num_input_tokens_seen": 14215984, "step": 6590 }, { "epoch": 1.2103138190493667, "grad_norm": 6.53338098526001, "learning_rate": 6.050651495687283e-06, "loss": 0.3521, "num_input_tokens_seen": 14226512, "step": 6595 }, { "epoch": 1.211231418608919, "grad_norm": 6.330822467803955, "learning_rate": 6.055239493485043e-06, "loss": 0.2857, "num_input_tokens_seen": 14237328, "step": 6600 }, { "epoch": 1.2121490181684713, "grad_norm": 3.839121103286743, "learning_rate": 6.059827491282805e-06, "loss": 0.2659, "num_input_tokens_seen": 14247696, "step": 6605 }, { "epoch": 1.2130666177280234, "grad_norm": 18.19260025024414, "learning_rate": 6.064415489080566e-06, "loss": 0.5332, "num_input_tokens_seen": 14258928, "step": 6610 }, { "epoch": 1.2139842172875757, "grad_norm": 26.184797286987305, "learning_rate": 6.0690034868783266e-06, "loss": 0.4248, "num_input_tokens_seen": 14269488, "step": 6615 }, { "epoch": 1.214901816847128, "grad_norm": 13.343894958496094, "learning_rate": 6.073591484676088e-06, "loss": 0.3318, "num_input_tokens_seen": 14280656, "step": 6620 }, { "epoch": 1.21581941640668, "grad_norm": 2.6368277072906494, "learning_rate": 6.078179482473849e-06, "loss": 0.3551, "num_input_tokens_seen": 14291952, "step": 6625 }, { "epoch": 1.2167370159662323, "grad_norm": 4.337449550628662, "learning_rate": 6.08276748027161e-06, "loss": 0.3344, "num_input_tokens_seen": 14302544, "step": 6630 }, { "epoch": 1.2176546155257846, "grad_norm": 3.7007675170898438, "learning_rate": 6.087355478069371e-06, "loss": 0.3138, "num_input_tokens_seen": 14314448, "step": 6635 }, { "epoch": 1.2185722150853366, "grad_norm": 6.981049537658691, "learning_rate": 6.0919434758671325e-06, "loss": 0.4115, "num_input_tokens_seen": 14324368, "step": 6640 }, { "epoch": 1.219489814644889, "grad_norm": 14.080286026000977, "learning_rate": 6.096531473664893e-06, "loss": 0.4216, "num_input_tokens_seen": 14335952, "step": 6645 }, { "epoch": 1.2204074142044412, "grad_norm": 2.403946876525879, "learning_rate": 6.101119471462654e-06, "loss": 0.3852, "num_input_tokens_seen": 14346480, "step": 6650 }, { "epoch": 1.2213250137639933, "grad_norm": 2.9002726078033447, "learning_rate": 6.105707469260416e-06, "loss": 0.3256, "num_input_tokens_seen": 14357488, "step": 6655 }, { "epoch": 1.2222426133235456, "grad_norm": 4.648468971252441, "learning_rate": 6.110295467058176e-06, "loss": 0.3211, "num_input_tokens_seen": 14366544, "step": 6660 }, { "epoch": 1.2231602128830978, "grad_norm": 3.5672740936279297, "learning_rate": 6.114883464855938e-06, "loss": 0.3122, "num_input_tokens_seen": 14378192, "step": 6665 }, { "epoch": 1.22407781244265, "grad_norm": 5.916459560394287, "learning_rate": 6.119471462653699e-06, "loss": 0.3413, "num_input_tokens_seen": 14389840, "step": 6670 }, { "epoch": 1.2249954120022022, "grad_norm": 3.0136466026306152, "learning_rate": 6.1240594604514595e-06, "loss": 0.3161, "num_input_tokens_seen": 14401552, "step": 6675 }, { "epoch": 1.2259130115617545, "grad_norm": 4.27792501449585, "learning_rate": 6.128647458249221e-06, "loss": 0.3341, "num_input_tokens_seen": 14413360, "step": 6680 }, { "epoch": 1.2268306111213068, "grad_norm": 3.594247817993164, "learning_rate": 6.133235456046982e-06, "loss": 0.3072, "num_input_tokens_seen": 14424656, "step": 6685 }, { "epoch": 1.2277482106808588, "grad_norm": 12.040365219116211, "learning_rate": 6.137823453844742e-06, "loss": 0.3492, "num_input_tokens_seen": 14435376, "step": 6690 }, { "epoch": 1.228665810240411, "grad_norm": 1.9492707252502441, "learning_rate": 6.142411451642504e-06, "loss": 0.4211, "num_input_tokens_seen": 14446480, "step": 6695 }, { "epoch": 1.2295834097999634, "grad_norm": 7.187397003173828, "learning_rate": 6.1469994494402655e-06, "loss": 0.4099, "num_input_tokens_seen": 14456976, "step": 6700 }, { "epoch": 1.2305010093595155, "grad_norm": 8.626126289367676, "learning_rate": 6.151587447238025e-06, "loss": 0.2819, "num_input_tokens_seen": 14466768, "step": 6705 }, { "epoch": 1.2314186089190677, "grad_norm": 4.557368278503418, "learning_rate": 6.1561754450357865e-06, "loss": 0.3231, "num_input_tokens_seen": 14478288, "step": 6710 }, { "epoch": 1.23233620847862, "grad_norm": 5.1729960441589355, "learning_rate": 6.160763442833549e-06, "loss": 0.2701, "num_input_tokens_seen": 14489744, "step": 6715 }, { "epoch": 1.233253808038172, "grad_norm": 2.5711212158203125, "learning_rate": 6.165351440631308e-06, "loss": 0.2812, "num_input_tokens_seen": 14500560, "step": 6720 }, { "epoch": 1.2341714075977244, "grad_norm": 17.426177978515625, "learning_rate": 6.16993943842907e-06, "loss": 0.3032, "num_input_tokens_seen": 14510704, "step": 6725 }, { "epoch": 1.2350890071572767, "grad_norm": 2.731968402862549, "learning_rate": 6.174527436226831e-06, "loss": 0.2631, "num_input_tokens_seen": 14521136, "step": 6730 }, { "epoch": 1.2360066067168287, "grad_norm": 4.350820541381836, "learning_rate": 6.179115434024592e-06, "loss": 0.4316, "num_input_tokens_seen": 14533072, "step": 6735 }, { "epoch": 1.236924206276381, "grad_norm": 2.331613779067993, "learning_rate": 6.183703431822353e-06, "loss": 0.5084, "num_input_tokens_seen": 14544496, "step": 6740 }, { "epoch": 1.2378418058359333, "grad_norm": 4.654799461364746, "learning_rate": 6.188291429620114e-06, "loss": 0.2839, "num_input_tokens_seen": 14555856, "step": 6745 }, { "epoch": 1.2387594053954853, "grad_norm": 6.947691440582275, "learning_rate": 6.192879427417875e-06, "loss": 0.3116, "num_input_tokens_seen": 14566448, "step": 6750 }, { "epoch": 1.2396770049550376, "grad_norm": 10.994080543518066, "learning_rate": 6.197467425215636e-06, "loss": 0.3251, "num_input_tokens_seen": 14577296, "step": 6755 }, { "epoch": 1.24059460451459, "grad_norm": 11.604902267456055, "learning_rate": 6.202055423013398e-06, "loss": 0.2612, "num_input_tokens_seen": 14588784, "step": 6760 }, { "epoch": 1.241512204074142, "grad_norm": 9.63725757598877, "learning_rate": 6.206643420811158e-06, "loss": 0.2925, "num_input_tokens_seen": 14600912, "step": 6765 }, { "epoch": 1.2424298036336943, "grad_norm": 2.1707656383514404, "learning_rate": 6.2112314186089195e-06, "loss": 0.386, "num_input_tokens_seen": 14610736, "step": 6770 }, { "epoch": 1.2433474031932465, "grad_norm": 8.367619514465332, "learning_rate": 6.215819416406681e-06, "loss": 0.4032, "num_input_tokens_seen": 14621616, "step": 6775 }, { "epoch": 1.2442650027527986, "grad_norm": 2.0486342906951904, "learning_rate": 6.220407414204441e-06, "loss": 0.3291, "num_input_tokens_seen": 14632272, "step": 6780 }, { "epoch": 1.245182602312351, "grad_norm": 6.124977111816406, "learning_rate": 6.224995412002203e-06, "loss": 0.359, "num_input_tokens_seen": 14642544, "step": 6785 }, { "epoch": 1.2461002018719032, "grad_norm": 2.973072052001953, "learning_rate": 6.229583409799964e-06, "loss": 0.287, "num_input_tokens_seen": 14652432, "step": 6790 }, { "epoch": 1.2470178014314552, "grad_norm": 3.371272325515747, "learning_rate": 6.2341714075977246e-06, "loss": 0.3916, "num_input_tokens_seen": 14662768, "step": 6795 }, { "epoch": 1.2479354009910075, "grad_norm": 3.8273026943206787, "learning_rate": 6.238759405395486e-06, "loss": 0.3729, "num_input_tokens_seen": 14673520, "step": 6800 }, { "epoch": 1.2488530005505598, "grad_norm": 1.8129969835281372, "learning_rate": 6.243347403193247e-06, "loss": 0.3414, "num_input_tokens_seen": 14684464, "step": 6805 }, { "epoch": 1.2497706001101119, "grad_norm": 11.717060089111328, "learning_rate": 6.247935400991008e-06, "loss": 0.3631, "num_input_tokens_seen": 14694896, "step": 6810 }, { "epoch": 1.2506881996696642, "grad_norm": 3.6163909435272217, "learning_rate": 6.252523398788769e-06, "loss": 0.3074, "num_input_tokens_seen": 14705360, "step": 6815 }, { "epoch": 1.2516057992292164, "grad_norm": 2.9666149616241455, "learning_rate": 6.2571113965865305e-06, "loss": 0.3282, "num_input_tokens_seen": 14716976, "step": 6820 }, { "epoch": 1.2525233987887685, "grad_norm": 8.442380905151367, "learning_rate": 6.261699394384291e-06, "loss": 0.2827, "num_input_tokens_seen": 14727088, "step": 6825 }, { "epoch": 1.2534409983483208, "grad_norm": 3.3563435077667236, "learning_rate": 6.266287392182052e-06, "loss": 0.3328, "num_input_tokens_seen": 14737008, "step": 6830 }, { "epoch": 1.254358597907873, "grad_norm": 2.9041695594787598, "learning_rate": 6.270875389979814e-06, "loss": 0.3257, "num_input_tokens_seen": 14746608, "step": 6835 }, { "epoch": 1.2552761974674251, "grad_norm": 1.8268173933029175, "learning_rate": 6.275463387777574e-06, "loss": 0.2835, "num_input_tokens_seen": 14756624, "step": 6840 }, { "epoch": 1.2561937970269774, "grad_norm": 2.140101909637451, "learning_rate": 6.280051385575336e-06, "loss": 0.256, "num_input_tokens_seen": 14767472, "step": 6845 }, { "epoch": 1.2571113965865297, "grad_norm": 7.164632797241211, "learning_rate": 6.284639383373097e-06, "loss": 0.2855, "num_input_tokens_seen": 14778096, "step": 6850 }, { "epoch": 1.2580289961460818, "grad_norm": 1.9431543350219727, "learning_rate": 6.289227381170857e-06, "loss": 0.3185, "num_input_tokens_seen": 14788848, "step": 6855 }, { "epoch": 1.258946595705634, "grad_norm": 4.983038902282715, "learning_rate": 6.293815378968619e-06, "loss": 0.3506, "num_input_tokens_seen": 14799920, "step": 6860 }, { "epoch": 1.2598641952651863, "grad_norm": 2.164445638656616, "learning_rate": 6.29840337676638e-06, "loss": 0.2385, "num_input_tokens_seen": 14811824, "step": 6865 }, { "epoch": 1.2607817948247386, "grad_norm": 2.411043405532837, "learning_rate": 6.30299137456414e-06, "loss": 0.316, "num_input_tokens_seen": 14821712, "step": 6870 }, { "epoch": 1.2616993943842907, "grad_norm": 4.80349063873291, "learning_rate": 6.307579372361901e-06, "loss": 0.3008, "num_input_tokens_seen": 14831600, "step": 6875 }, { "epoch": 1.262616993943843, "grad_norm": 6.888799667358398, "learning_rate": 6.3121673701596635e-06, "loss": 0.443, "num_input_tokens_seen": 14840592, "step": 6880 }, { "epoch": 1.2635345935033953, "grad_norm": 9.301177978515625, "learning_rate": 6.316755367957423e-06, "loss": 0.3693, "num_input_tokens_seen": 14851088, "step": 6885 }, { "epoch": 1.2644521930629473, "grad_norm": 1.6835193634033203, "learning_rate": 6.3213433657551845e-06, "loss": 0.3713, "num_input_tokens_seen": 14861680, "step": 6890 }, { "epoch": 1.2653697926224996, "grad_norm": 1.9329919815063477, "learning_rate": 6.325931363552946e-06, "loss": 0.3774, "num_input_tokens_seen": 14873104, "step": 6895 }, { "epoch": 1.2662873921820519, "grad_norm": 3.8614864349365234, "learning_rate": 6.330519361350706e-06, "loss": 0.3314, "num_input_tokens_seen": 14883088, "step": 6900 }, { "epoch": 1.267204991741604, "grad_norm": 2.1114614009857178, "learning_rate": 6.335107359148468e-06, "loss": 0.2808, "num_input_tokens_seen": 14892784, "step": 6905 }, { "epoch": 1.2681225913011562, "grad_norm": 1.562658429145813, "learning_rate": 6.339695356946229e-06, "loss": 0.4182, "num_input_tokens_seen": 14902992, "step": 6910 }, { "epoch": 1.2690401908607085, "grad_norm": 5.757858753204346, "learning_rate": 6.34428335474399e-06, "loss": 0.3142, "num_input_tokens_seen": 14913488, "step": 6915 }, { "epoch": 1.2699577904202606, "grad_norm": 2.384592294692993, "learning_rate": 6.348871352541751e-06, "loss": 0.3249, "num_input_tokens_seen": 14924144, "step": 6920 }, { "epoch": 1.2708753899798129, "grad_norm": 1.3457752466201782, "learning_rate": 6.353459350339512e-06, "loss": 0.3635, "num_input_tokens_seen": 14935216, "step": 6925 }, { "epoch": 1.2717929895393651, "grad_norm": 6.853085517883301, "learning_rate": 6.358047348137273e-06, "loss": 0.3579, "num_input_tokens_seen": 14945424, "step": 6930 }, { "epoch": 1.2727105890989172, "grad_norm": 2.2031850814819336, "learning_rate": 6.362635345935034e-06, "loss": 0.306, "num_input_tokens_seen": 14955376, "step": 6935 }, { "epoch": 1.2736281886584695, "grad_norm": 1.70191490650177, "learning_rate": 6.367223343732796e-06, "loss": 0.3314, "num_input_tokens_seen": 14967504, "step": 6940 }, { "epoch": 1.2745457882180218, "grad_norm": 3.3177571296691895, "learning_rate": 6.371811341530557e-06, "loss": 0.3428, "num_input_tokens_seen": 14978288, "step": 6945 }, { "epoch": 1.2754633877775738, "grad_norm": 5.44589900970459, "learning_rate": 6.3763993393283175e-06, "loss": 0.3655, "num_input_tokens_seen": 14988752, "step": 6950 }, { "epoch": 1.2763809873371261, "grad_norm": 5.661803245544434, "learning_rate": 6.380987337126079e-06, "loss": 0.2944, "num_input_tokens_seen": 14998512, "step": 6955 }, { "epoch": 1.2772985868966784, "grad_norm": 2.2477569580078125, "learning_rate": 6.38557533492384e-06, "loss": 0.2842, "num_input_tokens_seen": 15007888, "step": 6960 }, { "epoch": 1.2782161864562305, "grad_norm": 4.096258640289307, "learning_rate": 6.390163332721601e-06, "loss": 0.2551, "num_input_tokens_seen": 15019248, "step": 6965 }, { "epoch": 1.2791337860157828, "grad_norm": 3.1451332569122314, "learning_rate": 6.394751330519362e-06, "loss": 0.357, "num_input_tokens_seen": 15028624, "step": 6970 }, { "epoch": 1.280051385575335, "grad_norm": 2.823143243789673, "learning_rate": 6.3993393283171234e-06, "loss": 0.3011, "num_input_tokens_seen": 15041392, "step": 6975 }, { "epoch": 1.280968985134887, "grad_norm": 3.9962995052337646, "learning_rate": 6.403927326114884e-06, "loss": 0.4293, "num_input_tokens_seen": 15052688, "step": 6980 }, { "epoch": 1.2818865846944394, "grad_norm": 17.4381046295166, "learning_rate": 6.408515323912645e-06, "loss": 0.3898, "num_input_tokens_seen": 15065136, "step": 6985 }, { "epoch": 1.2828041842539917, "grad_norm": 2.0183537006378174, "learning_rate": 6.413103321710407e-06, "loss": 0.538, "num_input_tokens_seen": 15076528, "step": 6990 }, { "epoch": 1.2837217838135437, "grad_norm": 3.592742919921875, "learning_rate": 6.417691319508167e-06, "loss": 0.3019, "num_input_tokens_seen": 15086576, "step": 6995 }, { "epoch": 1.284639383373096, "grad_norm": 2.0098061561584473, "learning_rate": 6.4222793173059286e-06, "loss": 0.3173, "num_input_tokens_seen": 15098288, "step": 7000 }, { "epoch": 1.2855569829326483, "grad_norm": 2.3187153339385986, "learning_rate": 6.42686731510369e-06, "loss": 0.2728, "num_input_tokens_seen": 15108848, "step": 7005 }, { "epoch": 1.2864745824922004, "grad_norm": 1.8123674392700195, "learning_rate": 6.4314553129014504e-06, "loss": 0.3626, "num_input_tokens_seen": 15120112, "step": 7010 }, { "epoch": 1.2873921820517527, "grad_norm": 1.2379310131072998, "learning_rate": 6.436043310699212e-06, "loss": 0.3114, "num_input_tokens_seen": 15130096, "step": 7015 }, { "epoch": 1.288309781611305, "grad_norm": 3.1096112728118896, "learning_rate": 6.440631308496973e-06, "loss": 0.365, "num_input_tokens_seen": 15140464, "step": 7020 }, { "epoch": 1.289227381170857, "grad_norm": 2.2928028106689453, "learning_rate": 6.445219306294734e-06, "loss": 0.307, "num_input_tokens_seen": 15152048, "step": 7025 }, { "epoch": 1.2901449807304093, "grad_norm": 1.7053604125976562, "learning_rate": 6.449807304092495e-06, "loss": 0.2981, "num_input_tokens_seen": 15161392, "step": 7030 }, { "epoch": 1.2910625802899616, "grad_norm": 23.625722885131836, "learning_rate": 6.454395301890256e-06, "loss": 0.3826, "num_input_tokens_seen": 15172240, "step": 7035 }, { "epoch": 1.2919801798495136, "grad_norm": 4.180346965789795, "learning_rate": 6.458983299688016e-06, "loss": 0.3343, "num_input_tokens_seen": 15183120, "step": 7040 }, { "epoch": 1.292897779409066, "grad_norm": 2.1533782482147217, "learning_rate": 6.463571297485777e-06, "loss": 0.3575, "num_input_tokens_seen": 15195056, "step": 7045 }, { "epoch": 1.2938153789686182, "grad_norm": 3.0594639778137207, "learning_rate": 6.46815929528354e-06, "loss": 0.2684, "num_input_tokens_seen": 15204656, "step": 7050 }, { "epoch": 1.2947329785281703, "grad_norm": 2.0091145038604736, "learning_rate": 6.472747293081299e-06, "loss": 0.4252, "num_input_tokens_seen": 15215696, "step": 7055 }, { "epoch": 1.2956505780877225, "grad_norm": 3.4968879222869873, "learning_rate": 6.477335290879061e-06, "loss": 0.5531, "num_input_tokens_seen": 15226352, "step": 7060 }, { "epoch": 1.2965681776472748, "grad_norm": 4.838013172149658, "learning_rate": 6.481923288676822e-06, "loss": 0.2439, "num_input_tokens_seen": 15238000, "step": 7065 }, { "epoch": 1.297485777206827, "grad_norm": 12.563252449035645, "learning_rate": 6.4865112864745825e-06, "loss": 0.3741, "num_input_tokens_seen": 15247920, "step": 7070 }, { "epoch": 1.2984033767663792, "grad_norm": 2.6866672039031982, "learning_rate": 6.491099284272344e-06, "loss": 0.3422, "num_input_tokens_seen": 15258768, "step": 7075 }, { "epoch": 1.2993209763259315, "grad_norm": 2.7169647216796875, "learning_rate": 6.495687282070105e-06, "loss": 0.4457, "num_input_tokens_seen": 15270032, "step": 7080 }, { "epoch": 1.3002385758854835, "grad_norm": 4.3524675369262695, "learning_rate": 6.500275279867866e-06, "loss": 0.3686, "num_input_tokens_seen": 15279664, "step": 7085 }, { "epoch": 1.3011561754450358, "grad_norm": 3.645101547241211, "learning_rate": 6.504863277665627e-06, "loss": 0.3239, "num_input_tokens_seen": 15289744, "step": 7090 }, { "epoch": 1.302073775004588, "grad_norm": 2.7180593013763428, "learning_rate": 6.5094512754633885e-06, "loss": 0.34, "num_input_tokens_seen": 15300528, "step": 7095 }, { "epoch": 1.3029913745641402, "grad_norm": 14.14658260345459, "learning_rate": 6.514039273261149e-06, "loss": 0.3747, "num_input_tokens_seen": 15311536, "step": 7100 }, { "epoch": 1.3039089741236924, "grad_norm": 5.647047519683838, "learning_rate": 6.51862727105891e-06, "loss": 0.3318, "num_input_tokens_seen": 15322288, "step": 7105 }, { "epoch": 1.3048265736832447, "grad_norm": 3.8522908687591553, "learning_rate": 6.523215268856672e-06, "loss": 0.3546, "num_input_tokens_seen": 15333648, "step": 7110 }, { "epoch": 1.3057441732427968, "grad_norm": 2.618959665298462, "learning_rate": 6.527803266654432e-06, "loss": 0.347, "num_input_tokens_seen": 15343824, "step": 7115 }, { "epoch": 1.306661772802349, "grad_norm": 1.814242959022522, "learning_rate": 6.532391264452194e-06, "loss": 0.3288, "num_input_tokens_seen": 15354992, "step": 7120 }, { "epoch": 1.3075793723619014, "grad_norm": 1.973965048789978, "learning_rate": 6.536979262249955e-06, "loss": 0.2919, "num_input_tokens_seen": 15365232, "step": 7125 }, { "epoch": 1.3084969719214534, "grad_norm": 7.496115684509277, "learning_rate": 6.5415672600477155e-06, "loss": 0.3186, "num_input_tokens_seen": 15374096, "step": 7130 }, { "epoch": 1.3094145714810057, "grad_norm": 4.052692413330078, "learning_rate": 6.546155257845477e-06, "loss": 0.362, "num_input_tokens_seen": 15385648, "step": 7135 }, { "epoch": 1.310332171040558, "grad_norm": 3.344250440597534, "learning_rate": 6.550743255643238e-06, "loss": 0.2587, "num_input_tokens_seen": 15394800, "step": 7140 }, { "epoch": 1.31124977060011, "grad_norm": 2.4956138134002686, "learning_rate": 6.555331253440999e-06, "loss": 0.3025, "num_input_tokens_seen": 15406576, "step": 7145 }, { "epoch": 1.3121673701596623, "grad_norm": 17.46871566772461, "learning_rate": 6.55991925123876e-06, "loss": 0.5355, "num_input_tokens_seen": 15417104, "step": 7150 }, { "epoch": 1.3130849697192146, "grad_norm": 1.6613792181015015, "learning_rate": 6.5645072490365215e-06, "loss": 0.3367, "num_input_tokens_seen": 15427664, "step": 7155 }, { "epoch": 1.3140025692787667, "grad_norm": 6.642973899841309, "learning_rate": 6.569095246834282e-06, "loss": 0.332, "num_input_tokens_seen": 15438224, "step": 7160 }, { "epoch": 1.314920168838319, "grad_norm": 3.256213426589966, "learning_rate": 6.573683244632043e-06, "loss": 0.3649, "num_input_tokens_seen": 15449136, "step": 7165 }, { "epoch": 1.3158377683978713, "grad_norm": 2.384446859359741, "learning_rate": 6.578271242429805e-06, "loss": 0.3476, "num_input_tokens_seen": 15458128, "step": 7170 }, { "epoch": 1.3167553679574233, "grad_norm": 2.9860148429870605, "learning_rate": 6.582859240227565e-06, "loss": 0.3124, "num_input_tokens_seen": 15468560, "step": 7175 }, { "epoch": 1.3176729675169756, "grad_norm": 11.982368469238281, "learning_rate": 6.5874472380253266e-06, "loss": 0.39, "num_input_tokens_seen": 15479536, "step": 7180 }, { "epoch": 1.3185905670765279, "grad_norm": 2.0963454246520996, "learning_rate": 6.592035235823088e-06, "loss": 0.2997, "num_input_tokens_seen": 15490640, "step": 7185 }, { "epoch": 1.31950816663608, "grad_norm": 2.1898694038391113, "learning_rate": 6.596623233620848e-06, "loss": 0.2632, "num_input_tokens_seen": 15502480, "step": 7190 }, { "epoch": 1.3204257661956322, "grad_norm": 2.8378994464874268, "learning_rate": 6.60121123141861e-06, "loss": 0.3713, "num_input_tokens_seen": 15512016, "step": 7195 }, { "epoch": 1.3213433657551845, "grad_norm": 8.152722358703613, "learning_rate": 6.605799229216371e-06, "loss": 0.25, "num_input_tokens_seen": 15523120, "step": 7200 }, { "epoch": 1.3222609653147366, "grad_norm": 2.362298011779785, "learning_rate": 6.610387227014131e-06, "loss": 0.3925, "num_input_tokens_seen": 15533648, "step": 7205 }, { "epoch": 1.3231785648742889, "grad_norm": 3.3844170570373535, "learning_rate": 6.614975224811892e-06, "loss": 0.3808, "num_input_tokens_seen": 15544208, "step": 7210 }, { "epoch": 1.3240961644338411, "grad_norm": 20.535133361816406, "learning_rate": 6.619563222609654e-06, "loss": 0.3177, "num_input_tokens_seen": 15554864, "step": 7215 }, { "epoch": 1.3250137639933932, "grad_norm": 1.8787723779678345, "learning_rate": 6.624151220407414e-06, "loss": 0.4073, "num_input_tokens_seen": 15565648, "step": 7220 }, { "epoch": 1.3259313635529455, "grad_norm": 7.0860137939453125, "learning_rate": 6.6287392182051754e-06, "loss": 0.2946, "num_input_tokens_seen": 15577200, "step": 7225 }, { "epoch": 1.3268489631124978, "grad_norm": 2.0126900672912598, "learning_rate": 6.633327216002937e-06, "loss": 0.3383, "num_input_tokens_seen": 15587952, "step": 7230 }, { "epoch": 1.3277665626720498, "grad_norm": 1.8143775463104248, "learning_rate": 6.637915213800697e-06, "loss": 0.2416, "num_input_tokens_seen": 15598960, "step": 7235 }, { "epoch": 1.3286841622316021, "grad_norm": 1.4414414167404175, "learning_rate": 6.642503211598459e-06, "loss": 0.4042, "num_input_tokens_seen": 15610576, "step": 7240 }, { "epoch": 1.3296017617911544, "grad_norm": 1.9169756174087524, "learning_rate": 6.64709120939622e-06, "loss": 0.287, "num_input_tokens_seen": 15620528, "step": 7245 }, { "epoch": 1.3305193613507065, "grad_norm": 7.307530879974365, "learning_rate": 6.6516792071939805e-06, "loss": 0.2884, "num_input_tokens_seen": 15632112, "step": 7250 }, { "epoch": 1.3314369609102588, "grad_norm": 14.036060333251953, "learning_rate": 6.656267204991742e-06, "loss": 0.3571, "num_input_tokens_seen": 15642320, "step": 7255 }, { "epoch": 1.332354560469811, "grad_norm": 3.7450859546661377, "learning_rate": 6.660855202789503e-06, "loss": 0.4054, "num_input_tokens_seen": 15652272, "step": 7260 }, { "epoch": 1.333272160029363, "grad_norm": 3.6141393184661865, "learning_rate": 6.665443200587264e-06, "loss": 0.3278, "num_input_tokens_seen": 15662800, "step": 7265 }, { "epoch": 1.3341897595889154, "grad_norm": 15.398890495300293, "learning_rate": 6.670031198385025e-06, "loss": 0.4535, "num_input_tokens_seen": 15672848, "step": 7270 }, { "epoch": 1.3351073591484677, "grad_norm": 1.9221744537353516, "learning_rate": 6.6746191961827865e-06, "loss": 0.4145, "num_input_tokens_seen": 15684464, "step": 7275 }, { "epoch": 1.3360249587080197, "grad_norm": 1.724312424659729, "learning_rate": 6.679207193980547e-06, "loss": 0.2696, "num_input_tokens_seen": 15694192, "step": 7280 }, { "epoch": 1.336942558267572, "grad_norm": 2.622514247894287, "learning_rate": 6.683795191778308e-06, "loss": 0.3255, "num_input_tokens_seen": 15705424, "step": 7285 }, { "epoch": 1.3378601578271243, "grad_norm": 11.140867233276367, "learning_rate": 6.68838318957607e-06, "loss": 0.3418, "num_input_tokens_seen": 15715952, "step": 7290 }, { "epoch": 1.3387777573866764, "grad_norm": 1.9499459266662598, "learning_rate": 6.69297118737383e-06, "loss": 0.3105, "num_input_tokens_seen": 15726576, "step": 7295 }, { "epoch": 1.3396953569462287, "grad_norm": 1.6665802001953125, "learning_rate": 6.697559185171592e-06, "loss": 0.3911, "num_input_tokens_seen": 15737552, "step": 7300 }, { "epoch": 1.340612956505781, "grad_norm": 1.573358416557312, "learning_rate": 6.702147182969353e-06, "loss": 0.3745, "num_input_tokens_seen": 15749264, "step": 7305 }, { "epoch": 1.341530556065333, "grad_norm": 2.250704050064087, "learning_rate": 6.7067351807671135e-06, "loss": 0.2327, "num_input_tokens_seen": 15761424, "step": 7310 }, { "epoch": 1.3424481556248853, "grad_norm": 1.2972508668899536, "learning_rate": 6.711323178564875e-06, "loss": 0.2717, "num_input_tokens_seen": 15771440, "step": 7315 }, { "epoch": 1.3433657551844376, "grad_norm": 14.170099258422852, "learning_rate": 6.715911176362636e-06, "loss": 0.4454, "num_input_tokens_seen": 15782352, "step": 7320 }, { "epoch": 1.3442833547439896, "grad_norm": 6.464470863342285, "learning_rate": 6.720499174160397e-06, "loss": 0.3875, "num_input_tokens_seen": 15792944, "step": 7325 }, { "epoch": 1.345200954303542, "grad_norm": 2.2590250968933105, "learning_rate": 6.725087171958158e-06, "loss": 0.423, "num_input_tokens_seen": 15804112, "step": 7330 }, { "epoch": 1.3461185538630942, "grad_norm": 7.001298427581787, "learning_rate": 6.7296751697559195e-06, "loss": 0.3132, "num_input_tokens_seen": 15815984, "step": 7335 }, { "epoch": 1.3470361534226463, "grad_norm": 1.5306267738342285, "learning_rate": 6.73426316755368e-06, "loss": 0.3311, "num_input_tokens_seen": 15826896, "step": 7340 }, { "epoch": 1.3479537529821985, "grad_norm": 4.283219814300537, "learning_rate": 6.738851165351441e-06, "loss": 0.2669, "num_input_tokens_seen": 15838128, "step": 7345 }, { "epoch": 1.3488713525417508, "grad_norm": 3.0459492206573486, "learning_rate": 6.743439163149203e-06, "loss": 0.3229, "num_input_tokens_seen": 15849680, "step": 7350 }, { "epoch": 1.349788952101303, "grad_norm": 6.236119270324707, "learning_rate": 6.748027160946962e-06, "loss": 0.306, "num_input_tokens_seen": 15859984, "step": 7355 }, { "epoch": 1.3507065516608552, "grad_norm": 5.158218860626221, "learning_rate": 6.752615158744725e-06, "loss": 0.3952, "num_input_tokens_seen": 15870832, "step": 7360 }, { "epoch": 1.3516241512204075, "grad_norm": 2.41437029838562, "learning_rate": 6.757203156542486e-06, "loss": 0.2905, "num_input_tokens_seen": 15882288, "step": 7365 }, { "epoch": 1.3525417507799595, "grad_norm": 2.5641820430755615, "learning_rate": 6.761791154340247e-06, "loss": 0.2796, "num_input_tokens_seen": 15893232, "step": 7370 }, { "epoch": 1.3534593503395118, "grad_norm": 2.29020357131958, "learning_rate": 6.766379152138007e-06, "loss": 0.3887, "num_input_tokens_seen": 15904624, "step": 7375 }, { "epoch": 1.354376949899064, "grad_norm": 2.3843743801116943, "learning_rate": 6.770967149935769e-06, "loss": 0.2843, "num_input_tokens_seen": 15915600, "step": 7380 }, { "epoch": 1.3552945494586162, "grad_norm": 9.949136734008789, "learning_rate": 6.7755551477335305e-06, "loss": 0.4053, "num_input_tokens_seen": 15925968, "step": 7385 }, { "epoch": 1.3562121490181684, "grad_norm": 3.562915086746216, "learning_rate": 6.78014314553129e-06, "loss": 0.3977, "num_input_tokens_seen": 15934768, "step": 7390 }, { "epoch": 1.3571297485777207, "grad_norm": 7.6063618659973145, "learning_rate": 6.7847311433290516e-06, "loss": 0.3642, "num_input_tokens_seen": 15945488, "step": 7395 }, { "epoch": 1.3580473481372728, "grad_norm": 3.8237199783325195, "learning_rate": 6.789319141126813e-06, "loss": 0.3814, "num_input_tokens_seen": 15954672, "step": 7400 }, { "epoch": 1.358964947696825, "grad_norm": 7.748843193054199, "learning_rate": 6.7939071389245734e-06, "loss": 0.3527, "num_input_tokens_seen": 15965232, "step": 7405 }, { "epoch": 1.3598825472563774, "grad_norm": 2.9471824169158936, "learning_rate": 6.798495136722335e-06, "loss": 0.3279, "num_input_tokens_seen": 15976624, "step": 7410 }, { "epoch": 1.3608001468159294, "grad_norm": 3.712066173553467, "learning_rate": 6.803083134520096e-06, "loss": 0.3174, "num_input_tokens_seen": 15986576, "step": 7415 }, { "epoch": 1.3617177463754817, "grad_norm": 3.4432666301727295, "learning_rate": 6.807671132317857e-06, "loss": 0.3898, "num_input_tokens_seen": 15997360, "step": 7420 }, { "epoch": 1.362635345935034, "grad_norm": 1.746205449104309, "learning_rate": 6.812259130115618e-06, "loss": 0.281, "num_input_tokens_seen": 16009520, "step": 7425 }, { "epoch": 1.363552945494586, "grad_norm": 2.756671905517578, "learning_rate": 6.816847127913379e-06, "loss": 0.3441, "num_input_tokens_seen": 16020720, "step": 7430 }, { "epoch": 1.3644705450541383, "grad_norm": 1.8700761795043945, "learning_rate": 6.82143512571114e-06, "loss": 0.2964, "num_input_tokens_seen": 16031248, "step": 7435 }, { "epoch": 1.3653881446136906, "grad_norm": 1.6976518630981445, "learning_rate": 6.826023123508901e-06, "loss": 0.3275, "num_input_tokens_seen": 16042224, "step": 7440 }, { "epoch": 1.3663057441732427, "grad_norm": 1.5019577741622925, "learning_rate": 6.830611121306663e-06, "loss": 0.3195, "num_input_tokens_seen": 16053648, "step": 7445 }, { "epoch": 1.367223343732795, "grad_norm": 1.3976763486862183, "learning_rate": 6.835199119104423e-06, "loss": 0.2988, "num_input_tokens_seen": 16065264, "step": 7450 }, { "epoch": 1.3681409432923473, "grad_norm": 10.90492057800293, "learning_rate": 6.8397871169021845e-06, "loss": 0.351, "num_input_tokens_seen": 16075728, "step": 7455 }, { "epoch": 1.3690585428518993, "grad_norm": 6.4196085929870605, "learning_rate": 6.844375114699946e-06, "loss": 0.3474, "num_input_tokens_seen": 16086032, "step": 7460 }, { "epoch": 1.3699761424114516, "grad_norm": 9.089536666870117, "learning_rate": 6.848963112497706e-06, "loss": 0.3486, "num_input_tokens_seen": 16097552, "step": 7465 }, { "epoch": 1.3708937419710039, "grad_norm": 3.5294857025146484, "learning_rate": 6.853551110295468e-06, "loss": 0.4495, "num_input_tokens_seen": 16107376, "step": 7470 }, { "epoch": 1.371811341530556, "grad_norm": 4.334187030792236, "learning_rate": 6.858139108093229e-06, "loss": 0.3507, "num_input_tokens_seen": 16118768, "step": 7475 }, { "epoch": 1.3727289410901082, "grad_norm": 1.420762538909912, "learning_rate": 6.86272710589099e-06, "loss": 0.3467, "num_input_tokens_seen": 16129936, "step": 7480 }, { "epoch": 1.3736465406496605, "grad_norm": 3.84822154045105, "learning_rate": 6.867315103688751e-06, "loss": 0.3212, "num_input_tokens_seen": 16141360, "step": 7485 }, { "epoch": 1.3745641402092126, "grad_norm": 1.5016814470291138, "learning_rate": 6.871903101486512e-06, "loss": 0.3868, "num_input_tokens_seen": 16153520, "step": 7490 }, { "epoch": 1.3754817397687649, "grad_norm": 1.4219427108764648, "learning_rate": 6.876491099284273e-06, "loss": 0.2945, "num_input_tokens_seen": 16163504, "step": 7495 }, { "epoch": 1.3763993393283172, "grad_norm": 2.113538980484009, "learning_rate": 6.881079097082034e-06, "loss": 0.2896, "num_input_tokens_seen": 16173872, "step": 7500 }, { "epoch": 1.3773169388878692, "grad_norm": 1.409751534461975, "learning_rate": 6.885667094879796e-06, "loss": 0.2794, "num_input_tokens_seen": 16185232, "step": 7505 }, { "epoch": 1.3782345384474215, "grad_norm": 2.3141098022460938, "learning_rate": 6.890255092677556e-06, "loss": 0.324, "num_input_tokens_seen": 16196752, "step": 7510 }, { "epoch": 1.3791521380069738, "grad_norm": 3.932119607925415, "learning_rate": 6.8948430904753175e-06, "loss": 0.2485, "num_input_tokens_seen": 16207152, "step": 7515 }, { "epoch": 1.3800697375665258, "grad_norm": 2.7763400077819824, "learning_rate": 6.899431088273079e-06, "loss": 0.3978, "num_input_tokens_seen": 16217904, "step": 7520 }, { "epoch": 1.3809873371260781, "grad_norm": 4.703010559082031, "learning_rate": 6.9040190860708385e-06, "loss": 0.2857, "num_input_tokens_seen": 16229616, "step": 7525 }, { "epoch": 1.3819049366856304, "grad_norm": 4.439111709594727, "learning_rate": 6.908607083868601e-06, "loss": 0.4235, "num_input_tokens_seen": 16241360, "step": 7530 }, { "epoch": 1.3828225362451825, "grad_norm": 1.542271375656128, "learning_rate": 6.913195081666362e-06, "loss": 0.2597, "num_input_tokens_seen": 16251344, "step": 7535 }, { "epoch": 1.3837401358047348, "grad_norm": 17.623794555664062, "learning_rate": 6.917783079464122e-06, "loss": 0.3786, "num_input_tokens_seen": 16261968, "step": 7540 }, { "epoch": 1.384657735364287, "grad_norm": 2.8210866451263428, "learning_rate": 6.922371077261883e-06, "loss": 0.388, "num_input_tokens_seen": 16273328, "step": 7545 }, { "epoch": 1.385575334923839, "grad_norm": 1.8874080181121826, "learning_rate": 6.926959075059645e-06, "loss": 0.3725, "num_input_tokens_seen": 16283280, "step": 7550 }, { "epoch": 1.3864929344833914, "grad_norm": 15.033286094665527, "learning_rate": 6.931547072857405e-06, "loss": 0.3993, "num_input_tokens_seen": 16294608, "step": 7555 }, { "epoch": 1.3874105340429437, "grad_norm": 2.8282992839813232, "learning_rate": 6.936135070655166e-06, "loss": 0.3256, "num_input_tokens_seen": 16304624, "step": 7560 }, { "epoch": 1.3883281336024957, "grad_norm": 2.725085973739624, "learning_rate": 6.940723068452928e-06, "loss": 0.304, "num_input_tokens_seen": 16316816, "step": 7565 }, { "epoch": 1.389245733162048, "grad_norm": 6.369361400604248, "learning_rate": 6.945311066250688e-06, "loss": 0.4061, "num_input_tokens_seen": 16326800, "step": 7570 }, { "epoch": 1.3901633327216003, "grad_norm": 1.2441433668136597, "learning_rate": 6.94989906404845e-06, "loss": 0.3152, "num_input_tokens_seen": 16338512, "step": 7575 }, { "epoch": 1.3910809322811524, "grad_norm": 3.7128891944885254, "learning_rate": 6.954487061846211e-06, "loss": 0.2427, "num_input_tokens_seen": 16349104, "step": 7580 }, { "epoch": 1.3919985318407047, "grad_norm": 3.773226022720337, "learning_rate": 6.9590750596439715e-06, "loss": 0.3306, "num_input_tokens_seen": 16360944, "step": 7585 }, { "epoch": 1.392916131400257, "grad_norm": 1.7805392742156982, "learning_rate": 6.963663057441733e-06, "loss": 0.2626, "num_input_tokens_seen": 16371696, "step": 7590 }, { "epoch": 1.393833730959809, "grad_norm": 1.550970196723938, "learning_rate": 6.968251055239494e-06, "loss": 0.2573, "num_input_tokens_seen": 16380720, "step": 7595 }, { "epoch": 1.3947513305193613, "grad_norm": 1.6220903396606445, "learning_rate": 6.972839053037255e-06, "loss": 0.2979, "num_input_tokens_seen": 16392016, "step": 7600 }, { "epoch": 1.3956689300789136, "grad_norm": 2.996131420135498, "learning_rate": 6.977427050835016e-06, "loss": 0.3141, "num_input_tokens_seen": 16402224, "step": 7605 }, { "epoch": 1.3965865296384659, "grad_norm": 9.355862617492676, "learning_rate": 6.982015048632777e-06, "loss": 0.3637, "num_input_tokens_seen": 16411728, "step": 7610 }, { "epoch": 1.397504129198018, "grad_norm": 4.677515029907227, "learning_rate": 6.986603046430538e-06, "loss": 0.277, "num_input_tokens_seen": 16421552, "step": 7615 }, { "epoch": 1.3984217287575702, "grad_norm": 1.3704792261123657, "learning_rate": 6.991191044228299e-06, "loss": 0.3113, "num_input_tokens_seen": 16433488, "step": 7620 }, { "epoch": 1.3993393283171225, "grad_norm": 1.9783316850662231, "learning_rate": 6.995779042026061e-06, "loss": 0.4003, "num_input_tokens_seen": 16444592, "step": 7625 }, { "epoch": 1.4002569278766746, "grad_norm": 10.600567817687988, "learning_rate": 7.000367039823821e-06, "loss": 0.3127, "num_input_tokens_seen": 16456336, "step": 7630 }, { "epoch": 1.4011745274362268, "grad_norm": 2.198787212371826, "learning_rate": 7.0049550376215825e-06, "loss": 0.3903, "num_input_tokens_seen": 16467600, "step": 7635 }, { "epoch": 1.4020921269957791, "grad_norm": 1.4255337715148926, "learning_rate": 7.009543035419344e-06, "loss": 0.3711, "num_input_tokens_seen": 16479376, "step": 7640 }, { "epoch": 1.4030097265553312, "grad_norm": 2.8532750606536865, "learning_rate": 7.014131033217104e-06, "loss": 0.2831, "num_input_tokens_seen": 16491376, "step": 7645 }, { "epoch": 1.4039273261148835, "grad_norm": 2.3009848594665527, "learning_rate": 7.018719031014866e-06, "loss": 0.4088, "num_input_tokens_seen": 16502768, "step": 7650 }, { "epoch": 1.4048449256744358, "grad_norm": 2.4864377975463867, "learning_rate": 7.023307028812627e-06, "loss": 0.2982, "num_input_tokens_seen": 16513328, "step": 7655 }, { "epoch": 1.4057625252339878, "grad_norm": 6.402119159698486, "learning_rate": 7.027895026610388e-06, "loss": 0.2793, "num_input_tokens_seen": 16525520, "step": 7660 }, { "epoch": 1.40668012479354, "grad_norm": 1.5278242826461792, "learning_rate": 7.032483024408149e-06, "loss": 0.376, "num_input_tokens_seen": 16536784, "step": 7665 }, { "epoch": 1.4075977243530924, "grad_norm": 1.468542218208313, "learning_rate": 7.03707102220591e-06, "loss": 0.274, "num_input_tokens_seen": 16547408, "step": 7670 }, { "epoch": 1.4085153239126444, "grad_norm": 2.043438196182251, "learning_rate": 7.041659020003671e-06, "loss": 0.2508, "num_input_tokens_seen": 16557584, "step": 7675 }, { "epoch": 1.4094329234721967, "grad_norm": 7.814321517944336, "learning_rate": 7.046247017801432e-06, "loss": 0.4304, "num_input_tokens_seen": 16568656, "step": 7680 }, { "epoch": 1.410350523031749, "grad_norm": 7.145325183868408, "learning_rate": 7.050835015599194e-06, "loss": 0.345, "num_input_tokens_seen": 16579984, "step": 7685 }, { "epoch": 1.411268122591301, "grad_norm": 2.557448148727417, "learning_rate": 7.055423013396953e-06, "loss": 0.3281, "num_input_tokens_seen": 16591280, "step": 7690 }, { "epoch": 1.4121857221508534, "grad_norm": 22.295373916625977, "learning_rate": 7.0600110111947155e-06, "loss": 0.4095, "num_input_tokens_seen": 16602480, "step": 7695 }, { "epoch": 1.4131033217104056, "grad_norm": 2.736768960952759, "learning_rate": 7.064599008992477e-06, "loss": 0.2841, "num_input_tokens_seen": 16613360, "step": 7700 }, { "epoch": 1.4140209212699577, "grad_norm": 2.425382375717163, "learning_rate": 7.0691870067902365e-06, "loss": 0.3338, "num_input_tokens_seen": 16624240, "step": 7705 }, { "epoch": 1.41493852082951, "grad_norm": 4.245667457580566, "learning_rate": 7.073775004587998e-06, "loss": 0.3184, "num_input_tokens_seen": 16634192, "step": 7710 }, { "epoch": 1.4158561203890623, "grad_norm": 6.804183006286621, "learning_rate": 7.07836300238576e-06, "loss": 0.4026, "num_input_tokens_seen": 16644240, "step": 7715 }, { "epoch": 1.4167737199486146, "grad_norm": 5.964150905609131, "learning_rate": 7.08295100018352e-06, "loss": 0.3095, "num_input_tokens_seen": 16657424, "step": 7720 }, { "epoch": 1.4176913195081666, "grad_norm": 2.321816921234131, "learning_rate": 7.087538997981281e-06, "loss": 0.3315, "num_input_tokens_seen": 16668112, "step": 7725 }, { "epoch": 1.418608919067719, "grad_norm": 2.2875940799713135, "learning_rate": 7.0921269957790425e-06, "loss": 0.2613, "num_input_tokens_seen": 16678864, "step": 7730 }, { "epoch": 1.4195265186272712, "grad_norm": 3.5706491470336914, "learning_rate": 7.096714993576803e-06, "loss": 0.3369, "num_input_tokens_seen": 16690160, "step": 7735 }, { "epoch": 1.4204441181868233, "grad_norm": 4.058722972869873, "learning_rate": 7.101302991374564e-06, "loss": 0.3332, "num_input_tokens_seen": 16701008, "step": 7740 }, { "epoch": 1.4213617177463755, "grad_norm": 8.653152465820312, "learning_rate": 7.105890989172326e-06, "loss": 0.4026, "num_input_tokens_seen": 16711696, "step": 7745 }, { "epoch": 1.4222793173059278, "grad_norm": 1.0664446353912354, "learning_rate": 7.110478986970086e-06, "loss": 0.2999, "num_input_tokens_seen": 16722608, "step": 7750 }, { "epoch": 1.4231969168654799, "grad_norm": 3.501211404800415, "learning_rate": 7.115066984767848e-06, "loss": 0.3725, "num_input_tokens_seen": 16732240, "step": 7755 }, { "epoch": 1.4241145164250322, "grad_norm": 3.6326353549957275, "learning_rate": 7.119654982565609e-06, "loss": 0.3163, "num_input_tokens_seen": 16743312, "step": 7760 }, { "epoch": 1.4250321159845845, "grad_norm": 2.7553296089172363, "learning_rate": 7.1242429803633695e-06, "loss": 0.3081, "num_input_tokens_seen": 16753328, "step": 7765 }, { "epoch": 1.4259497155441365, "grad_norm": 3.1902616024017334, "learning_rate": 7.128830978161131e-06, "loss": 0.3519, "num_input_tokens_seen": 16764400, "step": 7770 }, { "epoch": 1.4268673151036888, "grad_norm": 2.7488620281219482, "learning_rate": 7.133418975958892e-06, "loss": 0.3009, "num_input_tokens_seen": 16774096, "step": 7775 }, { "epoch": 1.427784914663241, "grad_norm": 1.7460299730300903, "learning_rate": 7.138006973756653e-06, "loss": 0.3948, "num_input_tokens_seen": 16784432, "step": 7780 }, { "epoch": 1.4287025142227932, "grad_norm": 10.863037109375, "learning_rate": 7.142594971554414e-06, "loss": 0.359, "num_input_tokens_seen": 16795952, "step": 7785 }, { "epoch": 1.4296201137823454, "grad_norm": 20.19995880126953, "learning_rate": 7.1471829693521754e-06, "loss": 0.3844, "num_input_tokens_seen": 16807664, "step": 7790 }, { "epoch": 1.4305377133418977, "grad_norm": 2.050694704055786, "learning_rate": 7.151770967149937e-06, "loss": 0.2497, "num_input_tokens_seen": 16818960, "step": 7795 }, { "epoch": 1.4314553129014498, "grad_norm": 3.974470853805542, "learning_rate": 7.156358964947697e-06, "loss": 0.3397, "num_input_tokens_seen": 16829808, "step": 7800 }, { "epoch": 1.432372912461002, "grad_norm": 1.9596517086029053, "learning_rate": 7.160946962745459e-06, "loss": 0.4538, "num_input_tokens_seen": 16841104, "step": 7805 }, { "epoch": 1.4332905120205544, "grad_norm": 14.091232299804688, "learning_rate": 7.16553496054322e-06, "loss": 0.414, "num_input_tokens_seen": 16850960, "step": 7810 }, { "epoch": 1.4342081115801064, "grad_norm": 2.0299952030181885, "learning_rate": 7.1701229583409806e-06, "loss": 0.2913, "num_input_tokens_seen": 16861168, "step": 7815 }, { "epoch": 1.4351257111396587, "grad_norm": 3.068471670150757, "learning_rate": 7.174710956138742e-06, "loss": 0.4513, "num_input_tokens_seen": 16871600, "step": 7820 }, { "epoch": 1.436043310699211, "grad_norm": 4.657344341278076, "learning_rate": 7.179298953936503e-06, "loss": 0.4063, "num_input_tokens_seen": 16884336, "step": 7825 }, { "epoch": 1.436960910258763, "grad_norm": 2.357992649078369, "learning_rate": 7.183886951734264e-06, "loss": 0.2829, "num_input_tokens_seen": 16895280, "step": 7830 }, { "epoch": 1.4378785098183153, "grad_norm": 1.1908023357391357, "learning_rate": 7.188474949532025e-06, "loss": 0.3292, "num_input_tokens_seen": 16905584, "step": 7835 }, { "epoch": 1.4387961093778676, "grad_norm": 4.208125591278076, "learning_rate": 7.1930629473297865e-06, "loss": 0.275, "num_input_tokens_seen": 16915696, "step": 7840 }, { "epoch": 1.4397137089374197, "grad_norm": 1.607274055480957, "learning_rate": 7.197650945127547e-06, "loss": 0.3389, "num_input_tokens_seen": 16928080, "step": 7845 }, { "epoch": 1.440631308496972, "grad_norm": 1.1741985082626343, "learning_rate": 7.202238942925308e-06, "loss": 0.3211, "num_input_tokens_seen": 16939472, "step": 7850 }, { "epoch": 1.4415489080565242, "grad_norm": 6.038405418395996, "learning_rate": 7.20682694072307e-06, "loss": 0.2714, "num_input_tokens_seen": 16951440, "step": 7855 }, { "epoch": 1.4424665076160763, "grad_norm": 1.7551547288894653, "learning_rate": 7.21141493852083e-06, "loss": 0.3904, "num_input_tokens_seen": 16962992, "step": 7860 }, { "epoch": 1.4433841071756286, "grad_norm": 3.9874672889709473, "learning_rate": 7.216002936318592e-06, "loss": 0.33, "num_input_tokens_seen": 16973200, "step": 7865 }, { "epoch": 1.4443017067351809, "grad_norm": 1.4651423692703247, "learning_rate": 7.220590934116353e-06, "loss": 0.3719, "num_input_tokens_seen": 16983824, "step": 7870 }, { "epoch": 1.445219306294733, "grad_norm": 1.338280200958252, "learning_rate": 7.225178931914113e-06, "loss": 0.325, "num_input_tokens_seen": 16994512, "step": 7875 }, { "epoch": 1.4461369058542852, "grad_norm": 2.1567015647888184, "learning_rate": 7.229766929711874e-06, "loss": 0.3327, "num_input_tokens_seen": 17004336, "step": 7880 }, { "epoch": 1.4470545054138375, "grad_norm": 2.3927555084228516, "learning_rate": 7.234354927509636e-06, "loss": 0.2588, "num_input_tokens_seen": 17015120, "step": 7885 }, { "epoch": 1.4479721049733896, "grad_norm": 2.9379069805145264, "learning_rate": 7.238942925307396e-06, "loss": 0.3332, "num_input_tokens_seen": 17023984, "step": 7890 }, { "epoch": 1.4488897045329419, "grad_norm": 2.6450862884521484, "learning_rate": 7.243530923105157e-06, "loss": 0.303, "num_input_tokens_seen": 17035088, "step": 7895 }, { "epoch": 1.4498073040924941, "grad_norm": 0.9866490364074707, "learning_rate": 7.248118920902919e-06, "loss": 0.3075, "num_input_tokens_seen": 17045360, "step": 7900 }, { "epoch": 1.4507249036520462, "grad_norm": 4.565676689147949, "learning_rate": 7.252706918700679e-06, "loss": 0.3174, "num_input_tokens_seen": 17056272, "step": 7905 }, { "epoch": 1.4516425032115985, "grad_norm": 2.8287413120269775, "learning_rate": 7.2572949164984405e-06, "loss": 0.2914, "num_input_tokens_seen": 17066864, "step": 7910 }, { "epoch": 1.4525601027711508, "grad_norm": 11.627795219421387, "learning_rate": 7.261882914296202e-06, "loss": 0.4042, "num_input_tokens_seen": 17076944, "step": 7915 }, { "epoch": 1.4534777023307028, "grad_norm": 3.8986334800720215, "learning_rate": 7.266470912093962e-06, "loss": 0.3896, "num_input_tokens_seen": 17088048, "step": 7920 }, { "epoch": 1.4543953018902551, "grad_norm": 3.0610392093658447, "learning_rate": 7.271058909891724e-06, "loss": 0.3212, "num_input_tokens_seen": 17098320, "step": 7925 }, { "epoch": 1.4553129014498074, "grad_norm": 1.7188215255737305, "learning_rate": 7.275646907689485e-06, "loss": 0.2419, "num_input_tokens_seen": 17109520, "step": 7930 }, { "epoch": 1.4562305010093595, "grad_norm": 1.5894238948822021, "learning_rate": 7.280234905487246e-06, "loss": 0.5336, "num_input_tokens_seen": 17120688, "step": 7935 }, { "epoch": 1.4571481005689118, "grad_norm": 3.70831036567688, "learning_rate": 7.284822903285007e-06, "loss": 0.3394, "num_input_tokens_seen": 17131088, "step": 7940 }, { "epoch": 1.458065700128464, "grad_norm": 8.228182792663574, "learning_rate": 7.289410901082768e-06, "loss": 0.3129, "num_input_tokens_seen": 17141232, "step": 7945 }, { "epoch": 1.458983299688016, "grad_norm": 2.5264487266540527, "learning_rate": 7.293998898880529e-06, "loss": 0.3406, "num_input_tokens_seen": 17152080, "step": 7950 }, { "epoch": 1.4599008992475684, "grad_norm": 2.7947373390197754, "learning_rate": 7.29858689667829e-06, "loss": 0.3294, "num_input_tokens_seen": 17163856, "step": 7955 }, { "epoch": 1.4608184988071207, "grad_norm": 23.949337005615234, "learning_rate": 7.3031748944760516e-06, "loss": 0.3216, "num_input_tokens_seen": 17175056, "step": 7960 }, { "epoch": 1.4617360983666727, "grad_norm": 2.1503195762634277, "learning_rate": 7.307762892273812e-06, "loss": 0.4505, "num_input_tokens_seen": 17186160, "step": 7965 }, { "epoch": 1.462653697926225, "grad_norm": 16.23208999633789, "learning_rate": 7.3123508900715735e-06, "loss": 0.4387, "num_input_tokens_seen": 17196432, "step": 7970 }, { "epoch": 1.4635712974857773, "grad_norm": 2.3410980701446533, "learning_rate": 7.316938887869335e-06, "loss": 0.4005, "num_input_tokens_seen": 17207504, "step": 7975 }, { "epoch": 1.4644888970453294, "grad_norm": 4.6453728675842285, "learning_rate": 7.321526885667095e-06, "loss": 0.4654, "num_input_tokens_seen": 17217808, "step": 7980 }, { "epoch": 1.4654064966048816, "grad_norm": 2.00947904586792, "learning_rate": 7.326114883464857e-06, "loss": 0.2119, "num_input_tokens_seen": 17228880, "step": 7985 }, { "epoch": 1.466324096164434, "grad_norm": 2.718585729598999, "learning_rate": 7.330702881262618e-06, "loss": 0.367, "num_input_tokens_seen": 17239984, "step": 7990 }, { "epoch": 1.467241695723986, "grad_norm": 2.705108642578125, "learning_rate": 7.3352908790603786e-06, "loss": 0.3423, "num_input_tokens_seen": 17250992, "step": 7995 }, { "epoch": 1.4681592952835383, "grad_norm": 1.674997329711914, "learning_rate": 7.33987887685814e-06, "loss": 0.3474, "num_input_tokens_seen": 17261840, "step": 8000 }, { "epoch": 1.4690768948430906, "grad_norm": 6.429266929626465, "learning_rate": 7.344466874655901e-06, "loss": 0.3421, "num_input_tokens_seen": 17271056, "step": 8005 }, { "epoch": 1.4699944944026426, "grad_norm": 2.772942066192627, "learning_rate": 7.349054872453662e-06, "loss": 0.2589, "num_input_tokens_seen": 17281904, "step": 8010 }, { "epoch": 1.470912093962195, "grad_norm": 3.39979887008667, "learning_rate": 7.353642870251423e-06, "loss": 0.2988, "num_input_tokens_seen": 17293552, "step": 8015 }, { "epoch": 1.4718296935217472, "grad_norm": 2.300400495529175, "learning_rate": 7.3582308680491845e-06, "loss": 0.3276, "num_input_tokens_seen": 17303344, "step": 8020 }, { "epoch": 1.4727472930812993, "grad_norm": 2.317702531814575, "learning_rate": 7.362818865846944e-06, "loss": 0.4699, "num_input_tokens_seen": 17313328, "step": 8025 }, { "epoch": 1.4736648926408515, "grad_norm": 9.859728813171387, "learning_rate": 7.367406863644706e-06, "loss": 0.4009, "num_input_tokens_seen": 17323792, "step": 8030 }, { "epoch": 1.4745824922004038, "grad_norm": 1.162721872329712, "learning_rate": 7.371994861442468e-06, "loss": 0.3429, "num_input_tokens_seen": 17334992, "step": 8035 }, { "epoch": 1.475500091759956, "grad_norm": 2.4360861778259277, "learning_rate": 7.3765828592402274e-06, "loss": 0.3755, "num_input_tokens_seen": 17345552, "step": 8040 }, { "epoch": 1.4764176913195082, "grad_norm": 2.17679762840271, "learning_rate": 7.381170857037989e-06, "loss": 0.3486, "num_input_tokens_seen": 17356368, "step": 8045 }, { "epoch": 1.4773352908790605, "grad_norm": 2.0230941772460938, "learning_rate": 7.385758854835751e-06, "loss": 0.2952, "num_input_tokens_seen": 17367216, "step": 8050 }, { "epoch": 1.4782528904386125, "grad_norm": 6.83452033996582, "learning_rate": 7.390346852633511e-06, "loss": 0.3864, "num_input_tokens_seen": 17376752, "step": 8055 }, { "epoch": 1.4791704899981648, "grad_norm": 3.957731008529663, "learning_rate": 7.394934850431272e-06, "loss": 0.3544, "num_input_tokens_seen": 17388496, "step": 8060 }, { "epoch": 1.480088089557717, "grad_norm": 3.7799136638641357, "learning_rate": 7.399522848229033e-06, "loss": 0.3685, "num_input_tokens_seen": 17400272, "step": 8065 }, { "epoch": 1.4810056891172692, "grad_norm": 4.991748809814453, "learning_rate": 7.404110846026794e-06, "loss": 0.2742, "num_input_tokens_seen": 17411728, "step": 8070 }, { "epoch": 1.4819232886768214, "grad_norm": 3.826172113418579, "learning_rate": 7.408698843824555e-06, "loss": 0.3416, "num_input_tokens_seen": 17422128, "step": 8075 }, { "epoch": 1.4828408882363737, "grad_norm": 2.5778920650482178, "learning_rate": 7.413286841622317e-06, "loss": 0.3088, "num_input_tokens_seen": 17433264, "step": 8080 }, { "epoch": 1.4837584877959258, "grad_norm": 2.138065814971924, "learning_rate": 7.417874839420077e-06, "loss": 0.3407, "num_input_tokens_seen": 17444496, "step": 8085 }, { "epoch": 1.484676087355478, "grad_norm": 1.5760306119918823, "learning_rate": 7.4224628372178385e-06, "loss": 0.3255, "num_input_tokens_seen": 17454320, "step": 8090 }, { "epoch": 1.4855936869150304, "grad_norm": 3.639362096786499, "learning_rate": 7.4270508350156e-06, "loss": 0.332, "num_input_tokens_seen": 17464464, "step": 8095 }, { "epoch": 1.4865112864745824, "grad_norm": 1.9845917224884033, "learning_rate": 7.43163883281336e-06, "loss": 0.266, "num_input_tokens_seen": 17474544, "step": 8100 }, { "epoch": 1.4874288860341347, "grad_norm": 2.7331793308258057, "learning_rate": 7.436226830611122e-06, "loss": 0.3619, "num_input_tokens_seen": 17484976, "step": 8105 }, { "epoch": 1.488346485593687, "grad_norm": 11.891611099243164, "learning_rate": 7.440814828408883e-06, "loss": 0.3938, "num_input_tokens_seen": 17493552, "step": 8110 }, { "epoch": 1.489264085153239, "grad_norm": 2.1571075916290283, "learning_rate": 7.445402826206644e-06, "loss": 0.2974, "num_input_tokens_seen": 17504432, "step": 8115 }, { "epoch": 1.4901816847127913, "grad_norm": 5.9995436668396, "learning_rate": 7.449990824004405e-06, "loss": 0.3779, "num_input_tokens_seen": 17515216, "step": 8120 }, { "epoch": 1.4910992842723436, "grad_norm": 9.29678726196289, "learning_rate": 7.454578821802166e-06, "loss": 0.2224, "num_input_tokens_seen": 17526480, "step": 8125 }, { "epoch": 1.4920168838318957, "grad_norm": 2.044149398803711, "learning_rate": 7.459166819599927e-06, "loss": 0.2932, "num_input_tokens_seen": 17537168, "step": 8130 }, { "epoch": 1.492934483391448, "grad_norm": 3.543294906616211, "learning_rate": 7.463754817397688e-06, "loss": 0.4218, "num_input_tokens_seen": 17548496, "step": 8135 }, { "epoch": 1.4938520829510002, "grad_norm": 2.497802972793579, "learning_rate": 7.46834281519545e-06, "loss": 0.2623, "num_input_tokens_seen": 17559792, "step": 8140 }, { "epoch": 1.4947696825105523, "grad_norm": 4.348855972290039, "learning_rate": 7.47293081299321e-06, "loss": 0.4048, "num_input_tokens_seen": 17571120, "step": 8145 }, { "epoch": 1.4956872820701046, "grad_norm": 9.831814765930176, "learning_rate": 7.4775188107909715e-06, "loss": 0.3695, "num_input_tokens_seen": 17581392, "step": 8150 }, { "epoch": 1.4966048816296569, "grad_norm": 5.9648237228393555, "learning_rate": 7.482106808588733e-06, "loss": 0.382, "num_input_tokens_seen": 17591696, "step": 8155 }, { "epoch": 1.497522481189209, "grad_norm": 3.802239418029785, "learning_rate": 7.486694806386493e-06, "loss": 0.3194, "num_input_tokens_seen": 17602480, "step": 8160 }, { "epoch": 1.4984400807487612, "grad_norm": 3.29319429397583, "learning_rate": 7.491282804184255e-06, "loss": 0.2727, "num_input_tokens_seen": 17612976, "step": 8165 }, { "epoch": 1.4993576803083135, "grad_norm": 4.033226013183594, "learning_rate": 7.495870801982016e-06, "loss": 0.3211, "num_input_tokens_seen": 17623440, "step": 8170 }, { "epoch": 1.5002752798678656, "grad_norm": 1.898066520690918, "learning_rate": 7.500458799779777e-06, "loss": 0.2595, "num_input_tokens_seen": 17634640, "step": 8175 }, { "epoch": 1.5011928794274179, "grad_norm": 2.227447986602783, "learning_rate": 7.505046797577538e-06, "loss": 0.2537, "num_input_tokens_seen": 17645584, "step": 8180 }, { "epoch": 1.5021104789869701, "grad_norm": 1.9316235780715942, "learning_rate": 7.509634795375299e-06, "loss": 0.2195, "num_input_tokens_seen": 17655472, "step": 8185 }, { "epoch": 1.5030280785465222, "grad_norm": 1.356458306312561, "learning_rate": 7.514222793173059e-06, "loss": 0.3878, "num_input_tokens_seen": 17667504, "step": 8190 }, { "epoch": 1.5039456781060745, "grad_norm": 4.306859016418457, "learning_rate": 7.518810790970821e-06, "loss": 0.4241, "num_input_tokens_seen": 17676240, "step": 8195 }, { "epoch": 1.5048632776656268, "grad_norm": 1.614072561264038, "learning_rate": 7.5233987887685825e-06, "loss": 0.4812, "num_input_tokens_seen": 17685904, "step": 8200 }, { "epoch": 1.5057808772251788, "grad_norm": 3.3108322620391846, "learning_rate": 7.527986786566342e-06, "loss": 0.2727, "num_input_tokens_seen": 17696752, "step": 8205 }, { "epoch": 1.5066984767847311, "grad_norm": 5.356296539306641, "learning_rate": 7.5325747843641036e-06, "loss": 0.3731, "num_input_tokens_seen": 17707888, "step": 8210 }, { "epoch": 1.5076160763442834, "grad_norm": 8.762772560119629, "learning_rate": 7.537162782161866e-06, "loss": 0.3914, "num_input_tokens_seen": 17717808, "step": 8215 }, { "epoch": 1.5085336759038355, "grad_norm": 5.244813442230225, "learning_rate": 7.541750779959627e-06, "loss": 0.2658, "num_input_tokens_seen": 17728304, "step": 8220 }, { "epoch": 1.5094512754633878, "grad_norm": 2.6563541889190674, "learning_rate": 7.546338777757387e-06, "loss": 0.2671, "num_input_tokens_seen": 17739856, "step": 8225 }, { "epoch": 1.51036887502294, "grad_norm": 3.8925650119781494, "learning_rate": 7.550926775555148e-06, "loss": 0.3548, "num_input_tokens_seen": 17751472, "step": 8230 }, { "epoch": 1.511286474582492, "grad_norm": 2.0383338928222656, "learning_rate": 7.5555147733529095e-06, "loss": 0.2635, "num_input_tokens_seen": 17762416, "step": 8235 }, { "epoch": 1.5122040741420444, "grad_norm": 15.740507125854492, "learning_rate": 7.56010277115067e-06, "loss": 0.4091, "num_input_tokens_seen": 17772976, "step": 8240 }, { "epoch": 1.5131216737015967, "grad_norm": 1.9676051139831543, "learning_rate": 7.564690768948431e-06, "loss": 0.383, "num_input_tokens_seen": 17783024, "step": 8245 }, { "epoch": 1.5140392732611487, "grad_norm": 3.589581251144409, "learning_rate": 7.569278766746193e-06, "loss": 0.443, "num_input_tokens_seen": 17793904, "step": 8250 }, { "epoch": 1.514956872820701, "grad_norm": 3.35296368598938, "learning_rate": 7.573866764543953e-06, "loss": 0.2973, "num_input_tokens_seen": 17805232, "step": 8255 }, { "epoch": 1.5158744723802533, "grad_norm": 2.0779271125793457, "learning_rate": 7.578454762341715e-06, "loss": 0.31, "num_input_tokens_seen": 17815760, "step": 8260 }, { "epoch": 1.5167920719398054, "grad_norm": 1.4912794828414917, "learning_rate": 7.583042760139476e-06, "loss": 0.3264, "num_input_tokens_seen": 17826768, "step": 8265 }, { "epoch": 1.5177096714993576, "grad_norm": 1.4271485805511475, "learning_rate": 7.5876307579372365e-06, "loss": 0.2792, "num_input_tokens_seen": 17838608, "step": 8270 }, { "epoch": 1.51862727105891, "grad_norm": 1.6063565015792847, "learning_rate": 7.592218755734998e-06, "loss": 0.3972, "num_input_tokens_seen": 17848304, "step": 8275 }, { "epoch": 1.519544870618462, "grad_norm": 9.05770492553711, "learning_rate": 7.596806753532759e-06, "loss": 0.3595, "num_input_tokens_seen": 17859408, "step": 8280 }, { "epoch": 1.5204624701780143, "grad_norm": 1.6041593551635742, "learning_rate": 7.60139475133052e-06, "loss": 0.3093, "num_input_tokens_seen": 17870448, "step": 8285 }, { "epoch": 1.5213800697375666, "grad_norm": 2.070228099822998, "learning_rate": 7.605982749128281e-06, "loss": 0.3329, "num_input_tokens_seen": 17880816, "step": 8290 }, { "epoch": 1.5222976692971186, "grad_norm": 13.99982738494873, "learning_rate": 7.6105707469260425e-06, "loss": 0.3144, "num_input_tokens_seen": 17892112, "step": 8295 }, { "epoch": 1.523215268856671, "grad_norm": 6.300705909729004, "learning_rate": 7.615158744723803e-06, "loss": 0.3621, "num_input_tokens_seen": 17901744, "step": 8300 }, { "epoch": 1.5241328684162232, "grad_norm": 8.856427192687988, "learning_rate": 7.619746742521564e-06, "loss": 0.404, "num_input_tokens_seen": 17910320, "step": 8305 }, { "epoch": 1.5250504679757753, "grad_norm": 1.3673845529556274, "learning_rate": 7.624334740319326e-06, "loss": 0.2812, "num_input_tokens_seen": 17921552, "step": 8310 }, { "epoch": 1.5259680675353275, "grad_norm": 7.829507350921631, "learning_rate": 7.628922738117086e-06, "loss": 0.3691, "num_input_tokens_seen": 17932784, "step": 8315 }, { "epoch": 1.5268856670948798, "grad_norm": 3.301154136657715, "learning_rate": 7.633510735914847e-06, "loss": 0.2781, "num_input_tokens_seen": 17942800, "step": 8320 }, { "epoch": 1.527803266654432, "grad_norm": 8.71766185760498, "learning_rate": 7.638098733712609e-06, "loss": 0.3668, "num_input_tokens_seen": 17954160, "step": 8325 }, { "epoch": 1.5287208662139842, "grad_norm": 7.989460468292236, "learning_rate": 7.64268673151037e-06, "loss": 0.4037, "num_input_tokens_seen": 17965040, "step": 8330 }, { "epoch": 1.5296384657735365, "grad_norm": 1.6154775619506836, "learning_rate": 7.64727472930813e-06, "loss": 0.3289, "num_input_tokens_seen": 17976688, "step": 8335 }, { "epoch": 1.5305560653330885, "grad_norm": 2.379856824874878, "learning_rate": 7.651862727105892e-06, "loss": 0.2839, "num_input_tokens_seen": 17988080, "step": 8340 }, { "epoch": 1.5314736648926408, "grad_norm": 1.77922523021698, "learning_rate": 7.656450724903653e-06, "loss": 0.3029, "num_input_tokens_seen": 17998608, "step": 8345 }, { "epoch": 1.532391264452193, "grad_norm": 1.5423972606658936, "learning_rate": 7.661038722701413e-06, "loss": 0.3329, "num_input_tokens_seen": 18008464, "step": 8350 }, { "epoch": 1.5333088640117452, "grad_norm": 2.4387853145599365, "learning_rate": 7.665626720499175e-06, "loss": 0.3143, "num_input_tokens_seen": 18020144, "step": 8355 }, { "epoch": 1.5342264635712974, "grad_norm": 1.3698320388793945, "learning_rate": 7.670214718296936e-06, "loss": 0.344, "num_input_tokens_seen": 18029552, "step": 8360 }, { "epoch": 1.5351440631308497, "grad_norm": 3.256122350692749, "learning_rate": 7.674802716094696e-06, "loss": 0.3094, "num_input_tokens_seen": 18039920, "step": 8365 }, { "epoch": 1.5360616626904018, "grad_norm": 8.54688549041748, "learning_rate": 7.679390713892459e-06, "loss": 0.4185, "num_input_tokens_seen": 18051312, "step": 8370 }, { "epoch": 1.536979262249954, "grad_norm": 2.227924346923828, "learning_rate": 7.68397871169022e-06, "loss": 0.3947, "num_input_tokens_seen": 18061456, "step": 8375 }, { "epoch": 1.5378968618095064, "grad_norm": 6.012393474578857, "learning_rate": 7.68856670948798e-06, "loss": 0.2989, "num_input_tokens_seen": 18072816, "step": 8380 }, { "epoch": 1.5388144613690584, "grad_norm": 0.8536925911903381, "learning_rate": 7.693154707285742e-06, "loss": 0.2922, "num_input_tokens_seen": 18083312, "step": 8385 }, { "epoch": 1.5397320609286107, "grad_norm": 3.91467022895813, "learning_rate": 7.697742705083502e-06, "loss": 0.366, "num_input_tokens_seen": 18094064, "step": 8390 }, { "epoch": 1.540649660488163, "grad_norm": 1.2266350984573364, "learning_rate": 7.702330702881263e-06, "loss": 0.4038, "num_input_tokens_seen": 18103632, "step": 8395 }, { "epoch": 1.541567260047715, "grad_norm": 1.1270352602005005, "learning_rate": 7.706918700679025e-06, "loss": 0.2654, "num_input_tokens_seen": 18114032, "step": 8400 }, { "epoch": 1.5424848596072673, "grad_norm": 1.316177487373352, "learning_rate": 7.711506698476786e-06, "loss": 0.3299, "num_input_tokens_seen": 18124496, "step": 8405 }, { "epoch": 1.5434024591668196, "grad_norm": 1.7780777215957642, "learning_rate": 7.716094696274546e-06, "loss": 0.2461, "num_input_tokens_seen": 18133776, "step": 8410 }, { "epoch": 1.5443200587263717, "grad_norm": 0.8954845666885376, "learning_rate": 7.720682694072308e-06, "loss": 0.2952, "num_input_tokens_seen": 18144816, "step": 8415 }, { "epoch": 1.5452376582859242, "grad_norm": 1.591339349746704, "learning_rate": 7.725270691870069e-06, "loss": 0.3571, "num_input_tokens_seen": 18156560, "step": 8420 }, { "epoch": 1.5461552578454762, "grad_norm": 1.0096577405929565, "learning_rate": 7.72985868966783e-06, "loss": 0.3421, "num_input_tokens_seen": 18167248, "step": 8425 }, { "epoch": 1.5470728574050283, "grad_norm": 1.3265920877456665, "learning_rate": 7.734446687465592e-06, "loss": 0.3398, "num_input_tokens_seen": 18178416, "step": 8430 }, { "epoch": 1.5479904569645808, "grad_norm": 7.321253776550293, "learning_rate": 7.739034685263352e-06, "loss": 0.4593, "num_input_tokens_seen": 18190224, "step": 8435 }, { "epoch": 1.5489080565241329, "grad_norm": 1.0397062301635742, "learning_rate": 7.743622683061113e-06, "loss": 0.2311, "num_input_tokens_seen": 18201648, "step": 8440 }, { "epoch": 1.549825656083685, "grad_norm": 1.8799058198928833, "learning_rate": 7.748210680858875e-06, "loss": 0.3907, "num_input_tokens_seen": 18212304, "step": 8445 }, { "epoch": 1.5507432556432375, "grad_norm": 2.635495662689209, "learning_rate": 7.752798678656634e-06, "loss": 0.3214, "num_input_tokens_seen": 18223216, "step": 8450 }, { "epoch": 1.5516608552027895, "grad_norm": 1.9514789581298828, "learning_rate": 7.757386676454396e-06, "loss": 0.3052, "num_input_tokens_seen": 18234576, "step": 8455 }, { "epoch": 1.5525784547623416, "grad_norm": 14.488800048828125, "learning_rate": 7.761974674252158e-06, "loss": 0.4944, "num_input_tokens_seen": 18246192, "step": 8460 }, { "epoch": 1.553496054321894, "grad_norm": 8.234895706176758, "learning_rate": 7.766562672049917e-06, "loss": 0.2589, "num_input_tokens_seen": 18255408, "step": 8465 }, { "epoch": 1.5544136538814461, "grad_norm": 3.6988613605499268, "learning_rate": 7.771150669847679e-06, "loss": 0.4127, "num_input_tokens_seen": 18265968, "step": 8470 }, { "epoch": 1.5553312534409982, "grad_norm": 1.7797632217407227, "learning_rate": 7.77573866764544e-06, "loss": 0.3303, "num_input_tokens_seen": 18274640, "step": 8475 }, { "epoch": 1.5562488530005507, "grad_norm": 2.42751145362854, "learning_rate": 7.7803266654432e-06, "loss": 0.2757, "num_input_tokens_seen": 18285232, "step": 8480 }, { "epoch": 1.5571664525601028, "grad_norm": 3.1936752796173096, "learning_rate": 7.784914663240962e-06, "loss": 0.2909, "num_input_tokens_seen": 18296688, "step": 8485 }, { "epoch": 1.5580840521196548, "grad_norm": 1.5817604064941406, "learning_rate": 7.789502661038723e-06, "loss": 0.4768, "num_input_tokens_seen": 18307952, "step": 8490 }, { "epoch": 1.5590016516792073, "grad_norm": 1.6458418369293213, "learning_rate": 7.794090658836483e-06, "loss": 0.275, "num_input_tokens_seen": 18317904, "step": 8495 }, { "epoch": 1.5599192512387594, "grad_norm": 6.248214244842529, "learning_rate": 7.798678656634246e-06, "loss": 0.3647, "num_input_tokens_seen": 18328080, "step": 8500 }, { "epoch": 1.5608368507983115, "grad_norm": 2.259831666946411, "learning_rate": 7.803266654432006e-06, "loss": 0.3182, "num_input_tokens_seen": 18338832, "step": 8505 }, { "epoch": 1.561754450357864, "grad_norm": 1.463179588317871, "learning_rate": 7.807854652229767e-06, "loss": 0.3887, "num_input_tokens_seen": 18350064, "step": 8510 }, { "epoch": 1.562672049917416, "grad_norm": 6.33369255065918, "learning_rate": 7.812442650027529e-06, "loss": 0.3, "num_input_tokens_seen": 18361392, "step": 8515 }, { "epoch": 1.563589649476968, "grad_norm": 1.8678568601608276, "learning_rate": 7.81703064782529e-06, "loss": 0.234, "num_input_tokens_seen": 18372784, "step": 8520 }, { "epoch": 1.5645072490365206, "grad_norm": 1.748094081878662, "learning_rate": 7.82161864562305e-06, "loss": 0.2649, "num_input_tokens_seen": 18383120, "step": 8525 }, { "epoch": 1.5654248485960727, "grad_norm": 1.8837254047393799, "learning_rate": 7.826206643420812e-06, "loss": 0.2926, "num_input_tokens_seen": 18394256, "step": 8530 }, { "epoch": 1.5663424481556247, "grad_norm": 1.353871464729309, "learning_rate": 7.830794641218573e-06, "loss": 0.314, "num_input_tokens_seen": 18405680, "step": 8535 }, { "epoch": 1.5672600477151772, "grad_norm": 1.4835277795791626, "learning_rate": 7.835382639016333e-06, "loss": 0.3961, "num_input_tokens_seen": 18417104, "step": 8540 }, { "epoch": 1.5681776472747293, "grad_norm": 2.5423784255981445, "learning_rate": 7.839970636814095e-06, "loss": 0.3126, "num_input_tokens_seen": 18428464, "step": 8545 }, { "epoch": 1.5690952468342814, "grad_norm": 0.7857255339622498, "learning_rate": 7.844558634611856e-06, "loss": 0.2848, "num_input_tokens_seen": 18439856, "step": 8550 }, { "epoch": 1.5700128463938339, "grad_norm": 1.5227444171905518, "learning_rate": 7.849146632409616e-06, "loss": 0.2953, "num_input_tokens_seen": 18451600, "step": 8555 }, { "epoch": 1.570930445953386, "grad_norm": 2.1119627952575684, "learning_rate": 7.853734630207379e-06, "loss": 0.3437, "num_input_tokens_seen": 18462032, "step": 8560 }, { "epoch": 1.571848045512938, "grad_norm": 1.724753737449646, "learning_rate": 7.858322628005139e-06, "loss": 0.2315, "num_input_tokens_seen": 18471920, "step": 8565 }, { "epoch": 1.5727656450724905, "grad_norm": 1.4011986255645752, "learning_rate": 7.8629106258029e-06, "loss": 0.2267, "num_input_tokens_seen": 18482288, "step": 8570 }, { "epoch": 1.5736832446320426, "grad_norm": 7.285332679748535, "learning_rate": 7.867498623600662e-06, "loss": 0.3438, "num_input_tokens_seen": 18491984, "step": 8575 }, { "epoch": 1.5746008441915946, "grad_norm": 7.706747531890869, "learning_rate": 7.872086621398422e-06, "loss": 0.3949, "num_input_tokens_seen": 18503056, "step": 8580 }, { "epoch": 1.5755184437511471, "grad_norm": 1.4098687171936035, "learning_rate": 7.876674619196183e-06, "loss": 0.2398, "num_input_tokens_seen": 18513744, "step": 8585 }, { "epoch": 1.5764360433106992, "grad_norm": 5.176730632781982, "learning_rate": 7.881262616993945e-06, "loss": 0.3134, "num_input_tokens_seen": 18523760, "step": 8590 }, { "epoch": 1.5773536428702513, "grad_norm": 4.79651403427124, "learning_rate": 7.885850614791706e-06, "loss": 0.2633, "num_input_tokens_seen": 18535248, "step": 8595 }, { "epoch": 1.5782712424298038, "grad_norm": 15.44127082824707, "learning_rate": 7.890438612589466e-06, "loss": 0.2616, "num_input_tokens_seen": 18545616, "step": 8600 }, { "epoch": 1.5791888419893558, "grad_norm": 6.1689453125, "learning_rate": 7.895026610387228e-06, "loss": 0.291, "num_input_tokens_seen": 18555696, "step": 8605 }, { "epoch": 1.580106441548908, "grad_norm": 5.797368049621582, "learning_rate": 7.899614608184989e-06, "loss": 0.3989, "num_input_tokens_seen": 18565328, "step": 8610 }, { "epoch": 1.5810240411084604, "grad_norm": 2.540584087371826, "learning_rate": 7.90420260598275e-06, "loss": 0.3114, "num_input_tokens_seen": 18576688, "step": 8615 }, { "epoch": 1.5819416406680125, "grad_norm": 13.385600090026855, "learning_rate": 7.90879060378051e-06, "loss": 0.2493, "num_input_tokens_seen": 18585648, "step": 8620 }, { "epoch": 1.5828592402275647, "grad_norm": 4.495770454406738, "learning_rate": 7.913378601578272e-06, "loss": 0.3496, "num_input_tokens_seen": 18598160, "step": 8625 }, { "epoch": 1.583776839787117, "grad_norm": 10.524798393249512, "learning_rate": 7.917966599376033e-06, "loss": 0.3212, "num_input_tokens_seen": 18609040, "step": 8630 }, { "epoch": 1.584694439346669, "grad_norm": 7.9462409019470215, "learning_rate": 7.922554597173793e-06, "loss": 0.4459, "num_input_tokens_seen": 18619856, "step": 8635 }, { "epoch": 1.5856120389062214, "grad_norm": 5.5061421394348145, "learning_rate": 7.927142594971555e-06, "loss": 0.336, "num_input_tokens_seen": 18630896, "step": 8640 }, { "epoch": 1.5865296384657737, "grad_norm": 1.6200288534164429, "learning_rate": 7.931730592769317e-06, "loss": 0.613, "num_input_tokens_seen": 18641552, "step": 8645 }, { "epoch": 1.5874472380253257, "grad_norm": 1.2929048538208008, "learning_rate": 7.936318590567076e-06, "loss": 0.371, "num_input_tokens_seen": 18652336, "step": 8650 }, { "epoch": 1.588364837584878, "grad_norm": 2.3818166255950928, "learning_rate": 7.940906588364839e-06, "loss": 0.2812, "num_input_tokens_seen": 18663664, "step": 8655 }, { "epoch": 1.5892824371444303, "grad_norm": 1.2355320453643799, "learning_rate": 7.945494586162599e-06, "loss": 0.3982, "num_input_tokens_seen": 18674128, "step": 8660 }, { "epoch": 1.5902000367039824, "grad_norm": 2.613450765609741, "learning_rate": 7.95008258396036e-06, "loss": 0.3033, "num_input_tokens_seen": 18686224, "step": 8665 }, { "epoch": 1.5911176362635346, "grad_norm": 18.379701614379883, "learning_rate": 7.954670581758122e-06, "loss": 0.3717, "num_input_tokens_seen": 18695856, "step": 8670 }, { "epoch": 1.592035235823087, "grad_norm": 8.684057235717773, "learning_rate": 7.959258579555882e-06, "loss": 0.3955, "num_input_tokens_seen": 18705680, "step": 8675 }, { "epoch": 1.592952835382639, "grad_norm": 3.231379508972168, "learning_rate": 7.963846577353643e-06, "loss": 0.3011, "num_input_tokens_seen": 18717776, "step": 8680 }, { "epoch": 1.5938704349421913, "grad_norm": 1.380094289779663, "learning_rate": 7.968434575151405e-06, "loss": 0.273, "num_input_tokens_seen": 18728688, "step": 8685 }, { "epoch": 1.5947880345017436, "grad_norm": 2.0931055545806885, "learning_rate": 7.973022572949166e-06, "loss": 0.3376, "num_input_tokens_seen": 18738224, "step": 8690 }, { "epoch": 1.5957056340612956, "grad_norm": 13.711746215820312, "learning_rate": 7.977610570746926e-06, "loss": 0.3652, "num_input_tokens_seen": 18748560, "step": 8695 }, { "epoch": 1.596623233620848, "grad_norm": 7.842366695404053, "learning_rate": 7.982198568544688e-06, "loss": 0.2861, "num_input_tokens_seen": 18759088, "step": 8700 }, { "epoch": 1.5975408331804002, "grad_norm": 2.0753064155578613, "learning_rate": 7.986786566342449e-06, "loss": 0.2957, "num_input_tokens_seen": 18770128, "step": 8705 }, { "epoch": 1.5984584327399523, "grad_norm": 9.014568328857422, "learning_rate": 7.99137456414021e-06, "loss": 0.3754, "num_input_tokens_seen": 18779760, "step": 8710 }, { "epoch": 1.5993760322995045, "grad_norm": 2.8866469860076904, "learning_rate": 7.995962561937971e-06, "loss": 0.3347, "num_input_tokens_seen": 18790640, "step": 8715 }, { "epoch": 1.6002936318590568, "grad_norm": 1.2899448871612549, "learning_rate": 8.000550559735732e-06, "loss": 0.3075, "num_input_tokens_seen": 18801136, "step": 8720 }, { "epoch": 1.6012112314186089, "grad_norm": 6.4347052574157715, "learning_rate": 8.005138557533492e-06, "loss": 0.3975, "num_input_tokens_seen": 18812656, "step": 8725 }, { "epoch": 1.6021288309781612, "grad_norm": 2.0574872493743896, "learning_rate": 8.009726555331255e-06, "loss": 0.3057, "num_input_tokens_seen": 18823568, "step": 8730 }, { "epoch": 1.6030464305377135, "grad_norm": 10.693977355957031, "learning_rate": 8.014314553129015e-06, "loss": 0.2353, "num_input_tokens_seen": 18834224, "step": 8735 }, { "epoch": 1.6039640300972655, "grad_norm": 9.573314666748047, "learning_rate": 8.018902550926776e-06, "loss": 0.308, "num_input_tokens_seen": 18844912, "step": 8740 }, { "epoch": 1.6048816296568178, "grad_norm": 8.618040084838867, "learning_rate": 8.023490548724538e-06, "loss": 0.3776, "num_input_tokens_seen": 18855728, "step": 8745 }, { "epoch": 1.60579922921637, "grad_norm": 1.3297368288040161, "learning_rate": 8.028078546522298e-06, "loss": 0.2954, "num_input_tokens_seen": 18866416, "step": 8750 }, { "epoch": 1.6067168287759221, "grad_norm": 1.2440109252929688, "learning_rate": 8.032666544320059e-06, "loss": 0.3392, "num_input_tokens_seen": 18876976, "step": 8755 }, { "epoch": 1.6076344283354744, "grad_norm": 2.5339303016662598, "learning_rate": 8.037254542117821e-06, "loss": 0.3017, "num_input_tokens_seen": 18888048, "step": 8760 }, { "epoch": 1.6085520278950267, "grad_norm": 6.589540958404541, "learning_rate": 8.041842539915582e-06, "loss": 0.4452, "num_input_tokens_seen": 18899408, "step": 8765 }, { "epoch": 1.6094696274545788, "grad_norm": 1.9060488939285278, "learning_rate": 8.046430537713342e-06, "loss": 0.3441, "num_input_tokens_seen": 18910544, "step": 8770 }, { "epoch": 1.610387227014131, "grad_norm": 1.6136363744735718, "learning_rate": 8.051018535511104e-06, "loss": 0.3069, "num_input_tokens_seen": 18922064, "step": 8775 }, { "epoch": 1.6113048265736833, "grad_norm": 3.9754209518432617, "learning_rate": 8.055606533308865e-06, "loss": 0.3062, "num_input_tokens_seen": 18932624, "step": 8780 }, { "epoch": 1.6122224261332354, "grad_norm": 3.1856439113616943, "learning_rate": 8.060194531106625e-06, "loss": 0.3217, "num_input_tokens_seen": 18943920, "step": 8785 }, { "epoch": 1.6131400256927877, "grad_norm": 2.2865610122680664, "learning_rate": 8.064782528904388e-06, "loss": 0.3092, "num_input_tokens_seen": 18953488, "step": 8790 }, { "epoch": 1.61405762525234, "grad_norm": 2.460811138153076, "learning_rate": 8.069370526702148e-06, "loss": 0.2886, "num_input_tokens_seen": 18963824, "step": 8795 }, { "epoch": 1.614975224811892, "grad_norm": 3.69113826751709, "learning_rate": 8.073958524499909e-06, "loss": 0.3671, "num_input_tokens_seen": 18974736, "step": 8800 }, { "epoch": 1.6158928243714443, "grad_norm": 6.948248386383057, "learning_rate": 8.07854652229767e-06, "loss": 0.3878, "num_input_tokens_seen": 18985520, "step": 8805 }, { "epoch": 1.6168104239309966, "grad_norm": 6.712940692901611, "learning_rate": 8.083134520095431e-06, "loss": 0.3929, "num_input_tokens_seen": 18994576, "step": 8810 }, { "epoch": 1.6177280234905487, "grad_norm": 8.006871223449707, "learning_rate": 8.087722517893192e-06, "loss": 0.3288, "num_input_tokens_seen": 19005072, "step": 8815 }, { "epoch": 1.618645623050101, "grad_norm": 5.500711917877197, "learning_rate": 8.092310515690952e-06, "loss": 0.3437, "num_input_tokens_seen": 19016848, "step": 8820 }, { "epoch": 1.6195632226096532, "grad_norm": 3.5545804500579834, "learning_rate": 8.096898513488715e-06, "loss": 0.3611, "num_input_tokens_seen": 19027664, "step": 8825 }, { "epoch": 1.6204808221692053, "grad_norm": 3.6042137145996094, "learning_rate": 8.101486511286475e-06, "loss": 0.3133, "num_input_tokens_seen": 19039824, "step": 8830 }, { "epoch": 1.6213984217287576, "grad_norm": 2.8110508918762207, "learning_rate": 8.106074509084236e-06, "loss": 0.2372, "num_input_tokens_seen": 19050608, "step": 8835 }, { "epoch": 1.6223160212883099, "grad_norm": 3.552489995956421, "learning_rate": 8.110662506881998e-06, "loss": 0.2819, "num_input_tokens_seen": 19062448, "step": 8840 }, { "epoch": 1.623233620847862, "grad_norm": 3.0060856342315674, "learning_rate": 8.115250504679758e-06, "loss": 0.279, "num_input_tokens_seen": 19072944, "step": 8845 }, { "epoch": 1.6241512204074142, "grad_norm": 3.649948835372925, "learning_rate": 8.119838502477519e-06, "loss": 0.3029, "num_input_tokens_seen": 19084656, "step": 8850 }, { "epoch": 1.6250688199669665, "grad_norm": 8.424897193908691, "learning_rate": 8.124426500275281e-06, "loss": 0.3876, "num_input_tokens_seen": 19096496, "step": 8855 }, { "epoch": 1.6259864195265186, "grad_norm": 19.0126953125, "learning_rate": 8.129014498073042e-06, "loss": 0.4192, "num_input_tokens_seen": 19107632, "step": 8860 }, { "epoch": 1.6269040190860709, "grad_norm": 2.582085132598877, "learning_rate": 8.133602495870802e-06, "loss": 0.3701, "num_input_tokens_seen": 19117488, "step": 8865 }, { "epoch": 1.6278216186456231, "grad_norm": 4.194077968597412, "learning_rate": 8.138190493668564e-06, "loss": 0.2829, "num_input_tokens_seen": 19128080, "step": 8870 }, { "epoch": 1.6287392182051752, "grad_norm": 3.5270423889160156, "learning_rate": 8.142778491466325e-06, "loss": 0.3619, "num_input_tokens_seen": 19140048, "step": 8875 }, { "epoch": 1.6296568177647275, "grad_norm": 3.3094537258148193, "learning_rate": 8.147366489264085e-06, "loss": 0.2282, "num_input_tokens_seen": 19150224, "step": 8880 }, { "epoch": 1.6305744173242798, "grad_norm": 2.2725117206573486, "learning_rate": 8.151954487061848e-06, "loss": 0.338, "num_input_tokens_seen": 19161136, "step": 8885 }, { "epoch": 1.6314920168838318, "grad_norm": 2.563244581222534, "learning_rate": 8.156542484859608e-06, "loss": 0.3083, "num_input_tokens_seen": 19172656, "step": 8890 }, { "epoch": 1.6324096164433841, "grad_norm": 4.057332992553711, "learning_rate": 8.161130482657369e-06, "loss": 0.3027, "num_input_tokens_seen": 19182576, "step": 8895 }, { "epoch": 1.6333272160029364, "grad_norm": 6.6017231941223145, "learning_rate": 8.16571848045513e-06, "loss": 0.3978, "num_input_tokens_seen": 19192144, "step": 8900 }, { "epoch": 1.6342448155624885, "grad_norm": 3.993617534637451, "learning_rate": 8.170306478252891e-06, "loss": 0.2811, "num_input_tokens_seen": 19202576, "step": 8905 }, { "epoch": 1.6351624151220407, "grad_norm": 4.330930233001709, "learning_rate": 8.174894476050652e-06, "loss": 0.2907, "num_input_tokens_seen": 19213744, "step": 8910 }, { "epoch": 1.636080014681593, "grad_norm": 3.944345474243164, "learning_rate": 8.179482473848414e-06, "loss": 0.3617, "num_input_tokens_seen": 19224752, "step": 8915 }, { "epoch": 1.636997614241145, "grad_norm": 2.9417548179626465, "learning_rate": 8.184070471646175e-06, "loss": 0.2975, "num_input_tokens_seen": 19235440, "step": 8920 }, { "epoch": 1.6379152138006974, "grad_norm": 7.363443851470947, "learning_rate": 8.188658469443935e-06, "loss": 0.3088, "num_input_tokens_seen": 19246960, "step": 8925 }, { "epoch": 1.6388328133602497, "grad_norm": 6.484044551849365, "learning_rate": 8.193246467241697e-06, "loss": 0.3057, "num_input_tokens_seen": 19257744, "step": 8930 }, { "epoch": 1.6397504129198017, "grad_norm": 21.030664443969727, "learning_rate": 8.197834465039458e-06, "loss": 0.3465, "num_input_tokens_seen": 19269968, "step": 8935 }, { "epoch": 1.640668012479354, "grad_norm": 1.7821131944656372, "learning_rate": 8.202422462837218e-06, "loss": 0.1148, "num_input_tokens_seen": 19281136, "step": 8940 }, { "epoch": 1.6415856120389063, "grad_norm": 30.269054412841797, "learning_rate": 8.20701046063498e-06, "loss": 0.747, "num_input_tokens_seen": 19291568, "step": 8945 }, { "epoch": 1.6425032115984584, "grad_norm": 6.524789810180664, "learning_rate": 8.21159845843274e-06, "loss": 0.4282, "num_input_tokens_seen": 19302192, "step": 8950 }, { "epoch": 1.6434208111580106, "grad_norm": 8.106391906738281, "learning_rate": 8.216186456230502e-06, "loss": 0.3343, "num_input_tokens_seen": 19312144, "step": 8955 }, { "epoch": 1.644338410717563, "grad_norm": 2.8515124320983887, "learning_rate": 8.220774454028264e-06, "loss": 0.2852, "num_input_tokens_seen": 19322096, "step": 8960 }, { "epoch": 1.645256010277115, "grad_norm": 8.432924270629883, "learning_rate": 8.225362451826023e-06, "loss": 0.3151, "num_input_tokens_seen": 19333264, "step": 8965 }, { "epoch": 1.6461736098366673, "grad_norm": 3.3536417484283447, "learning_rate": 8.229950449623785e-06, "loss": 0.5265, "num_input_tokens_seen": 19343120, "step": 8970 }, { "epoch": 1.6470912093962196, "grad_norm": 3.048765182495117, "learning_rate": 8.234538447421545e-06, "loss": 0.3624, "num_input_tokens_seen": 19354448, "step": 8975 }, { "epoch": 1.6480088089557716, "grad_norm": 4.703948974609375, "learning_rate": 8.239126445219306e-06, "loss": 0.3165, "num_input_tokens_seen": 19365136, "step": 8980 }, { "epoch": 1.648926408515324, "grad_norm": 5.531694412231445, "learning_rate": 8.243714443017068e-06, "loss": 0.3475, "num_input_tokens_seen": 19375536, "step": 8985 }, { "epoch": 1.6498440080748762, "grad_norm": 4.7688069343566895, "learning_rate": 8.248302440814829e-06, "loss": 0.3815, "num_input_tokens_seen": 19386512, "step": 8990 }, { "epoch": 1.6507616076344283, "grad_norm": 2.5053303241729736, "learning_rate": 8.252890438612589e-06, "loss": 0.3385, "num_input_tokens_seen": 19398224, "step": 8995 }, { "epoch": 1.6516792071939805, "grad_norm": 2.432682514190674, "learning_rate": 8.257478436410351e-06, "loss": 0.275, "num_input_tokens_seen": 19409328, "step": 9000 }, { "epoch": 1.6525968067535328, "grad_norm": 3.369307279586792, "learning_rate": 8.262066434208112e-06, "loss": 0.3271, "num_input_tokens_seen": 19420304, "step": 9005 }, { "epoch": 1.6535144063130849, "grad_norm": 3.1543219089508057, "learning_rate": 8.266654432005872e-06, "loss": 0.2871, "num_input_tokens_seen": 19430736, "step": 9010 }, { "epoch": 1.6544320058726372, "grad_norm": 2.6988601684570312, "learning_rate": 8.271242429803635e-06, "loss": 0.3049, "num_input_tokens_seen": 19441104, "step": 9015 }, { "epoch": 1.6553496054321895, "grad_norm": 3.0413880348205566, "learning_rate": 8.275830427601395e-06, "loss": 0.2758, "num_input_tokens_seen": 19451344, "step": 9020 }, { "epoch": 1.6562672049917415, "grad_norm": 9.248980522155762, "learning_rate": 8.280418425399156e-06, "loss": 0.4568, "num_input_tokens_seen": 19461296, "step": 9025 }, { "epoch": 1.6571848045512938, "grad_norm": 8.129858016967773, "learning_rate": 8.285006423196918e-06, "loss": 0.3457, "num_input_tokens_seen": 19471792, "step": 9030 }, { "epoch": 1.658102404110846, "grad_norm": 3.0463991165161133, "learning_rate": 8.289594420994678e-06, "loss": 0.2679, "num_input_tokens_seen": 19482960, "step": 9035 }, { "epoch": 1.6590200036703981, "grad_norm": 3.9927330017089844, "learning_rate": 8.294182418792439e-06, "loss": 0.4389, "num_input_tokens_seen": 19493616, "step": 9040 }, { "epoch": 1.6599376032299504, "grad_norm": 3.859680652618408, "learning_rate": 8.298770416590201e-06, "loss": 0.3646, "num_input_tokens_seen": 19505456, "step": 9045 }, { "epoch": 1.6608552027895027, "grad_norm": 7.44108247756958, "learning_rate": 8.303358414387962e-06, "loss": 0.3446, "num_input_tokens_seen": 19516400, "step": 9050 }, { "epoch": 1.6617728023490548, "grad_norm": 3.359769105911255, "learning_rate": 8.307946412185722e-06, "loss": 0.3628, "num_input_tokens_seen": 19527472, "step": 9055 }, { "epoch": 1.662690401908607, "grad_norm": 4.280404090881348, "learning_rate": 8.312534409983484e-06, "loss": 0.335, "num_input_tokens_seen": 19538640, "step": 9060 }, { "epoch": 1.6636080014681593, "grad_norm": 3.640211582183838, "learning_rate": 8.317122407781245e-06, "loss": 0.3075, "num_input_tokens_seen": 19550032, "step": 9065 }, { "epoch": 1.6645256010277114, "grad_norm": 6.974141597747803, "learning_rate": 8.321710405579007e-06, "loss": 0.3255, "num_input_tokens_seen": 19561488, "step": 9070 }, { "epoch": 1.6654432005872637, "grad_norm": 5.257266521453857, "learning_rate": 8.326298403376767e-06, "loss": 0.3517, "num_input_tokens_seen": 19572176, "step": 9075 }, { "epoch": 1.666360800146816, "grad_norm": 5.569756507873535, "learning_rate": 8.330886401174528e-06, "loss": 0.4171, "num_input_tokens_seen": 19582768, "step": 9080 }, { "epoch": 1.667278399706368, "grad_norm": 10.414348602294922, "learning_rate": 8.33547439897229e-06, "loss": 0.2891, "num_input_tokens_seen": 19592624, "step": 9085 }, { "epoch": 1.6681959992659203, "grad_norm": 3.625819206237793, "learning_rate": 8.34006239677005e-06, "loss": 0.3285, "num_input_tokens_seen": 19604688, "step": 9090 }, { "epoch": 1.6691135988254726, "grad_norm": 6.4562668800354, "learning_rate": 8.344650394567811e-06, "loss": 0.3704, "num_input_tokens_seen": 19614320, "step": 9095 }, { "epoch": 1.6700311983850247, "grad_norm": 4.222243309020996, "learning_rate": 8.349238392365573e-06, "loss": 0.2903, "num_input_tokens_seen": 19625168, "step": 9100 }, { "epoch": 1.670948797944577, "grad_norm": 3.7890849113464355, "learning_rate": 8.353826390163334e-06, "loss": 0.2984, "num_input_tokens_seen": 19635856, "step": 9105 }, { "epoch": 1.6718663975041292, "grad_norm": 3.4636623859405518, "learning_rate": 8.358414387961094e-06, "loss": 0.3437, "num_input_tokens_seen": 19646128, "step": 9110 }, { "epoch": 1.6727839970636813, "grad_norm": 5.221144676208496, "learning_rate": 8.363002385758857e-06, "loss": 0.3231, "num_input_tokens_seen": 19658224, "step": 9115 }, { "epoch": 1.6737015966232336, "grad_norm": 3.791567325592041, "learning_rate": 8.367590383556616e-06, "loss": 0.2844, "num_input_tokens_seen": 19669104, "step": 9120 }, { "epoch": 1.6746191961827859, "grad_norm": 3.9986231327056885, "learning_rate": 8.372178381354378e-06, "loss": 0.3316, "num_input_tokens_seen": 19679376, "step": 9125 }, { "epoch": 1.675536795742338, "grad_norm": 4.136163711547852, "learning_rate": 8.37676637915214e-06, "loss": 0.309, "num_input_tokens_seen": 19690512, "step": 9130 }, { "epoch": 1.6764543953018902, "grad_norm": 6.1884236335754395, "learning_rate": 8.381354376949899e-06, "loss": 0.3899, "num_input_tokens_seen": 19702704, "step": 9135 }, { "epoch": 1.6773719948614425, "grad_norm": 2.3675506114959717, "learning_rate": 8.385942374747661e-06, "loss": 0.3076, "num_input_tokens_seen": 19712688, "step": 9140 }, { "epoch": 1.6782895944209946, "grad_norm": 13.046402931213379, "learning_rate": 8.390530372545423e-06, "loss": 0.3458, "num_input_tokens_seen": 19723664, "step": 9145 }, { "epoch": 1.6792071939805469, "grad_norm": 3.2228806018829346, "learning_rate": 8.395118370343182e-06, "loss": 0.2676, "num_input_tokens_seen": 19734256, "step": 9150 }, { "epoch": 1.6801247935400991, "grad_norm": 3.0930187702178955, "learning_rate": 8.399706368140944e-06, "loss": 0.2962, "num_input_tokens_seen": 19744240, "step": 9155 }, { "epoch": 1.6810423930996512, "grad_norm": 4.2617878913879395, "learning_rate": 8.404294365938705e-06, "loss": 0.285, "num_input_tokens_seen": 19755920, "step": 9160 }, { "epoch": 1.6819599926592035, "grad_norm": 3.2950804233551025, "learning_rate": 8.408882363736465e-06, "loss": 0.1948, "num_input_tokens_seen": 19766192, "step": 9165 }, { "epoch": 1.6828775922187558, "grad_norm": 4.5478925704956055, "learning_rate": 8.413470361534227e-06, "loss": 0.3606, "num_input_tokens_seen": 19777136, "step": 9170 }, { "epoch": 1.6837951917783078, "grad_norm": 1.793792963027954, "learning_rate": 8.418058359331988e-06, "loss": 0.418, "num_input_tokens_seen": 19788400, "step": 9175 }, { "epoch": 1.6847127913378601, "grad_norm": 4.900216579437256, "learning_rate": 8.422646357129748e-06, "loss": 0.4923, "num_input_tokens_seen": 19799024, "step": 9180 }, { "epoch": 1.6856303908974124, "grad_norm": 3.4982426166534424, "learning_rate": 8.42723435492751e-06, "loss": 0.2986, "num_input_tokens_seen": 19809488, "step": 9185 }, { "epoch": 1.6865479904569645, "grad_norm": 2.9226927757263184, "learning_rate": 8.431822352725271e-06, "loss": 0.251, "num_input_tokens_seen": 19819024, "step": 9190 }, { "epoch": 1.6874655900165167, "grad_norm": 3.075711965560913, "learning_rate": 8.436410350523032e-06, "loss": 0.2508, "num_input_tokens_seen": 19829904, "step": 9195 }, { "epoch": 1.688383189576069, "grad_norm": 7.438153266906738, "learning_rate": 8.440998348320794e-06, "loss": 0.3297, "num_input_tokens_seen": 19840656, "step": 9200 }, { "epoch": 1.689300789135621, "grad_norm": 3.5917348861694336, "learning_rate": 8.445586346118554e-06, "loss": 0.3455, "num_input_tokens_seen": 19851952, "step": 9205 }, { "epoch": 1.6902183886951734, "grad_norm": 5.2576751708984375, "learning_rate": 8.450174343916315e-06, "loss": 0.2638, "num_input_tokens_seen": 19862832, "step": 9210 }, { "epoch": 1.6911359882547257, "grad_norm": 9.065199851989746, "learning_rate": 8.454762341714077e-06, "loss": 0.3136, "num_input_tokens_seen": 19874192, "step": 9215 }, { "epoch": 1.6920535878142777, "grad_norm": 20.437360763549805, "learning_rate": 8.459350339511838e-06, "loss": 0.3651, "num_input_tokens_seen": 19885968, "step": 9220 }, { "epoch": 1.69297118737383, "grad_norm": 3.8199141025543213, "learning_rate": 8.463938337309598e-06, "loss": 0.266, "num_input_tokens_seen": 19896624, "step": 9225 }, { "epoch": 1.6938887869333823, "grad_norm": 5.596640586853027, "learning_rate": 8.46852633510736e-06, "loss": 0.3007, "num_input_tokens_seen": 19906160, "step": 9230 }, { "epoch": 1.6948063864929344, "grad_norm": 4.640010356903076, "learning_rate": 8.473114332905121e-06, "loss": 0.278, "num_input_tokens_seen": 19916688, "step": 9235 }, { "epoch": 1.6957239860524866, "grad_norm": 16.341415405273438, "learning_rate": 8.477702330702881e-06, "loss": 0.3494, "num_input_tokens_seen": 19927856, "step": 9240 }, { "epoch": 1.696641585612039, "grad_norm": 2.723629951477051, "learning_rate": 8.482290328500644e-06, "loss": 0.3, "num_input_tokens_seen": 19939024, "step": 9245 }, { "epoch": 1.697559185171591, "grad_norm": 6.911598205566406, "learning_rate": 8.486878326298404e-06, "loss": 0.2708, "num_input_tokens_seen": 19950032, "step": 9250 }, { "epoch": 1.6984767847311433, "grad_norm": 13.86898422241211, "learning_rate": 8.491466324096165e-06, "loss": 0.3739, "num_input_tokens_seen": 19960528, "step": 9255 }, { "epoch": 1.6993943842906956, "grad_norm": 17.770708084106445, "learning_rate": 8.496054321893927e-06, "loss": 0.3279, "num_input_tokens_seen": 19971280, "step": 9260 }, { "epoch": 1.7003119838502476, "grad_norm": 4.902008056640625, "learning_rate": 8.500642319691687e-06, "loss": 0.2352, "num_input_tokens_seen": 19982672, "step": 9265 }, { "epoch": 1.7012295834098001, "grad_norm": 3.6845669746398926, "learning_rate": 8.505230317489448e-06, "loss": 0.251, "num_input_tokens_seen": 19992592, "step": 9270 }, { "epoch": 1.7021471829693522, "grad_norm": 7.322566509246826, "learning_rate": 8.50981831528721e-06, "loss": 0.3438, "num_input_tokens_seen": 20003728, "step": 9275 }, { "epoch": 1.7030647825289043, "grad_norm": 6.914515495300293, "learning_rate": 8.51440631308497e-06, "loss": 0.3651, "num_input_tokens_seen": 20014128, "step": 9280 }, { "epoch": 1.7039823820884568, "grad_norm": 7.391944408416748, "learning_rate": 8.518994310882731e-06, "loss": 0.3031, "num_input_tokens_seen": 20025104, "step": 9285 }, { "epoch": 1.7048999816480088, "grad_norm": 10.059625625610352, "learning_rate": 8.523582308680493e-06, "loss": 0.3385, "num_input_tokens_seen": 20034160, "step": 9290 }, { "epoch": 1.7058175812075609, "grad_norm": 18.17926597595215, "learning_rate": 8.528170306478254e-06, "loss": 0.3667, "num_input_tokens_seen": 20045584, "step": 9295 }, { "epoch": 1.7067351807671134, "grad_norm": 4.5102667808532715, "learning_rate": 8.532758304276014e-06, "loss": 0.2917, "num_input_tokens_seen": 20057872, "step": 9300 }, { "epoch": 1.7076527803266655, "grad_norm": 4.106893062591553, "learning_rate": 8.537346302073775e-06, "loss": 0.2589, "num_input_tokens_seen": 20068432, "step": 9305 }, { "epoch": 1.7085703798862175, "grad_norm": 5.332723140716553, "learning_rate": 8.541934299871537e-06, "loss": 0.3397, "num_input_tokens_seen": 20079536, "step": 9310 }, { "epoch": 1.70948797944577, "grad_norm": 10.279181480407715, "learning_rate": 8.546522297669298e-06, "loss": 0.2481, "num_input_tokens_seen": 20090800, "step": 9315 }, { "epoch": 1.710405579005322, "grad_norm": 9.9141263961792, "learning_rate": 8.551110295467058e-06, "loss": 0.2908, "num_input_tokens_seen": 20101808, "step": 9320 }, { "epoch": 1.7113231785648741, "grad_norm": 3.0145461559295654, "learning_rate": 8.55569829326482e-06, "loss": 0.4539, "num_input_tokens_seen": 20111440, "step": 9325 }, { "epoch": 1.7122407781244267, "grad_norm": 15.556326866149902, "learning_rate": 8.560286291062581e-06, "loss": 0.3356, "num_input_tokens_seen": 20123280, "step": 9330 }, { "epoch": 1.7131583776839787, "grad_norm": 2.4717564582824707, "learning_rate": 8.564874288860341e-06, "loss": 0.2733, "num_input_tokens_seen": 20134064, "step": 9335 }, { "epoch": 1.7140759772435308, "grad_norm": 5.641574382781982, "learning_rate": 8.569462286658104e-06, "loss": 0.325, "num_input_tokens_seen": 20145136, "step": 9340 }, { "epoch": 1.7149935768030833, "grad_norm": 23.96670913696289, "learning_rate": 8.574050284455864e-06, "loss": 0.3288, "num_input_tokens_seen": 20156144, "step": 9345 }, { "epoch": 1.7159111763626353, "grad_norm": 9.002015113830566, "learning_rate": 8.578638282253625e-06, "loss": 0.2851, "num_input_tokens_seen": 20167504, "step": 9350 }, { "epoch": 1.7168287759221874, "grad_norm": 5.134876251220703, "learning_rate": 8.583226280051387e-06, "loss": 0.3296, "num_input_tokens_seen": 20178672, "step": 9355 }, { "epoch": 1.71774637548174, "grad_norm": 4.01519250869751, "learning_rate": 8.587814277849147e-06, "loss": 0.3641, "num_input_tokens_seen": 20190064, "step": 9360 }, { "epoch": 1.718663975041292, "grad_norm": 8.910208702087402, "learning_rate": 8.592402275646908e-06, "loss": 0.3311, "num_input_tokens_seen": 20201072, "step": 9365 }, { "epoch": 1.719581574600844, "grad_norm": 4.586288928985596, "learning_rate": 8.59699027344467e-06, "loss": 0.2428, "num_input_tokens_seen": 20211664, "step": 9370 }, { "epoch": 1.7204991741603965, "grad_norm": 3.438035488128662, "learning_rate": 8.60157827124243e-06, "loss": 0.238, "num_input_tokens_seen": 20222736, "step": 9375 }, { "epoch": 1.7214167737199486, "grad_norm": 15.083779335021973, "learning_rate": 8.606166269040191e-06, "loss": 0.3613, "num_input_tokens_seen": 20233904, "step": 9380 }, { "epoch": 1.7223343732795007, "grad_norm": 3.3119845390319824, "learning_rate": 8.610754266837953e-06, "loss": 0.3307, "num_input_tokens_seen": 20245200, "step": 9385 }, { "epoch": 1.7232519728390532, "grad_norm": 3.0196497440338135, "learning_rate": 8.615342264635714e-06, "loss": 0.3063, "num_input_tokens_seen": 20255664, "step": 9390 }, { "epoch": 1.7241695723986052, "grad_norm": 8.583541870117188, "learning_rate": 8.619930262433474e-06, "loss": 0.3863, "num_input_tokens_seen": 20267344, "step": 9395 }, { "epoch": 1.7250871719581573, "grad_norm": 15.71385669708252, "learning_rate": 8.624518260231237e-06, "loss": 0.3574, "num_input_tokens_seen": 20277776, "step": 9400 }, { "epoch": 1.7260047715177098, "grad_norm": 8.66064167022705, "learning_rate": 8.629106258028997e-06, "loss": 0.4919, "num_input_tokens_seen": 20289744, "step": 9405 }, { "epoch": 1.7269223710772619, "grad_norm": 3.5420777797698975, "learning_rate": 8.633694255826758e-06, "loss": 0.3361, "num_input_tokens_seen": 20299696, "step": 9410 }, { "epoch": 1.727839970636814, "grad_norm": 3.4457437992095947, "learning_rate": 8.63828225362452e-06, "loss": 0.2309, "num_input_tokens_seen": 20310192, "step": 9415 }, { "epoch": 1.7287575701963664, "grad_norm": 3.846881151199341, "learning_rate": 8.64287025142228e-06, "loss": 0.1903, "num_input_tokens_seen": 20320368, "step": 9420 }, { "epoch": 1.7296751697559185, "grad_norm": 11.62859058380127, "learning_rate": 8.64745824922004e-06, "loss": 0.3883, "num_input_tokens_seen": 20331888, "step": 9425 }, { "epoch": 1.7305927693154706, "grad_norm": 7.2933855056762695, "learning_rate": 8.652046247017803e-06, "loss": 0.1931, "num_input_tokens_seen": 20342640, "step": 9430 }, { "epoch": 1.731510368875023, "grad_norm": 7.895130634307861, "learning_rate": 8.656634244815564e-06, "loss": 0.2274, "num_input_tokens_seen": 20353552, "step": 9435 }, { "epoch": 1.7324279684345751, "grad_norm": 3.9793200492858887, "learning_rate": 8.661222242613324e-06, "loss": 0.2566, "num_input_tokens_seen": 20364208, "step": 9440 }, { "epoch": 1.7333455679941272, "grad_norm": 2.2669718265533447, "learning_rate": 8.665810240411086e-06, "loss": 0.1975, "num_input_tokens_seen": 20374992, "step": 9445 }, { "epoch": 1.7342631675536797, "grad_norm": 0.7728497982025146, "learning_rate": 8.670398238208845e-06, "loss": 0.1624, "num_input_tokens_seen": 20386512, "step": 9450 }, { "epoch": 1.7351807671132318, "grad_norm": 6.738224506378174, "learning_rate": 8.674986236006607e-06, "loss": 0.4613, "num_input_tokens_seen": 20397488, "step": 9455 }, { "epoch": 1.7360983666727838, "grad_norm": 0.353167861700058, "learning_rate": 8.67957423380437e-06, "loss": 0.3883, "num_input_tokens_seen": 20407440, "step": 9460 }, { "epoch": 1.7370159662323363, "grad_norm": 7.911971092224121, "learning_rate": 8.684162231602128e-06, "loss": 0.3786, "num_input_tokens_seen": 20417104, "step": 9465 }, { "epoch": 1.7379335657918884, "grad_norm": 4.062906742095947, "learning_rate": 8.68875022939989e-06, "loss": 0.3861, "num_input_tokens_seen": 20427568, "step": 9470 }, { "epoch": 1.7388511653514407, "grad_norm": 5.024840831756592, "learning_rate": 8.693338227197651e-06, "loss": 0.4793, "num_input_tokens_seen": 20437968, "step": 9475 }, { "epoch": 1.739768764910993, "grad_norm": 3.1633803844451904, "learning_rate": 8.697926224995412e-06, "loss": 0.2509, "num_input_tokens_seen": 20450192, "step": 9480 }, { "epoch": 1.740686364470545, "grad_norm": 8.407103538513184, "learning_rate": 8.702514222793174e-06, "loss": 0.302, "num_input_tokens_seen": 20460432, "step": 9485 }, { "epoch": 1.7416039640300973, "grad_norm": 7.875458240509033, "learning_rate": 8.707102220590934e-06, "loss": 0.3205, "num_input_tokens_seen": 20470480, "step": 9490 }, { "epoch": 1.7425215635896496, "grad_norm": 4.3777360916137695, "learning_rate": 8.711690218388696e-06, "loss": 0.2754, "num_input_tokens_seen": 20481200, "step": 9495 }, { "epoch": 1.7434391631492017, "grad_norm": 2.6145236492156982, "learning_rate": 8.716278216186457e-06, "loss": 0.2899, "num_input_tokens_seen": 20490960, "step": 9500 }, { "epoch": 1.744356762708754, "grad_norm": 4.047552585601807, "learning_rate": 8.720866213984218e-06, "loss": 0.2315, "num_input_tokens_seen": 20502896, "step": 9505 }, { "epoch": 1.7452743622683062, "grad_norm": 6.021176338195801, "learning_rate": 8.72545421178198e-06, "loss": 0.3251, "num_input_tokens_seen": 20513200, "step": 9510 }, { "epoch": 1.7461919618278583, "grad_norm": 3.2192044258117676, "learning_rate": 8.73004220957974e-06, "loss": 0.2292, "num_input_tokens_seen": 20522832, "step": 9515 }, { "epoch": 1.7471095613874106, "grad_norm": 5.01315450668335, "learning_rate": 8.7346302073775e-06, "loss": 0.1972, "num_input_tokens_seen": 20532528, "step": 9520 }, { "epoch": 1.7480271609469629, "grad_norm": 15.534274101257324, "learning_rate": 8.739218205175263e-06, "loss": 0.2474, "num_input_tokens_seen": 20543312, "step": 9525 }, { "epoch": 1.748944760506515, "grad_norm": 4.294713020324707, "learning_rate": 8.743806202973023e-06, "loss": 0.396, "num_input_tokens_seen": 20554320, "step": 9530 }, { "epoch": 1.7498623600660672, "grad_norm": 2.983750820159912, "learning_rate": 8.748394200770784e-06, "loss": 0.5086, "num_input_tokens_seen": 20565680, "step": 9535 }, { "epoch": 1.7507799596256195, "grad_norm": 7.6256537437438965, "learning_rate": 8.752982198568546e-06, "loss": 0.2347, "num_input_tokens_seen": 20576944, "step": 9540 }, { "epoch": 1.7516975591851716, "grad_norm": 11.182294845581055, "learning_rate": 8.757570196366307e-06, "loss": 0.3587, "num_input_tokens_seen": 20586736, "step": 9545 }, { "epoch": 1.7526151587447238, "grad_norm": 14.77816104888916, "learning_rate": 8.762158194164067e-06, "loss": 0.3344, "num_input_tokens_seen": 20598000, "step": 9550 }, { "epoch": 1.7535327583042761, "grad_norm": 7.462334632873535, "learning_rate": 8.76674619196183e-06, "loss": 0.2483, "num_input_tokens_seen": 20609680, "step": 9555 }, { "epoch": 1.7544503578638282, "grad_norm": 10.382651329040527, "learning_rate": 8.77133418975959e-06, "loss": 0.2828, "num_input_tokens_seen": 20620560, "step": 9560 }, { "epoch": 1.7553679574233805, "grad_norm": 11.258688926696777, "learning_rate": 8.77592218755735e-06, "loss": 0.3445, "num_input_tokens_seen": 20632496, "step": 9565 }, { "epoch": 1.7562855569829328, "grad_norm": 12.788298606872559, "learning_rate": 8.780510185355113e-06, "loss": 0.2257, "num_input_tokens_seen": 20643856, "step": 9570 }, { "epoch": 1.7572031565424848, "grad_norm": 8.2757568359375, "learning_rate": 8.785098183152873e-06, "loss": 0.2748, "num_input_tokens_seen": 20654672, "step": 9575 }, { "epoch": 1.758120756102037, "grad_norm": 5.242051601409912, "learning_rate": 8.789686180950634e-06, "loss": 0.4411, "num_input_tokens_seen": 20665520, "step": 9580 }, { "epoch": 1.7590383556615894, "grad_norm": 6.728303909301758, "learning_rate": 8.794274178748396e-06, "loss": 0.3933, "num_input_tokens_seen": 20675984, "step": 9585 }, { "epoch": 1.7599559552211415, "grad_norm": 5.551909446716309, "learning_rate": 8.798862176546156e-06, "loss": 0.3111, "num_input_tokens_seen": 20686832, "step": 9590 }, { "epoch": 1.7608735547806937, "grad_norm": 4.572329044342041, "learning_rate": 8.803450174343917e-06, "loss": 0.3386, "num_input_tokens_seen": 20697488, "step": 9595 }, { "epoch": 1.761791154340246, "grad_norm": 12.12414264678955, "learning_rate": 8.808038172141679e-06, "loss": 0.4089, "num_input_tokens_seen": 20708432, "step": 9600 }, { "epoch": 1.762708753899798, "grad_norm": 6.1966094970703125, "learning_rate": 8.81262616993944e-06, "loss": 0.286, "num_input_tokens_seen": 20719408, "step": 9605 }, { "epoch": 1.7636263534593504, "grad_norm": 4.41945743560791, "learning_rate": 8.8172141677372e-06, "loss": 0.2591, "num_input_tokens_seen": 20729936, "step": 9610 }, { "epoch": 1.7645439530189027, "grad_norm": 4.052540302276611, "learning_rate": 8.821802165534962e-06, "loss": 0.3012, "num_input_tokens_seen": 20739952, "step": 9615 }, { "epoch": 1.7654615525784547, "grad_norm": 4.422616958618164, "learning_rate": 8.826390163332721e-06, "loss": 0.3488, "num_input_tokens_seen": 20751344, "step": 9620 }, { "epoch": 1.766379152138007, "grad_norm": 6.109668254852295, "learning_rate": 8.830978161130483e-06, "loss": 0.2702, "num_input_tokens_seen": 20762032, "step": 9625 }, { "epoch": 1.7672967516975593, "grad_norm": 9.670140266418457, "learning_rate": 8.835566158928246e-06, "loss": 0.4216, "num_input_tokens_seen": 20773072, "step": 9630 }, { "epoch": 1.7682143512571113, "grad_norm": 9.537525177001953, "learning_rate": 8.840154156726004e-06, "loss": 0.2989, "num_input_tokens_seen": 20783824, "step": 9635 }, { "epoch": 1.7691319508166636, "grad_norm": 7.496180057525635, "learning_rate": 8.844742154523767e-06, "loss": 0.3258, "num_input_tokens_seen": 20793744, "step": 9640 }, { "epoch": 1.770049550376216, "grad_norm": 5.484989166259766, "learning_rate": 8.849330152321529e-06, "loss": 0.3274, "num_input_tokens_seen": 20805232, "step": 9645 }, { "epoch": 1.770967149935768, "grad_norm": 4.507748126983643, "learning_rate": 8.853918150119288e-06, "loss": 0.3276, "num_input_tokens_seen": 20816176, "step": 9650 }, { "epoch": 1.7718847494953203, "grad_norm": 4.549119472503662, "learning_rate": 8.85850614791705e-06, "loss": 0.3759, "num_input_tokens_seen": 20827760, "step": 9655 }, { "epoch": 1.7728023490548726, "grad_norm": 5.499055862426758, "learning_rate": 8.86309414571481e-06, "loss": 0.2115, "num_input_tokens_seen": 20839248, "step": 9660 }, { "epoch": 1.7737199486144246, "grad_norm": 6.829256057739258, "learning_rate": 8.867682143512571e-06, "loss": 0.2496, "num_input_tokens_seen": 20849840, "step": 9665 }, { "epoch": 1.774637548173977, "grad_norm": 8.376630783081055, "learning_rate": 8.872270141310333e-06, "loss": 0.3997, "num_input_tokens_seen": 20861328, "step": 9670 }, { "epoch": 1.7755551477335292, "grad_norm": 6.926634311676025, "learning_rate": 8.876858139108094e-06, "loss": 0.491, "num_input_tokens_seen": 20871120, "step": 9675 }, { "epoch": 1.7764727472930812, "grad_norm": 17.901763916015625, "learning_rate": 8.881446136905854e-06, "loss": 0.2429, "num_input_tokens_seen": 20880912, "step": 9680 }, { "epoch": 1.7773903468526335, "grad_norm": 8.611532211303711, "learning_rate": 8.886034134703616e-06, "loss": 0.2551, "num_input_tokens_seen": 20891984, "step": 9685 }, { "epoch": 1.7783079464121858, "grad_norm": 5.023443222045898, "learning_rate": 8.890622132501377e-06, "loss": 0.2714, "num_input_tokens_seen": 20902384, "step": 9690 }, { "epoch": 1.7792255459717379, "grad_norm": 6.712355613708496, "learning_rate": 8.895210130299137e-06, "loss": 0.2823, "num_input_tokens_seen": 20912944, "step": 9695 }, { "epoch": 1.7801431455312902, "grad_norm": 9.716682434082031, "learning_rate": 8.8997981280969e-06, "loss": 0.2753, "num_input_tokens_seen": 20924304, "step": 9700 }, { "epoch": 1.7810607450908424, "grad_norm": 4.661048889160156, "learning_rate": 8.90438612589466e-06, "loss": 0.2172, "num_input_tokens_seen": 20934544, "step": 9705 }, { "epoch": 1.7819783446503945, "grad_norm": 2.2127692699432373, "learning_rate": 8.90897412369242e-06, "loss": 0.2865, "num_input_tokens_seen": 20946640, "step": 9710 }, { "epoch": 1.7828959442099468, "grad_norm": 2.753511428833008, "learning_rate": 8.913562121490183e-06, "loss": 0.2569, "num_input_tokens_seen": 20957936, "step": 9715 }, { "epoch": 1.783813543769499, "grad_norm": 8.35222053527832, "learning_rate": 8.918150119287943e-06, "loss": 0.1951, "num_input_tokens_seen": 20968912, "step": 9720 }, { "epoch": 1.7847311433290511, "grad_norm": 9.734004020690918, "learning_rate": 8.922738117085704e-06, "loss": 0.294, "num_input_tokens_seen": 20980176, "step": 9725 }, { "epoch": 1.7856487428886034, "grad_norm": 5.522035121917725, "learning_rate": 8.927326114883466e-06, "loss": 0.3501, "num_input_tokens_seen": 20991120, "step": 9730 }, { "epoch": 1.7865663424481557, "grad_norm": 13.798358917236328, "learning_rate": 8.931914112681227e-06, "loss": 0.3016, "num_input_tokens_seen": 21002640, "step": 9735 }, { "epoch": 1.7874839420077078, "grad_norm": 4.997480392456055, "learning_rate": 8.936502110478987e-06, "loss": 0.2695, "num_input_tokens_seen": 21013808, "step": 9740 }, { "epoch": 1.78840154156726, "grad_norm": 8.460646629333496, "learning_rate": 8.94109010827675e-06, "loss": 0.1688, "num_input_tokens_seen": 21023152, "step": 9745 }, { "epoch": 1.7893191411268123, "grad_norm": 7.437516689300537, "learning_rate": 8.94567810607451e-06, "loss": 0.3273, "num_input_tokens_seen": 21034672, "step": 9750 }, { "epoch": 1.7902367406863644, "grad_norm": 11.08122730255127, "learning_rate": 8.95026610387227e-06, "loss": 0.4698, "num_input_tokens_seen": 21046320, "step": 9755 }, { "epoch": 1.7911543402459167, "grad_norm": 7.9375128746032715, "learning_rate": 8.954854101670033e-06, "loss": 0.2858, "num_input_tokens_seen": 21056272, "step": 9760 }, { "epoch": 1.792071939805469, "grad_norm": 4.6567583084106445, "learning_rate": 8.959442099467793e-06, "loss": 0.2317, "num_input_tokens_seen": 21066384, "step": 9765 }, { "epoch": 1.792989539365021, "grad_norm": 0.800189733505249, "learning_rate": 8.964030097265554e-06, "loss": 0.2294, "num_input_tokens_seen": 21077488, "step": 9770 }, { "epoch": 1.7939071389245733, "grad_norm": 9.275778770446777, "learning_rate": 8.968618095063316e-06, "loss": 0.5016, "num_input_tokens_seen": 21087664, "step": 9775 }, { "epoch": 1.7948247384841256, "grad_norm": 3.8266243934631348, "learning_rate": 8.973206092861076e-06, "loss": 0.2134, "num_input_tokens_seen": 21097072, "step": 9780 }, { "epoch": 1.7957423380436777, "grad_norm": 25.582904815673828, "learning_rate": 8.977794090658837e-06, "loss": 0.5015, "num_input_tokens_seen": 21106128, "step": 9785 }, { "epoch": 1.79665993760323, "grad_norm": 4.259832859039307, "learning_rate": 8.982382088456599e-06, "loss": 0.3401, "num_input_tokens_seen": 21116912, "step": 9790 }, { "epoch": 1.7975775371627822, "grad_norm": 7.869581699371338, "learning_rate": 8.98697008625436e-06, "loss": 0.2275, "num_input_tokens_seen": 21127504, "step": 9795 }, { "epoch": 1.7984951367223343, "grad_norm": 4.095052719116211, "learning_rate": 8.99155808405212e-06, "loss": 0.3436, "num_input_tokens_seen": 21139024, "step": 9800 }, { "epoch": 1.7994127362818866, "grad_norm": 14.878586769104004, "learning_rate": 8.99614608184988e-06, "loss": 0.4911, "num_input_tokens_seen": 21150192, "step": 9805 }, { "epoch": 1.8003303358414389, "grad_norm": 8.336198806762695, "learning_rate": 9.000734079647643e-06, "loss": 0.369, "num_input_tokens_seen": 21161200, "step": 9810 }, { "epoch": 1.801247935400991, "grad_norm": 4.129666328430176, "learning_rate": 9.005322077445403e-06, "loss": 0.2738, "num_input_tokens_seen": 21171440, "step": 9815 }, { "epoch": 1.8021655349605432, "grad_norm": 6.184445858001709, "learning_rate": 9.009910075243164e-06, "loss": 0.2766, "num_input_tokens_seen": 21180336, "step": 9820 }, { "epoch": 1.8030831345200955, "grad_norm": 7.08534574508667, "learning_rate": 9.014498073040926e-06, "loss": 0.1707, "num_input_tokens_seen": 21191568, "step": 9825 }, { "epoch": 1.8040007340796476, "grad_norm": 9.83291244506836, "learning_rate": 9.019086070838687e-06, "loss": 0.2211, "num_input_tokens_seen": 21203056, "step": 9830 }, { "epoch": 1.8049183336391998, "grad_norm": 15.02633285522461, "learning_rate": 9.023674068636447e-06, "loss": 0.453, "num_input_tokens_seen": 21215344, "step": 9835 }, { "epoch": 1.8058359331987521, "grad_norm": 9.161398887634277, "learning_rate": 9.02826206643421e-06, "loss": 0.3577, "num_input_tokens_seen": 21226192, "step": 9840 }, { "epoch": 1.8067535327583042, "grad_norm": 7.196934700012207, "learning_rate": 9.03285006423197e-06, "loss": 0.2723, "num_input_tokens_seen": 21236112, "step": 9845 }, { "epoch": 1.8076711323178565, "grad_norm": 8.305859565734863, "learning_rate": 9.03743806202973e-06, "loss": 0.2466, "num_input_tokens_seen": 21247792, "step": 9850 }, { "epoch": 1.8085887318774088, "grad_norm": 4.886589527130127, "learning_rate": 9.042026059827493e-06, "loss": 0.4469, "num_input_tokens_seen": 21258768, "step": 9855 }, { "epoch": 1.8095063314369608, "grad_norm": 4.455465316772461, "learning_rate": 9.046614057625253e-06, "loss": 0.282, "num_input_tokens_seen": 21268976, "step": 9860 }, { "epoch": 1.810423930996513, "grad_norm": 4.702786445617676, "learning_rate": 9.051202055423014e-06, "loss": 0.3201, "num_input_tokens_seen": 21279760, "step": 9865 }, { "epoch": 1.8113415305560654, "grad_norm": 8.265375137329102, "learning_rate": 9.055790053220776e-06, "loss": 0.2674, "num_input_tokens_seen": 21291536, "step": 9870 }, { "epoch": 1.8122591301156175, "grad_norm": 3.3411765098571777, "learning_rate": 9.060378051018536e-06, "loss": 0.2752, "num_input_tokens_seen": 21301712, "step": 9875 }, { "epoch": 1.8131767296751697, "grad_norm": 8.634279251098633, "learning_rate": 9.064966048816297e-06, "loss": 0.3136, "num_input_tokens_seen": 21311984, "step": 9880 }, { "epoch": 1.814094329234722, "grad_norm": 13.056395530700684, "learning_rate": 9.069554046614059e-06, "loss": 0.3293, "num_input_tokens_seen": 21321776, "step": 9885 }, { "epoch": 1.815011928794274, "grad_norm": 5.585083961486816, "learning_rate": 9.07414204441182e-06, "loss": 0.3476, "num_input_tokens_seen": 21332016, "step": 9890 }, { "epoch": 1.8159295283538264, "grad_norm": 4.153979778289795, "learning_rate": 9.07873004220958e-06, "loss": 0.281, "num_input_tokens_seen": 21341392, "step": 9895 }, { "epoch": 1.8168471279133787, "grad_norm": 4.3612799644470215, "learning_rate": 9.083318040007342e-06, "loss": 0.4023, "num_input_tokens_seen": 21351952, "step": 9900 }, { "epoch": 1.8177647274729307, "grad_norm": 7.591611385345459, "learning_rate": 9.087906037805103e-06, "loss": 0.357, "num_input_tokens_seen": 21364528, "step": 9905 }, { "epoch": 1.818682327032483, "grad_norm": 10.41921615600586, "learning_rate": 9.092494035602863e-06, "loss": 0.4523, "num_input_tokens_seen": 21374160, "step": 9910 }, { "epoch": 1.8195999265920353, "grad_norm": 3.1707303524017334, "learning_rate": 9.097082033400625e-06, "loss": 0.3732, "num_input_tokens_seen": 21385648, "step": 9915 }, { "epoch": 1.8205175261515874, "grad_norm": 3.7139437198638916, "learning_rate": 9.101670031198386e-06, "loss": 0.3345, "num_input_tokens_seen": 21397232, "step": 9920 }, { "epoch": 1.8214351257111396, "grad_norm": 3.9221019744873047, "learning_rate": 9.106258028996146e-06, "loss": 0.311, "num_input_tokens_seen": 21406800, "step": 9925 }, { "epoch": 1.822352725270692, "grad_norm": 4.2316813468933105, "learning_rate": 9.110846026793909e-06, "loss": 0.2659, "num_input_tokens_seen": 21418128, "step": 9930 }, { "epoch": 1.823270324830244, "grad_norm": 3.0823912620544434, "learning_rate": 9.11543402459167e-06, "loss": 0.3268, "num_input_tokens_seen": 21428560, "step": 9935 }, { "epoch": 1.8241879243897963, "grad_norm": 3.20864200592041, "learning_rate": 9.12002202238943e-06, "loss": 0.2722, "num_input_tokens_seen": 21440240, "step": 9940 }, { "epoch": 1.8251055239493486, "grad_norm": 4.336888790130615, "learning_rate": 9.124610020187192e-06, "loss": 0.2692, "num_input_tokens_seen": 21452080, "step": 9945 }, { "epoch": 1.8260231235089006, "grad_norm": 15.9400634765625, "learning_rate": 9.129198017984952e-06, "loss": 0.326, "num_input_tokens_seen": 21463344, "step": 9950 }, { "epoch": 1.826940723068453, "grad_norm": 14.181668281555176, "learning_rate": 9.133786015782713e-06, "loss": 0.3969, "num_input_tokens_seen": 21474192, "step": 9955 }, { "epoch": 1.8278583226280052, "grad_norm": 4.648120880126953, "learning_rate": 9.138374013580475e-06, "loss": 0.3947, "num_input_tokens_seen": 21484880, "step": 9960 }, { "epoch": 1.8287759221875572, "grad_norm": 19.030763626098633, "learning_rate": 9.142962011378236e-06, "loss": 0.4185, "num_input_tokens_seen": 21495472, "step": 9965 }, { "epoch": 1.8296935217471095, "grad_norm": 4.063207149505615, "learning_rate": 9.147550009175996e-06, "loss": 0.3468, "num_input_tokens_seen": 21506352, "step": 9970 }, { "epoch": 1.8306111213066618, "grad_norm": 4.974555492401123, "learning_rate": 9.152138006973757e-06, "loss": 0.21, "num_input_tokens_seen": 21516560, "step": 9975 }, { "epoch": 1.8315287208662139, "grad_norm": 3.381103038787842, "learning_rate": 9.156726004771519e-06, "loss": 0.2381, "num_input_tokens_seen": 21526992, "step": 9980 }, { "epoch": 1.8324463204257662, "grad_norm": 17.81342887878418, "learning_rate": 9.16131400256928e-06, "loss": 0.3571, "num_input_tokens_seen": 21537712, "step": 9985 }, { "epoch": 1.8333639199853184, "grad_norm": 5.016183376312256, "learning_rate": 9.16590200036704e-06, "loss": 0.3491, "num_input_tokens_seen": 21549360, "step": 9990 }, { "epoch": 1.8342815195448705, "grad_norm": 4.970723628997803, "learning_rate": 9.170489998164802e-06, "loss": 0.3295, "num_input_tokens_seen": 21558704, "step": 9995 }, { "epoch": 1.8351991191044228, "grad_norm": 4.0058746337890625, "learning_rate": 9.175077995962563e-06, "loss": 0.2493, "num_input_tokens_seen": 21567856, "step": 10000 }, { "epoch": 1.836116718663975, "grad_norm": 10.367918968200684, "learning_rate": 9.179665993760323e-06, "loss": 0.357, "num_input_tokens_seen": 21578288, "step": 10005 }, { "epoch": 1.8370343182235271, "grad_norm": 3.4724996089935303, "learning_rate": 9.184253991558085e-06, "loss": 0.2906, "num_input_tokens_seen": 21589488, "step": 10010 }, { "epoch": 1.8379519177830794, "grad_norm": 6.51158332824707, "learning_rate": 9.188841989355846e-06, "loss": 0.2871, "num_input_tokens_seen": 21599024, "step": 10015 }, { "epoch": 1.8388695173426317, "grad_norm": 3.9341533184051514, "learning_rate": 9.193429987153606e-06, "loss": 0.2858, "num_input_tokens_seen": 21610128, "step": 10020 }, { "epoch": 1.8397871169021838, "grad_norm": 7.611935138702393, "learning_rate": 9.198017984951369e-06, "loss": 0.2511, "num_input_tokens_seen": 21619920, "step": 10025 }, { "epoch": 1.840704716461736, "grad_norm": 14.002884864807129, "learning_rate": 9.20260598274913e-06, "loss": 0.2877, "num_input_tokens_seen": 21629872, "step": 10030 }, { "epoch": 1.8416223160212883, "grad_norm": 4.060583114624023, "learning_rate": 9.20719398054689e-06, "loss": 0.1954, "num_input_tokens_seen": 21639568, "step": 10035 }, { "epoch": 1.8425399155808404, "grad_norm": 4.653133392333984, "learning_rate": 9.211781978344652e-06, "loss": 0.4234, "num_input_tokens_seen": 21651024, "step": 10040 }, { "epoch": 1.8434575151403927, "grad_norm": 5.3822760581970215, "learning_rate": 9.216369976142412e-06, "loss": 0.376, "num_input_tokens_seen": 21662672, "step": 10045 }, { "epoch": 1.844375114699945, "grad_norm": 5.977126598358154, "learning_rate": 9.220957973940173e-06, "loss": 0.3268, "num_input_tokens_seen": 21673744, "step": 10050 }, { "epoch": 1.845292714259497, "grad_norm": 14.304112434387207, "learning_rate": 9.225545971737935e-06, "loss": 0.2233, "num_input_tokens_seen": 21685392, "step": 10055 }, { "epoch": 1.8462103138190493, "grad_norm": 7.398131370544434, "learning_rate": 9.230133969535696e-06, "loss": 0.168, "num_input_tokens_seen": 21696560, "step": 10060 }, { "epoch": 1.8471279133786016, "grad_norm": 16.97554588317871, "learning_rate": 9.234721967333456e-06, "loss": 0.2207, "num_input_tokens_seen": 21707568, "step": 10065 }, { "epoch": 1.8480455129381537, "grad_norm": 14.321292877197266, "learning_rate": 9.239309965131218e-06, "loss": 0.4172, "num_input_tokens_seen": 21719312, "step": 10070 }, { "epoch": 1.848963112497706, "grad_norm": 11.576197624206543, "learning_rate": 9.243897962928979e-06, "loss": 0.4345, "num_input_tokens_seen": 21728912, "step": 10075 }, { "epoch": 1.8498807120572582, "grad_norm": 8.025195121765137, "learning_rate": 9.24848596072674e-06, "loss": 0.2569, "num_input_tokens_seen": 21738256, "step": 10080 }, { "epoch": 1.8507983116168103, "grad_norm": 4.94758415222168, "learning_rate": 9.253073958524502e-06, "loss": 0.2995, "num_input_tokens_seen": 21749360, "step": 10085 }, { "epoch": 1.8517159111763626, "grad_norm": 8.168574333190918, "learning_rate": 9.257661956322262e-06, "loss": 0.3081, "num_input_tokens_seen": 21761040, "step": 10090 }, { "epoch": 1.8526335107359149, "grad_norm": 5.861288547515869, "learning_rate": 9.262249954120023e-06, "loss": 0.2836, "num_input_tokens_seen": 21772272, "step": 10095 }, { "epoch": 1.853551110295467, "grad_norm": 6.571388244628906, "learning_rate": 9.266837951917785e-06, "loss": 0.2849, "num_input_tokens_seen": 21784400, "step": 10100 }, { "epoch": 1.8544687098550192, "grad_norm": 5.9241180419921875, "learning_rate": 9.271425949715545e-06, "loss": 0.2521, "num_input_tokens_seen": 21795664, "step": 10105 }, { "epoch": 1.8553863094145715, "grad_norm": 9.153797149658203, "learning_rate": 9.276013947513306e-06, "loss": 0.3094, "num_input_tokens_seen": 21807152, "step": 10110 }, { "epoch": 1.8563039089741236, "grad_norm": 2.6762940883636475, "learning_rate": 9.280601945311068e-06, "loss": 0.3633, "num_input_tokens_seen": 21817616, "step": 10115 }, { "epoch": 1.857221508533676, "grad_norm": 7.141127109527588, "learning_rate": 9.285189943108827e-06, "loss": 0.2555, "num_input_tokens_seen": 21828464, "step": 10120 }, { "epoch": 1.8581391080932281, "grad_norm": 15.59949016571045, "learning_rate": 9.289777940906589e-06, "loss": 0.2263, "num_input_tokens_seen": 21840432, "step": 10125 }, { "epoch": 1.8590567076527802, "grad_norm": 16.73431968688965, "learning_rate": 9.294365938704351e-06, "loss": 0.3114, "num_input_tokens_seen": 21851536, "step": 10130 }, { "epoch": 1.8599743072123327, "grad_norm": 11.21884536743164, "learning_rate": 9.29895393650211e-06, "loss": 0.3426, "num_input_tokens_seen": 21862960, "step": 10135 }, { "epoch": 1.8608919067718848, "grad_norm": 3.788881778717041, "learning_rate": 9.303541934299872e-06, "loss": 0.3508, "num_input_tokens_seen": 21875184, "step": 10140 }, { "epoch": 1.8618095063314368, "grad_norm": 3.887770175933838, "learning_rate": 9.308129932097635e-06, "loss": 0.4188, "num_input_tokens_seen": 21886000, "step": 10145 }, { "epoch": 1.8627271058909893, "grad_norm": 4.342674732208252, "learning_rate": 9.312717929895393e-06, "loss": 0.3134, "num_input_tokens_seen": 21896848, "step": 10150 }, { "epoch": 1.8636447054505414, "grad_norm": 2.8539772033691406, "learning_rate": 9.317305927693156e-06, "loss": 0.28, "num_input_tokens_seen": 21906960, "step": 10155 }, { "epoch": 1.8645623050100935, "grad_norm": 2.45029354095459, "learning_rate": 9.321893925490916e-06, "loss": 0.3357, "num_input_tokens_seen": 21917904, "step": 10160 }, { "epoch": 1.865479904569646, "grad_norm": 6.4540300369262695, "learning_rate": 9.326481923288677e-06, "loss": 0.3099, "num_input_tokens_seen": 21927472, "step": 10165 }, { "epoch": 1.866397504129198, "grad_norm": 2.9555721282958984, "learning_rate": 9.331069921086439e-06, "loss": 0.3295, "num_input_tokens_seen": 21938928, "step": 10170 }, { "epoch": 1.86731510368875, "grad_norm": 4.797801971435547, "learning_rate": 9.3356579188842e-06, "loss": 0.283, "num_input_tokens_seen": 21948976, "step": 10175 }, { "epoch": 1.8682327032483026, "grad_norm": 4.136722564697266, "learning_rate": 9.34024591668196e-06, "loss": 0.2796, "num_input_tokens_seen": 21959888, "step": 10180 }, { "epoch": 1.8691503028078547, "grad_norm": 6.649714469909668, "learning_rate": 9.344833914479722e-06, "loss": 0.3336, "num_input_tokens_seen": 21970000, "step": 10185 }, { "epoch": 1.8700679023674067, "grad_norm": 13.266241073608398, "learning_rate": 9.349421912277483e-06, "loss": 0.3159, "num_input_tokens_seen": 21981488, "step": 10190 }, { "epoch": 1.8709855019269592, "grad_norm": 4.707775592803955, "learning_rate": 9.354009910075243e-06, "loss": 0.3081, "num_input_tokens_seen": 21992400, "step": 10195 }, { "epoch": 1.8719031014865113, "grad_norm": 3.47294282913208, "learning_rate": 9.358597907873005e-06, "loss": 0.212, "num_input_tokens_seen": 22003440, "step": 10200 }, { "epoch": 1.8728207010460634, "grad_norm": 1.9812357425689697, "learning_rate": 9.363185905670766e-06, "loss": 0.3414, "num_input_tokens_seen": 22014864, "step": 10205 }, { "epoch": 1.8737383006056159, "grad_norm": 5.673616886138916, "learning_rate": 9.367773903468526e-06, "loss": 0.2879, "num_input_tokens_seen": 22026224, "step": 10210 }, { "epoch": 1.874655900165168, "grad_norm": 4.2947678565979, "learning_rate": 9.372361901266289e-06, "loss": 0.1499, "num_input_tokens_seen": 22036816, "step": 10215 }, { "epoch": 1.87557349972472, "grad_norm": 10.098917007446289, "learning_rate": 9.376949899064049e-06, "loss": 0.843, "num_input_tokens_seen": 22047760, "step": 10220 }, { "epoch": 1.8764910992842725, "grad_norm": 2.2467575073242188, "learning_rate": 9.38153789686181e-06, "loss": 0.2893, "num_input_tokens_seen": 22059472, "step": 10225 }, { "epoch": 1.8774086988438246, "grad_norm": 5.988301753997803, "learning_rate": 9.386125894659572e-06, "loss": 0.2344, "num_input_tokens_seen": 22069840, "step": 10230 }, { "epoch": 1.8783262984033766, "grad_norm": 4.1634955406188965, "learning_rate": 9.390713892457332e-06, "loss": 0.5115, "num_input_tokens_seen": 22080336, "step": 10235 }, { "epoch": 1.8792438979629291, "grad_norm": 18.549287796020508, "learning_rate": 9.395301890255093e-06, "loss": 0.3687, "num_input_tokens_seen": 22091664, "step": 10240 }, { "epoch": 1.8801614975224812, "grad_norm": 3.787266731262207, "learning_rate": 9.399889888052855e-06, "loss": 0.4903, "num_input_tokens_seen": 22102384, "step": 10245 }, { "epoch": 1.8810790970820332, "grad_norm": 2.404526710510254, "learning_rate": 9.404477885850616e-06, "loss": 0.3209, "num_input_tokens_seen": 22112784, "step": 10250 }, { "epoch": 1.8819966966415858, "grad_norm": 3.1695995330810547, "learning_rate": 9.409065883648376e-06, "loss": 0.2741, "num_input_tokens_seen": 22123568, "step": 10255 }, { "epoch": 1.8829142962011378, "grad_norm": 3.6001884937286377, "learning_rate": 9.413653881446138e-06, "loss": 0.3597, "num_input_tokens_seen": 22133488, "step": 10260 }, { "epoch": 1.8838318957606899, "grad_norm": 4.209583759307861, "learning_rate": 9.418241879243899e-06, "loss": 0.3016, "num_input_tokens_seen": 22143632, "step": 10265 }, { "epoch": 1.8847494953202424, "grad_norm": 3.2718448638916016, "learning_rate": 9.42282987704166e-06, "loss": 0.297, "num_input_tokens_seen": 22153168, "step": 10270 }, { "epoch": 1.8856670948797944, "grad_norm": 1.734670639038086, "learning_rate": 9.427417874839421e-06, "loss": 0.298, "num_input_tokens_seen": 22162896, "step": 10275 }, { "epoch": 1.8865846944393465, "grad_norm": 3.5343003273010254, "learning_rate": 9.432005872637182e-06, "loss": 0.3424, "num_input_tokens_seen": 22172240, "step": 10280 }, { "epoch": 1.887502293998899, "grad_norm": 3.2702884674072266, "learning_rate": 9.436593870434943e-06, "loss": 0.2506, "num_input_tokens_seen": 22183536, "step": 10285 }, { "epoch": 1.888419893558451, "grad_norm": 3.24877667427063, "learning_rate": 9.441181868232705e-06, "loss": 0.2661, "num_input_tokens_seen": 22194128, "step": 10290 }, { "epoch": 1.8893374931180031, "grad_norm": 3.5191855430603027, "learning_rate": 9.445769866030465e-06, "loss": 0.2501, "num_input_tokens_seen": 22204336, "step": 10295 }, { "epoch": 1.8902550926775556, "grad_norm": 6.570464134216309, "learning_rate": 9.450357863828226e-06, "loss": 0.3742, "num_input_tokens_seen": 22216624, "step": 10300 }, { "epoch": 1.8911726922371077, "grad_norm": 2.610275983810425, "learning_rate": 9.454945861625986e-06, "loss": 0.3662, "num_input_tokens_seen": 22227344, "step": 10305 }, { "epoch": 1.8920902917966598, "grad_norm": 1.164998173713684, "learning_rate": 9.459533859423748e-06, "loss": 0.3676, "num_input_tokens_seen": 22237360, "step": 10310 }, { "epoch": 1.8930078913562123, "grad_norm": 1.831139087677002, "learning_rate": 9.464121857221509e-06, "loss": 0.2376, "num_input_tokens_seen": 22247184, "step": 10315 }, { "epoch": 1.8939254909157643, "grad_norm": 2.760568857192993, "learning_rate": 9.46870985501927e-06, "loss": 0.4665, "num_input_tokens_seen": 22257680, "step": 10320 }, { "epoch": 1.8948430904753166, "grad_norm": 10.147391319274902, "learning_rate": 9.473297852817032e-06, "loss": 0.3858, "num_input_tokens_seen": 22269520, "step": 10325 }, { "epoch": 1.895760690034869, "grad_norm": 1.532842993736267, "learning_rate": 9.477885850614792e-06, "loss": 0.2558, "num_input_tokens_seen": 22281200, "step": 10330 }, { "epoch": 1.896678289594421, "grad_norm": 4.4954023361206055, "learning_rate": 9.482473848412553e-06, "loss": 0.296, "num_input_tokens_seen": 22291824, "step": 10335 }, { "epoch": 1.8975958891539733, "grad_norm": 2.6302406787872314, "learning_rate": 9.487061846210315e-06, "loss": 0.2557, "num_input_tokens_seen": 22304368, "step": 10340 }, { "epoch": 1.8985134887135255, "grad_norm": 5.425471782684326, "learning_rate": 9.491649844008075e-06, "loss": 0.3179, "num_input_tokens_seen": 22316176, "step": 10345 }, { "epoch": 1.8994310882730776, "grad_norm": 5.482667922973633, "learning_rate": 9.496237841805836e-06, "loss": 0.427, "num_input_tokens_seen": 22326064, "step": 10350 }, { "epoch": 1.90034868783263, "grad_norm": 12.33047103881836, "learning_rate": 9.500825839603598e-06, "loss": 0.3346, "num_input_tokens_seen": 22336816, "step": 10355 }, { "epoch": 1.9012662873921822, "grad_norm": 5.675306797027588, "learning_rate": 9.505413837401359e-06, "loss": 0.3127, "num_input_tokens_seen": 22347216, "step": 10360 }, { "epoch": 1.9021838869517342, "grad_norm": 2.3667938709259033, "learning_rate": 9.51000183519912e-06, "loss": 0.3176, "num_input_tokens_seen": 22358992, "step": 10365 }, { "epoch": 1.9031014865112865, "grad_norm": 3.3400087356567383, "learning_rate": 9.514589832996881e-06, "loss": 0.2537, "num_input_tokens_seen": 22370064, "step": 10370 }, { "epoch": 1.9040190860708388, "grad_norm": 2.138941526412964, "learning_rate": 9.519177830794642e-06, "loss": 0.2919, "num_input_tokens_seen": 22382032, "step": 10375 }, { "epoch": 1.9049366856303909, "grad_norm": 3.9877943992614746, "learning_rate": 9.523765828592402e-06, "loss": 0.2908, "num_input_tokens_seen": 22392464, "step": 10380 }, { "epoch": 1.9058542851899432, "grad_norm": 3.338390827178955, "learning_rate": 9.528353826390165e-06, "loss": 0.3131, "num_input_tokens_seen": 22403568, "step": 10385 }, { "epoch": 1.9067718847494954, "grad_norm": 5.106674671173096, "learning_rate": 9.532941824187925e-06, "loss": 0.2585, "num_input_tokens_seen": 22415088, "step": 10390 }, { "epoch": 1.9076894843090475, "grad_norm": 10.141694068908691, "learning_rate": 9.537529821985686e-06, "loss": 0.2849, "num_input_tokens_seen": 22426672, "step": 10395 }, { "epoch": 1.9086070838685998, "grad_norm": 7.390340805053711, "learning_rate": 9.542117819783448e-06, "loss": 0.2703, "num_input_tokens_seen": 22437552, "step": 10400 }, { "epoch": 1.909524683428152, "grad_norm": 8.378366470336914, "learning_rate": 9.546705817581208e-06, "loss": 0.2903, "num_input_tokens_seen": 22447408, "step": 10405 }, { "epoch": 1.9104422829877041, "grad_norm": 12.305851936340332, "learning_rate": 9.551293815378969e-06, "loss": 0.4067, "num_input_tokens_seen": 22457008, "step": 10410 }, { "epoch": 1.9113598825472564, "grad_norm": 23.052995681762695, "learning_rate": 9.555881813176731e-06, "loss": 0.243, "num_input_tokens_seen": 22467600, "step": 10415 }, { "epoch": 1.9122774821068087, "grad_norm": 5.922523498535156, "learning_rate": 9.560469810974492e-06, "loss": 0.4005, "num_input_tokens_seen": 22477616, "step": 10420 }, { "epoch": 1.9131950816663608, "grad_norm": 1.5558080673217773, "learning_rate": 9.565057808772252e-06, "loss": 0.3273, "num_input_tokens_seen": 22488528, "step": 10425 }, { "epoch": 1.914112681225913, "grad_norm": 5.572723865509033, "learning_rate": 9.569645806570014e-06, "loss": 0.3085, "num_input_tokens_seen": 22498640, "step": 10430 }, { "epoch": 1.9150302807854653, "grad_norm": 5.636843681335449, "learning_rate": 9.574233804367775e-06, "loss": 0.3866, "num_input_tokens_seen": 22509328, "step": 10435 }, { "epoch": 1.9159478803450174, "grad_norm": 1.7235068082809448, "learning_rate": 9.578821802165535e-06, "loss": 0.41, "num_input_tokens_seen": 22519824, "step": 10440 }, { "epoch": 1.9168654799045697, "grad_norm": 2.7170140743255615, "learning_rate": 9.583409799963298e-06, "loss": 0.3098, "num_input_tokens_seen": 22529616, "step": 10445 }, { "epoch": 1.917783079464122, "grad_norm": 2.2962632179260254, "learning_rate": 9.587997797761058e-06, "loss": 0.3312, "num_input_tokens_seen": 22539184, "step": 10450 }, { "epoch": 1.918700679023674, "grad_norm": 6.326003551483154, "learning_rate": 9.592585795558819e-06, "loss": 0.2213, "num_input_tokens_seen": 22550480, "step": 10455 }, { "epoch": 1.9196182785832263, "grad_norm": 2.980503559112549, "learning_rate": 9.597173793356581e-06, "loss": 0.3982, "num_input_tokens_seen": 22560912, "step": 10460 }, { "epoch": 1.9205358781427786, "grad_norm": 12.273818969726562, "learning_rate": 9.601761791154341e-06, "loss": 0.3111, "num_input_tokens_seen": 22571760, "step": 10465 }, { "epoch": 1.9214534777023307, "grad_norm": 3.0378122329711914, "learning_rate": 9.606349788952102e-06, "loss": 0.3781, "num_input_tokens_seen": 22582480, "step": 10470 }, { "epoch": 1.922371077261883, "grad_norm": 2.3372533321380615, "learning_rate": 9.610937786749862e-06, "loss": 0.2619, "num_input_tokens_seen": 22593136, "step": 10475 }, { "epoch": 1.9232886768214352, "grad_norm": 9.16244888305664, "learning_rate": 9.615525784547625e-06, "loss": 0.2663, "num_input_tokens_seen": 22603728, "step": 10480 }, { "epoch": 1.9242062763809873, "grad_norm": 8.76370906829834, "learning_rate": 9.620113782345385e-06, "loss": 0.4799, "num_input_tokens_seen": 22615344, "step": 10485 }, { "epoch": 1.9251238759405396, "grad_norm": 2.488227128982544, "learning_rate": 9.624701780143146e-06, "loss": 0.208, "num_input_tokens_seen": 22626736, "step": 10490 }, { "epoch": 1.9260414755000919, "grad_norm": 15.132133483886719, "learning_rate": 9.629289777940908e-06, "loss": 0.3638, "num_input_tokens_seen": 22637456, "step": 10495 }, { "epoch": 1.926959075059644, "grad_norm": 1.9113214015960693, "learning_rate": 9.633877775738668e-06, "loss": 0.35, "num_input_tokens_seen": 22647888, "step": 10500 }, { "epoch": 1.9278766746191962, "grad_norm": 2.8981363773345947, "learning_rate": 9.638465773536429e-06, "loss": 0.2132, "num_input_tokens_seen": 22658320, "step": 10505 }, { "epoch": 1.9287942741787485, "grad_norm": 6.464080810546875, "learning_rate": 9.643053771334191e-06, "loss": 0.3984, "num_input_tokens_seen": 22668944, "step": 10510 }, { "epoch": 1.9297118737383006, "grad_norm": 2.896333932876587, "learning_rate": 9.647641769131952e-06, "loss": 0.2271, "num_input_tokens_seen": 22680912, "step": 10515 }, { "epoch": 1.9306294732978528, "grad_norm": 3.6894078254699707, "learning_rate": 9.652229766929712e-06, "loss": 0.2682, "num_input_tokens_seen": 22690544, "step": 10520 }, { "epoch": 1.9315470728574051, "grad_norm": 20.961162567138672, "learning_rate": 9.656817764727474e-06, "loss": 0.4931, "num_input_tokens_seen": 22701424, "step": 10525 }, { "epoch": 1.9324646724169572, "grad_norm": 12.340434074401855, "learning_rate": 9.661405762525235e-06, "loss": 0.4521, "num_input_tokens_seen": 22711760, "step": 10530 }, { "epoch": 1.9333822719765095, "grad_norm": 4.272472858428955, "learning_rate": 9.665993760322995e-06, "loss": 0.3397, "num_input_tokens_seen": 22723280, "step": 10535 }, { "epoch": 1.9342998715360618, "grad_norm": 2.3846795558929443, "learning_rate": 9.670581758120758e-06, "loss": 0.3425, "num_input_tokens_seen": 22734576, "step": 10540 }, { "epoch": 1.9352174710956138, "grad_norm": 6.329697132110596, "learning_rate": 9.675169755918518e-06, "loss": 0.2888, "num_input_tokens_seen": 22746064, "step": 10545 }, { "epoch": 1.936135070655166, "grad_norm": 8.553730964660645, "learning_rate": 9.679757753716279e-06, "loss": 0.3432, "num_input_tokens_seen": 22756016, "step": 10550 }, { "epoch": 1.9370526702147184, "grad_norm": 3.5397560596466064, "learning_rate": 9.68434575151404e-06, "loss": 0.2896, "num_input_tokens_seen": 22767792, "step": 10555 }, { "epoch": 1.9379702697742704, "grad_norm": 2.242082357406616, "learning_rate": 9.688933749311801e-06, "loss": 0.3553, "num_input_tokens_seen": 22779824, "step": 10560 }, { "epoch": 1.9388878693338227, "grad_norm": 3.101362705230713, "learning_rate": 9.693521747109562e-06, "loss": 0.3931, "num_input_tokens_seen": 22790288, "step": 10565 }, { "epoch": 1.939805468893375, "grad_norm": 5.65133810043335, "learning_rate": 9.698109744907324e-06, "loss": 0.3055, "num_input_tokens_seen": 22800976, "step": 10570 }, { "epoch": 1.940723068452927, "grad_norm": 2.0405991077423096, "learning_rate": 9.702697742705085e-06, "loss": 0.3426, "num_input_tokens_seen": 22811408, "step": 10575 }, { "epoch": 1.9416406680124794, "grad_norm": 3.827538013458252, "learning_rate": 9.707285740502845e-06, "loss": 0.2631, "num_input_tokens_seen": 22822352, "step": 10580 }, { "epoch": 1.9425582675720316, "grad_norm": 4.783318042755127, "learning_rate": 9.711873738300607e-06, "loss": 0.4511, "num_input_tokens_seen": 22833584, "step": 10585 }, { "epoch": 1.9434758671315837, "grad_norm": 7.495079040527344, "learning_rate": 9.716461736098368e-06, "loss": 0.3342, "num_input_tokens_seen": 22844016, "step": 10590 }, { "epoch": 1.944393466691136, "grad_norm": 11.097040176391602, "learning_rate": 9.721049733896128e-06, "loss": 0.2341, "num_input_tokens_seen": 22855568, "step": 10595 }, { "epoch": 1.9453110662506883, "grad_norm": 5.928249359130859, "learning_rate": 9.72563773169389e-06, "loss": 0.2503, "num_input_tokens_seen": 22866928, "step": 10600 }, { "epoch": 1.9462286658102403, "grad_norm": 8.3919677734375, "learning_rate": 9.730225729491651e-06, "loss": 0.3326, "num_input_tokens_seen": 22877968, "step": 10605 }, { "epoch": 1.9471462653697926, "grad_norm": 2.0846498012542725, "learning_rate": 9.734813727289412e-06, "loss": 0.2579, "num_input_tokens_seen": 22888368, "step": 10610 }, { "epoch": 1.948063864929345, "grad_norm": 6.682167053222656, "learning_rate": 9.739401725087174e-06, "loss": 0.3935, "num_input_tokens_seen": 22898704, "step": 10615 }, { "epoch": 1.948981464488897, "grad_norm": 7.542962074279785, "learning_rate": 9.743989722884933e-06, "loss": 0.2921, "num_input_tokens_seen": 22910384, "step": 10620 }, { "epoch": 1.9498990640484493, "grad_norm": 14.181371688842773, "learning_rate": 9.748577720682695e-06, "loss": 0.4296, "num_input_tokens_seen": 22921616, "step": 10625 }, { "epoch": 1.9508166636080015, "grad_norm": 7.80803108215332, "learning_rate": 9.753165718480457e-06, "loss": 0.4436, "num_input_tokens_seen": 22933296, "step": 10630 }, { "epoch": 1.9517342631675536, "grad_norm": 12.607503890991211, "learning_rate": 9.757753716278216e-06, "loss": 0.2815, "num_input_tokens_seen": 22943344, "step": 10635 }, { "epoch": 1.952651862727106, "grad_norm": 7.463188171386719, "learning_rate": 9.762341714075978e-06, "loss": 0.4021, "num_input_tokens_seen": 22954800, "step": 10640 }, { "epoch": 1.9535694622866582, "grad_norm": 12.009556770324707, "learning_rate": 9.76692971187374e-06, "loss": 0.366, "num_input_tokens_seen": 22965360, "step": 10645 }, { "epoch": 1.9544870618462102, "grad_norm": 3.5504965782165527, "learning_rate": 9.771517709671499e-06, "loss": 0.1831, "num_input_tokens_seen": 22977776, "step": 10650 }, { "epoch": 1.9554046614057625, "grad_norm": 7.560934543609619, "learning_rate": 9.776105707469261e-06, "loss": 0.34, "num_input_tokens_seen": 22988208, "step": 10655 }, { "epoch": 1.9563222609653148, "grad_norm": 6.130038738250732, "learning_rate": 9.780693705267022e-06, "loss": 0.268, "num_input_tokens_seen": 22998384, "step": 10660 }, { "epoch": 1.9572398605248669, "grad_norm": 7.813295841217041, "learning_rate": 9.785281703064782e-06, "loss": 0.2662, "num_input_tokens_seen": 23009136, "step": 10665 }, { "epoch": 1.9581574600844192, "grad_norm": 10.022104263305664, "learning_rate": 9.789869700862545e-06, "loss": 0.363, "num_input_tokens_seen": 23019184, "step": 10670 }, { "epoch": 1.9590750596439714, "grad_norm": 14.263845443725586, "learning_rate": 9.794457698660305e-06, "loss": 0.381, "num_input_tokens_seen": 23029328, "step": 10675 }, { "epoch": 1.9599926592035235, "grad_norm": 6.544396877288818, "learning_rate": 9.799045696458066e-06, "loss": 0.2832, "num_input_tokens_seen": 23039696, "step": 10680 }, { "epoch": 1.9609102587630758, "grad_norm": 6.539590835571289, "learning_rate": 9.803633694255828e-06, "loss": 0.2984, "num_input_tokens_seen": 23051472, "step": 10685 }, { "epoch": 1.961827858322628, "grad_norm": 6.061500072479248, "learning_rate": 9.808221692053588e-06, "loss": 0.306, "num_input_tokens_seen": 23062032, "step": 10690 }, { "epoch": 1.9627454578821801, "grad_norm": 7.188908576965332, "learning_rate": 9.812809689851349e-06, "loss": 0.2572, "num_input_tokens_seen": 23073648, "step": 10695 }, { "epoch": 1.9636630574417324, "grad_norm": 9.001574516296387, "learning_rate": 9.817397687649111e-06, "loss": 0.2449, "num_input_tokens_seen": 23084080, "step": 10700 }, { "epoch": 1.9645806570012847, "grad_norm": 4.940621376037598, "learning_rate": 9.821985685446871e-06, "loss": 0.2577, "num_input_tokens_seen": 23093744, "step": 10705 }, { "epoch": 1.9654982565608368, "grad_norm": 3.5256171226501465, "learning_rate": 9.826573683244632e-06, "loss": 0.2602, "num_input_tokens_seen": 23104304, "step": 10710 }, { "epoch": 1.966415856120389, "grad_norm": 2.4992876052856445, "learning_rate": 9.831161681042394e-06, "loss": 0.3255, "num_input_tokens_seen": 23115568, "step": 10715 }, { "epoch": 1.9673334556799413, "grad_norm": 4.684780597686768, "learning_rate": 9.835749678840155e-06, "loss": 0.2376, "num_input_tokens_seen": 23125776, "step": 10720 }, { "epoch": 1.9682510552394934, "grad_norm": 7.207123279571533, "learning_rate": 9.840337676637915e-06, "loss": 0.3605, "num_input_tokens_seen": 23137872, "step": 10725 }, { "epoch": 1.9691686547990457, "grad_norm": 8.235774040222168, "learning_rate": 9.844925674435677e-06, "loss": 0.403, "num_input_tokens_seen": 23147632, "step": 10730 }, { "epoch": 1.970086254358598, "grad_norm": 3.5826988220214844, "learning_rate": 9.849513672233438e-06, "loss": 0.287, "num_input_tokens_seen": 23158704, "step": 10735 }, { "epoch": 1.97100385391815, "grad_norm": 6.002819538116455, "learning_rate": 9.854101670031198e-06, "loss": 0.2652, "num_input_tokens_seen": 23169872, "step": 10740 }, { "epoch": 1.9719214534777023, "grad_norm": 5.2855634689331055, "learning_rate": 9.85868966782896e-06, "loss": 0.3061, "num_input_tokens_seen": 23180432, "step": 10745 }, { "epoch": 1.9728390530372546, "grad_norm": 4.708727836608887, "learning_rate": 9.863277665626721e-06, "loss": 0.1575, "num_input_tokens_seen": 23193200, "step": 10750 }, { "epoch": 1.9737566525968067, "grad_norm": 1.6941580772399902, "learning_rate": 9.867865663424482e-06, "loss": 0.282, "num_input_tokens_seen": 23203376, "step": 10755 }, { "epoch": 1.974674252156359, "grad_norm": 11.201908111572266, "learning_rate": 9.872453661222244e-06, "loss": 0.311, "num_input_tokens_seen": 23212816, "step": 10760 }, { "epoch": 1.9755918517159112, "grad_norm": 19.854108810424805, "learning_rate": 9.877041659020004e-06, "loss": 0.3941, "num_input_tokens_seen": 23224144, "step": 10765 }, { "epoch": 1.9765094512754633, "grad_norm": 8.814470291137695, "learning_rate": 9.881629656817765e-06, "loss": 0.247, "num_input_tokens_seen": 23235216, "step": 10770 }, { "epoch": 1.9774270508350156, "grad_norm": 7.506138324737549, "learning_rate": 9.886217654615527e-06, "loss": 0.2321, "num_input_tokens_seen": 23247600, "step": 10775 }, { "epoch": 1.9783446503945679, "grad_norm": 4.925406455993652, "learning_rate": 9.890805652413288e-06, "loss": 0.2719, "num_input_tokens_seen": 23257936, "step": 10780 }, { "epoch": 1.97926224995412, "grad_norm": 4.067590713500977, "learning_rate": 9.89539365021105e-06, "loss": 0.3063, "num_input_tokens_seen": 23268112, "step": 10785 }, { "epoch": 1.9801798495136722, "grad_norm": 3.999678134918213, "learning_rate": 9.899981648008809e-06, "loss": 0.3169, "num_input_tokens_seen": 23280080, "step": 10790 }, { "epoch": 1.9810974490732245, "grad_norm": 3.3185904026031494, "learning_rate": 9.904569645806571e-06, "loss": 0.2162, "num_input_tokens_seen": 23288816, "step": 10795 }, { "epoch": 1.9820150486327766, "grad_norm": 6.661012649536133, "learning_rate": 9.909157643604333e-06, "loss": 0.303, "num_input_tokens_seen": 23300208, "step": 10800 }, { "epoch": 1.9829326481923288, "grad_norm": 13.994660377502441, "learning_rate": 9.913745641402092e-06, "loss": 0.4846, "num_input_tokens_seen": 23310544, "step": 10805 }, { "epoch": 1.9838502477518811, "grad_norm": 10.15087890625, "learning_rate": 9.918333639199854e-06, "loss": 0.4443, "num_input_tokens_seen": 23322224, "step": 10810 }, { "epoch": 1.9847678473114332, "grad_norm": 4.5427045822143555, "learning_rate": 9.922921636997616e-06, "loss": 0.2263, "num_input_tokens_seen": 23334256, "step": 10815 }, { "epoch": 1.9856854468709855, "grad_norm": 4.811145305633545, "learning_rate": 9.927509634795375e-06, "loss": 0.3209, "num_input_tokens_seen": 23345168, "step": 10820 }, { "epoch": 1.9866030464305378, "grad_norm": 4.848150253295898, "learning_rate": 9.932097632593137e-06, "loss": 0.3919, "num_input_tokens_seen": 23356944, "step": 10825 }, { "epoch": 1.9875206459900898, "grad_norm": 3.4526758193969727, "learning_rate": 9.936685630390898e-06, "loss": 0.3895, "num_input_tokens_seen": 23366416, "step": 10830 }, { "epoch": 1.988438245549642, "grad_norm": 3.2868595123291016, "learning_rate": 9.941273628188658e-06, "loss": 0.2478, "num_input_tokens_seen": 23377872, "step": 10835 }, { "epoch": 1.9893558451091944, "grad_norm": 2.6430840492248535, "learning_rate": 9.94586162598642e-06, "loss": 0.2785, "num_input_tokens_seen": 23389488, "step": 10840 }, { "epoch": 1.9902734446687464, "grad_norm": 6.181952476501465, "learning_rate": 9.950449623784181e-06, "loss": 0.4235, "num_input_tokens_seen": 23400176, "step": 10845 }, { "epoch": 1.9911910442282987, "grad_norm": 5.927119255065918, "learning_rate": 9.955037621581942e-06, "loss": 0.357, "num_input_tokens_seen": 23411216, "step": 10850 }, { "epoch": 1.992108643787851, "grad_norm": 5.675550937652588, "learning_rate": 9.959625619379704e-06, "loss": 0.2924, "num_input_tokens_seen": 23421904, "step": 10855 }, { "epoch": 1.993026243347403, "grad_norm": 4.56970739364624, "learning_rate": 9.964213617177464e-06, "loss": 0.3528, "num_input_tokens_seen": 23433456, "step": 10860 }, { "epoch": 1.9939438429069554, "grad_norm": 3.281423568725586, "learning_rate": 9.968801614975225e-06, "loss": 0.3052, "num_input_tokens_seen": 23445584, "step": 10865 }, { "epoch": 1.9948614424665077, "grad_norm": 2.061511754989624, "learning_rate": 9.973389612772987e-06, "loss": 0.2708, "num_input_tokens_seen": 23455248, "step": 10870 }, { "epoch": 1.9957790420260597, "grad_norm": 3.476405620574951, "learning_rate": 9.977977610570748e-06, "loss": 0.3227, "num_input_tokens_seen": 23466608, "step": 10875 }, { "epoch": 1.996696641585612, "grad_norm": 8.045883178710938, "learning_rate": 9.982565608368508e-06, "loss": 0.326, "num_input_tokens_seen": 23478192, "step": 10880 }, { "epoch": 1.9976142411451643, "grad_norm": 6.982784748077393, "learning_rate": 9.98715360616627e-06, "loss": 0.3348, "num_input_tokens_seen": 23488720, "step": 10885 }, { "epoch": 1.9985318407047163, "grad_norm": 4.578664302825928, "learning_rate": 9.991741603964031e-06, "loss": 0.2894, "num_input_tokens_seen": 23500048, "step": 10890 }, { "epoch": 1.9994494402642686, "grad_norm": 5.679470062255859, "learning_rate": 9.996329601761791e-06, "loss": 0.2708, "num_input_tokens_seen": 23511280, "step": 10895 }, { "epoch": 2.0, "eval_loss": 0.2876589000225067, "eval_runtime": 178.3045, "eval_samples_per_second": 30.56, "eval_steps_per_second": 7.644, "num_input_tokens_seen": 23516768, "step": 10898 }, { "epoch": 2.000367039823821, "grad_norm": 2.7259016036987305, "learning_rate": 9.999999997435156e-06, "loss": 0.2579, "num_input_tokens_seen": 23521440, "step": 10900 }, { "epoch": 2.001284639383373, "grad_norm": 17.301929473876953, "learning_rate": 9.999999907665581e-06, "loss": 0.2808, "num_input_tokens_seen": 23531424, "step": 10905 }, { "epoch": 2.0022022389429255, "grad_norm": 8.289338111877441, "learning_rate": 9.999999689653756e-06, "loss": 0.3915, "num_input_tokens_seen": 23543168, "step": 10910 }, { "epoch": 2.0031198385024775, "grad_norm": 8.408637046813965, "learning_rate": 9.99999934339969e-06, "loss": 0.4867, "num_input_tokens_seen": 23553408, "step": 10915 }, { "epoch": 2.0040374380620296, "grad_norm": 6.235904216766357, "learning_rate": 9.99999886890339e-06, "loss": 0.3016, "num_input_tokens_seen": 23564832, "step": 10920 }, { "epoch": 2.004955037621582, "grad_norm": 2.10756516456604, "learning_rate": 9.999998266164868e-06, "loss": 0.2525, "num_input_tokens_seen": 23576224, "step": 10925 }, { "epoch": 2.005872637181134, "grad_norm": 15.77977466583252, "learning_rate": 9.99999753518414e-06, "loss": 0.3814, "num_input_tokens_seen": 23586944, "step": 10930 }, { "epoch": 2.0067902367406862, "grad_norm": 8.847223281860352, "learning_rate": 9.999996675961223e-06, "loss": 0.3578, "num_input_tokens_seen": 23597984, "step": 10935 }, { "epoch": 2.0077078363002387, "grad_norm": 4.751491069793701, "learning_rate": 9.999995688496142e-06, "loss": 0.3664, "num_input_tokens_seen": 23608704, "step": 10940 }, { "epoch": 2.008625435859791, "grad_norm": 5.115724086761475, "learning_rate": 9.999994572788922e-06, "loss": 0.2771, "num_input_tokens_seen": 23618496, "step": 10945 }, { "epoch": 2.009543035419343, "grad_norm": 3.462764263153076, "learning_rate": 9.999993328839588e-06, "loss": 0.222, "num_input_tokens_seen": 23629216, "step": 10950 }, { "epoch": 2.0104606349788954, "grad_norm": 7.21170711517334, "learning_rate": 9.999991956648177e-06, "loss": 0.2095, "num_input_tokens_seen": 23639744, "step": 10955 }, { "epoch": 2.0113782345384474, "grad_norm": 3.991042375564575, "learning_rate": 9.999990456214719e-06, "loss": 0.1826, "num_input_tokens_seen": 23648960, "step": 10960 }, { "epoch": 2.0122958340979995, "grad_norm": 7.313308238983154, "learning_rate": 9.999988827539256e-06, "loss": 0.2666, "num_input_tokens_seen": 23659168, "step": 10965 }, { "epoch": 2.013213433657552, "grad_norm": 5.833609580993652, "learning_rate": 9.999987070621831e-06, "loss": 0.2794, "num_input_tokens_seen": 23669472, "step": 10970 }, { "epoch": 2.014131033217104, "grad_norm": 5.66803503036499, "learning_rate": 9.999985185462483e-06, "loss": 0.3516, "num_input_tokens_seen": 23680544, "step": 10975 }, { "epoch": 2.015048632776656, "grad_norm": 12.33746337890625, "learning_rate": 9.999983172061268e-06, "loss": 0.3831, "num_input_tokens_seen": 23690720, "step": 10980 }, { "epoch": 2.0159662323362086, "grad_norm": 10.16683292388916, "learning_rate": 9.999981030418231e-06, "loss": 0.2583, "num_input_tokens_seen": 23701536, "step": 10985 }, { "epoch": 2.0168838318957607, "grad_norm": 12.671812057495117, "learning_rate": 9.999978760533432e-06, "loss": 0.2422, "num_input_tokens_seen": 23714752, "step": 10990 }, { "epoch": 2.0178014314553128, "grad_norm": 3.5545458793640137, "learning_rate": 9.999976362406924e-06, "loss": 0.2, "num_input_tokens_seen": 23725696, "step": 10995 }, { "epoch": 2.0187190310148653, "grad_norm": 8.245214462280273, "learning_rate": 9.999973836038775e-06, "loss": 0.2853, "num_input_tokens_seen": 23735840, "step": 11000 }, { "epoch": 2.0196366305744173, "grad_norm": 17.405467987060547, "learning_rate": 9.999971181429045e-06, "loss": 0.4364, "num_input_tokens_seen": 23746656, "step": 11005 }, { "epoch": 2.0205542301339694, "grad_norm": 16.19526481628418, "learning_rate": 9.999968398577804e-06, "loss": 0.4737, "num_input_tokens_seen": 23756768, "step": 11010 }, { "epoch": 2.021471829693522, "grad_norm": 14.602252960205078, "learning_rate": 9.999965487485122e-06, "loss": 0.3946, "num_input_tokens_seen": 23767424, "step": 11015 }, { "epoch": 2.022389429253074, "grad_norm": 3.307816505432129, "learning_rate": 9.999962448151075e-06, "loss": 0.1793, "num_input_tokens_seen": 23778304, "step": 11020 }, { "epoch": 2.023307028812626, "grad_norm": 8.409663200378418, "learning_rate": 9.999959280575739e-06, "loss": 0.2812, "num_input_tokens_seen": 23789408, "step": 11025 }, { "epoch": 2.0242246283721785, "grad_norm": 6.276768684387207, "learning_rate": 9.9999559847592e-06, "loss": 0.2737, "num_input_tokens_seen": 23801088, "step": 11030 }, { "epoch": 2.0251422279317306, "grad_norm": 10.153885841369629, "learning_rate": 9.999952560701536e-06, "loss": 0.1697, "num_input_tokens_seen": 23812672, "step": 11035 }, { "epoch": 2.0260598274912827, "grad_norm": 16.874324798583984, "learning_rate": 9.99994900840284e-06, "loss": 0.2812, "num_input_tokens_seen": 23823424, "step": 11040 }, { "epoch": 2.026977427050835, "grad_norm": 8.326888084411621, "learning_rate": 9.9999453278632e-06, "loss": 0.3201, "num_input_tokens_seen": 23834400, "step": 11045 }, { "epoch": 2.0278950266103872, "grad_norm": 3.7665507793426514, "learning_rate": 9.999941519082713e-06, "loss": 0.2021, "num_input_tokens_seen": 23845376, "step": 11050 }, { "epoch": 2.0288126261699393, "grad_norm": 8.142403602600098, "learning_rate": 9.999937582061472e-06, "loss": 0.3123, "num_input_tokens_seen": 23856000, "step": 11055 }, { "epoch": 2.029730225729492, "grad_norm": 8.782761573791504, "learning_rate": 9.999933516799584e-06, "loss": 0.2884, "num_input_tokens_seen": 23866624, "step": 11060 }, { "epoch": 2.030647825289044, "grad_norm": 11.799690246582031, "learning_rate": 9.999929323297151e-06, "loss": 0.2977, "num_input_tokens_seen": 23877984, "step": 11065 }, { "epoch": 2.031565424848596, "grad_norm": 11.040410041809082, "learning_rate": 9.999925001554277e-06, "loss": 0.4225, "num_input_tokens_seen": 23888608, "step": 11070 }, { "epoch": 2.0324830244081484, "grad_norm": 7.225735187530518, "learning_rate": 9.99992055157108e-06, "loss": 0.3123, "num_input_tokens_seen": 23899232, "step": 11075 }, { "epoch": 2.0334006239677005, "grad_norm": 3.7168638706207275, "learning_rate": 9.999915973347667e-06, "loss": 0.2579, "num_input_tokens_seen": 23909600, "step": 11080 }, { "epoch": 2.0343182235272526, "grad_norm": 3.1661438941955566, "learning_rate": 9.99991126688416e-06, "loss": 0.2034, "num_input_tokens_seen": 23920576, "step": 11085 }, { "epoch": 2.035235823086805, "grad_norm": 9.999105453491211, "learning_rate": 9.999906432180676e-06, "loss": 0.3364, "num_input_tokens_seen": 23930912, "step": 11090 }, { "epoch": 2.036153422646357, "grad_norm": 9.659588813781738, "learning_rate": 9.999901469237344e-06, "loss": 0.3571, "num_input_tokens_seen": 23941408, "step": 11095 }, { "epoch": 2.037071022205909, "grad_norm": 9.806279182434082, "learning_rate": 9.999896378054285e-06, "loss": 0.2217, "num_input_tokens_seen": 23953696, "step": 11100 }, { "epoch": 2.0379886217654617, "grad_norm": 10.123931884765625, "learning_rate": 9.999891158631637e-06, "loss": 0.2724, "num_input_tokens_seen": 23964480, "step": 11105 }, { "epoch": 2.0389062213250138, "grad_norm": 6.4327569007873535, "learning_rate": 9.999885810969528e-06, "loss": 0.2137, "num_input_tokens_seen": 23975072, "step": 11110 }, { "epoch": 2.039823820884566, "grad_norm": 6.656304359436035, "learning_rate": 9.999880335068096e-06, "loss": 0.3084, "num_input_tokens_seen": 23986304, "step": 11115 }, { "epoch": 2.0407414204441183, "grad_norm": 20.82772445678711, "learning_rate": 9.999874730927484e-06, "loss": 0.2866, "num_input_tokens_seen": 23996608, "step": 11120 }, { "epoch": 2.0416590200036704, "grad_norm": 4.5969767570495605, "learning_rate": 9.999868998547834e-06, "loss": 0.4057, "num_input_tokens_seen": 24007648, "step": 11125 }, { "epoch": 2.0425766195632225, "grad_norm": 11.103429794311523, "learning_rate": 9.999863137929293e-06, "loss": 0.2443, "num_input_tokens_seen": 24017984, "step": 11130 }, { "epoch": 2.043494219122775, "grad_norm": 9.843463897705078, "learning_rate": 9.999857149072011e-06, "loss": 0.3604, "num_input_tokens_seen": 24029856, "step": 11135 }, { "epoch": 2.044411818682327, "grad_norm": 4.875168800354004, "learning_rate": 9.999851031976142e-06, "loss": 0.2261, "num_input_tokens_seen": 24040608, "step": 11140 }, { "epoch": 2.045329418241879, "grad_norm": 8.776263236999512, "learning_rate": 9.999844786641845e-06, "loss": 0.3252, "num_input_tokens_seen": 24050496, "step": 11145 }, { "epoch": 2.0462470178014316, "grad_norm": 6.959217548370361, "learning_rate": 9.999838413069279e-06, "loss": 0.2635, "num_input_tokens_seen": 24061632, "step": 11150 }, { "epoch": 2.0471646173609837, "grad_norm": 3.857541561126709, "learning_rate": 9.999831911258604e-06, "loss": 0.3933, "num_input_tokens_seen": 24073600, "step": 11155 }, { "epoch": 2.0480822169205357, "grad_norm": 4.771834373474121, "learning_rate": 9.999825281209989e-06, "loss": 0.4303, "num_input_tokens_seen": 24083520, "step": 11160 }, { "epoch": 2.048999816480088, "grad_norm": 5.756290912628174, "learning_rate": 9.999818522923608e-06, "loss": 0.3523, "num_input_tokens_seen": 24095296, "step": 11165 }, { "epoch": 2.0499174160396403, "grad_norm": 7.240843296051025, "learning_rate": 9.999811636399628e-06, "loss": 0.3424, "num_input_tokens_seen": 24104704, "step": 11170 }, { "epoch": 2.0508350155991923, "grad_norm": 4.277096271514893, "learning_rate": 9.99980462163823e-06, "loss": 0.2193, "num_input_tokens_seen": 24115840, "step": 11175 }, { "epoch": 2.051752615158745, "grad_norm": 2.2415125370025635, "learning_rate": 9.999797478639593e-06, "loss": 0.3247, "num_input_tokens_seen": 24126336, "step": 11180 }, { "epoch": 2.052670214718297, "grad_norm": 3.5037190914154053, "learning_rate": 9.999790207403898e-06, "loss": 0.3017, "num_input_tokens_seen": 24137344, "step": 11185 }, { "epoch": 2.053587814277849, "grad_norm": 11.614054679870605, "learning_rate": 9.999782807931333e-06, "loss": 0.1901, "num_input_tokens_seen": 24148224, "step": 11190 }, { "epoch": 2.0545054138374015, "grad_norm": 6.066946983337402, "learning_rate": 9.999775280222089e-06, "loss": 0.2727, "num_input_tokens_seen": 24158400, "step": 11195 }, { "epoch": 2.0554230133969535, "grad_norm": 9.136951446533203, "learning_rate": 9.999767624276357e-06, "loss": 0.3383, "num_input_tokens_seen": 24168640, "step": 11200 }, { "epoch": 2.0563406129565056, "grad_norm": 5.201135158538818, "learning_rate": 9.999759840094336e-06, "loss": 0.39, "num_input_tokens_seen": 24179264, "step": 11205 }, { "epoch": 2.057258212516058, "grad_norm": 5.425063133239746, "learning_rate": 9.999751927676223e-06, "loss": 0.2844, "num_input_tokens_seen": 24190368, "step": 11210 }, { "epoch": 2.05817581207561, "grad_norm": 2.425041913986206, "learning_rate": 9.999743887022223e-06, "loss": 0.3671, "num_input_tokens_seen": 24200224, "step": 11215 }, { "epoch": 2.0590934116351622, "grad_norm": 9.504328727722168, "learning_rate": 9.99973571813254e-06, "loss": 0.3634, "num_input_tokens_seen": 24211616, "step": 11220 }, { "epoch": 2.0600110111947147, "grad_norm": 4.7825117111206055, "learning_rate": 9.999727421007387e-06, "loss": 0.2006, "num_input_tokens_seen": 24221792, "step": 11225 }, { "epoch": 2.060928610754267, "grad_norm": 6.259507656097412, "learning_rate": 9.99971899564697e-06, "loss": 0.2511, "num_input_tokens_seen": 24231968, "step": 11230 }, { "epoch": 2.061846210313819, "grad_norm": 14.101293563842773, "learning_rate": 9.999710442051514e-06, "loss": 0.3593, "num_input_tokens_seen": 24244416, "step": 11235 }, { "epoch": 2.0627638098733714, "grad_norm": 10.699189186096191, "learning_rate": 9.999701760221231e-06, "loss": 0.4544, "num_input_tokens_seen": 24255648, "step": 11240 }, { "epoch": 2.0636814094329234, "grad_norm": 5.600504398345947, "learning_rate": 9.999692950156347e-06, "loss": 0.3024, "num_input_tokens_seen": 24267040, "step": 11245 }, { "epoch": 2.0645990089924755, "grad_norm": 4.335362911224365, "learning_rate": 9.999684011857089e-06, "loss": 0.2755, "num_input_tokens_seen": 24277376, "step": 11250 }, { "epoch": 2.065516608552028, "grad_norm": 4.668900489807129, "learning_rate": 9.999674945323685e-06, "loss": 0.3008, "num_input_tokens_seen": 24287744, "step": 11255 }, { "epoch": 2.06643420811158, "grad_norm": 3.050610303878784, "learning_rate": 9.999665750556367e-06, "loss": 0.4319, "num_input_tokens_seen": 24298720, "step": 11260 }, { "epoch": 2.067351807671132, "grad_norm": 4.940123558044434, "learning_rate": 9.99965642755537e-06, "loss": 0.2414, "num_input_tokens_seen": 24309952, "step": 11265 }, { "epoch": 2.0682694072306846, "grad_norm": 3.060605764389038, "learning_rate": 9.999646976320937e-06, "loss": 0.2927, "num_input_tokens_seen": 24320256, "step": 11270 }, { "epoch": 2.0691870067902367, "grad_norm": 5.278637886047363, "learning_rate": 9.999637396853306e-06, "loss": 0.2972, "num_input_tokens_seen": 24331584, "step": 11275 }, { "epoch": 2.0701046063497888, "grad_norm": 4.944475173950195, "learning_rate": 9.999627689152725e-06, "loss": 0.2225, "num_input_tokens_seen": 24341120, "step": 11280 }, { "epoch": 2.0710222059093413, "grad_norm": 4.154308795928955, "learning_rate": 9.999617853219444e-06, "loss": 0.2309, "num_input_tokens_seen": 24351072, "step": 11285 }, { "epoch": 2.0719398054688933, "grad_norm": 3.8087475299835205, "learning_rate": 9.99960788905371e-06, "loss": 0.3429, "num_input_tokens_seen": 24361888, "step": 11290 }, { "epoch": 2.0728574050284454, "grad_norm": 3.7493185997009277, "learning_rate": 9.999597796655785e-06, "loss": 0.2919, "num_input_tokens_seen": 24372480, "step": 11295 }, { "epoch": 2.073775004587998, "grad_norm": 2.8232946395874023, "learning_rate": 9.999587576025924e-06, "loss": 0.2504, "num_input_tokens_seen": 24383872, "step": 11300 }, { "epoch": 2.07469260414755, "grad_norm": 24.097049713134766, "learning_rate": 9.999577227164393e-06, "loss": 0.2305, "num_input_tokens_seen": 24394400, "step": 11305 }, { "epoch": 2.075610203707102, "grad_norm": 7.656175136566162, "learning_rate": 9.999566750071453e-06, "loss": 0.3714, "num_input_tokens_seen": 24405120, "step": 11310 }, { "epoch": 2.0765278032666545, "grad_norm": 5.596554279327393, "learning_rate": 9.999556144747373e-06, "loss": 0.2558, "num_input_tokens_seen": 24415392, "step": 11315 }, { "epoch": 2.0774454028262066, "grad_norm": 4.727376937866211, "learning_rate": 9.999545411192428e-06, "loss": 0.2792, "num_input_tokens_seen": 24426464, "step": 11320 }, { "epoch": 2.0783630023857587, "grad_norm": 6.5386762619018555, "learning_rate": 9.99953454940689e-06, "loss": 0.2807, "num_input_tokens_seen": 24437216, "step": 11325 }, { "epoch": 2.079280601945311, "grad_norm": 10.278352737426758, "learning_rate": 9.999523559391042e-06, "loss": 0.2385, "num_input_tokens_seen": 24448288, "step": 11330 }, { "epoch": 2.0801982015048632, "grad_norm": 12.68419075012207, "learning_rate": 9.999512441145163e-06, "loss": 0.3739, "num_input_tokens_seen": 24458272, "step": 11335 }, { "epoch": 2.0811158010644153, "grad_norm": 10.922774314880371, "learning_rate": 9.999501194669536e-06, "loss": 0.3649, "num_input_tokens_seen": 24468192, "step": 11340 }, { "epoch": 2.082033400623968, "grad_norm": 5.564507484436035, "learning_rate": 9.999489819964454e-06, "loss": 0.3728, "num_input_tokens_seen": 24479936, "step": 11345 }, { "epoch": 2.08295100018352, "grad_norm": 6.978668689727783, "learning_rate": 9.999478317030207e-06, "loss": 0.3794, "num_input_tokens_seen": 24490464, "step": 11350 }, { "epoch": 2.083868599743072, "grad_norm": 10.085699081420898, "learning_rate": 9.999466685867089e-06, "loss": 0.3311, "num_input_tokens_seen": 24501344, "step": 11355 }, { "epoch": 2.0847861993026244, "grad_norm": 6.558310508728027, "learning_rate": 9.999454926475399e-06, "loss": 0.3005, "num_input_tokens_seen": 24511552, "step": 11360 }, { "epoch": 2.0857037988621765, "grad_norm": 4.089015483856201, "learning_rate": 9.999443038855438e-06, "loss": 0.2386, "num_input_tokens_seen": 24522528, "step": 11365 }, { "epoch": 2.0866213984217286, "grad_norm": 2.142237901687622, "learning_rate": 9.999431023007511e-06, "loss": 0.2501, "num_input_tokens_seen": 24533216, "step": 11370 }, { "epoch": 2.087538997981281, "grad_norm": 2.0553534030914307, "learning_rate": 9.999418878931927e-06, "loss": 0.2688, "num_input_tokens_seen": 24543968, "step": 11375 }, { "epoch": 2.088456597540833, "grad_norm": 13.221633911132812, "learning_rate": 9.999406606628999e-06, "loss": 0.3142, "num_input_tokens_seen": 24555680, "step": 11380 }, { "epoch": 2.089374197100385, "grad_norm": 5.0572381019592285, "learning_rate": 9.999394206099038e-06, "loss": 0.3241, "num_input_tokens_seen": 24566624, "step": 11385 }, { "epoch": 2.0902917966599377, "grad_norm": 8.747896194458008, "learning_rate": 9.999381677342365e-06, "loss": 0.2906, "num_input_tokens_seen": 24577152, "step": 11390 }, { "epoch": 2.0912093962194898, "grad_norm": 2.5050272941589355, "learning_rate": 9.999369020359299e-06, "loss": 0.3839, "num_input_tokens_seen": 24588832, "step": 11395 }, { "epoch": 2.092126995779042, "grad_norm": 3.958989381790161, "learning_rate": 9.999356235150169e-06, "loss": 0.1589, "num_input_tokens_seen": 24598880, "step": 11400 }, { "epoch": 2.0930445953385943, "grad_norm": 2.3127126693725586, "learning_rate": 9.999343321715296e-06, "loss": 0.252, "num_input_tokens_seen": 24608832, "step": 11405 }, { "epoch": 2.0939621948981464, "grad_norm": 8.56747817993164, "learning_rate": 9.999330280055018e-06, "loss": 0.2526, "num_input_tokens_seen": 24618912, "step": 11410 }, { "epoch": 2.0948797944576985, "grad_norm": 17.70288848876953, "learning_rate": 9.999317110169665e-06, "loss": 0.2819, "num_input_tokens_seen": 24628928, "step": 11415 }, { "epoch": 2.095797394017251, "grad_norm": 3.1969974040985107, "learning_rate": 9.999303812059576e-06, "loss": 0.3132, "num_input_tokens_seen": 24639872, "step": 11420 }, { "epoch": 2.096714993576803, "grad_norm": 5.7787933349609375, "learning_rate": 9.999290385725093e-06, "loss": 0.3897, "num_input_tokens_seen": 24650528, "step": 11425 }, { "epoch": 2.097632593136355, "grad_norm": 5.241342067718506, "learning_rate": 9.99927683116656e-06, "loss": 0.2325, "num_input_tokens_seen": 24661664, "step": 11430 }, { "epoch": 2.0985501926959076, "grad_norm": 4.756380081176758, "learning_rate": 9.999263148384326e-06, "loss": 0.2833, "num_input_tokens_seen": 24672736, "step": 11435 }, { "epoch": 2.0994677922554597, "grad_norm": 4.190909385681152, "learning_rate": 9.999249337378739e-06, "loss": 0.2621, "num_input_tokens_seen": 24682816, "step": 11440 }, { "epoch": 2.1003853918150117, "grad_norm": 6.834646224975586, "learning_rate": 9.999235398150154e-06, "loss": 0.3133, "num_input_tokens_seen": 24692640, "step": 11445 }, { "epoch": 2.101302991374564, "grad_norm": 7.396344184875488, "learning_rate": 9.99922133069893e-06, "loss": 0.1659, "num_input_tokens_seen": 24704224, "step": 11450 }, { "epoch": 2.1022205909341163, "grad_norm": 7.795497894287109, "learning_rate": 9.999207135025425e-06, "loss": 0.2036, "num_input_tokens_seen": 24714624, "step": 11455 }, { "epoch": 2.1031381904936683, "grad_norm": 6.025665283203125, "learning_rate": 9.999192811130008e-06, "loss": 0.2854, "num_input_tokens_seen": 24724768, "step": 11460 }, { "epoch": 2.104055790053221, "grad_norm": 16.511430740356445, "learning_rate": 9.999178359013042e-06, "loss": 0.2543, "num_input_tokens_seen": 24735584, "step": 11465 }, { "epoch": 2.104973389612773, "grad_norm": 13.586663246154785, "learning_rate": 9.9991637786749e-06, "loss": 0.5476, "num_input_tokens_seen": 24745920, "step": 11470 }, { "epoch": 2.105890989172325, "grad_norm": 4.467092514038086, "learning_rate": 9.999149070115952e-06, "loss": 0.3918, "num_input_tokens_seen": 24757344, "step": 11475 }, { "epoch": 2.1068085887318775, "grad_norm": 5.771144390106201, "learning_rate": 9.999134233336581e-06, "loss": 0.3362, "num_input_tokens_seen": 24768128, "step": 11480 }, { "epoch": 2.1077261882914295, "grad_norm": 2.8365864753723145, "learning_rate": 9.999119268337165e-06, "loss": 0.2257, "num_input_tokens_seen": 24779200, "step": 11485 }, { "epoch": 2.1086437878509816, "grad_norm": 4.988222599029541, "learning_rate": 9.999104175118087e-06, "loss": 0.3025, "num_input_tokens_seen": 24789664, "step": 11490 }, { "epoch": 2.109561387410534, "grad_norm": 10.158021926879883, "learning_rate": 9.999088953679734e-06, "loss": 0.2882, "num_input_tokens_seen": 24800928, "step": 11495 }, { "epoch": 2.110478986970086, "grad_norm": 2.257373809814453, "learning_rate": 9.999073604022498e-06, "loss": 0.2837, "num_input_tokens_seen": 24811872, "step": 11500 }, { "epoch": 2.1113965865296382, "grad_norm": 9.438865661621094, "learning_rate": 9.999058126146773e-06, "loss": 0.3224, "num_input_tokens_seen": 24822752, "step": 11505 }, { "epoch": 2.1123141860891907, "grad_norm": 6.381481170654297, "learning_rate": 9.999042520052954e-06, "loss": 0.4046, "num_input_tokens_seen": 24833632, "step": 11510 }, { "epoch": 2.113231785648743, "grad_norm": 4.589237689971924, "learning_rate": 9.999026785741443e-06, "loss": 0.3609, "num_input_tokens_seen": 24844544, "step": 11515 }, { "epoch": 2.114149385208295, "grad_norm": 2.8704981803894043, "learning_rate": 9.999010923212642e-06, "loss": 0.3143, "num_input_tokens_seen": 24854592, "step": 11520 }, { "epoch": 2.1150669847678474, "grad_norm": 4.376993179321289, "learning_rate": 9.998994932466958e-06, "loss": 0.2947, "num_input_tokens_seen": 24866816, "step": 11525 }, { "epoch": 2.1159845843273994, "grad_norm": 3.2336089611053467, "learning_rate": 9.998978813504803e-06, "loss": 0.2134, "num_input_tokens_seen": 24878528, "step": 11530 }, { "epoch": 2.116902183886952, "grad_norm": 3.7494189739227295, "learning_rate": 9.998962566326587e-06, "loss": 0.23, "num_input_tokens_seen": 24889792, "step": 11535 }, { "epoch": 2.117819783446504, "grad_norm": 5.147046089172363, "learning_rate": 9.99894619093273e-06, "loss": 0.2979, "num_input_tokens_seen": 24900672, "step": 11540 }, { "epoch": 2.118737383006056, "grad_norm": 7.381143569946289, "learning_rate": 9.998929687323651e-06, "loss": 0.3212, "num_input_tokens_seen": 24911040, "step": 11545 }, { "epoch": 2.1196549825656086, "grad_norm": 5.20147705078125, "learning_rate": 9.998913055499775e-06, "loss": 0.2814, "num_input_tokens_seen": 24922464, "step": 11550 }, { "epoch": 2.1205725821251606, "grad_norm": 3.496483087539673, "learning_rate": 9.998896295461524e-06, "loss": 0.1983, "num_input_tokens_seen": 24933664, "step": 11555 }, { "epoch": 2.1214901816847127, "grad_norm": 8.862667083740234, "learning_rate": 9.998879407209332e-06, "loss": 0.3719, "num_input_tokens_seen": 24943136, "step": 11560 }, { "epoch": 2.122407781244265, "grad_norm": 4.93242883682251, "learning_rate": 9.998862390743632e-06, "loss": 0.2763, "num_input_tokens_seen": 24954400, "step": 11565 }, { "epoch": 2.1233253808038173, "grad_norm": 9.421260833740234, "learning_rate": 9.998845246064856e-06, "loss": 0.3154, "num_input_tokens_seen": 24966048, "step": 11570 }, { "epoch": 2.1242429803633693, "grad_norm": 2.472907781600952, "learning_rate": 9.998827973173448e-06, "loss": 0.1353, "num_input_tokens_seen": 24976608, "step": 11575 }, { "epoch": 2.125160579922922, "grad_norm": 2.7837603092193604, "learning_rate": 9.998810572069851e-06, "loss": 0.2472, "num_input_tokens_seen": 24987200, "step": 11580 }, { "epoch": 2.126078179482474, "grad_norm": 4.425224304199219, "learning_rate": 9.99879304275451e-06, "loss": 0.3312, "num_input_tokens_seen": 24998432, "step": 11585 }, { "epoch": 2.126995779042026, "grad_norm": 7.125009059906006, "learning_rate": 9.998775385227875e-06, "loss": 0.3308, "num_input_tokens_seen": 25009088, "step": 11590 }, { "epoch": 2.1279133786015785, "grad_norm": 5.105013370513916, "learning_rate": 9.998757599490398e-06, "loss": 0.1864, "num_input_tokens_seen": 25020096, "step": 11595 }, { "epoch": 2.1288309781611305, "grad_norm": 16.089679718017578, "learning_rate": 9.998739685542536e-06, "loss": 0.3696, "num_input_tokens_seen": 25030816, "step": 11600 }, { "epoch": 2.1297485777206826, "grad_norm": 8.936731338500977, "learning_rate": 9.998721643384748e-06, "loss": 0.2616, "num_input_tokens_seen": 25040000, "step": 11605 }, { "epoch": 2.130666177280235, "grad_norm": 4.373447895050049, "learning_rate": 9.998703473017499e-06, "loss": 0.4137, "num_input_tokens_seen": 25049984, "step": 11610 }, { "epoch": 2.131583776839787, "grad_norm": 3.603454113006592, "learning_rate": 9.998685174441252e-06, "loss": 0.2191, "num_input_tokens_seen": 25061184, "step": 11615 }, { "epoch": 2.1325013763993392, "grad_norm": 5.24937629699707, "learning_rate": 9.998666747656479e-06, "loss": 0.3395, "num_input_tokens_seen": 25070624, "step": 11620 }, { "epoch": 2.1334189759588917, "grad_norm": 4.04479455947876, "learning_rate": 9.998648192663648e-06, "loss": 0.2402, "num_input_tokens_seen": 25081056, "step": 11625 }, { "epoch": 2.134336575518444, "grad_norm": 4.856521129608154, "learning_rate": 9.99862950946324e-06, "loss": 0.3106, "num_input_tokens_seen": 25091104, "step": 11630 }, { "epoch": 2.135254175077996, "grad_norm": 6.296347141265869, "learning_rate": 9.998610698055732e-06, "loss": 0.2391, "num_input_tokens_seen": 25102848, "step": 11635 }, { "epoch": 2.1361717746375484, "grad_norm": 11.723033905029297, "learning_rate": 9.998591758441608e-06, "loss": 0.2019, "num_input_tokens_seen": 25111904, "step": 11640 }, { "epoch": 2.1370893741971004, "grad_norm": 10.908712387084961, "learning_rate": 9.99857269062135e-06, "loss": 0.208, "num_input_tokens_seen": 25121952, "step": 11645 }, { "epoch": 2.1380069737566525, "grad_norm": 2.6645100116729736, "learning_rate": 9.998553494595453e-06, "loss": 0.2989, "num_input_tokens_seen": 25132096, "step": 11650 }, { "epoch": 2.138924573316205, "grad_norm": 13.429795265197754, "learning_rate": 9.998534170364403e-06, "loss": 0.4778, "num_input_tokens_seen": 25142912, "step": 11655 }, { "epoch": 2.139842172875757, "grad_norm": 2.39284610748291, "learning_rate": 9.9985147179287e-06, "loss": 0.2939, "num_input_tokens_seen": 25152384, "step": 11660 }, { "epoch": 2.140759772435309, "grad_norm": 2.7765960693359375, "learning_rate": 9.99849513728884e-06, "loss": 0.3335, "num_input_tokens_seen": 25163936, "step": 11665 }, { "epoch": 2.1416773719948616, "grad_norm": 4.614788055419922, "learning_rate": 9.998475428445329e-06, "loss": 0.2469, "num_input_tokens_seen": 25174624, "step": 11670 }, { "epoch": 2.1425949715544137, "grad_norm": 8.56151008605957, "learning_rate": 9.998455591398668e-06, "loss": 0.2075, "num_input_tokens_seen": 25186144, "step": 11675 }, { "epoch": 2.1435125711139658, "grad_norm": 5.83230447769165, "learning_rate": 9.99843562614937e-06, "loss": 0.2963, "num_input_tokens_seen": 25197152, "step": 11680 }, { "epoch": 2.1444301706735183, "grad_norm": 6.265300273895264, "learning_rate": 9.998415532697943e-06, "loss": 0.2069, "num_input_tokens_seen": 25207584, "step": 11685 }, { "epoch": 2.1453477702330703, "grad_norm": 2.4670350551605225, "learning_rate": 9.998395311044907e-06, "loss": 0.3878, "num_input_tokens_seen": 25218048, "step": 11690 }, { "epoch": 2.1462653697926224, "grad_norm": 7.438182830810547, "learning_rate": 9.998374961190776e-06, "loss": 0.2497, "num_input_tokens_seen": 25227808, "step": 11695 }, { "epoch": 2.147182969352175, "grad_norm": 8.425724029541016, "learning_rate": 9.998354483136073e-06, "loss": 0.1613, "num_input_tokens_seen": 25238816, "step": 11700 }, { "epoch": 2.148100568911727, "grad_norm": 6.330658435821533, "learning_rate": 9.998333876881325e-06, "loss": 0.4296, "num_input_tokens_seen": 25250272, "step": 11705 }, { "epoch": 2.149018168471279, "grad_norm": 9.099831581115723, "learning_rate": 9.998313142427061e-06, "loss": 0.4192, "num_input_tokens_seen": 25260448, "step": 11710 }, { "epoch": 2.1499357680308315, "grad_norm": 9.606108665466309, "learning_rate": 9.998292279773812e-06, "loss": 0.3296, "num_input_tokens_seen": 25271104, "step": 11715 }, { "epoch": 2.1508533675903836, "grad_norm": 4.8962531089782715, "learning_rate": 9.998271288922111e-06, "loss": 0.3805, "num_input_tokens_seen": 25281600, "step": 11720 }, { "epoch": 2.1517709671499357, "grad_norm": 3.837297201156616, "learning_rate": 9.998250169872499e-06, "loss": 0.2799, "num_input_tokens_seen": 25290240, "step": 11725 }, { "epoch": 2.152688566709488, "grad_norm": 4.648876190185547, "learning_rate": 9.998228922625517e-06, "loss": 0.343, "num_input_tokens_seen": 25300704, "step": 11730 }, { "epoch": 2.1536061662690402, "grad_norm": 3.2959961891174316, "learning_rate": 9.998207547181708e-06, "loss": 0.3674, "num_input_tokens_seen": 25312704, "step": 11735 }, { "epoch": 2.1545237658285923, "grad_norm": 17.549840927124023, "learning_rate": 9.998186043541624e-06, "loss": 0.2019, "num_input_tokens_seen": 25323840, "step": 11740 }, { "epoch": 2.155441365388145, "grad_norm": 3.2944319248199463, "learning_rate": 9.998164411705812e-06, "loss": 0.2956, "num_input_tokens_seen": 25333600, "step": 11745 }, { "epoch": 2.156358964947697, "grad_norm": 8.470412254333496, "learning_rate": 9.998142651674832e-06, "loss": 0.5475, "num_input_tokens_seen": 25344640, "step": 11750 }, { "epoch": 2.157276564507249, "grad_norm": 2.8559319972991943, "learning_rate": 9.998120763449238e-06, "loss": 0.2827, "num_input_tokens_seen": 25356160, "step": 11755 }, { "epoch": 2.1581941640668014, "grad_norm": 5.794381141662598, "learning_rate": 9.998098747029594e-06, "loss": 0.2929, "num_input_tokens_seen": 25367296, "step": 11760 }, { "epoch": 2.1591117636263535, "grad_norm": 2.6151158809661865, "learning_rate": 9.998076602416462e-06, "loss": 0.2762, "num_input_tokens_seen": 25378528, "step": 11765 }, { "epoch": 2.1600293631859055, "grad_norm": 3.2109618186950684, "learning_rate": 9.99805432961041e-06, "loss": 0.1947, "num_input_tokens_seen": 25390272, "step": 11770 }, { "epoch": 2.160946962745458, "grad_norm": 2.0424766540527344, "learning_rate": 9.998031928612015e-06, "loss": 0.3864, "num_input_tokens_seen": 25399456, "step": 11775 }, { "epoch": 2.16186456230501, "grad_norm": 4.432719707489014, "learning_rate": 9.998009399421845e-06, "loss": 0.2876, "num_input_tokens_seen": 25409728, "step": 11780 }, { "epoch": 2.162782161864562, "grad_norm": 7.040672779083252, "learning_rate": 9.997986742040479e-06, "loss": 0.2939, "num_input_tokens_seen": 25420672, "step": 11785 }, { "epoch": 2.1636997614241147, "grad_norm": 5.220311641693115, "learning_rate": 9.997963956468501e-06, "loss": 0.1941, "num_input_tokens_seen": 25431712, "step": 11790 }, { "epoch": 2.1646173609836667, "grad_norm": 3.9682295322418213, "learning_rate": 9.997941042706493e-06, "loss": 0.2433, "num_input_tokens_seen": 25442880, "step": 11795 }, { "epoch": 2.165534960543219, "grad_norm": 3.5462687015533447, "learning_rate": 9.997918000755044e-06, "loss": 0.2831, "num_input_tokens_seen": 25453248, "step": 11800 }, { "epoch": 2.1664525601027713, "grad_norm": 11.232667922973633, "learning_rate": 9.997894830614743e-06, "loss": 0.3012, "num_input_tokens_seen": 25463424, "step": 11805 }, { "epoch": 2.1673701596623234, "grad_norm": 8.796072006225586, "learning_rate": 9.997871532286187e-06, "loss": 0.2453, "num_input_tokens_seen": 25473408, "step": 11810 }, { "epoch": 2.1682877592218754, "grad_norm": 21.464508056640625, "learning_rate": 9.997848105769972e-06, "loss": 0.3421, "num_input_tokens_seen": 25484192, "step": 11815 }, { "epoch": 2.169205358781428, "grad_norm": 3.2564594745635986, "learning_rate": 9.9978245510667e-06, "loss": 0.1832, "num_input_tokens_seen": 25494816, "step": 11820 }, { "epoch": 2.17012295834098, "grad_norm": 6.430293083190918, "learning_rate": 9.997800868176973e-06, "loss": 0.2487, "num_input_tokens_seen": 25505792, "step": 11825 }, { "epoch": 2.171040557900532, "grad_norm": 5.559780597686768, "learning_rate": 9.9977770571014e-06, "loss": 0.3927, "num_input_tokens_seen": 25517248, "step": 11830 }, { "epoch": 2.1719581574600846, "grad_norm": 6.740344047546387, "learning_rate": 9.99775311784059e-06, "loss": 0.234, "num_input_tokens_seen": 25528160, "step": 11835 }, { "epoch": 2.1728757570196366, "grad_norm": 3.3998959064483643, "learning_rate": 9.997729050395157e-06, "loss": 0.1613, "num_input_tokens_seen": 25539328, "step": 11840 }, { "epoch": 2.1737933565791887, "grad_norm": 3.127368450164795, "learning_rate": 9.997704854765723e-06, "loss": 0.3517, "num_input_tokens_seen": 25549664, "step": 11845 }, { "epoch": 2.174710956138741, "grad_norm": 17.09321403503418, "learning_rate": 9.997680530952904e-06, "loss": 0.3087, "num_input_tokens_seen": 25560928, "step": 11850 }, { "epoch": 2.1756285556982933, "grad_norm": 4.260034084320068, "learning_rate": 9.997656078957325e-06, "loss": 0.1789, "num_input_tokens_seen": 25571072, "step": 11855 }, { "epoch": 2.1765461552578453, "grad_norm": 5.568498134613037, "learning_rate": 9.997631498779614e-06, "loss": 0.3652, "num_input_tokens_seen": 25581888, "step": 11860 }, { "epoch": 2.177463754817398, "grad_norm": 3.4625632762908936, "learning_rate": 9.9976067904204e-06, "loss": 0.268, "num_input_tokens_seen": 25591008, "step": 11865 }, { "epoch": 2.17838135437695, "grad_norm": 4.030395984649658, "learning_rate": 9.997581953880316e-06, "loss": 0.2939, "num_input_tokens_seen": 25602080, "step": 11870 }, { "epoch": 2.179298953936502, "grad_norm": 12.847334861755371, "learning_rate": 9.997556989160002e-06, "loss": 0.4891, "num_input_tokens_seen": 25614336, "step": 11875 }, { "epoch": 2.1802165534960545, "grad_norm": 5.258770942687988, "learning_rate": 9.997531896260097e-06, "loss": 0.3462, "num_input_tokens_seen": 25625984, "step": 11880 }, { "epoch": 2.1811341530556065, "grad_norm": 11.183475494384766, "learning_rate": 9.997506675181243e-06, "loss": 0.3175, "num_input_tokens_seen": 25637472, "step": 11885 }, { "epoch": 2.1820517526151586, "grad_norm": 4.670351505279541, "learning_rate": 9.99748132592409e-06, "loss": 0.2997, "num_input_tokens_seen": 25647328, "step": 11890 }, { "epoch": 2.182969352174711, "grad_norm": 8.347089767456055, "learning_rate": 9.997455848489286e-06, "loss": 0.2982, "num_input_tokens_seen": 25658592, "step": 11895 }, { "epoch": 2.183886951734263, "grad_norm": 6.266011714935303, "learning_rate": 9.997430242877484e-06, "loss": 0.2439, "num_input_tokens_seen": 25669664, "step": 11900 }, { "epoch": 2.1848045512938152, "grad_norm": 13.259675025939941, "learning_rate": 9.997404509089342e-06, "loss": 0.3103, "num_input_tokens_seen": 25681568, "step": 11905 }, { "epoch": 2.1857221508533677, "grad_norm": 3.5537500381469727, "learning_rate": 9.99737864712552e-06, "loss": 0.229, "num_input_tokens_seen": 25691968, "step": 11910 }, { "epoch": 2.18663975041292, "grad_norm": 7.2960076332092285, "learning_rate": 9.997352656986681e-06, "loss": 0.2697, "num_input_tokens_seen": 25702624, "step": 11915 }, { "epoch": 2.187557349972472, "grad_norm": 13.82867431640625, "learning_rate": 9.99732653867349e-06, "loss": 0.3232, "num_input_tokens_seen": 25713312, "step": 11920 }, { "epoch": 2.1884749495320244, "grad_norm": 0.613236665725708, "learning_rate": 9.99730029218662e-06, "loss": 0.2462, "num_input_tokens_seen": 25724576, "step": 11925 }, { "epoch": 2.1893925490915764, "grad_norm": 3.387690305709839, "learning_rate": 9.997273917526742e-06, "loss": 0.3128, "num_input_tokens_seen": 25735488, "step": 11930 }, { "epoch": 2.1903101486511285, "grad_norm": 5.2301554679870605, "learning_rate": 9.997247414694532e-06, "loss": 0.1864, "num_input_tokens_seen": 25746752, "step": 11935 }, { "epoch": 2.191227748210681, "grad_norm": 3.807312488555908, "learning_rate": 9.997220783690673e-06, "loss": 0.2881, "num_input_tokens_seen": 25758240, "step": 11940 }, { "epoch": 2.192145347770233, "grad_norm": 6.424769401550293, "learning_rate": 9.997194024515846e-06, "loss": 0.258, "num_input_tokens_seen": 25768480, "step": 11945 }, { "epoch": 2.193062947329785, "grad_norm": 11.559477806091309, "learning_rate": 9.997167137170736e-06, "loss": 0.2387, "num_input_tokens_seen": 25779392, "step": 11950 }, { "epoch": 2.1939805468893376, "grad_norm": 11.031952857971191, "learning_rate": 9.997140121656033e-06, "loss": 0.5079, "num_input_tokens_seen": 25789024, "step": 11955 }, { "epoch": 2.1948981464488897, "grad_norm": 1.4947314262390137, "learning_rate": 9.997112977972432e-06, "loss": 0.3195, "num_input_tokens_seen": 25798496, "step": 11960 }, { "epoch": 2.1958157460084418, "grad_norm": 6.829302787780762, "learning_rate": 9.997085706120628e-06, "loss": 0.2754, "num_input_tokens_seen": 25809600, "step": 11965 }, { "epoch": 2.1967333455679943, "grad_norm": 3.4688401222229004, "learning_rate": 9.99705830610132e-06, "loss": 0.2553, "num_input_tokens_seen": 25820096, "step": 11970 }, { "epoch": 2.1976509451275463, "grad_norm": 3.36156964302063, "learning_rate": 9.997030777915211e-06, "loss": 0.1918, "num_input_tokens_seen": 25830944, "step": 11975 }, { "epoch": 2.1985685446870984, "grad_norm": 1.91466224193573, "learning_rate": 9.997003121563007e-06, "loss": 0.2357, "num_input_tokens_seen": 25842112, "step": 11980 }, { "epoch": 2.199486144246651, "grad_norm": 2.858340263366699, "learning_rate": 9.996975337045419e-06, "loss": 0.251, "num_input_tokens_seen": 25852832, "step": 11985 }, { "epoch": 2.200403743806203, "grad_norm": 5.81595516204834, "learning_rate": 9.996947424363157e-06, "loss": 0.3538, "num_input_tokens_seen": 25863808, "step": 11990 }, { "epoch": 2.201321343365755, "grad_norm": 8.034623146057129, "learning_rate": 9.996919383516938e-06, "loss": 0.5025, "num_input_tokens_seen": 25874016, "step": 11995 }, { "epoch": 2.2022389429253075, "grad_norm": 3.711890459060669, "learning_rate": 9.996891214507483e-06, "loss": 0.2201, "num_input_tokens_seen": 25885216, "step": 12000 }, { "epoch": 2.2031565424848596, "grad_norm": 7.563361167907715, "learning_rate": 9.99686291733551e-06, "loss": 0.2254, "num_input_tokens_seen": 25895488, "step": 12005 }, { "epoch": 2.2040741420444117, "grad_norm": 5.312833309173584, "learning_rate": 9.99683449200175e-06, "loss": 0.4908, "num_input_tokens_seen": 25905728, "step": 12010 }, { "epoch": 2.204991741603964, "grad_norm": 12.201173782348633, "learning_rate": 9.996805938506928e-06, "loss": 0.5009, "num_input_tokens_seen": 25917120, "step": 12015 }, { "epoch": 2.2059093411635162, "grad_norm": 4.5366129875183105, "learning_rate": 9.99677725685178e-06, "loss": 0.3868, "num_input_tokens_seen": 25927328, "step": 12020 }, { "epoch": 2.2068269407230683, "grad_norm": 2.810542583465576, "learning_rate": 9.996748447037039e-06, "loss": 0.2092, "num_input_tokens_seen": 25937152, "step": 12025 }, { "epoch": 2.207744540282621, "grad_norm": 2.0668418407440186, "learning_rate": 9.996719509063444e-06, "loss": 0.2582, "num_input_tokens_seen": 25948704, "step": 12030 }, { "epoch": 2.208662139842173, "grad_norm": 3.9702916145324707, "learning_rate": 9.996690442931737e-06, "loss": 0.1881, "num_input_tokens_seen": 25960192, "step": 12035 }, { "epoch": 2.209579739401725, "grad_norm": 3.2300496101379395, "learning_rate": 9.996661248642665e-06, "loss": 0.2518, "num_input_tokens_seen": 25971296, "step": 12040 }, { "epoch": 2.2104973389612774, "grad_norm": 13.027257919311523, "learning_rate": 9.996631926196977e-06, "loss": 0.5239, "num_input_tokens_seen": 25982016, "step": 12045 }, { "epoch": 2.2114149385208295, "grad_norm": 4.953002452850342, "learning_rate": 9.996602475595424e-06, "loss": 0.3327, "num_input_tokens_seen": 25992320, "step": 12050 }, { "epoch": 2.2123325380803815, "grad_norm": 2.1534523963928223, "learning_rate": 9.996572896838761e-06, "loss": 0.276, "num_input_tokens_seen": 26002336, "step": 12055 }, { "epoch": 2.213250137639934, "grad_norm": 3.722369432449341, "learning_rate": 9.996543189927747e-06, "loss": 0.2601, "num_input_tokens_seen": 26012288, "step": 12060 }, { "epoch": 2.214167737199486, "grad_norm": 6.734062194824219, "learning_rate": 9.996513354863144e-06, "loss": 0.2368, "num_input_tokens_seen": 26023168, "step": 12065 }, { "epoch": 2.215085336759038, "grad_norm": 2.426041603088379, "learning_rate": 9.996483391645719e-06, "loss": 0.2397, "num_input_tokens_seen": 26032672, "step": 12070 }, { "epoch": 2.2160029363185907, "grad_norm": 4.684819221496582, "learning_rate": 9.996453300276237e-06, "loss": 0.3078, "num_input_tokens_seen": 26043328, "step": 12075 }, { "epoch": 2.2169205358781428, "grad_norm": 3.640052556991577, "learning_rate": 9.996423080755472e-06, "loss": 0.2476, "num_input_tokens_seen": 26053088, "step": 12080 }, { "epoch": 2.217838135437695, "grad_norm": 3.5108418464660645, "learning_rate": 9.9963927330842e-06, "loss": 0.1843, "num_input_tokens_seen": 26063584, "step": 12085 }, { "epoch": 2.2187557349972473, "grad_norm": 11.23890209197998, "learning_rate": 9.996362257263195e-06, "loss": 0.2468, "num_input_tokens_seen": 26075072, "step": 12090 }, { "epoch": 2.2196733345567994, "grad_norm": 10.933547973632812, "learning_rate": 9.996331653293245e-06, "loss": 0.347, "num_input_tokens_seen": 26085440, "step": 12095 }, { "epoch": 2.2205909341163514, "grad_norm": 13.824237823486328, "learning_rate": 9.99630092117513e-06, "loss": 0.321, "num_input_tokens_seen": 26096192, "step": 12100 }, { "epoch": 2.221508533675904, "grad_norm": 4.683196067810059, "learning_rate": 9.99627006090964e-06, "loss": 0.4663, "num_input_tokens_seen": 26105408, "step": 12105 }, { "epoch": 2.222426133235456, "grad_norm": 7.489907741546631, "learning_rate": 9.996239072497568e-06, "loss": 0.2905, "num_input_tokens_seen": 26115008, "step": 12110 }, { "epoch": 2.223343732795008, "grad_norm": 2.1494154930114746, "learning_rate": 9.996207955939705e-06, "loss": 0.2468, "num_input_tokens_seen": 26124704, "step": 12115 }, { "epoch": 2.2242613323545606, "grad_norm": 6.973246097564697, "learning_rate": 9.996176711236854e-06, "loss": 0.31, "num_input_tokens_seen": 26135680, "step": 12120 }, { "epoch": 2.2251789319141126, "grad_norm": 1.7371315956115723, "learning_rate": 9.99614533838981e-06, "loss": 0.2681, "num_input_tokens_seen": 26146528, "step": 12125 }, { "epoch": 2.2260965314736647, "grad_norm": 3.4803638458251953, "learning_rate": 9.996113837399385e-06, "loss": 0.4605, "num_input_tokens_seen": 26157408, "step": 12130 }, { "epoch": 2.227014131033217, "grad_norm": 7.285716533660889, "learning_rate": 9.996082208266382e-06, "loss": 0.4057, "num_input_tokens_seen": 26169152, "step": 12135 }, { "epoch": 2.2279317305927693, "grad_norm": 6.366878509521484, "learning_rate": 9.996050450991614e-06, "loss": 0.3544, "num_input_tokens_seen": 26180064, "step": 12140 }, { "epoch": 2.2288493301523213, "grad_norm": 8.40652084350586, "learning_rate": 9.996018565575894e-06, "loss": 0.351, "num_input_tokens_seen": 26191392, "step": 12145 }, { "epoch": 2.229766929711874, "grad_norm": 5.348551273345947, "learning_rate": 9.995986552020043e-06, "loss": 0.321, "num_input_tokens_seen": 26201696, "step": 12150 }, { "epoch": 2.230684529271426, "grad_norm": 4.041742324829102, "learning_rate": 9.995954410324877e-06, "loss": 0.2879, "num_input_tokens_seen": 26211552, "step": 12155 }, { "epoch": 2.231602128830978, "grad_norm": 3.5477161407470703, "learning_rate": 9.995922140491225e-06, "loss": 0.2356, "num_input_tokens_seen": 26221600, "step": 12160 }, { "epoch": 2.2325197283905305, "grad_norm": 1.4037059545516968, "learning_rate": 9.995889742519914e-06, "loss": 0.2491, "num_input_tokens_seen": 26232544, "step": 12165 }, { "epoch": 2.2334373279500825, "grad_norm": 2.696073055267334, "learning_rate": 9.995857216411772e-06, "loss": 0.2696, "num_input_tokens_seen": 26244448, "step": 12170 }, { "epoch": 2.2343549275096346, "grad_norm": 1.7723044157028198, "learning_rate": 9.995824562167638e-06, "loss": 0.3755, "num_input_tokens_seen": 26255488, "step": 12175 }, { "epoch": 2.235272527069187, "grad_norm": 9.428211212158203, "learning_rate": 9.995791779788344e-06, "loss": 0.4225, "num_input_tokens_seen": 26266208, "step": 12180 }, { "epoch": 2.236190126628739, "grad_norm": 2.605386734008789, "learning_rate": 9.995758869274735e-06, "loss": 0.2637, "num_input_tokens_seen": 26277280, "step": 12185 }, { "epoch": 2.2371077261882912, "grad_norm": 4.387911319732666, "learning_rate": 9.995725830627654e-06, "loss": 0.2368, "num_input_tokens_seen": 26288608, "step": 12190 }, { "epoch": 2.2380253257478437, "grad_norm": 3.2363321781158447, "learning_rate": 9.995692663847949e-06, "loss": 0.2736, "num_input_tokens_seen": 26299616, "step": 12195 }, { "epoch": 2.238942925307396, "grad_norm": 7.864592552185059, "learning_rate": 9.995659368936468e-06, "loss": 0.2105, "num_input_tokens_seen": 26309824, "step": 12200 }, { "epoch": 2.239860524866948, "grad_norm": 4.213624954223633, "learning_rate": 9.995625945894067e-06, "loss": 0.284, "num_input_tokens_seen": 26320896, "step": 12205 }, { "epoch": 2.2407781244265004, "grad_norm": 11.364500999450684, "learning_rate": 9.995592394721603e-06, "loss": 0.3015, "num_input_tokens_seen": 26331936, "step": 12210 }, { "epoch": 2.2416957239860524, "grad_norm": 8.816023826599121, "learning_rate": 9.995558715419938e-06, "loss": 0.2568, "num_input_tokens_seen": 26342592, "step": 12215 }, { "epoch": 2.2426133235456045, "grad_norm": 14.022720336914062, "learning_rate": 9.995524907989933e-06, "loss": 0.3966, "num_input_tokens_seen": 26354528, "step": 12220 }, { "epoch": 2.243530923105157, "grad_norm": 3.6274943351745605, "learning_rate": 9.995490972432455e-06, "loss": 0.3813, "num_input_tokens_seen": 26365152, "step": 12225 }, { "epoch": 2.244448522664709, "grad_norm": 3.6693408489227295, "learning_rate": 9.995456908748378e-06, "loss": 0.2489, "num_input_tokens_seen": 26377312, "step": 12230 }, { "epoch": 2.245366122224261, "grad_norm": 2.9861748218536377, "learning_rate": 9.995422716938573e-06, "loss": 0.221, "num_input_tokens_seen": 26386880, "step": 12235 }, { "epoch": 2.2462837217838136, "grad_norm": 9.906658172607422, "learning_rate": 9.995388397003919e-06, "loss": 0.3451, "num_input_tokens_seen": 26397920, "step": 12240 }, { "epoch": 2.2472013213433657, "grad_norm": 2.818425178527832, "learning_rate": 9.995353948945292e-06, "loss": 0.2578, "num_input_tokens_seen": 26408992, "step": 12245 }, { "epoch": 2.2481189209029178, "grad_norm": 4.770135879516602, "learning_rate": 9.995319372763578e-06, "loss": 0.2336, "num_input_tokens_seen": 26420224, "step": 12250 }, { "epoch": 2.2490365204624703, "grad_norm": 1.786396861076355, "learning_rate": 9.995284668459668e-06, "loss": 0.2213, "num_input_tokens_seen": 26429600, "step": 12255 }, { "epoch": 2.2499541200220223, "grad_norm": 7.401514053344727, "learning_rate": 9.995249836034446e-06, "loss": 0.3981, "num_input_tokens_seen": 26440768, "step": 12260 }, { "epoch": 2.250871719581575, "grad_norm": 3.948071241378784, "learning_rate": 9.995214875488806e-06, "loss": 0.5535, "num_input_tokens_seen": 26452128, "step": 12265 }, { "epoch": 2.251789319141127, "grad_norm": 3.5318660736083984, "learning_rate": 9.99517978682365e-06, "loss": 0.3352, "num_input_tokens_seen": 26463552, "step": 12270 }, { "epoch": 2.252706918700679, "grad_norm": 3.2682945728302, "learning_rate": 9.99514457003987e-06, "loss": 0.3403, "num_input_tokens_seen": 26474496, "step": 12275 }, { "epoch": 2.2536245182602315, "grad_norm": 2.676725149154663, "learning_rate": 9.995109225138377e-06, "loss": 0.3155, "num_input_tokens_seen": 26486144, "step": 12280 }, { "epoch": 2.2545421178197835, "grad_norm": 3.4523961544036865, "learning_rate": 9.995073752120073e-06, "loss": 0.182, "num_input_tokens_seen": 26497856, "step": 12285 }, { "epoch": 2.2554597173793356, "grad_norm": 8.53576374053955, "learning_rate": 9.995038150985868e-06, "loss": 0.2872, "num_input_tokens_seen": 26508256, "step": 12290 }, { "epoch": 2.256377316938888, "grad_norm": 2.5271499156951904, "learning_rate": 9.995002421736677e-06, "loss": 0.5218, "num_input_tokens_seen": 26517792, "step": 12295 }, { "epoch": 2.25729491649844, "grad_norm": 9.825647354125977, "learning_rate": 9.994966564373416e-06, "loss": 0.4176, "num_input_tokens_seen": 26528288, "step": 12300 }, { "epoch": 2.2582125160579922, "grad_norm": 1.632481575012207, "learning_rate": 9.994930578897002e-06, "loss": 0.3832, "num_input_tokens_seen": 26538432, "step": 12305 }, { "epoch": 2.2591301156175447, "grad_norm": 2.0517547130584717, "learning_rate": 9.994894465308363e-06, "loss": 0.3244, "num_input_tokens_seen": 26548864, "step": 12310 }, { "epoch": 2.260047715177097, "grad_norm": 1.8692537546157837, "learning_rate": 9.99485822360842e-06, "loss": 0.3928, "num_input_tokens_seen": 26560096, "step": 12315 }, { "epoch": 2.260965314736649, "grad_norm": 1.7810734510421753, "learning_rate": 9.994821853798107e-06, "loss": 0.2977, "num_input_tokens_seen": 26571616, "step": 12320 }, { "epoch": 2.2618829142962014, "grad_norm": 6.537039279937744, "learning_rate": 9.994785355878352e-06, "loss": 0.2572, "num_input_tokens_seen": 26582048, "step": 12325 }, { "epoch": 2.2628005138557534, "grad_norm": 4.440645217895508, "learning_rate": 9.994748729850097e-06, "loss": 0.32, "num_input_tokens_seen": 26592352, "step": 12330 }, { "epoch": 2.2637181134153055, "grad_norm": 11.697132110595703, "learning_rate": 9.994711975714275e-06, "loss": 0.3514, "num_input_tokens_seen": 26603648, "step": 12335 }, { "epoch": 2.264635712974858, "grad_norm": 3.0878238677978516, "learning_rate": 9.994675093471833e-06, "loss": 0.3102, "num_input_tokens_seen": 26613216, "step": 12340 }, { "epoch": 2.26555331253441, "grad_norm": 1.8345688581466675, "learning_rate": 9.994638083123717e-06, "loss": 0.3295, "num_input_tokens_seen": 26624128, "step": 12345 }, { "epoch": 2.266470912093962, "grad_norm": 2.1990625858306885, "learning_rate": 9.994600944670876e-06, "loss": 0.2367, "num_input_tokens_seen": 26635616, "step": 12350 }, { "epoch": 2.2673885116535146, "grad_norm": 2.4336695671081543, "learning_rate": 9.99456367811426e-06, "loss": 0.2622, "num_input_tokens_seen": 26646816, "step": 12355 }, { "epoch": 2.2683061112130667, "grad_norm": 11.007613182067871, "learning_rate": 9.994526283454826e-06, "loss": 0.3207, "num_input_tokens_seen": 26657376, "step": 12360 }, { "epoch": 2.2692237107726188, "grad_norm": 14.459710121154785, "learning_rate": 9.994488760693535e-06, "loss": 0.402, "num_input_tokens_seen": 26668256, "step": 12365 }, { "epoch": 2.2701413103321713, "grad_norm": 3.2154572010040283, "learning_rate": 9.994451109831347e-06, "loss": 0.2597, "num_input_tokens_seen": 26678560, "step": 12370 }, { "epoch": 2.2710589098917233, "grad_norm": 3.160356044769287, "learning_rate": 9.994413330869229e-06, "loss": 0.2422, "num_input_tokens_seen": 26690208, "step": 12375 }, { "epoch": 2.2719765094512754, "grad_norm": 6.09308385848999, "learning_rate": 9.99437542380815e-06, "loss": 0.4292, "num_input_tokens_seen": 26701760, "step": 12380 }, { "epoch": 2.272894109010828, "grad_norm": 2.1251163482666016, "learning_rate": 9.994337388649082e-06, "loss": 0.4124, "num_input_tokens_seen": 26713120, "step": 12385 }, { "epoch": 2.27381170857038, "grad_norm": 5.383272647857666, "learning_rate": 9.994299225393e-06, "loss": 0.3279, "num_input_tokens_seen": 26724256, "step": 12390 }, { "epoch": 2.274729308129932, "grad_norm": 2.2280561923980713, "learning_rate": 9.994260934040884e-06, "loss": 0.2853, "num_input_tokens_seen": 26733824, "step": 12395 }, { "epoch": 2.2756469076894845, "grad_norm": 1.5317401885986328, "learning_rate": 9.994222514593715e-06, "loss": 0.2674, "num_input_tokens_seen": 26744896, "step": 12400 }, { "epoch": 2.2765645072490366, "grad_norm": 2.0860681533813477, "learning_rate": 9.99418396705248e-06, "loss": 0.2375, "num_input_tokens_seen": 26755840, "step": 12405 }, { "epoch": 2.2774821068085886, "grad_norm": 1.8072854280471802, "learning_rate": 9.994145291418165e-06, "loss": 0.2837, "num_input_tokens_seen": 26766496, "step": 12410 }, { "epoch": 2.278399706368141, "grad_norm": 6.1563520431518555, "learning_rate": 9.994106487691763e-06, "loss": 0.269, "num_input_tokens_seen": 26778336, "step": 12415 }, { "epoch": 2.279317305927693, "grad_norm": 12.106025695800781, "learning_rate": 9.994067555874272e-06, "loss": 0.2579, "num_input_tokens_seen": 26789728, "step": 12420 }, { "epoch": 2.2802349054872453, "grad_norm": 4.812803268432617, "learning_rate": 9.994028495966686e-06, "loss": 0.3208, "num_input_tokens_seen": 26800736, "step": 12425 }, { "epoch": 2.281152505046798, "grad_norm": 6.200011253356934, "learning_rate": 9.99398930797001e-06, "loss": 0.3789, "num_input_tokens_seen": 26810912, "step": 12430 }, { "epoch": 2.28207010460635, "grad_norm": 4.702571392059326, "learning_rate": 9.993949991885248e-06, "loss": 0.2657, "num_input_tokens_seen": 26821376, "step": 12435 }, { "epoch": 2.282987704165902, "grad_norm": 7.2095232009887695, "learning_rate": 9.99391054771341e-06, "loss": 0.3341, "num_input_tokens_seen": 26832512, "step": 12440 }, { "epoch": 2.2839053037254544, "grad_norm": 2.215034246444702, "learning_rate": 9.993870975455506e-06, "loss": 0.2312, "num_input_tokens_seen": 26843776, "step": 12445 }, { "epoch": 2.2848229032850065, "grad_norm": 5.408883094787598, "learning_rate": 9.99383127511255e-06, "loss": 0.1906, "num_input_tokens_seen": 26854272, "step": 12450 }, { "epoch": 2.2857405028445585, "grad_norm": 3.862459659576416, "learning_rate": 9.993791446685562e-06, "loss": 0.276, "num_input_tokens_seen": 26865792, "step": 12455 }, { "epoch": 2.286658102404111, "grad_norm": 7.947590351104736, "learning_rate": 9.993751490175563e-06, "loss": 0.2513, "num_input_tokens_seen": 26877088, "step": 12460 }, { "epoch": 2.287575701963663, "grad_norm": 5.699724197387695, "learning_rate": 9.993711405583579e-06, "loss": 0.2937, "num_input_tokens_seen": 26887616, "step": 12465 }, { "epoch": 2.288493301523215, "grad_norm": 13.200516700744629, "learning_rate": 9.993671192910635e-06, "loss": 0.4185, "num_input_tokens_seen": 26897056, "step": 12470 }, { "epoch": 2.2894109010827677, "grad_norm": 3.950531482696533, "learning_rate": 9.993630852157765e-06, "loss": 0.3285, "num_input_tokens_seen": 26906784, "step": 12475 }, { "epoch": 2.2903285006423197, "grad_norm": 9.388944625854492, "learning_rate": 9.993590383326003e-06, "loss": 0.2468, "num_input_tokens_seen": 26917536, "step": 12480 }, { "epoch": 2.291246100201872, "grad_norm": 6.603897571563721, "learning_rate": 9.993549786416389e-06, "loss": 0.4869, "num_input_tokens_seen": 26928448, "step": 12485 }, { "epoch": 2.2921636997614243, "grad_norm": 5.441727161407471, "learning_rate": 9.99350906142996e-06, "loss": 0.3686, "num_input_tokens_seen": 26938144, "step": 12490 }, { "epoch": 2.2930812993209764, "grad_norm": 4.477112293243408, "learning_rate": 9.993468208367765e-06, "loss": 0.2814, "num_input_tokens_seen": 26948608, "step": 12495 }, { "epoch": 2.2939988988805284, "grad_norm": 6.087998390197754, "learning_rate": 9.993427227230847e-06, "loss": 0.2815, "num_input_tokens_seen": 26959808, "step": 12500 }, { "epoch": 2.294916498440081, "grad_norm": 4.028037071228027, "learning_rate": 9.993386118020262e-06, "loss": 0.254, "num_input_tokens_seen": 26970336, "step": 12505 }, { "epoch": 2.295834097999633, "grad_norm": 3.1465651988983154, "learning_rate": 9.99334488073706e-06, "loss": 0.2633, "num_input_tokens_seen": 26979552, "step": 12510 }, { "epoch": 2.296751697559185, "grad_norm": 4.0218305587768555, "learning_rate": 9.993303515382302e-06, "loss": 0.2209, "num_input_tokens_seen": 26990624, "step": 12515 }, { "epoch": 2.2976692971187376, "grad_norm": 6.647977828979492, "learning_rate": 9.993262021957048e-06, "loss": 0.2572, "num_input_tokens_seen": 27002624, "step": 12520 }, { "epoch": 2.2985868966782896, "grad_norm": 8.734296798706055, "learning_rate": 9.993220400462362e-06, "loss": 0.254, "num_input_tokens_seen": 27014080, "step": 12525 }, { "epoch": 2.2995044962378417, "grad_norm": 5.907303333282471, "learning_rate": 9.993178650899312e-06, "loss": 0.2608, "num_input_tokens_seen": 27024576, "step": 12530 }, { "epoch": 2.300422095797394, "grad_norm": 6.936001300811768, "learning_rate": 9.993136773268967e-06, "loss": 0.3095, "num_input_tokens_seen": 27035360, "step": 12535 }, { "epoch": 2.3013396953569463, "grad_norm": 4.292476654052734, "learning_rate": 9.993094767572401e-06, "loss": 0.362, "num_input_tokens_seen": 27046304, "step": 12540 }, { "epoch": 2.3022572949164983, "grad_norm": 1.7947676181793213, "learning_rate": 9.993052633810697e-06, "loss": 0.2901, "num_input_tokens_seen": 27057056, "step": 12545 }, { "epoch": 2.303174894476051, "grad_norm": 18.1950740814209, "learning_rate": 9.993010371984929e-06, "loss": 0.2759, "num_input_tokens_seen": 27067968, "step": 12550 }, { "epoch": 2.304092494035603, "grad_norm": 4.723489284515381, "learning_rate": 9.992967982096183e-06, "loss": 0.3231, "num_input_tokens_seen": 27079424, "step": 12555 }, { "epoch": 2.305010093595155, "grad_norm": 4.002822399139404, "learning_rate": 9.992925464145548e-06, "loss": 0.3324, "num_input_tokens_seen": 27090464, "step": 12560 }, { "epoch": 2.3059276931547075, "grad_norm": 6.745168209075928, "learning_rate": 9.992882818134114e-06, "loss": 0.2425, "num_input_tokens_seen": 27101056, "step": 12565 }, { "epoch": 2.3068452927142595, "grad_norm": 6.674252986907959, "learning_rate": 9.99284004406297e-06, "loss": 0.3622, "num_input_tokens_seen": 27111776, "step": 12570 }, { "epoch": 2.3077628922738116, "grad_norm": 2.1682865619659424, "learning_rate": 9.99279714193322e-06, "loss": 0.3491, "num_input_tokens_seen": 27122688, "step": 12575 }, { "epoch": 2.308680491833364, "grad_norm": 5.7684326171875, "learning_rate": 9.992754111745961e-06, "loss": 0.2709, "num_input_tokens_seen": 27134080, "step": 12580 }, { "epoch": 2.309598091392916, "grad_norm": 5.400938510894775, "learning_rate": 9.992710953502298e-06, "loss": 0.3591, "num_input_tokens_seen": 27144768, "step": 12585 }, { "epoch": 2.3105156909524682, "grad_norm": 8.71768856048584, "learning_rate": 9.992667667203336e-06, "loss": 0.3409, "num_input_tokens_seen": 27155200, "step": 12590 }, { "epoch": 2.3114332905120207, "grad_norm": 3.9007577896118164, "learning_rate": 9.992624252850186e-06, "loss": 0.332, "num_input_tokens_seen": 27165184, "step": 12595 }, { "epoch": 2.312350890071573, "grad_norm": 8.32144832611084, "learning_rate": 9.992580710443962e-06, "loss": 0.2492, "num_input_tokens_seen": 27176288, "step": 12600 }, { "epoch": 2.313268489631125, "grad_norm": 9.79222297668457, "learning_rate": 9.992537039985782e-06, "loss": 0.2822, "num_input_tokens_seen": 27186496, "step": 12605 }, { "epoch": 2.3141860891906774, "grad_norm": 3.338376760482788, "learning_rate": 9.992493241476761e-06, "loss": 0.2215, "num_input_tokens_seen": 27195648, "step": 12610 }, { "epoch": 2.3151036887502294, "grad_norm": 1.7224441766738892, "learning_rate": 9.99244931491803e-06, "loss": 0.284, "num_input_tokens_seen": 27206112, "step": 12615 }, { "epoch": 2.3160212883097815, "grad_norm": 9.926156997680664, "learning_rate": 9.99240526031071e-06, "loss": 0.3881, "num_input_tokens_seen": 27215616, "step": 12620 }, { "epoch": 2.316938887869334, "grad_norm": 6.220623016357422, "learning_rate": 9.992361077655933e-06, "loss": 0.3961, "num_input_tokens_seen": 27227200, "step": 12625 }, { "epoch": 2.317856487428886, "grad_norm": 2.2166500091552734, "learning_rate": 9.99231676695483e-06, "loss": 0.4252, "num_input_tokens_seen": 27237792, "step": 12630 }, { "epoch": 2.318774086988438, "grad_norm": 1.811196208000183, "learning_rate": 9.992272328208542e-06, "loss": 0.2434, "num_input_tokens_seen": 27247904, "step": 12635 }, { "epoch": 2.3196916865479906, "grad_norm": 4.640689849853516, "learning_rate": 9.992227761418206e-06, "loss": 0.3229, "num_input_tokens_seen": 27258496, "step": 12640 }, { "epoch": 2.3206092861075427, "grad_norm": 3.9755468368530273, "learning_rate": 9.992183066584964e-06, "loss": 0.2962, "num_input_tokens_seen": 27270176, "step": 12645 }, { "epoch": 2.3215268856670948, "grad_norm": 2.9445481300354004, "learning_rate": 9.992138243709964e-06, "loss": 0.3502, "num_input_tokens_seen": 27280640, "step": 12650 }, { "epoch": 2.3224444852266473, "grad_norm": 10.016783714294434, "learning_rate": 9.992093292794355e-06, "loss": 0.2556, "num_input_tokens_seen": 27291232, "step": 12655 }, { "epoch": 2.3233620847861993, "grad_norm": 11.254190444946289, "learning_rate": 9.99204821383929e-06, "loss": 0.3161, "num_input_tokens_seen": 27302432, "step": 12660 }, { "epoch": 2.3242796843457514, "grad_norm": 2.583083152770996, "learning_rate": 9.992003006845924e-06, "loss": 0.2653, "num_input_tokens_seen": 27313056, "step": 12665 }, { "epoch": 2.325197283905304, "grad_norm": 4.198090553283691, "learning_rate": 9.991957671815418e-06, "loss": 0.2944, "num_input_tokens_seen": 27324736, "step": 12670 }, { "epoch": 2.326114883464856, "grad_norm": 6.296920299530029, "learning_rate": 9.991912208748937e-06, "loss": 0.4468, "num_input_tokens_seen": 27335616, "step": 12675 }, { "epoch": 2.327032483024408, "grad_norm": 3.6722986698150635, "learning_rate": 9.991866617647643e-06, "loss": 0.3271, "num_input_tokens_seen": 27346560, "step": 12680 }, { "epoch": 2.3279500825839605, "grad_norm": 3.988570213317871, "learning_rate": 9.991820898512706e-06, "loss": 0.3653, "num_input_tokens_seen": 27357472, "step": 12685 }, { "epoch": 2.3288676821435126, "grad_norm": 2.3352301120758057, "learning_rate": 9.9917750513453e-06, "loss": 0.2702, "num_input_tokens_seen": 27368096, "step": 12690 }, { "epoch": 2.3297852817030646, "grad_norm": 3.003335952758789, "learning_rate": 9.991729076146602e-06, "loss": 0.3342, "num_input_tokens_seen": 27379296, "step": 12695 }, { "epoch": 2.330702881262617, "grad_norm": 1.981420636177063, "learning_rate": 9.991682972917788e-06, "loss": 0.3602, "num_input_tokens_seen": 27389696, "step": 12700 }, { "epoch": 2.331620480822169, "grad_norm": 2.69101881980896, "learning_rate": 9.991636741660044e-06, "loss": 0.2596, "num_input_tokens_seen": 27400768, "step": 12705 }, { "epoch": 2.3325380803817213, "grad_norm": 2.2402706146240234, "learning_rate": 9.991590382374552e-06, "loss": 0.2592, "num_input_tokens_seen": 27410400, "step": 12710 }, { "epoch": 2.333455679941274, "grad_norm": 9.783516883850098, "learning_rate": 9.991543895062504e-06, "loss": 0.4864, "num_input_tokens_seen": 27418976, "step": 12715 }, { "epoch": 2.334373279500826, "grad_norm": 3.457798719406128, "learning_rate": 9.991497279725091e-06, "loss": 0.3742, "num_input_tokens_seen": 27430016, "step": 12720 }, { "epoch": 2.335290879060378, "grad_norm": 4.052579402923584, "learning_rate": 9.99145053636351e-06, "loss": 0.2755, "num_input_tokens_seen": 27440480, "step": 12725 }, { "epoch": 2.3362084786199304, "grad_norm": 3.036311626434326, "learning_rate": 9.991403664978959e-06, "loss": 0.2818, "num_input_tokens_seen": 27451616, "step": 12730 }, { "epoch": 2.3371260781794825, "grad_norm": 1.9598520994186401, "learning_rate": 9.991356665572639e-06, "loss": 0.2065, "num_input_tokens_seen": 27462144, "step": 12735 }, { "epoch": 2.3380436777390345, "grad_norm": 5.352352142333984, "learning_rate": 9.991309538145754e-06, "loss": 0.3936, "num_input_tokens_seen": 27473056, "step": 12740 }, { "epoch": 2.338961277298587, "grad_norm": 3.8851544857025146, "learning_rate": 9.991262282699518e-06, "loss": 0.3097, "num_input_tokens_seen": 27482912, "step": 12745 }, { "epoch": 2.339878876858139, "grad_norm": 3.477659225463867, "learning_rate": 9.99121489923514e-06, "loss": 0.223, "num_input_tokens_seen": 27494560, "step": 12750 }, { "epoch": 2.340796476417691, "grad_norm": 5.056783199310303, "learning_rate": 9.991167387753834e-06, "loss": 0.2655, "num_input_tokens_seen": 27505536, "step": 12755 }, { "epoch": 2.3417140759772437, "grad_norm": 5.349851608276367, "learning_rate": 9.99111974825682e-06, "loss": 0.3078, "num_input_tokens_seen": 27516288, "step": 12760 }, { "epoch": 2.3426316755367957, "grad_norm": 3.481581926345825, "learning_rate": 9.991071980745323e-06, "loss": 0.2862, "num_input_tokens_seen": 27526912, "step": 12765 }, { "epoch": 2.343549275096348, "grad_norm": 2.50462007522583, "learning_rate": 9.991024085220563e-06, "loss": 0.2527, "num_input_tokens_seen": 27537312, "step": 12770 }, { "epoch": 2.3444668746559003, "grad_norm": 2.4358599185943604, "learning_rate": 9.99097606168377e-06, "loss": 0.2526, "num_input_tokens_seen": 27547200, "step": 12775 }, { "epoch": 2.3453844742154524, "grad_norm": 4.824763298034668, "learning_rate": 9.990927910136178e-06, "loss": 0.3493, "num_input_tokens_seen": 27558240, "step": 12780 }, { "epoch": 2.3463020737750044, "grad_norm": 5.46177864074707, "learning_rate": 9.99087963057902e-06, "loss": 0.3092, "num_input_tokens_seen": 27568032, "step": 12785 }, { "epoch": 2.347219673334557, "grad_norm": 2.653320789337158, "learning_rate": 9.990831223013533e-06, "loss": 0.2616, "num_input_tokens_seen": 27579040, "step": 12790 }, { "epoch": 2.348137272894109, "grad_norm": 3.1508748531341553, "learning_rate": 9.990782687440962e-06, "loss": 0.1904, "num_input_tokens_seen": 27589568, "step": 12795 }, { "epoch": 2.349054872453661, "grad_norm": 3.519361972808838, "learning_rate": 9.99073402386255e-06, "loss": 0.3616, "num_input_tokens_seen": 27601760, "step": 12800 }, { "epoch": 2.3499724720132136, "grad_norm": 4.732645034790039, "learning_rate": 9.990685232279544e-06, "loss": 0.2666, "num_input_tokens_seen": 27612576, "step": 12805 }, { "epoch": 2.3508900715727656, "grad_norm": 2.398662567138672, "learning_rate": 9.990636312693197e-06, "loss": 0.2444, "num_input_tokens_seen": 27623840, "step": 12810 }, { "epoch": 2.3518076711323177, "grad_norm": 2.5057199001312256, "learning_rate": 9.990587265104765e-06, "loss": 0.224, "num_input_tokens_seen": 27634592, "step": 12815 }, { "epoch": 2.35272527069187, "grad_norm": 2.9754984378814697, "learning_rate": 9.990538089515503e-06, "loss": 0.2551, "num_input_tokens_seen": 27645664, "step": 12820 }, { "epoch": 2.3536428702514223, "grad_norm": 3.859323024749756, "learning_rate": 9.990488785926675e-06, "loss": 0.261, "num_input_tokens_seen": 27656992, "step": 12825 }, { "epoch": 2.3545604698109743, "grad_norm": 2.643808603286743, "learning_rate": 9.990439354339544e-06, "loss": 0.2006, "num_input_tokens_seen": 27669088, "step": 12830 }, { "epoch": 2.355478069370527, "grad_norm": 20.928003311157227, "learning_rate": 9.990389794755375e-06, "loss": 0.2539, "num_input_tokens_seen": 27679360, "step": 12835 }, { "epoch": 2.356395668930079, "grad_norm": 6.296214580535889, "learning_rate": 9.990340107175444e-06, "loss": 0.1704, "num_input_tokens_seen": 27690912, "step": 12840 }, { "epoch": 2.357313268489631, "grad_norm": 13.250069618225098, "learning_rate": 9.990290291601024e-06, "loss": 0.3595, "num_input_tokens_seen": 27702432, "step": 12845 }, { "epoch": 2.3582308680491835, "grad_norm": 9.38401985168457, "learning_rate": 9.990240348033392e-06, "loss": 0.2551, "num_input_tokens_seen": 27712896, "step": 12850 }, { "epoch": 2.3591484676087355, "grad_norm": 4.666481971740723, "learning_rate": 9.99019027647383e-06, "loss": 0.1733, "num_input_tokens_seen": 27724384, "step": 12855 }, { "epoch": 2.3600660671682876, "grad_norm": 23.595783233642578, "learning_rate": 9.99014007692362e-06, "loss": 0.3463, "num_input_tokens_seen": 27735616, "step": 12860 }, { "epoch": 2.36098366672784, "grad_norm": 13.600008010864258, "learning_rate": 9.990089749384053e-06, "loss": 0.4606, "num_input_tokens_seen": 27747264, "step": 12865 }, { "epoch": 2.361901266287392, "grad_norm": 3.811105251312256, "learning_rate": 9.990039293856415e-06, "loss": 0.3355, "num_input_tokens_seen": 27758080, "step": 12870 }, { "epoch": 2.3628188658469442, "grad_norm": 7.820487976074219, "learning_rate": 9.989988710342006e-06, "loss": 0.2614, "num_input_tokens_seen": 27768864, "step": 12875 }, { "epoch": 2.3637364654064967, "grad_norm": 2.858433485031128, "learning_rate": 9.989937998842119e-06, "loss": 0.1904, "num_input_tokens_seen": 27780736, "step": 12880 }, { "epoch": 2.364654064966049, "grad_norm": 19.992971420288086, "learning_rate": 9.989887159358056e-06, "loss": 0.453, "num_input_tokens_seen": 27791040, "step": 12885 }, { "epoch": 2.365571664525601, "grad_norm": 5.936960697174072, "learning_rate": 9.98983619189112e-06, "loss": 0.2181, "num_input_tokens_seen": 27800960, "step": 12890 }, { "epoch": 2.3664892640851534, "grad_norm": 11.263958930969238, "learning_rate": 9.989785096442622e-06, "loss": 0.292, "num_input_tokens_seen": 27812032, "step": 12895 }, { "epoch": 2.3674068636447054, "grad_norm": 8.290976524353027, "learning_rate": 9.989733873013867e-06, "loss": 0.4284, "num_input_tokens_seen": 27823872, "step": 12900 }, { "epoch": 2.3683244632042575, "grad_norm": 3.6941583156585693, "learning_rate": 9.989682521606171e-06, "loss": 0.2518, "num_input_tokens_seen": 27834368, "step": 12905 }, { "epoch": 2.36924206276381, "grad_norm": 2.4972877502441406, "learning_rate": 9.989631042220855e-06, "loss": 0.3043, "num_input_tokens_seen": 27843712, "step": 12910 }, { "epoch": 2.370159662323362, "grad_norm": 8.802135467529297, "learning_rate": 9.989579434859233e-06, "loss": 0.2562, "num_input_tokens_seen": 27853408, "step": 12915 }, { "epoch": 2.371077261882914, "grad_norm": 5.1571879386901855, "learning_rate": 9.989527699522634e-06, "loss": 0.2707, "num_input_tokens_seen": 27863328, "step": 12920 }, { "epoch": 2.3719948614424666, "grad_norm": 3.222245931625366, "learning_rate": 9.98947583621238e-06, "loss": 0.3243, "num_input_tokens_seen": 27874656, "step": 12925 }, { "epoch": 2.3729124610020187, "grad_norm": 5.268673419952393, "learning_rate": 9.989423844929806e-06, "loss": 0.4085, "num_input_tokens_seen": 27886432, "step": 12930 }, { "epoch": 2.3738300605615708, "grad_norm": 1.8857460021972656, "learning_rate": 9.989371725676242e-06, "loss": 0.1941, "num_input_tokens_seen": 27897056, "step": 12935 }, { "epoch": 2.3747476601211233, "grad_norm": 2.5053250789642334, "learning_rate": 9.989319478453028e-06, "loss": 0.3461, "num_input_tokens_seen": 27906496, "step": 12940 }, { "epoch": 2.3756652596806753, "grad_norm": 5.171850681304932, "learning_rate": 9.9892671032615e-06, "loss": 0.3163, "num_input_tokens_seen": 27917504, "step": 12945 }, { "epoch": 2.3765828592402274, "grad_norm": 15.582138061523438, "learning_rate": 9.989214600103003e-06, "loss": 0.3087, "num_input_tokens_seen": 27929536, "step": 12950 }, { "epoch": 2.37750045879978, "grad_norm": 3.5707945823669434, "learning_rate": 9.989161968978887e-06, "loss": 0.2494, "num_input_tokens_seen": 27939840, "step": 12955 }, { "epoch": 2.378418058359332, "grad_norm": 1.862558364868164, "learning_rate": 9.989109209890496e-06, "loss": 0.2597, "num_input_tokens_seen": 27949760, "step": 12960 }, { "epoch": 2.379335657918884, "grad_norm": 1.4592820405960083, "learning_rate": 9.989056322839188e-06, "loss": 0.2116, "num_input_tokens_seen": 27961088, "step": 12965 }, { "epoch": 2.3802532574784365, "grad_norm": 19.21781349182129, "learning_rate": 9.989003307826317e-06, "loss": 0.3182, "num_input_tokens_seen": 27972128, "step": 12970 }, { "epoch": 2.3811708570379886, "grad_norm": 7.3980278968811035, "learning_rate": 9.988950164853244e-06, "loss": 0.3406, "num_input_tokens_seen": 27983712, "step": 12975 }, { "epoch": 2.3820884565975406, "grad_norm": 7.792336463928223, "learning_rate": 9.988896893921331e-06, "loss": 0.431, "num_input_tokens_seen": 27993568, "step": 12980 }, { "epoch": 2.383006056157093, "grad_norm": 2.7363100051879883, "learning_rate": 9.988843495031944e-06, "loss": 0.2192, "num_input_tokens_seen": 28004320, "step": 12985 }, { "epoch": 2.383923655716645, "grad_norm": 1.6734743118286133, "learning_rate": 9.988789968186455e-06, "loss": 0.2192, "num_input_tokens_seen": 28015648, "step": 12990 }, { "epoch": 2.3848412552761973, "grad_norm": 4.162476539611816, "learning_rate": 9.988736313386234e-06, "loss": 0.3098, "num_input_tokens_seen": 28026336, "step": 12995 }, { "epoch": 2.38575885483575, "grad_norm": 3.391757011413574, "learning_rate": 9.988682530632659e-06, "loss": 0.2804, "num_input_tokens_seen": 28038432, "step": 13000 }, { "epoch": 2.386676454395302, "grad_norm": 12.635534286499023, "learning_rate": 9.988628619927108e-06, "loss": 0.3006, "num_input_tokens_seen": 28050080, "step": 13005 }, { "epoch": 2.387594053954854, "grad_norm": 6.318296432495117, "learning_rate": 9.988574581270965e-06, "loss": 0.303, "num_input_tokens_seen": 28060832, "step": 13010 }, { "epoch": 2.3885116535144064, "grad_norm": 4.723381996154785, "learning_rate": 9.988520414665615e-06, "loss": 0.3283, "num_input_tokens_seen": 28072480, "step": 13015 }, { "epoch": 2.3894292530739585, "grad_norm": 2.3344380855560303, "learning_rate": 9.988466120112448e-06, "loss": 0.4298, "num_input_tokens_seen": 28082016, "step": 13020 }, { "epoch": 2.3903468526335105, "grad_norm": 2.3796908855438232, "learning_rate": 9.988411697612858e-06, "loss": 0.2203, "num_input_tokens_seen": 28093056, "step": 13025 }, { "epoch": 2.391264452193063, "grad_norm": 3.040929079055786, "learning_rate": 9.988357147168237e-06, "loss": 0.2853, "num_input_tokens_seen": 28103648, "step": 13030 }, { "epoch": 2.392182051752615, "grad_norm": 1.7524811029434204, "learning_rate": 9.98830246877999e-06, "loss": 0.2342, "num_input_tokens_seen": 28114112, "step": 13035 }, { "epoch": 2.393099651312167, "grad_norm": 10.689569473266602, "learning_rate": 9.988247662449513e-06, "loss": 0.2619, "num_input_tokens_seen": 28124640, "step": 13040 }, { "epoch": 2.3940172508717197, "grad_norm": 2.05684494972229, "learning_rate": 9.988192728178214e-06, "loss": 0.3486, "num_input_tokens_seen": 28136096, "step": 13045 }, { "epoch": 2.3949348504312717, "grad_norm": 2.4668161869049072, "learning_rate": 9.988137665967503e-06, "loss": 0.2539, "num_input_tokens_seen": 28146528, "step": 13050 }, { "epoch": 2.395852449990824, "grad_norm": 13.001422882080078, "learning_rate": 9.988082475818794e-06, "loss": 0.2605, "num_input_tokens_seen": 28157440, "step": 13055 }, { "epoch": 2.3967700495503763, "grad_norm": 8.154723167419434, "learning_rate": 9.988027157733497e-06, "loss": 0.3588, "num_input_tokens_seen": 28168736, "step": 13060 }, { "epoch": 2.3976876491099284, "grad_norm": 2.9433839321136475, "learning_rate": 9.987971711713036e-06, "loss": 0.3444, "num_input_tokens_seen": 28179776, "step": 13065 }, { "epoch": 2.3986052486694804, "grad_norm": 9.2310791015625, "learning_rate": 9.987916137758832e-06, "loss": 0.3825, "num_input_tokens_seen": 28190336, "step": 13070 }, { "epoch": 2.399522848229033, "grad_norm": 2.3625848293304443, "learning_rate": 9.987860435872308e-06, "loss": 0.3563, "num_input_tokens_seen": 28200608, "step": 13075 }, { "epoch": 2.400440447788585, "grad_norm": 7.213968753814697, "learning_rate": 9.987804606054897e-06, "loss": 0.263, "num_input_tokens_seen": 28210432, "step": 13080 }, { "epoch": 2.401358047348137, "grad_norm": 6.786818027496338, "learning_rate": 9.987748648308024e-06, "loss": 0.3099, "num_input_tokens_seen": 28221920, "step": 13085 }, { "epoch": 2.4022756469076896, "grad_norm": 4.469240665435791, "learning_rate": 9.987692562633132e-06, "loss": 0.3332, "num_input_tokens_seen": 28232992, "step": 13090 }, { "epoch": 2.4031932464672416, "grad_norm": 2.347203493118286, "learning_rate": 9.987636349031655e-06, "loss": 0.2689, "num_input_tokens_seen": 28242368, "step": 13095 }, { "epoch": 2.4041108460267937, "grad_norm": 3.6020240783691406, "learning_rate": 9.987580007505035e-06, "loss": 0.2316, "num_input_tokens_seen": 28254176, "step": 13100 }, { "epoch": 2.405028445586346, "grad_norm": 6.919474124908447, "learning_rate": 9.987523538054717e-06, "loss": 0.3849, "num_input_tokens_seen": 28264384, "step": 13105 }, { "epoch": 2.4059460451458983, "grad_norm": 5.842159271240234, "learning_rate": 9.987466940682154e-06, "loss": 0.3104, "num_input_tokens_seen": 28274944, "step": 13110 }, { "epoch": 2.4068636447054503, "grad_norm": 7.272845268249512, "learning_rate": 9.98741021538879e-06, "loss": 0.2997, "num_input_tokens_seen": 28285088, "step": 13115 }, { "epoch": 2.407781244265003, "grad_norm": 4.959159851074219, "learning_rate": 9.987353362176086e-06, "loss": 0.2549, "num_input_tokens_seen": 28294368, "step": 13120 }, { "epoch": 2.408698843824555, "grad_norm": 3.662349224090576, "learning_rate": 9.987296381045497e-06, "loss": 0.2642, "num_input_tokens_seen": 28304352, "step": 13125 }, { "epoch": 2.409616443384107, "grad_norm": 3.2236602306365967, "learning_rate": 9.987239271998486e-06, "loss": 0.3269, "num_input_tokens_seen": 28314944, "step": 13130 }, { "epoch": 2.4105340429436595, "grad_norm": 10.095380783081055, "learning_rate": 9.987182035036516e-06, "loss": 0.2935, "num_input_tokens_seen": 28326016, "step": 13135 }, { "epoch": 2.4114516425032115, "grad_norm": 5.367080211639404, "learning_rate": 9.987124670161057e-06, "loss": 0.2036, "num_input_tokens_seen": 28336672, "step": 13140 }, { "epoch": 2.4123692420627636, "grad_norm": 3.7473037242889404, "learning_rate": 9.987067177373579e-06, "loss": 0.162, "num_input_tokens_seen": 28347392, "step": 13145 }, { "epoch": 2.413286841622316, "grad_norm": 7.863191604614258, "learning_rate": 9.987009556675558e-06, "loss": 0.3012, "num_input_tokens_seen": 28357728, "step": 13150 }, { "epoch": 2.414204441181868, "grad_norm": 12.089767456054688, "learning_rate": 9.986951808068472e-06, "loss": 0.2503, "num_input_tokens_seen": 28368032, "step": 13155 }, { "epoch": 2.4151220407414202, "grad_norm": 11.678547859191895, "learning_rate": 9.986893931553798e-06, "loss": 0.2719, "num_input_tokens_seen": 28379808, "step": 13160 }, { "epoch": 2.4160396403009727, "grad_norm": 11.279657363891602, "learning_rate": 9.986835927133028e-06, "loss": 0.2772, "num_input_tokens_seen": 28390368, "step": 13165 }, { "epoch": 2.416957239860525, "grad_norm": 15.983474731445312, "learning_rate": 9.986777794807641e-06, "loss": 0.3885, "num_input_tokens_seen": 28399712, "step": 13170 }, { "epoch": 2.417874839420077, "grad_norm": 6.577423095703125, "learning_rate": 9.986719534579135e-06, "loss": 0.2919, "num_input_tokens_seen": 28411136, "step": 13175 }, { "epoch": 2.4187924389796294, "grad_norm": 6.523882865905762, "learning_rate": 9.986661146449002e-06, "loss": 0.3619, "num_input_tokens_seen": 28421888, "step": 13180 }, { "epoch": 2.4197100385391814, "grad_norm": 5.904951095581055, "learning_rate": 9.986602630418737e-06, "loss": 0.3136, "num_input_tokens_seen": 28432416, "step": 13185 }, { "epoch": 2.4206276380987335, "grad_norm": 5.045835494995117, "learning_rate": 9.986543986489845e-06, "loss": 0.2874, "num_input_tokens_seen": 28443904, "step": 13190 }, { "epoch": 2.421545237658286, "grad_norm": 4.974370956420898, "learning_rate": 9.986485214663826e-06, "loss": 0.3351, "num_input_tokens_seen": 28456576, "step": 13195 }, { "epoch": 2.422462837217838, "grad_norm": 7.476100444793701, "learning_rate": 9.986426314942192e-06, "loss": 0.3279, "num_input_tokens_seen": 28468032, "step": 13200 }, { "epoch": 2.42338043677739, "grad_norm": 8.165719032287598, "learning_rate": 9.98636728732645e-06, "loss": 0.2443, "num_input_tokens_seen": 28479072, "step": 13205 }, { "epoch": 2.4242980363369426, "grad_norm": 6.6963911056518555, "learning_rate": 9.986308131818116e-06, "loss": 0.2602, "num_input_tokens_seen": 28489440, "step": 13210 }, { "epoch": 2.4252156358964947, "grad_norm": 3.903761863708496, "learning_rate": 9.986248848418706e-06, "loss": 0.2016, "num_input_tokens_seen": 28499488, "step": 13215 }, { "epoch": 2.4261332354560468, "grad_norm": 8.153085708618164, "learning_rate": 9.98618943712974e-06, "loss": 0.2994, "num_input_tokens_seen": 28510304, "step": 13220 }, { "epoch": 2.4270508350155993, "grad_norm": 2.0530688762664795, "learning_rate": 9.986129897952745e-06, "loss": 0.2639, "num_input_tokens_seen": 28522016, "step": 13225 }, { "epoch": 2.4279684345751513, "grad_norm": 18.68463706970215, "learning_rate": 9.986070230889244e-06, "loss": 0.3256, "num_input_tokens_seen": 28531936, "step": 13230 }, { "epoch": 2.4288860341347034, "grad_norm": 2.3493316173553467, "learning_rate": 9.986010435940771e-06, "loss": 0.169, "num_input_tokens_seen": 28543616, "step": 13235 }, { "epoch": 2.429803633694256, "grad_norm": 10.331038475036621, "learning_rate": 9.985950513108858e-06, "loss": 0.3694, "num_input_tokens_seen": 28554176, "step": 13240 }, { "epoch": 2.430721233253808, "grad_norm": 5.43052864074707, "learning_rate": 9.98589046239504e-06, "loss": 0.1786, "num_input_tokens_seen": 28564992, "step": 13245 }, { "epoch": 2.43163883281336, "grad_norm": 5.790275573730469, "learning_rate": 9.98583028380086e-06, "loss": 0.2219, "num_input_tokens_seen": 28575840, "step": 13250 }, { "epoch": 2.4325564323729125, "grad_norm": 5.317076206207275, "learning_rate": 9.98576997732786e-06, "loss": 0.2409, "num_input_tokens_seen": 28586496, "step": 13255 }, { "epoch": 2.4334740319324646, "grad_norm": 3.712230920791626, "learning_rate": 9.985709542977589e-06, "loss": 0.2724, "num_input_tokens_seen": 28597376, "step": 13260 }, { "epoch": 2.4343916314920166, "grad_norm": 4.736734390258789, "learning_rate": 9.985648980751595e-06, "loss": 0.4304, "num_input_tokens_seen": 28608672, "step": 13265 }, { "epoch": 2.435309231051569, "grad_norm": 4.827408313751221, "learning_rate": 9.98558829065143e-06, "loss": 0.2206, "num_input_tokens_seen": 28619008, "step": 13270 }, { "epoch": 2.436226830611121, "grad_norm": 2.2601757049560547, "learning_rate": 9.985527472678654e-06, "loss": 0.1481, "num_input_tokens_seen": 28630208, "step": 13275 }, { "epoch": 2.4371444301706733, "grad_norm": 10.72338581085205, "learning_rate": 9.985466526834823e-06, "loss": 0.5275, "num_input_tokens_seen": 28640544, "step": 13280 }, { "epoch": 2.438062029730226, "grad_norm": 13.894429206848145, "learning_rate": 9.985405453121505e-06, "loss": 0.31, "num_input_tokens_seen": 28652224, "step": 13285 }, { "epoch": 2.438979629289778, "grad_norm": 10.945907592773438, "learning_rate": 9.985344251540262e-06, "loss": 0.1963, "num_input_tokens_seen": 28662048, "step": 13290 }, { "epoch": 2.43989722884933, "grad_norm": 5.822862148284912, "learning_rate": 9.985282922092667e-06, "loss": 0.2741, "num_input_tokens_seen": 28673376, "step": 13295 }, { "epoch": 2.4408148284088824, "grad_norm": 5.052140235900879, "learning_rate": 9.98522146478029e-06, "loss": 0.2897, "num_input_tokens_seen": 28684704, "step": 13300 }, { "epoch": 2.4417324279684345, "grad_norm": 4.232924461364746, "learning_rate": 9.985159879604708e-06, "loss": 0.3016, "num_input_tokens_seen": 28696544, "step": 13305 }, { "epoch": 2.4426500275279865, "grad_norm": 1.9600752592086792, "learning_rate": 9.985098166567504e-06, "loss": 0.2178, "num_input_tokens_seen": 28707872, "step": 13310 }, { "epoch": 2.443567627087539, "grad_norm": 6.447902679443359, "learning_rate": 9.985036325670257e-06, "loss": 0.2712, "num_input_tokens_seen": 28718464, "step": 13315 }, { "epoch": 2.444485226647091, "grad_norm": 11.15108871459961, "learning_rate": 9.984974356914555e-06, "loss": 0.4398, "num_input_tokens_seen": 28727744, "step": 13320 }, { "epoch": 2.445402826206643, "grad_norm": 1.631394386291504, "learning_rate": 9.984912260301986e-06, "loss": 0.264, "num_input_tokens_seen": 28738848, "step": 13325 }, { "epoch": 2.4463204257661957, "grad_norm": 10.297477722167969, "learning_rate": 9.984850035834144e-06, "loss": 0.2664, "num_input_tokens_seen": 28750400, "step": 13330 }, { "epoch": 2.4472380253257477, "grad_norm": 4.387639999389648, "learning_rate": 9.984787683512624e-06, "loss": 0.2581, "num_input_tokens_seen": 28760768, "step": 13335 }, { "epoch": 2.4481556248853, "grad_norm": 8.242300987243652, "learning_rate": 9.984725203339025e-06, "loss": 0.387, "num_input_tokens_seen": 28773184, "step": 13340 }, { "epoch": 2.4490732244448523, "grad_norm": 2.3498432636260986, "learning_rate": 9.98466259531495e-06, "loss": 0.3678, "num_input_tokens_seen": 28783904, "step": 13345 }, { "epoch": 2.4499908240044044, "grad_norm": 5.065151691436768, "learning_rate": 9.984599859442009e-06, "loss": 0.4208, "num_input_tokens_seen": 28794784, "step": 13350 }, { "epoch": 2.4509084235639564, "grad_norm": 5.5509138107299805, "learning_rate": 9.984536995721803e-06, "loss": 0.3164, "num_input_tokens_seen": 28805056, "step": 13355 }, { "epoch": 2.451826023123509, "grad_norm": 7.455893516540527, "learning_rate": 9.984474004155948e-06, "loss": 0.3781, "num_input_tokens_seen": 28816224, "step": 13360 }, { "epoch": 2.452743622683061, "grad_norm": 2.8064353466033936, "learning_rate": 9.984410884746062e-06, "loss": 0.2361, "num_input_tokens_seen": 28826656, "step": 13365 }, { "epoch": 2.4536612222426135, "grad_norm": 4.789297580718994, "learning_rate": 9.984347637493761e-06, "loss": 0.2437, "num_input_tokens_seen": 28836608, "step": 13370 }, { "epoch": 2.4545788218021656, "grad_norm": 4.113411903381348, "learning_rate": 9.984284262400668e-06, "loss": 0.2114, "num_input_tokens_seen": 28847456, "step": 13375 }, { "epoch": 2.4554964213617176, "grad_norm": 4.576658248901367, "learning_rate": 9.984220759468409e-06, "loss": 0.2365, "num_input_tokens_seen": 28858176, "step": 13380 }, { "epoch": 2.45641402092127, "grad_norm": 7.172005653381348, "learning_rate": 9.984157128698612e-06, "loss": 0.4093, "num_input_tokens_seen": 28867584, "step": 13385 }, { "epoch": 2.457331620480822, "grad_norm": 11.07331371307373, "learning_rate": 9.98409337009291e-06, "loss": 0.2652, "num_input_tokens_seen": 28877504, "step": 13390 }, { "epoch": 2.4582492200403743, "grad_norm": 9.673174858093262, "learning_rate": 9.984029483652937e-06, "loss": 0.2462, "num_input_tokens_seen": 28888608, "step": 13395 }, { "epoch": 2.459166819599927, "grad_norm": 6.208278179168701, "learning_rate": 9.983965469380333e-06, "loss": 0.3175, "num_input_tokens_seen": 28899616, "step": 13400 }, { "epoch": 2.460084419159479, "grad_norm": 6.231985569000244, "learning_rate": 9.98390132727674e-06, "loss": 0.2789, "num_input_tokens_seen": 28910592, "step": 13405 }, { "epoch": 2.461002018719031, "grad_norm": 16.187047958374023, "learning_rate": 9.9838370573438e-06, "loss": 0.4113, "num_input_tokens_seen": 28920448, "step": 13410 }, { "epoch": 2.4619196182785834, "grad_norm": 7.408180236816406, "learning_rate": 9.983772659583166e-06, "loss": 0.2621, "num_input_tokens_seen": 28930720, "step": 13415 }, { "epoch": 2.4628372178381355, "grad_norm": 3.5174107551574707, "learning_rate": 9.983708133996486e-06, "loss": 0.28, "num_input_tokens_seen": 28941504, "step": 13420 }, { "epoch": 2.4637548173976875, "grad_norm": 4.9708967208862305, "learning_rate": 9.983643480585416e-06, "loss": 0.2425, "num_input_tokens_seen": 28952704, "step": 13425 }, { "epoch": 2.46467241695724, "grad_norm": 4.766818523406982, "learning_rate": 9.983578699351616e-06, "loss": 0.2548, "num_input_tokens_seen": 28964992, "step": 13430 }, { "epoch": 2.465590016516792, "grad_norm": 4.363246917724609, "learning_rate": 9.983513790296747e-06, "loss": 0.3571, "num_input_tokens_seen": 28974784, "step": 13435 }, { "epoch": 2.466507616076344, "grad_norm": 7.312572956085205, "learning_rate": 9.98344875342247e-06, "loss": 0.2503, "num_input_tokens_seen": 28985952, "step": 13440 }, { "epoch": 2.4674252156358967, "grad_norm": 9.148481369018555, "learning_rate": 9.983383588730457e-06, "loss": 0.2703, "num_input_tokens_seen": 28997440, "step": 13445 }, { "epoch": 2.4683428151954487, "grad_norm": 6.598511695861816, "learning_rate": 9.98331829622238e-06, "loss": 0.3261, "num_input_tokens_seen": 29009056, "step": 13450 }, { "epoch": 2.469260414755001, "grad_norm": 4.187319278717041, "learning_rate": 9.983252875899912e-06, "loss": 0.347, "num_input_tokens_seen": 29020416, "step": 13455 }, { "epoch": 2.4701780143145533, "grad_norm": 7.5152058601379395, "learning_rate": 9.983187327764729e-06, "loss": 0.386, "num_input_tokens_seen": 29031008, "step": 13460 }, { "epoch": 2.4710956138741054, "grad_norm": 4.992849349975586, "learning_rate": 9.983121651818518e-06, "loss": 0.2201, "num_input_tokens_seen": 29042144, "step": 13465 }, { "epoch": 2.4720132134336574, "grad_norm": 6.432671546936035, "learning_rate": 9.983055848062958e-06, "loss": 0.3128, "num_input_tokens_seen": 29051456, "step": 13470 }, { "epoch": 2.47293081299321, "grad_norm": 11.053695678710938, "learning_rate": 9.982989916499736e-06, "loss": 0.2931, "num_input_tokens_seen": 29063168, "step": 13475 }, { "epoch": 2.473848412552762, "grad_norm": 8.06045913696289, "learning_rate": 9.98292385713055e-06, "loss": 0.2229, "num_input_tokens_seen": 29073632, "step": 13480 }, { "epoch": 2.474766012112314, "grad_norm": 3.985880136489868, "learning_rate": 9.982857669957086e-06, "loss": 0.3152, "num_input_tokens_seen": 29084256, "step": 13485 }, { "epoch": 2.4756836116718666, "grad_norm": 3.208463668823242, "learning_rate": 9.982791354981048e-06, "loss": 0.2852, "num_input_tokens_seen": 29095456, "step": 13490 }, { "epoch": 2.4766012112314186, "grad_norm": 5.388087749481201, "learning_rate": 9.982724912204132e-06, "loss": 0.309, "num_input_tokens_seen": 29105312, "step": 13495 }, { "epoch": 2.4775188107909707, "grad_norm": 5.216841697692871, "learning_rate": 9.982658341628046e-06, "loss": 0.2995, "num_input_tokens_seen": 29117504, "step": 13500 }, { "epoch": 2.478436410350523, "grad_norm": 4.017959117889404, "learning_rate": 9.982591643254496e-06, "loss": 0.2536, "num_input_tokens_seen": 29127904, "step": 13505 }, { "epoch": 2.4793540099100753, "grad_norm": 3.5808401107788086, "learning_rate": 9.982524817085193e-06, "loss": 0.2925, "num_input_tokens_seen": 29138208, "step": 13510 }, { "epoch": 2.4802716094696273, "grad_norm": 5.660765647888184, "learning_rate": 9.98245786312185e-06, "loss": 0.2195, "num_input_tokens_seen": 29149920, "step": 13515 }, { "epoch": 2.48118920902918, "grad_norm": 6.227303504943848, "learning_rate": 9.982390781366185e-06, "loss": 0.3367, "num_input_tokens_seen": 29159744, "step": 13520 }, { "epoch": 2.482106808588732, "grad_norm": 7.522193908691406, "learning_rate": 9.982323571819919e-06, "loss": 0.2862, "num_input_tokens_seen": 29169664, "step": 13525 }, { "epoch": 2.483024408148284, "grad_norm": 3.266850233078003, "learning_rate": 9.982256234484775e-06, "loss": 0.3148, "num_input_tokens_seen": 29180352, "step": 13530 }, { "epoch": 2.4839420077078365, "grad_norm": 5.357372283935547, "learning_rate": 9.98218876936248e-06, "loss": 0.2167, "num_input_tokens_seen": 29191552, "step": 13535 }, { "epoch": 2.4848596072673885, "grad_norm": 9.232745170593262, "learning_rate": 9.982121176454764e-06, "loss": 0.2539, "num_input_tokens_seen": 29201824, "step": 13540 }, { "epoch": 2.4857772068269406, "grad_norm": 5.915244102478027, "learning_rate": 9.982053455763364e-06, "loss": 0.3721, "num_input_tokens_seen": 29211808, "step": 13545 }, { "epoch": 2.486694806386493, "grad_norm": 6.195248126983643, "learning_rate": 9.981985607290012e-06, "loss": 0.2491, "num_input_tokens_seen": 29222400, "step": 13550 }, { "epoch": 2.487612405946045, "grad_norm": 2.586341142654419, "learning_rate": 9.98191763103645e-06, "loss": 0.2612, "num_input_tokens_seen": 29233024, "step": 13555 }, { "epoch": 2.488530005505597, "grad_norm": 7.166314601898193, "learning_rate": 9.981849527004425e-06, "loss": 0.2089, "num_input_tokens_seen": 29243136, "step": 13560 }, { "epoch": 2.4894476050651497, "grad_norm": 6.195342540740967, "learning_rate": 9.981781295195678e-06, "loss": 0.2102, "num_input_tokens_seen": 29253536, "step": 13565 }, { "epoch": 2.490365204624702, "grad_norm": 11.67945384979248, "learning_rate": 9.981712935611964e-06, "loss": 0.3941, "num_input_tokens_seen": 29263968, "step": 13570 }, { "epoch": 2.491282804184254, "grad_norm": 5.694405555725098, "learning_rate": 9.981644448255033e-06, "loss": 0.2307, "num_input_tokens_seen": 29274112, "step": 13575 }, { "epoch": 2.4922004037438064, "grad_norm": 10.639710426330566, "learning_rate": 9.981575833126643e-06, "loss": 0.4378, "num_input_tokens_seen": 29284864, "step": 13580 }, { "epoch": 2.4931180033033584, "grad_norm": 13.257688522338867, "learning_rate": 9.981507090228553e-06, "loss": 0.2008, "num_input_tokens_seen": 29296672, "step": 13585 }, { "epoch": 2.4940356028629105, "grad_norm": 14.464656829833984, "learning_rate": 9.981438219562529e-06, "loss": 0.2661, "num_input_tokens_seen": 29307328, "step": 13590 }, { "epoch": 2.494953202422463, "grad_norm": 11.105281829833984, "learning_rate": 9.981369221130332e-06, "loss": 0.2334, "num_input_tokens_seen": 29318304, "step": 13595 }, { "epoch": 2.495870801982015, "grad_norm": 8.237306594848633, "learning_rate": 9.981300094933737e-06, "loss": 0.4141, "num_input_tokens_seen": 29329568, "step": 13600 }, { "epoch": 2.496788401541567, "grad_norm": 17.74395751953125, "learning_rate": 9.981230840974514e-06, "loss": 0.2879, "num_input_tokens_seen": 29341760, "step": 13605 }, { "epoch": 2.4977060011011196, "grad_norm": 7.615338325500488, "learning_rate": 9.98116145925444e-06, "loss": 0.3254, "num_input_tokens_seen": 29352224, "step": 13610 }, { "epoch": 2.4986236006606717, "grad_norm": 5.386433124542236, "learning_rate": 9.981091949775296e-06, "loss": 0.1802, "num_input_tokens_seen": 29364224, "step": 13615 }, { "epoch": 2.4995412002202237, "grad_norm": 8.032888412475586, "learning_rate": 9.981022312538862e-06, "loss": 0.3813, "num_input_tokens_seen": 29375008, "step": 13620 }, { "epoch": 2.5004587997797763, "grad_norm": 4.754059791564941, "learning_rate": 9.980952547546927e-06, "loss": 0.3267, "num_input_tokens_seen": 29386720, "step": 13625 }, { "epoch": 2.5013763993393283, "grad_norm": 7.637799263000488, "learning_rate": 9.980882654801278e-06, "loss": 0.3613, "num_input_tokens_seen": 29397600, "step": 13630 }, { "epoch": 2.5022939988988804, "grad_norm": 1.8790593147277832, "learning_rate": 9.980812634303708e-06, "loss": 0.2792, "num_input_tokens_seen": 29408960, "step": 13635 }, { "epoch": 2.503211598458433, "grad_norm": 10.867087364196777, "learning_rate": 9.980742486056014e-06, "loss": 0.3311, "num_input_tokens_seen": 29418656, "step": 13640 }, { "epoch": 2.504129198017985, "grad_norm": 7.83184289932251, "learning_rate": 9.980672210059994e-06, "loss": 0.328, "num_input_tokens_seen": 29428896, "step": 13645 }, { "epoch": 2.505046797577537, "grad_norm": 7.892043590545654, "learning_rate": 9.980601806317454e-06, "loss": 0.2743, "num_input_tokens_seen": 29439072, "step": 13650 }, { "epoch": 2.5059643971370895, "grad_norm": 9.105283737182617, "learning_rate": 9.980531274830194e-06, "loss": 0.2309, "num_input_tokens_seen": 29450688, "step": 13655 }, { "epoch": 2.5068819966966416, "grad_norm": 7.044553756713867, "learning_rate": 9.980460615600027e-06, "loss": 0.2377, "num_input_tokens_seen": 29461696, "step": 13660 }, { "epoch": 2.5077995962561936, "grad_norm": 7.699073791503906, "learning_rate": 9.980389828628765e-06, "loss": 0.273, "num_input_tokens_seen": 29473216, "step": 13665 }, { "epoch": 2.508717195815746, "grad_norm": 8.250393867492676, "learning_rate": 9.98031891391822e-06, "loss": 0.3491, "num_input_tokens_seen": 29484768, "step": 13670 }, { "epoch": 2.509634795375298, "grad_norm": 6.5742292404174805, "learning_rate": 9.980247871470217e-06, "loss": 0.2118, "num_input_tokens_seen": 29496256, "step": 13675 }, { "epoch": 2.5105523949348503, "grad_norm": 10.060647964477539, "learning_rate": 9.980176701286572e-06, "loss": 0.2634, "num_input_tokens_seen": 29507520, "step": 13680 }, { "epoch": 2.511469994494403, "grad_norm": 7.264251708984375, "learning_rate": 9.980105403369116e-06, "loss": 0.2843, "num_input_tokens_seen": 29517600, "step": 13685 }, { "epoch": 2.512387594053955, "grad_norm": 6.398011684417725, "learning_rate": 9.980033977719671e-06, "loss": 0.2394, "num_input_tokens_seen": 29529664, "step": 13690 }, { "epoch": 2.513305193613507, "grad_norm": 3.0212161540985107, "learning_rate": 9.979962424340076e-06, "loss": 0.2245, "num_input_tokens_seen": 29540256, "step": 13695 }, { "epoch": 2.5142227931730594, "grad_norm": 7.118316650390625, "learning_rate": 9.979890743232161e-06, "loss": 0.3268, "num_input_tokens_seen": 29550560, "step": 13700 }, { "epoch": 2.5151403927326115, "grad_norm": 10.294659614562988, "learning_rate": 9.979818934397768e-06, "loss": 0.2495, "num_input_tokens_seen": 29562176, "step": 13705 }, { "epoch": 2.5160579922921635, "grad_norm": 2.367295980453491, "learning_rate": 9.979746997838738e-06, "loss": 0.2566, "num_input_tokens_seen": 29572800, "step": 13710 }, { "epoch": 2.516975591851716, "grad_norm": 2.638087511062622, "learning_rate": 9.979674933556915e-06, "loss": 0.1707, "num_input_tokens_seen": 29584256, "step": 13715 }, { "epoch": 2.517893191411268, "grad_norm": 12.95111083984375, "learning_rate": 9.979602741554147e-06, "loss": 0.394, "num_input_tokens_seen": 29594720, "step": 13720 }, { "epoch": 2.51881079097082, "grad_norm": 7.628116130828857, "learning_rate": 9.979530421832286e-06, "loss": 0.2809, "num_input_tokens_seen": 29606112, "step": 13725 }, { "epoch": 2.5197283905303727, "grad_norm": 8.605569839477539, "learning_rate": 9.979457974393188e-06, "loss": 0.2083, "num_input_tokens_seen": 29617792, "step": 13730 }, { "epoch": 2.5206459900899247, "grad_norm": 6.495330333709717, "learning_rate": 9.97938539923871e-06, "loss": 0.1907, "num_input_tokens_seen": 29628960, "step": 13735 }, { "epoch": 2.5215635896494772, "grad_norm": 6.273823261260986, "learning_rate": 9.979312696370715e-06, "loss": 0.2108, "num_input_tokens_seen": 29638336, "step": 13740 }, { "epoch": 2.5224811892090293, "grad_norm": 8.786428451538086, "learning_rate": 9.979239865791065e-06, "loss": 0.471, "num_input_tokens_seen": 29649216, "step": 13745 }, { "epoch": 2.5233987887685814, "grad_norm": 3.5756664276123047, "learning_rate": 9.979166907501631e-06, "loss": 0.1244, "num_input_tokens_seen": 29660352, "step": 13750 }, { "epoch": 2.524316388328134, "grad_norm": 0.906157910823822, "learning_rate": 9.979093821504282e-06, "loss": 0.1306, "num_input_tokens_seen": 29670880, "step": 13755 }, { "epoch": 2.525233987887686, "grad_norm": 18.61587905883789, "learning_rate": 9.979020607800894e-06, "loss": 0.548, "num_input_tokens_seen": 29681376, "step": 13760 }, { "epoch": 2.526151587447238, "grad_norm": 17.17556381225586, "learning_rate": 9.978947266393345e-06, "loss": 0.3042, "num_input_tokens_seen": 29691840, "step": 13765 }, { "epoch": 2.5270691870067905, "grad_norm": 24.910015106201172, "learning_rate": 9.978873797283512e-06, "loss": 0.1391, "num_input_tokens_seen": 29702432, "step": 13770 }, { "epoch": 2.5279867865663426, "grad_norm": 27.528274536132812, "learning_rate": 9.978800200473285e-06, "loss": 0.3151, "num_input_tokens_seen": 29713120, "step": 13775 }, { "epoch": 2.5289043861258946, "grad_norm": 3.2503530979156494, "learning_rate": 9.978726475964548e-06, "loss": 0.2553, "num_input_tokens_seen": 29724064, "step": 13780 }, { "epoch": 2.529821985685447, "grad_norm": 9.371681213378906, "learning_rate": 9.978652623759194e-06, "loss": 0.1416, "num_input_tokens_seen": 29734560, "step": 13785 }, { "epoch": 2.530739585244999, "grad_norm": 11.493359565734863, "learning_rate": 9.978578643859118e-06, "loss": 0.3522, "num_input_tokens_seen": 29744736, "step": 13790 }, { "epoch": 2.5316571848045513, "grad_norm": 5.346070766448975, "learning_rate": 9.978504536266215e-06, "loss": 0.5067, "num_input_tokens_seen": 29755776, "step": 13795 }, { "epoch": 2.5325747843641038, "grad_norm": 2.013894557952881, "learning_rate": 9.978430300982387e-06, "loss": 0.2236, "num_input_tokens_seen": 29766848, "step": 13800 }, { "epoch": 2.533492383923656, "grad_norm": 13.274860382080078, "learning_rate": 9.978355938009535e-06, "loss": 0.2437, "num_input_tokens_seen": 29778560, "step": 13805 }, { "epoch": 2.534409983483208, "grad_norm": 18.925254821777344, "learning_rate": 9.978281447349572e-06, "loss": 0.3448, "num_input_tokens_seen": 29789248, "step": 13810 }, { "epoch": 2.5353275830427604, "grad_norm": 7.757248878479004, "learning_rate": 9.978206829004403e-06, "loss": 0.4051, "num_input_tokens_seen": 29800160, "step": 13815 }, { "epoch": 2.5362451826023125, "grad_norm": 21.68645477294922, "learning_rate": 9.978132082975947e-06, "loss": 0.4039, "num_input_tokens_seen": 29810176, "step": 13820 }, { "epoch": 2.5371627821618645, "grad_norm": 9.126877784729004, "learning_rate": 9.978057209266116e-06, "loss": 0.2538, "num_input_tokens_seen": 29821792, "step": 13825 }, { "epoch": 2.538080381721417, "grad_norm": 15.32947826385498, "learning_rate": 9.977982207876834e-06, "loss": 0.1974, "num_input_tokens_seen": 29831456, "step": 13830 }, { "epoch": 2.538997981280969, "grad_norm": 4.1308441162109375, "learning_rate": 9.977907078810023e-06, "loss": 0.3843, "num_input_tokens_seen": 29841312, "step": 13835 }, { "epoch": 2.539915580840521, "grad_norm": 24.647737503051758, "learning_rate": 9.977831822067611e-06, "loss": 0.3113, "num_input_tokens_seen": 29852960, "step": 13840 }, { "epoch": 2.5408331804000737, "grad_norm": 7.035993576049805, "learning_rate": 9.977756437651528e-06, "loss": 0.1817, "num_input_tokens_seen": 29864384, "step": 13845 }, { "epoch": 2.5417507799596257, "grad_norm": 19.81283187866211, "learning_rate": 9.977680925563706e-06, "loss": 0.292, "num_input_tokens_seen": 29875168, "step": 13850 }, { "epoch": 2.542668379519178, "grad_norm": 6.805136680603027, "learning_rate": 9.977605285806085e-06, "loss": 0.143, "num_input_tokens_seen": 29885664, "step": 13855 }, { "epoch": 2.5435859790787303, "grad_norm": 5.2117462158203125, "learning_rate": 9.977529518380603e-06, "loss": 0.1207, "num_input_tokens_seen": 29896448, "step": 13860 }, { "epoch": 2.5445035786382824, "grad_norm": 10.851902961730957, "learning_rate": 9.9774536232892e-06, "loss": 0.1756, "num_input_tokens_seen": 29907008, "step": 13865 }, { "epoch": 2.5454211781978344, "grad_norm": 17.851545333862305, "learning_rate": 9.977377600533828e-06, "loss": 0.2786, "num_input_tokens_seen": 29920128, "step": 13870 }, { "epoch": 2.546338777757387, "grad_norm": 9.681336402893066, "learning_rate": 9.977301450116435e-06, "loss": 0.3767, "num_input_tokens_seen": 29931744, "step": 13875 }, { "epoch": 2.547256377316939, "grad_norm": 7.4133124351501465, "learning_rate": 9.977225172038973e-06, "loss": 0.3001, "num_input_tokens_seen": 29941312, "step": 13880 }, { "epoch": 2.548173976876491, "grad_norm": 9.028303146362305, "learning_rate": 9.977148766303402e-06, "loss": 0.179, "num_input_tokens_seen": 29952512, "step": 13885 }, { "epoch": 2.5490915764360436, "grad_norm": 12.469161987304688, "learning_rate": 9.977072232911677e-06, "loss": 0.1867, "num_input_tokens_seen": 29964576, "step": 13890 }, { "epoch": 2.5500091759955956, "grad_norm": 29.531190872192383, "learning_rate": 9.976995571865762e-06, "loss": 0.1876, "num_input_tokens_seen": 29975360, "step": 13895 }, { "epoch": 2.5509267755551477, "grad_norm": 8.546117782592773, "learning_rate": 9.976918783167625e-06, "loss": 0.3288, "num_input_tokens_seen": 29985760, "step": 13900 }, { "epoch": 2.5518443751147, "grad_norm": 17.333332061767578, "learning_rate": 9.976841866819235e-06, "loss": 0.1293, "num_input_tokens_seen": 29997664, "step": 13905 }, { "epoch": 2.5527619746742523, "grad_norm": 15.665050506591797, "learning_rate": 9.976764822822566e-06, "loss": 0.2784, "num_input_tokens_seen": 30007808, "step": 13910 }, { "epoch": 2.5536795742338043, "grad_norm": 18.052135467529297, "learning_rate": 9.97668765117959e-06, "loss": 0.0777, "num_input_tokens_seen": 30017280, "step": 13915 }, { "epoch": 2.554597173793357, "grad_norm": 7.800328254699707, "learning_rate": 9.97661035189229e-06, "loss": 0.1359, "num_input_tokens_seen": 30027200, "step": 13920 }, { "epoch": 2.555514773352909, "grad_norm": 4.931029319763184, "learning_rate": 9.976532924962648e-06, "loss": 0.2584, "num_input_tokens_seen": 30039072, "step": 13925 }, { "epoch": 2.556432372912461, "grad_norm": 9.77905559539795, "learning_rate": 9.97645537039265e-06, "loss": 0.4363, "num_input_tokens_seen": 30048800, "step": 13930 }, { "epoch": 2.5573499724720135, "grad_norm": 8.320443153381348, "learning_rate": 9.976377688184282e-06, "loss": 0.3561, "num_input_tokens_seen": 30060032, "step": 13935 }, { "epoch": 2.5582675720315655, "grad_norm": 4.465849876403809, "learning_rate": 9.976299878339541e-06, "loss": 0.3439, "num_input_tokens_seen": 30071264, "step": 13940 }, { "epoch": 2.5591851715911176, "grad_norm": 31.658916473388672, "learning_rate": 9.97622194086042e-06, "loss": 0.1623, "num_input_tokens_seen": 30083424, "step": 13945 }, { "epoch": 2.56010277115067, "grad_norm": 3.5985732078552246, "learning_rate": 9.97614387574892e-06, "loss": 0.2137, "num_input_tokens_seen": 30094560, "step": 13950 }, { "epoch": 2.561020370710222, "grad_norm": 14.91562271118164, "learning_rate": 9.97606568300704e-06, "loss": 0.4829, "num_input_tokens_seen": 30106432, "step": 13955 }, { "epoch": 2.561937970269774, "grad_norm": 4.563133716583252, "learning_rate": 9.975987362636789e-06, "loss": 0.2691, "num_input_tokens_seen": 30117184, "step": 13960 }, { "epoch": 2.5628555698293267, "grad_norm": 21.61056900024414, "learning_rate": 9.975908914640174e-06, "loss": 0.4566, "num_input_tokens_seen": 30128928, "step": 13965 }, { "epoch": 2.563773169388879, "grad_norm": 7.8597917556762695, "learning_rate": 9.975830339019205e-06, "loss": 0.3634, "num_input_tokens_seen": 30139936, "step": 13970 }, { "epoch": 2.564690768948431, "grad_norm": 3.113830327987671, "learning_rate": 9.975751635775903e-06, "loss": 0.1235, "num_input_tokens_seen": 30151520, "step": 13975 }, { "epoch": 2.5656083685079834, "grad_norm": 1.5373573303222656, "learning_rate": 9.97567280491228e-06, "loss": 0.1781, "num_input_tokens_seen": 30161888, "step": 13980 }, { "epoch": 2.5665259680675354, "grad_norm": 15.491912841796875, "learning_rate": 9.975593846430364e-06, "loss": 0.2708, "num_input_tokens_seen": 30173696, "step": 13985 }, { "epoch": 2.5674435676270875, "grad_norm": 4.195549964904785, "learning_rate": 9.975514760332176e-06, "loss": 0.2486, "num_input_tokens_seen": 30184992, "step": 13990 }, { "epoch": 2.56836116718664, "grad_norm": 21.87791633605957, "learning_rate": 9.975435546619745e-06, "loss": 0.2822, "num_input_tokens_seen": 30196160, "step": 13995 }, { "epoch": 2.569278766746192, "grad_norm": 30.174009323120117, "learning_rate": 9.975356205295105e-06, "loss": 0.1881, "num_input_tokens_seen": 30207296, "step": 14000 }, { "epoch": 2.570196366305744, "grad_norm": 19.17310333251953, "learning_rate": 9.975276736360288e-06, "loss": 0.5645, "num_input_tokens_seen": 30218560, "step": 14005 }, { "epoch": 2.5711139658652966, "grad_norm": 16.385080337524414, "learning_rate": 9.975197139817336e-06, "loss": 0.2186, "num_input_tokens_seen": 30228352, "step": 14010 }, { "epoch": 2.5720315654248487, "grad_norm": 26.781599044799805, "learning_rate": 9.975117415668285e-06, "loss": 0.3914, "num_input_tokens_seen": 30238176, "step": 14015 }, { "epoch": 2.5729491649844007, "grad_norm": 8.329642295837402, "learning_rate": 9.975037563915186e-06, "loss": 0.2025, "num_input_tokens_seen": 30248992, "step": 14020 }, { "epoch": 2.5738667645439532, "grad_norm": 2.663440465927124, "learning_rate": 9.974957584560082e-06, "loss": 0.231, "num_input_tokens_seen": 30260352, "step": 14025 }, { "epoch": 2.5747843641035053, "grad_norm": 17.818206787109375, "learning_rate": 9.974877477605027e-06, "loss": 0.2166, "num_input_tokens_seen": 30271840, "step": 14030 }, { "epoch": 2.5757019636630574, "grad_norm": 47.106544494628906, "learning_rate": 9.974797243052077e-06, "loss": 0.3107, "num_input_tokens_seen": 30282464, "step": 14035 }, { "epoch": 2.57661956322261, "grad_norm": 14.398961067199707, "learning_rate": 9.974716880903286e-06, "loss": 0.3463, "num_input_tokens_seen": 30291712, "step": 14040 }, { "epoch": 2.577537162782162, "grad_norm": 20.464893341064453, "learning_rate": 9.974636391160717e-06, "loss": 0.4971, "num_input_tokens_seen": 30302464, "step": 14045 }, { "epoch": 2.578454762341714, "grad_norm": 9.641603469848633, "learning_rate": 9.974555773826434e-06, "loss": 0.1582, "num_input_tokens_seen": 30313536, "step": 14050 }, { "epoch": 2.5793723619012665, "grad_norm": 7.248761177062988, "learning_rate": 9.974475028902506e-06, "loss": 0.2394, "num_input_tokens_seen": 30324480, "step": 14055 }, { "epoch": 2.5802899614608186, "grad_norm": 24.137727737426758, "learning_rate": 9.974394156391004e-06, "loss": 0.2419, "num_input_tokens_seen": 30335456, "step": 14060 }, { "epoch": 2.5812075610203706, "grad_norm": 15.363225936889648, "learning_rate": 9.974313156294e-06, "loss": 0.4602, "num_input_tokens_seen": 30345920, "step": 14065 }, { "epoch": 2.582125160579923, "grad_norm": 32.72486877441406, "learning_rate": 9.974232028613575e-06, "loss": 0.2737, "num_input_tokens_seen": 30355200, "step": 14070 }, { "epoch": 2.583042760139475, "grad_norm": 16.011632919311523, "learning_rate": 9.974150773351808e-06, "loss": 0.2658, "num_input_tokens_seen": 30366400, "step": 14075 }, { "epoch": 2.5839603596990273, "grad_norm": 24.55779266357422, "learning_rate": 9.974069390510781e-06, "loss": 0.3685, "num_input_tokens_seen": 30376736, "step": 14080 }, { "epoch": 2.5848779592585798, "grad_norm": 23.742944717407227, "learning_rate": 9.973987880092583e-06, "loss": 0.2658, "num_input_tokens_seen": 30387072, "step": 14085 }, { "epoch": 2.585795558818132, "grad_norm": 4.591745376586914, "learning_rate": 9.973906242099305e-06, "loss": 0.1722, "num_input_tokens_seen": 30398400, "step": 14090 }, { "epoch": 2.586713158377684, "grad_norm": 1.743094801902771, "learning_rate": 9.973824476533043e-06, "loss": 0.2308, "num_input_tokens_seen": 30409312, "step": 14095 }, { "epoch": 2.5876307579372364, "grad_norm": 11.11384391784668, "learning_rate": 9.97374258339589e-06, "loss": 0.2133, "num_input_tokens_seen": 30419840, "step": 14100 }, { "epoch": 2.5885483574967885, "grad_norm": 9.747167587280273, "learning_rate": 9.973660562689948e-06, "loss": 0.3258, "num_input_tokens_seen": 30431712, "step": 14105 }, { "epoch": 2.5894659570563405, "grad_norm": 18.370235443115234, "learning_rate": 9.973578414417322e-06, "loss": 0.372, "num_input_tokens_seen": 30441504, "step": 14110 }, { "epoch": 2.590383556615893, "grad_norm": 3.2975306510925293, "learning_rate": 9.973496138580119e-06, "loss": 0.2278, "num_input_tokens_seen": 30452064, "step": 14115 }, { "epoch": 2.591301156175445, "grad_norm": 16.0482177734375, "learning_rate": 9.973413735180446e-06, "loss": 0.0933, "num_input_tokens_seen": 30462848, "step": 14120 }, { "epoch": 2.592218755734997, "grad_norm": 17.62576675415039, "learning_rate": 9.97333120422042e-06, "loss": 0.195, "num_input_tokens_seen": 30473728, "step": 14125 }, { "epoch": 2.5931363552945497, "grad_norm": 2.0831851959228516, "learning_rate": 9.973248545702156e-06, "loss": 0.2543, "num_input_tokens_seen": 30484704, "step": 14130 }, { "epoch": 2.5940539548541017, "grad_norm": 12.33173656463623, "learning_rate": 9.973165759627777e-06, "loss": 0.3226, "num_input_tokens_seen": 30495072, "step": 14135 }, { "epoch": 2.594971554413654, "grad_norm": 6.704598426818848, "learning_rate": 9.973082845999401e-06, "loss": 0.237, "num_input_tokens_seen": 30506816, "step": 14140 }, { "epoch": 2.5958891539732063, "grad_norm": 32.5750846862793, "learning_rate": 9.97299980481916e-06, "loss": 0.2035, "num_input_tokens_seen": 30518112, "step": 14145 }, { "epoch": 2.5968067535327584, "grad_norm": 34.44056701660156, "learning_rate": 9.972916636089178e-06, "loss": 0.311, "num_input_tokens_seen": 30528128, "step": 14150 }, { "epoch": 2.5977243530923104, "grad_norm": 31.09376335144043, "learning_rate": 9.972833339811594e-06, "loss": 0.1518, "num_input_tokens_seen": 30539424, "step": 14155 }, { "epoch": 2.598641952651863, "grad_norm": 2.253025531768799, "learning_rate": 9.972749915988542e-06, "loss": 0.1498, "num_input_tokens_seen": 30549504, "step": 14160 }, { "epoch": 2.599559552211415, "grad_norm": 8.514521598815918, "learning_rate": 9.972666364622162e-06, "loss": 0.3377, "num_input_tokens_seen": 30560384, "step": 14165 }, { "epoch": 2.600477151770967, "grad_norm": 1.5655674934387207, "learning_rate": 9.972582685714597e-06, "loss": 0.0493, "num_input_tokens_seen": 30569920, "step": 14170 }, { "epoch": 2.6013947513305196, "grad_norm": 9.584144592285156, "learning_rate": 9.972498879267992e-06, "loss": 0.4555, "num_input_tokens_seen": 30580672, "step": 14175 }, { "epoch": 2.6023123508900716, "grad_norm": 9.164206504821777, "learning_rate": 9.972414945284496e-06, "loss": 0.2807, "num_input_tokens_seen": 30590336, "step": 14180 }, { "epoch": 2.6032299504496237, "grad_norm": 1.533301591873169, "learning_rate": 9.972330883766266e-06, "loss": 0.1044, "num_input_tokens_seen": 30602496, "step": 14185 }, { "epoch": 2.604147550009176, "grad_norm": 32.17442321777344, "learning_rate": 9.972246694715452e-06, "loss": 0.2559, "num_input_tokens_seen": 30613568, "step": 14190 }, { "epoch": 2.6050651495687283, "grad_norm": 6.2684454917907715, "learning_rate": 9.97216237813422e-06, "loss": 0.1434, "num_input_tokens_seen": 30624128, "step": 14195 }, { "epoch": 2.6059827491282803, "grad_norm": 41.46134567260742, "learning_rate": 9.972077934024728e-06, "loss": 0.3604, "num_input_tokens_seen": 30634752, "step": 14200 }, { "epoch": 2.606900348687833, "grad_norm": 3.183441400527954, "learning_rate": 9.971993362389143e-06, "loss": 0.1395, "num_input_tokens_seen": 30644992, "step": 14205 }, { "epoch": 2.607817948247385, "grad_norm": 20.858190536499023, "learning_rate": 9.971908663229632e-06, "loss": 0.1733, "num_input_tokens_seen": 30655680, "step": 14210 }, { "epoch": 2.608735547806937, "grad_norm": 2.3790292739868164, "learning_rate": 9.971823836548373e-06, "loss": 0.1722, "num_input_tokens_seen": 30667200, "step": 14215 }, { "epoch": 2.6096531473664895, "grad_norm": 24.444623947143555, "learning_rate": 9.971738882347535e-06, "loss": 0.1501, "num_input_tokens_seen": 30677504, "step": 14220 }, { "epoch": 2.6105707469260415, "grad_norm": 16.502986907958984, "learning_rate": 9.971653800629302e-06, "loss": 0.6617, "num_input_tokens_seen": 30688512, "step": 14225 }, { "epoch": 2.6114883464855936, "grad_norm": 24.132522583007812, "learning_rate": 9.971568591395855e-06, "loss": 0.4158, "num_input_tokens_seen": 30700320, "step": 14230 }, { "epoch": 2.612405946045146, "grad_norm": 21.29383659362793, "learning_rate": 9.971483254649378e-06, "loss": 0.3593, "num_input_tokens_seen": 30711328, "step": 14235 }, { "epoch": 2.613323545604698, "grad_norm": 11.905174255371094, "learning_rate": 9.971397790392062e-06, "loss": 0.1892, "num_input_tokens_seen": 30722976, "step": 14240 }, { "epoch": 2.61424114516425, "grad_norm": 5.7010626792907715, "learning_rate": 9.971312198626096e-06, "loss": 0.2329, "num_input_tokens_seen": 30733408, "step": 14245 }, { "epoch": 2.6151587447238027, "grad_norm": 20.713031768798828, "learning_rate": 9.971226479353675e-06, "loss": 0.3082, "num_input_tokens_seen": 30744224, "step": 14250 }, { "epoch": 2.616076344283355, "grad_norm": 33.13462829589844, "learning_rate": 9.971140632577003e-06, "loss": 0.1237, "num_input_tokens_seen": 30754336, "step": 14255 }, { "epoch": 2.616993943842907, "grad_norm": 1.0116404294967651, "learning_rate": 9.971054658298276e-06, "loss": 0.1126, "num_input_tokens_seen": 30765504, "step": 14260 }, { "epoch": 2.6179115434024594, "grad_norm": 10.184673309326172, "learning_rate": 9.970968556519702e-06, "loss": 0.2072, "num_input_tokens_seen": 30776000, "step": 14265 }, { "epoch": 2.6188291429620114, "grad_norm": 0.27889537811279297, "learning_rate": 9.97088232724349e-06, "loss": 0.2928, "num_input_tokens_seen": 30786784, "step": 14270 }, { "epoch": 2.6197467425215635, "grad_norm": 24.222209930419922, "learning_rate": 9.97079597047185e-06, "loss": 0.3282, "num_input_tokens_seen": 30797024, "step": 14275 }, { "epoch": 2.620664342081116, "grad_norm": 95.65702056884766, "learning_rate": 9.970709486206997e-06, "loss": 0.2114, "num_input_tokens_seen": 30808704, "step": 14280 }, { "epoch": 2.621581941640668, "grad_norm": 0.3931659758090973, "learning_rate": 9.97062287445115e-06, "loss": 0.1736, "num_input_tokens_seen": 30819776, "step": 14285 }, { "epoch": 2.62249954120022, "grad_norm": 12.842304229736328, "learning_rate": 9.97053613520653e-06, "loss": 0.0732, "num_input_tokens_seen": 30830048, "step": 14290 }, { "epoch": 2.6234171407597726, "grad_norm": 6.65300989151001, "learning_rate": 9.970449268475362e-06, "loss": 0.4341, "num_input_tokens_seen": 30841184, "step": 14295 }, { "epoch": 2.6243347403193247, "grad_norm": 0.16851551830768585, "learning_rate": 9.970362274259873e-06, "loss": 0.2497, "num_input_tokens_seen": 30853024, "step": 14300 }, { "epoch": 2.6252523398788767, "grad_norm": 50.75577163696289, "learning_rate": 9.970275152562296e-06, "loss": 0.4467, "num_input_tokens_seen": 30862976, "step": 14305 }, { "epoch": 2.6261699394384292, "grad_norm": 28.165895462036133, "learning_rate": 9.970187903384863e-06, "loss": 0.3628, "num_input_tokens_seen": 30872672, "step": 14310 }, { "epoch": 2.6270875389979813, "grad_norm": 4.792634963989258, "learning_rate": 9.970100526729815e-06, "loss": 0.2342, "num_input_tokens_seen": 30883808, "step": 14315 }, { "epoch": 2.6280051385575334, "grad_norm": 0.19421085715293884, "learning_rate": 9.97001302259939e-06, "loss": 0.0249, "num_input_tokens_seen": 30894880, "step": 14320 }, { "epoch": 2.628922738117086, "grad_norm": 17.650606155395508, "learning_rate": 9.969925390995835e-06, "loss": 0.1892, "num_input_tokens_seen": 30905280, "step": 14325 }, { "epoch": 2.629840337676638, "grad_norm": 14.296913146972656, "learning_rate": 9.969837631921395e-06, "loss": 0.1439, "num_input_tokens_seen": 30914880, "step": 14330 }, { "epoch": 2.63075793723619, "grad_norm": 0.897050678730011, "learning_rate": 9.969749745378324e-06, "loss": 0.265, "num_input_tokens_seen": 30924512, "step": 14335 }, { "epoch": 2.6316755367957425, "grad_norm": 14.841300964355469, "learning_rate": 9.969661731368874e-06, "loss": 0.4095, "num_input_tokens_seen": 30935296, "step": 14340 }, { "epoch": 2.6325931363552946, "grad_norm": 21.419492721557617, "learning_rate": 9.969573589895303e-06, "loss": 0.2008, "num_input_tokens_seen": 30945344, "step": 14345 }, { "epoch": 2.6335107359148466, "grad_norm": 31.51357078552246, "learning_rate": 9.969485320959871e-06, "loss": 0.1571, "num_input_tokens_seen": 30956416, "step": 14350 }, { "epoch": 2.634428335474399, "grad_norm": 93.99176025390625, "learning_rate": 9.969396924564843e-06, "loss": 0.3696, "num_input_tokens_seen": 30967264, "step": 14355 }, { "epoch": 2.635345935033951, "grad_norm": 7.013670921325684, "learning_rate": 9.969308400712485e-06, "loss": 0.2653, "num_input_tokens_seen": 30976992, "step": 14360 }, { "epoch": 2.6362635345935033, "grad_norm": 1.5490474700927734, "learning_rate": 9.969219749405068e-06, "loss": 0.3542, "num_input_tokens_seen": 30987136, "step": 14365 }, { "epoch": 2.6371811341530558, "grad_norm": 0.5298504829406738, "learning_rate": 9.969130970644868e-06, "loss": 0.0288, "num_input_tokens_seen": 30998400, "step": 14370 }, { "epoch": 2.638098733712608, "grad_norm": 30.36534881591797, "learning_rate": 9.969042064434158e-06, "loss": 0.293, "num_input_tokens_seen": 31009088, "step": 14375 }, { "epoch": 2.63901633327216, "grad_norm": 22.50054359436035, "learning_rate": 9.968953030775221e-06, "loss": 0.619, "num_input_tokens_seen": 31019712, "step": 14380 }, { "epoch": 2.6399339328317124, "grad_norm": 43.04838180541992, "learning_rate": 9.96886386967034e-06, "loss": 0.3671, "num_input_tokens_seen": 31031648, "step": 14385 }, { "epoch": 2.6408515323912645, "grad_norm": 19.72381019592285, "learning_rate": 9.968774581121801e-06, "loss": 0.2189, "num_input_tokens_seen": 31042848, "step": 14390 }, { "epoch": 2.6417691319508165, "grad_norm": 7.507689952850342, "learning_rate": 9.968685165131896e-06, "loss": 0.1623, "num_input_tokens_seen": 31054016, "step": 14395 }, { "epoch": 2.642686731510369, "grad_norm": 2.98409104347229, "learning_rate": 9.968595621702916e-06, "loss": 0.2822, "num_input_tokens_seen": 31064896, "step": 14400 }, { "epoch": 2.643604331069921, "grad_norm": 16.465595245361328, "learning_rate": 9.968505950837162e-06, "loss": 0.3796, "num_input_tokens_seen": 31074048, "step": 14405 }, { "epoch": 2.644521930629473, "grad_norm": 13.399805068969727, "learning_rate": 9.968416152536929e-06, "loss": 0.419, "num_input_tokens_seen": 31084576, "step": 14410 }, { "epoch": 2.6454395301890257, "grad_norm": 2.819363832473755, "learning_rate": 9.96832622680452e-06, "loss": 0.2608, "num_input_tokens_seen": 31094880, "step": 14415 }, { "epoch": 2.6463571297485777, "grad_norm": 7.844508171081543, "learning_rate": 9.968236173642244e-06, "loss": 0.307, "num_input_tokens_seen": 31105408, "step": 14420 }, { "epoch": 2.64727472930813, "grad_norm": 16.648792266845703, "learning_rate": 9.968145993052413e-06, "loss": 0.0936, "num_input_tokens_seen": 31116064, "step": 14425 }, { "epoch": 2.6481923288676823, "grad_norm": 0.40695685148239136, "learning_rate": 9.968055685037336e-06, "loss": 0.0771, "num_input_tokens_seen": 31127104, "step": 14430 }, { "epoch": 2.6491099284272344, "grad_norm": 21.305259704589844, "learning_rate": 9.96796524959933e-06, "loss": 0.1555, "num_input_tokens_seen": 31137376, "step": 14435 }, { "epoch": 2.6500275279867864, "grad_norm": 15.881840705871582, "learning_rate": 9.967874686740716e-06, "loss": 0.1014, "num_input_tokens_seen": 31147616, "step": 14440 }, { "epoch": 2.650945127546339, "grad_norm": 15.52011489868164, "learning_rate": 9.967783996463815e-06, "loss": 0.0664, "num_input_tokens_seen": 31158208, "step": 14445 }, { "epoch": 2.651862727105891, "grad_norm": 11.643836975097656, "learning_rate": 9.967693178770952e-06, "loss": 0.3121, "num_input_tokens_seen": 31169824, "step": 14450 }, { "epoch": 2.652780326665443, "grad_norm": 13.73421859741211, "learning_rate": 9.967602233664462e-06, "loss": 0.2312, "num_input_tokens_seen": 31180960, "step": 14455 }, { "epoch": 2.6536979262249956, "grad_norm": 22.633996963500977, "learning_rate": 9.96751116114667e-06, "loss": 0.2334, "num_input_tokens_seen": 31192192, "step": 14460 }, { "epoch": 2.6546155257845476, "grad_norm": 19.42755889892578, "learning_rate": 9.967419961219918e-06, "loss": 0.4777, "num_input_tokens_seen": 31203488, "step": 14465 }, { "epoch": 2.6555331253440997, "grad_norm": 18.292133331298828, "learning_rate": 9.967328633886542e-06, "loss": 0.2281, "num_input_tokens_seen": 31216064, "step": 14470 }, { "epoch": 2.656450724903652, "grad_norm": 15.778483390808105, "learning_rate": 9.967237179148886e-06, "loss": 0.4418, "num_input_tokens_seen": 31227648, "step": 14475 }, { "epoch": 2.6573683244632043, "grad_norm": 5.271329402923584, "learning_rate": 9.967145597009295e-06, "loss": 0.2511, "num_input_tokens_seen": 31238528, "step": 14480 }, { "epoch": 2.6582859240227563, "grad_norm": 16.09749984741211, "learning_rate": 9.967053887470117e-06, "loss": 0.2771, "num_input_tokens_seen": 31249696, "step": 14485 }, { "epoch": 2.659203523582309, "grad_norm": 4.332108974456787, "learning_rate": 9.966962050533705e-06, "loss": 0.2243, "num_input_tokens_seen": 31260384, "step": 14490 }, { "epoch": 2.660121123141861, "grad_norm": 1.3441016674041748, "learning_rate": 9.966870086202413e-06, "loss": 0.2601, "num_input_tokens_seen": 31271296, "step": 14495 }, { "epoch": 2.661038722701413, "grad_norm": 20.32262420654297, "learning_rate": 9.966777994478605e-06, "loss": 0.2396, "num_input_tokens_seen": 31282464, "step": 14500 }, { "epoch": 2.6619563222609655, "grad_norm": 32.979339599609375, "learning_rate": 9.966685775364637e-06, "loss": 0.2863, "num_input_tokens_seen": 31292736, "step": 14505 }, { "epoch": 2.6628739218205175, "grad_norm": 7.262602806091309, "learning_rate": 9.966593428862876e-06, "loss": 0.4295, "num_input_tokens_seen": 31304064, "step": 14510 }, { "epoch": 2.6637915213800696, "grad_norm": 1.6960577964782715, "learning_rate": 9.966500954975692e-06, "loss": 0.2686, "num_input_tokens_seen": 31314656, "step": 14515 }, { "epoch": 2.664709120939622, "grad_norm": 1.5849599838256836, "learning_rate": 9.966408353705455e-06, "loss": 0.2066, "num_input_tokens_seen": 31325952, "step": 14520 }, { "epoch": 2.665626720499174, "grad_norm": 18.95607566833496, "learning_rate": 9.966315625054542e-06, "loss": 0.1546, "num_input_tokens_seen": 31336608, "step": 14525 }, { "epoch": 2.666544320058726, "grad_norm": 17.666051864624023, "learning_rate": 9.966222769025329e-06, "loss": 0.0456, "num_input_tokens_seen": 31346944, "step": 14530 }, { "epoch": 2.6674619196182787, "grad_norm": 18.002851486206055, "learning_rate": 9.966129785620201e-06, "loss": 0.1516, "num_input_tokens_seen": 31357472, "step": 14535 }, { "epoch": 2.668379519177831, "grad_norm": 11.759360313415527, "learning_rate": 9.96603667484154e-06, "loss": 0.2345, "num_input_tokens_seen": 31368512, "step": 14540 }, { "epoch": 2.669297118737383, "grad_norm": 27.581289291381836, "learning_rate": 9.965943436691734e-06, "loss": 0.1695, "num_input_tokens_seen": 31379040, "step": 14545 }, { "epoch": 2.6702147182969354, "grad_norm": 12.980201721191406, "learning_rate": 9.965850071173177e-06, "loss": 0.1897, "num_input_tokens_seen": 31388960, "step": 14550 }, { "epoch": 2.6711323178564874, "grad_norm": 14.36524772644043, "learning_rate": 9.96575657828826e-06, "loss": 0.1312, "num_input_tokens_seen": 31399296, "step": 14555 }, { "epoch": 2.6720499174160395, "grad_norm": 0.2766430974006653, "learning_rate": 9.965662958039384e-06, "loss": 0.1472, "num_input_tokens_seen": 31410816, "step": 14560 }, { "epoch": 2.672967516975592, "grad_norm": 0.44442978501319885, "learning_rate": 9.96556921042895e-06, "loss": 0.2777, "num_input_tokens_seen": 31422208, "step": 14565 }, { "epoch": 2.673885116535144, "grad_norm": 1.8670578002929688, "learning_rate": 9.96547533545936e-06, "loss": 0.4177, "num_input_tokens_seen": 31432416, "step": 14570 }, { "epoch": 2.674802716094696, "grad_norm": 14.618424415588379, "learning_rate": 9.965381333133024e-06, "loss": 0.3937, "num_input_tokens_seen": 31444064, "step": 14575 }, { "epoch": 2.6757203156542486, "grad_norm": 9.007518768310547, "learning_rate": 9.965287203452354e-06, "loss": 0.0647, "num_input_tokens_seen": 31455136, "step": 14580 }, { "epoch": 2.6766379152138007, "grad_norm": 20.8109130859375, "learning_rate": 9.965192946419762e-06, "loss": 0.2092, "num_input_tokens_seen": 31466784, "step": 14585 }, { "epoch": 2.6775555147733527, "grad_norm": 24.788528442382812, "learning_rate": 9.965098562037665e-06, "loss": 0.1123, "num_input_tokens_seen": 31479040, "step": 14590 }, { "epoch": 2.6784731143329052, "grad_norm": 7.325952529907227, "learning_rate": 9.965004050308485e-06, "loss": 0.4088, "num_input_tokens_seen": 31488480, "step": 14595 }, { "epoch": 2.6793907138924573, "grad_norm": 0.3075300455093384, "learning_rate": 9.964909411234646e-06, "loss": 0.2102, "num_input_tokens_seen": 31499840, "step": 14600 }, { "epoch": 2.6803083134520094, "grad_norm": 4.261833667755127, "learning_rate": 9.964814644818578e-06, "loss": 0.17, "num_input_tokens_seen": 31511008, "step": 14605 }, { "epoch": 2.681225913011562, "grad_norm": 23.56065559387207, "learning_rate": 9.964719751062708e-06, "loss": 0.1664, "num_input_tokens_seen": 31521856, "step": 14610 }, { "epoch": 2.682143512571114, "grad_norm": 39.60263442993164, "learning_rate": 9.96462472996947e-06, "loss": 0.197, "num_input_tokens_seen": 31532800, "step": 14615 }, { "epoch": 2.683061112130666, "grad_norm": 0.4398666322231293, "learning_rate": 9.964529581541304e-06, "loss": 0.3883, "num_input_tokens_seen": 31544320, "step": 14620 }, { "epoch": 2.6839787116902185, "grad_norm": 8.655488014221191, "learning_rate": 9.964434305780646e-06, "loss": 0.2813, "num_input_tokens_seen": 31555520, "step": 14625 }, { "epoch": 2.6848963112497706, "grad_norm": 7.256966590881348, "learning_rate": 9.964338902689945e-06, "loss": 0.1985, "num_input_tokens_seen": 31566464, "step": 14630 }, { "epoch": 2.6858139108093226, "grad_norm": 18.57421875, "learning_rate": 9.964243372271642e-06, "loss": 0.4787, "num_input_tokens_seen": 31576640, "step": 14635 }, { "epoch": 2.686731510368875, "grad_norm": 0.7092767357826233, "learning_rate": 9.964147714528194e-06, "loss": 0.2924, "num_input_tokens_seen": 31587296, "step": 14640 }, { "epoch": 2.687649109928427, "grad_norm": 7.8594255447387695, "learning_rate": 9.96405192946205e-06, "loss": 0.1163, "num_input_tokens_seen": 31598560, "step": 14645 }, { "epoch": 2.6885667094879793, "grad_norm": 1.4434987306594849, "learning_rate": 9.963956017075664e-06, "loss": 0.1674, "num_input_tokens_seen": 31608320, "step": 14650 }, { "epoch": 2.6894843090475318, "grad_norm": 12.347683906555176, "learning_rate": 9.963859977371503e-06, "loss": 0.4074, "num_input_tokens_seen": 31618944, "step": 14655 }, { "epoch": 2.690401908607084, "grad_norm": 12.704578399658203, "learning_rate": 9.963763810352026e-06, "loss": 0.1256, "num_input_tokens_seen": 31630112, "step": 14660 }, { "epoch": 2.691319508166636, "grad_norm": 10.759645462036133, "learning_rate": 9.9636675160197e-06, "loss": 0.2883, "num_input_tokens_seen": 31639776, "step": 14665 }, { "epoch": 2.6922371077261884, "grad_norm": 50.60408401489258, "learning_rate": 9.963571094376995e-06, "loss": 0.2193, "num_input_tokens_seen": 31649728, "step": 14670 }, { "epoch": 2.6931547072857405, "grad_norm": 0.5948207974433899, "learning_rate": 9.963474545426386e-06, "loss": 0.1551, "num_input_tokens_seen": 31660736, "step": 14675 }, { "epoch": 2.6940723068452925, "grad_norm": 18.28070640563965, "learning_rate": 9.963377869170347e-06, "loss": 0.4132, "num_input_tokens_seen": 31672000, "step": 14680 }, { "epoch": 2.694989906404845, "grad_norm": 13.816293716430664, "learning_rate": 9.963281065611358e-06, "loss": 0.1692, "num_input_tokens_seen": 31683072, "step": 14685 }, { "epoch": 2.695907505964397, "grad_norm": 1.7005137205123901, "learning_rate": 9.963184134751903e-06, "loss": 0.3634, "num_input_tokens_seen": 31694304, "step": 14690 }, { "epoch": 2.696825105523949, "grad_norm": 3.573113441467285, "learning_rate": 9.963087076594464e-06, "loss": 0.0214, "num_input_tokens_seen": 31705184, "step": 14695 }, { "epoch": 2.6977427050835017, "grad_norm": 33.922149658203125, "learning_rate": 9.962989891141535e-06, "loss": 0.3051, "num_input_tokens_seen": 31716512, "step": 14700 }, { "epoch": 2.6986603046430537, "grad_norm": 30.004026412963867, "learning_rate": 9.962892578395608e-06, "loss": 0.5872, "num_input_tokens_seen": 31726368, "step": 14705 }, { "epoch": 2.699577904202606, "grad_norm": 11.411994934082031, "learning_rate": 9.962795138359178e-06, "loss": 0.1685, "num_input_tokens_seen": 31737536, "step": 14710 }, { "epoch": 2.7004955037621583, "grad_norm": 10.104293823242188, "learning_rate": 9.962697571034745e-06, "loss": 0.1634, "num_input_tokens_seen": 31747776, "step": 14715 }, { "epoch": 2.7014131033217104, "grad_norm": 2.108950614929199, "learning_rate": 9.96259987642481e-06, "loss": 0.0733, "num_input_tokens_seen": 31757536, "step": 14720 }, { "epoch": 2.7023307028812624, "grad_norm": 20.94232749938965, "learning_rate": 9.96250205453188e-06, "loss": 0.1091, "num_input_tokens_seen": 31767456, "step": 14725 }, { "epoch": 2.703248302440815, "grad_norm": 0.39268946647644043, "learning_rate": 9.962404105358463e-06, "loss": 0.3101, "num_input_tokens_seen": 31779936, "step": 14730 }, { "epoch": 2.704165902000367, "grad_norm": 8.320146560668945, "learning_rate": 9.962306028907072e-06, "loss": 0.2691, "num_input_tokens_seen": 31791840, "step": 14735 }, { "epoch": 2.705083501559919, "grad_norm": 15.316773414611816, "learning_rate": 9.96220782518022e-06, "loss": 0.2462, "num_input_tokens_seen": 31801696, "step": 14740 }, { "epoch": 2.7060011011194716, "grad_norm": 15.169530868530273, "learning_rate": 9.962109494180431e-06, "loss": 0.2757, "num_input_tokens_seen": 31812224, "step": 14745 }, { "epoch": 2.7069187006790236, "grad_norm": 14.351449966430664, "learning_rate": 9.962011035910223e-06, "loss": 0.3468, "num_input_tokens_seen": 31823904, "step": 14750 }, { "epoch": 2.7078363002385757, "grad_norm": 2.7658588886260986, "learning_rate": 9.961912450372122e-06, "loss": 0.0995, "num_input_tokens_seen": 31834272, "step": 14755 }, { "epoch": 2.708753899798128, "grad_norm": 1.1189298629760742, "learning_rate": 9.961813737568658e-06, "loss": 0.1936, "num_input_tokens_seen": 31844928, "step": 14760 }, { "epoch": 2.7096714993576803, "grad_norm": 1.45941162109375, "learning_rate": 9.961714897502362e-06, "loss": 0.3488, "num_input_tokens_seen": 31855648, "step": 14765 }, { "epoch": 2.7105890989172323, "grad_norm": 0.8727784156799316, "learning_rate": 9.961615930175767e-06, "loss": 0.2737, "num_input_tokens_seen": 31867904, "step": 14770 }, { "epoch": 2.711506698476785, "grad_norm": 8.86713695526123, "learning_rate": 9.961516835591414e-06, "loss": 0.2673, "num_input_tokens_seen": 31876928, "step": 14775 }, { "epoch": 2.712424298036337, "grad_norm": 34.79593276977539, "learning_rate": 9.961417613751845e-06, "loss": 0.2497, "num_input_tokens_seen": 31889184, "step": 14780 }, { "epoch": 2.713341897595889, "grad_norm": 11.421082496643066, "learning_rate": 9.961318264659601e-06, "loss": 0.3132, "num_input_tokens_seen": 31900096, "step": 14785 }, { "epoch": 2.7142594971554415, "grad_norm": 8.557830810546875, "learning_rate": 9.961218788317235e-06, "loss": 0.1849, "num_input_tokens_seen": 31911392, "step": 14790 }, { "epoch": 2.7151770967149935, "grad_norm": 14.48332691192627, "learning_rate": 9.961119184727297e-06, "loss": 0.2815, "num_input_tokens_seen": 31922720, "step": 14795 }, { "epoch": 2.7160946962745456, "grad_norm": 4.82391881942749, "learning_rate": 9.96101945389234e-06, "loss": 0.3456, "num_input_tokens_seen": 31933088, "step": 14800 }, { "epoch": 2.717012295834098, "grad_norm": 0.9682174921035767, "learning_rate": 9.960919595814922e-06, "loss": 0.3092, "num_input_tokens_seen": 31944256, "step": 14805 }, { "epoch": 2.71792989539365, "grad_norm": 0.48024848103523254, "learning_rate": 9.960819610497606e-06, "loss": 0.1774, "num_input_tokens_seen": 31955520, "step": 14810 }, { "epoch": 2.718847494953202, "grad_norm": 1.495047688484192, "learning_rate": 9.960719497942954e-06, "loss": 0.0688, "num_input_tokens_seen": 31966688, "step": 14815 }, { "epoch": 2.7197650945127547, "grad_norm": 11.298644065856934, "learning_rate": 9.960619258153536e-06, "loss": 0.4191, "num_input_tokens_seen": 31977440, "step": 14820 }, { "epoch": 2.720682694072307, "grad_norm": 1.1423778533935547, "learning_rate": 9.960518891131923e-06, "loss": 0.0747, "num_input_tokens_seen": 31988512, "step": 14825 }, { "epoch": 2.721600293631859, "grad_norm": 0.16456350684165955, "learning_rate": 9.960418396880689e-06, "loss": 0.473, "num_input_tokens_seen": 31999072, "step": 14830 }, { "epoch": 2.7225178931914114, "grad_norm": 14.29389762878418, "learning_rate": 9.96031777540241e-06, "loss": 0.2605, "num_input_tokens_seen": 32009984, "step": 14835 }, { "epoch": 2.7234354927509634, "grad_norm": 0.27853915095329285, "learning_rate": 9.96021702669967e-06, "loss": 0.247, "num_input_tokens_seen": 32020128, "step": 14840 }, { "epoch": 2.7243530923105155, "grad_norm": 11.93320083618164, "learning_rate": 9.960116150775048e-06, "loss": 0.2158, "num_input_tokens_seen": 32031488, "step": 14845 }, { "epoch": 2.725270691870068, "grad_norm": 0.4090787470340729, "learning_rate": 9.960015147631136e-06, "loss": 0.1397, "num_input_tokens_seen": 32043200, "step": 14850 }, { "epoch": 2.72618829142962, "grad_norm": 14.748788833618164, "learning_rate": 9.959914017270522e-06, "loss": 0.3796, "num_input_tokens_seen": 32054016, "step": 14855 }, { "epoch": 2.727105890989172, "grad_norm": 0.36506152153015137, "learning_rate": 9.9598127596958e-06, "loss": 0.0421, "num_input_tokens_seen": 32065408, "step": 14860 }, { "epoch": 2.7280234905487246, "grad_norm": 0.28884458541870117, "learning_rate": 9.959711374909568e-06, "loss": 0.1599, "num_input_tokens_seen": 32076928, "step": 14865 }, { "epoch": 2.7289410901082767, "grad_norm": 39.415809631347656, "learning_rate": 9.959609862914427e-06, "loss": 0.2588, "num_input_tokens_seen": 32088064, "step": 14870 }, { "epoch": 2.7298586896678287, "grad_norm": 33.45818328857422, "learning_rate": 9.95950822371298e-06, "loss": 0.0874, "num_input_tokens_seen": 32098592, "step": 14875 }, { "epoch": 2.7307762892273812, "grad_norm": 0.6631146669387817, "learning_rate": 9.959406457307833e-06, "loss": 0.1522, "num_input_tokens_seen": 32108960, "step": 14880 }, { "epoch": 2.7316938887869333, "grad_norm": 28.56192970275879, "learning_rate": 9.959304563701598e-06, "loss": 0.329, "num_input_tokens_seen": 32121024, "step": 14885 }, { "epoch": 2.7326114883464854, "grad_norm": 0.290365993976593, "learning_rate": 9.959202542896885e-06, "loss": 0.2096, "num_input_tokens_seen": 32130976, "step": 14890 }, { "epoch": 2.733529087906038, "grad_norm": 32.115806579589844, "learning_rate": 9.959100394896314e-06, "loss": 0.6005, "num_input_tokens_seen": 32143104, "step": 14895 }, { "epoch": 2.73444668746559, "grad_norm": 0.30824005603790283, "learning_rate": 9.958998119702503e-06, "loss": 0.1314, "num_input_tokens_seen": 32153728, "step": 14900 }, { "epoch": 2.735364287025142, "grad_norm": 14.73222541809082, "learning_rate": 9.958895717318076e-06, "loss": 0.1723, "num_input_tokens_seen": 32164512, "step": 14905 }, { "epoch": 2.7362818865846945, "grad_norm": 1.1943570375442505, "learning_rate": 9.958793187745662e-06, "loss": 0.1839, "num_input_tokens_seen": 32174848, "step": 14910 }, { "epoch": 2.7371994861442466, "grad_norm": 0.4866178631782532, "learning_rate": 9.958690530987885e-06, "loss": 0.1326, "num_input_tokens_seen": 32184032, "step": 14915 }, { "epoch": 2.7381170857037986, "grad_norm": 16.486793518066406, "learning_rate": 9.958587747047382e-06, "loss": 0.1049, "num_input_tokens_seen": 32194624, "step": 14920 }, { "epoch": 2.739034685263351, "grad_norm": 2.1225478649139404, "learning_rate": 9.95848483592679e-06, "loss": 0.2396, "num_input_tokens_seen": 32205216, "step": 14925 }, { "epoch": 2.739952284822903, "grad_norm": 57.1452522277832, "learning_rate": 9.958381797628745e-06, "loss": 0.1078, "num_input_tokens_seen": 32215456, "step": 14930 }, { "epoch": 2.7408698843824553, "grad_norm": 25.33911895751953, "learning_rate": 9.958278632155892e-06, "loss": 0.1749, "num_input_tokens_seen": 32226528, "step": 14935 }, { "epoch": 2.7417874839420078, "grad_norm": 9.75401496887207, "learning_rate": 9.958175339510875e-06, "loss": 0.0187, "num_input_tokens_seen": 32236992, "step": 14940 }, { "epoch": 2.74270508350156, "grad_norm": 10.567547798156738, "learning_rate": 9.958071919696349e-06, "loss": 0.1895, "num_input_tokens_seen": 32247456, "step": 14945 }, { "epoch": 2.743622683061112, "grad_norm": 48.347206115722656, "learning_rate": 9.95796837271496e-06, "loss": 0.2533, "num_input_tokens_seen": 32257120, "step": 14950 }, { "epoch": 2.7445402826206644, "grad_norm": 0.05190246179699898, "learning_rate": 9.957864698569368e-06, "loss": 0.0603, "num_input_tokens_seen": 32268096, "step": 14955 }, { "epoch": 2.7454578821802165, "grad_norm": 52.001930236816406, "learning_rate": 9.95776089726223e-06, "loss": 0.2004, "num_input_tokens_seen": 32279424, "step": 14960 }, { "epoch": 2.7463754817397685, "grad_norm": 0.32060790061950684, "learning_rate": 9.957656968796208e-06, "loss": 0.3776, "num_input_tokens_seen": 32290144, "step": 14965 }, { "epoch": 2.747293081299321, "grad_norm": 11.73906135559082, "learning_rate": 9.957552913173969e-06, "loss": 0.3529, "num_input_tokens_seen": 32301088, "step": 14970 }, { "epoch": 2.748210680858873, "grad_norm": 0.27548709511756897, "learning_rate": 9.957448730398181e-06, "loss": 0.2928, "num_input_tokens_seen": 32312832, "step": 14975 }, { "epoch": 2.749128280418425, "grad_norm": 30.143644332885742, "learning_rate": 9.957344420471515e-06, "loss": 0.0422, "num_input_tokens_seen": 32323008, "step": 14980 }, { "epoch": 2.7500458799779777, "grad_norm": 11.851909637451172, "learning_rate": 9.95723998339665e-06, "loss": 0.267, "num_input_tokens_seen": 32333888, "step": 14985 }, { "epoch": 2.7509634795375297, "grad_norm": 24.246353149414062, "learning_rate": 9.957135419176262e-06, "loss": 0.1506, "num_input_tokens_seen": 32344608, "step": 14990 }, { "epoch": 2.751881079097082, "grad_norm": 0.3058052957057953, "learning_rate": 9.957030727813033e-06, "loss": 0.1513, "num_input_tokens_seen": 32354304, "step": 14995 }, { "epoch": 2.7527986786566343, "grad_norm": 17.460834503173828, "learning_rate": 9.956925909309647e-06, "loss": 0.1503, "num_input_tokens_seen": 32366208, "step": 15000 }, { "epoch": 2.7537162782161864, "grad_norm": 33.756046295166016, "learning_rate": 9.956820963668797e-06, "loss": 0.1109, "num_input_tokens_seen": 32377792, "step": 15005 }, { "epoch": 2.7546338777757384, "grad_norm": 30.537046432495117, "learning_rate": 9.956715890893169e-06, "loss": 0.3515, "num_input_tokens_seen": 32389024, "step": 15010 }, { "epoch": 2.755551477335291, "grad_norm": 11.553369522094727, "learning_rate": 9.956610690985463e-06, "loss": 0.4066, "num_input_tokens_seen": 32400064, "step": 15015 }, { "epoch": 2.756469076894843, "grad_norm": 33.068851470947266, "learning_rate": 9.956505363948372e-06, "loss": 0.1956, "num_input_tokens_seen": 32411008, "step": 15020 }, { "epoch": 2.757386676454395, "grad_norm": 0.388039767742157, "learning_rate": 9.956399909784603e-06, "loss": 0.1126, "num_input_tokens_seen": 32422176, "step": 15025 }, { "epoch": 2.7583042760139476, "grad_norm": 46.554073333740234, "learning_rate": 9.956294328496856e-06, "loss": 0.3328, "num_input_tokens_seen": 32433632, "step": 15030 }, { "epoch": 2.7592218755734996, "grad_norm": 0.557401180267334, "learning_rate": 9.956188620087844e-06, "loss": 0.0423, "num_input_tokens_seen": 32444064, "step": 15035 }, { "epoch": 2.7601394751330517, "grad_norm": 46.72018051147461, "learning_rate": 9.956082784560273e-06, "loss": 0.1324, "num_input_tokens_seen": 32454816, "step": 15040 }, { "epoch": 2.761057074692604, "grad_norm": 19.443138122558594, "learning_rate": 9.95597682191686e-06, "loss": 0.4605, "num_input_tokens_seen": 32465984, "step": 15045 }, { "epoch": 2.7619746742521563, "grad_norm": 0.05140826851129532, "learning_rate": 9.955870732160321e-06, "loss": 0.0245, "num_input_tokens_seen": 32477696, "step": 15050 }, { "epoch": 2.7628922738117083, "grad_norm": 34.81707763671875, "learning_rate": 9.955764515293381e-06, "loss": 0.3017, "num_input_tokens_seen": 32488608, "step": 15055 }, { "epoch": 2.763809873371261, "grad_norm": 0.6487076282501221, "learning_rate": 9.955658171318762e-06, "loss": 0.1652, "num_input_tokens_seen": 32498976, "step": 15060 }, { "epoch": 2.764727472930813, "grad_norm": 49.608150482177734, "learning_rate": 9.955551700239189e-06, "loss": 0.1292, "num_input_tokens_seen": 32509728, "step": 15065 }, { "epoch": 2.765645072490365, "grad_norm": 0.1541895866394043, "learning_rate": 9.955445102057398e-06, "loss": 0.2187, "num_input_tokens_seen": 32521120, "step": 15070 }, { "epoch": 2.7665626720499175, "grad_norm": 4.650778293609619, "learning_rate": 9.95533837677612e-06, "loss": 0.2178, "num_input_tokens_seen": 32530880, "step": 15075 }, { "epoch": 2.7674802716094695, "grad_norm": 0.13059571385383606, "learning_rate": 9.955231524398093e-06, "loss": 0.7396, "num_input_tokens_seen": 32540608, "step": 15080 }, { "epoch": 2.7683978711690216, "grad_norm": 2.1944708824157715, "learning_rate": 9.955124544926056e-06, "loss": 0.2385, "num_input_tokens_seen": 32551808, "step": 15085 }, { "epoch": 2.769315470728574, "grad_norm": 26.34706687927246, "learning_rate": 9.955017438362752e-06, "loss": 0.352, "num_input_tokens_seen": 32561504, "step": 15090 }, { "epoch": 2.770233070288126, "grad_norm": 12.745453834533691, "learning_rate": 9.954910204710935e-06, "loss": 0.0251, "num_input_tokens_seen": 32572832, "step": 15095 }, { "epoch": 2.771150669847678, "grad_norm": 56.51451110839844, "learning_rate": 9.954802843973348e-06, "loss": 0.3678, "num_input_tokens_seen": 32583136, "step": 15100 }, { "epoch": 2.7720682694072307, "grad_norm": 13.253987312316895, "learning_rate": 9.954695356152747e-06, "loss": 0.3839, "num_input_tokens_seen": 32594528, "step": 15105 }, { "epoch": 2.772985868966783, "grad_norm": 28.355138778686523, "learning_rate": 9.95458774125189e-06, "loss": 0.145, "num_input_tokens_seen": 32605728, "step": 15110 }, { "epoch": 2.773903468526335, "grad_norm": 29.840499877929688, "learning_rate": 9.954479999273537e-06, "loss": 0.1944, "num_input_tokens_seen": 32615456, "step": 15115 }, { "epoch": 2.7748210680858874, "grad_norm": 12.475441932678223, "learning_rate": 9.95437213022045e-06, "loss": 0.3044, "num_input_tokens_seen": 32626336, "step": 15120 }, { "epoch": 2.7757386676454394, "grad_norm": 31.624126434326172, "learning_rate": 9.954264134095397e-06, "loss": 0.4041, "num_input_tokens_seen": 32637120, "step": 15125 }, { "epoch": 2.7766562672049915, "grad_norm": 20.91825294494629, "learning_rate": 9.954156010901146e-06, "loss": 0.1628, "num_input_tokens_seen": 32648288, "step": 15130 }, { "epoch": 2.777573866764544, "grad_norm": 0.2301262766122818, "learning_rate": 9.954047760640472e-06, "loss": 0.0511, "num_input_tokens_seen": 32660352, "step": 15135 }, { "epoch": 2.778491466324096, "grad_norm": 1.0289596319198608, "learning_rate": 9.953939383316154e-06, "loss": 0.0612, "num_input_tokens_seen": 32671584, "step": 15140 }, { "epoch": 2.779409065883648, "grad_norm": 73.57242584228516, "learning_rate": 9.953830878930966e-06, "loss": 0.0794, "num_input_tokens_seen": 32682592, "step": 15145 }, { "epoch": 2.7803266654432006, "grad_norm": 0.2885940968990326, "learning_rate": 9.953722247487694e-06, "loss": 0.2825, "num_input_tokens_seen": 32693504, "step": 15150 }, { "epoch": 2.7812442650027527, "grad_norm": 15.81614875793457, "learning_rate": 9.953613488989123e-06, "loss": 0.3405, "num_input_tokens_seen": 32703936, "step": 15155 }, { "epoch": 2.7821618645623047, "grad_norm": 27.385528564453125, "learning_rate": 9.953504603438045e-06, "loss": 0.2186, "num_input_tokens_seen": 32713696, "step": 15160 }, { "epoch": 2.7830794641218572, "grad_norm": 27.254358291625977, "learning_rate": 9.95339559083725e-06, "loss": 0.488, "num_input_tokens_seen": 32724608, "step": 15165 }, { "epoch": 2.7839970636814093, "grad_norm": 1.2979460954666138, "learning_rate": 9.953286451189535e-06, "loss": 0.106, "num_input_tokens_seen": 32734880, "step": 15170 }, { "epoch": 2.7849146632409614, "grad_norm": 12.181314468383789, "learning_rate": 9.9531771844977e-06, "loss": 0.242, "num_input_tokens_seen": 32744896, "step": 15175 }, { "epoch": 2.785832262800514, "grad_norm": 14.199867248535156, "learning_rate": 9.953067790764548e-06, "loss": 0.0501, "num_input_tokens_seen": 32754944, "step": 15180 }, { "epoch": 2.786749862360066, "grad_norm": 0.7966745495796204, "learning_rate": 9.952958269992883e-06, "loss": 0.1435, "num_input_tokens_seen": 32766752, "step": 15185 }, { "epoch": 2.787667461919618, "grad_norm": 0.06418009847402573, "learning_rate": 9.952848622185514e-06, "loss": 0.0196, "num_input_tokens_seen": 32777152, "step": 15190 }, { "epoch": 2.7885850614791705, "grad_norm": 10.209728240966797, "learning_rate": 9.952738847345254e-06, "loss": 0.3991, "num_input_tokens_seen": 32788416, "step": 15195 }, { "epoch": 2.7895026610387226, "grad_norm": 34.60634994506836, "learning_rate": 9.95262894547492e-06, "loss": 0.2966, "num_input_tokens_seen": 32799360, "step": 15200 }, { "epoch": 2.7904202605982746, "grad_norm": 12.999686241149902, "learning_rate": 9.952518916577328e-06, "loss": 0.0429, "num_input_tokens_seen": 32809248, "step": 15205 }, { "epoch": 2.791337860157827, "grad_norm": 0.12531614303588867, "learning_rate": 9.952408760655302e-06, "loss": 0.1363, "num_input_tokens_seen": 32819904, "step": 15210 }, { "epoch": 2.792255459717379, "grad_norm": 36.142982482910156, "learning_rate": 9.952298477711667e-06, "loss": 0.3193, "num_input_tokens_seen": 32829984, "step": 15215 }, { "epoch": 2.7931730592769317, "grad_norm": 0.40237078070640564, "learning_rate": 9.95218806774925e-06, "loss": 0.1005, "num_input_tokens_seen": 32841600, "step": 15220 }, { "epoch": 2.7940906588364838, "grad_norm": 0.042798127979040146, "learning_rate": 9.952077530770887e-06, "loss": 0.1767, "num_input_tokens_seen": 32853056, "step": 15225 }, { "epoch": 2.795008258396036, "grad_norm": 44.90980529785156, "learning_rate": 9.951966866779409e-06, "loss": 0.2576, "num_input_tokens_seen": 32864096, "step": 15230 }, { "epoch": 2.7959258579555883, "grad_norm": 0.033106036484241486, "learning_rate": 9.951856075777655e-06, "loss": 0.0056, "num_input_tokens_seen": 32875488, "step": 15235 }, { "epoch": 2.7968434575151404, "grad_norm": 42.796878814697266, "learning_rate": 9.951745157768468e-06, "loss": 0.1671, "num_input_tokens_seen": 32887296, "step": 15240 }, { "epoch": 2.7977610570746925, "grad_norm": 15.814212799072266, "learning_rate": 9.951634112754693e-06, "loss": 0.3981, "num_input_tokens_seen": 32898240, "step": 15245 }, { "epoch": 2.798678656634245, "grad_norm": 0.07045760750770569, "learning_rate": 9.951522940739177e-06, "loss": 0.3038, "num_input_tokens_seen": 32909312, "step": 15250 }, { "epoch": 2.799596256193797, "grad_norm": 3.0384676456451416, "learning_rate": 9.95141164172477e-06, "loss": 0.1857, "num_input_tokens_seen": 32920224, "step": 15255 }, { "epoch": 2.800513855753349, "grad_norm": 1.0388085842132568, "learning_rate": 9.95130021571433e-06, "loss": 0.2418, "num_input_tokens_seen": 32931232, "step": 15260 }, { "epoch": 2.8014314553129016, "grad_norm": 80.70225524902344, "learning_rate": 9.951188662710713e-06, "loss": 0.2491, "num_input_tokens_seen": 32941888, "step": 15265 }, { "epoch": 2.8023490548724537, "grad_norm": 0.30240046977996826, "learning_rate": 9.951076982716781e-06, "loss": 0.0094, "num_input_tokens_seen": 32951936, "step": 15270 }, { "epoch": 2.8032666544320057, "grad_norm": 0.1386115998029709, "learning_rate": 9.950965175735397e-06, "loss": 0.2109, "num_input_tokens_seen": 32962944, "step": 15275 }, { "epoch": 2.8041842539915582, "grad_norm": 39.45595932006836, "learning_rate": 9.95085324176943e-06, "loss": 0.8286, "num_input_tokens_seen": 32974848, "step": 15280 }, { "epoch": 2.8051018535511103, "grad_norm": 0.06255634874105453, "learning_rate": 9.950741180821751e-06, "loss": 0.4938, "num_input_tokens_seen": 32986240, "step": 15285 }, { "epoch": 2.8060194531106624, "grad_norm": 33.65735626220703, "learning_rate": 9.950628992895232e-06, "loss": 0.1377, "num_input_tokens_seen": 32997216, "step": 15290 }, { "epoch": 2.806937052670215, "grad_norm": 11.47541618347168, "learning_rate": 9.950516677992755e-06, "loss": 0.0798, "num_input_tokens_seen": 33007904, "step": 15295 }, { "epoch": 2.807854652229767, "grad_norm": 1.4593722820281982, "learning_rate": 9.950404236117195e-06, "loss": 0.4694, "num_input_tokens_seen": 33018784, "step": 15300 }, { "epoch": 2.808772251789319, "grad_norm": 0.5672760009765625, "learning_rate": 9.950291667271438e-06, "loss": 0.1025, "num_input_tokens_seen": 33029600, "step": 15305 }, { "epoch": 2.8096898513488715, "grad_norm": 28.51096534729004, "learning_rate": 9.950178971458375e-06, "loss": 0.2825, "num_input_tokens_seen": 33039296, "step": 15310 }, { "epoch": 2.8106074509084236, "grad_norm": 0.76405268907547, "learning_rate": 9.950066148680893e-06, "loss": 0.0681, "num_input_tokens_seen": 33049856, "step": 15315 }, { "epoch": 2.8115250504679756, "grad_norm": 28.190345764160156, "learning_rate": 9.949953198941884e-06, "loss": 0.1234, "num_input_tokens_seen": 33060640, "step": 15320 }, { "epoch": 2.812442650027528, "grad_norm": 54.09348678588867, "learning_rate": 9.94984012224425e-06, "loss": 0.2146, "num_input_tokens_seen": 33072672, "step": 15325 }, { "epoch": 2.81336024958708, "grad_norm": 16.127199172973633, "learning_rate": 9.949726918590885e-06, "loss": 0.3406, "num_input_tokens_seen": 33084640, "step": 15330 }, { "epoch": 2.8142778491466323, "grad_norm": 12.223085403442383, "learning_rate": 9.9496135879847e-06, "loss": 0.1794, "num_input_tokens_seen": 33095456, "step": 15335 }, { "epoch": 2.8151954487061848, "grad_norm": 0.12773272395133972, "learning_rate": 9.949500130428593e-06, "loss": 0.2875, "num_input_tokens_seen": 33106528, "step": 15340 }, { "epoch": 2.816113048265737, "grad_norm": 0.6292540431022644, "learning_rate": 9.949386545925482e-06, "loss": 0.2934, "num_input_tokens_seen": 33117696, "step": 15345 }, { "epoch": 2.817030647825289, "grad_norm": 1.5571870803833008, "learning_rate": 9.949272834478276e-06, "loss": 0.0133, "num_input_tokens_seen": 33128576, "step": 15350 }, { "epoch": 2.8179482473848414, "grad_norm": 0.14473682641983032, "learning_rate": 9.949158996089893e-06, "loss": 0.2486, "num_input_tokens_seen": 33139488, "step": 15355 }, { "epoch": 2.8188658469443935, "grad_norm": 21.928424835205078, "learning_rate": 9.949045030763251e-06, "loss": 0.3574, "num_input_tokens_seen": 33150432, "step": 15360 }, { "epoch": 2.8197834465039455, "grad_norm": 0.1028464138507843, "learning_rate": 9.948930938501275e-06, "loss": 0.1827, "num_input_tokens_seen": 33162336, "step": 15365 }, { "epoch": 2.820701046063498, "grad_norm": 16.532495498657227, "learning_rate": 9.948816719306892e-06, "loss": 0.1747, "num_input_tokens_seen": 33171264, "step": 15370 }, { "epoch": 2.82161864562305, "grad_norm": 0.05373826622962952, "learning_rate": 9.948702373183027e-06, "loss": 0.1595, "num_input_tokens_seen": 33181632, "step": 15375 }, { "epoch": 2.822536245182602, "grad_norm": 1.3849937915802002, "learning_rate": 9.948587900132619e-06, "loss": 0.3639, "num_input_tokens_seen": 33193696, "step": 15380 }, { "epoch": 2.8234538447421547, "grad_norm": 6.0839996337890625, "learning_rate": 9.9484733001586e-06, "loss": 0.0259, "num_input_tokens_seen": 33203776, "step": 15385 }, { "epoch": 2.8243714443017067, "grad_norm": 18.748353958129883, "learning_rate": 9.948358573263909e-06, "loss": 0.1619, "num_input_tokens_seen": 33214560, "step": 15390 }, { "epoch": 2.825289043861259, "grad_norm": 19.086698532104492, "learning_rate": 9.948243719451491e-06, "loss": 0.2569, "num_input_tokens_seen": 33224640, "step": 15395 }, { "epoch": 2.8262066434208113, "grad_norm": 3.598842144012451, "learning_rate": 9.948128738724291e-06, "loss": 0.0627, "num_input_tokens_seen": 33235616, "step": 15400 }, { "epoch": 2.8271242429803634, "grad_norm": 8.330700874328613, "learning_rate": 9.948013631085258e-06, "loss": 0.3111, "num_input_tokens_seen": 33246080, "step": 15405 }, { "epoch": 2.8280418425399154, "grad_norm": 1.9014517068862915, "learning_rate": 9.947898396537344e-06, "loss": 0.1014, "num_input_tokens_seen": 33257184, "step": 15410 }, { "epoch": 2.828959442099468, "grad_norm": 0.13628332316875458, "learning_rate": 9.947783035083503e-06, "loss": 0.2812, "num_input_tokens_seen": 33268512, "step": 15415 }, { "epoch": 2.82987704165902, "grad_norm": 0.23092854022979736, "learning_rate": 9.947667546726697e-06, "loss": 0.1527, "num_input_tokens_seen": 33278208, "step": 15420 }, { "epoch": 2.830794641218572, "grad_norm": 33.91978454589844, "learning_rate": 9.947551931469886e-06, "loss": 0.0604, "num_input_tokens_seen": 33288736, "step": 15425 }, { "epoch": 2.8317122407781246, "grad_norm": 0.15675753355026245, "learning_rate": 9.947436189316037e-06, "loss": 0.2536, "num_input_tokens_seen": 33299840, "step": 15430 }, { "epoch": 2.8326298403376766, "grad_norm": 0.03759101405739784, "learning_rate": 9.947320320268116e-06, "loss": 0.1873, "num_input_tokens_seen": 33310720, "step": 15435 }, { "epoch": 2.833547439897229, "grad_norm": 8.579876899719238, "learning_rate": 9.947204324329098e-06, "loss": 0.1743, "num_input_tokens_seen": 33321472, "step": 15440 }, { "epoch": 2.834465039456781, "grad_norm": 82.05885314941406, "learning_rate": 9.947088201501956e-06, "loss": 0.2137, "num_input_tokens_seen": 33332192, "step": 15445 }, { "epoch": 2.8353826390163333, "grad_norm": 21.208755493164062, "learning_rate": 9.946971951789668e-06, "loss": 0.1488, "num_input_tokens_seen": 33341952, "step": 15450 }, { "epoch": 2.8363002385758858, "grad_norm": 14.0950288772583, "learning_rate": 9.946855575195217e-06, "loss": 0.127, "num_input_tokens_seen": 33352736, "step": 15455 }, { "epoch": 2.837217838135438, "grad_norm": 0.4727082848548889, "learning_rate": 9.946739071721587e-06, "loss": 0.0029, "num_input_tokens_seen": 33363744, "step": 15460 }, { "epoch": 2.83813543769499, "grad_norm": 1.6320164203643799, "learning_rate": 9.946622441371768e-06, "loss": 0.2317, "num_input_tokens_seen": 33373824, "step": 15465 }, { "epoch": 2.8390530372545424, "grad_norm": 3.4057178497314453, "learning_rate": 9.94650568414875e-06, "loss": 0.3497, "num_input_tokens_seen": 33384000, "step": 15470 }, { "epoch": 2.8399706368140945, "grad_norm": 0.10154350101947784, "learning_rate": 9.946388800055527e-06, "loss": 0.0827, "num_input_tokens_seen": 33396000, "step": 15475 }, { "epoch": 2.8408882363736465, "grad_norm": 20.440166473388672, "learning_rate": 9.946271789095096e-06, "loss": 0.3097, "num_input_tokens_seen": 33405600, "step": 15480 }, { "epoch": 2.841805835933199, "grad_norm": 0.23276697099208832, "learning_rate": 9.94615465127046e-06, "loss": 0.1129, "num_input_tokens_seen": 33415616, "step": 15485 }, { "epoch": 2.842723435492751, "grad_norm": 22.482345581054688, "learning_rate": 9.946037386584626e-06, "loss": 0.143, "num_input_tokens_seen": 33427136, "step": 15490 }, { "epoch": 2.843641035052303, "grad_norm": 19.585269927978516, "learning_rate": 9.945919995040595e-06, "loss": 0.4749, "num_input_tokens_seen": 33437856, "step": 15495 }, { "epoch": 2.8445586346118557, "grad_norm": 0.06732171773910522, "learning_rate": 9.945802476641383e-06, "loss": 0.0097, "num_input_tokens_seen": 33449440, "step": 15500 }, { "epoch": 2.8454762341714077, "grad_norm": 0.18753550946712494, "learning_rate": 9.945684831390004e-06, "loss": 0.0085, "num_input_tokens_seen": 33460768, "step": 15505 }, { "epoch": 2.8463938337309598, "grad_norm": 1.6358658075332642, "learning_rate": 9.945567059289474e-06, "loss": 0.2981, "num_input_tokens_seen": 33473024, "step": 15510 }, { "epoch": 2.8473114332905123, "grad_norm": 0.051756300032138824, "learning_rate": 9.945449160342812e-06, "loss": 0.1845, "num_input_tokens_seen": 33483008, "step": 15515 }, { "epoch": 2.8482290328500643, "grad_norm": 0.2886550724506378, "learning_rate": 9.945331134553045e-06, "loss": 0.1563, "num_input_tokens_seen": 33494272, "step": 15520 }, { "epoch": 2.8491466324096164, "grad_norm": 0.040715087205171585, "learning_rate": 9.945212981923199e-06, "loss": 0.0062, "num_input_tokens_seen": 33506272, "step": 15525 }, { "epoch": 2.850064231969169, "grad_norm": 0.8983501195907593, "learning_rate": 9.945094702456305e-06, "loss": 0.511, "num_input_tokens_seen": 33516288, "step": 15530 }, { "epoch": 2.850981831528721, "grad_norm": 10.911245346069336, "learning_rate": 9.944976296155395e-06, "loss": 0.4524, "num_input_tokens_seen": 33526016, "step": 15535 }, { "epoch": 2.851899431088273, "grad_norm": 13.222536087036133, "learning_rate": 9.944857763023507e-06, "loss": 0.4606, "num_input_tokens_seen": 33536736, "step": 15540 }, { "epoch": 2.8528170306478255, "grad_norm": 15.99738883972168, "learning_rate": 9.94473910306368e-06, "loss": 0.3647, "num_input_tokens_seen": 33547360, "step": 15545 }, { "epoch": 2.8537346302073776, "grad_norm": 21.37777328491211, "learning_rate": 9.944620316278961e-06, "loss": 0.4129, "num_input_tokens_seen": 33558560, "step": 15550 }, { "epoch": 2.8546522297669297, "grad_norm": 0.6045165061950684, "learning_rate": 9.944501402672394e-06, "loss": 0.132, "num_input_tokens_seen": 33569920, "step": 15555 }, { "epoch": 2.855569829326482, "grad_norm": 0.18516123294830322, "learning_rate": 9.94438236224703e-06, "loss": 0.2231, "num_input_tokens_seen": 33581472, "step": 15560 }, { "epoch": 2.8564874288860342, "grad_norm": 16.356792449951172, "learning_rate": 9.944263195005918e-06, "loss": 0.1058, "num_input_tokens_seen": 33591520, "step": 15565 }, { "epoch": 2.8574050284455863, "grad_norm": 0.05387534946203232, "learning_rate": 9.944143900952122e-06, "loss": 0.2347, "num_input_tokens_seen": 33602816, "step": 15570 }, { "epoch": 2.858322628005139, "grad_norm": 19.966060638427734, "learning_rate": 9.944024480088697e-06, "loss": 0.2053, "num_input_tokens_seen": 33613440, "step": 15575 }, { "epoch": 2.859240227564691, "grad_norm": 0.523461103439331, "learning_rate": 9.943904932418704e-06, "loss": 0.1264, "num_input_tokens_seen": 33625696, "step": 15580 }, { "epoch": 2.860157827124243, "grad_norm": 0.1513635665178299, "learning_rate": 9.943785257945214e-06, "loss": 0.093, "num_input_tokens_seen": 33636704, "step": 15585 }, { "epoch": 2.8610754266837954, "grad_norm": 47.27812194824219, "learning_rate": 9.943665456671295e-06, "loss": 0.1403, "num_input_tokens_seen": 33648256, "step": 15590 }, { "epoch": 2.8619930262433475, "grad_norm": 23.971759796142578, "learning_rate": 9.943545528600017e-06, "loss": 0.2033, "num_input_tokens_seen": 33658464, "step": 15595 }, { "epoch": 2.8629106258028996, "grad_norm": 18.6245059967041, "learning_rate": 9.943425473734459e-06, "loss": 0.2394, "num_input_tokens_seen": 33669472, "step": 15600 }, { "epoch": 2.863828225362452, "grad_norm": 6.834936141967773, "learning_rate": 9.943305292077698e-06, "loss": 0.029, "num_input_tokens_seen": 33680928, "step": 15605 }, { "epoch": 2.864745824922004, "grad_norm": 0.6560755968093872, "learning_rate": 9.943184983632819e-06, "loss": 0.1816, "num_input_tokens_seen": 33691840, "step": 15610 }, { "epoch": 2.865663424481556, "grad_norm": 0.38026735186576843, "learning_rate": 9.943064548402906e-06, "loss": 0.1357, "num_input_tokens_seen": 33702752, "step": 15615 }, { "epoch": 2.8665810240411087, "grad_norm": 0.04674693942070007, "learning_rate": 9.94294398639105e-06, "loss": 0.2349, "num_input_tokens_seen": 33712608, "step": 15620 }, { "epoch": 2.8674986236006608, "grad_norm": 0.28582271933555603, "learning_rate": 9.942823297600339e-06, "loss": 0.2839, "num_input_tokens_seen": 33723872, "step": 15625 }, { "epoch": 2.868416223160213, "grad_norm": 0.13368454575538635, "learning_rate": 9.942702482033873e-06, "loss": 0.0425, "num_input_tokens_seen": 33734752, "step": 15630 }, { "epoch": 2.8693338227197653, "grad_norm": 31.894948959350586, "learning_rate": 9.942581539694747e-06, "loss": 0.4943, "num_input_tokens_seen": 33744992, "step": 15635 }, { "epoch": 2.8702514222793174, "grad_norm": 0.04324248060584068, "learning_rate": 9.942460470586066e-06, "loss": 0.4548, "num_input_tokens_seen": 33754688, "step": 15640 }, { "epoch": 2.8711690218388695, "grad_norm": 0.7802226543426514, "learning_rate": 9.942339274710933e-06, "loss": 0.1233, "num_input_tokens_seen": 33764160, "step": 15645 }, { "epoch": 2.872086621398422, "grad_norm": 1.6167930364608765, "learning_rate": 9.942217952072459e-06, "loss": 0.0357, "num_input_tokens_seen": 33775840, "step": 15650 }, { "epoch": 2.873004220957974, "grad_norm": 26.26171875, "learning_rate": 9.942096502673754e-06, "loss": 0.3019, "num_input_tokens_seen": 33786368, "step": 15655 }, { "epoch": 2.873921820517526, "grad_norm": 13.933111190795898, "learning_rate": 9.941974926517932e-06, "loss": 0.2404, "num_input_tokens_seen": 33795904, "step": 15660 }, { "epoch": 2.8748394200770786, "grad_norm": 0.8218989968299866, "learning_rate": 9.941853223608114e-06, "loss": 0.1086, "num_input_tokens_seen": 33806368, "step": 15665 }, { "epoch": 2.8757570196366307, "grad_norm": 3.513486862182617, "learning_rate": 9.94173139394742e-06, "loss": 0.0953, "num_input_tokens_seen": 33816992, "step": 15670 }, { "epoch": 2.8766746191961827, "grad_norm": 0.4605153203010559, "learning_rate": 9.941609437538973e-06, "loss": 0.1926, "num_input_tokens_seen": 33827040, "step": 15675 }, { "epoch": 2.8775922187557352, "grad_norm": 0.29781827330589294, "learning_rate": 9.941487354385904e-06, "loss": 0.1817, "num_input_tokens_seen": 33837504, "step": 15680 }, { "epoch": 2.8785098183152873, "grad_norm": 0.3659191131591797, "learning_rate": 9.941365144491344e-06, "loss": 0.1148, "num_input_tokens_seen": 33848928, "step": 15685 }, { "epoch": 2.8794274178748394, "grad_norm": 0.16876350343227386, "learning_rate": 9.941242807858424e-06, "loss": 0.1059, "num_input_tokens_seen": 33858656, "step": 15690 }, { "epoch": 2.880345017434392, "grad_norm": 0.11862692981958389, "learning_rate": 9.941120344490287e-06, "loss": 0.2406, "num_input_tokens_seen": 33869536, "step": 15695 }, { "epoch": 2.881262616993944, "grad_norm": 40.15751647949219, "learning_rate": 9.940997754390069e-06, "loss": 0.31, "num_input_tokens_seen": 33880480, "step": 15700 }, { "epoch": 2.882180216553496, "grad_norm": 12.076272010803223, "learning_rate": 9.940875037560917e-06, "loss": 0.2677, "num_input_tokens_seen": 33889760, "step": 15705 }, { "epoch": 2.8830978161130485, "grad_norm": 25.265926361083984, "learning_rate": 9.940752194005978e-06, "loss": 0.0868, "num_input_tokens_seen": 33901664, "step": 15710 }, { "epoch": 2.8840154156726006, "grad_norm": 32.51496124267578, "learning_rate": 9.940629223728403e-06, "loss": 0.2439, "num_input_tokens_seen": 33912640, "step": 15715 }, { "epoch": 2.8849330152321526, "grad_norm": 23.1976261138916, "learning_rate": 9.940506126731346e-06, "loss": 0.1276, "num_input_tokens_seen": 33924864, "step": 15720 }, { "epoch": 2.885850614791705, "grad_norm": 0.14727036654949188, "learning_rate": 9.940382903017964e-06, "loss": 0.2597, "num_input_tokens_seen": 33935584, "step": 15725 }, { "epoch": 2.886768214351257, "grad_norm": 1.3227519989013672, "learning_rate": 9.940259552591416e-06, "loss": 0.2319, "num_input_tokens_seen": 33945408, "step": 15730 }, { "epoch": 2.8876858139108093, "grad_norm": 1.2502106428146362, "learning_rate": 9.940136075454869e-06, "loss": 0.2728, "num_input_tokens_seen": 33953984, "step": 15735 }, { "epoch": 2.8886034134703618, "grad_norm": 3.3092269897460938, "learning_rate": 9.940012471611486e-06, "loss": 0.1787, "num_input_tokens_seen": 33966240, "step": 15740 }, { "epoch": 2.889521013029914, "grad_norm": 0.11084331572055817, "learning_rate": 9.939888741064441e-06, "loss": 0.1752, "num_input_tokens_seen": 33976800, "step": 15745 }, { "epoch": 2.890438612589466, "grad_norm": 30.768882751464844, "learning_rate": 9.939764883816907e-06, "loss": 0.0513, "num_input_tokens_seen": 33987584, "step": 15750 }, { "epoch": 2.8913562121490184, "grad_norm": 10.644659996032715, "learning_rate": 9.939640899872058e-06, "loss": 0.201, "num_input_tokens_seen": 33997536, "step": 15755 }, { "epoch": 2.8922738117085705, "grad_norm": 0.042867716401815414, "learning_rate": 9.939516789233076e-06, "loss": 0.1965, "num_input_tokens_seen": 34008384, "step": 15760 }, { "epoch": 2.8931914112681225, "grad_norm": 31.26422882080078, "learning_rate": 9.939392551903144e-06, "loss": 0.2523, "num_input_tokens_seen": 34018784, "step": 15765 }, { "epoch": 2.894109010827675, "grad_norm": 3.9642438888549805, "learning_rate": 9.939268187885449e-06, "loss": 0.2425, "num_input_tokens_seen": 34029824, "step": 15770 }, { "epoch": 2.895026610387227, "grad_norm": 25.494773864746094, "learning_rate": 9.939143697183178e-06, "loss": 0.286, "num_input_tokens_seen": 34039264, "step": 15775 }, { "epoch": 2.895944209946779, "grad_norm": 10.100915908813477, "learning_rate": 9.939019079799527e-06, "loss": 0.2588, "num_input_tokens_seen": 34049344, "step": 15780 }, { "epoch": 2.8968618095063317, "grad_norm": 34.731449127197266, "learning_rate": 9.938894335737693e-06, "loss": 0.1165, "num_input_tokens_seen": 34060352, "step": 15785 }, { "epoch": 2.8977794090658837, "grad_norm": 0.5212255120277405, "learning_rate": 9.938769465000873e-06, "loss": 0.221, "num_input_tokens_seen": 34072480, "step": 15790 }, { "epoch": 2.898697008625436, "grad_norm": 21.273988723754883, "learning_rate": 9.93864446759227e-06, "loss": 0.3354, "num_input_tokens_seen": 34083008, "step": 15795 }, { "epoch": 2.8996146081849883, "grad_norm": 31.423444747924805, "learning_rate": 9.938519343515091e-06, "loss": 0.1822, "num_input_tokens_seen": 34092576, "step": 15800 }, { "epoch": 2.9005322077445403, "grad_norm": 0.3204691410064697, "learning_rate": 9.938394092772545e-06, "loss": 0.2663, "num_input_tokens_seen": 34104992, "step": 15805 }, { "epoch": 2.9014498073040924, "grad_norm": 0.21619506180286407, "learning_rate": 9.938268715367846e-06, "loss": 0.1692, "num_input_tokens_seen": 34115552, "step": 15810 }, { "epoch": 2.902367406863645, "grad_norm": 6.040098190307617, "learning_rate": 9.938143211304205e-06, "loss": 0.0569, "num_input_tokens_seen": 34125792, "step": 15815 }, { "epoch": 2.903285006423197, "grad_norm": 0.17138457298278809, "learning_rate": 9.938017580584846e-06, "loss": 0.2497, "num_input_tokens_seen": 34136640, "step": 15820 }, { "epoch": 2.904202605982749, "grad_norm": 25.259674072265625, "learning_rate": 9.937891823212989e-06, "loss": 0.2318, "num_input_tokens_seen": 34147424, "step": 15825 }, { "epoch": 2.9051202055423015, "grad_norm": 5.542194366455078, "learning_rate": 9.937765939191859e-06, "loss": 0.2583, "num_input_tokens_seen": 34158720, "step": 15830 }, { "epoch": 2.9060378051018536, "grad_norm": 2.7570767402648926, "learning_rate": 9.937639928524687e-06, "loss": 0.2509, "num_input_tokens_seen": 34169536, "step": 15835 }, { "epoch": 2.9069554046614057, "grad_norm": 10.374753952026367, "learning_rate": 9.9375137912147e-06, "loss": 0.2347, "num_input_tokens_seen": 34179072, "step": 15840 }, { "epoch": 2.907873004220958, "grad_norm": 46.052696228027344, "learning_rate": 9.937387527265142e-06, "loss": 0.0712, "num_input_tokens_seen": 34190016, "step": 15845 }, { "epoch": 2.9087906037805102, "grad_norm": 1.234297513961792, "learning_rate": 9.937261136679243e-06, "loss": 0.3406, "num_input_tokens_seen": 34202048, "step": 15850 }, { "epoch": 2.9097082033400623, "grad_norm": 0.3540582060813904, "learning_rate": 9.937134619460248e-06, "loss": 0.056, "num_input_tokens_seen": 34212608, "step": 15855 }, { "epoch": 2.910625802899615, "grad_norm": 0.5163159370422363, "learning_rate": 9.9370079756114e-06, "loss": 0.194, "num_input_tokens_seen": 34222048, "step": 15860 }, { "epoch": 2.911543402459167, "grad_norm": 24.2641544342041, "learning_rate": 9.936881205135953e-06, "loss": 0.2965, "num_input_tokens_seen": 34232608, "step": 15865 }, { "epoch": 2.912461002018719, "grad_norm": 10.612631797790527, "learning_rate": 9.936754308037154e-06, "loss": 0.334, "num_input_tokens_seen": 34242624, "step": 15870 }, { "epoch": 2.9133786015782714, "grad_norm": 0.8133704662322998, "learning_rate": 9.936627284318257e-06, "loss": 0.2845, "num_input_tokens_seen": 34253696, "step": 15875 }, { "epoch": 2.9142962011378235, "grad_norm": 0.5743218064308167, "learning_rate": 9.93650013398252e-06, "loss": 0.1186, "num_input_tokens_seen": 34264096, "step": 15880 }, { "epoch": 2.9152138006973756, "grad_norm": 1.9541231393814087, "learning_rate": 9.936372857033207e-06, "loss": 0.0899, "num_input_tokens_seen": 34275200, "step": 15885 }, { "epoch": 2.916131400256928, "grad_norm": 2.2050833702087402, "learning_rate": 9.93624545347358e-06, "loss": 0.0438, "num_input_tokens_seen": 34286688, "step": 15890 }, { "epoch": 2.91704899981648, "grad_norm": 13.865191459655762, "learning_rate": 9.93611792330691e-06, "loss": 0.3236, "num_input_tokens_seen": 34297632, "step": 15895 }, { "epoch": 2.917966599376032, "grad_norm": 14.912209510803223, "learning_rate": 9.935990266536464e-06, "loss": 0.2024, "num_input_tokens_seen": 34307616, "step": 15900 }, { "epoch": 2.9188841989355847, "grad_norm": 29.531747817993164, "learning_rate": 9.935862483165517e-06, "loss": 0.1045, "num_input_tokens_seen": 34319072, "step": 15905 }, { "epoch": 2.9198017984951368, "grad_norm": 0.15334096550941467, "learning_rate": 9.935734573197348e-06, "loss": 0.1005, "num_input_tokens_seen": 34329184, "step": 15910 }, { "epoch": 2.920719398054689, "grad_norm": 0.11365611851215363, "learning_rate": 9.935606536635237e-06, "loss": 0.1413, "num_input_tokens_seen": 34340224, "step": 15915 }, { "epoch": 2.9216369976142413, "grad_norm": 7.682363033294678, "learning_rate": 9.935478373482466e-06, "loss": 0.2916, "num_input_tokens_seen": 34351360, "step": 15920 }, { "epoch": 2.9225545971737934, "grad_norm": 8.540167808532715, "learning_rate": 9.935350083742325e-06, "loss": 0.5768, "num_input_tokens_seen": 34364000, "step": 15925 }, { "epoch": 2.9234721967333455, "grad_norm": 1.4939789772033691, "learning_rate": 9.935221667418105e-06, "loss": 0.1521, "num_input_tokens_seen": 34373888, "step": 15930 }, { "epoch": 2.924389796292898, "grad_norm": 1.0075041055679321, "learning_rate": 9.935093124513098e-06, "loss": 0.0268, "num_input_tokens_seen": 34384736, "step": 15935 }, { "epoch": 2.92530739585245, "grad_norm": 24.042264938354492, "learning_rate": 9.9349644550306e-06, "loss": 0.2143, "num_input_tokens_seen": 34395136, "step": 15940 }, { "epoch": 2.926224995412002, "grad_norm": 5.500182151794434, "learning_rate": 9.934835658973912e-06, "loss": 0.4777, "num_input_tokens_seen": 34406176, "step": 15945 }, { "epoch": 2.9271425949715546, "grad_norm": 0.8607062101364136, "learning_rate": 9.934706736346337e-06, "loss": 0.1705, "num_input_tokens_seen": 34416576, "step": 15950 }, { "epoch": 2.9280601945311067, "grad_norm": 1.797385573387146, "learning_rate": 9.934577687151184e-06, "loss": 0.3472, "num_input_tokens_seen": 34427776, "step": 15955 }, { "epoch": 2.9289777940906587, "grad_norm": 8.038885116577148, "learning_rate": 9.934448511391762e-06, "loss": 0.0815, "num_input_tokens_seen": 34438624, "step": 15960 }, { "epoch": 2.9298953936502112, "grad_norm": 0.5330855250358582, "learning_rate": 9.934319209071382e-06, "loss": 0.13, "num_input_tokens_seen": 34449952, "step": 15965 }, { "epoch": 2.9308129932097633, "grad_norm": 11.22068977355957, "learning_rate": 9.934189780193361e-06, "loss": 0.1377, "num_input_tokens_seen": 34461568, "step": 15970 }, { "epoch": 2.9317305927693154, "grad_norm": 12.679862022399902, "learning_rate": 9.93406022476102e-06, "loss": 0.1305, "num_input_tokens_seen": 34472704, "step": 15975 }, { "epoch": 2.932648192328868, "grad_norm": 1.2710872888565063, "learning_rate": 9.933930542777681e-06, "loss": 0.0516, "num_input_tokens_seen": 34484192, "step": 15980 }, { "epoch": 2.93356579188842, "grad_norm": 32.190162658691406, "learning_rate": 9.933800734246673e-06, "loss": 0.1382, "num_input_tokens_seen": 34495584, "step": 15985 }, { "epoch": 2.934483391447972, "grad_norm": 0.06911150366067886, "learning_rate": 9.933670799171319e-06, "loss": 0.1881, "num_input_tokens_seen": 34506208, "step": 15990 }, { "epoch": 2.9354009910075245, "grad_norm": 7.306904315948486, "learning_rate": 9.933540737554959e-06, "loss": 0.1905, "num_input_tokens_seen": 34517664, "step": 15995 }, { "epoch": 2.9363185905670766, "grad_norm": 0.16502204537391663, "learning_rate": 9.933410549400924e-06, "loss": 0.1032, "num_input_tokens_seen": 34528512, "step": 16000 }, { "epoch": 2.9372361901266286, "grad_norm": 13.616348266601562, "learning_rate": 9.933280234712552e-06, "loss": 0.6009, "num_input_tokens_seen": 34540288, "step": 16005 }, { "epoch": 2.938153789686181, "grad_norm": 9.977275848388672, "learning_rate": 9.933149793493191e-06, "loss": 0.2288, "num_input_tokens_seen": 34551104, "step": 16010 }, { "epoch": 2.939071389245733, "grad_norm": 15.788668632507324, "learning_rate": 9.933019225746183e-06, "loss": 0.2911, "num_input_tokens_seen": 34562240, "step": 16015 }, { "epoch": 2.9399889888052853, "grad_norm": 0.2723703980445862, "learning_rate": 9.932888531474877e-06, "loss": 0.2105, "num_input_tokens_seen": 34573568, "step": 16020 }, { "epoch": 2.9409065883648378, "grad_norm": 8.926312446594238, "learning_rate": 9.932757710682625e-06, "loss": 0.2051, "num_input_tokens_seen": 34584032, "step": 16025 }, { "epoch": 2.94182418792439, "grad_norm": 3.185567617416382, "learning_rate": 9.932626763372784e-06, "loss": 0.1683, "num_input_tokens_seen": 34593824, "step": 16030 }, { "epoch": 2.942741787483942, "grad_norm": 14.799309730529785, "learning_rate": 9.93249568954871e-06, "loss": 0.2006, "num_input_tokens_seen": 34605152, "step": 16035 }, { "epoch": 2.9436593870434944, "grad_norm": 10.830175399780273, "learning_rate": 9.932364489213767e-06, "loss": 0.3192, "num_input_tokens_seen": 34616704, "step": 16040 }, { "epoch": 2.9445769866030465, "grad_norm": 25.122812271118164, "learning_rate": 9.932233162371318e-06, "loss": 0.1018, "num_input_tokens_seen": 34627424, "step": 16045 }, { "epoch": 2.9454945861625985, "grad_norm": 0.22822335362434387, "learning_rate": 9.932101709024735e-06, "loss": 0.1671, "num_input_tokens_seen": 34638432, "step": 16050 }, { "epoch": 2.946412185722151, "grad_norm": 0.41625261306762695, "learning_rate": 9.931970129177387e-06, "loss": 0.1659, "num_input_tokens_seen": 34648704, "step": 16055 }, { "epoch": 2.947329785281703, "grad_norm": 15.468552589416504, "learning_rate": 9.931838422832646e-06, "loss": 0.2316, "num_input_tokens_seen": 34660224, "step": 16060 }, { "epoch": 2.948247384841255, "grad_norm": 3.918874979019165, "learning_rate": 9.931706589993898e-06, "loss": 0.3219, "num_input_tokens_seen": 34670272, "step": 16065 }, { "epoch": 2.9491649844008077, "grad_norm": 51.325252532958984, "learning_rate": 9.931574630664516e-06, "loss": 0.1699, "num_input_tokens_seen": 34681376, "step": 16070 }, { "epoch": 2.9500825839603597, "grad_norm": 4.665029525756836, "learning_rate": 9.931442544847888e-06, "loss": 0.0902, "num_input_tokens_seen": 34691680, "step": 16075 }, { "epoch": 2.951000183519912, "grad_norm": 0.34593868255615234, "learning_rate": 9.931310332547402e-06, "loss": 0.2083, "num_input_tokens_seen": 34703104, "step": 16080 }, { "epoch": 2.9519177830794643, "grad_norm": 0.6033722758293152, "learning_rate": 9.93117799376645e-06, "loss": 0.0794, "num_input_tokens_seen": 34713472, "step": 16085 }, { "epoch": 2.9528353826390163, "grad_norm": 3.7707886695861816, "learning_rate": 9.931045528508423e-06, "loss": 0.0216, "num_input_tokens_seen": 34723904, "step": 16090 }, { "epoch": 2.9537529821985684, "grad_norm": 15.636699676513672, "learning_rate": 9.930912936776723e-06, "loss": 0.2184, "num_input_tokens_seen": 34735456, "step": 16095 }, { "epoch": 2.954670581758121, "grad_norm": 3.0397274494171143, "learning_rate": 9.930780218574746e-06, "loss": 0.0756, "num_input_tokens_seen": 34745152, "step": 16100 }, { "epoch": 2.955588181317673, "grad_norm": 33.90857696533203, "learning_rate": 9.930647373905901e-06, "loss": 0.2141, "num_input_tokens_seen": 34755104, "step": 16105 }, { "epoch": 2.956505780877225, "grad_norm": 0.07562364637851715, "learning_rate": 9.930514402773591e-06, "loss": 0.2983, "num_input_tokens_seen": 34766112, "step": 16110 }, { "epoch": 2.9574233804367775, "grad_norm": 0.09243784844875336, "learning_rate": 9.93038130518123e-06, "loss": 0.4144, "num_input_tokens_seen": 34776000, "step": 16115 }, { "epoch": 2.9583409799963296, "grad_norm": 0.06081278994679451, "learning_rate": 9.930248081132227e-06, "loss": 0.1954, "num_input_tokens_seen": 34786720, "step": 16120 }, { "epoch": 2.9592585795558817, "grad_norm": 7.574416160583496, "learning_rate": 9.930114730630003e-06, "loss": 0.1245, "num_input_tokens_seen": 34798432, "step": 16125 }, { "epoch": 2.960176179115434, "grad_norm": 39.8139762878418, "learning_rate": 9.929981253677978e-06, "loss": 0.4002, "num_input_tokens_seen": 34809216, "step": 16130 }, { "epoch": 2.9610937786749862, "grad_norm": 83.7325439453125, "learning_rate": 9.929847650279573e-06, "loss": 0.0762, "num_input_tokens_seen": 34821952, "step": 16135 }, { "epoch": 2.9620113782345383, "grad_norm": 24.450910568237305, "learning_rate": 9.929713920438218e-06, "loss": 0.3021, "num_input_tokens_seen": 34831040, "step": 16140 }, { "epoch": 2.962928977794091, "grad_norm": 0.4273886978626251, "learning_rate": 9.929580064157341e-06, "loss": 0.1536, "num_input_tokens_seen": 34842048, "step": 16145 }, { "epoch": 2.963846577353643, "grad_norm": 0.12180273979902267, "learning_rate": 9.929446081440376e-06, "loss": 0.2175, "num_input_tokens_seen": 34853408, "step": 16150 }, { "epoch": 2.964764176913195, "grad_norm": 0.37464165687561035, "learning_rate": 9.929311972290758e-06, "loss": 0.0869, "num_input_tokens_seen": 34863104, "step": 16155 }, { "epoch": 2.9656817764727474, "grad_norm": 0.09337323904037476, "learning_rate": 9.929177736711927e-06, "loss": 0.0568, "num_input_tokens_seen": 34874688, "step": 16160 }, { "epoch": 2.9665993760322995, "grad_norm": 0.10491538792848587, "learning_rate": 9.929043374707329e-06, "loss": 0.2346, "num_input_tokens_seen": 34886336, "step": 16165 }, { "epoch": 2.9675169755918516, "grad_norm": 0.05320350453257561, "learning_rate": 9.928908886280406e-06, "loss": 0.0922, "num_input_tokens_seen": 34898048, "step": 16170 }, { "epoch": 2.968434575151404, "grad_norm": 0.3153159022331238, "learning_rate": 9.92877427143461e-06, "loss": 0.0904, "num_input_tokens_seen": 34909632, "step": 16175 }, { "epoch": 2.969352174710956, "grad_norm": 0.03374204784631729, "learning_rate": 9.928639530173392e-06, "loss": 0.158, "num_input_tokens_seen": 34919776, "step": 16180 }, { "epoch": 2.970269774270508, "grad_norm": 0.2872661352157593, "learning_rate": 9.928504662500209e-06, "loss": 0.3472, "num_input_tokens_seen": 34929792, "step": 16185 }, { "epoch": 2.9711873738300607, "grad_norm": 0.17412734031677246, "learning_rate": 9.92836966841852e-06, "loss": 0.0347, "num_input_tokens_seen": 34940032, "step": 16190 }, { "epoch": 2.9721049733896128, "grad_norm": 7.040235996246338, "learning_rate": 9.928234547931787e-06, "loss": 0.435, "num_input_tokens_seen": 34950560, "step": 16195 }, { "epoch": 2.973022572949165, "grad_norm": 1.070284366607666, "learning_rate": 9.928099301043476e-06, "loss": 0.1696, "num_input_tokens_seen": 34961504, "step": 16200 }, { "epoch": 2.9739401725087173, "grad_norm": 8.739986419677734, "learning_rate": 9.927963927757057e-06, "loss": 0.076, "num_input_tokens_seen": 34972800, "step": 16205 }, { "epoch": 2.9748577720682694, "grad_norm": 7.3231730461120605, "learning_rate": 9.927828428075998e-06, "loss": 0.0468, "num_input_tokens_seen": 34984320, "step": 16210 }, { "epoch": 2.9757753716278215, "grad_norm": 0.821215033531189, "learning_rate": 9.92769280200378e-06, "loss": 0.1663, "num_input_tokens_seen": 34995936, "step": 16215 }, { "epoch": 2.976692971187374, "grad_norm": 11.872807502746582, "learning_rate": 9.927557049543877e-06, "loss": 0.079, "num_input_tokens_seen": 35006944, "step": 16220 }, { "epoch": 2.977610570746926, "grad_norm": 0.03594343736767769, "learning_rate": 9.927421170699775e-06, "loss": 0.3829, "num_input_tokens_seen": 35017824, "step": 16225 }, { "epoch": 2.978528170306478, "grad_norm": 29.304489135742188, "learning_rate": 9.927285165474955e-06, "loss": 0.039, "num_input_tokens_seen": 35029056, "step": 16230 }, { "epoch": 2.9794457698660306, "grad_norm": 23.672359466552734, "learning_rate": 9.927149033872908e-06, "loss": 0.2807, "num_input_tokens_seen": 35037216, "step": 16235 }, { "epoch": 2.9803633694255827, "grad_norm": 1.248062252998352, "learning_rate": 9.927012775897124e-06, "loss": 0.0034, "num_input_tokens_seen": 35048128, "step": 16240 }, { "epoch": 2.9812809689851347, "grad_norm": 0.20932121574878693, "learning_rate": 9.9268763915511e-06, "loss": 0.1621, "num_input_tokens_seen": 35058624, "step": 16245 }, { "epoch": 2.9821985685446872, "grad_norm": 25.47930145263672, "learning_rate": 9.92673988083833e-06, "loss": 0.1691, "num_input_tokens_seen": 35068736, "step": 16250 }, { "epoch": 2.9831161681042393, "grad_norm": 8.609831809997559, "learning_rate": 9.926603243762319e-06, "loss": 0.3658, "num_input_tokens_seen": 35080160, "step": 16255 }, { "epoch": 2.9840337676637914, "grad_norm": 36.97684860229492, "learning_rate": 9.926466480326571e-06, "loss": 0.0496, "num_input_tokens_seen": 35090144, "step": 16260 }, { "epoch": 2.984951367223344, "grad_norm": 39.72750473022461, "learning_rate": 9.92632959053459e-06, "loss": 0.1038, "num_input_tokens_seen": 35101152, "step": 16265 }, { "epoch": 2.985868966782896, "grad_norm": 3.4172539710998535, "learning_rate": 9.926192574389894e-06, "loss": 0.4973, "num_input_tokens_seen": 35113024, "step": 16270 }, { "epoch": 2.986786566342448, "grad_norm": 0.3100101053714752, "learning_rate": 9.926055431895993e-06, "loss": 0.0026, "num_input_tokens_seen": 35123648, "step": 16275 }, { "epoch": 2.9877041659020005, "grad_norm": 3.4547126293182373, "learning_rate": 9.925918163056402e-06, "loss": 0.0385, "num_input_tokens_seen": 35134816, "step": 16280 }, { "epoch": 2.9886217654615526, "grad_norm": 2.4176251888275146, "learning_rate": 9.925780767874648e-06, "loss": 0.2828, "num_input_tokens_seen": 35145536, "step": 16285 }, { "epoch": 2.9895393650211046, "grad_norm": 9.389872550964355, "learning_rate": 9.92564324635425e-06, "loss": 0.3086, "num_input_tokens_seen": 35156672, "step": 16290 }, { "epoch": 2.990456964580657, "grad_norm": 34.52143859863281, "learning_rate": 9.925505598498738e-06, "loss": 0.4378, "num_input_tokens_seen": 35166976, "step": 16295 }, { "epoch": 2.991374564140209, "grad_norm": 39.050052642822266, "learning_rate": 9.925367824311639e-06, "loss": 0.3841, "num_input_tokens_seen": 35176896, "step": 16300 }, { "epoch": 2.9922921636997613, "grad_norm": 0.3476350009441376, "learning_rate": 9.92522992379649e-06, "loss": 0.3205, "num_input_tokens_seen": 35187040, "step": 16305 }, { "epoch": 2.9932097632593138, "grad_norm": 29.53853416442871, "learning_rate": 9.925091896956827e-06, "loss": 0.2122, "num_input_tokens_seen": 35196800, "step": 16310 }, { "epoch": 2.994127362818866, "grad_norm": 10.353729248046875, "learning_rate": 9.92495374379619e-06, "loss": 0.3241, "num_input_tokens_seen": 35207648, "step": 16315 }, { "epoch": 2.995044962378418, "grad_norm": 0.6458294987678528, "learning_rate": 9.924815464318121e-06, "loss": 0.4046, "num_input_tokens_seen": 35218304, "step": 16320 }, { "epoch": 2.9959625619379704, "grad_norm": 2.4162092208862305, "learning_rate": 9.92467705852617e-06, "loss": 0.1522, "num_input_tokens_seen": 35229408, "step": 16325 }, { "epoch": 2.9968801614975225, "grad_norm": 1.659819483757019, "learning_rate": 9.924538526423884e-06, "loss": 0.1853, "num_input_tokens_seen": 35238976, "step": 16330 }, { "epoch": 2.9977977610570745, "grad_norm": 18.066730499267578, "learning_rate": 9.924399868014817e-06, "loss": 0.2239, "num_input_tokens_seen": 35249312, "step": 16335 }, { "epoch": 2.998715360616627, "grad_norm": 19.976587295532227, "learning_rate": 9.924261083302528e-06, "loss": 0.1173, "num_input_tokens_seen": 35259328, "step": 16340 }, { "epoch": 2.999632960176179, "grad_norm": 0.45383742451667786, "learning_rate": 9.924122172290571e-06, "loss": 0.0868, "num_input_tokens_seen": 35270048, "step": 16345 }, { "epoch": 3.000550559735731, "grad_norm": 2.2664618492126465, "learning_rate": 9.923983134982514e-06, "loss": 0.2185, "num_input_tokens_seen": 35280544, "step": 16350 }, { "epoch": 3.0014681592952837, "grad_norm": 4.395901203155518, "learning_rate": 9.92384397138192e-06, "loss": 0.0913, "num_input_tokens_seen": 35291424, "step": 16355 }, { "epoch": 3.0023857588548357, "grad_norm": 0.22241820394992828, "learning_rate": 9.92370468149236e-06, "loss": 0.2953, "num_input_tokens_seen": 35301696, "step": 16360 }, { "epoch": 3.003303358414388, "grad_norm": 0.1121855303645134, "learning_rate": 9.923565265317406e-06, "loss": 0.2839, "num_input_tokens_seen": 35312448, "step": 16365 }, { "epoch": 3.0042209579739403, "grad_norm": 26.755090713500977, "learning_rate": 9.923425722860633e-06, "loss": 0.1686, "num_input_tokens_seen": 35322208, "step": 16370 }, { "epoch": 3.0051385575334923, "grad_norm": 6.172614097595215, "learning_rate": 9.923286054125621e-06, "loss": 0.2019, "num_input_tokens_seen": 35333472, "step": 16375 }, { "epoch": 3.0060561570930444, "grad_norm": 0.13816386461257935, "learning_rate": 9.923146259115953e-06, "loss": 0.1481, "num_input_tokens_seen": 35343328, "step": 16380 }, { "epoch": 3.006973756652597, "grad_norm": 75.16400146484375, "learning_rate": 9.923006337835213e-06, "loss": 0.3835, "num_input_tokens_seen": 35352992, "step": 16385 }, { "epoch": 3.007891356212149, "grad_norm": 9.308116912841797, "learning_rate": 9.92286629028699e-06, "loss": 0.1222, "num_input_tokens_seen": 35363552, "step": 16390 }, { "epoch": 3.008808955771701, "grad_norm": 0.7076749205589294, "learning_rate": 9.922726116474877e-06, "loss": 0.0627, "num_input_tokens_seen": 35374496, "step": 16395 }, { "epoch": 3.0097265553312536, "grad_norm": 56.80408477783203, "learning_rate": 9.922585816402468e-06, "loss": 0.4012, "num_input_tokens_seen": 35382496, "step": 16400 }, { "epoch": 3.0106441548908056, "grad_norm": 32.753326416015625, "learning_rate": 9.922445390073363e-06, "loss": 0.1942, "num_input_tokens_seen": 35393888, "step": 16405 }, { "epoch": 3.0115617544503577, "grad_norm": 7.619338035583496, "learning_rate": 9.922304837491164e-06, "loss": 0.3267, "num_input_tokens_seen": 35404224, "step": 16410 }, { "epoch": 3.01247935400991, "grad_norm": 10.170700073242188, "learning_rate": 9.922164158659472e-06, "loss": 0.119, "num_input_tokens_seen": 35414208, "step": 16415 }, { "epoch": 3.0133969535694622, "grad_norm": 19.63651466369629, "learning_rate": 9.9220233535819e-06, "loss": 0.1822, "num_input_tokens_seen": 35425280, "step": 16420 }, { "epoch": 3.0143145531290143, "grad_norm": 11.24061107635498, "learning_rate": 9.921882422262057e-06, "loss": 0.166, "num_input_tokens_seen": 35436224, "step": 16425 }, { "epoch": 3.015232152688567, "grad_norm": 0.23805975914001465, "learning_rate": 9.921741364703557e-06, "loss": 0.1586, "num_input_tokens_seen": 35447456, "step": 16430 }, { "epoch": 3.016149752248119, "grad_norm": 1.6594548225402832, "learning_rate": 9.921600180910019e-06, "loss": 0.7032, "num_input_tokens_seen": 35458144, "step": 16435 }, { "epoch": 3.017067351807671, "grad_norm": 26.18692970275879, "learning_rate": 9.921458870885066e-06, "loss": 0.1162, "num_input_tokens_seen": 35469696, "step": 16440 }, { "epoch": 3.0179849513672234, "grad_norm": 9.338956832885742, "learning_rate": 9.921317434632318e-06, "loss": 0.1426, "num_input_tokens_seen": 35481056, "step": 16445 }, { "epoch": 3.0189025509267755, "grad_norm": 3.7338521480560303, "learning_rate": 9.921175872155408e-06, "loss": 0.0583, "num_input_tokens_seen": 35491040, "step": 16450 }, { "epoch": 3.0198201504863276, "grad_norm": 10.896157264709473, "learning_rate": 9.921034183457963e-06, "loss": 0.3273, "num_input_tokens_seen": 35501600, "step": 16455 }, { "epoch": 3.02073775004588, "grad_norm": 0.7325127720832825, "learning_rate": 9.920892368543617e-06, "loss": 0.0737, "num_input_tokens_seen": 35512512, "step": 16460 }, { "epoch": 3.021655349605432, "grad_norm": 51.80061340332031, "learning_rate": 9.920750427416008e-06, "loss": 0.1539, "num_input_tokens_seen": 35523136, "step": 16465 }, { "epoch": 3.022572949164984, "grad_norm": 1.7815051078796387, "learning_rate": 9.920608360078778e-06, "loss": 0.2209, "num_input_tokens_seen": 35534880, "step": 16470 }, { "epoch": 3.0234905487245367, "grad_norm": 0.18095234036445618, "learning_rate": 9.920466166535571e-06, "loss": 0.1644, "num_input_tokens_seen": 35545984, "step": 16475 }, { "epoch": 3.0244081482840888, "grad_norm": 0.1362944096326828, "learning_rate": 9.920323846790032e-06, "loss": 0.0234, "num_input_tokens_seen": 35557600, "step": 16480 }, { "epoch": 3.025325747843641, "grad_norm": 39.79641342163086, "learning_rate": 9.920181400845811e-06, "loss": 0.4176, "num_input_tokens_seen": 35568768, "step": 16485 }, { "epoch": 3.0262433474031933, "grad_norm": 0.16118623316287994, "learning_rate": 9.920038828706563e-06, "loss": 0.3545, "num_input_tokens_seen": 35580928, "step": 16490 }, { "epoch": 3.0271609469627454, "grad_norm": 1.48127281665802, "learning_rate": 9.919896130375947e-06, "loss": 0.1145, "num_input_tokens_seen": 35591712, "step": 16495 }, { "epoch": 3.0280785465222975, "grad_norm": 0.40029674768447876, "learning_rate": 9.919753305857618e-06, "loss": 0.0792, "num_input_tokens_seen": 35603072, "step": 16500 }, { "epoch": 3.02899614608185, "grad_norm": 0.32795846462249756, "learning_rate": 9.919610355155243e-06, "loss": 0.2548, "num_input_tokens_seen": 35613216, "step": 16505 }, { "epoch": 3.029913745641402, "grad_norm": 0.21782755851745605, "learning_rate": 9.919467278272485e-06, "loss": 0.2431, "num_input_tokens_seen": 35624160, "step": 16510 }, { "epoch": 3.030831345200954, "grad_norm": 0.644412100315094, "learning_rate": 9.919324075213016e-06, "loss": 0.0569, "num_input_tokens_seen": 35636000, "step": 16515 }, { "epoch": 3.0317489447605066, "grad_norm": 0.11704204231500626, "learning_rate": 9.91918074598051e-06, "loss": 0.1096, "num_input_tokens_seen": 35646688, "step": 16520 }, { "epoch": 3.0326665443200587, "grad_norm": 6.404811382293701, "learning_rate": 9.919037290578644e-06, "loss": 0.3185, "num_input_tokens_seen": 35655680, "step": 16525 }, { "epoch": 3.0335841438796107, "grad_norm": 0.1829366236925125, "learning_rate": 9.918893709011092e-06, "loss": 0.1455, "num_input_tokens_seen": 35667232, "step": 16530 }, { "epoch": 3.0345017434391632, "grad_norm": 0.10325075685977936, "learning_rate": 9.918750001281541e-06, "loss": 0.334, "num_input_tokens_seen": 35677952, "step": 16535 }, { "epoch": 3.0354193429987153, "grad_norm": 12.059989929199219, "learning_rate": 9.918606167393675e-06, "loss": 0.2131, "num_input_tokens_seen": 35689088, "step": 16540 }, { "epoch": 3.0363369425582674, "grad_norm": 0.8487570285797119, "learning_rate": 9.918462207351185e-06, "loss": 0.1721, "num_input_tokens_seen": 35700384, "step": 16545 }, { "epoch": 3.03725454211782, "grad_norm": 5.2537312507629395, "learning_rate": 9.918318121157762e-06, "loss": 0.2268, "num_input_tokens_seen": 35711264, "step": 16550 }, { "epoch": 3.038172141677372, "grad_norm": 11.8935546875, "learning_rate": 9.918173908817101e-06, "loss": 0.0371, "num_input_tokens_seen": 35721920, "step": 16555 }, { "epoch": 3.039089741236924, "grad_norm": 27.509183883666992, "learning_rate": 9.918029570332903e-06, "loss": 0.1036, "num_input_tokens_seen": 35732256, "step": 16560 }, { "epoch": 3.0400073407964765, "grad_norm": 0.1982250213623047, "learning_rate": 9.91788510570887e-06, "loss": 0.0142, "num_input_tokens_seen": 35744480, "step": 16565 }, { "epoch": 3.0409249403560286, "grad_norm": 40.29535675048828, "learning_rate": 9.917740514948704e-06, "loss": 0.266, "num_input_tokens_seen": 35756000, "step": 16570 }, { "epoch": 3.0418425399155806, "grad_norm": 0.1823749542236328, "learning_rate": 9.917595798056116e-06, "loss": 0.0468, "num_input_tokens_seen": 35766048, "step": 16575 }, { "epoch": 3.042760139475133, "grad_norm": 52.26401901245117, "learning_rate": 9.917450955034818e-06, "loss": 0.1646, "num_input_tokens_seen": 35776960, "step": 16580 }, { "epoch": 3.043677739034685, "grad_norm": 0.5052943229675293, "learning_rate": 9.917305985888523e-06, "loss": 0.5731, "num_input_tokens_seen": 35787744, "step": 16585 }, { "epoch": 3.0445953385942373, "grad_norm": 21.956602096557617, "learning_rate": 9.917160890620952e-06, "loss": 0.3604, "num_input_tokens_seen": 35799424, "step": 16590 }, { "epoch": 3.0455129381537898, "grad_norm": 10.229998588562012, "learning_rate": 9.917015669235823e-06, "loss": 0.2726, "num_input_tokens_seen": 35808768, "step": 16595 }, { "epoch": 3.046430537713342, "grad_norm": 2.2006452083587646, "learning_rate": 9.916870321736864e-06, "loss": 0.138, "num_input_tokens_seen": 35819936, "step": 16600 }, { "epoch": 3.047348137272894, "grad_norm": 1.4605062007904053, "learning_rate": 9.916724848127803e-06, "loss": 0.105, "num_input_tokens_seen": 35830272, "step": 16605 }, { "epoch": 3.0482657368324464, "grad_norm": 0.3166528642177582, "learning_rate": 9.916579248412368e-06, "loss": 0.2361, "num_input_tokens_seen": 35841184, "step": 16610 }, { "epoch": 3.0491833363919985, "grad_norm": 1.2866387367248535, "learning_rate": 9.916433522594296e-06, "loss": 0.0047, "num_input_tokens_seen": 35851776, "step": 16615 }, { "epoch": 3.0501009359515505, "grad_norm": 0.10057788342237473, "learning_rate": 9.916287670677325e-06, "loss": 0.2103, "num_input_tokens_seen": 35860928, "step": 16620 }, { "epoch": 3.051018535511103, "grad_norm": 9.263368606567383, "learning_rate": 9.916141692665193e-06, "loss": 0.1193, "num_input_tokens_seen": 35873056, "step": 16625 }, { "epoch": 3.051936135070655, "grad_norm": 0.2088528275489807, "learning_rate": 9.915995588561647e-06, "loss": 0.0591, "num_input_tokens_seen": 35884320, "step": 16630 }, { "epoch": 3.0528537346302076, "grad_norm": 0.19873298704624176, "learning_rate": 9.915849358370433e-06, "loss": 0.1909, "num_input_tokens_seen": 35895104, "step": 16635 }, { "epoch": 3.0537713341897597, "grad_norm": 0.0654933974146843, "learning_rate": 9.9157030020953e-06, "loss": 0.0414, "num_input_tokens_seen": 35906752, "step": 16640 }, { "epoch": 3.0546889337493117, "grad_norm": 0.4147285521030426, "learning_rate": 9.915556519740008e-06, "loss": 0.1915, "num_input_tokens_seen": 35917824, "step": 16645 }, { "epoch": 3.0556065333088642, "grad_norm": 0.15779495239257812, "learning_rate": 9.915409911308307e-06, "loss": 0.3907, "num_input_tokens_seen": 35928992, "step": 16650 }, { "epoch": 3.0565241328684163, "grad_norm": 0.08505544066429138, "learning_rate": 9.91526317680396e-06, "loss": 0.0052, "num_input_tokens_seen": 35938976, "step": 16655 }, { "epoch": 3.0574417324279684, "grad_norm": 10.1148681640625, "learning_rate": 9.915116316230731e-06, "loss": 0.1104, "num_input_tokens_seen": 35949696, "step": 16660 }, { "epoch": 3.058359331987521, "grad_norm": 13.908896446228027, "learning_rate": 9.914969329592386e-06, "loss": 0.3359, "num_input_tokens_seen": 35961152, "step": 16665 }, { "epoch": 3.059276931547073, "grad_norm": 28.958738327026367, "learning_rate": 9.914822216892694e-06, "loss": 0.1663, "num_input_tokens_seen": 35973024, "step": 16670 }, { "epoch": 3.060194531106625, "grad_norm": 0.07944265753030777, "learning_rate": 9.91467497813543e-06, "loss": 0.06, "num_input_tokens_seen": 35985088, "step": 16675 }, { "epoch": 3.0611121306661775, "grad_norm": 0.06909836828708649, "learning_rate": 9.91452761332437e-06, "loss": 0.1723, "num_input_tokens_seen": 35994720, "step": 16680 }, { "epoch": 3.0620297302257296, "grad_norm": 15.220403671264648, "learning_rate": 9.914380122463295e-06, "loss": 0.1383, "num_input_tokens_seen": 36005120, "step": 16685 }, { "epoch": 3.0629473297852816, "grad_norm": 19.751680374145508, "learning_rate": 9.914232505555985e-06, "loss": 0.1625, "num_input_tokens_seen": 36017120, "step": 16690 }, { "epoch": 3.063864929344834, "grad_norm": 21.253524780273438, "learning_rate": 9.91408476260623e-06, "loss": 0.198, "num_input_tokens_seen": 36027392, "step": 16695 }, { "epoch": 3.064782528904386, "grad_norm": 33.19059371948242, "learning_rate": 9.913936893617813e-06, "loss": 0.1789, "num_input_tokens_seen": 36039264, "step": 16700 }, { "epoch": 3.0657001284639382, "grad_norm": 19.176258087158203, "learning_rate": 9.913788898594532e-06, "loss": 0.2488, "num_input_tokens_seen": 36049824, "step": 16705 }, { "epoch": 3.0666177280234908, "grad_norm": 9.023531913757324, "learning_rate": 9.913640777540183e-06, "loss": 0.2375, "num_input_tokens_seen": 36061056, "step": 16710 }, { "epoch": 3.067535327583043, "grad_norm": 7.854964733123779, "learning_rate": 9.913492530458561e-06, "loss": 0.1319, "num_input_tokens_seen": 36071424, "step": 16715 }, { "epoch": 3.068452927142595, "grad_norm": 0.539518415927887, "learning_rate": 9.913344157353472e-06, "loss": 0.042, "num_input_tokens_seen": 36081440, "step": 16720 }, { "epoch": 3.0693705267021474, "grad_norm": 0.7223531007766724, "learning_rate": 9.913195658228722e-06, "loss": 0.0101, "num_input_tokens_seen": 36092864, "step": 16725 }, { "epoch": 3.0702881262616994, "grad_norm": 10.314872741699219, "learning_rate": 9.913047033088117e-06, "loss": 0.2548, "num_input_tokens_seen": 36105024, "step": 16730 }, { "epoch": 3.0712057258212515, "grad_norm": 4.545102596282959, "learning_rate": 9.91289828193547e-06, "loss": 0.281, "num_input_tokens_seen": 36115616, "step": 16735 }, { "epoch": 3.072123325380804, "grad_norm": 4.74938440322876, "learning_rate": 9.912749404774596e-06, "loss": 0.2256, "num_input_tokens_seen": 36124672, "step": 16740 }, { "epoch": 3.073040924940356, "grad_norm": 4.797555923461914, "learning_rate": 9.912600401609314e-06, "loss": 0.3976, "num_input_tokens_seen": 36136160, "step": 16745 }, { "epoch": 3.073958524499908, "grad_norm": 6.988974094390869, "learning_rate": 9.912451272443444e-06, "loss": 0.1693, "num_input_tokens_seen": 36147008, "step": 16750 }, { "epoch": 3.0748761240594606, "grad_norm": 0.3738310933113098, "learning_rate": 9.912302017280814e-06, "loss": 0.1005, "num_input_tokens_seen": 36158080, "step": 16755 }, { "epoch": 3.0757937236190127, "grad_norm": 2.3272430896759033, "learning_rate": 9.912152636125252e-06, "loss": 0.0671, "num_input_tokens_seen": 36167296, "step": 16760 }, { "epoch": 3.0767113231785648, "grad_norm": 12.372183799743652, "learning_rate": 9.912003128980588e-06, "loss": 0.2377, "num_input_tokens_seen": 36178944, "step": 16765 }, { "epoch": 3.0776289227381173, "grad_norm": 0.2839040458202362, "learning_rate": 9.911853495850653e-06, "loss": 0.066, "num_input_tokens_seen": 36188096, "step": 16770 }, { "epoch": 3.0785465222976693, "grad_norm": 0.9837242960929871, "learning_rate": 9.911703736739292e-06, "loss": 0.1745, "num_input_tokens_seen": 36199360, "step": 16775 }, { "epoch": 3.0794641218572214, "grad_norm": 0.4530782401561737, "learning_rate": 9.911553851650342e-06, "loss": 0.1551, "num_input_tokens_seen": 36210464, "step": 16780 }, { "epoch": 3.080381721416774, "grad_norm": 19.063554763793945, "learning_rate": 9.911403840587648e-06, "loss": 0.4193, "num_input_tokens_seen": 36220192, "step": 16785 }, { "epoch": 3.081299320976326, "grad_norm": 0.12916912138462067, "learning_rate": 9.911253703555055e-06, "loss": 0.4765, "num_input_tokens_seen": 36231008, "step": 16790 }, { "epoch": 3.082216920535878, "grad_norm": 0.25956451892852783, "learning_rate": 9.91110344055642e-06, "loss": 0.0652, "num_input_tokens_seen": 36242272, "step": 16795 }, { "epoch": 3.0831345200954305, "grad_norm": 49.895050048828125, "learning_rate": 9.910953051595591e-06, "loss": 0.1347, "num_input_tokens_seen": 36252704, "step": 16800 }, { "epoch": 3.0840521196549826, "grad_norm": 13.320866584777832, "learning_rate": 9.910802536676427e-06, "loss": 0.103, "num_input_tokens_seen": 36263424, "step": 16805 }, { "epoch": 3.0849697192145347, "grad_norm": 7.0469069480896, "learning_rate": 9.910651895802791e-06, "loss": 0.2275, "num_input_tokens_seen": 36275072, "step": 16810 }, { "epoch": 3.085887318774087, "grad_norm": 10.847735404968262, "learning_rate": 9.910501128978543e-06, "loss": 0.3202, "num_input_tokens_seen": 36285600, "step": 16815 }, { "epoch": 3.0868049183336392, "grad_norm": 1.856876015663147, "learning_rate": 9.910350236207554e-06, "loss": 0.1804, "num_input_tokens_seen": 36294944, "step": 16820 }, { "epoch": 3.0877225178931913, "grad_norm": 0.8095316886901855, "learning_rate": 9.910199217493688e-06, "loss": 0.0195, "num_input_tokens_seen": 36305120, "step": 16825 }, { "epoch": 3.088640117452744, "grad_norm": 0.40907734632492065, "learning_rate": 9.910048072840825e-06, "loss": 0.0867, "num_input_tokens_seen": 36316832, "step": 16830 }, { "epoch": 3.089557717012296, "grad_norm": 33.27989959716797, "learning_rate": 9.909896802252838e-06, "loss": 0.3422, "num_input_tokens_seen": 36327424, "step": 16835 }, { "epoch": 3.090475316571848, "grad_norm": 8.318965911865234, "learning_rate": 9.909745405733609e-06, "loss": 0.1095, "num_input_tokens_seen": 36339040, "step": 16840 }, { "epoch": 3.0913929161314004, "grad_norm": 14.154346466064453, "learning_rate": 9.909593883287016e-06, "loss": 0.3458, "num_input_tokens_seen": 36350240, "step": 16845 }, { "epoch": 3.0923105156909525, "grad_norm": 16.50423812866211, "learning_rate": 9.909442234916953e-06, "loss": 0.1784, "num_input_tokens_seen": 36361376, "step": 16850 }, { "epoch": 3.0932281152505046, "grad_norm": 16.910219192504883, "learning_rate": 9.909290460627304e-06, "loss": 0.2626, "num_input_tokens_seen": 36373632, "step": 16855 }, { "epoch": 3.094145714810057, "grad_norm": 0.8867322206497192, "learning_rate": 9.909138560421964e-06, "loss": 0.195, "num_input_tokens_seen": 36384512, "step": 16860 }, { "epoch": 3.095063314369609, "grad_norm": 0.16302472352981567, "learning_rate": 9.908986534304827e-06, "loss": 0.4002, "num_input_tokens_seen": 36395072, "step": 16865 }, { "epoch": 3.095980913929161, "grad_norm": 9.188971519470215, "learning_rate": 9.908834382279795e-06, "loss": 0.269, "num_input_tokens_seen": 36405376, "step": 16870 }, { "epoch": 3.0968985134887137, "grad_norm": 11.313298225402832, "learning_rate": 9.908682104350769e-06, "loss": 0.1653, "num_input_tokens_seen": 36417632, "step": 16875 }, { "epoch": 3.0978161130482658, "grad_norm": 0.29467278718948364, "learning_rate": 9.908529700521654e-06, "loss": 0.1019, "num_input_tokens_seen": 36427232, "step": 16880 }, { "epoch": 3.098733712607818, "grad_norm": 10.702275276184082, "learning_rate": 9.908377170796362e-06, "loss": 0.0726, "num_input_tokens_seen": 36438336, "step": 16885 }, { "epoch": 3.0996513121673703, "grad_norm": 6.078681468963623, "learning_rate": 9.9082245151788e-06, "loss": 0.1321, "num_input_tokens_seen": 36449792, "step": 16890 }, { "epoch": 3.1005689117269224, "grad_norm": 0.3727269768714905, "learning_rate": 9.908071733672886e-06, "loss": 0.1098, "num_input_tokens_seen": 36461472, "step": 16895 }, { "epoch": 3.1014865112864745, "grad_norm": 0.3752902150154114, "learning_rate": 9.90791882628254e-06, "loss": 0.0414, "num_input_tokens_seen": 36471808, "step": 16900 }, { "epoch": 3.102404110846027, "grad_norm": 0.04347207769751549, "learning_rate": 9.907765793011684e-06, "loss": 0.1384, "num_input_tokens_seen": 36482944, "step": 16905 }, { "epoch": 3.103321710405579, "grad_norm": 0.24103903770446777, "learning_rate": 9.90761263386424e-06, "loss": 0.4084, "num_input_tokens_seen": 36493376, "step": 16910 }, { "epoch": 3.104239309965131, "grad_norm": 0.1511916071176529, "learning_rate": 9.90745934884414e-06, "loss": 0.0466, "num_input_tokens_seen": 36504160, "step": 16915 }, { "epoch": 3.1051569095246836, "grad_norm": 7.740330219268799, "learning_rate": 9.907305937955312e-06, "loss": 0.308, "num_input_tokens_seen": 36514336, "step": 16920 }, { "epoch": 3.1060745090842357, "grad_norm": 19.865774154663086, "learning_rate": 9.907152401201693e-06, "loss": 0.0204, "num_input_tokens_seen": 36525472, "step": 16925 }, { "epoch": 3.1069921086437877, "grad_norm": 0.20957805216312408, "learning_rate": 9.906998738587219e-06, "loss": 0.0111, "num_input_tokens_seen": 36534848, "step": 16930 }, { "epoch": 3.1079097082033402, "grad_norm": 26.864397048950195, "learning_rate": 9.906844950115836e-06, "loss": 0.2321, "num_input_tokens_seen": 36544000, "step": 16935 }, { "epoch": 3.1088273077628923, "grad_norm": 42.59136962890625, "learning_rate": 9.90669103579148e-06, "loss": 0.1969, "num_input_tokens_seen": 36553696, "step": 16940 }, { "epoch": 3.1097449073224444, "grad_norm": 27.63962745666504, "learning_rate": 9.906536995618106e-06, "loss": 0.1559, "num_input_tokens_seen": 36564896, "step": 16945 }, { "epoch": 3.110662506881997, "grad_norm": 0.4537482261657715, "learning_rate": 9.906382829599664e-06, "loss": 0.1053, "num_input_tokens_seen": 36575392, "step": 16950 }, { "epoch": 3.111580106441549, "grad_norm": 0.041712239384651184, "learning_rate": 9.906228537740103e-06, "loss": 0.2551, "num_input_tokens_seen": 36585600, "step": 16955 }, { "epoch": 3.112497706001101, "grad_norm": 0.36912471055984497, "learning_rate": 9.906074120043387e-06, "loss": 0.0104, "num_input_tokens_seen": 36595552, "step": 16960 }, { "epoch": 3.1134153055606535, "grad_norm": 1.1518961191177368, "learning_rate": 9.905919576513473e-06, "loss": 0.0681, "num_input_tokens_seen": 36606304, "step": 16965 }, { "epoch": 3.1143329051202056, "grad_norm": 0.11742287129163742, "learning_rate": 9.905764907154325e-06, "loss": 0.0596, "num_input_tokens_seen": 36617344, "step": 16970 }, { "epoch": 3.1152505046797576, "grad_norm": 38.197628021240234, "learning_rate": 9.90561011196991e-06, "loss": 0.3491, "num_input_tokens_seen": 36628992, "step": 16975 }, { "epoch": 3.11616810423931, "grad_norm": 2.316997528076172, "learning_rate": 9.9054551909642e-06, "loss": 0.3758, "num_input_tokens_seen": 36640064, "step": 16980 }, { "epoch": 3.117085703798862, "grad_norm": 2.6990113258361816, "learning_rate": 9.905300144141165e-06, "loss": 0.223, "num_input_tokens_seen": 36651072, "step": 16985 }, { "epoch": 3.1180033033584142, "grad_norm": 44.49589920043945, "learning_rate": 9.905144971504786e-06, "loss": 0.0435, "num_input_tokens_seen": 36662528, "step": 16990 }, { "epoch": 3.1189209029179668, "grad_norm": 0.131882905960083, "learning_rate": 9.904989673059038e-06, "loss": 0.0011, "num_input_tokens_seen": 36673632, "step": 16995 }, { "epoch": 3.119838502477519, "grad_norm": 10.884339332580566, "learning_rate": 9.90483424880791e-06, "loss": 0.2745, "num_input_tokens_seen": 36684480, "step": 17000 }, { "epoch": 3.120756102037071, "grad_norm": 0.61845862865448, "learning_rate": 9.904678698755383e-06, "loss": 0.0829, "num_input_tokens_seen": 36695520, "step": 17005 }, { "epoch": 3.1216737015966234, "grad_norm": 8.365193367004395, "learning_rate": 9.90452302290545e-06, "loss": 0.1954, "num_input_tokens_seen": 36706944, "step": 17010 }, { "epoch": 3.1225913011561754, "grad_norm": 0.09238845109939575, "learning_rate": 9.904367221262103e-06, "loss": 0.2528, "num_input_tokens_seen": 36717696, "step": 17015 }, { "epoch": 3.1235089007157275, "grad_norm": 0.14668898284435272, "learning_rate": 9.904211293829339e-06, "loss": 0.4266, "num_input_tokens_seen": 36727520, "step": 17020 }, { "epoch": 3.12442650027528, "grad_norm": 18.263744354248047, "learning_rate": 9.904055240611153e-06, "loss": 0.2541, "num_input_tokens_seen": 36738912, "step": 17025 }, { "epoch": 3.125344099834832, "grad_norm": 19.00367546081543, "learning_rate": 9.903899061611553e-06, "loss": 0.4395, "num_input_tokens_seen": 36749312, "step": 17030 }, { "epoch": 3.126261699394384, "grad_norm": 0.4194754958152771, "learning_rate": 9.903742756834543e-06, "loss": 0.1165, "num_input_tokens_seen": 36762464, "step": 17035 }, { "epoch": 3.1271792989539366, "grad_norm": 0.18136478960514069, "learning_rate": 9.90358632628413e-06, "loss": 0.2795, "num_input_tokens_seen": 36774528, "step": 17040 }, { "epoch": 3.1280968985134887, "grad_norm": 0.23639069497585297, "learning_rate": 9.903429769964326e-06, "loss": 0.0177, "num_input_tokens_seen": 36784448, "step": 17045 }, { "epoch": 3.1290144980730408, "grad_norm": 0.12191440910100937, "learning_rate": 9.90327308787915e-06, "loss": 0.0924, "num_input_tokens_seen": 36795328, "step": 17050 }, { "epoch": 3.1299320976325933, "grad_norm": 2.666790008544922, "learning_rate": 9.903116280032618e-06, "loss": 0.019, "num_input_tokens_seen": 36806208, "step": 17055 }, { "epoch": 3.1308496971921453, "grad_norm": 0.05040891841053963, "learning_rate": 9.902959346428753e-06, "loss": 0.1291, "num_input_tokens_seen": 36818304, "step": 17060 }, { "epoch": 3.1317672967516974, "grad_norm": 2.672271728515625, "learning_rate": 9.90280228707158e-06, "loss": 0.2137, "num_input_tokens_seen": 36828576, "step": 17065 }, { "epoch": 3.13268489631125, "grad_norm": 38.15877151489258, "learning_rate": 9.902645101965127e-06, "loss": 0.2665, "num_input_tokens_seen": 36839392, "step": 17070 }, { "epoch": 3.133602495870802, "grad_norm": 11.8862943649292, "learning_rate": 9.902487791113426e-06, "loss": 0.2264, "num_input_tokens_seen": 36850624, "step": 17075 }, { "epoch": 3.134520095430354, "grad_norm": 10.738024711608887, "learning_rate": 9.902330354520511e-06, "loss": 0.1565, "num_input_tokens_seen": 36860864, "step": 17080 }, { "epoch": 3.1354376949899065, "grad_norm": 13.252535820007324, "learning_rate": 9.902172792190418e-06, "loss": 0.6309, "num_input_tokens_seen": 36872352, "step": 17085 }, { "epoch": 3.1363552945494586, "grad_norm": 0.1248968094587326, "learning_rate": 9.902015104127194e-06, "loss": 0.1072, "num_input_tokens_seen": 36884352, "step": 17090 }, { "epoch": 3.1372728941090107, "grad_norm": 21.523576736450195, "learning_rate": 9.901857290334878e-06, "loss": 0.1727, "num_input_tokens_seen": 36896000, "step": 17095 }, { "epoch": 3.138190493668563, "grad_norm": 0.2597455084323883, "learning_rate": 9.901699350817519e-06, "loss": 0.2243, "num_input_tokens_seen": 36907424, "step": 17100 }, { "epoch": 3.1391080932281152, "grad_norm": 0.17424239218235016, "learning_rate": 9.901541285579171e-06, "loss": 0.1049, "num_input_tokens_seen": 36919232, "step": 17105 }, { "epoch": 3.1400256927876673, "grad_norm": 0.3420802354812622, "learning_rate": 9.901383094623883e-06, "loss": 0.15, "num_input_tokens_seen": 36930976, "step": 17110 }, { "epoch": 3.14094329234722, "grad_norm": 2.2461066246032715, "learning_rate": 9.901224777955718e-06, "loss": 0.1091, "num_input_tokens_seen": 36941792, "step": 17115 }, { "epoch": 3.141860891906772, "grad_norm": 1.8072046041488647, "learning_rate": 9.901066335578732e-06, "loss": 0.2038, "num_input_tokens_seen": 36952352, "step": 17120 }, { "epoch": 3.142778491466324, "grad_norm": 0.5592846870422363, "learning_rate": 9.900907767496992e-06, "loss": 0.0055, "num_input_tokens_seen": 36964544, "step": 17125 }, { "epoch": 3.1436960910258764, "grad_norm": 0.04554516822099686, "learning_rate": 9.900749073714562e-06, "loss": 0.0121, "num_input_tokens_seen": 36975072, "step": 17130 }, { "epoch": 3.1446136905854285, "grad_norm": 34.224334716796875, "learning_rate": 9.900590254235513e-06, "loss": 0.3916, "num_input_tokens_seen": 36984448, "step": 17135 }, { "epoch": 3.1455312901449806, "grad_norm": 10.701871871948242, "learning_rate": 9.90043130906392e-06, "loss": 0.5342, "num_input_tokens_seen": 36995232, "step": 17140 }, { "epoch": 3.146448889704533, "grad_norm": 13.586530685424805, "learning_rate": 9.90027223820386e-06, "loss": 0.2459, "num_input_tokens_seen": 37007136, "step": 17145 }, { "epoch": 3.147366489264085, "grad_norm": 6.4806718826293945, "learning_rate": 9.90011304165941e-06, "loss": 0.3912, "num_input_tokens_seen": 37018112, "step": 17150 }, { "epoch": 3.148284088823637, "grad_norm": 0.32812732458114624, "learning_rate": 9.899953719434655e-06, "loss": 0.2102, "num_input_tokens_seen": 37028800, "step": 17155 }, { "epoch": 3.1492016883831897, "grad_norm": 0.32482123374938965, "learning_rate": 9.899794271533684e-06, "loss": 0.2219, "num_input_tokens_seen": 37039424, "step": 17160 }, { "epoch": 3.1501192879427418, "grad_norm": 2.829625129699707, "learning_rate": 9.899634697960582e-06, "loss": 0.1301, "num_input_tokens_seen": 37049472, "step": 17165 }, { "epoch": 3.151036887502294, "grad_norm": 0.42224231362342834, "learning_rate": 9.899474998719443e-06, "loss": 0.0447, "num_input_tokens_seen": 37060896, "step": 17170 }, { "epoch": 3.1519544870618463, "grad_norm": 0.2046075463294983, "learning_rate": 9.899315173814366e-06, "loss": 0.021, "num_input_tokens_seen": 37070848, "step": 17175 }, { "epoch": 3.1528720866213984, "grad_norm": 8.895466804504395, "learning_rate": 9.899155223249445e-06, "loss": 0.1954, "num_input_tokens_seen": 37081248, "step": 17180 }, { "epoch": 3.1537896861809505, "grad_norm": 0.10386238247156143, "learning_rate": 9.898995147028786e-06, "loss": 0.1825, "num_input_tokens_seen": 37091360, "step": 17185 }, { "epoch": 3.154707285740503, "grad_norm": 0.11764340847730637, "learning_rate": 9.898834945156497e-06, "loss": 0.26, "num_input_tokens_seen": 37102432, "step": 17190 }, { "epoch": 3.155624885300055, "grad_norm": 22.319778442382812, "learning_rate": 9.898674617636684e-06, "loss": 0.2445, "num_input_tokens_seen": 37112736, "step": 17195 }, { "epoch": 3.156542484859607, "grad_norm": 0.5067368745803833, "learning_rate": 9.898514164473456e-06, "loss": 0.1595, "num_input_tokens_seen": 37123872, "step": 17200 }, { "epoch": 3.1574600844191596, "grad_norm": 19.09461784362793, "learning_rate": 9.898353585670934e-06, "loss": 0.2291, "num_input_tokens_seen": 37134144, "step": 17205 }, { "epoch": 3.1583776839787117, "grad_norm": 1.3755749464035034, "learning_rate": 9.898192881233233e-06, "loss": 0.0214, "num_input_tokens_seen": 37144800, "step": 17210 }, { "epoch": 3.1592952835382637, "grad_norm": 6.274546146392822, "learning_rate": 9.898032051164478e-06, "loss": 0.1556, "num_input_tokens_seen": 37155584, "step": 17215 }, { "epoch": 3.1602128830978162, "grad_norm": 0.09085913002490997, "learning_rate": 9.897871095468792e-06, "loss": 0.0079, "num_input_tokens_seen": 37164800, "step": 17220 }, { "epoch": 3.1611304826573683, "grad_norm": 36.04468536376953, "learning_rate": 9.897710014150301e-06, "loss": 0.0704, "num_input_tokens_seen": 37174464, "step": 17225 }, { "epoch": 3.1620480822169204, "grad_norm": 48.60871887207031, "learning_rate": 9.897548807213142e-06, "loss": 0.2364, "num_input_tokens_seen": 37185184, "step": 17230 }, { "epoch": 3.162965681776473, "grad_norm": 0.8706151247024536, "learning_rate": 9.897387474661443e-06, "loss": 0.1312, "num_input_tokens_seen": 37194720, "step": 17235 }, { "epoch": 3.163883281336025, "grad_norm": 0.308037132024765, "learning_rate": 9.897226016499348e-06, "loss": 0.022, "num_input_tokens_seen": 37205792, "step": 17240 }, { "epoch": 3.164800880895577, "grad_norm": 0.16994206607341766, "learning_rate": 9.897064432730996e-06, "loss": 0.0918, "num_input_tokens_seen": 37217344, "step": 17245 }, { "epoch": 3.1657184804551295, "grad_norm": 0.09708507359027863, "learning_rate": 9.89690272336053e-06, "loss": 0.265, "num_input_tokens_seen": 37227936, "step": 17250 }, { "epoch": 3.1666360800146816, "grad_norm": 0.25817251205444336, "learning_rate": 9.8967408883921e-06, "loss": 0.14, "num_input_tokens_seen": 37239040, "step": 17255 }, { "epoch": 3.1675536795742336, "grad_norm": 0.02768843062222004, "learning_rate": 9.896578927829854e-06, "loss": 0.0061, "num_input_tokens_seen": 37249856, "step": 17260 }, { "epoch": 3.168471279133786, "grad_norm": 0.22608159482479095, "learning_rate": 9.896416841677947e-06, "loss": 0.2624, "num_input_tokens_seen": 37260960, "step": 17265 }, { "epoch": 3.169388878693338, "grad_norm": 0.055000271648168564, "learning_rate": 9.896254629940539e-06, "loss": 0.0816, "num_input_tokens_seen": 37270784, "step": 17270 }, { "epoch": 3.1703064782528902, "grad_norm": 0.231241375207901, "learning_rate": 9.896092292621787e-06, "loss": 0.1245, "num_input_tokens_seen": 37281088, "step": 17275 }, { "epoch": 3.1712240778124428, "grad_norm": 14.31276798248291, "learning_rate": 9.895929829725856e-06, "loss": 0.3917, "num_input_tokens_seen": 37292576, "step": 17280 }, { "epoch": 3.172141677371995, "grad_norm": 41.12267303466797, "learning_rate": 9.895767241256912e-06, "loss": 0.4707, "num_input_tokens_seen": 37303488, "step": 17285 }, { "epoch": 3.173059276931547, "grad_norm": 8.738943099975586, "learning_rate": 9.895604527219127e-06, "loss": 0.1876, "num_input_tokens_seen": 37314528, "step": 17290 }, { "epoch": 3.1739768764910994, "grad_norm": 0.14039075374603271, "learning_rate": 9.895441687616673e-06, "loss": 0.0126, "num_input_tokens_seen": 37323616, "step": 17295 }, { "epoch": 3.1748944760506514, "grad_norm": 14.999330520629883, "learning_rate": 9.895278722453728e-06, "loss": 0.2177, "num_input_tokens_seen": 37334400, "step": 17300 }, { "epoch": 3.1758120756102035, "grad_norm": 29.505268096923828, "learning_rate": 9.895115631734469e-06, "loss": 0.1951, "num_input_tokens_seen": 37344000, "step": 17305 }, { "epoch": 3.176729675169756, "grad_norm": 15.464215278625488, "learning_rate": 9.894952415463082e-06, "loss": 0.3213, "num_input_tokens_seen": 37354080, "step": 17310 }, { "epoch": 3.177647274729308, "grad_norm": 0.06190560758113861, "learning_rate": 9.894789073643752e-06, "loss": 0.0826, "num_input_tokens_seen": 37364320, "step": 17315 }, { "epoch": 3.17856487428886, "grad_norm": 11.579899787902832, "learning_rate": 9.894625606280668e-06, "loss": 0.2705, "num_input_tokens_seen": 37375456, "step": 17320 }, { "epoch": 3.1794824738484126, "grad_norm": 0.06462106108665466, "learning_rate": 9.894462013378024e-06, "loss": 0.0084, "num_input_tokens_seen": 37386272, "step": 17325 }, { "epoch": 3.1804000734079647, "grad_norm": 2.355081558227539, "learning_rate": 9.894298294940015e-06, "loss": 0.1839, "num_input_tokens_seen": 37396768, "step": 17330 }, { "epoch": 3.1813176729675168, "grad_norm": 0.07507625967264175, "learning_rate": 9.894134450970838e-06, "loss": 0.2793, "num_input_tokens_seen": 37406624, "step": 17335 }, { "epoch": 3.1822352725270693, "grad_norm": 0.621013879776001, "learning_rate": 9.8939704814747e-06, "loss": 0.1522, "num_input_tokens_seen": 37418336, "step": 17340 }, { "epoch": 3.1831528720866213, "grad_norm": 15.628188133239746, "learning_rate": 9.893806386455804e-06, "loss": 0.0441, "num_input_tokens_seen": 37427648, "step": 17345 }, { "epoch": 3.1840704716461734, "grad_norm": 177.68856811523438, "learning_rate": 9.893642165918358e-06, "loss": 0.3022, "num_input_tokens_seen": 37438528, "step": 17350 }, { "epoch": 3.184988071205726, "grad_norm": 6.069974422454834, "learning_rate": 9.893477819866574e-06, "loss": 0.224, "num_input_tokens_seen": 37449536, "step": 17355 }, { "epoch": 3.185905670765278, "grad_norm": 0.3150264024734497, "learning_rate": 9.893313348304669e-06, "loss": 0.0872, "num_input_tokens_seen": 37460832, "step": 17360 }, { "epoch": 3.18682327032483, "grad_norm": 0.022823285311460495, "learning_rate": 9.893148751236861e-06, "loss": 0.1557, "num_input_tokens_seen": 37471936, "step": 17365 }, { "epoch": 3.1877408698843825, "grad_norm": 0.696909487247467, "learning_rate": 9.892984028667372e-06, "loss": 0.1358, "num_input_tokens_seen": 37482656, "step": 17370 }, { "epoch": 3.1886584694439346, "grad_norm": 6.01590633392334, "learning_rate": 9.892819180600426e-06, "loss": 0.2384, "num_input_tokens_seen": 37493696, "step": 17375 }, { "epoch": 3.189576069003487, "grad_norm": 15.011153221130371, "learning_rate": 9.89265420704025e-06, "loss": 0.3482, "num_input_tokens_seen": 37504736, "step": 17380 }, { "epoch": 3.190493668563039, "grad_norm": 2.876356363296509, "learning_rate": 9.892489107991077e-06, "loss": 0.1865, "num_input_tokens_seen": 37516512, "step": 17385 }, { "epoch": 3.1914112681225912, "grad_norm": 0.748880922794342, "learning_rate": 9.89232388345714e-06, "loss": 0.1305, "num_input_tokens_seen": 37526912, "step": 17390 }, { "epoch": 3.1923288676821437, "grad_norm": 0.7034685015678406, "learning_rate": 9.89215853344268e-06, "loss": 0.01, "num_input_tokens_seen": 37537120, "step": 17395 }, { "epoch": 3.193246467241696, "grad_norm": 7.068561553955078, "learning_rate": 9.891993057951935e-06, "loss": 0.1024, "num_input_tokens_seen": 37547744, "step": 17400 }, { "epoch": 3.194164066801248, "grad_norm": 8.267380714416504, "learning_rate": 9.891827456989149e-06, "loss": 0.2674, "num_input_tokens_seen": 37558752, "step": 17405 }, { "epoch": 3.1950816663608004, "grad_norm": 36.58951187133789, "learning_rate": 9.89166173055857e-06, "loss": 0.2139, "num_input_tokens_seen": 37569184, "step": 17410 }, { "epoch": 3.1959992659203524, "grad_norm": 28.832557678222656, "learning_rate": 9.89149587866445e-06, "loss": 0.236, "num_input_tokens_seen": 37580896, "step": 17415 }, { "epoch": 3.1969168654799045, "grad_norm": 17.60767936706543, "learning_rate": 9.891329901311043e-06, "loss": 0.2748, "num_input_tokens_seen": 37592384, "step": 17420 }, { "epoch": 3.197834465039457, "grad_norm": 0.1706201434135437, "learning_rate": 9.891163798502603e-06, "loss": 0.0154, "num_input_tokens_seen": 37602112, "step": 17425 }, { "epoch": 3.198752064599009, "grad_norm": 18.853431701660156, "learning_rate": 9.890997570243392e-06, "loss": 0.2284, "num_input_tokens_seen": 37612800, "step": 17430 }, { "epoch": 3.199669664158561, "grad_norm": 6.959889888763428, "learning_rate": 9.890831216537674e-06, "loss": 0.0881, "num_input_tokens_seen": 37623808, "step": 17435 }, { "epoch": 3.2005872637181136, "grad_norm": 0.06225425377488136, "learning_rate": 9.890664737389718e-06, "loss": 0.0168, "num_input_tokens_seen": 37634784, "step": 17440 }, { "epoch": 3.2015048632776657, "grad_norm": 37.099510192871094, "learning_rate": 9.890498132803788e-06, "loss": 0.2566, "num_input_tokens_seen": 37645376, "step": 17445 }, { "epoch": 3.2024224628372178, "grad_norm": 0.11779546737670898, "learning_rate": 9.89033140278416e-06, "loss": 0.174, "num_input_tokens_seen": 37656256, "step": 17450 }, { "epoch": 3.2033400623967703, "grad_norm": 0.6985101699829102, "learning_rate": 9.890164547335115e-06, "loss": 0.1347, "num_input_tokens_seen": 37667680, "step": 17455 }, { "epoch": 3.2042576619563223, "grad_norm": 13.488173484802246, "learning_rate": 9.889997566460926e-06, "loss": 0.3938, "num_input_tokens_seen": 37679072, "step": 17460 }, { "epoch": 3.2051752615158744, "grad_norm": 0.6705842614173889, "learning_rate": 9.889830460165877e-06, "loss": 0.1861, "num_input_tokens_seen": 37690976, "step": 17465 }, { "epoch": 3.206092861075427, "grad_norm": 0.3081401288509369, "learning_rate": 9.889663228454257e-06, "loss": 0.1967, "num_input_tokens_seen": 37701024, "step": 17470 }, { "epoch": 3.207010460634979, "grad_norm": 0.18150290846824646, "learning_rate": 9.889495871330352e-06, "loss": 0.0669, "num_input_tokens_seen": 37712000, "step": 17475 }, { "epoch": 3.207928060194531, "grad_norm": 32.141090393066406, "learning_rate": 9.889328388798459e-06, "loss": 0.0409, "num_input_tokens_seen": 37721888, "step": 17480 }, { "epoch": 3.2088456597540835, "grad_norm": 41.097782135009766, "learning_rate": 9.889160780862868e-06, "loss": 0.3005, "num_input_tokens_seen": 37732960, "step": 17485 }, { "epoch": 3.2097632593136356, "grad_norm": 25.842632293701172, "learning_rate": 9.88899304752788e-06, "loss": 0.1373, "num_input_tokens_seen": 37744128, "step": 17490 }, { "epoch": 3.2106808588731877, "grad_norm": 40.54561233520508, "learning_rate": 9.888825188797799e-06, "loss": 0.3822, "num_input_tokens_seen": 37753440, "step": 17495 }, { "epoch": 3.21159845843274, "grad_norm": 0.21383696794509888, "learning_rate": 9.888657204676928e-06, "loss": 0.39, "num_input_tokens_seen": 37764288, "step": 17500 }, { "epoch": 3.2125160579922922, "grad_norm": 0.15791626274585724, "learning_rate": 9.888489095169578e-06, "loss": 0.4378, "num_input_tokens_seen": 37773696, "step": 17505 }, { "epoch": 3.2134336575518443, "grad_norm": 0.8326143026351929, "learning_rate": 9.888320860280058e-06, "loss": 0.0296, "num_input_tokens_seen": 37784128, "step": 17510 }, { "epoch": 3.214351257111397, "grad_norm": 9.512679100036621, "learning_rate": 9.888152500012683e-06, "loss": 0.2194, "num_input_tokens_seen": 37794464, "step": 17515 }, { "epoch": 3.215268856670949, "grad_norm": 2.0405325889587402, "learning_rate": 9.887984014371774e-06, "loss": 0.1349, "num_input_tokens_seen": 37805472, "step": 17520 }, { "epoch": 3.216186456230501, "grad_norm": 1.1784318685531616, "learning_rate": 9.88781540336165e-06, "loss": 0.1551, "num_input_tokens_seen": 37816288, "step": 17525 }, { "epoch": 3.2171040557900534, "grad_norm": 21.502321243286133, "learning_rate": 9.887646666986637e-06, "loss": 0.1315, "num_input_tokens_seen": 37826240, "step": 17530 }, { "epoch": 3.2180216553496055, "grad_norm": 0.08338366448879242, "learning_rate": 9.887477805251062e-06, "loss": 0.0172, "num_input_tokens_seen": 37836992, "step": 17535 }, { "epoch": 3.2189392549091576, "grad_norm": 0.11104461550712585, "learning_rate": 9.887308818159256e-06, "loss": 0.2474, "num_input_tokens_seen": 37847520, "step": 17540 }, { "epoch": 3.21985685446871, "grad_norm": 0.25929680466651917, "learning_rate": 9.887139705715553e-06, "loss": 0.1145, "num_input_tokens_seen": 37858304, "step": 17545 }, { "epoch": 3.220774454028262, "grad_norm": 33.41831970214844, "learning_rate": 9.88697046792429e-06, "loss": 0.4776, "num_input_tokens_seen": 37869568, "step": 17550 }, { "epoch": 3.221692053587814, "grad_norm": 0.2582468092441559, "learning_rate": 9.886801104789811e-06, "loss": 0.0835, "num_input_tokens_seen": 37880768, "step": 17555 }, { "epoch": 3.2226096531473667, "grad_norm": 10.136407852172852, "learning_rate": 9.886631616316457e-06, "loss": 0.1796, "num_input_tokens_seen": 37891072, "step": 17560 }, { "epoch": 3.2235272527069188, "grad_norm": 6.6027607917785645, "learning_rate": 9.886462002508575e-06, "loss": 0.4816, "num_input_tokens_seen": 37901760, "step": 17565 }, { "epoch": 3.224444852266471, "grad_norm": 0.2975142002105713, "learning_rate": 9.886292263370516e-06, "loss": 0.2745, "num_input_tokens_seen": 37913376, "step": 17570 }, { "epoch": 3.2253624518260233, "grad_norm": 0.3725794851779938, "learning_rate": 9.886122398906633e-06, "loss": 0.1567, "num_input_tokens_seen": 37924896, "step": 17575 }, { "epoch": 3.2262800513855754, "grad_norm": 0.1424885243177414, "learning_rate": 9.885952409121284e-06, "loss": 0.0833, "num_input_tokens_seen": 37934752, "step": 17580 }, { "epoch": 3.2271976509451274, "grad_norm": 1.2854924201965332, "learning_rate": 9.88578229401883e-06, "loss": 0.1797, "num_input_tokens_seen": 37944512, "step": 17585 }, { "epoch": 3.22811525050468, "grad_norm": 23.41640281677246, "learning_rate": 9.885612053603628e-06, "loss": 0.0955, "num_input_tokens_seen": 37955648, "step": 17590 }, { "epoch": 3.229032850064232, "grad_norm": 0.5843503475189209, "learning_rate": 9.885441687880052e-06, "loss": 0.0549, "num_input_tokens_seen": 37966720, "step": 17595 }, { "epoch": 3.229950449623784, "grad_norm": 5.555686950683594, "learning_rate": 9.885271196852469e-06, "loss": 0.2165, "num_input_tokens_seen": 37976352, "step": 17600 }, { "epoch": 3.2308680491833366, "grad_norm": 33.1038818359375, "learning_rate": 9.885100580525248e-06, "loss": 0.4643, "num_input_tokens_seen": 37986080, "step": 17605 }, { "epoch": 3.2317856487428887, "grad_norm": 2.2367537021636963, "learning_rate": 9.884929838902771e-06, "loss": 0.171, "num_input_tokens_seen": 37996800, "step": 17610 }, { "epoch": 3.2327032483024407, "grad_norm": 0.22029873728752136, "learning_rate": 9.884758971989415e-06, "loss": 0.0618, "num_input_tokens_seen": 38006880, "step": 17615 }, { "epoch": 3.233620847861993, "grad_norm": 36.311790466308594, "learning_rate": 9.88458797978956e-06, "loss": 0.1478, "num_input_tokens_seen": 38017280, "step": 17620 }, { "epoch": 3.2345384474215453, "grad_norm": 24.19673728942871, "learning_rate": 9.884416862307596e-06, "loss": 0.1994, "num_input_tokens_seen": 38027808, "step": 17625 }, { "epoch": 3.2354560469810973, "grad_norm": 21.152141571044922, "learning_rate": 9.884245619547908e-06, "loss": 0.386, "num_input_tokens_seen": 38038240, "step": 17630 }, { "epoch": 3.23637364654065, "grad_norm": 0.5585193037986755, "learning_rate": 9.88407425151489e-06, "loss": 0.0265, "num_input_tokens_seen": 38049408, "step": 17635 }, { "epoch": 3.237291246100202, "grad_norm": 28.011869430541992, "learning_rate": 9.883902758212938e-06, "loss": 0.1466, "num_input_tokens_seen": 38060000, "step": 17640 }, { "epoch": 3.238208845659754, "grad_norm": 31.843687057495117, "learning_rate": 9.883731139646449e-06, "loss": 0.1519, "num_input_tokens_seen": 38071072, "step": 17645 }, { "epoch": 3.2391264452193065, "grad_norm": 0.13766244053840637, "learning_rate": 9.883559395819824e-06, "loss": 0.08, "num_input_tokens_seen": 38082304, "step": 17650 }, { "epoch": 3.2400440447788585, "grad_norm": 0.22299596667289734, "learning_rate": 9.883387526737471e-06, "loss": 0.0069, "num_input_tokens_seen": 38092800, "step": 17655 }, { "epoch": 3.2409616443384106, "grad_norm": 0.6245129108428955, "learning_rate": 9.883215532403796e-06, "loss": 0.2396, "num_input_tokens_seen": 38103776, "step": 17660 }, { "epoch": 3.241879243897963, "grad_norm": 0.08688147366046906, "learning_rate": 9.88304341282321e-06, "loss": 0.1927, "num_input_tokens_seen": 38113536, "step": 17665 }, { "epoch": 3.242796843457515, "grad_norm": 8.720917701721191, "learning_rate": 9.88287116800013e-06, "loss": 0.1056, "num_input_tokens_seen": 38124384, "step": 17670 }, { "epoch": 3.2437144430170672, "grad_norm": 0.3830893635749817, "learning_rate": 9.882698797938972e-06, "loss": 0.292, "num_input_tokens_seen": 38134944, "step": 17675 }, { "epoch": 3.2446320425766197, "grad_norm": 72.29645538330078, "learning_rate": 9.882526302644157e-06, "loss": 0.0564, "num_input_tokens_seen": 38146304, "step": 17680 }, { "epoch": 3.245549642136172, "grad_norm": 12.71413803100586, "learning_rate": 9.88235368212011e-06, "loss": 0.5599, "num_input_tokens_seen": 38157472, "step": 17685 }, { "epoch": 3.246467241695724, "grad_norm": 1.7737030982971191, "learning_rate": 9.882180936371257e-06, "loss": 0.2966, "num_input_tokens_seen": 38168000, "step": 17690 }, { "epoch": 3.2473848412552764, "grad_norm": 4.057696342468262, "learning_rate": 9.882008065402031e-06, "loss": 0.2609, "num_input_tokens_seen": 38179104, "step": 17695 }, { "epoch": 3.2483024408148284, "grad_norm": 1.7338825464248657, "learning_rate": 9.881835069216864e-06, "loss": 0.0385, "num_input_tokens_seen": 38190720, "step": 17700 }, { "epoch": 3.2492200403743805, "grad_norm": 0.7535633444786072, "learning_rate": 9.881661947820194e-06, "loss": 0.1469, "num_input_tokens_seen": 38201472, "step": 17705 }, { "epoch": 3.250137639933933, "grad_norm": 0.41243892908096313, "learning_rate": 9.88148870121646e-06, "loss": 0.0284, "num_input_tokens_seen": 38212320, "step": 17710 }, { "epoch": 3.251055239493485, "grad_norm": 20.109485626220703, "learning_rate": 9.881315329410108e-06, "loss": 0.1205, "num_input_tokens_seen": 38223232, "step": 17715 }, { "epoch": 3.251972839053037, "grad_norm": 0.155559241771698, "learning_rate": 9.88114183240558e-06, "loss": 0.1475, "num_input_tokens_seen": 38234080, "step": 17720 }, { "epoch": 3.2528904386125896, "grad_norm": 0.6561629772186279, "learning_rate": 9.880968210207333e-06, "loss": 0.346, "num_input_tokens_seen": 38245280, "step": 17725 }, { "epoch": 3.2538080381721417, "grad_norm": 40.62876510620117, "learning_rate": 9.880794462819814e-06, "loss": 0.2225, "num_input_tokens_seen": 38255136, "step": 17730 }, { "epoch": 3.2547256377316938, "grad_norm": 9.756327629089355, "learning_rate": 9.880620590247482e-06, "loss": 0.2211, "num_input_tokens_seen": 38267520, "step": 17735 }, { "epoch": 3.2556432372912463, "grad_norm": 0.6659395694732666, "learning_rate": 9.880446592494795e-06, "loss": 0.2882, "num_input_tokens_seen": 38278144, "step": 17740 }, { "epoch": 3.2565608368507983, "grad_norm": 48.47999572753906, "learning_rate": 9.880272469566219e-06, "loss": 0.161, "num_input_tokens_seen": 38289920, "step": 17745 }, { "epoch": 3.2574784364103504, "grad_norm": 15.779635429382324, "learning_rate": 9.880098221466217e-06, "loss": 0.3796, "num_input_tokens_seen": 38301600, "step": 17750 }, { "epoch": 3.258396035969903, "grad_norm": 20.842653274536133, "learning_rate": 9.879923848199257e-06, "loss": 0.1352, "num_input_tokens_seen": 38312512, "step": 17755 }, { "epoch": 3.259313635529455, "grad_norm": 0.19708985090255737, "learning_rate": 9.879749349769816e-06, "loss": 0.3007, "num_input_tokens_seen": 38322624, "step": 17760 }, { "epoch": 3.260231235089007, "grad_norm": 0.15072782337665558, "learning_rate": 9.879574726182369e-06, "loss": 0.201, "num_input_tokens_seen": 38334144, "step": 17765 }, { "epoch": 3.2611488346485595, "grad_norm": 25.4427547454834, "learning_rate": 9.879399977441389e-06, "loss": 0.2322, "num_input_tokens_seen": 38344640, "step": 17770 }, { "epoch": 3.2620664342081116, "grad_norm": 8.79291820526123, "learning_rate": 9.879225103551364e-06, "loss": 0.2758, "num_input_tokens_seen": 38355040, "step": 17775 }, { "epoch": 3.2629840337676637, "grad_norm": 0.5632711052894592, "learning_rate": 9.879050104516776e-06, "loss": 0.1169, "num_input_tokens_seen": 38366560, "step": 17780 }, { "epoch": 3.263901633327216, "grad_norm": 8.120366096496582, "learning_rate": 9.878874980342116e-06, "loss": 0.2608, "num_input_tokens_seen": 38376608, "step": 17785 }, { "epoch": 3.2648192328867682, "grad_norm": 0.25131088495254517, "learning_rate": 9.878699731031873e-06, "loss": 0.0118, "num_input_tokens_seen": 38386432, "step": 17790 }, { "epoch": 3.2657368324463203, "grad_norm": 0.2385140061378479, "learning_rate": 9.878524356590545e-06, "loss": 0.3399, "num_input_tokens_seen": 38396288, "step": 17795 }, { "epoch": 3.266654432005873, "grad_norm": 0.4802117645740509, "learning_rate": 9.878348857022626e-06, "loss": 0.3661, "num_input_tokens_seen": 38407008, "step": 17800 }, { "epoch": 3.267572031565425, "grad_norm": 0.34672603011131287, "learning_rate": 9.87817323233262e-06, "loss": 0.0137, "num_input_tokens_seen": 38416704, "step": 17805 }, { "epoch": 3.268489631124977, "grad_norm": 0.7070908546447754, "learning_rate": 9.877997482525032e-06, "loss": 0.183, "num_input_tokens_seen": 38426912, "step": 17810 }, { "epoch": 3.2694072306845294, "grad_norm": 19.170072555541992, "learning_rate": 9.87782160760437e-06, "loss": 0.2029, "num_input_tokens_seen": 38437312, "step": 17815 }, { "epoch": 3.2703248302440815, "grad_norm": 5.173304557800293, "learning_rate": 9.877645607575142e-06, "loss": 0.0935, "num_input_tokens_seen": 38448800, "step": 17820 }, { "epoch": 3.2712424298036336, "grad_norm": 0.2923598885536194, "learning_rate": 9.877469482441864e-06, "loss": 0.1637, "num_input_tokens_seen": 38460416, "step": 17825 }, { "epoch": 3.272160029363186, "grad_norm": 0.04054948687553406, "learning_rate": 9.877293232209053e-06, "loss": 0.0093, "num_input_tokens_seen": 38471008, "step": 17830 }, { "epoch": 3.273077628922738, "grad_norm": 1.3457188606262207, "learning_rate": 9.877116856881231e-06, "loss": 0.17, "num_input_tokens_seen": 38482368, "step": 17835 }, { "epoch": 3.27399522848229, "grad_norm": 14.929402351379395, "learning_rate": 9.87694035646292e-06, "loss": 0.5978, "num_input_tokens_seen": 38493120, "step": 17840 }, { "epoch": 3.2749128280418427, "grad_norm": 1.3816003799438477, "learning_rate": 9.876763730958644e-06, "loss": 0.1824, "num_input_tokens_seen": 38503360, "step": 17845 }, { "epoch": 3.2758304276013948, "grad_norm": 0.3160797953605652, "learning_rate": 9.876586980372941e-06, "loss": 0.1682, "num_input_tokens_seen": 38514560, "step": 17850 }, { "epoch": 3.276748027160947, "grad_norm": 9.22807502746582, "learning_rate": 9.876410104710338e-06, "loss": 0.2056, "num_input_tokens_seen": 38525472, "step": 17855 }, { "epoch": 3.2776656267204993, "grad_norm": 9.874238967895508, "learning_rate": 9.876233103975375e-06, "loss": 0.2761, "num_input_tokens_seen": 38534528, "step": 17860 }, { "epoch": 3.2785832262800514, "grad_norm": 0.1092105358839035, "learning_rate": 9.87605597817259e-06, "loss": 0.2084, "num_input_tokens_seen": 38543936, "step": 17865 }, { "epoch": 3.2795008258396035, "grad_norm": 0.11609043926000595, "learning_rate": 9.875878727306525e-06, "loss": 0.031, "num_input_tokens_seen": 38555104, "step": 17870 }, { "epoch": 3.280418425399156, "grad_norm": 0.5436837673187256, "learning_rate": 9.875701351381729e-06, "loss": 0.2639, "num_input_tokens_seen": 38565760, "step": 17875 }, { "epoch": 3.281336024958708, "grad_norm": 0.47942042350769043, "learning_rate": 9.875523850402748e-06, "loss": 0.2458, "num_input_tokens_seen": 38576544, "step": 17880 }, { "epoch": 3.28225362451826, "grad_norm": 0.44458070397377014, "learning_rate": 9.875346224374138e-06, "loss": 0.0153, "num_input_tokens_seen": 38587648, "step": 17885 }, { "epoch": 3.2831712240778126, "grad_norm": 0.16525937616825104, "learning_rate": 9.875168473300453e-06, "loss": 0.3154, "num_input_tokens_seen": 38597120, "step": 17890 }, { "epoch": 3.2840888236373647, "grad_norm": 0.7689719796180725, "learning_rate": 9.874990597186253e-06, "loss": 0.2312, "num_input_tokens_seen": 38608864, "step": 17895 }, { "epoch": 3.2850064231969167, "grad_norm": 4.578298091888428, "learning_rate": 9.874812596036099e-06, "loss": 0.3081, "num_input_tokens_seen": 38619776, "step": 17900 }, { "epoch": 3.285924022756469, "grad_norm": 22.64470672607422, "learning_rate": 9.874634469854558e-06, "loss": 0.0837, "num_input_tokens_seen": 38630784, "step": 17905 }, { "epoch": 3.2868416223160213, "grad_norm": 7.344946384429932, "learning_rate": 9.874456218646198e-06, "loss": 0.146, "num_input_tokens_seen": 38641952, "step": 17910 }, { "epoch": 3.2877592218755733, "grad_norm": 4.410371780395508, "learning_rate": 9.874277842415591e-06, "loss": 0.0121, "num_input_tokens_seen": 38652640, "step": 17915 }, { "epoch": 3.288676821435126, "grad_norm": 0.20178289711475372, "learning_rate": 9.87409934116731e-06, "loss": 0.1217, "num_input_tokens_seen": 38663552, "step": 17920 }, { "epoch": 3.289594420994678, "grad_norm": 0.10255066305398941, "learning_rate": 9.873920714905936e-06, "loss": 0.0545, "num_input_tokens_seen": 38673184, "step": 17925 }, { "epoch": 3.29051202055423, "grad_norm": 3.788768768310547, "learning_rate": 9.87374196363605e-06, "loss": 0.1934, "num_input_tokens_seen": 38682720, "step": 17930 }, { "epoch": 3.2914296201137825, "grad_norm": 0.26471513509750366, "learning_rate": 9.873563087362236e-06, "loss": 0.0055, "num_input_tokens_seen": 38693664, "step": 17935 }, { "epoch": 3.2923472196733345, "grad_norm": 0.05520015209913254, "learning_rate": 9.873384086089084e-06, "loss": 0.1297, "num_input_tokens_seen": 38704736, "step": 17940 }, { "epoch": 3.2932648192328866, "grad_norm": 0.169589564204216, "learning_rate": 9.87320495982118e-06, "loss": 0.246, "num_input_tokens_seen": 38714784, "step": 17945 }, { "epoch": 3.294182418792439, "grad_norm": 14.523127555847168, "learning_rate": 9.873025708563123e-06, "loss": 0.6884, "num_input_tokens_seen": 38727136, "step": 17950 }, { "epoch": 3.295100018351991, "grad_norm": 20.13621711730957, "learning_rate": 9.872846332319508e-06, "loss": 0.2036, "num_input_tokens_seen": 38739136, "step": 17955 }, { "epoch": 3.2960176179115432, "grad_norm": 0.9233500361442566, "learning_rate": 9.87266683109494e-06, "loss": 0.0064, "num_input_tokens_seen": 38751456, "step": 17960 }, { "epoch": 3.2969352174710957, "grad_norm": 2.6343753337860107, "learning_rate": 9.872487204894018e-06, "loss": 0.1355, "num_input_tokens_seen": 38762080, "step": 17965 }, { "epoch": 3.297852817030648, "grad_norm": 16.96883773803711, "learning_rate": 9.872307453721348e-06, "loss": 0.1506, "num_input_tokens_seen": 38772704, "step": 17970 }, { "epoch": 3.2987704165902, "grad_norm": 0.2520734965801239, "learning_rate": 9.872127577581547e-06, "loss": 0.1037, "num_input_tokens_seen": 38784736, "step": 17975 }, { "epoch": 3.2996880161497524, "grad_norm": 154.9898223876953, "learning_rate": 9.871947576479223e-06, "loss": 0.289, "num_input_tokens_seen": 38794976, "step": 17980 }, { "epoch": 3.3006056157093044, "grad_norm": 0.16978611052036285, "learning_rate": 9.871767450418995e-06, "loss": 0.3521, "num_input_tokens_seen": 38805440, "step": 17985 }, { "epoch": 3.3015232152688565, "grad_norm": 0.227323517203331, "learning_rate": 9.871587199405483e-06, "loss": 0.0298, "num_input_tokens_seen": 38816672, "step": 17990 }, { "epoch": 3.302440814828409, "grad_norm": 0.92540442943573, "learning_rate": 9.871406823443308e-06, "loss": 0.4926, "num_input_tokens_seen": 38826080, "step": 17995 }, { "epoch": 3.303358414387961, "grad_norm": 1.3629403114318848, "learning_rate": 9.8712263225371e-06, "loss": 0.2015, "num_input_tokens_seen": 38835904, "step": 18000 }, { "epoch": 3.304276013947513, "grad_norm": 0.054602064192295074, "learning_rate": 9.871045696691484e-06, "loss": 0.0065, "num_input_tokens_seen": 38846176, "step": 18005 }, { "epoch": 3.3051936135070656, "grad_norm": 11.241795539855957, "learning_rate": 9.870864945911097e-06, "loss": 0.0568, "num_input_tokens_seen": 38856512, "step": 18010 }, { "epoch": 3.3061112130666177, "grad_norm": 8.615677833557129, "learning_rate": 9.870684070200574e-06, "loss": 0.1023, "num_input_tokens_seen": 38867616, "step": 18015 }, { "epoch": 3.3070288126261698, "grad_norm": 17.3076229095459, "learning_rate": 9.870503069564552e-06, "loss": 0.7736, "num_input_tokens_seen": 38878720, "step": 18020 }, { "epoch": 3.3079464121857223, "grad_norm": 12.619658470153809, "learning_rate": 9.870321944007674e-06, "loss": 0.3061, "num_input_tokens_seen": 38889184, "step": 18025 }, { "epoch": 3.3088640117452743, "grad_norm": 2.858673572540283, "learning_rate": 9.870140693534589e-06, "loss": 0.1845, "num_input_tokens_seen": 38899616, "step": 18030 }, { "epoch": 3.3097816113048264, "grad_norm": 0.21959397196769714, "learning_rate": 9.869959318149942e-06, "loss": 0.0957, "num_input_tokens_seen": 38909312, "step": 18035 }, { "epoch": 3.310699210864379, "grad_norm": 8.333280563354492, "learning_rate": 9.869777817858385e-06, "loss": 0.2878, "num_input_tokens_seen": 38918816, "step": 18040 }, { "epoch": 3.311616810423931, "grad_norm": 12.111044883728027, "learning_rate": 9.869596192664576e-06, "loss": 0.3317, "num_input_tokens_seen": 38929728, "step": 18045 }, { "epoch": 3.312534409983483, "grad_norm": 56.70965576171875, "learning_rate": 9.869414442573172e-06, "loss": 0.4027, "num_input_tokens_seen": 38941696, "step": 18050 }, { "epoch": 3.3134520095430355, "grad_norm": 7.3558502197265625, "learning_rate": 9.869232567588836e-06, "loss": 0.2786, "num_input_tokens_seen": 38952160, "step": 18055 }, { "epoch": 3.3143696091025876, "grad_norm": 13.911046028137207, "learning_rate": 9.869050567716228e-06, "loss": 0.1692, "num_input_tokens_seen": 38960992, "step": 18060 }, { "epoch": 3.3152872086621397, "grad_norm": 10.148680686950684, "learning_rate": 9.86886844296002e-06, "loss": 0.3328, "num_input_tokens_seen": 38972928, "step": 18065 }, { "epoch": 3.316204808221692, "grad_norm": 0.8028570413589478, "learning_rate": 9.868686193324885e-06, "loss": 0.0265, "num_input_tokens_seen": 38982688, "step": 18070 }, { "epoch": 3.3171224077812442, "grad_norm": 7.636173248291016, "learning_rate": 9.868503818815492e-06, "loss": 0.1819, "num_input_tokens_seen": 38992864, "step": 18075 }, { "epoch": 3.3180400073407963, "grad_norm": 0.1957729309797287, "learning_rate": 9.868321319436522e-06, "loss": 0.0894, "num_input_tokens_seen": 39004096, "step": 18080 }, { "epoch": 3.318957606900349, "grad_norm": 26.44799041748047, "learning_rate": 9.868138695192658e-06, "loss": 0.5755, "num_input_tokens_seen": 39013888, "step": 18085 }, { "epoch": 3.319875206459901, "grad_norm": 0.35972970724105835, "learning_rate": 9.86795594608858e-06, "loss": 0.0556, "num_input_tokens_seen": 39024032, "step": 18090 }, { "epoch": 3.320792806019453, "grad_norm": 0.44802460074424744, "learning_rate": 9.867773072128974e-06, "loss": 0.2018, "num_input_tokens_seen": 39034912, "step": 18095 }, { "epoch": 3.3217104055790054, "grad_norm": 0.8971834182739258, "learning_rate": 9.867590073318536e-06, "loss": 0.0702, "num_input_tokens_seen": 39045792, "step": 18100 }, { "epoch": 3.3226280051385575, "grad_norm": 16.416067123413086, "learning_rate": 9.867406949661956e-06, "loss": 0.0841, "num_input_tokens_seen": 39056256, "step": 18105 }, { "epoch": 3.3235456046981096, "grad_norm": 29.30605697631836, "learning_rate": 9.86722370116393e-06, "loss": 0.3337, "num_input_tokens_seen": 39067456, "step": 18110 }, { "epoch": 3.324463204257662, "grad_norm": 6.492541790008545, "learning_rate": 9.867040327829162e-06, "loss": 0.2568, "num_input_tokens_seen": 39078496, "step": 18115 }, { "epoch": 3.325380803817214, "grad_norm": 11.249369621276855, "learning_rate": 9.866856829662351e-06, "loss": 0.112, "num_input_tokens_seen": 39088448, "step": 18120 }, { "epoch": 3.326298403376766, "grad_norm": 0.2785623371601105, "learning_rate": 9.866673206668207e-06, "loss": 0.1853, "num_input_tokens_seen": 39098912, "step": 18125 }, { "epoch": 3.3272160029363187, "grad_norm": 1.849753975868225, "learning_rate": 9.866489458851437e-06, "loss": 0.1264, "num_input_tokens_seen": 39107936, "step": 18130 }, { "epoch": 3.3281336024958708, "grad_norm": 7.825905799865723, "learning_rate": 9.866305586216754e-06, "loss": 0.3942, "num_input_tokens_seen": 39119296, "step": 18135 }, { "epoch": 3.329051202055423, "grad_norm": 1.9303019046783447, "learning_rate": 9.866121588768876e-06, "loss": 0.0438, "num_input_tokens_seen": 39130144, "step": 18140 }, { "epoch": 3.3299688016149753, "grad_norm": 1.3164942264556885, "learning_rate": 9.86593746651252e-06, "loss": 0.1928, "num_input_tokens_seen": 39141088, "step": 18145 }, { "epoch": 3.3308864011745274, "grad_norm": 0.35260236263275146, "learning_rate": 9.865753219452409e-06, "loss": 0.0749, "num_input_tokens_seen": 39151296, "step": 18150 }, { "epoch": 3.3318040007340795, "grad_norm": 0.6590719223022461, "learning_rate": 9.865568847593271e-06, "loss": 0.1796, "num_input_tokens_seen": 39161792, "step": 18155 }, { "epoch": 3.332721600293632, "grad_norm": 0.3735930621623993, "learning_rate": 9.865384350939833e-06, "loss": 0.02, "num_input_tokens_seen": 39172384, "step": 18160 }, { "epoch": 3.333639199853184, "grad_norm": 15.200984954833984, "learning_rate": 9.865199729496827e-06, "loss": 0.2627, "num_input_tokens_seen": 39184032, "step": 18165 }, { "epoch": 3.334556799412736, "grad_norm": 10.000663757324219, "learning_rate": 9.865014983268986e-06, "loss": 0.1165, "num_input_tokens_seen": 39194272, "step": 18170 }, { "epoch": 3.3354743989722886, "grad_norm": 0.8510237336158752, "learning_rate": 9.864830112261052e-06, "loss": 0.0141, "num_input_tokens_seen": 39204288, "step": 18175 }, { "epoch": 3.3363919985318407, "grad_norm": 11.1860990524292, "learning_rate": 9.864645116477767e-06, "loss": 0.0123, "num_input_tokens_seen": 39215520, "step": 18180 }, { "epoch": 3.3373095980913927, "grad_norm": 0.037493012845516205, "learning_rate": 9.864459995923872e-06, "loss": 0.1663, "num_input_tokens_seen": 39227072, "step": 18185 }, { "epoch": 3.338227197650945, "grad_norm": 0.06449148803949356, "learning_rate": 9.86427475060412e-06, "loss": 0.0939, "num_input_tokens_seen": 39238016, "step": 18190 }, { "epoch": 3.3391447972104973, "grad_norm": 12.26052474975586, "learning_rate": 9.864089380523256e-06, "loss": 0.1587, "num_input_tokens_seen": 39250080, "step": 18195 }, { "epoch": 3.3400623967700493, "grad_norm": 17.74271583557129, "learning_rate": 9.863903885686041e-06, "loss": 0.4786, "num_input_tokens_seen": 39261728, "step": 18200 }, { "epoch": 3.340979996329602, "grad_norm": 62.527957916259766, "learning_rate": 9.86371826609723e-06, "loss": 0.5888, "num_input_tokens_seen": 39272096, "step": 18205 }, { "epoch": 3.341897595889154, "grad_norm": 0.2041883021593094, "learning_rate": 9.863532521761581e-06, "loss": 0.2842, "num_input_tokens_seen": 39282176, "step": 18210 }, { "epoch": 3.342815195448706, "grad_norm": 6.367313385009766, "learning_rate": 9.86334665268386e-06, "loss": 0.142, "num_input_tokens_seen": 39294272, "step": 18215 }, { "epoch": 3.3437327950082585, "grad_norm": 6.744338512420654, "learning_rate": 9.863160658868838e-06, "loss": 0.1728, "num_input_tokens_seen": 39304032, "step": 18220 }, { "epoch": 3.3446503945678105, "grad_norm": 0.383424311876297, "learning_rate": 9.862974540321281e-06, "loss": 0.0723, "num_input_tokens_seen": 39315392, "step": 18225 }, { "epoch": 3.3455679941273626, "grad_norm": 28.563495635986328, "learning_rate": 9.862788297045964e-06, "loss": 0.2615, "num_input_tokens_seen": 39325792, "step": 18230 }, { "epoch": 3.346485593686915, "grad_norm": 9.8141508102417, "learning_rate": 9.862601929047663e-06, "loss": 0.017, "num_input_tokens_seen": 39337120, "step": 18235 }, { "epoch": 3.347403193246467, "grad_norm": 0.6058799624443054, "learning_rate": 9.86241543633116e-06, "loss": 0.1084, "num_input_tokens_seen": 39347552, "step": 18240 }, { "epoch": 3.3483207928060192, "grad_norm": 26.66366958618164, "learning_rate": 9.862228818901237e-06, "loss": 0.2769, "num_input_tokens_seen": 39358976, "step": 18245 }, { "epoch": 3.3492383923655717, "grad_norm": 14.773557662963867, "learning_rate": 9.86204207676268e-06, "loss": 0.1092, "num_input_tokens_seen": 39369184, "step": 18250 }, { "epoch": 3.350155991925124, "grad_norm": 0.04022824391722679, "learning_rate": 9.86185520992028e-06, "loss": 0.0676, "num_input_tokens_seen": 39379264, "step": 18255 }, { "epoch": 3.351073591484676, "grad_norm": 10.051650047302246, "learning_rate": 9.861668218378828e-06, "loss": 0.1892, "num_input_tokens_seen": 39389536, "step": 18260 }, { "epoch": 3.3519911910442284, "grad_norm": 13.713399887084961, "learning_rate": 9.861481102143122e-06, "loss": 0.3932, "num_input_tokens_seen": 39400448, "step": 18265 }, { "epoch": 3.3529087906037804, "grad_norm": 3.7952942848205566, "learning_rate": 9.86129386121796e-06, "loss": 0.1537, "num_input_tokens_seen": 39410368, "step": 18270 }, { "epoch": 3.3538263901633325, "grad_norm": 7.549315452575684, "learning_rate": 9.861106495608147e-06, "loss": 0.1516, "num_input_tokens_seen": 39422336, "step": 18275 }, { "epoch": 3.354743989722885, "grad_norm": 0.328500896692276, "learning_rate": 9.860919005318484e-06, "loss": 0.0058, "num_input_tokens_seen": 39433664, "step": 18280 }, { "epoch": 3.355661589282437, "grad_norm": 16.86250114440918, "learning_rate": 9.860731390353782e-06, "loss": 0.2967, "num_input_tokens_seen": 39445696, "step": 18285 }, { "epoch": 3.356579188841989, "grad_norm": 0.19343601167201996, "learning_rate": 9.860543650718853e-06, "loss": 0.0931, "num_input_tokens_seen": 39458336, "step": 18290 }, { "epoch": 3.3574967884015416, "grad_norm": 11.393383979797363, "learning_rate": 9.860355786418514e-06, "loss": 0.2846, "num_input_tokens_seen": 39469312, "step": 18295 }, { "epoch": 3.3584143879610937, "grad_norm": 10.093976020812988, "learning_rate": 9.86016779745758e-06, "loss": 0.3007, "num_input_tokens_seen": 39480576, "step": 18300 }, { "epoch": 3.3593319875206458, "grad_norm": 0.12599827349185944, "learning_rate": 9.859979683840877e-06, "loss": 0.1795, "num_input_tokens_seen": 39490240, "step": 18305 }, { "epoch": 3.3602495870801983, "grad_norm": 36.01531219482422, "learning_rate": 9.859791445573226e-06, "loss": 0.2145, "num_input_tokens_seen": 39501728, "step": 18310 }, { "epoch": 3.3611671866397503, "grad_norm": 0.9748455286026001, "learning_rate": 9.859603082659456e-06, "loss": 0.289, "num_input_tokens_seen": 39512224, "step": 18315 }, { "epoch": 3.3620847861993024, "grad_norm": 20.454431533813477, "learning_rate": 9.859414595104399e-06, "loss": 0.1903, "num_input_tokens_seen": 39522016, "step": 18320 }, { "epoch": 3.363002385758855, "grad_norm": 0.05355469509959221, "learning_rate": 9.85922598291289e-06, "loss": 0.2255, "num_input_tokens_seen": 39532864, "step": 18325 }, { "epoch": 3.363919985318407, "grad_norm": 28.01814079284668, "learning_rate": 9.859037246089766e-06, "loss": 0.1535, "num_input_tokens_seen": 39544128, "step": 18330 }, { "epoch": 3.364837584877959, "grad_norm": 0.15316247940063477, "learning_rate": 9.858848384639864e-06, "loss": 0.1605, "num_input_tokens_seen": 39556032, "step": 18335 }, { "epoch": 3.3657551844375115, "grad_norm": 0.05433760583400726, "learning_rate": 9.858659398568035e-06, "loss": 0.1571, "num_input_tokens_seen": 39566784, "step": 18340 }, { "epoch": 3.3666727839970636, "grad_norm": 22.04621124267578, "learning_rate": 9.858470287879123e-06, "loss": 0.0604, "num_input_tokens_seen": 39576192, "step": 18345 }, { "epoch": 3.3675903835566157, "grad_norm": 28.667829513549805, "learning_rate": 9.858281052577976e-06, "loss": 0.3488, "num_input_tokens_seen": 39585728, "step": 18350 }, { "epoch": 3.368507983116168, "grad_norm": 0.147802472114563, "learning_rate": 9.85809169266945e-06, "loss": 0.2414, "num_input_tokens_seen": 39596832, "step": 18355 }, { "epoch": 3.3694255826757202, "grad_norm": 0.07669826596975327, "learning_rate": 9.857902208158402e-06, "loss": 0.0074, "num_input_tokens_seen": 39608096, "step": 18360 }, { "epoch": 3.3703431822352723, "grad_norm": 0.11310958862304688, "learning_rate": 9.857712599049691e-06, "loss": 0.0456, "num_input_tokens_seen": 39618752, "step": 18365 }, { "epoch": 3.371260781794825, "grad_norm": 27.817861557006836, "learning_rate": 9.857522865348182e-06, "loss": 0.2638, "num_input_tokens_seen": 39629600, "step": 18370 }, { "epoch": 3.372178381354377, "grad_norm": 0.839387834072113, "learning_rate": 9.857333007058739e-06, "loss": 0.5827, "num_input_tokens_seen": 39639648, "step": 18375 }, { "epoch": 3.373095980913929, "grad_norm": 7.390506744384766, "learning_rate": 9.857143024186231e-06, "loss": 0.4385, "num_input_tokens_seen": 39650112, "step": 18380 }, { "epoch": 3.3740135804734814, "grad_norm": 8.562817573547363, "learning_rate": 9.856952916735533e-06, "loss": 0.3192, "num_input_tokens_seen": 39659936, "step": 18385 }, { "epoch": 3.3749311800330335, "grad_norm": 0.09830448031425476, "learning_rate": 9.856762684711522e-06, "loss": 0.014, "num_input_tokens_seen": 39669664, "step": 18390 }, { "epoch": 3.3758487795925856, "grad_norm": 13.10937213897705, "learning_rate": 9.856572328119074e-06, "loss": 0.081, "num_input_tokens_seen": 39680736, "step": 18395 }, { "epoch": 3.376766379152138, "grad_norm": 4.883517742156982, "learning_rate": 9.856381846963073e-06, "loss": 0.044, "num_input_tokens_seen": 39691776, "step": 18400 }, { "epoch": 3.37768397871169, "grad_norm": 6.918511867523193, "learning_rate": 9.856191241248405e-06, "loss": 0.2573, "num_input_tokens_seen": 39703424, "step": 18405 }, { "epoch": 3.378601578271242, "grad_norm": 2.1361141204833984, "learning_rate": 9.856000510979958e-06, "loss": 0.1489, "num_input_tokens_seen": 39713856, "step": 18410 }, { "epoch": 3.3795191778307947, "grad_norm": 0.45983752608299255, "learning_rate": 9.855809656162622e-06, "loss": 0.0813, "num_input_tokens_seen": 39725088, "step": 18415 }, { "epoch": 3.3804367773903468, "grad_norm": 0.19583635032176971, "learning_rate": 9.855618676801297e-06, "loss": 0.0096, "num_input_tokens_seen": 39735904, "step": 18420 }, { "epoch": 3.381354376949899, "grad_norm": 0.2380494326353073, "learning_rate": 9.855427572900877e-06, "loss": 0.1469, "num_input_tokens_seen": 39747712, "step": 18425 }, { "epoch": 3.3822719765094513, "grad_norm": 4.896843910217285, "learning_rate": 9.855236344466265e-06, "loss": 0.2977, "num_input_tokens_seen": 39758240, "step": 18430 }, { "epoch": 3.3831895760690034, "grad_norm": 8.258004188537598, "learning_rate": 9.855044991502367e-06, "loss": 0.1075, "num_input_tokens_seen": 39768416, "step": 18435 }, { "epoch": 3.3841071756285555, "grad_norm": 3.4706058502197266, "learning_rate": 9.854853514014088e-06, "loss": 0.3585, "num_input_tokens_seen": 39778944, "step": 18440 }, { "epoch": 3.385024775188108, "grad_norm": 7.957022190093994, "learning_rate": 9.85466191200634e-06, "loss": 0.2309, "num_input_tokens_seen": 39788576, "step": 18445 }, { "epoch": 3.38594237474766, "grad_norm": 0.13187092542648315, "learning_rate": 9.85447018548404e-06, "loss": 0.3007, "num_input_tokens_seen": 39799552, "step": 18450 }, { "epoch": 3.386859974307212, "grad_norm": 17.6829891204834, "learning_rate": 9.854278334452102e-06, "loss": 0.248, "num_input_tokens_seen": 39810464, "step": 18455 }, { "epoch": 3.3877775738667646, "grad_norm": 17.96186065673828, "learning_rate": 9.854086358915449e-06, "loss": 0.1445, "num_input_tokens_seen": 39821184, "step": 18460 }, { "epoch": 3.3886951734263167, "grad_norm": 4.415534019470215, "learning_rate": 9.853894258879004e-06, "loss": 0.1665, "num_input_tokens_seen": 39832768, "step": 18465 }, { "epoch": 3.389612772985869, "grad_norm": 2.0351381301879883, "learning_rate": 9.853702034347695e-06, "loss": 0.0681, "num_input_tokens_seen": 39843776, "step": 18470 }, { "epoch": 3.3905303725454212, "grad_norm": 0.26622474193573, "learning_rate": 9.85350968532645e-06, "loss": 0.1162, "num_input_tokens_seen": 39855040, "step": 18475 }, { "epoch": 3.3914479721049733, "grad_norm": 16.662288665771484, "learning_rate": 9.853317211820203e-06, "loss": 0.1058, "num_input_tokens_seen": 39865760, "step": 18480 }, { "epoch": 3.392365571664526, "grad_norm": 25.56892204284668, "learning_rate": 9.853124613833894e-06, "loss": 0.1553, "num_input_tokens_seen": 39876608, "step": 18485 }, { "epoch": 3.393283171224078, "grad_norm": 0.6673029661178589, "learning_rate": 9.852931891372459e-06, "loss": 0.098, "num_input_tokens_seen": 39887936, "step": 18490 }, { "epoch": 3.39420077078363, "grad_norm": 0.10396602749824524, "learning_rate": 9.852739044440842e-06, "loss": 0.0851, "num_input_tokens_seen": 39898560, "step": 18495 }, { "epoch": 3.3951183703431824, "grad_norm": 0.3642166256904602, "learning_rate": 9.85254607304399e-06, "loss": 0.0827, "num_input_tokens_seen": 39909280, "step": 18500 }, { "epoch": 3.3960359699027345, "grad_norm": 6.189029216766357, "learning_rate": 9.852352977186852e-06, "loss": 0.1988, "num_input_tokens_seen": 39920096, "step": 18505 }, { "epoch": 3.3969535694622865, "grad_norm": 0.07652502506971359, "learning_rate": 9.85215975687438e-06, "loss": 0.3095, "num_input_tokens_seen": 39930208, "step": 18510 }, { "epoch": 3.397871169021839, "grad_norm": 7.57151460647583, "learning_rate": 9.851966412111531e-06, "loss": 0.2601, "num_input_tokens_seen": 39941312, "step": 18515 }, { "epoch": 3.398788768581391, "grad_norm": 0.31395861506462097, "learning_rate": 9.851772942903263e-06, "loss": 0.1143, "num_input_tokens_seen": 39951168, "step": 18520 }, { "epoch": 3.399706368140943, "grad_norm": 14.607720375061035, "learning_rate": 9.85157934925454e-06, "loss": 0.3121, "num_input_tokens_seen": 39960608, "step": 18525 }, { "epoch": 3.4006239677004957, "grad_norm": 0.9401838779449463, "learning_rate": 9.851385631170325e-06, "loss": 0.1111, "num_input_tokens_seen": 39971648, "step": 18530 }, { "epoch": 3.4015415672600477, "grad_norm": 0.9310527443885803, "learning_rate": 9.851191788655587e-06, "loss": 0.0764, "num_input_tokens_seen": 39982912, "step": 18535 }, { "epoch": 3.4024591668196, "grad_norm": 0.38629159331321716, "learning_rate": 9.8509978217153e-06, "loss": 0.1689, "num_input_tokens_seen": 39993536, "step": 18540 }, { "epoch": 3.4033767663791523, "grad_norm": 2.1434948444366455, "learning_rate": 9.850803730354435e-06, "loss": 0.0403, "num_input_tokens_seen": 40005120, "step": 18545 }, { "epoch": 3.4042943659387044, "grad_norm": 0.17389725148677826, "learning_rate": 9.850609514577974e-06, "loss": 0.205, "num_input_tokens_seen": 40015584, "step": 18550 }, { "epoch": 3.4052119654982564, "grad_norm": 0.5018618106842041, "learning_rate": 9.850415174390895e-06, "loss": 0.2752, "num_input_tokens_seen": 40026656, "step": 18555 }, { "epoch": 3.406129565057809, "grad_norm": 36.61141586303711, "learning_rate": 9.850220709798186e-06, "loss": 0.1544, "num_input_tokens_seen": 40037760, "step": 18560 }, { "epoch": 3.407047164617361, "grad_norm": 0.9506096243858337, "learning_rate": 9.850026120804832e-06, "loss": 0.1285, "num_input_tokens_seen": 40049632, "step": 18565 }, { "epoch": 3.407964764176913, "grad_norm": 4.548303604125977, "learning_rate": 9.849831407415824e-06, "loss": 0.1127, "num_input_tokens_seen": 40061728, "step": 18570 }, { "epoch": 3.4088823637364656, "grad_norm": 0.12074219435453415, "learning_rate": 9.849636569636159e-06, "loss": 0.265, "num_input_tokens_seen": 40071296, "step": 18575 }, { "epoch": 3.4097999632960176, "grad_norm": 13.160449981689453, "learning_rate": 9.849441607470832e-06, "loss": 0.3452, "num_input_tokens_seen": 40083520, "step": 18580 }, { "epoch": 3.4107175628555697, "grad_norm": 0.2893124222755432, "learning_rate": 9.849246520924842e-06, "loss": 0.1569, "num_input_tokens_seen": 40094016, "step": 18585 }, { "epoch": 3.411635162415122, "grad_norm": 0.09585806727409363, "learning_rate": 9.849051310003194e-06, "loss": 0.0761, "num_input_tokens_seen": 40104288, "step": 18590 }, { "epoch": 3.4125527619746743, "grad_norm": 0.10547706484794617, "learning_rate": 9.848855974710897e-06, "loss": 0.1027, "num_input_tokens_seen": 40114528, "step": 18595 }, { "epoch": 3.4134703615342263, "grad_norm": 0.5156281590461731, "learning_rate": 9.84866051505296e-06, "loss": 0.211, "num_input_tokens_seen": 40124256, "step": 18600 }, { "epoch": 3.414387961093779, "grad_norm": 5.099662780761719, "learning_rate": 9.848464931034394e-06, "loss": 0.462, "num_input_tokens_seen": 40136256, "step": 18605 }, { "epoch": 3.415305560653331, "grad_norm": 6.7545928955078125, "learning_rate": 9.848269222660219e-06, "loss": 0.1406, "num_input_tokens_seen": 40147424, "step": 18610 }, { "epoch": 3.416223160212883, "grad_norm": 12.824808120727539, "learning_rate": 9.84807338993545e-06, "loss": 0.0227, "num_input_tokens_seen": 40158208, "step": 18615 }, { "epoch": 3.4171407597724355, "grad_norm": 5.363346576690674, "learning_rate": 9.847877432865113e-06, "loss": 0.1623, "num_input_tokens_seen": 40169120, "step": 18620 }, { "epoch": 3.4180583593319875, "grad_norm": 0.18071384727954865, "learning_rate": 9.847681351454235e-06, "loss": 0.0913, "num_input_tokens_seen": 40181088, "step": 18625 }, { "epoch": 3.4189759588915396, "grad_norm": 0.7866600155830383, "learning_rate": 9.847485145707842e-06, "loss": 0.2183, "num_input_tokens_seen": 40192064, "step": 18630 }, { "epoch": 3.419893558451092, "grad_norm": 0.5007334351539612, "learning_rate": 9.847288815630968e-06, "loss": 0.0438, "num_input_tokens_seen": 40203136, "step": 18635 }, { "epoch": 3.420811158010644, "grad_norm": 10.74838924407959, "learning_rate": 9.847092361228648e-06, "loss": 0.1298, "num_input_tokens_seen": 40213568, "step": 18640 }, { "epoch": 3.4217287575701962, "grad_norm": 1.9886786937713623, "learning_rate": 9.846895782505922e-06, "loss": 0.2577, "num_input_tokens_seen": 40224320, "step": 18645 }, { "epoch": 3.4226463571297487, "grad_norm": 0.17628052830696106, "learning_rate": 9.846699079467832e-06, "loss": 0.0559, "num_input_tokens_seen": 40236480, "step": 18650 }, { "epoch": 3.423563956689301, "grad_norm": 7.457215309143066, "learning_rate": 9.846502252119421e-06, "loss": 0.2887, "num_input_tokens_seen": 40247104, "step": 18655 }, { "epoch": 3.424481556248853, "grad_norm": 22.52288055419922, "learning_rate": 9.846305300465739e-06, "loss": 0.187, "num_input_tokens_seen": 40257216, "step": 18660 }, { "epoch": 3.4253991558084054, "grad_norm": 0.12324315309524536, "learning_rate": 9.846108224511836e-06, "loss": 0.0197, "num_input_tokens_seen": 40267392, "step": 18665 }, { "epoch": 3.4263167553679574, "grad_norm": 0.09792417287826538, "learning_rate": 9.845911024262771e-06, "loss": 0.1691, "num_input_tokens_seen": 40278272, "step": 18670 }, { "epoch": 3.4272343549275095, "grad_norm": 8.563191413879395, "learning_rate": 9.845713699723596e-06, "loss": 0.0949, "num_input_tokens_seen": 40288576, "step": 18675 }, { "epoch": 3.428151954487062, "grad_norm": 19.67731475830078, "learning_rate": 9.845516250899376e-06, "loss": 0.2441, "num_input_tokens_seen": 40300000, "step": 18680 }, { "epoch": 3.429069554046614, "grad_norm": 0.5386325120925903, "learning_rate": 9.845318677795173e-06, "loss": 0.1692, "num_input_tokens_seen": 40310752, "step": 18685 }, { "epoch": 3.429987153606166, "grad_norm": 7.616239070892334, "learning_rate": 9.845120980416057e-06, "loss": 0.2061, "num_input_tokens_seen": 40322144, "step": 18690 }, { "epoch": 3.4309047531657186, "grad_norm": 2.065453052520752, "learning_rate": 9.844923158767096e-06, "loss": 0.0759, "num_input_tokens_seen": 40333088, "step": 18695 }, { "epoch": 3.4318223527252707, "grad_norm": 0.09128110855817795, "learning_rate": 9.844725212853365e-06, "loss": 0.1533, "num_input_tokens_seen": 40344960, "step": 18700 }, { "epoch": 3.4327399522848228, "grad_norm": 0.0894981101155281, "learning_rate": 9.844527142679941e-06, "loss": 0.2281, "num_input_tokens_seen": 40357216, "step": 18705 }, { "epoch": 3.4336575518443753, "grad_norm": 0.22377751767635345, "learning_rate": 9.844328948251904e-06, "loss": 0.1983, "num_input_tokens_seen": 40368128, "step": 18710 }, { "epoch": 3.4345751514039273, "grad_norm": 0.06820018589496613, "learning_rate": 9.844130629574338e-06, "loss": 0.2316, "num_input_tokens_seen": 40379520, "step": 18715 }, { "epoch": 3.4354927509634794, "grad_norm": 0.5345984101295471, "learning_rate": 9.843932186652328e-06, "loss": 0.0973, "num_input_tokens_seen": 40389952, "step": 18720 }, { "epoch": 3.436410350523032, "grad_norm": 0.14597059786319733, "learning_rate": 9.843733619490965e-06, "loss": 0.2094, "num_input_tokens_seen": 40400992, "step": 18725 }, { "epoch": 3.437327950082584, "grad_norm": 2.1775994300842285, "learning_rate": 9.843534928095343e-06, "loss": 0.2518, "num_input_tokens_seen": 40412288, "step": 18730 }, { "epoch": 3.438245549642136, "grad_norm": 6.518863677978516, "learning_rate": 9.843336112470556e-06, "loss": 0.1688, "num_input_tokens_seen": 40423744, "step": 18735 }, { "epoch": 3.4391631492016885, "grad_norm": 0.40854763984680176, "learning_rate": 9.843137172621705e-06, "loss": 0.1516, "num_input_tokens_seen": 40435456, "step": 18740 }, { "epoch": 3.4400807487612406, "grad_norm": 29.84347915649414, "learning_rate": 9.842938108553892e-06, "loss": 0.2439, "num_input_tokens_seen": 40446976, "step": 18745 }, { "epoch": 3.4409983483207927, "grad_norm": 30.94170379638672, "learning_rate": 9.84273892027222e-06, "loss": 0.0497, "num_input_tokens_seen": 40457664, "step": 18750 }, { "epoch": 3.441915947880345, "grad_norm": 14.013007164001465, "learning_rate": 9.842539607781803e-06, "loss": 0.4448, "num_input_tokens_seen": 40468480, "step": 18755 }, { "epoch": 3.4428335474398972, "grad_norm": 0.41560059785842896, "learning_rate": 9.842340171087748e-06, "loss": 0.2823, "num_input_tokens_seen": 40479712, "step": 18760 }, { "epoch": 3.4437511469994493, "grad_norm": 0.2957639694213867, "learning_rate": 9.842140610195174e-06, "loss": 0.0343, "num_input_tokens_seen": 40490720, "step": 18765 }, { "epoch": 3.444668746559002, "grad_norm": 0.20225395262241364, "learning_rate": 9.841940925109198e-06, "loss": 0.1847, "num_input_tokens_seen": 40501664, "step": 18770 }, { "epoch": 3.445586346118554, "grad_norm": 11.84572982788086, "learning_rate": 9.841741115834942e-06, "loss": 0.1938, "num_input_tokens_seen": 40512704, "step": 18775 }, { "epoch": 3.446503945678106, "grad_norm": 0.1678258776664734, "learning_rate": 9.841541182377528e-06, "loss": 0.0073, "num_input_tokens_seen": 40523744, "step": 18780 }, { "epoch": 3.4474215452376584, "grad_norm": 23.882802963256836, "learning_rate": 9.841341124742089e-06, "loss": 0.1812, "num_input_tokens_seen": 40532736, "step": 18785 }, { "epoch": 3.4483391447972105, "grad_norm": 0.8145254850387573, "learning_rate": 9.841140942933752e-06, "loss": 0.1736, "num_input_tokens_seen": 40543584, "step": 18790 }, { "epoch": 3.4492567443567625, "grad_norm": 21.9256649017334, "learning_rate": 9.840940636957655e-06, "loss": 0.0796, "num_input_tokens_seen": 40553632, "step": 18795 }, { "epoch": 3.450174343916315, "grad_norm": 0.452676385641098, "learning_rate": 9.84074020681893e-06, "loss": 0.1277, "num_input_tokens_seen": 40565120, "step": 18800 }, { "epoch": 3.451091943475867, "grad_norm": 5.249226093292236, "learning_rate": 9.840539652522724e-06, "loss": 0.172, "num_input_tokens_seen": 40576256, "step": 18805 }, { "epoch": 3.452009543035419, "grad_norm": 7.090942859649658, "learning_rate": 9.840338974074178e-06, "loss": 0.0724, "num_input_tokens_seen": 40587648, "step": 18810 }, { "epoch": 3.4529271425949717, "grad_norm": 0.33902427554130554, "learning_rate": 9.840138171478437e-06, "loss": 0.2789, "num_input_tokens_seen": 40599616, "step": 18815 }, { "epoch": 3.4538447421545238, "grad_norm": 0.2134726494550705, "learning_rate": 9.839937244740655e-06, "loss": 0.0234, "num_input_tokens_seen": 40610848, "step": 18820 }, { "epoch": 3.454762341714076, "grad_norm": 0.32367855310440063, "learning_rate": 9.839736193865982e-06, "loss": 0.2884, "num_input_tokens_seen": 40622560, "step": 18825 }, { "epoch": 3.4556799412736283, "grad_norm": 0.44240838289260864, "learning_rate": 9.83953501885958e-06, "loss": 0.2304, "num_input_tokens_seen": 40633056, "step": 18830 }, { "epoch": 3.4565975408331804, "grad_norm": 37.38869094848633, "learning_rate": 9.839333719726603e-06, "loss": 0.0525, "num_input_tokens_seen": 40644416, "step": 18835 }, { "epoch": 3.4575151403927324, "grad_norm": 4.829779148101807, "learning_rate": 9.839132296472217e-06, "loss": 0.1583, "num_input_tokens_seen": 40653760, "step": 18840 }, { "epoch": 3.458432739952285, "grad_norm": 5.979888439178467, "learning_rate": 9.838930749101587e-06, "loss": 0.1783, "num_input_tokens_seen": 40664928, "step": 18845 }, { "epoch": 3.459350339511837, "grad_norm": 14.67837142944336, "learning_rate": 9.838729077619884e-06, "loss": 0.2228, "num_input_tokens_seen": 40676480, "step": 18850 }, { "epoch": 3.460267939071389, "grad_norm": 50.190208435058594, "learning_rate": 9.838527282032279e-06, "loss": 0.3, "num_input_tokens_seen": 40687488, "step": 18855 }, { "epoch": 3.4611855386309416, "grad_norm": 0.20066478848457336, "learning_rate": 9.838325362343948e-06, "loss": 0.1996, "num_input_tokens_seen": 40697440, "step": 18860 }, { "epoch": 3.4621031381904936, "grad_norm": 1.9078125953674316, "learning_rate": 9.838123318560072e-06, "loss": 0.1058, "num_input_tokens_seen": 40707840, "step": 18865 }, { "epoch": 3.4630207377500457, "grad_norm": 0.14734087884426117, "learning_rate": 9.837921150685828e-06, "loss": 0.0039, "num_input_tokens_seen": 40718624, "step": 18870 }, { "epoch": 3.463938337309598, "grad_norm": 0.10858921706676483, "learning_rate": 9.837718858726406e-06, "loss": 0.2357, "num_input_tokens_seen": 40728096, "step": 18875 }, { "epoch": 3.4648559368691503, "grad_norm": 0.42433395981788635, "learning_rate": 9.837516442686993e-06, "loss": 0.3366, "num_input_tokens_seen": 40739552, "step": 18880 }, { "epoch": 3.4657735364287023, "grad_norm": 4.185512542724609, "learning_rate": 9.837313902572783e-06, "loss": 0.261, "num_input_tokens_seen": 40751392, "step": 18885 }, { "epoch": 3.466691135988255, "grad_norm": 0.06700058281421661, "learning_rate": 9.837111238388966e-06, "loss": 0.0415, "num_input_tokens_seen": 40762272, "step": 18890 }, { "epoch": 3.467608735547807, "grad_norm": 30.83592414855957, "learning_rate": 9.836908450140743e-06, "loss": 0.1552, "num_input_tokens_seen": 40772288, "step": 18895 }, { "epoch": 3.468526335107359, "grad_norm": 10.913375854492188, "learning_rate": 9.836705537833315e-06, "loss": 0.122, "num_input_tokens_seen": 40784960, "step": 18900 }, { "epoch": 3.4694439346669115, "grad_norm": 21.95378303527832, "learning_rate": 9.836502501471886e-06, "loss": 0.1262, "num_input_tokens_seen": 40796896, "step": 18905 }, { "epoch": 3.4703615342264635, "grad_norm": 0.12967194616794586, "learning_rate": 9.836299341061663e-06, "loss": 0.0061, "num_input_tokens_seen": 40807808, "step": 18910 }, { "epoch": 3.4712791337860156, "grad_norm": 10.364523887634277, "learning_rate": 9.83609605660786e-06, "loss": 0.1991, "num_input_tokens_seen": 40818368, "step": 18915 }, { "epoch": 3.472196733345568, "grad_norm": 0.046508319675922394, "learning_rate": 9.835892648115686e-06, "loss": 0.0066, "num_input_tokens_seen": 40830048, "step": 18920 }, { "epoch": 3.47311433290512, "grad_norm": 0.18397966027259827, "learning_rate": 9.835689115590361e-06, "loss": 0.04, "num_input_tokens_seen": 40840928, "step": 18925 }, { "epoch": 3.4740319324646722, "grad_norm": 9.516895294189453, "learning_rate": 9.835485459037107e-06, "loss": 0.4758, "num_input_tokens_seen": 40852896, "step": 18930 }, { "epoch": 3.4749495320242247, "grad_norm": 8.08585262298584, "learning_rate": 9.835281678461141e-06, "loss": 0.3309, "num_input_tokens_seen": 40863936, "step": 18935 }, { "epoch": 3.475867131583777, "grad_norm": 0.3954916298389435, "learning_rate": 9.835077773867699e-06, "loss": 0.1204, "num_input_tokens_seen": 40874496, "step": 18940 }, { "epoch": 3.476784731143329, "grad_norm": 12.459527015686035, "learning_rate": 9.834873745262002e-06, "loss": 0.0148, "num_input_tokens_seen": 40885312, "step": 18945 }, { "epoch": 3.4777023307028814, "grad_norm": 1.3206589221954346, "learning_rate": 9.834669592649288e-06, "loss": 0.0117, "num_input_tokens_seen": 40895840, "step": 18950 }, { "epoch": 3.4786199302624334, "grad_norm": 23.048500061035156, "learning_rate": 9.83446531603479e-06, "loss": 0.324, "num_input_tokens_seen": 40904768, "step": 18955 }, { "epoch": 3.4795375298219855, "grad_norm": 0.26996520161628723, "learning_rate": 9.834260915423752e-06, "loss": 0.1348, "num_input_tokens_seen": 40915040, "step": 18960 }, { "epoch": 3.480455129381538, "grad_norm": 3.333540678024292, "learning_rate": 9.834056390821414e-06, "loss": 0.2244, "num_input_tokens_seen": 40926624, "step": 18965 }, { "epoch": 3.48137272894109, "grad_norm": 0.16812999546527863, "learning_rate": 9.833851742233022e-06, "loss": 0.0944, "num_input_tokens_seen": 40937248, "step": 18970 }, { "epoch": 3.482290328500642, "grad_norm": 0.13913287222385406, "learning_rate": 9.833646969663824e-06, "loss": 0.248, "num_input_tokens_seen": 40949792, "step": 18975 }, { "epoch": 3.4832079280601946, "grad_norm": 0.45899251103401184, "learning_rate": 9.83344207311907e-06, "loss": 0.2798, "num_input_tokens_seen": 40960608, "step": 18980 }, { "epoch": 3.4841255276197467, "grad_norm": 0.06086389720439911, "learning_rate": 9.833237052604021e-06, "loss": 0.1314, "num_input_tokens_seen": 40972128, "step": 18985 }, { "epoch": 3.4850431271792988, "grad_norm": 27.20171546936035, "learning_rate": 9.833031908123932e-06, "loss": 0.4304, "num_input_tokens_seen": 40982752, "step": 18990 }, { "epoch": 3.4859607267388513, "grad_norm": 0.26940974593162537, "learning_rate": 9.832826639684065e-06, "loss": 0.1269, "num_input_tokens_seen": 40994400, "step": 18995 }, { "epoch": 3.4868783262984033, "grad_norm": 0.0639592781662941, "learning_rate": 9.832621247289684e-06, "loss": 0.1178, "num_input_tokens_seen": 41004736, "step": 19000 }, { "epoch": 3.487795925857956, "grad_norm": 4.399785041809082, "learning_rate": 9.832415730946059e-06, "loss": 0.2045, "num_input_tokens_seen": 41015712, "step": 19005 }, { "epoch": 3.488713525417508, "grad_norm": 19.183185577392578, "learning_rate": 9.832210090658461e-06, "loss": 0.2111, "num_input_tokens_seen": 41027168, "step": 19010 }, { "epoch": 3.48963112497706, "grad_norm": 5.679355621337891, "learning_rate": 9.83200432643216e-06, "loss": 0.0911, "num_input_tokens_seen": 41038688, "step": 19015 }, { "epoch": 3.4905487245366125, "grad_norm": 0.47676247358322144, "learning_rate": 9.831798438272439e-06, "loss": 0.1629, "num_input_tokens_seen": 41050272, "step": 19020 }, { "epoch": 3.4914663240961645, "grad_norm": 6.494185447692871, "learning_rate": 9.831592426184577e-06, "loss": 0.2738, "num_input_tokens_seen": 41061984, "step": 19025 }, { "epoch": 3.4923839236557166, "grad_norm": 11.59051513671875, "learning_rate": 9.831386290173859e-06, "loss": 0.1984, "num_input_tokens_seen": 41073056, "step": 19030 }, { "epoch": 3.493301523215269, "grad_norm": 1.086983561515808, "learning_rate": 9.831180030245568e-06, "loss": 0.1603, "num_input_tokens_seen": 41083840, "step": 19035 }, { "epoch": 3.494219122774821, "grad_norm": 17.547636032104492, "learning_rate": 9.830973646404997e-06, "loss": 0.1743, "num_input_tokens_seen": 41093568, "step": 19040 }, { "epoch": 3.4951367223343732, "grad_norm": 3.970952033996582, "learning_rate": 9.83076713865744e-06, "loss": 0.3996, "num_input_tokens_seen": 41104480, "step": 19045 }, { "epoch": 3.4960543218939257, "grad_norm": 0.1471462845802307, "learning_rate": 9.830560507008194e-06, "loss": 0.2489, "num_input_tokens_seen": 41115584, "step": 19050 }, { "epoch": 3.496971921453478, "grad_norm": 11.429628372192383, "learning_rate": 9.830353751462555e-06, "loss": 0.3456, "num_input_tokens_seen": 41126240, "step": 19055 }, { "epoch": 3.49788952101303, "grad_norm": 0.7271351218223572, "learning_rate": 9.830146872025832e-06, "loss": 0.0298, "num_input_tokens_seen": 41137440, "step": 19060 }, { "epoch": 3.4988071205725824, "grad_norm": 14.243261337280273, "learning_rate": 9.829939868703327e-06, "loss": 0.3366, "num_input_tokens_seen": 41147520, "step": 19065 }, { "epoch": 3.4997247201321344, "grad_norm": 0.1470484733581543, "learning_rate": 9.82973274150035e-06, "loss": 0.186, "num_input_tokens_seen": 41158144, "step": 19070 }, { "epoch": 3.5006423196916865, "grad_norm": 0.3027132451534271, "learning_rate": 9.829525490422212e-06, "loss": 0.1539, "num_input_tokens_seen": 41168640, "step": 19075 }, { "epoch": 3.501559919251239, "grad_norm": 0.5669735670089722, "learning_rate": 9.82931811547423e-06, "loss": 0.1211, "num_input_tokens_seen": 41180192, "step": 19080 }, { "epoch": 3.502477518810791, "grad_norm": 3.600071668624878, "learning_rate": 9.829110616661723e-06, "loss": 0.1017, "num_input_tokens_seen": 41190720, "step": 19085 }, { "epoch": 3.503395118370343, "grad_norm": 0.13568946719169617, "learning_rate": 9.828902993990015e-06, "loss": 0.1809, "num_input_tokens_seen": 41201760, "step": 19090 }, { "epoch": 3.5043127179298956, "grad_norm": 0.6510378122329712, "learning_rate": 9.828695247464429e-06, "loss": 0.3039, "num_input_tokens_seen": 41212992, "step": 19095 }, { "epoch": 3.5052303174894477, "grad_norm": 9.852862358093262, "learning_rate": 9.828487377090293e-06, "loss": 0.3329, "num_input_tokens_seen": 41223808, "step": 19100 }, { "epoch": 3.5061479170489998, "grad_norm": 0.1682392954826355, "learning_rate": 9.828279382872939e-06, "loss": 0.215, "num_input_tokens_seen": 41235008, "step": 19105 }, { "epoch": 3.5070655166085523, "grad_norm": 0.24489450454711914, "learning_rate": 9.828071264817703e-06, "loss": 0.131, "num_input_tokens_seen": 41246592, "step": 19110 }, { "epoch": 3.5079831161681043, "grad_norm": 0.5797035694122314, "learning_rate": 9.827863022929922e-06, "loss": 0.1733, "num_input_tokens_seen": 41257312, "step": 19115 }, { "epoch": 3.5089007157276564, "grad_norm": 23.337081909179688, "learning_rate": 9.827654657214936e-06, "loss": 0.2401, "num_input_tokens_seen": 41266976, "step": 19120 }, { "epoch": 3.509818315287209, "grad_norm": 10.351507186889648, "learning_rate": 9.827446167678091e-06, "loss": 0.0213, "num_input_tokens_seen": 41277728, "step": 19125 }, { "epoch": 3.510735914846761, "grad_norm": 0.23253443837165833, "learning_rate": 9.827237554324733e-06, "loss": 0.0122, "num_input_tokens_seen": 41288416, "step": 19130 }, { "epoch": 3.511653514406313, "grad_norm": 31.3612060546875, "learning_rate": 9.827028817160214e-06, "loss": 0.1604, "num_input_tokens_seen": 41299008, "step": 19135 }, { "epoch": 3.5125711139658655, "grad_norm": 38.04725646972656, "learning_rate": 9.826819956189886e-06, "loss": 0.5789, "num_input_tokens_seen": 41309696, "step": 19140 }, { "epoch": 3.5134887135254176, "grad_norm": 0.24575510621070862, "learning_rate": 9.826610971419108e-06, "loss": 0.1597, "num_input_tokens_seen": 41321344, "step": 19145 }, { "epoch": 3.5144063130849696, "grad_norm": 12.071895599365234, "learning_rate": 9.826401862853238e-06, "loss": 0.171, "num_input_tokens_seen": 41331776, "step": 19150 }, { "epoch": 3.515323912644522, "grad_norm": 2.236938238143921, "learning_rate": 9.826192630497642e-06, "loss": 0.1932, "num_input_tokens_seen": 41343392, "step": 19155 }, { "epoch": 3.516241512204074, "grad_norm": 0.07746399194002151, "learning_rate": 9.825983274357684e-06, "loss": 0.1577, "num_input_tokens_seen": 41354208, "step": 19160 }, { "epoch": 3.5171591117636263, "grad_norm": 24.750110626220703, "learning_rate": 9.825773794438735e-06, "loss": 0.1434, "num_input_tokens_seen": 41364768, "step": 19165 }, { "epoch": 3.518076711323179, "grad_norm": 0.9630157351493835, "learning_rate": 9.825564190746166e-06, "loss": 0.1016, "num_input_tokens_seen": 41375744, "step": 19170 }, { "epoch": 3.518994310882731, "grad_norm": 25.291284561157227, "learning_rate": 9.825354463285357e-06, "loss": 0.3105, "num_input_tokens_seen": 41385984, "step": 19175 }, { "epoch": 3.519911910442283, "grad_norm": 0.12816573679447174, "learning_rate": 9.825144612061683e-06, "loss": 0.3683, "num_input_tokens_seen": 41396768, "step": 19180 }, { "epoch": 3.5208295100018354, "grad_norm": 2.0505423545837402, "learning_rate": 9.824934637080528e-06, "loss": 0.1322, "num_input_tokens_seen": 41407104, "step": 19185 }, { "epoch": 3.5217471095613875, "grad_norm": 13.569543838500977, "learning_rate": 9.824724538347278e-06, "loss": 0.1202, "num_input_tokens_seen": 41417536, "step": 19190 }, { "epoch": 3.5226647091209395, "grad_norm": 0.11516871303319931, "learning_rate": 9.824514315867321e-06, "loss": 0.1057, "num_input_tokens_seen": 41426688, "step": 19195 }, { "epoch": 3.523582308680492, "grad_norm": 0.7984564304351807, "learning_rate": 9.82430396964605e-06, "loss": 0.1596, "num_input_tokens_seen": 41438304, "step": 19200 }, { "epoch": 3.524499908240044, "grad_norm": 0.2605800926685333, "learning_rate": 9.824093499688858e-06, "loss": 0.1035, "num_input_tokens_seen": 41448736, "step": 19205 }, { "epoch": 3.525417507799596, "grad_norm": 7.755654335021973, "learning_rate": 9.823882906001145e-06, "loss": 0.4225, "num_input_tokens_seen": 41459712, "step": 19210 }, { "epoch": 3.5263351073591487, "grad_norm": 5.506470680236816, "learning_rate": 9.823672188588312e-06, "loss": 0.0597, "num_input_tokens_seen": 41470048, "step": 19215 }, { "epoch": 3.5272527069187007, "grad_norm": 0.9919108152389526, "learning_rate": 9.823461347455761e-06, "loss": 0.1327, "num_input_tokens_seen": 41481024, "step": 19220 }, { "epoch": 3.528170306478253, "grad_norm": 16.4996395111084, "learning_rate": 9.823250382608905e-06, "loss": 0.2202, "num_input_tokens_seen": 41492128, "step": 19225 }, { "epoch": 3.5290879060378053, "grad_norm": 6.892432689666748, "learning_rate": 9.823039294053152e-06, "loss": 0.5443, "num_input_tokens_seen": 41502752, "step": 19230 }, { "epoch": 3.5300055055973574, "grad_norm": 0.18300409615039825, "learning_rate": 9.822828081793913e-06, "loss": 0.2968, "num_input_tokens_seen": 41513376, "step": 19235 }, { "epoch": 3.5309231051569094, "grad_norm": 0.40145188570022583, "learning_rate": 9.822616745836613e-06, "loss": 0.1168, "num_input_tokens_seen": 41524672, "step": 19240 }, { "epoch": 3.531840704716462, "grad_norm": 42.97392272949219, "learning_rate": 9.822405286186664e-06, "loss": 0.3541, "num_input_tokens_seen": 41534240, "step": 19245 }, { "epoch": 3.532758304276014, "grad_norm": 5.319962978363037, "learning_rate": 9.822193702849496e-06, "loss": 0.0555, "num_input_tokens_seen": 41545184, "step": 19250 }, { "epoch": 3.533675903835566, "grad_norm": 11.031327247619629, "learning_rate": 9.821981995830532e-06, "loss": 0.2691, "num_input_tokens_seen": 41555424, "step": 19255 }, { "epoch": 3.5345935033951186, "grad_norm": 0.5108187794685364, "learning_rate": 9.821770165135203e-06, "loss": 0.0202, "num_input_tokens_seen": 41565632, "step": 19260 }, { "epoch": 3.5355111029546706, "grad_norm": 13.641730308532715, "learning_rate": 9.821558210768942e-06, "loss": 0.0135, "num_input_tokens_seen": 41576480, "step": 19265 }, { "epoch": 3.5364287025142227, "grad_norm": 41.4971923828125, "learning_rate": 9.821346132737188e-06, "loss": 0.1857, "num_input_tokens_seen": 41587968, "step": 19270 }, { "epoch": 3.537346302073775, "grad_norm": 25.519662857055664, "learning_rate": 9.821133931045375e-06, "loss": 0.1964, "num_input_tokens_seen": 41598240, "step": 19275 }, { "epoch": 3.5382639016333273, "grad_norm": 5.681023597717285, "learning_rate": 9.820921605698951e-06, "loss": 0.1162, "num_input_tokens_seen": 41608512, "step": 19280 }, { "epoch": 3.5391815011928793, "grad_norm": 0.7732439041137695, "learning_rate": 9.820709156703359e-06, "loss": 0.3816, "num_input_tokens_seen": 41620096, "step": 19285 }, { "epoch": 3.540099100752432, "grad_norm": 16.370630264282227, "learning_rate": 9.820496584064048e-06, "loss": 0.1239, "num_input_tokens_seen": 41631040, "step": 19290 }, { "epoch": 3.541016700311984, "grad_norm": 5.167906284332275, "learning_rate": 9.820283887786472e-06, "loss": 0.1886, "num_input_tokens_seen": 41640992, "step": 19295 }, { "epoch": 3.541934299871536, "grad_norm": 0.027630966156721115, "learning_rate": 9.820071067876084e-06, "loss": 0.1209, "num_input_tokens_seen": 41651648, "step": 19300 }, { "epoch": 3.5428518994310885, "grad_norm": 22.522783279418945, "learning_rate": 9.819858124338344e-06, "loss": 0.1363, "num_input_tokens_seen": 41662816, "step": 19305 }, { "epoch": 3.5437694989906405, "grad_norm": 0.102939173579216, "learning_rate": 9.819645057178713e-06, "loss": 0.0528, "num_input_tokens_seen": 41674240, "step": 19310 }, { "epoch": 3.5446870985501926, "grad_norm": 4.329491138458252, "learning_rate": 9.819431866402655e-06, "loss": 0.0145, "num_input_tokens_seen": 41684192, "step": 19315 }, { "epoch": 3.545604698109745, "grad_norm": 5.64921760559082, "learning_rate": 9.819218552015639e-06, "loss": 0.1902, "num_input_tokens_seen": 41694752, "step": 19320 }, { "epoch": 3.546522297669297, "grad_norm": 132.25711059570312, "learning_rate": 9.819005114023138e-06, "loss": 0.2297, "num_input_tokens_seen": 41705984, "step": 19325 }, { "epoch": 3.5474398972288492, "grad_norm": 11.314872741699219, "learning_rate": 9.818791552430625e-06, "loss": 0.0338, "num_input_tokens_seen": 41717536, "step": 19330 }, { "epoch": 3.5483574967884017, "grad_norm": 0.08403587341308594, "learning_rate": 9.818577867243575e-06, "loss": 0.3106, "num_input_tokens_seen": 41727776, "step": 19335 }, { "epoch": 3.549275096347954, "grad_norm": 0.0886334776878357, "learning_rate": 9.818364058467471e-06, "loss": 0.2141, "num_input_tokens_seen": 41737856, "step": 19340 }, { "epoch": 3.550192695907506, "grad_norm": 0.9376095533370972, "learning_rate": 9.818150126107798e-06, "loss": 0.1207, "num_input_tokens_seen": 41749280, "step": 19345 }, { "epoch": 3.5511102954670584, "grad_norm": 0.09295523911714554, "learning_rate": 9.817936070170042e-06, "loss": 0.1788, "num_input_tokens_seen": 41761280, "step": 19350 }, { "epoch": 3.5520278950266104, "grad_norm": 9.976542472839355, "learning_rate": 9.817721890659691e-06, "loss": 0.1124, "num_input_tokens_seen": 41772992, "step": 19355 }, { "epoch": 3.5529454945861625, "grad_norm": 6.656527042388916, "learning_rate": 9.817507587582242e-06, "loss": 0.4163, "num_input_tokens_seen": 41783456, "step": 19360 }, { "epoch": 3.553863094145715, "grad_norm": 9.537002563476562, "learning_rate": 9.81729316094319e-06, "loss": 0.2261, "num_input_tokens_seen": 41795104, "step": 19365 }, { "epoch": 3.554780693705267, "grad_norm": 38.937381744384766, "learning_rate": 9.817078610748034e-06, "loss": 0.1368, "num_input_tokens_seen": 41806176, "step": 19370 }, { "epoch": 3.555698293264819, "grad_norm": 0.9374805092811584, "learning_rate": 9.816863937002276e-06, "loss": 0.0824, "num_input_tokens_seen": 41816320, "step": 19375 }, { "epoch": 3.5566158928243716, "grad_norm": 4.712538719177246, "learning_rate": 9.816649139711424e-06, "loss": 0.3615, "num_input_tokens_seen": 41828960, "step": 19380 }, { "epoch": 3.5575334923839237, "grad_norm": 0.45234280824661255, "learning_rate": 9.816434218880989e-06, "loss": 0.1147, "num_input_tokens_seen": 41840160, "step": 19385 }, { "epoch": 3.5584510919434758, "grad_norm": 0.7251580953598022, "learning_rate": 9.81621917451648e-06, "loss": 0.1215, "num_input_tokens_seen": 41851232, "step": 19390 }, { "epoch": 3.5593686915030283, "grad_norm": 1.1979948282241821, "learning_rate": 9.816004006623411e-06, "loss": 0.1019, "num_input_tokens_seen": 41862016, "step": 19395 }, { "epoch": 3.5602862910625803, "grad_norm": 1.3796164989471436, "learning_rate": 9.815788715207306e-06, "loss": 0.2213, "num_input_tokens_seen": 41873472, "step": 19400 }, { "epoch": 3.5612038906221324, "grad_norm": 0.0791381448507309, "learning_rate": 9.815573300273684e-06, "loss": 0.0522, "num_input_tokens_seen": 41883840, "step": 19405 }, { "epoch": 3.562121490181685, "grad_norm": 0.7236403226852417, "learning_rate": 9.81535776182807e-06, "loss": 0.0605, "num_input_tokens_seen": 41894048, "step": 19410 }, { "epoch": 3.563039089741237, "grad_norm": 2.802889347076416, "learning_rate": 9.815142099875994e-06, "loss": 0.117, "num_input_tokens_seen": 41905376, "step": 19415 }, { "epoch": 3.563956689300789, "grad_norm": 5.588127136230469, "learning_rate": 9.814926314422983e-06, "loss": 0.2376, "num_input_tokens_seen": 41917536, "step": 19420 }, { "epoch": 3.5648742888603415, "grad_norm": 0.1864154189825058, "learning_rate": 9.814710405474577e-06, "loss": 0.1262, "num_input_tokens_seen": 41928224, "step": 19425 }, { "epoch": 3.5657918884198936, "grad_norm": 0.39173251390457153, "learning_rate": 9.81449437303631e-06, "loss": 0.0907, "num_input_tokens_seen": 41938912, "step": 19430 }, { "epoch": 3.5667094879794456, "grad_norm": 2.128056526184082, "learning_rate": 9.814278217113725e-06, "loss": 0.0107, "num_input_tokens_seen": 41949280, "step": 19435 }, { "epoch": 3.567627087538998, "grad_norm": 19.29863739013672, "learning_rate": 9.814061937712364e-06, "loss": 0.269, "num_input_tokens_seen": 41960288, "step": 19440 }, { "epoch": 3.56854468709855, "grad_norm": 3.872668504714966, "learning_rate": 9.813845534837776e-06, "loss": 0.0976, "num_input_tokens_seen": 41970272, "step": 19445 }, { "epoch": 3.5694622866581023, "grad_norm": 0.0265728160738945, "learning_rate": 9.813629008495511e-06, "loss": 0.1391, "num_input_tokens_seen": 41982368, "step": 19450 }, { "epoch": 3.570379886217655, "grad_norm": 11.856395721435547, "learning_rate": 9.813412358691122e-06, "loss": 0.416, "num_input_tokens_seen": 41993120, "step": 19455 }, { "epoch": 3.571297485777207, "grad_norm": 0.03936060145497322, "learning_rate": 9.813195585430166e-06, "loss": 0.0887, "num_input_tokens_seen": 42002880, "step": 19460 }, { "epoch": 3.572215085336759, "grad_norm": 25.630456924438477, "learning_rate": 9.812978688718204e-06, "loss": 0.013, "num_input_tokens_seen": 42013248, "step": 19465 }, { "epoch": 3.5731326848963114, "grad_norm": 20.818002700805664, "learning_rate": 9.812761668560797e-06, "loss": 0.1958, "num_input_tokens_seen": 42025056, "step": 19470 }, { "epoch": 3.5740502844558635, "grad_norm": 1.745359182357788, "learning_rate": 9.812544524963512e-06, "loss": 0.1335, "num_input_tokens_seen": 42036320, "step": 19475 }, { "epoch": 3.5749678840154155, "grad_norm": 10.735583305358887, "learning_rate": 9.81232725793192e-06, "loss": 0.1226, "num_input_tokens_seen": 42047168, "step": 19480 }, { "epoch": 3.575885483574968, "grad_norm": 7.202983379364014, "learning_rate": 9.812109867471591e-06, "loss": 0.129, "num_input_tokens_seen": 42058336, "step": 19485 }, { "epoch": 3.57680308313452, "grad_norm": 0.35700342059135437, "learning_rate": 9.811892353588103e-06, "loss": 0.1937, "num_input_tokens_seen": 42069024, "step": 19490 }, { "epoch": 3.577720682694072, "grad_norm": 0.9213614463806152, "learning_rate": 9.811674716287034e-06, "loss": 0.0095, "num_input_tokens_seen": 42079264, "step": 19495 }, { "epoch": 3.5786382822536247, "grad_norm": 0.1920875757932663, "learning_rate": 9.811456955573965e-06, "loss": 0.1836, "num_input_tokens_seen": 42089504, "step": 19500 }, { "epoch": 3.5795558818131767, "grad_norm": 0.060492999851703644, "learning_rate": 9.811239071454483e-06, "loss": 0.1911, "num_input_tokens_seen": 42100256, "step": 19505 }, { "epoch": 3.580473481372729, "grad_norm": 0.06411141157150269, "learning_rate": 9.811021063934174e-06, "loss": 0.232, "num_input_tokens_seen": 42112128, "step": 19510 }, { "epoch": 3.5813910809322813, "grad_norm": 0.06231953203678131, "learning_rate": 9.810802933018634e-06, "loss": 0.0281, "num_input_tokens_seen": 42124544, "step": 19515 }, { "epoch": 3.5823086804918334, "grad_norm": 30.102319717407227, "learning_rate": 9.810584678713454e-06, "loss": 0.2959, "num_input_tokens_seen": 42135168, "step": 19520 }, { "epoch": 3.5832262800513854, "grad_norm": 12.541627883911133, "learning_rate": 9.81036630102423e-06, "loss": 0.2318, "num_input_tokens_seen": 42146496, "step": 19525 }, { "epoch": 3.584143879610938, "grad_norm": 47.823001861572266, "learning_rate": 9.810147799956568e-06, "loss": 0.0411, "num_input_tokens_seen": 42157664, "step": 19530 }, { "epoch": 3.58506147917049, "grad_norm": 62.24592590332031, "learning_rate": 9.80992917551607e-06, "loss": 0.3048, "num_input_tokens_seen": 42167744, "step": 19535 }, { "epoch": 3.585979078730042, "grad_norm": 15.441588401794434, "learning_rate": 9.809710427708342e-06, "loss": 0.293, "num_input_tokens_seen": 42179392, "step": 19540 }, { "epoch": 3.5868966782895946, "grad_norm": 0.03254914656281471, "learning_rate": 9.809491556538999e-06, "loss": 0.083, "num_input_tokens_seen": 42189120, "step": 19545 }, { "epoch": 3.5878142778491466, "grad_norm": 0.4764321446418762, "learning_rate": 9.809272562013648e-06, "loss": 0.1977, "num_input_tokens_seen": 42200160, "step": 19550 }, { "epoch": 3.5887318774086987, "grad_norm": 0.4779336154460907, "learning_rate": 9.809053444137911e-06, "loss": 0.1157, "num_input_tokens_seen": 42210944, "step": 19555 }, { "epoch": 3.589649476968251, "grad_norm": 7.219424724578857, "learning_rate": 9.808834202917408e-06, "loss": 0.1598, "num_input_tokens_seen": 42221696, "step": 19560 }, { "epoch": 3.5905670765278033, "grad_norm": 7.311249256134033, "learning_rate": 9.808614838357759e-06, "loss": 0.2244, "num_input_tokens_seen": 42232896, "step": 19565 }, { "epoch": 3.5914846760873553, "grad_norm": 0.13534995913505554, "learning_rate": 9.808395350464592e-06, "loss": 0.1253, "num_input_tokens_seen": 42243200, "step": 19570 }, { "epoch": 3.592402275646908, "grad_norm": 0.16550882160663605, "learning_rate": 9.808175739243538e-06, "loss": 0.2986, "num_input_tokens_seen": 42254368, "step": 19575 }, { "epoch": 3.59331987520646, "grad_norm": 18.298912048339844, "learning_rate": 9.807956004700226e-06, "loss": 0.1494, "num_input_tokens_seen": 42265536, "step": 19580 }, { "epoch": 3.594237474766012, "grad_norm": 0.8960303068161011, "learning_rate": 9.807736146840295e-06, "loss": 0.1606, "num_input_tokens_seen": 42276192, "step": 19585 }, { "epoch": 3.5951550743255645, "grad_norm": 66.31971740722656, "learning_rate": 9.807516165669385e-06, "loss": 0.5063, "num_input_tokens_seen": 42286432, "step": 19590 }, { "epoch": 3.5960726738851165, "grad_norm": 0.08166347444057465, "learning_rate": 9.807296061193134e-06, "loss": 0.2429, "num_input_tokens_seen": 42298400, "step": 19595 }, { "epoch": 3.5969902734446686, "grad_norm": 0.5498729348182678, "learning_rate": 9.80707583341719e-06, "loss": 0.085, "num_input_tokens_seen": 42308928, "step": 19600 }, { "epoch": 3.597907873004221, "grad_norm": 22.608144760131836, "learning_rate": 9.806855482347202e-06, "loss": 0.2904, "num_input_tokens_seen": 42318752, "step": 19605 }, { "epoch": 3.598825472563773, "grad_norm": 0.10214384645223618, "learning_rate": 9.806635007988821e-06, "loss": 0.1087, "num_input_tokens_seen": 42329408, "step": 19610 }, { "epoch": 3.5997430721233252, "grad_norm": 8.36414909362793, "learning_rate": 9.8064144103477e-06, "loss": 0.1719, "num_input_tokens_seen": 42340064, "step": 19615 }, { "epoch": 3.6006606716828777, "grad_norm": 0.1754952073097229, "learning_rate": 9.8061936894295e-06, "loss": 0.2504, "num_input_tokens_seen": 42350272, "step": 19620 }, { "epoch": 3.60157827124243, "grad_norm": 0.4727093577384949, "learning_rate": 9.805972845239881e-06, "loss": 0.1326, "num_input_tokens_seen": 42360800, "step": 19625 }, { "epoch": 3.602495870801982, "grad_norm": 0.24991969764232635, "learning_rate": 9.805751877784507e-06, "loss": 0.0901, "num_input_tokens_seen": 42371168, "step": 19630 }, { "epoch": 3.6034134703615344, "grad_norm": 9.122883796691895, "learning_rate": 9.805530787069044e-06, "loss": 0.1403, "num_input_tokens_seen": 42381824, "step": 19635 }, { "epoch": 3.6043310699210864, "grad_norm": 0.18546371161937714, "learning_rate": 9.805309573099165e-06, "loss": 0.1794, "num_input_tokens_seen": 42393056, "step": 19640 }, { "epoch": 3.6052486694806385, "grad_norm": 10.641755104064941, "learning_rate": 9.805088235880545e-06, "loss": 0.2728, "num_input_tokens_seen": 42403840, "step": 19645 }, { "epoch": 3.606166269040191, "grad_norm": 0.5951786041259766, "learning_rate": 9.804866775418856e-06, "loss": 0.0931, "num_input_tokens_seen": 42416256, "step": 19650 }, { "epoch": 3.607083868599743, "grad_norm": 0.8516272306442261, "learning_rate": 9.804645191719784e-06, "loss": 0.1218, "num_input_tokens_seen": 42426368, "step": 19655 }, { "epoch": 3.608001468159295, "grad_norm": 15.989974021911621, "learning_rate": 9.804423484789008e-06, "loss": 0.1893, "num_input_tokens_seen": 42437728, "step": 19660 }, { "epoch": 3.6089190677188476, "grad_norm": 0.27222150564193726, "learning_rate": 9.804201654632215e-06, "loss": 0.041, "num_input_tokens_seen": 42448352, "step": 19665 }, { "epoch": 3.6098366672783997, "grad_norm": 15.76037311553955, "learning_rate": 9.803979701255095e-06, "loss": 0.1687, "num_input_tokens_seen": 42459200, "step": 19670 }, { "epoch": 3.6107542668379518, "grad_norm": 10.558021545410156, "learning_rate": 9.803757624663342e-06, "loss": 0.196, "num_input_tokens_seen": 42469216, "step": 19675 }, { "epoch": 3.6116718663975043, "grad_norm": 0.08809929341077805, "learning_rate": 9.80353542486265e-06, "loss": 0.2398, "num_input_tokens_seen": 42480096, "step": 19680 }, { "epoch": 3.6125894659570563, "grad_norm": 0.2515939772129059, "learning_rate": 9.803313101858723e-06, "loss": 0.0219, "num_input_tokens_seen": 42490880, "step": 19685 }, { "epoch": 3.6135070655166084, "grad_norm": 0.27630552649497986, "learning_rate": 9.803090655657258e-06, "loss": 0.0197, "num_input_tokens_seen": 42501728, "step": 19690 }, { "epoch": 3.614424665076161, "grad_norm": 0.5023151636123657, "learning_rate": 9.80286808626396e-06, "loss": 0.2374, "num_input_tokens_seen": 42512224, "step": 19695 }, { "epoch": 3.615342264635713, "grad_norm": 21.461654663085938, "learning_rate": 9.802645393684539e-06, "loss": 0.1477, "num_input_tokens_seen": 42521600, "step": 19700 }, { "epoch": 3.616259864195265, "grad_norm": 0.06278405338525772, "learning_rate": 9.802422577924708e-06, "loss": 0.0568, "num_input_tokens_seen": 42532288, "step": 19705 }, { "epoch": 3.6171774637548175, "grad_norm": 0.16703562438488007, "learning_rate": 9.802199638990181e-06, "loss": 0.3319, "num_input_tokens_seen": 42542368, "step": 19710 }, { "epoch": 3.6180950633143696, "grad_norm": 22.903669357299805, "learning_rate": 9.801976576886676e-06, "loss": 0.2324, "num_input_tokens_seen": 42551584, "step": 19715 }, { "epoch": 3.6190126628739216, "grad_norm": 0.14059320092201233, "learning_rate": 9.801753391619915e-06, "loss": 0.0091, "num_input_tokens_seen": 42562240, "step": 19720 }, { "epoch": 3.619930262433474, "grad_norm": 0.1898830682039261, "learning_rate": 9.80153008319562e-06, "loss": 0.1894, "num_input_tokens_seen": 42573440, "step": 19725 }, { "epoch": 3.620847861993026, "grad_norm": 0.17215897142887115, "learning_rate": 9.80130665161952e-06, "loss": 0.0045, "num_input_tokens_seen": 42583840, "step": 19730 }, { "epoch": 3.6217654615525783, "grad_norm": 7.7606730461120605, "learning_rate": 9.801083096897347e-06, "loss": 0.1534, "num_input_tokens_seen": 42594080, "step": 19735 }, { "epoch": 3.622683061112131, "grad_norm": 19.832904815673828, "learning_rate": 9.800859419034833e-06, "loss": 0.0834, "num_input_tokens_seen": 42604448, "step": 19740 }, { "epoch": 3.623600660671683, "grad_norm": 19.794965744018555, "learning_rate": 9.800635618037717e-06, "loss": 0.2362, "num_input_tokens_seen": 42615744, "step": 19745 }, { "epoch": 3.624518260231235, "grad_norm": 143.26422119140625, "learning_rate": 9.800411693911735e-06, "loss": 0.1017, "num_input_tokens_seen": 42627552, "step": 19750 }, { "epoch": 3.6254358597907874, "grad_norm": 0.1398877501487732, "learning_rate": 9.800187646662636e-06, "loss": 0.167, "num_input_tokens_seen": 42638400, "step": 19755 }, { "epoch": 3.6263534593503395, "grad_norm": 21.508302688598633, "learning_rate": 9.799963476296162e-06, "loss": 0.374, "num_input_tokens_seen": 42648672, "step": 19760 }, { "epoch": 3.6272710589098915, "grad_norm": 0.14287611842155457, "learning_rate": 9.799739182818062e-06, "loss": 0.1651, "num_input_tokens_seen": 42659776, "step": 19765 }, { "epoch": 3.628188658469444, "grad_norm": 13.972132682800293, "learning_rate": 9.799514766234093e-06, "loss": 0.1048, "num_input_tokens_seen": 42670496, "step": 19770 }, { "epoch": 3.629106258028996, "grad_norm": 0.5993027687072754, "learning_rate": 9.79929022655001e-06, "loss": 0.2916, "num_input_tokens_seen": 42682560, "step": 19775 }, { "epoch": 3.630023857588548, "grad_norm": 15.335832595825195, "learning_rate": 9.799065563771569e-06, "loss": 0.1313, "num_input_tokens_seen": 42692608, "step": 19780 }, { "epoch": 3.6309414571481007, "grad_norm": 0.12048392742872238, "learning_rate": 9.798840777904535e-06, "loss": 0.0402, "num_input_tokens_seen": 42703616, "step": 19785 }, { "epoch": 3.6318590567076527, "grad_norm": 0.048068780452013016, "learning_rate": 9.798615868954672e-06, "loss": 0.3588, "num_input_tokens_seen": 42713472, "step": 19790 }, { "epoch": 3.632776656267205, "grad_norm": 0.2623417377471924, "learning_rate": 9.79839083692775e-06, "loss": 0.0645, "num_input_tokens_seen": 42723712, "step": 19795 }, { "epoch": 3.6336942558267573, "grad_norm": 2.964958667755127, "learning_rate": 9.798165681829538e-06, "loss": 0.011, "num_input_tokens_seen": 42735136, "step": 19800 }, { "epoch": 3.6346118553863094, "grad_norm": 0.531523585319519, "learning_rate": 9.797940403665815e-06, "loss": 0.3146, "num_input_tokens_seen": 42746208, "step": 19805 }, { "epoch": 3.6355294549458614, "grad_norm": 35.065879821777344, "learning_rate": 9.797715002442356e-06, "loss": 0.1319, "num_input_tokens_seen": 42757568, "step": 19810 }, { "epoch": 3.636447054505414, "grad_norm": 0.1450129598379135, "learning_rate": 9.797489478164943e-06, "loss": 0.0971, "num_input_tokens_seen": 42767232, "step": 19815 }, { "epoch": 3.637364654064966, "grad_norm": 9.56781005859375, "learning_rate": 9.79726383083936e-06, "loss": 0.159, "num_input_tokens_seen": 42777568, "step": 19820 }, { "epoch": 3.638282253624518, "grad_norm": 9.594610214233398, "learning_rate": 9.797038060471395e-06, "loss": 0.3197, "num_input_tokens_seen": 42788256, "step": 19825 }, { "epoch": 3.6391998531840706, "grad_norm": 1.328224778175354, "learning_rate": 9.796812167066837e-06, "loss": 0.3116, "num_input_tokens_seen": 42798112, "step": 19830 }, { "epoch": 3.6401174527436226, "grad_norm": 10.79432201385498, "learning_rate": 9.796586150631485e-06, "loss": 0.2171, "num_input_tokens_seen": 42809440, "step": 19835 }, { "epoch": 3.6410350523031747, "grad_norm": 20.988325119018555, "learning_rate": 9.796360011171128e-06, "loss": 0.2813, "num_input_tokens_seen": 42820416, "step": 19840 }, { "epoch": 3.641952651862727, "grad_norm": 0.8131241202354431, "learning_rate": 9.796133748691575e-06, "loss": 0.0099, "num_input_tokens_seen": 42832000, "step": 19845 }, { "epoch": 3.6428702514222793, "grad_norm": 27.900327682495117, "learning_rate": 9.79590736319862e-06, "loss": 0.2578, "num_input_tokens_seen": 42841792, "step": 19850 }, { "epoch": 3.6437878509818313, "grad_norm": 0.1522253155708313, "learning_rate": 9.795680854698077e-06, "loss": 0.124, "num_input_tokens_seen": 42851680, "step": 19855 }, { "epoch": 3.644705450541384, "grad_norm": 0.48432525992393494, "learning_rate": 9.795454223195752e-06, "loss": 0.2638, "num_input_tokens_seen": 42861792, "step": 19860 }, { "epoch": 3.645623050100936, "grad_norm": 11.901447296142578, "learning_rate": 9.795227468697458e-06, "loss": 0.1517, "num_input_tokens_seen": 42871136, "step": 19865 }, { "epoch": 3.646540649660488, "grad_norm": 5.764828205108643, "learning_rate": 9.795000591209013e-06, "loss": 0.1311, "num_input_tokens_seen": 42882624, "step": 19870 }, { "epoch": 3.6474582492200405, "grad_norm": 7.171640872955322, "learning_rate": 9.794773590736233e-06, "loss": 0.1101, "num_input_tokens_seen": 42894272, "step": 19875 }, { "epoch": 3.6483758487795925, "grad_norm": 0.4407479763031006, "learning_rate": 9.794546467284941e-06, "loss": 0.2087, "num_input_tokens_seen": 42905632, "step": 19880 }, { "epoch": 3.6492934483391446, "grad_norm": 14.562520980834961, "learning_rate": 9.794319220860963e-06, "loss": 0.1359, "num_input_tokens_seen": 42917312, "step": 19885 }, { "epoch": 3.650211047898697, "grad_norm": 0.3812839090824127, "learning_rate": 9.79409185147013e-06, "loss": 0.1091, "num_input_tokens_seen": 42928480, "step": 19890 }, { "epoch": 3.651128647458249, "grad_norm": 0.08994103223085403, "learning_rate": 9.793864359118267e-06, "loss": 0.2115, "num_input_tokens_seen": 42939616, "step": 19895 }, { "epoch": 3.6520462470178012, "grad_norm": 26.53734588623047, "learning_rate": 9.793636743811218e-06, "loss": 0.1455, "num_input_tokens_seen": 42950272, "step": 19900 }, { "epoch": 3.6529638465773537, "grad_norm": 52.17524719238281, "learning_rate": 9.793409005554813e-06, "loss": 0.0464, "num_input_tokens_seen": 42961760, "step": 19905 }, { "epoch": 3.653881446136906, "grad_norm": 12.32036018371582, "learning_rate": 9.793181144354895e-06, "loss": 0.2778, "num_input_tokens_seen": 42973472, "step": 19910 }, { "epoch": 3.654799045696458, "grad_norm": 0.14788486063480377, "learning_rate": 9.792953160217311e-06, "loss": 0.4064, "num_input_tokens_seen": 42985248, "step": 19915 }, { "epoch": 3.6557166452560104, "grad_norm": 1.0368715524673462, "learning_rate": 9.792725053147908e-06, "loss": 0.2283, "num_input_tokens_seen": 42996960, "step": 19920 }, { "epoch": 3.6566342448155624, "grad_norm": 2.479898452758789, "learning_rate": 9.792496823152534e-06, "loss": 0.3232, "num_input_tokens_seen": 43007776, "step": 19925 }, { "epoch": 3.6575518443751145, "grad_norm": 33.51093292236328, "learning_rate": 9.792268470237046e-06, "loss": 0.3899, "num_input_tokens_seen": 43018336, "step": 19930 }, { "epoch": 3.658469443934667, "grad_norm": 36.784175872802734, "learning_rate": 9.792039994407297e-06, "loss": 0.128, "num_input_tokens_seen": 43028896, "step": 19935 }, { "epoch": 3.659387043494219, "grad_norm": 0.3861793279647827, "learning_rate": 9.79181139566915e-06, "loss": 0.2008, "num_input_tokens_seen": 43039168, "step": 19940 }, { "epoch": 3.660304643053771, "grad_norm": 22.277137756347656, "learning_rate": 9.791582674028465e-06, "loss": 0.1685, "num_input_tokens_seen": 43051520, "step": 19945 }, { "epoch": 3.6612222426133236, "grad_norm": 1.1971690654754639, "learning_rate": 9.791353829491112e-06, "loss": 0.3201, "num_input_tokens_seen": 43062688, "step": 19950 }, { "epoch": 3.6621398421728757, "grad_norm": 0.3979995548725128, "learning_rate": 9.791124862062962e-06, "loss": 0.0503, "num_input_tokens_seen": 43073056, "step": 19955 }, { "epoch": 3.6630574417324278, "grad_norm": 12.709593772888184, "learning_rate": 9.790895771749881e-06, "loss": 0.1743, "num_input_tokens_seen": 43085472, "step": 19960 }, { "epoch": 3.6639750412919803, "grad_norm": 0.20045502483844757, "learning_rate": 9.79066655855775e-06, "loss": 0.0482, "num_input_tokens_seen": 43095904, "step": 19965 }, { "epoch": 3.6648926408515323, "grad_norm": 0.169663205742836, "learning_rate": 9.790437222492448e-06, "loss": 0.0327, "num_input_tokens_seen": 43106400, "step": 19970 }, { "epoch": 3.6658102404110844, "grad_norm": 0.04839259013533592, "learning_rate": 9.790207763559855e-06, "loss": 0.0633, "num_input_tokens_seen": 43117920, "step": 19975 }, { "epoch": 3.666727839970637, "grad_norm": 17.028034210205078, "learning_rate": 9.789978181765857e-06, "loss": 0.4297, "num_input_tokens_seen": 43128320, "step": 19980 }, { "epoch": 3.667645439530189, "grad_norm": 9.915905952453613, "learning_rate": 9.789748477116343e-06, "loss": 0.273, "num_input_tokens_seen": 43139456, "step": 19985 }, { "epoch": 3.668563039089741, "grad_norm": 4.483170509338379, "learning_rate": 9.789518649617202e-06, "loss": 0.2422, "num_input_tokens_seen": 43150592, "step": 19990 }, { "epoch": 3.6694806386492935, "grad_norm": 0.35374557971954346, "learning_rate": 9.789288699274333e-06, "loss": 0.1373, "num_input_tokens_seen": 43161088, "step": 19995 }, { "epoch": 3.6703982382088456, "grad_norm": 0.4835081696510315, "learning_rate": 9.78905862609363e-06, "loss": 0.0088, "num_input_tokens_seen": 43171168, "step": 20000 }, { "epoch": 3.6713158377683976, "grad_norm": 0.08266939967870712, "learning_rate": 9.788828430080996e-06, "loss": 0.1565, "num_input_tokens_seen": 43182304, "step": 20005 }, { "epoch": 3.67223343732795, "grad_norm": 0.042388126254081726, "learning_rate": 9.788598111242335e-06, "loss": 0.0411, "num_input_tokens_seen": 43191904, "step": 20010 }, { "epoch": 3.673151036887502, "grad_norm": 22.85418701171875, "learning_rate": 9.788367669583554e-06, "loss": 0.4167, "num_input_tokens_seen": 43204160, "step": 20015 }, { "epoch": 3.6740686364470543, "grad_norm": 10.622634887695312, "learning_rate": 9.788137105110565e-06, "loss": 0.3159, "num_input_tokens_seen": 43215424, "step": 20020 }, { "epoch": 3.674986236006607, "grad_norm": 0.05337826535105705, "learning_rate": 9.787906417829279e-06, "loss": 0.022, "num_input_tokens_seen": 43226272, "step": 20025 }, { "epoch": 3.675903835566159, "grad_norm": 15.531798362731934, "learning_rate": 9.787675607745612e-06, "loss": 0.2766, "num_input_tokens_seen": 43236480, "step": 20030 }, { "epoch": 3.676821435125711, "grad_norm": 6.921792030334473, "learning_rate": 9.787444674865487e-06, "loss": 0.217, "num_input_tokens_seen": 43248288, "step": 20035 }, { "epoch": 3.6777390346852634, "grad_norm": 7.09867000579834, "learning_rate": 9.787213619194827e-06, "loss": 0.102, "num_input_tokens_seen": 43258144, "step": 20040 }, { "epoch": 3.6786566342448155, "grad_norm": 5.563171863555908, "learning_rate": 9.786982440739557e-06, "loss": 0.158, "num_input_tokens_seen": 43268960, "step": 20045 }, { "epoch": 3.6795742338043675, "grad_norm": 0.13875652849674225, "learning_rate": 9.786751139505607e-06, "loss": 0.1156, "num_input_tokens_seen": 43280512, "step": 20050 }, { "epoch": 3.68049183336392, "grad_norm": 0.5561264753341675, "learning_rate": 9.786519715498907e-06, "loss": 0.1976, "num_input_tokens_seen": 43292608, "step": 20055 }, { "epoch": 3.681409432923472, "grad_norm": 8.600552558898926, "learning_rate": 9.786288168725397e-06, "loss": 0.2574, "num_input_tokens_seen": 43303840, "step": 20060 }, { "epoch": 3.682327032483024, "grad_norm": 0.24424432218074799, "learning_rate": 9.786056499191013e-06, "loss": 0.1039, "num_input_tokens_seen": 43315200, "step": 20065 }, { "epoch": 3.6832446320425767, "grad_norm": 0.8957844972610474, "learning_rate": 9.785824706901696e-06, "loss": 0.0783, "num_input_tokens_seen": 43326368, "step": 20070 }, { "epoch": 3.6841622316021287, "grad_norm": 0.15366122126579285, "learning_rate": 9.785592791863394e-06, "loss": 0.1324, "num_input_tokens_seen": 43337792, "step": 20075 }, { "epoch": 3.685079831161681, "grad_norm": 0.2611982524394989, "learning_rate": 9.785360754082054e-06, "loss": 0.1199, "num_input_tokens_seen": 43347456, "step": 20080 }, { "epoch": 3.6859974307212333, "grad_norm": 28.701162338256836, "learning_rate": 9.785128593563627e-06, "loss": 0.0427, "num_input_tokens_seen": 43358272, "step": 20085 }, { "epoch": 3.6869150302807854, "grad_norm": 4.630122184753418, "learning_rate": 9.784896310314068e-06, "loss": 0.1522, "num_input_tokens_seen": 43369088, "step": 20090 }, { "epoch": 3.6878326298403374, "grad_norm": 4.496258735656738, "learning_rate": 9.784663904339336e-06, "loss": 0.2427, "num_input_tokens_seen": 43379520, "step": 20095 }, { "epoch": 3.68875022939989, "grad_norm": 26.765377044677734, "learning_rate": 9.784431375645387e-06, "loss": 0.3469, "num_input_tokens_seen": 43389952, "step": 20100 }, { "epoch": 3.689667828959442, "grad_norm": 0.11765250563621521, "learning_rate": 9.784198724238191e-06, "loss": 0.1011, "num_input_tokens_seen": 43400000, "step": 20105 }, { "epoch": 3.690585428518994, "grad_norm": 11.013115882873535, "learning_rate": 9.783965950123712e-06, "loss": 0.341, "num_input_tokens_seen": 43410528, "step": 20110 }, { "epoch": 3.6915030280785466, "grad_norm": 23.80423927307129, "learning_rate": 9.78373305330792e-06, "loss": 0.1322, "num_input_tokens_seen": 43422176, "step": 20115 }, { "epoch": 3.6924206276380986, "grad_norm": 6.351601600646973, "learning_rate": 9.78350003379679e-06, "loss": 0.2986, "num_input_tokens_seen": 43433248, "step": 20120 }, { "epoch": 3.6933382271976507, "grad_norm": 7.241975784301758, "learning_rate": 9.783266891596298e-06, "loss": 0.0845, "num_input_tokens_seen": 43444384, "step": 20125 }, { "epoch": 3.694255826757203, "grad_norm": 15.581396102905273, "learning_rate": 9.783033626712423e-06, "loss": 0.2048, "num_input_tokens_seen": 43455808, "step": 20130 }, { "epoch": 3.6951734263167553, "grad_norm": 9.077783584594727, "learning_rate": 9.782800239151149e-06, "loss": 0.2276, "num_input_tokens_seen": 43466144, "step": 20135 }, { "epoch": 3.6960910258763073, "grad_norm": 0.6829528212547302, "learning_rate": 9.78256672891846e-06, "loss": 0.1195, "num_input_tokens_seen": 43478240, "step": 20140 }, { "epoch": 3.69700862543586, "grad_norm": 12.595956802368164, "learning_rate": 9.78233309602035e-06, "loss": 0.2107, "num_input_tokens_seen": 43489504, "step": 20145 }, { "epoch": 3.697926224995412, "grad_norm": 0.47671839594841003, "learning_rate": 9.782099340462806e-06, "loss": 0.1615, "num_input_tokens_seen": 43499648, "step": 20150 }, { "epoch": 3.698843824554964, "grad_norm": 0.27897536754608154, "learning_rate": 9.781865462251827e-06, "loss": 0.3868, "num_input_tokens_seen": 43511136, "step": 20155 }, { "epoch": 3.6997614241145165, "grad_norm": 0.29446426033973694, "learning_rate": 9.781631461393408e-06, "loss": 0.2594, "num_input_tokens_seen": 43522624, "step": 20160 }, { "epoch": 3.7006790236740685, "grad_norm": 14.159351348876953, "learning_rate": 9.781397337893553e-06, "loss": 0.0722, "num_input_tokens_seen": 43533632, "step": 20165 }, { "epoch": 3.7015966232336206, "grad_norm": 1.36477530002594, "learning_rate": 9.781163091758269e-06, "loss": 0.1329, "num_input_tokens_seen": 43544512, "step": 20170 }, { "epoch": 3.702514222793173, "grad_norm": 0.18533039093017578, "learning_rate": 9.780928722993559e-06, "loss": 0.0197, "num_input_tokens_seen": 43554848, "step": 20175 }, { "epoch": 3.703431822352725, "grad_norm": 1.5633033514022827, "learning_rate": 9.780694231605438e-06, "loss": 0.1685, "num_input_tokens_seen": 43564768, "step": 20180 }, { "epoch": 3.7043494219122772, "grad_norm": 0.10324762761592865, "learning_rate": 9.78045961759992e-06, "loss": 0.1365, "num_input_tokens_seen": 43574528, "step": 20185 }, { "epoch": 3.7052670214718297, "grad_norm": 1.2967400550842285, "learning_rate": 9.780224880983023e-06, "loss": 0.1178, "num_input_tokens_seen": 43584480, "step": 20190 }, { "epoch": 3.706184621031382, "grad_norm": 0.12568232417106628, "learning_rate": 9.779990021760763e-06, "loss": 0.15, "num_input_tokens_seen": 43596096, "step": 20195 }, { "epoch": 3.707102220590934, "grad_norm": 0.04954377934336662, "learning_rate": 9.77975503993917e-06, "loss": 0.3123, "num_input_tokens_seen": 43606752, "step": 20200 }, { "epoch": 3.7080198201504864, "grad_norm": 6.566340923309326, "learning_rate": 9.779519935524267e-06, "loss": 0.252, "num_input_tokens_seen": 43617504, "step": 20205 }, { "epoch": 3.7089374197100384, "grad_norm": 0.17609356343746185, "learning_rate": 9.779284708522085e-06, "loss": 0.2152, "num_input_tokens_seen": 43628480, "step": 20210 }, { "epoch": 3.7098550192695905, "grad_norm": 5.899863243103027, "learning_rate": 9.77904935893866e-06, "loss": 0.1381, "num_input_tokens_seen": 43640320, "step": 20215 }, { "epoch": 3.710772618829143, "grad_norm": 24.17005157470703, "learning_rate": 9.778813886780023e-06, "loss": 0.2524, "num_input_tokens_seen": 43652320, "step": 20220 }, { "epoch": 3.711690218388695, "grad_norm": 0.0907418355345726, "learning_rate": 9.778578292052218e-06, "loss": 0.35, "num_input_tokens_seen": 43663648, "step": 20225 }, { "epoch": 3.712607817948247, "grad_norm": 35.640953063964844, "learning_rate": 9.778342574761285e-06, "loss": 0.2303, "num_input_tokens_seen": 43675456, "step": 20230 }, { "epoch": 3.7135254175077996, "grad_norm": 12.037981033325195, "learning_rate": 9.778106734913271e-06, "loss": 0.1272, "num_input_tokens_seen": 43686368, "step": 20235 }, { "epoch": 3.7144430170673517, "grad_norm": 5.274600028991699, "learning_rate": 9.777870772514224e-06, "loss": 0.2099, "num_input_tokens_seen": 43698080, "step": 20240 }, { "epoch": 3.7153606166269038, "grad_norm": 0.20588353276252747, "learning_rate": 9.777634687570197e-06, "loss": 0.1522, "num_input_tokens_seen": 43707136, "step": 20245 }, { "epoch": 3.7162782161864563, "grad_norm": 0.3260577917098999, "learning_rate": 9.777398480087246e-06, "loss": 0.1824, "num_input_tokens_seen": 43717248, "step": 20250 }, { "epoch": 3.7171958157460083, "grad_norm": 0.5464293956756592, "learning_rate": 9.777162150071427e-06, "loss": 0.2124, "num_input_tokens_seen": 43728000, "step": 20255 }, { "epoch": 3.7181134153055604, "grad_norm": 19.82135581970215, "learning_rate": 9.776925697528803e-06, "loss": 0.271, "num_input_tokens_seen": 43736416, "step": 20260 }, { "epoch": 3.719031014865113, "grad_norm": 0.08302781730890274, "learning_rate": 9.776689122465439e-06, "loss": 0.2447, "num_input_tokens_seen": 43747104, "step": 20265 }, { "epoch": 3.719948614424665, "grad_norm": 0.970123291015625, "learning_rate": 9.776452424887402e-06, "loss": 0.0973, "num_input_tokens_seen": 43757696, "step": 20270 }, { "epoch": 3.720866213984217, "grad_norm": 10.200821876525879, "learning_rate": 9.776215604800763e-06, "loss": 0.2887, "num_input_tokens_seen": 43769888, "step": 20275 }, { "epoch": 3.7217838135437695, "grad_norm": 7.997385025024414, "learning_rate": 9.775978662211596e-06, "loss": 0.1515, "num_input_tokens_seen": 43780576, "step": 20280 }, { "epoch": 3.7227014131033216, "grad_norm": 0.7272321581840515, "learning_rate": 9.775741597125979e-06, "loss": 0.0698, "num_input_tokens_seen": 43791776, "step": 20285 }, { "epoch": 3.7236190126628737, "grad_norm": 8.296564102172852, "learning_rate": 9.77550440954999e-06, "loss": 0.3793, "num_input_tokens_seen": 43802688, "step": 20290 }, { "epoch": 3.724536612222426, "grad_norm": 7.440361976623535, "learning_rate": 9.775267099489716e-06, "loss": 0.4143, "num_input_tokens_seen": 43813184, "step": 20295 }, { "epoch": 3.725454211781978, "grad_norm": 0.3743448853492737, "learning_rate": 9.775029666951242e-06, "loss": 0.1431, "num_input_tokens_seen": 43825248, "step": 20300 }, { "epoch": 3.7263718113415303, "grad_norm": 0.21096330881118774, "learning_rate": 9.774792111940657e-06, "loss": 0.0497, "num_input_tokens_seen": 43835968, "step": 20305 }, { "epoch": 3.727289410901083, "grad_norm": 0.4910774528980255, "learning_rate": 9.774554434464055e-06, "loss": 0.0928, "num_input_tokens_seen": 43845984, "step": 20310 }, { "epoch": 3.728207010460635, "grad_norm": 19.182933807373047, "learning_rate": 9.774316634527532e-06, "loss": 0.3054, "num_input_tokens_seen": 43855968, "step": 20315 }, { "epoch": 3.729124610020187, "grad_norm": 8.193650245666504, "learning_rate": 9.774078712137185e-06, "loss": 0.2319, "num_input_tokens_seen": 43867648, "step": 20320 }, { "epoch": 3.7300422095797394, "grad_norm": 4.9125075340271, "learning_rate": 9.77384066729912e-06, "loss": 0.2435, "num_input_tokens_seen": 43876992, "step": 20325 }, { "epoch": 3.7309598091392915, "grad_norm": 0.41344892978668213, "learning_rate": 9.77360250001944e-06, "loss": 0.1707, "num_input_tokens_seen": 43887392, "step": 20330 }, { "epoch": 3.731877408698844, "grad_norm": 4.755528450012207, "learning_rate": 9.773364210304254e-06, "loss": 0.1202, "num_input_tokens_seen": 43898784, "step": 20335 }, { "epoch": 3.732795008258396, "grad_norm": 11.381036758422852, "learning_rate": 9.773125798159674e-06, "loss": 0.1214, "num_input_tokens_seen": 43909536, "step": 20340 }, { "epoch": 3.733712607817948, "grad_norm": 14.230938911437988, "learning_rate": 9.772887263591817e-06, "loss": 0.3327, "num_input_tokens_seen": 43919200, "step": 20345 }, { "epoch": 3.7346302073775006, "grad_norm": 4.322297096252441, "learning_rate": 9.772648606606796e-06, "loss": 0.2027, "num_input_tokens_seen": 43930848, "step": 20350 }, { "epoch": 3.7355478069370527, "grad_norm": 4.346655368804932, "learning_rate": 9.772409827210738e-06, "loss": 0.0876, "num_input_tokens_seen": 43941280, "step": 20355 }, { "epoch": 3.7364654064966047, "grad_norm": 0.27026358246803284, "learning_rate": 9.772170925409764e-06, "loss": 0.2279, "num_input_tokens_seen": 43952224, "step": 20360 }, { "epoch": 3.7373830060561573, "grad_norm": 4.4306960105896, "learning_rate": 9.771931901209998e-06, "loss": 0.1423, "num_input_tokens_seen": 43963680, "step": 20365 }, { "epoch": 3.7383006056157093, "grad_norm": 20.765865325927734, "learning_rate": 9.771692754617578e-06, "loss": 0.2578, "num_input_tokens_seen": 43974112, "step": 20370 }, { "epoch": 3.7392182051752614, "grad_norm": 0.32729098200798035, "learning_rate": 9.771453485638635e-06, "loss": 0.1431, "num_input_tokens_seen": 43984288, "step": 20375 }, { "epoch": 3.740135804734814, "grad_norm": 0.21924394369125366, "learning_rate": 9.771214094279304e-06, "loss": 0.0648, "num_input_tokens_seen": 43995168, "step": 20380 }, { "epoch": 3.741053404294366, "grad_norm": 0.2658136487007141, "learning_rate": 9.770974580545727e-06, "loss": 0.3775, "num_input_tokens_seen": 44005920, "step": 20385 }, { "epoch": 3.741971003853918, "grad_norm": 57.860660552978516, "learning_rate": 9.770734944444044e-06, "loss": 0.1684, "num_input_tokens_seen": 44016128, "step": 20390 }, { "epoch": 3.7428886034134705, "grad_norm": 3.602304697036743, "learning_rate": 9.770495185980407e-06, "loss": 0.1542, "num_input_tokens_seen": 44027264, "step": 20395 }, { "epoch": 3.7438062029730226, "grad_norm": 0.11723601073026657, "learning_rate": 9.77025530516096e-06, "loss": 0.1488, "num_input_tokens_seen": 44038080, "step": 20400 }, { "epoch": 3.7447238025325746, "grad_norm": 25.620820999145508, "learning_rate": 9.770015301991858e-06, "loss": 0.1715, "num_input_tokens_seen": 44048832, "step": 20405 }, { "epoch": 3.745641402092127, "grad_norm": 43.43388366699219, "learning_rate": 9.769775176479256e-06, "loss": 0.3543, "num_input_tokens_seen": 44060000, "step": 20410 }, { "epoch": 3.746559001651679, "grad_norm": 0.30874770879745483, "learning_rate": 9.769534928629313e-06, "loss": 0.1915, "num_input_tokens_seen": 44070048, "step": 20415 }, { "epoch": 3.7474766012112313, "grad_norm": 27.610580444335938, "learning_rate": 9.769294558448192e-06, "loss": 0.1289, "num_input_tokens_seen": 44081280, "step": 20420 }, { "epoch": 3.748394200770784, "grad_norm": 5.133686065673828, "learning_rate": 9.769054065942056e-06, "loss": 0.3971, "num_input_tokens_seen": 44090848, "step": 20425 }, { "epoch": 3.749311800330336, "grad_norm": 1.3732898235321045, "learning_rate": 9.768813451117077e-06, "loss": 0.0579, "num_input_tokens_seen": 44101120, "step": 20430 }, { "epoch": 3.750229399889888, "grad_norm": 2.6559321880340576, "learning_rate": 9.768572713979423e-06, "loss": 0.0732, "num_input_tokens_seen": 44111616, "step": 20435 }, { "epoch": 3.7511469994494404, "grad_norm": 0.2234962433576584, "learning_rate": 9.768331854535268e-06, "loss": 0.1295, "num_input_tokens_seen": 44121760, "step": 20440 }, { "epoch": 3.7520645990089925, "grad_norm": 39.90768814086914, "learning_rate": 9.768090872790792e-06, "loss": 0.1119, "num_input_tokens_seen": 44132832, "step": 20445 }, { "epoch": 3.7529821985685445, "grad_norm": 0.12891237437725067, "learning_rate": 9.767849768752175e-06, "loss": 0.0107, "num_input_tokens_seen": 44143680, "step": 20450 }, { "epoch": 3.753899798128097, "grad_norm": 0.16623926162719727, "learning_rate": 9.767608542425601e-06, "loss": 0.2118, "num_input_tokens_seen": 44153632, "step": 20455 }, { "epoch": 3.754817397687649, "grad_norm": 4.607326507568359, "learning_rate": 9.76736719381726e-06, "loss": 0.2556, "num_input_tokens_seen": 44164480, "step": 20460 }, { "epoch": 3.755734997247201, "grad_norm": 0.13362562656402588, "learning_rate": 9.767125722933335e-06, "loss": 0.143, "num_input_tokens_seen": 44175744, "step": 20465 }, { "epoch": 3.7566525968067537, "grad_norm": 7.537233829498291, "learning_rate": 9.766884129780024e-06, "loss": 0.084, "num_input_tokens_seen": 44186496, "step": 20470 }, { "epoch": 3.7575701963663057, "grad_norm": 11.699238777160645, "learning_rate": 9.766642414363524e-06, "loss": 0.2939, "num_input_tokens_seen": 44196576, "step": 20475 }, { "epoch": 3.758487795925858, "grad_norm": 0.4679311215877533, "learning_rate": 9.766400576690034e-06, "loss": 0.1423, "num_input_tokens_seen": 44207488, "step": 20480 }, { "epoch": 3.7594053954854103, "grad_norm": 6.234134674072266, "learning_rate": 9.766158616765756e-06, "loss": 0.4308, "num_input_tokens_seen": 44219104, "step": 20485 }, { "epoch": 3.7603229950449624, "grad_norm": 2.1888628005981445, "learning_rate": 9.765916534596897e-06, "loss": 0.3595, "num_input_tokens_seen": 44229920, "step": 20490 }, { "epoch": 3.7612405946045144, "grad_norm": 23.07931900024414, "learning_rate": 9.765674330189664e-06, "loss": 0.0984, "num_input_tokens_seen": 44241600, "step": 20495 }, { "epoch": 3.762158194164067, "grad_norm": 0.11750933527946472, "learning_rate": 9.765432003550273e-06, "loss": 0.1649, "num_input_tokens_seen": 44252256, "step": 20500 }, { "epoch": 3.763075793723619, "grad_norm": 0.8382021188735962, "learning_rate": 9.765189554684936e-06, "loss": 0.0486, "num_input_tokens_seen": 44263648, "step": 20505 }, { "epoch": 3.763993393283171, "grad_norm": 3.881781816482544, "learning_rate": 9.76494698359987e-06, "loss": 0.1145, "num_input_tokens_seen": 44273120, "step": 20510 }, { "epoch": 3.7649109928427236, "grad_norm": 0.9930246472358704, "learning_rate": 9.7647042903013e-06, "loss": 0.0733, "num_input_tokens_seen": 44284544, "step": 20515 }, { "epoch": 3.7658285924022756, "grad_norm": 29.213117599487305, "learning_rate": 9.76446147479545e-06, "loss": 0.0199, "num_input_tokens_seen": 44295456, "step": 20520 }, { "epoch": 3.7667461919618277, "grad_norm": 10.867839813232422, "learning_rate": 9.764218537088548e-06, "loss": 0.115, "num_input_tokens_seen": 44306176, "step": 20525 }, { "epoch": 3.76766379152138, "grad_norm": 0.3874683678150177, "learning_rate": 9.763975477186824e-06, "loss": 0.2127, "num_input_tokens_seen": 44316992, "step": 20530 }, { "epoch": 3.7685813910809323, "grad_norm": 0.31679657101631165, "learning_rate": 9.763732295096513e-06, "loss": 0.0045, "num_input_tokens_seen": 44328608, "step": 20535 }, { "epoch": 3.7694989906404843, "grad_norm": 0.1496211439371109, "learning_rate": 9.76348899082385e-06, "loss": 0.1115, "num_input_tokens_seen": 44338400, "step": 20540 }, { "epoch": 3.770416590200037, "grad_norm": 17.748401641845703, "learning_rate": 9.76324556437508e-06, "loss": 0.3447, "num_input_tokens_seen": 44347648, "step": 20545 }, { "epoch": 3.771334189759589, "grad_norm": 0.13210108876228333, "learning_rate": 9.763002015756443e-06, "loss": 0.1425, "num_input_tokens_seen": 44358976, "step": 20550 }, { "epoch": 3.7722517893191414, "grad_norm": 0.3533439040184021, "learning_rate": 9.762758344974184e-06, "loss": 0.132, "num_input_tokens_seen": 44370368, "step": 20555 }, { "epoch": 3.7731693888786935, "grad_norm": 1.682247519493103, "learning_rate": 9.762514552034557e-06, "loss": 0.1469, "num_input_tokens_seen": 44381536, "step": 20560 }, { "epoch": 3.7740869884382455, "grad_norm": 0.5438202023506165, "learning_rate": 9.762270636943812e-06, "loss": 0.2164, "num_input_tokens_seen": 44393024, "step": 20565 }, { "epoch": 3.775004587997798, "grad_norm": 0.08040689677000046, "learning_rate": 9.762026599708205e-06, "loss": 0.2031, "num_input_tokens_seen": 44404640, "step": 20570 }, { "epoch": 3.77592218755735, "grad_norm": 15.829058647155762, "learning_rate": 9.761782440333997e-06, "loss": 0.1458, "num_input_tokens_seen": 44415552, "step": 20575 }, { "epoch": 3.776839787116902, "grad_norm": 9.163994789123535, "learning_rate": 9.76153815882745e-06, "loss": 0.296, "num_input_tokens_seen": 44425888, "step": 20580 }, { "epoch": 3.7777573866764547, "grad_norm": 0.3157896399497986, "learning_rate": 9.76129375519483e-06, "loss": 0.3413, "num_input_tokens_seen": 44435840, "step": 20585 }, { "epoch": 3.7786749862360067, "grad_norm": 0.06844376027584076, "learning_rate": 9.761049229442404e-06, "loss": 0.0078, "num_input_tokens_seen": 44446560, "step": 20590 }, { "epoch": 3.779592585795559, "grad_norm": 8.194412231445312, "learning_rate": 9.760804581576443e-06, "loss": 0.0781, "num_input_tokens_seen": 44457248, "step": 20595 }, { "epoch": 3.7805101853551113, "grad_norm": 19.522733688354492, "learning_rate": 9.760559811603223e-06, "loss": 0.5395, "num_input_tokens_seen": 44468768, "step": 20600 }, { "epoch": 3.7814277849146634, "grad_norm": 4.668386936187744, "learning_rate": 9.760314919529024e-06, "loss": 0.1453, "num_input_tokens_seen": 44479968, "step": 20605 }, { "epoch": 3.7823453844742154, "grad_norm": 0.11508844792842865, "learning_rate": 9.760069905360124e-06, "loss": 0.174, "num_input_tokens_seen": 44490912, "step": 20610 }, { "epoch": 3.783262984033768, "grad_norm": 0.16699115931987762, "learning_rate": 9.759824769102807e-06, "loss": 0.0234, "num_input_tokens_seen": 44501792, "step": 20615 }, { "epoch": 3.78418058359332, "grad_norm": 0.18403637409210205, "learning_rate": 9.759579510763362e-06, "loss": 0.1295, "num_input_tokens_seen": 44512704, "step": 20620 }, { "epoch": 3.785098183152872, "grad_norm": 0.3384774327278137, "learning_rate": 9.759334130348082e-06, "loss": 0.0919, "num_input_tokens_seen": 44522752, "step": 20625 }, { "epoch": 3.7860157827124246, "grad_norm": 0.07888241112232208, "learning_rate": 9.759088627863255e-06, "loss": 0.2591, "num_input_tokens_seen": 44533440, "step": 20630 }, { "epoch": 3.7869333822719766, "grad_norm": 0.27126672863960266, "learning_rate": 9.758843003315182e-06, "loss": 0.2597, "num_input_tokens_seen": 44544608, "step": 20635 }, { "epoch": 3.7878509818315287, "grad_norm": 1.1096796989440918, "learning_rate": 9.75859725671016e-06, "loss": 0.1071, "num_input_tokens_seen": 44555648, "step": 20640 }, { "epoch": 3.788768581391081, "grad_norm": 12.584399223327637, "learning_rate": 9.758351388054496e-06, "loss": 0.2848, "num_input_tokens_seen": 44564640, "step": 20645 }, { "epoch": 3.7896861809506333, "grad_norm": 3.285414695739746, "learning_rate": 9.758105397354492e-06, "loss": 0.3939, "num_input_tokens_seen": 44574784, "step": 20650 }, { "epoch": 3.7906037805101853, "grad_norm": 0.5693961381912231, "learning_rate": 9.75785928461646e-06, "loss": 0.0117, "num_input_tokens_seen": 44584224, "step": 20655 }, { "epoch": 3.791521380069738, "grad_norm": 10.856274604797363, "learning_rate": 9.75761304984671e-06, "loss": 0.363, "num_input_tokens_seen": 44593728, "step": 20660 }, { "epoch": 3.79243897962929, "grad_norm": 0.13851244747638702, "learning_rate": 9.757366693051559e-06, "loss": 0.093, "num_input_tokens_seen": 44604768, "step": 20665 }, { "epoch": 3.793356579188842, "grad_norm": 17.600427627563477, "learning_rate": 9.757120214237326e-06, "loss": 0.1364, "num_input_tokens_seen": 44614784, "step": 20670 }, { "epoch": 3.7942741787483945, "grad_norm": 0.28285279870033264, "learning_rate": 9.756873613410333e-06, "loss": 0.2319, "num_input_tokens_seen": 44626464, "step": 20675 }, { "epoch": 3.7951917783079465, "grad_norm": 0.14801755547523499, "learning_rate": 9.756626890576904e-06, "loss": 0.1465, "num_input_tokens_seen": 44636704, "step": 20680 }, { "epoch": 3.7961093778674986, "grad_norm": 1.0155311822891235, "learning_rate": 9.756380045743368e-06, "loss": 0.0713, "num_input_tokens_seen": 44647648, "step": 20685 }, { "epoch": 3.797026977427051, "grad_norm": 0.07000736147165298, "learning_rate": 9.756133078916054e-06, "loss": 0.1541, "num_input_tokens_seen": 44658720, "step": 20690 }, { "epoch": 3.797944576986603, "grad_norm": 16.069303512573242, "learning_rate": 9.7558859901013e-06, "loss": 0.0186, "num_input_tokens_seen": 44669984, "step": 20695 }, { "epoch": 3.798862176546155, "grad_norm": 0.35073986649513245, "learning_rate": 9.755638779305439e-06, "loss": 0.0997, "num_input_tokens_seen": 44680064, "step": 20700 }, { "epoch": 3.7997797761057077, "grad_norm": 20.340354919433594, "learning_rate": 9.755391446534814e-06, "loss": 0.1448, "num_input_tokens_seen": 44691296, "step": 20705 }, { "epoch": 3.80069737566526, "grad_norm": 0.12838687002658844, "learning_rate": 9.75514399179577e-06, "loss": 0.0828, "num_input_tokens_seen": 44701792, "step": 20710 }, { "epoch": 3.801614975224812, "grad_norm": 1.821650743484497, "learning_rate": 9.754896415094651e-06, "loss": 0.1664, "num_input_tokens_seen": 44712896, "step": 20715 }, { "epoch": 3.8025325747843643, "grad_norm": 0.10153508186340332, "learning_rate": 9.754648716437808e-06, "loss": 0.183, "num_input_tokens_seen": 44724480, "step": 20720 }, { "epoch": 3.8034501743439164, "grad_norm": 0.12318148463964462, "learning_rate": 9.754400895831597e-06, "loss": 0.0161, "num_input_tokens_seen": 44736256, "step": 20725 }, { "epoch": 3.8043677739034685, "grad_norm": 22.05887794494629, "learning_rate": 9.754152953282369e-06, "loss": 0.062, "num_input_tokens_seen": 44746944, "step": 20730 }, { "epoch": 3.805285373463021, "grad_norm": 23.461769104003906, "learning_rate": 9.753904888796489e-06, "loss": 0.1226, "num_input_tokens_seen": 44758560, "step": 20735 }, { "epoch": 3.806202973022573, "grad_norm": 0.14400723576545715, "learning_rate": 9.753656702380314e-06, "loss": 0.1534, "num_input_tokens_seen": 44768160, "step": 20740 }, { "epoch": 3.807120572582125, "grad_norm": 35.11739730834961, "learning_rate": 9.753408394040214e-06, "loss": 0.4399, "num_input_tokens_seen": 44779136, "step": 20745 }, { "epoch": 3.8080381721416776, "grad_norm": 12.216094970703125, "learning_rate": 9.753159963782554e-06, "loss": 0.1725, "num_input_tokens_seen": 44791168, "step": 20750 }, { "epoch": 3.8089557717012297, "grad_norm": 0.017189564183354378, "learning_rate": 9.752911411613709e-06, "loss": 0.1152, "num_input_tokens_seen": 44802464, "step": 20755 }, { "epoch": 3.8098733712607817, "grad_norm": 0.6985018849372864, "learning_rate": 9.752662737540051e-06, "loss": 0.1976, "num_input_tokens_seen": 44813408, "step": 20760 }, { "epoch": 3.8107909708203342, "grad_norm": 21.825090408325195, "learning_rate": 9.75241394156796e-06, "loss": 0.1831, "num_input_tokens_seen": 44823264, "step": 20765 }, { "epoch": 3.8117085703798863, "grad_norm": 0.26561570167541504, "learning_rate": 9.75216502370382e-06, "loss": 0.0211, "num_input_tokens_seen": 44834272, "step": 20770 }, { "epoch": 3.8126261699394384, "grad_norm": 4.793585300445557, "learning_rate": 9.751915983954009e-06, "loss": 0.1441, "num_input_tokens_seen": 44845856, "step": 20775 }, { "epoch": 3.813543769498991, "grad_norm": 16.445018768310547, "learning_rate": 9.751666822324919e-06, "loss": 0.1908, "num_input_tokens_seen": 44855744, "step": 20780 }, { "epoch": 3.814461369058543, "grad_norm": 0.07467900961637497, "learning_rate": 9.751417538822938e-06, "loss": 0.007, "num_input_tokens_seen": 44867328, "step": 20785 }, { "epoch": 3.815378968618095, "grad_norm": 14.50687313079834, "learning_rate": 9.751168133454462e-06, "loss": 0.102, "num_input_tokens_seen": 44876736, "step": 20790 }, { "epoch": 3.8162965681776475, "grad_norm": 9.121020317077637, "learning_rate": 9.75091860622589e-06, "loss": 0.2487, "num_input_tokens_seen": 44887584, "step": 20795 }, { "epoch": 3.8172141677371996, "grad_norm": 0.3393010199069977, "learning_rate": 9.750668957143616e-06, "loss": 0.3156, "num_input_tokens_seen": 44897888, "step": 20800 }, { "epoch": 3.8181317672967516, "grad_norm": 0.6548700332641602, "learning_rate": 9.750419186214047e-06, "loss": 0.1088, "num_input_tokens_seen": 44909632, "step": 20805 }, { "epoch": 3.819049366856304, "grad_norm": 3.897843837738037, "learning_rate": 9.750169293443586e-06, "loss": 0.3072, "num_input_tokens_seen": 44920096, "step": 20810 }, { "epoch": 3.819966966415856, "grad_norm": 5.973634243011475, "learning_rate": 9.749919278838648e-06, "loss": 0.2035, "num_input_tokens_seen": 44931104, "step": 20815 }, { "epoch": 3.8208845659754083, "grad_norm": 0.8566745519638062, "learning_rate": 9.74966914240564e-06, "loss": 0.0124, "num_input_tokens_seen": 44942304, "step": 20820 }, { "epoch": 3.8218021655349608, "grad_norm": 0.21345433592796326, "learning_rate": 9.74941888415098e-06, "loss": 0.11, "num_input_tokens_seen": 44954304, "step": 20825 }, { "epoch": 3.822719765094513, "grad_norm": 10.208418846130371, "learning_rate": 9.749168504081088e-06, "loss": 0.224, "num_input_tokens_seen": 44964928, "step": 20830 }, { "epoch": 3.823637364654065, "grad_norm": 21.831703186035156, "learning_rate": 9.748918002202384e-06, "loss": 0.1076, "num_input_tokens_seen": 44975296, "step": 20835 }, { "epoch": 3.8245549642136174, "grad_norm": 5.567067623138428, "learning_rate": 9.748667378521292e-06, "loss": 0.1565, "num_input_tokens_seen": 44985664, "step": 20840 }, { "epoch": 3.8254725637731695, "grad_norm": 0.22127094864845276, "learning_rate": 9.748416633044242e-06, "loss": 0.194, "num_input_tokens_seen": 44995264, "step": 20845 }, { "epoch": 3.8263901633327215, "grad_norm": 22.279499053955078, "learning_rate": 9.748165765777666e-06, "loss": 0.0242, "num_input_tokens_seen": 45004736, "step": 20850 }, { "epoch": 3.827307762892274, "grad_norm": 0.30769407749176025, "learning_rate": 9.747914776727997e-06, "loss": 0.1734, "num_input_tokens_seen": 45014848, "step": 20855 }, { "epoch": 3.828225362451826, "grad_norm": 3.749335765838623, "learning_rate": 9.747663665901672e-06, "loss": 0.2864, "num_input_tokens_seen": 45025728, "step": 20860 }, { "epoch": 3.829142962011378, "grad_norm": 0.48268452286720276, "learning_rate": 9.747412433305132e-06, "loss": 0.2085, "num_input_tokens_seen": 45037408, "step": 20865 }, { "epoch": 3.8300605615709307, "grad_norm": 0.5134632587432861, "learning_rate": 9.747161078944821e-06, "loss": 0.124, "num_input_tokens_seen": 45049664, "step": 20870 }, { "epoch": 3.8309781611304827, "grad_norm": 7.2587103843688965, "learning_rate": 9.746909602827187e-06, "loss": 0.1783, "num_input_tokens_seen": 45059968, "step": 20875 }, { "epoch": 3.831895760690035, "grad_norm": 0.03784118592739105, "learning_rate": 9.746658004958676e-06, "loss": 0.1794, "num_input_tokens_seen": 45070816, "step": 20880 }, { "epoch": 3.8328133602495873, "grad_norm": 10.562713623046875, "learning_rate": 9.746406285345747e-06, "loss": 0.2217, "num_input_tokens_seen": 45081760, "step": 20885 }, { "epoch": 3.8337309598091394, "grad_norm": 0.20522597432136536, "learning_rate": 9.746154443994851e-06, "loss": 0.2856, "num_input_tokens_seen": 45092352, "step": 20890 }, { "epoch": 3.8346485593686914, "grad_norm": 0.16186366975307465, "learning_rate": 9.745902480912449e-06, "loss": 0.2722, "num_input_tokens_seen": 45101312, "step": 20895 }, { "epoch": 3.835566158928244, "grad_norm": 1.9480112791061401, "learning_rate": 9.745650396105004e-06, "loss": 0.1031, "num_input_tokens_seen": 45112160, "step": 20900 }, { "epoch": 3.836483758487796, "grad_norm": 0.5965113043785095, "learning_rate": 9.745398189578983e-06, "loss": 0.2929, "num_input_tokens_seen": 45122592, "step": 20905 }, { "epoch": 3.837401358047348, "grad_norm": 0.23352588713169098, "learning_rate": 9.745145861340852e-06, "loss": 0.1197, "num_input_tokens_seen": 45132736, "step": 20910 }, { "epoch": 3.8383189576069006, "grad_norm": 14.492959976196289, "learning_rate": 9.744893411397085e-06, "loss": 0.2438, "num_input_tokens_seen": 45144288, "step": 20915 }, { "epoch": 3.8392365571664526, "grad_norm": 5.23449182510376, "learning_rate": 9.744640839754154e-06, "loss": 0.1857, "num_input_tokens_seen": 45155328, "step": 20920 }, { "epoch": 3.8401541567260047, "grad_norm": 0.25270459055900574, "learning_rate": 9.74438814641854e-06, "loss": 0.1363, "num_input_tokens_seen": 45165760, "step": 20925 }, { "epoch": 3.841071756285557, "grad_norm": 0.9123604893684387, "learning_rate": 9.744135331396724e-06, "loss": 0.1211, "num_input_tokens_seen": 45177216, "step": 20930 }, { "epoch": 3.8419893558451093, "grad_norm": 16.326007843017578, "learning_rate": 9.743882394695187e-06, "loss": 0.2149, "num_input_tokens_seen": 45188512, "step": 20935 }, { "epoch": 3.8429069554046613, "grad_norm": 0.22316810488700867, "learning_rate": 9.743629336320422e-06, "loss": 0.3243, "num_input_tokens_seen": 45198528, "step": 20940 }, { "epoch": 3.843824554964214, "grad_norm": 13.221829414367676, "learning_rate": 9.743376156278915e-06, "loss": 0.1126, "num_input_tokens_seen": 45208352, "step": 20945 }, { "epoch": 3.844742154523766, "grad_norm": 0.24194318056106567, "learning_rate": 9.743122854577162e-06, "loss": 0.1171, "num_input_tokens_seen": 45218336, "step": 20950 }, { "epoch": 3.845659754083318, "grad_norm": 6.957243919372559, "learning_rate": 9.742869431221658e-06, "loss": 0.0111, "num_input_tokens_seen": 45227808, "step": 20955 }, { "epoch": 3.8465773536428705, "grad_norm": 43.21405792236328, "learning_rate": 9.742615886218905e-06, "loss": 0.1581, "num_input_tokens_seen": 45239584, "step": 20960 }, { "epoch": 3.8474949532024225, "grad_norm": 0.23817287385463715, "learning_rate": 9.742362219575403e-06, "loss": 0.4451, "num_input_tokens_seen": 45251264, "step": 20965 }, { "epoch": 3.8484125527619746, "grad_norm": 0.19816149771213531, "learning_rate": 9.742108431297662e-06, "loss": 0.0299, "num_input_tokens_seen": 45261696, "step": 20970 }, { "epoch": 3.849330152321527, "grad_norm": 7.7808074951171875, "learning_rate": 9.741854521392186e-06, "loss": 0.1805, "num_input_tokens_seen": 45272640, "step": 20975 }, { "epoch": 3.850247751881079, "grad_norm": 33.3726692199707, "learning_rate": 9.741600489865494e-06, "loss": 0.2449, "num_input_tokens_seen": 45282336, "step": 20980 }, { "epoch": 3.851165351440631, "grad_norm": 0.17213356494903564, "learning_rate": 9.741346336724098e-06, "loss": 0.0126, "num_input_tokens_seen": 45293056, "step": 20985 }, { "epoch": 3.8520829510001837, "grad_norm": 15.34333324432373, "learning_rate": 9.741092061974516e-06, "loss": 0.5558, "num_input_tokens_seen": 45303584, "step": 20990 }, { "epoch": 3.853000550559736, "grad_norm": 0.16334044933319092, "learning_rate": 9.74083766562327e-06, "loss": 0.1929, "num_input_tokens_seen": 45315008, "step": 20995 }, { "epoch": 3.853918150119288, "grad_norm": 9.119438171386719, "learning_rate": 9.740583147676887e-06, "loss": 0.1271, "num_input_tokens_seen": 45326592, "step": 21000 }, { "epoch": 3.8548357496788404, "grad_norm": 1.5528050661087036, "learning_rate": 9.740328508141894e-06, "loss": 0.1258, "num_input_tokens_seen": 45337728, "step": 21005 }, { "epoch": 3.8557533492383924, "grad_norm": 0.3521054685115814, "learning_rate": 9.74007374702482e-06, "loss": 0.0185, "num_input_tokens_seen": 45347456, "step": 21010 }, { "epoch": 3.8566709487979445, "grad_norm": 9.481961250305176, "learning_rate": 9.739818864332203e-06, "loss": 0.1244, "num_input_tokens_seen": 45358432, "step": 21015 }, { "epoch": 3.857588548357497, "grad_norm": 0.34389540553092957, "learning_rate": 9.739563860070576e-06, "loss": 0.2053, "num_input_tokens_seen": 45369120, "step": 21020 }, { "epoch": 3.858506147917049, "grad_norm": 0.09920582175254822, "learning_rate": 9.739308734246482e-06, "loss": 0.0734, "num_input_tokens_seen": 45379936, "step": 21025 }, { "epoch": 3.859423747476601, "grad_norm": 74.90845489501953, "learning_rate": 9.739053486866464e-06, "loss": 0.1932, "num_input_tokens_seen": 45390112, "step": 21030 }, { "epoch": 3.8603413470361536, "grad_norm": 39.0236930847168, "learning_rate": 9.73879811793707e-06, "loss": 0.1251, "num_input_tokens_seen": 45401152, "step": 21035 }, { "epoch": 3.8612589465957057, "grad_norm": 26.952558517456055, "learning_rate": 9.738542627464848e-06, "loss": 0.081, "num_input_tokens_seen": 45412384, "step": 21040 }, { "epoch": 3.8621765461552577, "grad_norm": 2.1258575916290283, "learning_rate": 9.738287015456351e-06, "loss": 0.0061, "num_input_tokens_seen": 45424608, "step": 21045 }, { "epoch": 3.8630941457148102, "grad_norm": 0.1720859855413437, "learning_rate": 9.738031281918137e-06, "loss": 0.0201, "num_input_tokens_seen": 45435648, "step": 21050 }, { "epoch": 3.8640117452743623, "grad_norm": 0.08051160722970963, "learning_rate": 9.737775426856763e-06, "loss": 0.015, "num_input_tokens_seen": 45447008, "step": 21055 }, { "epoch": 3.8649293448339144, "grad_norm": 27.595487594604492, "learning_rate": 9.737519450278795e-06, "loss": 0.4806, "num_input_tokens_seen": 45457888, "step": 21060 }, { "epoch": 3.865846944393467, "grad_norm": 0.3581274449825287, "learning_rate": 9.737263352190792e-06, "loss": 0.0085, "num_input_tokens_seen": 45467328, "step": 21065 }, { "epoch": 3.866764543953019, "grad_norm": 11.308333396911621, "learning_rate": 9.737007132599326e-06, "loss": 0.2822, "num_input_tokens_seen": 45478176, "step": 21070 }, { "epoch": 3.867682143512571, "grad_norm": 10.001386642456055, "learning_rate": 9.736750791510972e-06, "loss": 0.1617, "num_input_tokens_seen": 45489376, "step": 21075 }, { "epoch": 3.8685997430721235, "grad_norm": 5.392536163330078, "learning_rate": 9.736494328932298e-06, "loss": 0.0049, "num_input_tokens_seen": 45500096, "step": 21080 }, { "epoch": 3.8695173426316756, "grad_norm": 0.7487553954124451, "learning_rate": 9.736237744869887e-06, "loss": 0.4973, "num_input_tokens_seen": 45510848, "step": 21085 }, { "epoch": 3.8704349421912276, "grad_norm": 20.489065170288086, "learning_rate": 9.735981039330319e-06, "loss": 0.0345, "num_input_tokens_seen": 45522080, "step": 21090 }, { "epoch": 3.87135254175078, "grad_norm": 8.693989753723145, "learning_rate": 9.735724212320177e-06, "loss": 0.2906, "num_input_tokens_seen": 45533376, "step": 21095 }, { "epoch": 3.872270141310332, "grad_norm": 0.2525765001773834, "learning_rate": 9.735467263846048e-06, "loss": 0.0055, "num_input_tokens_seen": 45543904, "step": 21100 }, { "epoch": 3.8731877408698843, "grad_norm": 0.03896785154938698, "learning_rate": 9.735210193914524e-06, "loss": 0.0385, "num_input_tokens_seen": 45554752, "step": 21105 }, { "epoch": 3.8741053404294368, "grad_norm": 1.20057213306427, "learning_rate": 9.734953002532195e-06, "loss": 0.2623, "num_input_tokens_seen": 45562976, "step": 21110 }, { "epoch": 3.875022939988989, "grad_norm": 9.962821960449219, "learning_rate": 9.734695689705664e-06, "loss": 0.2491, "num_input_tokens_seen": 45573536, "step": 21115 }, { "epoch": 3.875940539548541, "grad_norm": 23.75237274169922, "learning_rate": 9.734438255441523e-06, "loss": 0.2941, "num_input_tokens_seen": 45584672, "step": 21120 }, { "epoch": 3.8768581391080934, "grad_norm": 0.49176260828971863, "learning_rate": 9.73418069974638e-06, "loss": 0.4587, "num_input_tokens_seen": 45596128, "step": 21125 }, { "epoch": 3.8777757386676455, "grad_norm": 0.09479757398366928, "learning_rate": 9.73392302262684e-06, "loss": 0.0369, "num_input_tokens_seen": 45606752, "step": 21130 }, { "epoch": 3.8786933382271975, "grad_norm": 0.18284042179584503, "learning_rate": 9.73366522408951e-06, "loss": 0.095, "num_input_tokens_seen": 45617696, "step": 21135 }, { "epoch": 3.87961093778675, "grad_norm": 0.2789718806743622, "learning_rate": 9.733407304141005e-06, "loss": 0.0998, "num_input_tokens_seen": 45628576, "step": 21140 }, { "epoch": 3.880528537346302, "grad_norm": 10.097197532653809, "learning_rate": 9.733149262787937e-06, "loss": 0.4392, "num_input_tokens_seen": 45639200, "step": 21145 }, { "epoch": 3.881446136905854, "grad_norm": 11.698262214660645, "learning_rate": 9.732891100036927e-06, "loss": 0.2875, "num_input_tokens_seen": 45649312, "step": 21150 }, { "epoch": 3.8823637364654067, "grad_norm": 22.061691284179688, "learning_rate": 9.732632815894596e-06, "loss": 0.3086, "num_input_tokens_seen": 45660256, "step": 21155 }, { "epoch": 3.8832813360249587, "grad_norm": 9.841263771057129, "learning_rate": 9.732374410367569e-06, "loss": 0.1172, "num_input_tokens_seen": 45671136, "step": 21160 }, { "epoch": 3.884198935584511, "grad_norm": 28.7949161529541, "learning_rate": 9.732115883462471e-06, "loss": 0.3852, "num_input_tokens_seen": 45681792, "step": 21165 }, { "epoch": 3.8851165351440633, "grad_norm": 1.973951816558838, "learning_rate": 9.731857235185935e-06, "loss": 0.2536, "num_input_tokens_seen": 45693440, "step": 21170 }, { "epoch": 3.8860341347036154, "grad_norm": 6.761317729949951, "learning_rate": 9.731598465544596e-06, "loss": 0.2867, "num_input_tokens_seen": 45703104, "step": 21175 }, { "epoch": 3.8869517342631674, "grad_norm": 10.125208854675293, "learning_rate": 9.731339574545089e-06, "loss": 0.0967, "num_input_tokens_seen": 45713312, "step": 21180 }, { "epoch": 3.88786933382272, "grad_norm": 17.631284713745117, "learning_rate": 9.731080562194056e-06, "loss": 0.1473, "num_input_tokens_seen": 45724800, "step": 21185 }, { "epoch": 3.888786933382272, "grad_norm": 4.345939636230469, "learning_rate": 9.730821428498136e-06, "loss": 0.1327, "num_input_tokens_seen": 45735872, "step": 21190 }, { "epoch": 3.889704532941824, "grad_norm": 1.0697451829910278, "learning_rate": 9.73056217346398e-06, "loss": 0.107, "num_input_tokens_seen": 45747232, "step": 21195 }, { "epoch": 3.8906221325013766, "grad_norm": 0.3401890993118286, "learning_rate": 9.730302797098237e-06, "loss": 0.2924, "num_input_tokens_seen": 45757472, "step": 21200 }, { "epoch": 3.8915397320609286, "grad_norm": 22.944849014282227, "learning_rate": 9.730043299407557e-06, "loss": 0.2595, "num_input_tokens_seen": 45767392, "step": 21205 }, { "epoch": 3.8924573316204807, "grad_norm": 4.368509769439697, "learning_rate": 9.7297836803986e-06, "loss": 0.0885, "num_input_tokens_seen": 45778784, "step": 21210 }, { "epoch": 3.893374931180033, "grad_norm": 0.9448306560516357, "learning_rate": 9.729523940078019e-06, "loss": 0.0104, "num_input_tokens_seen": 45789152, "step": 21215 }, { "epoch": 3.8942925307395853, "grad_norm": 2.845777750015259, "learning_rate": 9.72926407845248e-06, "loss": 0.0852, "num_input_tokens_seen": 45798592, "step": 21220 }, { "epoch": 3.8952101302991373, "grad_norm": 64.11561584472656, "learning_rate": 9.729004095528647e-06, "loss": 0.1981, "num_input_tokens_seen": 45810336, "step": 21225 }, { "epoch": 3.89612772985869, "grad_norm": 0.13938689231872559, "learning_rate": 9.728743991313187e-06, "loss": 0.0052, "num_input_tokens_seen": 45819744, "step": 21230 }, { "epoch": 3.897045329418242, "grad_norm": 0.36720791459083557, "learning_rate": 9.728483765812774e-06, "loss": 0.1164, "num_input_tokens_seen": 45830976, "step": 21235 }, { "epoch": 3.897962928977794, "grad_norm": 0.8115993142127991, "learning_rate": 9.728223419034081e-06, "loss": 0.155, "num_input_tokens_seen": 45841152, "step": 21240 }, { "epoch": 3.8988805285373465, "grad_norm": 0.14531119167804718, "learning_rate": 9.727962950983787e-06, "loss": 0.2225, "num_input_tokens_seen": 45851392, "step": 21245 }, { "epoch": 3.8997981280968985, "grad_norm": 29.577669143676758, "learning_rate": 9.727702361668568e-06, "loss": 0.4307, "num_input_tokens_seen": 45862752, "step": 21250 }, { "epoch": 3.9007157276564506, "grad_norm": 13.371681213378906, "learning_rate": 9.727441651095112e-06, "loss": 0.3544, "num_input_tokens_seen": 45874080, "step": 21255 }, { "epoch": 3.901633327216003, "grad_norm": 15.651227951049805, "learning_rate": 9.727180819270105e-06, "loss": 0.1289, "num_input_tokens_seen": 45885920, "step": 21260 }, { "epoch": 3.902550926775555, "grad_norm": 31.243274688720703, "learning_rate": 9.726919866200236e-06, "loss": 0.2184, "num_input_tokens_seen": 45894368, "step": 21265 }, { "epoch": 3.903468526335107, "grad_norm": 1.735417127609253, "learning_rate": 9.726658791892198e-06, "loss": 0.0239, "num_input_tokens_seen": 45906080, "step": 21270 }, { "epoch": 3.9043861258946597, "grad_norm": 0.16708500683307648, "learning_rate": 9.726397596352688e-06, "loss": 0.0838, "num_input_tokens_seen": 45916576, "step": 21275 }, { "epoch": 3.905303725454212, "grad_norm": 27.58372688293457, "learning_rate": 9.726136279588405e-06, "loss": 0.3361, "num_input_tokens_seen": 45927680, "step": 21280 }, { "epoch": 3.906221325013764, "grad_norm": 40.751094818115234, "learning_rate": 9.725874841606051e-06, "loss": 0.1471, "num_input_tokens_seen": 45938848, "step": 21285 }, { "epoch": 3.9071389245733164, "grad_norm": 0.1003899872303009, "learning_rate": 9.725613282412332e-06, "loss": 0.0266, "num_input_tokens_seen": 45949344, "step": 21290 }, { "epoch": 3.9080565241328684, "grad_norm": 37.72734832763672, "learning_rate": 9.725351602013957e-06, "loss": 0.046, "num_input_tokens_seen": 45959232, "step": 21295 }, { "epoch": 3.9089741236924205, "grad_norm": 0.15040719509124756, "learning_rate": 9.725089800417635e-06, "loss": 0.1246, "num_input_tokens_seen": 45970528, "step": 21300 }, { "epoch": 3.909891723251973, "grad_norm": 24.43030548095703, "learning_rate": 9.724827877630086e-06, "loss": 0.0707, "num_input_tokens_seen": 45981184, "step": 21305 }, { "epoch": 3.910809322811525, "grad_norm": 0.21089020371437073, "learning_rate": 9.724565833658022e-06, "loss": 0.3697, "num_input_tokens_seen": 45992352, "step": 21310 }, { "epoch": 3.911726922371077, "grad_norm": 5.100823402404785, "learning_rate": 9.724303668508168e-06, "loss": 0.0192, "num_input_tokens_seen": 46004000, "step": 21315 }, { "epoch": 3.9126445219306296, "grad_norm": 0.06368843466043472, "learning_rate": 9.724041382187247e-06, "loss": 0.2185, "num_input_tokens_seen": 46014624, "step": 21320 }, { "epoch": 3.9135621214901817, "grad_norm": 7.389667987823486, "learning_rate": 9.723778974701985e-06, "loss": 0.1671, "num_input_tokens_seen": 46025184, "step": 21325 }, { "epoch": 3.9144797210497337, "grad_norm": 0.20845995843410492, "learning_rate": 9.723516446059115e-06, "loss": 0.2085, "num_input_tokens_seen": 46035232, "step": 21330 }, { "epoch": 3.9153973206092862, "grad_norm": 32.46462631225586, "learning_rate": 9.723253796265369e-06, "loss": 0.2951, "num_input_tokens_seen": 46046720, "step": 21335 }, { "epoch": 3.9163149201688383, "grad_norm": 0.44122982025146484, "learning_rate": 9.722991025327481e-06, "loss": 0.2551, "num_input_tokens_seen": 46057856, "step": 21340 }, { "epoch": 3.9172325197283904, "grad_norm": 6.463986396789551, "learning_rate": 9.722728133252195e-06, "loss": 0.2397, "num_input_tokens_seen": 46068000, "step": 21345 }, { "epoch": 3.918150119287943, "grad_norm": 18.992748260498047, "learning_rate": 9.722465120046252e-06, "loss": 0.2865, "num_input_tokens_seen": 46079040, "step": 21350 }, { "epoch": 3.919067718847495, "grad_norm": 0.06772197782993317, "learning_rate": 9.722201985716397e-06, "loss": 0.0061, "num_input_tokens_seen": 46090048, "step": 21355 }, { "epoch": 3.919985318407047, "grad_norm": 0.21227329969406128, "learning_rate": 9.72193873026938e-06, "loss": 0.2743, "num_input_tokens_seen": 46101312, "step": 21360 }, { "epoch": 3.9209029179665995, "grad_norm": 11.851228713989258, "learning_rate": 9.721675353711955e-06, "loss": 0.2345, "num_input_tokens_seen": 46111712, "step": 21365 }, { "epoch": 3.9218205175261516, "grad_norm": 16.43223762512207, "learning_rate": 9.721411856050873e-06, "loss": 0.2556, "num_input_tokens_seen": 46122144, "step": 21370 }, { "epoch": 3.9227381170857036, "grad_norm": 0.1325100064277649, "learning_rate": 9.721148237292896e-06, "loss": 0.0074, "num_input_tokens_seen": 46131776, "step": 21375 }, { "epoch": 3.923655716645256, "grad_norm": 23.502134323120117, "learning_rate": 9.720884497444782e-06, "loss": 0.428, "num_input_tokens_seen": 46143264, "step": 21380 }, { "epoch": 3.924573316204808, "grad_norm": 0.04570214822888374, "learning_rate": 9.720620636513299e-06, "loss": 0.113, "num_input_tokens_seen": 46153856, "step": 21385 }, { "epoch": 3.9254909157643603, "grad_norm": 0.2511567175388336, "learning_rate": 9.720356654505212e-06, "loss": 0.2967, "num_input_tokens_seen": 46164032, "step": 21390 }, { "epoch": 3.9264085153239128, "grad_norm": 13.696093559265137, "learning_rate": 9.720092551427292e-06, "loss": 0.1345, "num_input_tokens_seen": 46174336, "step": 21395 }, { "epoch": 3.927326114883465, "grad_norm": 6.139646053314209, "learning_rate": 9.719828327286314e-06, "loss": 0.1434, "num_input_tokens_seen": 46184864, "step": 21400 }, { "epoch": 3.928243714443017, "grad_norm": 5.905651092529297, "learning_rate": 9.719563982089055e-06, "loss": 0.1198, "num_input_tokens_seen": 46195008, "step": 21405 }, { "epoch": 3.9291613140025694, "grad_norm": 0.12693819403648376, "learning_rate": 9.719299515842295e-06, "loss": 0.018, "num_input_tokens_seen": 46206848, "step": 21410 }, { "epoch": 3.9300789135621215, "grad_norm": 18.787626266479492, "learning_rate": 9.719034928552815e-06, "loss": 0.2801, "num_input_tokens_seen": 46216832, "step": 21415 }, { "epoch": 3.9309965131216735, "grad_norm": 0.6117498874664307, "learning_rate": 9.718770220227405e-06, "loss": 0.0092, "num_input_tokens_seen": 46227392, "step": 21420 }, { "epoch": 3.931914112681226, "grad_norm": 1.8509893417358398, "learning_rate": 9.71850539087285e-06, "loss": 0.0228, "num_input_tokens_seen": 46237696, "step": 21425 }, { "epoch": 3.932831712240778, "grad_norm": 27.77765655517578, "learning_rate": 9.718240440495946e-06, "loss": 0.0946, "num_input_tokens_seen": 46248800, "step": 21430 }, { "epoch": 3.93374931180033, "grad_norm": 0.04805593565106392, "learning_rate": 9.717975369103488e-06, "loss": 0.0979, "num_input_tokens_seen": 46259520, "step": 21435 }, { "epoch": 3.9346669113598827, "grad_norm": 0.038654327392578125, "learning_rate": 9.717710176702273e-06, "loss": 0.2069, "num_input_tokens_seen": 46270336, "step": 21440 }, { "epoch": 3.9355845109194347, "grad_norm": 0.22040769457817078, "learning_rate": 9.717444863299104e-06, "loss": 0.3031, "num_input_tokens_seen": 46281792, "step": 21445 }, { "epoch": 3.936502110478987, "grad_norm": 38.38492965698242, "learning_rate": 9.717179428900784e-06, "loss": 0.5647, "num_input_tokens_seen": 46291936, "step": 21450 }, { "epoch": 3.9374197100385393, "grad_norm": 0.33587881922721863, "learning_rate": 9.716913873514127e-06, "loss": 0.0796, "num_input_tokens_seen": 46303072, "step": 21455 }, { "epoch": 3.9383373095980914, "grad_norm": 0.11130905151367188, "learning_rate": 9.716648197145937e-06, "loss": 0.1776, "num_input_tokens_seen": 46313120, "step": 21460 }, { "epoch": 3.9392549091576434, "grad_norm": 12.030526161193848, "learning_rate": 9.71638239980303e-06, "loss": 0.122, "num_input_tokens_seen": 46324480, "step": 21465 }, { "epoch": 3.940172508717196, "grad_norm": 4.767214775085449, "learning_rate": 9.716116481492225e-06, "loss": 0.1333, "num_input_tokens_seen": 46336000, "step": 21470 }, { "epoch": 3.941090108276748, "grad_norm": 27.764144897460938, "learning_rate": 9.71585044222034e-06, "loss": 0.2742, "num_input_tokens_seen": 46346976, "step": 21475 }, { "epoch": 3.9420077078363, "grad_norm": 0.24732144176959991, "learning_rate": 9.715584281994202e-06, "loss": 0.0044, "num_input_tokens_seen": 46358656, "step": 21480 }, { "epoch": 3.9429253073958526, "grad_norm": 0.559569776058197, "learning_rate": 9.715318000820635e-06, "loss": 0.0658, "num_input_tokens_seen": 46370560, "step": 21485 }, { "epoch": 3.9438429069554046, "grad_norm": 0.8811321258544922, "learning_rate": 9.71505159870647e-06, "loss": 0.1506, "num_input_tokens_seen": 46381760, "step": 21490 }, { "epoch": 3.9447605065149567, "grad_norm": 0.06273913383483887, "learning_rate": 9.71478507565854e-06, "loss": 0.1752, "num_input_tokens_seen": 46392480, "step": 21495 }, { "epoch": 3.945678106074509, "grad_norm": 0.17006522417068481, "learning_rate": 9.714518431683678e-06, "loss": 0.0214, "num_input_tokens_seen": 46402624, "step": 21500 }, { "epoch": 3.9465957056340613, "grad_norm": 24.265350341796875, "learning_rate": 9.714251666788726e-06, "loss": 0.1758, "num_input_tokens_seen": 46413504, "step": 21505 }, { "epoch": 3.9475133051936133, "grad_norm": 2.2568259239196777, "learning_rate": 9.713984780980525e-06, "loss": 0.1696, "num_input_tokens_seen": 46423808, "step": 21510 }, { "epoch": 3.948430904753166, "grad_norm": 0.3668844997882843, "learning_rate": 9.71371777426592e-06, "loss": 0.1168, "num_input_tokens_seen": 46433696, "step": 21515 }, { "epoch": 3.949348504312718, "grad_norm": 0.10964721441268921, "learning_rate": 9.713450646651762e-06, "loss": 0.1757, "num_input_tokens_seen": 46444384, "step": 21520 }, { "epoch": 3.95026610387227, "grad_norm": 35.74907302856445, "learning_rate": 9.713183398144898e-06, "loss": 0.2486, "num_input_tokens_seen": 46455776, "step": 21525 }, { "epoch": 3.9511837034318225, "grad_norm": 1.1295782327651978, "learning_rate": 9.712916028752185e-06, "loss": 0.0376, "num_input_tokens_seen": 46466240, "step": 21530 }, { "epoch": 3.9521013029913745, "grad_norm": 0.16950677335262299, "learning_rate": 9.71264853848048e-06, "loss": 0.0098, "num_input_tokens_seen": 46476416, "step": 21535 }, { "epoch": 3.9530189025509266, "grad_norm": 10.692164421081543, "learning_rate": 9.712380927336645e-06, "loss": 0.1214, "num_input_tokens_seen": 46486912, "step": 21540 }, { "epoch": 3.953936502110479, "grad_norm": 0.14062882959842682, "learning_rate": 9.712113195327541e-06, "loss": 0.094, "num_input_tokens_seen": 46496928, "step": 21545 }, { "epoch": 3.954854101670031, "grad_norm": 0.0680442526936531, "learning_rate": 9.711845342460037e-06, "loss": 0.1228, "num_input_tokens_seen": 46508576, "step": 21550 }, { "epoch": 3.955771701229583, "grad_norm": 3.4860424995422363, "learning_rate": 9.711577368741003e-06, "loss": 0.0627, "num_input_tokens_seen": 46519392, "step": 21555 }, { "epoch": 3.9566893007891357, "grad_norm": 0.038877107203006744, "learning_rate": 9.711309274177312e-06, "loss": 0.2257, "num_input_tokens_seen": 46529888, "step": 21560 }, { "epoch": 3.957606900348688, "grad_norm": 0.07732003927230835, "learning_rate": 9.71104105877584e-06, "loss": 0.0448, "num_input_tokens_seen": 46541056, "step": 21565 }, { "epoch": 3.95852449990824, "grad_norm": 0.10336467623710632, "learning_rate": 9.710772722543467e-06, "loss": 0.0074, "num_input_tokens_seen": 46551840, "step": 21570 }, { "epoch": 3.9594420994677924, "grad_norm": 0.1131608635187149, "learning_rate": 9.710504265487074e-06, "loss": 0.0705, "num_input_tokens_seen": 46562208, "step": 21575 }, { "epoch": 3.9603596990273444, "grad_norm": 16.52979278564453, "learning_rate": 9.710235687613545e-06, "loss": 0.3971, "num_input_tokens_seen": 46572544, "step": 21580 }, { "epoch": 3.9612772985868965, "grad_norm": 3.0925214290618896, "learning_rate": 9.709966988929774e-06, "loss": 0.2299, "num_input_tokens_seen": 46583488, "step": 21585 }, { "epoch": 3.962194898146449, "grad_norm": 17.183862686157227, "learning_rate": 9.709698169442647e-06, "loss": 0.1317, "num_input_tokens_seen": 46595072, "step": 21590 }, { "epoch": 3.963112497706001, "grad_norm": 0.055722154676914215, "learning_rate": 9.709429229159065e-06, "loss": 0.1308, "num_input_tokens_seen": 46606144, "step": 21595 }, { "epoch": 3.964030097265553, "grad_norm": 19.01856231689453, "learning_rate": 9.709160168085918e-06, "loss": 0.1972, "num_input_tokens_seen": 46616672, "step": 21600 }, { "epoch": 3.9649476968251056, "grad_norm": 11.247922897338867, "learning_rate": 9.708890986230114e-06, "loss": 0.2912, "num_input_tokens_seen": 46628192, "step": 21605 }, { "epoch": 3.9658652963846577, "grad_norm": 2.4865355491638184, "learning_rate": 9.708621683598553e-06, "loss": 0.1047, "num_input_tokens_seen": 46639104, "step": 21610 }, { "epoch": 3.9667828959442097, "grad_norm": 16.485620498657227, "learning_rate": 9.708352260198144e-06, "loss": 0.1466, "num_input_tokens_seen": 46650560, "step": 21615 }, { "epoch": 3.9677004955037622, "grad_norm": 3.187626361846924, "learning_rate": 9.708082716035799e-06, "loss": 0.1847, "num_input_tokens_seen": 46661312, "step": 21620 }, { "epoch": 3.9686180950633143, "grad_norm": 0.5412716269493103, "learning_rate": 9.707813051118426e-06, "loss": 0.1577, "num_input_tokens_seen": 46673024, "step": 21625 }, { "epoch": 3.9695356946228664, "grad_norm": 0.5408676266670227, "learning_rate": 9.707543265452945e-06, "loss": 0.2808, "num_input_tokens_seen": 46682912, "step": 21630 }, { "epoch": 3.970453294182419, "grad_norm": 26.870210647583008, "learning_rate": 9.707273359046276e-06, "loss": 0.1414, "num_input_tokens_seen": 46692032, "step": 21635 }, { "epoch": 3.971370893741971, "grad_norm": 6.141726493835449, "learning_rate": 9.707003331905341e-06, "loss": 0.3294, "num_input_tokens_seen": 46702752, "step": 21640 }, { "epoch": 3.972288493301523, "grad_norm": 11.276972770690918, "learning_rate": 9.706733184037066e-06, "loss": 0.1361, "num_input_tokens_seen": 46713312, "step": 21645 }, { "epoch": 3.9732060928610755, "grad_norm": 0.040712181478738785, "learning_rate": 9.70646291544838e-06, "loss": 0.1134, "num_input_tokens_seen": 46724064, "step": 21650 }, { "epoch": 3.9741236924206276, "grad_norm": 19.116281509399414, "learning_rate": 9.706192526146213e-06, "loss": 0.2456, "num_input_tokens_seen": 46735840, "step": 21655 }, { "epoch": 3.9750412919801796, "grad_norm": 39.06148910522461, "learning_rate": 9.705922016137502e-06, "loss": 0.067, "num_input_tokens_seen": 46746624, "step": 21660 }, { "epoch": 3.975958891539732, "grad_norm": 34.937835693359375, "learning_rate": 9.705651385429185e-06, "loss": 0.3302, "num_input_tokens_seen": 46756160, "step": 21665 }, { "epoch": 3.976876491099284, "grad_norm": 8.005719184875488, "learning_rate": 9.705380634028204e-06, "loss": 0.0216, "num_input_tokens_seen": 46766656, "step": 21670 }, { "epoch": 3.9777940906588363, "grad_norm": 0.21304510533809662, "learning_rate": 9.705109761941502e-06, "loss": 0.2335, "num_input_tokens_seen": 46776544, "step": 21675 }, { "epoch": 3.9787116902183888, "grad_norm": 0.18873320519924164, "learning_rate": 9.704838769176026e-06, "loss": 0.3519, "num_input_tokens_seen": 46787712, "step": 21680 }, { "epoch": 3.979629289777941, "grad_norm": 0.0762953907251358, "learning_rate": 9.704567655738728e-06, "loss": 0.0557, "num_input_tokens_seen": 46798624, "step": 21685 }, { "epoch": 3.980546889337493, "grad_norm": 0.16954253613948822, "learning_rate": 9.704296421636562e-06, "loss": 0.081, "num_input_tokens_seen": 46810208, "step": 21690 }, { "epoch": 3.9814644888970454, "grad_norm": 0.23854108154773712, "learning_rate": 9.704025066876484e-06, "loss": 0.1091, "num_input_tokens_seen": 46820608, "step": 21695 }, { "epoch": 3.9823820884565975, "grad_norm": 13.941696166992188, "learning_rate": 9.703753591465451e-06, "loss": 0.0115, "num_input_tokens_seen": 46831200, "step": 21700 }, { "epoch": 3.9832996880161495, "grad_norm": 6.120765209197998, "learning_rate": 9.70348199541043e-06, "loss": 0.1668, "num_input_tokens_seen": 46840352, "step": 21705 }, { "epoch": 3.984217287575702, "grad_norm": 0.05542254447937012, "learning_rate": 9.703210278718386e-06, "loss": 0.2005, "num_input_tokens_seen": 46850592, "step": 21710 }, { "epoch": 3.985134887135254, "grad_norm": 26.372360229492188, "learning_rate": 9.702938441396288e-06, "loss": 0.1254, "num_input_tokens_seen": 46862080, "step": 21715 }, { "epoch": 3.986052486694806, "grad_norm": 12.461454391479492, "learning_rate": 9.702666483451107e-06, "loss": 0.2389, "num_input_tokens_seen": 46874304, "step": 21720 }, { "epoch": 3.9869700862543587, "grad_norm": 0.5529577136039734, "learning_rate": 9.702394404889818e-06, "loss": 0.0064, "num_input_tokens_seen": 46885056, "step": 21725 }, { "epoch": 3.9878876858139107, "grad_norm": 0.038443829864263535, "learning_rate": 9.702122205719402e-06, "loss": 0.1429, "num_input_tokens_seen": 46896096, "step": 21730 }, { "epoch": 3.988805285373463, "grad_norm": 0.0990476980805397, "learning_rate": 9.701849885946838e-06, "loss": 0.3527, "num_input_tokens_seen": 46907264, "step": 21735 }, { "epoch": 3.9897228849330153, "grad_norm": 0.10354126989841461, "learning_rate": 9.701577445579113e-06, "loss": 0.0735, "num_input_tokens_seen": 46918400, "step": 21740 }, { "epoch": 3.9906404844925674, "grad_norm": 7.424066066741943, "learning_rate": 9.701304884623213e-06, "loss": 0.0923, "num_input_tokens_seen": 46930496, "step": 21745 }, { "epoch": 3.9915580840521194, "grad_norm": 2.580033779144287, "learning_rate": 9.701032203086129e-06, "loss": 0.2793, "num_input_tokens_seen": 46941664, "step": 21750 }, { "epoch": 3.992475683611672, "grad_norm": 7.515063762664795, "learning_rate": 9.700759400974855e-06, "loss": 0.1229, "num_input_tokens_seen": 46952288, "step": 21755 }, { "epoch": 3.993393283171224, "grad_norm": 0.13151735067367554, "learning_rate": 9.70048647829639e-06, "loss": 0.1013, "num_input_tokens_seen": 46963616, "step": 21760 }, { "epoch": 3.994310882730776, "grad_norm": 0.11653253436088562, "learning_rate": 9.700213435057727e-06, "loss": 0.1109, "num_input_tokens_seen": 46974080, "step": 21765 }, { "epoch": 3.9952284822903286, "grad_norm": 9.856169700622559, "learning_rate": 9.699940271265877e-06, "loss": 0.536, "num_input_tokens_seen": 46984128, "step": 21770 }, { "epoch": 3.9961460818498806, "grad_norm": 0.15283353626728058, "learning_rate": 9.699666986927843e-06, "loss": 0.0562, "num_input_tokens_seen": 46994592, "step": 21775 }, { "epoch": 3.9970636814094327, "grad_norm": 4.287156105041504, "learning_rate": 9.699393582050636e-06, "loss": 0.1313, "num_input_tokens_seen": 47006240, "step": 21780 }, { "epoch": 3.997981280968985, "grad_norm": 3.0121405124664307, "learning_rate": 9.699120056641264e-06, "loss": 0.2101, "num_input_tokens_seen": 47016896, "step": 21785 }, { "epoch": 3.9988988805285373, "grad_norm": 33.74163055419922, "learning_rate": 9.698846410706749e-06, "loss": 0.1228, "num_input_tokens_seen": 47028096, "step": 21790 }, { "epoch": 3.9998164800880893, "grad_norm": 0.3090672492980957, "learning_rate": 9.698572644254102e-06, "loss": 0.0961, "num_input_tokens_seen": 47039712, "step": 21795 }, { "epoch": 4.0, "eval_loss": 0.18936754763126373, "eval_runtime": 178.7026, "eval_samples_per_second": 30.492, "eval_steps_per_second": 7.627, "num_input_tokens_seen": 47040736, "step": 21796 }, { "epoch": 4.000734079647642, "grad_norm": 0.06087575852870941, "learning_rate": 9.698298757290351e-06, "loss": 0.0146, "num_input_tokens_seen": 47048992, "step": 21800 }, { "epoch": 4.001651679207194, "grad_norm": 0.08344823867082596, "learning_rate": 9.698024749822522e-06, "loss": 0.1738, "num_input_tokens_seen": 47061984, "step": 21805 }, { "epoch": 4.002569278766746, "grad_norm": 32.81877899169922, "learning_rate": 9.697750621857634e-06, "loss": 0.1122, "num_input_tokens_seen": 47073184, "step": 21810 }, { "epoch": 4.0034868783262985, "grad_norm": 6.135368824005127, "learning_rate": 9.697476373402726e-06, "loss": 0.4001, "num_input_tokens_seen": 47083232, "step": 21815 }, { "epoch": 4.004404477885851, "grad_norm": 0.17718274891376495, "learning_rate": 9.697202004464829e-06, "loss": 0.1586, "num_input_tokens_seen": 47093920, "step": 21820 }, { "epoch": 4.005322077445403, "grad_norm": 0.08766060322523117, "learning_rate": 9.69692751505098e-06, "loss": 0.2359, "num_input_tokens_seen": 47105280, "step": 21825 }, { "epoch": 4.006239677004955, "grad_norm": 31.732803344726562, "learning_rate": 9.696652905168222e-06, "loss": 0.115, "num_input_tokens_seen": 47116576, "step": 21830 }, { "epoch": 4.007157276564508, "grad_norm": 0.04084112122654915, "learning_rate": 9.696378174823593e-06, "loss": 0.0884, "num_input_tokens_seen": 47127232, "step": 21835 }, { "epoch": 4.008074876124059, "grad_norm": 0.12330394238233566, "learning_rate": 9.696103324024145e-06, "loss": 0.0897, "num_input_tokens_seen": 47140192, "step": 21840 }, { "epoch": 4.008992475683612, "grad_norm": 0.05030069500207901, "learning_rate": 9.695828352776923e-06, "loss": 0.1738, "num_input_tokens_seen": 47151264, "step": 21845 }, { "epoch": 4.009910075243164, "grad_norm": 0.06235123053193092, "learning_rate": 9.695553261088984e-06, "loss": 0.2322, "num_input_tokens_seen": 47160832, "step": 21850 }, { "epoch": 4.010827674802716, "grad_norm": 4.22477912902832, "learning_rate": 9.69527804896738e-06, "loss": 0.2145, "num_input_tokens_seen": 47171776, "step": 21855 }, { "epoch": 4.011745274362268, "grad_norm": 0.32051628828048706, "learning_rate": 9.69500271641917e-06, "loss": 0.1475, "num_input_tokens_seen": 47181600, "step": 21860 }, { "epoch": 4.012662873921821, "grad_norm": 0.141384094953537, "learning_rate": 9.694727263451419e-06, "loss": 0.2219, "num_input_tokens_seen": 47190912, "step": 21865 }, { "epoch": 4.0135804734813725, "grad_norm": 10.309810638427734, "learning_rate": 9.694451690071189e-06, "loss": 0.0906, "num_input_tokens_seen": 47202816, "step": 21870 }, { "epoch": 4.014498073040925, "grad_norm": 48.38840103149414, "learning_rate": 9.69417599628555e-06, "loss": 0.2439, "num_input_tokens_seen": 47214528, "step": 21875 }, { "epoch": 4.0154156726004775, "grad_norm": 2.3585903644561768, "learning_rate": 9.693900182101569e-06, "loss": 0.1263, "num_input_tokens_seen": 47225728, "step": 21880 }, { "epoch": 4.016333272160029, "grad_norm": 0.47874918580055237, "learning_rate": 9.693624247526326e-06, "loss": 0.0067, "num_input_tokens_seen": 47236608, "step": 21885 }, { "epoch": 4.017250871719582, "grad_norm": 0.34994837641716003, "learning_rate": 9.693348192566893e-06, "loss": 0.0377, "num_input_tokens_seen": 47248160, "step": 21890 }, { "epoch": 4.018168471279134, "grad_norm": 22.122102737426758, "learning_rate": 9.693072017230355e-06, "loss": 0.1944, "num_input_tokens_seen": 47258400, "step": 21895 }, { "epoch": 4.019086070838686, "grad_norm": 3.4098541736602783, "learning_rate": 9.692795721523794e-06, "loss": 0.0568, "num_input_tokens_seen": 47269568, "step": 21900 }, { "epoch": 4.020003670398238, "grad_norm": 21.277511596679688, "learning_rate": 9.692519305454293e-06, "loss": 0.0635, "num_input_tokens_seen": 47279904, "step": 21905 }, { "epoch": 4.020921269957791, "grad_norm": 0.018552320078015327, "learning_rate": 9.692242769028946e-06, "loss": 0.1609, "num_input_tokens_seen": 47289728, "step": 21910 }, { "epoch": 4.021838869517342, "grad_norm": 0.08487089723348618, "learning_rate": 9.691966112254846e-06, "loss": 0.1055, "num_input_tokens_seen": 47300544, "step": 21915 }, { "epoch": 4.022756469076895, "grad_norm": 0.0673113688826561, "learning_rate": 9.691689335139084e-06, "loss": 0.0019, "num_input_tokens_seen": 47310912, "step": 21920 }, { "epoch": 4.023674068636447, "grad_norm": 0.5264179706573486, "learning_rate": 9.691412437688764e-06, "loss": 0.0205, "num_input_tokens_seen": 47321120, "step": 21925 }, { "epoch": 4.024591668195999, "grad_norm": 0.21362680196762085, "learning_rate": 9.691135419910987e-06, "loss": 0.0496, "num_input_tokens_seen": 47331424, "step": 21930 }, { "epoch": 4.0255092677555515, "grad_norm": 0.043630450963974, "learning_rate": 9.690858281812853e-06, "loss": 0.4246, "num_input_tokens_seen": 47342176, "step": 21935 }, { "epoch": 4.026426867315104, "grad_norm": 0.06275320798158646, "learning_rate": 9.690581023401479e-06, "loss": 0.1244, "num_input_tokens_seen": 47354176, "step": 21940 }, { "epoch": 4.027344466874656, "grad_norm": 23.951993942260742, "learning_rate": 9.69030364468397e-06, "loss": 0.4186, "num_input_tokens_seen": 47365632, "step": 21945 }, { "epoch": 4.028262066434208, "grad_norm": 64.9967041015625, "learning_rate": 9.69002614566744e-06, "loss": 0.1809, "num_input_tokens_seen": 47377216, "step": 21950 }, { "epoch": 4.029179665993761, "grad_norm": 0.1523493528366089, "learning_rate": 9.68974852635901e-06, "loss": 0.1127, "num_input_tokens_seen": 47387584, "step": 21955 }, { "epoch": 4.030097265553312, "grad_norm": 16.119735717773438, "learning_rate": 9.689470786765798e-06, "loss": 0.0206, "num_input_tokens_seen": 47398432, "step": 21960 }, { "epoch": 4.031014865112865, "grad_norm": 0.05065983906388283, "learning_rate": 9.689192926894929e-06, "loss": 0.1718, "num_input_tokens_seen": 47409696, "step": 21965 }, { "epoch": 4.031932464672417, "grad_norm": 0.059487234801054, "learning_rate": 9.688914946753528e-06, "loss": 0.0082, "num_input_tokens_seen": 47420416, "step": 21970 }, { "epoch": 4.032850064231969, "grad_norm": 0.03295964002609253, "learning_rate": 9.688636846348727e-06, "loss": 0.3381, "num_input_tokens_seen": 47430464, "step": 21975 }, { "epoch": 4.033767663791521, "grad_norm": 7.34779691696167, "learning_rate": 9.688358625687657e-06, "loss": 0.0739, "num_input_tokens_seen": 47442176, "step": 21980 }, { "epoch": 4.034685263351074, "grad_norm": 0.21408820152282715, "learning_rate": 9.688080284777454e-06, "loss": 0.126, "num_input_tokens_seen": 47453088, "step": 21985 }, { "epoch": 4.0356028629106255, "grad_norm": 2.438886880874634, "learning_rate": 9.687801823625258e-06, "loss": 0.1345, "num_input_tokens_seen": 47463136, "step": 21990 }, { "epoch": 4.036520462470178, "grad_norm": 2.0447757244110107, "learning_rate": 9.687523242238212e-06, "loss": 0.3307, "num_input_tokens_seen": 47474752, "step": 21995 }, { "epoch": 4.0374380620297305, "grad_norm": 13.662235260009766, "learning_rate": 9.687244540623459e-06, "loss": 0.385, "num_input_tokens_seen": 47485184, "step": 22000 }, { "epoch": 4.038355661589282, "grad_norm": 5.041102886199951, "learning_rate": 9.686965718788146e-06, "loss": 0.1215, "num_input_tokens_seen": 47495296, "step": 22005 }, { "epoch": 4.039273261148835, "grad_norm": 26.69053840637207, "learning_rate": 9.68668677673943e-06, "loss": 0.2703, "num_input_tokens_seen": 47506144, "step": 22010 }, { "epoch": 4.040190860708387, "grad_norm": 3.8810954093933105, "learning_rate": 9.68640771448446e-06, "loss": 0.0996, "num_input_tokens_seen": 47517216, "step": 22015 }, { "epoch": 4.041108460267939, "grad_norm": 0.6624907851219177, "learning_rate": 9.686128532030395e-06, "loss": 0.2322, "num_input_tokens_seen": 47526848, "step": 22020 }, { "epoch": 4.042026059827491, "grad_norm": 2.4181268215179443, "learning_rate": 9.685849229384397e-06, "loss": 0.113, "num_input_tokens_seen": 47537184, "step": 22025 }, { "epoch": 4.042943659387044, "grad_norm": 11.144287109375, "learning_rate": 9.685569806553627e-06, "loss": 0.2943, "num_input_tokens_seen": 47547744, "step": 22030 }, { "epoch": 4.043861258946595, "grad_norm": 0.24486836791038513, "learning_rate": 9.685290263545255e-06, "loss": 0.1493, "num_input_tokens_seen": 47558496, "step": 22035 }, { "epoch": 4.044778858506148, "grad_norm": 0.5262001752853394, "learning_rate": 9.685010600366448e-06, "loss": 0.0126, "num_input_tokens_seen": 47569472, "step": 22040 }, { "epoch": 4.0456964580657, "grad_norm": 85.99349975585938, "learning_rate": 9.684730817024382e-06, "loss": 0.0398, "num_input_tokens_seen": 47580256, "step": 22045 }, { "epoch": 4.046614057625252, "grad_norm": 0.3804488778114319, "learning_rate": 9.68445091352623e-06, "loss": 0.1457, "num_input_tokens_seen": 47590496, "step": 22050 }, { "epoch": 4.047531657184805, "grad_norm": 0.09610088914632797, "learning_rate": 9.684170889879171e-06, "loss": 0.03, "num_input_tokens_seen": 47601792, "step": 22055 }, { "epoch": 4.048449256744357, "grad_norm": 0.054921433329582214, "learning_rate": 9.683890746090393e-06, "loss": 0.1797, "num_input_tokens_seen": 47611968, "step": 22060 }, { "epoch": 4.049366856303909, "grad_norm": 4.907333850860596, "learning_rate": 9.683610482167072e-06, "loss": 0.0529, "num_input_tokens_seen": 47623488, "step": 22065 }, { "epoch": 4.050284455863461, "grad_norm": 0.07807104289531708, "learning_rate": 9.683330098116403e-06, "loss": 0.1716, "num_input_tokens_seen": 47633952, "step": 22070 }, { "epoch": 4.051202055423014, "grad_norm": 12.151208877563477, "learning_rate": 9.683049593945575e-06, "loss": 0.1408, "num_input_tokens_seen": 47644896, "step": 22075 }, { "epoch": 4.052119654982565, "grad_norm": 0.1916130632162094, "learning_rate": 9.682768969661784e-06, "loss": 0.0229, "num_input_tokens_seen": 47655552, "step": 22080 }, { "epoch": 4.053037254542118, "grad_norm": 0.47914764285087585, "learning_rate": 9.682488225272227e-06, "loss": 0.0023, "num_input_tokens_seen": 47667264, "step": 22085 }, { "epoch": 4.05395485410167, "grad_norm": 0.12773454189300537, "learning_rate": 9.682207360784102e-06, "loss": 0.1047, "num_input_tokens_seen": 47679584, "step": 22090 }, { "epoch": 4.054872453661222, "grad_norm": 71.49625396728516, "learning_rate": 9.681926376204616e-06, "loss": 0.1874, "num_input_tokens_seen": 47689984, "step": 22095 }, { "epoch": 4.0557900532207745, "grad_norm": 0.26141372323036194, "learning_rate": 9.681645271540976e-06, "loss": 0.1844, "num_input_tokens_seen": 47700480, "step": 22100 }, { "epoch": 4.056707652780327, "grad_norm": 0.017264094203710556, "learning_rate": 9.681364046800388e-06, "loss": 0.1177, "num_input_tokens_seen": 47711296, "step": 22105 }, { "epoch": 4.057625252339879, "grad_norm": 0.04644821584224701, "learning_rate": 9.68108270199007e-06, "loss": 0.0013, "num_input_tokens_seen": 47722272, "step": 22110 }, { "epoch": 4.058542851899431, "grad_norm": 0.075698122382164, "learning_rate": 9.680801237117234e-06, "loss": 0.1367, "num_input_tokens_seen": 47732544, "step": 22115 }, { "epoch": 4.059460451458984, "grad_norm": 0.02677568979561329, "learning_rate": 9.680519652189101e-06, "loss": 0.0196, "num_input_tokens_seen": 47744224, "step": 22120 }, { "epoch": 4.060378051018535, "grad_norm": 13.396858215332031, "learning_rate": 9.680237947212896e-06, "loss": 0.2777, "num_input_tokens_seen": 47755328, "step": 22125 }, { "epoch": 4.061295650578088, "grad_norm": 2.8288114070892334, "learning_rate": 9.67995612219584e-06, "loss": 0.1962, "num_input_tokens_seen": 47766816, "step": 22130 }, { "epoch": 4.06221325013764, "grad_norm": 2.1557564735412598, "learning_rate": 9.67967417714516e-06, "loss": 0.151, "num_input_tokens_seen": 47777696, "step": 22135 }, { "epoch": 4.063130849697192, "grad_norm": 17.229259490966797, "learning_rate": 9.679392112068094e-06, "loss": 0.2779, "num_input_tokens_seen": 47788896, "step": 22140 }, { "epoch": 4.064048449256744, "grad_norm": 0.1831100881099701, "learning_rate": 9.67910992697187e-06, "loss": 0.2317, "num_input_tokens_seen": 47800128, "step": 22145 }, { "epoch": 4.064966048816297, "grad_norm": 0.05580033361911774, "learning_rate": 9.67882762186373e-06, "loss": 0.3006, "num_input_tokens_seen": 47809632, "step": 22150 }, { "epoch": 4.0658836483758485, "grad_norm": 20.707908630371094, "learning_rate": 9.67854519675091e-06, "loss": 0.3444, "num_input_tokens_seen": 47820704, "step": 22155 }, { "epoch": 4.066801247935401, "grad_norm": 6.067226409912109, "learning_rate": 9.67826265164066e-06, "loss": 0.2288, "num_input_tokens_seen": 47831488, "step": 22160 }, { "epoch": 4.0677188474949535, "grad_norm": 2.7994720935821533, "learning_rate": 9.677979986540223e-06, "loss": 0.0944, "num_input_tokens_seen": 47841824, "step": 22165 }, { "epoch": 4.068636447054505, "grad_norm": 0.4112352132797241, "learning_rate": 9.677697201456848e-06, "loss": 0.1007, "num_input_tokens_seen": 47853824, "step": 22170 }, { "epoch": 4.069554046614058, "grad_norm": 5.299017906188965, "learning_rate": 9.67741429639779e-06, "loss": 0.0203, "num_input_tokens_seen": 47863680, "step": 22175 }, { "epoch": 4.07047164617361, "grad_norm": 0.14772729575634003, "learning_rate": 9.677131271370307e-06, "loss": 0.1557, "num_input_tokens_seen": 47875456, "step": 22180 }, { "epoch": 4.071389245733162, "grad_norm": 14.056199073791504, "learning_rate": 9.676848126381654e-06, "loss": 0.1885, "num_input_tokens_seen": 47886272, "step": 22185 }, { "epoch": 4.072306845292714, "grad_norm": 0.03865863382816315, "learning_rate": 9.676564861439095e-06, "loss": 0.0049, "num_input_tokens_seen": 47895392, "step": 22190 }, { "epoch": 4.073224444852267, "grad_norm": 5.187193870544434, "learning_rate": 9.676281476549896e-06, "loss": 0.2733, "num_input_tokens_seen": 47905792, "step": 22195 }, { "epoch": 4.074142044411818, "grad_norm": 92.66426849365234, "learning_rate": 9.675997971721325e-06, "loss": 0.1514, "num_input_tokens_seen": 47915232, "step": 22200 }, { "epoch": 4.075059643971371, "grad_norm": 0.11311151832342148, "learning_rate": 9.675714346960651e-06, "loss": 0.0779, "num_input_tokens_seen": 47926080, "step": 22205 }, { "epoch": 4.075977243530923, "grad_norm": 0.08799917995929718, "learning_rate": 9.675430602275153e-06, "loss": 0.1039, "num_input_tokens_seen": 47937408, "step": 22210 }, { "epoch": 4.076894843090475, "grad_norm": 0.08311108499765396, "learning_rate": 9.675146737672106e-06, "loss": 0.0457, "num_input_tokens_seen": 47947200, "step": 22215 }, { "epoch": 4.0778124426500275, "grad_norm": 0.06035497784614563, "learning_rate": 9.674862753158788e-06, "loss": 0.112, "num_input_tokens_seen": 47958208, "step": 22220 }, { "epoch": 4.07873004220958, "grad_norm": 0.03722492232918739, "learning_rate": 9.67457864874249e-06, "loss": 0.0037, "num_input_tokens_seen": 47969248, "step": 22225 }, { "epoch": 4.079647641769132, "grad_norm": 0.06141573563218117, "learning_rate": 9.674294424430493e-06, "loss": 0.1134, "num_input_tokens_seen": 47979616, "step": 22230 }, { "epoch": 4.080565241328684, "grad_norm": 7.187922477722168, "learning_rate": 9.674010080230087e-06, "loss": 0.1541, "num_input_tokens_seen": 47989824, "step": 22235 }, { "epoch": 4.081482840888237, "grad_norm": 0.46026885509490967, "learning_rate": 9.673725616148568e-06, "loss": 0.1409, "num_input_tokens_seen": 48001408, "step": 22240 }, { "epoch": 4.082400440447788, "grad_norm": 0.1011955738067627, "learning_rate": 9.673441032193232e-06, "loss": 0.0568, "num_input_tokens_seen": 48011328, "step": 22245 }, { "epoch": 4.083318040007341, "grad_norm": 10.002695083618164, "learning_rate": 9.673156328371374e-06, "loss": 0.3657, "num_input_tokens_seen": 48022592, "step": 22250 }, { "epoch": 4.084235639566893, "grad_norm": 0.05042067915201187, "learning_rate": 9.6728715046903e-06, "loss": 0.0045, "num_input_tokens_seen": 48032000, "step": 22255 }, { "epoch": 4.085153239126445, "grad_norm": 0.04887860640883446, "learning_rate": 9.672586561157313e-06, "loss": 0.1134, "num_input_tokens_seen": 48042784, "step": 22260 }, { "epoch": 4.086070838685997, "grad_norm": 5.204895973205566, "learning_rate": 9.672301497779725e-06, "loss": 0.1314, "num_input_tokens_seen": 48054144, "step": 22265 }, { "epoch": 4.08698843824555, "grad_norm": 0.10685569792985916, "learning_rate": 9.672016314564843e-06, "loss": 0.0033, "num_input_tokens_seen": 48064192, "step": 22270 }, { "epoch": 4.0879060378051015, "grad_norm": 0.1830292046070099, "learning_rate": 9.671731011519984e-06, "loss": 0.1733, "num_input_tokens_seen": 48075072, "step": 22275 }, { "epoch": 4.088823637364654, "grad_norm": 0.06652442365884781, "learning_rate": 9.671445588652465e-06, "loss": 0.0044, "num_input_tokens_seen": 48086272, "step": 22280 }, { "epoch": 4.0897412369242065, "grad_norm": 0.0945502296090126, "learning_rate": 9.671160045969607e-06, "loss": 0.1193, "num_input_tokens_seen": 48097536, "step": 22285 }, { "epoch": 4.090658836483758, "grad_norm": 0.06903061270713806, "learning_rate": 9.670874383478734e-06, "loss": 0.1625, "num_input_tokens_seen": 48109440, "step": 22290 }, { "epoch": 4.091576436043311, "grad_norm": 0.23244868218898773, "learning_rate": 9.670588601187171e-06, "loss": 0.004, "num_input_tokens_seen": 48121024, "step": 22295 }, { "epoch": 4.092494035602863, "grad_norm": 2.9851150512695312, "learning_rate": 9.670302699102251e-06, "loss": 0.1893, "num_input_tokens_seen": 48131584, "step": 22300 }, { "epoch": 4.093411635162415, "grad_norm": 24.294809341430664, "learning_rate": 9.670016677231304e-06, "loss": 0.0319, "num_input_tokens_seen": 48142048, "step": 22305 }, { "epoch": 4.094329234721967, "grad_norm": 49.48405838012695, "learning_rate": 9.669730535581667e-06, "loss": 0.2439, "num_input_tokens_seen": 48153312, "step": 22310 }, { "epoch": 4.09524683428152, "grad_norm": 0.5984136462211609, "learning_rate": 9.66944427416068e-06, "loss": 0.1483, "num_input_tokens_seen": 48164960, "step": 22315 }, { "epoch": 4.096164433841071, "grad_norm": 0.06546302884817123, "learning_rate": 9.669157892975684e-06, "loss": 0.1579, "num_input_tokens_seen": 48176512, "step": 22320 }, { "epoch": 4.097082033400624, "grad_norm": 3.512115001678467, "learning_rate": 9.668871392034023e-06, "loss": 0.0453, "num_input_tokens_seen": 48185888, "step": 22325 }, { "epoch": 4.097999632960176, "grad_norm": 1.3796216249465942, "learning_rate": 9.668584771343047e-06, "loss": 0.2631, "num_input_tokens_seen": 48196640, "step": 22330 }, { "epoch": 4.098917232519728, "grad_norm": 78.00120544433594, "learning_rate": 9.66829803091011e-06, "loss": 0.2876, "num_input_tokens_seen": 48207936, "step": 22335 }, { "epoch": 4.099834832079281, "grad_norm": 0.04622770473361015, "learning_rate": 9.668011170742562e-06, "loss": 0.1403, "num_input_tokens_seen": 48218656, "step": 22340 }, { "epoch": 4.100752431638833, "grad_norm": 25.57807731628418, "learning_rate": 9.667724190847763e-06, "loss": 0.2719, "num_input_tokens_seen": 48229568, "step": 22345 }, { "epoch": 4.101670031198385, "grad_norm": 21.033098220825195, "learning_rate": 9.667437091233071e-06, "loss": 0.2683, "num_input_tokens_seen": 48240256, "step": 22350 }, { "epoch": 4.102587630757937, "grad_norm": 1.9776077270507812, "learning_rate": 9.667149871905853e-06, "loss": 0.1832, "num_input_tokens_seen": 48250720, "step": 22355 }, { "epoch": 4.10350523031749, "grad_norm": 0.025135604664683342, "learning_rate": 9.666862532873474e-06, "loss": 0.0047, "num_input_tokens_seen": 48261824, "step": 22360 }, { "epoch": 4.104422829877041, "grad_norm": 0.12578245997428894, "learning_rate": 9.666575074143303e-06, "loss": 0.2173, "num_input_tokens_seen": 48272224, "step": 22365 }, { "epoch": 4.105340429436594, "grad_norm": 12.131465911865234, "learning_rate": 9.666287495722714e-06, "loss": 0.3092, "num_input_tokens_seen": 48282912, "step": 22370 }, { "epoch": 4.106258028996146, "grad_norm": 0.049908775836229324, "learning_rate": 9.665999797619086e-06, "loss": 0.1248, "num_input_tokens_seen": 48293344, "step": 22375 }, { "epoch": 4.107175628555698, "grad_norm": 5.409110069274902, "learning_rate": 9.665711979839792e-06, "loss": 0.1671, "num_input_tokens_seen": 48304544, "step": 22380 }, { "epoch": 4.1080932281152505, "grad_norm": 28.39167022705078, "learning_rate": 9.665424042392216e-06, "loss": 0.3147, "num_input_tokens_seen": 48314464, "step": 22385 }, { "epoch": 4.109010827674803, "grad_norm": 0.2050730586051941, "learning_rate": 9.665135985283746e-06, "loss": 0.0082, "num_input_tokens_seen": 48325312, "step": 22390 }, { "epoch": 4.109928427234355, "grad_norm": 5.884538173675537, "learning_rate": 9.664847808521767e-06, "loss": 0.0909, "num_input_tokens_seen": 48336320, "step": 22395 }, { "epoch": 4.110846026793907, "grad_norm": 0.0569303072988987, "learning_rate": 9.664559512113672e-06, "loss": 0.0102, "num_input_tokens_seen": 48346624, "step": 22400 }, { "epoch": 4.11176362635346, "grad_norm": 0.0626533254981041, "learning_rate": 9.664271096066856e-06, "loss": 0.003, "num_input_tokens_seen": 48358368, "step": 22405 }, { "epoch": 4.112681225913011, "grad_norm": 10.523097038269043, "learning_rate": 9.663982560388714e-06, "loss": 0.2017, "num_input_tokens_seen": 48369952, "step": 22410 }, { "epoch": 4.113598825472564, "grad_norm": 12.344098091125488, "learning_rate": 9.663693905086649e-06, "loss": 0.0653, "num_input_tokens_seen": 48380800, "step": 22415 }, { "epoch": 4.114516425032116, "grad_norm": 0.04176614433526993, "learning_rate": 9.663405130168063e-06, "loss": 0.2246, "num_input_tokens_seen": 48391552, "step": 22420 }, { "epoch": 4.115434024591668, "grad_norm": 17.67206573486328, "learning_rate": 9.663116235640362e-06, "loss": 0.3288, "num_input_tokens_seen": 48403424, "step": 22425 }, { "epoch": 4.11635162415122, "grad_norm": 0.2089739441871643, "learning_rate": 9.662827221510958e-06, "loss": 0.1385, "num_input_tokens_seen": 48415552, "step": 22430 }, { "epoch": 4.117269223710773, "grad_norm": 0.13629640638828278, "learning_rate": 9.66253808778726e-06, "loss": 0.1526, "num_input_tokens_seen": 48427648, "step": 22435 }, { "epoch": 4.1181868232703245, "grad_norm": 1.0265028476715088, "learning_rate": 9.66224883447669e-06, "loss": 0.0301, "num_input_tokens_seen": 48438464, "step": 22440 }, { "epoch": 4.119104422829877, "grad_norm": 68.05955505371094, "learning_rate": 9.661959461586662e-06, "loss": 0.4538, "num_input_tokens_seen": 48447840, "step": 22445 }, { "epoch": 4.1200220223894295, "grad_norm": 0.21934950351715088, "learning_rate": 9.6616699691246e-06, "loss": 0.0853, "num_input_tokens_seen": 48458880, "step": 22450 }, { "epoch": 4.120939621948981, "grad_norm": 7.100584030151367, "learning_rate": 9.661380357097924e-06, "loss": 0.3851, "num_input_tokens_seen": 48469184, "step": 22455 }, { "epoch": 4.121857221508534, "grad_norm": 6.398625373840332, "learning_rate": 9.661090625514071e-06, "loss": 0.2602, "num_input_tokens_seen": 48480256, "step": 22460 }, { "epoch": 4.122774821068086, "grad_norm": 6.038959980010986, "learning_rate": 9.660800774380466e-06, "loss": 0.0991, "num_input_tokens_seen": 48490880, "step": 22465 }, { "epoch": 4.123692420627638, "grad_norm": 20.857467651367188, "learning_rate": 9.660510803704543e-06, "loss": 0.2898, "num_input_tokens_seen": 48500928, "step": 22470 }, { "epoch": 4.12461002018719, "grad_norm": 11.225513458251953, "learning_rate": 9.660220713493743e-06, "loss": 0.1835, "num_input_tokens_seen": 48512160, "step": 22475 }, { "epoch": 4.125527619746743, "grad_norm": 0.06490039825439453, "learning_rate": 9.659930503755504e-06, "loss": 0.0271, "num_input_tokens_seen": 48523328, "step": 22480 }, { "epoch": 4.126445219306294, "grad_norm": 11.342082023620605, "learning_rate": 9.65964017449727e-06, "loss": 0.2804, "num_input_tokens_seen": 48534592, "step": 22485 }, { "epoch": 4.127362818865847, "grad_norm": 6.309587001800537, "learning_rate": 9.659349725726487e-06, "loss": 0.0494, "num_input_tokens_seen": 48544448, "step": 22490 }, { "epoch": 4.128280418425399, "grad_norm": 19.704458236694336, "learning_rate": 9.659059157450606e-06, "loss": 0.1508, "num_input_tokens_seen": 48555488, "step": 22495 }, { "epoch": 4.129198017984951, "grad_norm": 24.4770450592041, "learning_rate": 9.658768469677076e-06, "loss": 0.2684, "num_input_tokens_seen": 48567360, "step": 22500 }, { "epoch": 4.1301156175445035, "grad_norm": 18.057157516479492, "learning_rate": 9.658477662413358e-06, "loss": 0.2515, "num_input_tokens_seen": 48578432, "step": 22505 }, { "epoch": 4.131033217104056, "grad_norm": 0.07657789438962936, "learning_rate": 9.658186735666905e-06, "loss": 0.1255, "num_input_tokens_seen": 48590976, "step": 22510 }, { "epoch": 4.131950816663608, "grad_norm": 4.185677528381348, "learning_rate": 9.657895689445186e-06, "loss": 0.0842, "num_input_tokens_seen": 48602144, "step": 22515 }, { "epoch": 4.13286841622316, "grad_norm": 0.24896828830242157, "learning_rate": 9.657604523755657e-06, "loss": 0.164, "num_input_tokens_seen": 48611008, "step": 22520 }, { "epoch": 4.133786015782713, "grad_norm": 0.5038557052612305, "learning_rate": 9.657313238605792e-06, "loss": 0.0077, "num_input_tokens_seen": 48621280, "step": 22525 }, { "epoch": 4.134703615342264, "grad_norm": 9.503588676452637, "learning_rate": 9.657021834003061e-06, "loss": 0.1175, "num_input_tokens_seen": 48632448, "step": 22530 }, { "epoch": 4.135621214901817, "grad_norm": 2.6173183917999268, "learning_rate": 9.656730309954938e-06, "loss": 0.0457, "num_input_tokens_seen": 48642464, "step": 22535 }, { "epoch": 4.136538814461369, "grad_norm": 5.91951847076416, "learning_rate": 9.6564386664689e-06, "loss": 0.2318, "num_input_tokens_seen": 48653440, "step": 22540 }, { "epoch": 4.137456414020921, "grad_norm": 0.3959018290042877, "learning_rate": 9.656146903552427e-06, "loss": 0.3549, "num_input_tokens_seen": 48663968, "step": 22545 }, { "epoch": 4.138374013580473, "grad_norm": 0.3258296847343445, "learning_rate": 9.655855021213002e-06, "loss": 0.3019, "num_input_tokens_seen": 48674304, "step": 22550 }, { "epoch": 4.139291613140026, "grad_norm": 0.10214348137378693, "learning_rate": 9.655563019458112e-06, "loss": 0.0075, "num_input_tokens_seen": 48684736, "step": 22555 }, { "epoch": 4.1402092126995775, "grad_norm": 0.4981134831905365, "learning_rate": 9.655270898295246e-06, "loss": 0.1939, "num_input_tokens_seen": 48696032, "step": 22560 }, { "epoch": 4.14112681225913, "grad_norm": 2.7541394233703613, "learning_rate": 9.654978657731895e-06, "loss": 0.1235, "num_input_tokens_seen": 48706528, "step": 22565 }, { "epoch": 4.1420444118186825, "grad_norm": 0.4562009871006012, "learning_rate": 9.654686297775557e-06, "loss": 0.1808, "num_input_tokens_seen": 48717536, "step": 22570 }, { "epoch": 4.142962011378234, "grad_norm": 0.26238492131233215, "learning_rate": 9.65439381843373e-06, "loss": 0.069, "num_input_tokens_seen": 48727424, "step": 22575 }, { "epoch": 4.143879610937787, "grad_norm": 5.4328293800354, "learning_rate": 9.654101219713915e-06, "loss": 0.3343, "num_input_tokens_seen": 48737152, "step": 22580 }, { "epoch": 4.144797210497339, "grad_norm": 34.97937774658203, "learning_rate": 9.653808501623617e-06, "loss": 0.2062, "num_input_tokens_seen": 48749024, "step": 22585 }, { "epoch": 4.145714810056891, "grad_norm": 0.47394096851348877, "learning_rate": 9.653515664170343e-06, "loss": 0.1099, "num_input_tokens_seen": 48759904, "step": 22590 }, { "epoch": 4.146632409616443, "grad_norm": 0.9183849096298218, "learning_rate": 9.653222707361605e-06, "loss": 0.0265, "num_input_tokens_seen": 48772000, "step": 22595 }, { "epoch": 4.147550009175996, "grad_norm": 0.6533794403076172, "learning_rate": 9.652929631204917e-06, "loss": 0.2353, "num_input_tokens_seen": 48784096, "step": 22600 }, { "epoch": 4.148467608735547, "grad_norm": 18.43858528137207, "learning_rate": 9.652636435707793e-06, "loss": 0.1011, "num_input_tokens_seen": 48794464, "step": 22605 }, { "epoch": 4.1493852082951, "grad_norm": 0.19048535823822021, "learning_rate": 9.652343120877758e-06, "loss": 0.0286, "num_input_tokens_seen": 48804928, "step": 22610 }, { "epoch": 4.150302807854652, "grad_norm": 14.293072700500488, "learning_rate": 9.652049686722332e-06, "loss": 0.2203, "num_input_tokens_seen": 48814944, "step": 22615 }, { "epoch": 4.151220407414204, "grad_norm": 0.9252825379371643, "learning_rate": 9.651756133249041e-06, "loss": 0.3202, "num_input_tokens_seen": 48826688, "step": 22620 }, { "epoch": 4.152138006973757, "grad_norm": 21.888797760009766, "learning_rate": 9.651462460465415e-06, "loss": 0.1641, "num_input_tokens_seen": 48837120, "step": 22625 }, { "epoch": 4.153055606533309, "grad_norm": 19.710811614990234, "learning_rate": 9.651168668378987e-06, "loss": 0.18, "num_input_tokens_seen": 48846944, "step": 22630 }, { "epoch": 4.153973206092861, "grad_norm": 0.04447672516107559, "learning_rate": 9.650874756997289e-06, "loss": 0.0047, "num_input_tokens_seen": 48855936, "step": 22635 }, { "epoch": 4.154890805652413, "grad_norm": 8.802922248840332, "learning_rate": 9.650580726327863e-06, "loss": 0.2124, "num_input_tokens_seen": 48866912, "step": 22640 }, { "epoch": 4.155808405211966, "grad_norm": 6.6227030754089355, "learning_rate": 9.65028657637825e-06, "loss": 0.2085, "num_input_tokens_seen": 48877056, "step": 22645 }, { "epoch": 4.156726004771517, "grad_norm": 0.16031846404075623, "learning_rate": 9.649992307155992e-06, "loss": 0.073, "num_input_tokens_seen": 48887936, "step": 22650 }, { "epoch": 4.15764360433107, "grad_norm": 0.1512308269739151, "learning_rate": 9.64969791866864e-06, "loss": 0.2509, "num_input_tokens_seen": 48899104, "step": 22655 }, { "epoch": 4.158561203890622, "grad_norm": 10.953701972961426, "learning_rate": 9.64940341092374e-06, "loss": 0.2415, "num_input_tokens_seen": 48909984, "step": 22660 }, { "epoch": 4.159478803450174, "grad_norm": 7.883016586303711, "learning_rate": 9.64910878392885e-06, "loss": 0.1055, "num_input_tokens_seen": 48920352, "step": 22665 }, { "epoch": 4.1603964030097265, "grad_norm": 0.07889001071453094, "learning_rate": 9.648814037691524e-06, "loss": 0.0088, "num_input_tokens_seen": 48929952, "step": 22670 }, { "epoch": 4.161314002569279, "grad_norm": 12.167373657226562, "learning_rate": 9.648519172219326e-06, "loss": 0.1882, "num_input_tokens_seen": 48940512, "step": 22675 }, { "epoch": 4.162231602128831, "grad_norm": 10.968720436096191, "learning_rate": 9.648224187519812e-06, "loss": 0.0951, "num_input_tokens_seen": 48950432, "step": 22680 }, { "epoch": 4.163149201688383, "grad_norm": 0.076833076775074, "learning_rate": 9.647929083600555e-06, "loss": 0.225, "num_input_tokens_seen": 48962016, "step": 22685 }, { "epoch": 4.164066801247936, "grad_norm": 0.4032931327819824, "learning_rate": 9.647633860469118e-06, "loss": 0.0088, "num_input_tokens_seen": 48972064, "step": 22690 }, { "epoch": 4.164984400807487, "grad_norm": 11.281493186950684, "learning_rate": 9.647338518133078e-06, "loss": 0.2518, "num_input_tokens_seen": 48983680, "step": 22695 }, { "epoch": 4.16590200036704, "grad_norm": 0.0523017942905426, "learning_rate": 9.647043056600006e-06, "loss": 0.1429, "num_input_tokens_seen": 48995392, "step": 22700 }, { "epoch": 4.166819599926592, "grad_norm": 25.559932708740234, "learning_rate": 9.646747475877483e-06, "loss": 0.1342, "num_input_tokens_seen": 49007552, "step": 22705 }, { "epoch": 4.167737199486144, "grad_norm": 26.382047653198242, "learning_rate": 9.646451775973088e-06, "loss": 0.2642, "num_input_tokens_seen": 49020000, "step": 22710 }, { "epoch": 4.168654799045696, "grad_norm": 0.19774065911769867, "learning_rate": 9.646155956894407e-06, "loss": 0.0285, "num_input_tokens_seen": 49031488, "step": 22715 }, { "epoch": 4.169572398605249, "grad_norm": 0.1620066612958908, "learning_rate": 9.645860018649027e-06, "loss": 0.1205, "num_input_tokens_seen": 49042688, "step": 22720 }, { "epoch": 4.1704899981648005, "grad_norm": 0.23924680054187775, "learning_rate": 9.645563961244537e-06, "loss": 0.0724, "num_input_tokens_seen": 49053408, "step": 22725 }, { "epoch": 4.171407597724353, "grad_norm": 7.6484856605529785, "learning_rate": 9.645267784688531e-06, "loss": 0.3906, "num_input_tokens_seen": 49063424, "step": 22730 }, { "epoch": 4.1723251972839055, "grad_norm": 8.730568885803223, "learning_rate": 9.644971488988606e-06, "loss": 0.0689, "num_input_tokens_seen": 49073280, "step": 22735 }, { "epoch": 4.173242796843457, "grad_norm": 0.09550249576568604, "learning_rate": 9.644675074152364e-06, "loss": 0.0716, "num_input_tokens_seen": 49083392, "step": 22740 }, { "epoch": 4.17416039640301, "grad_norm": 13.451776504516602, "learning_rate": 9.644378540187402e-06, "loss": 0.1916, "num_input_tokens_seen": 49093216, "step": 22745 }, { "epoch": 4.175077995962562, "grad_norm": 0.42556318640708923, "learning_rate": 9.644081887101329e-06, "loss": 0.1956, "num_input_tokens_seen": 49104128, "step": 22750 }, { "epoch": 4.175995595522114, "grad_norm": 0.8658515810966492, "learning_rate": 9.643785114901754e-06, "loss": 0.1235, "num_input_tokens_seen": 49115584, "step": 22755 }, { "epoch": 4.176913195081666, "grad_norm": 1.694260597229004, "learning_rate": 9.643488223596287e-06, "loss": 0.1598, "num_input_tokens_seen": 49127296, "step": 22760 }, { "epoch": 4.177830794641219, "grad_norm": 6.940282821655273, "learning_rate": 9.643191213192545e-06, "loss": 0.1201, "num_input_tokens_seen": 49138208, "step": 22765 }, { "epoch": 4.17874839420077, "grad_norm": 4.672567844390869, "learning_rate": 9.642894083698145e-06, "loss": 0.047, "num_input_tokens_seen": 49149024, "step": 22770 }, { "epoch": 4.179665993760323, "grad_norm": 0.7088920474052429, "learning_rate": 9.642596835120705e-06, "loss": 0.0052, "num_input_tokens_seen": 49160576, "step": 22775 }, { "epoch": 4.180583593319875, "grad_norm": 0.17243175208568573, "learning_rate": 9.642299467467854e-06, "loss": 0.0111, "num_input_tokens_seen": 49170560, "step": 22780 }, { "epoch": 4.181501192879427, "grad_norm": 0.045556582510471344, "learning_rate": 9.642001980747216e-06, "loss": 0.0818, "num_input_tokens_seen": 49182176, "step": 22785 }, { "epoch": 4.1824187924389795, "grad_norm": 0.4550413489341736, "learning_rate": 9.641704374966421e-06, "loss": 0.1659, "num_input_tokens_seen": 49193440, "step": 22790 }, { "epoch": 4.183336391998532, "grad_norm": 30.862276077270508, "learning_rate": 9.641406650133104e-06, "loss": 0.0175, "num_input_tokens_seen": 49204864, "step": 22795 }, { "epoch": 4.184253991558084, "grad_norm": 0.026434149593114853, "learning_rate": 9.6411088062549e-06, "loss": 0.0102, "num_input_tokens_seen": 49217600, "step": 22800 }, { "epoch": 4.185171591117636, "grad_norm": 0.25826606154441833, "learning_rate": 9.640810843339445e-06, "loss": 0.0956, "num_input_tokens_seen": 49228384, "step": 22805 }, { "epoch": 4.186089190677189, "grad_norm": 0.017725463956594467, "learning_rate": 9.640512761394389e-06, "loss": 0.4226, "num_input_tokens_seen": 49239104, "step": 22810 }, { "epoch": 4.18700679023674, "grad_norm": 14.101605415344238, "learning_rate": 9.64021456042737e-06, "loss": 0.2249, "num_input_tokens_seen": 49248992, "step": 22815 }, { "epoch": 4.187924389796293, "grad_norm": 8.94633674621582, "learning_rate": 9.63991624044604e-06, "loss": 0.2381, "num_input_tokens_seen": 49258944, "step": 22820 }, { "epoch": 4.188841989355845, "grad_norm": 1.3935853242874146, "learning_rate": 9.63961780145805e-06, "loss": 0.1344, "num_input_tokens_seen": 49270016, "step": 22825 }, { "epoch": 4.189759588915397, "grad_norm": 0.43466833233833313, "learning_rate": 9.639319243471052e-06, "loss": 0.0049, "num_input_tokens_seen": 49281440, "step": 22830 }, { "epoch": 4.190677188474949, "grad_norm": 24.2968807220459, "learning_rate": 9.639020566492708e-06, "loss": 0.1615, "num_input_tokens_seen": 49291584, "step": 22835 }, { "epoch": 4.191594788034502, "grad_norm": 8.698412895202637, "learning_rate": 9.638721770530677e-06, "loss": 0.1499, "num_input_tokens_seen": 49302496, "step": 22840 }, { "epoch": 4.1925123875940535, "grad_norm": 19.252838134765625, "learning_rate": 9.63842285559262e-06, "loss": 0.1146, "num_input_tokens_seen": 49312000, "step": 22845 }, { "epoch": 4.193429987153606, "grad_norm": 0.035341814160346985, "learning_rate": 9.638123821686206e-06, "loss": 0.2726, "num_input_tokens_seen": 49321280, "step": 22850 }, { "epoch": 4.1943475867131585, "grad_norm": 0.028472239151597023, "learning_rate": 9.637824668819104e-06, "loss": 0.141, "num_input_tokens_seen": 49331392, "step": 22855 }, { "epoch": 4.19526518627271, "grad_norm": 0.03984347730875015, "learning_rate": 9.63752539699899e-06, "loss": 0.2126, "num_input_tokens_seen": 49343520, "step": 22860 }, { "epoch": 4.196182785832263, "grad_norm": 2.814099073410034, "learning_rate": 9.637226006233533e-06, "loss": 0.2679, "num_input_tokens_seen": 49355296, "step": 22865 }, { "epoch": 4.197100385391815, "grad_norm": 6.1324849128723145, "learning_rate": 9.63692649653042e-06, "loss": 0.3824, "num_input_tokens_seen": 49366144, "step": 22870 }, { "epoch": 4.198017984951367, "grad_norm": 50.16313171386719, "learning_rate": 9.636626867897325e-06, "loss": 0.0454, "num_input_tokens_seen": 49377440, "step": 22875 }, { "epoch": 4.198935584510919, "grad_norm": 0.25624901056289673, "learning_rate": 9.63632712034194e-06, "loss": 0.0873, "num_input_tokens_seen": 49387872, "step": 22880 }, { "epoch": 4.199853184070472, "grad_norm": 0.0362868495285511, "learning_rate": 9.636027253871949e-06, "loss": 0.1406, "num_input_tokens_seen": 49397984, "step": 22885 }, { "epoch": 4.200770783630023, "grad_norm": 0.2533729672431946, "learning_rate": 9.635727268495043e-06, "loss": 0.0047, "num_input_tokens_seen": 49408576, "step": 22890 }, { "epoch": 4.201688383189576, "grad_norm": 0.017742637544870377, "learning_rate": 9.63542716421892e-06, "loss": 0.0087, "num_input_tokens_seen": 49419584, "step": 22895 }, { "epoch": 4.202605982749128, "grad_norm": 0.3015630841255188, "learning_rate": 9.635126941051271e-06, "loss": 0.0987, "num_input_tokens_seen": 49430656, "step": 22900 }, { "epoch": 4.20352358230868, "grad_norm": 7.950138568878174, "learning_rate": 9.634826598999802e-06, "loss": 0.1189, "num_input_tokens_seen": 49440640, "step": 22905 }, { "epoch": 4.204441181868233, "grad_norm": 0.04585615545511246, "learning_rate": 9.634526138072215e-06, "loss": 0.0475, "num_input_tokens_seen": 49451264, "step": 22910 }, { "epoch": 4.205358781427785, "grad_norm": 0.28931671380996704, "learning_rate": 9.634225558276214e-06, "loss": 0.2654, "num_input_tokens_seen": 49462080, "step": 22915 }, { "epoch": 4.206276380987337, "grad_norm": 23.640262603759766, "learning_rate": 9.63392485961951e-06, "loss": 0.1875, "num_input_tokens_seen": 49472992, "step": 22920 }, { "epoch": 4.207193980546889, "grad_norm": 75.6625747680664, "learning_rate": 9.633624042109815e-06, "loss": 0.1751, "num_input_tokens_seen": 49483520, "step": 22925 }, { "epoch": 4.208111580106442, "grad_norm": 0.5522878170013428, "learning_rate": 9.633323105754844e-06, "loss": 0.0597, "num_input_tokens_seen": 49493408, "step": 22930 }, { "epoch": 4.209029179665993, "grad_norm": 3.6542932987213135, "learning_rate": 9.633022050562318e-06, "loss": 0.3349, "num_input_tokens_seen": 49504864, "step": 22935 }, { "epoch": 4.209946779225546, "grad_norm": 9.376090049743652, "learning_rate": 9.632720876539956e-06, "loss": 0.163, "num_input_tokens_seen": 49515584, "step": 22940 }, { "epoch": 4.210864378785098, "grad_norm": 0.2584770619869232, "learning_rate": 9.632419583695484e-06, "loss": 0.0995, "num_input_tokens_seen": 49526688, "step": 22945 }, { "epoch": 4.21178197834465, "grad_norm": 14.353804588317871, "learning_rate": 9.63211817203663e-06, "loss": 0.0444, "num_input_tokens_seen": 49537792, "step": 22950 }, { "epoch": 4.2126995779042025, "grad_norm": 0.16711653769016266, "learning_rate": 9.631816641571123e-06, "loss": 0.2401, "num_input_tokens_seen": 49548384, "step": 22955 }, { "epoch": 4.213617177463755, "grad_norm": 0.059740543365478516, "learning_rate": 9.631514992306698e-06, "loss": 0.1096, "num_input_tokens_seen": 49559488, "step": 22960 }, { "epoch": 4.214534777023307, "grad_norm": 35.50243377685547, "learning_rate": 9.631213224251091e-06, "loss": 0.2048, "num_input_tokens_seen": 49570112, "step": 22965 }, { "epoch": 4.215452376582859, "grad_norm": 7.208070278167725, "learning_rate": 9.630911337412044e-06, "loss": 0.1508, "num_input_tokens_seen": 49580832, "step": 22970 }, { "epoch": 4.216369976142412, "grad_norm": 1.195664644241333, "learning_rate": 9.630609331797297e-06, "loss": 0.0186, "num_input_tokens_seen": 49591296, "step": 22975 }, { "epoch": 4.217287575701963, "grad_norm": 0.09094962477684021, "learning_rate": 9.630307207414598e-06, "loss": 0.005, "num_input_tokens_seen": 49602784, "step": 22980 }, { "epoch": 4.218205175261516, "grad_norm": 4.5684123039245605, "learning_rate": 9.630004964271696e-06, "loss": 0.014, "num_input_tokens_seen": 49613472, "step": 22985 }, { "epoch": 4.219122774821068, "grad_norm": 0.04016357660293579, "learning_rate": 9.629702602376341e-06, "loss": 0.0342, "num_input_tokens_seen": 49625248, "step": 22990 }, { "epoch": 4.22004037438062, "grad_norm": 0.03854112699627876, "learning_rate": 9.629400121736291e-06, "loss": 0.4265, "num_input_tokens_seen": 49635456, "step": 22995 }, { "epoch": 4.220957973940172, "grad_norm": 0.2209557145833969, "learning_rate": 9.629097522359304e-06, "loss": 0.0627, "num_input_tokens_seen": 49646176, "step": 23000 }, { "epoch": 4.221875573499725, "grad_norm": 13.587462425231934, "learning_rate": 9.628794804253137e-06, "loss": 0.0622, "num_input_tokens_seen": 49656576, "step": 23005 }, { "epoch": 4.2227931730592765, "grad_norm": 5.821496963500977, "learning_rate": 9.62849196742556e-06, "loss": 0.1646, "num_input_tokens_seen": 49666944, "step": 23010 }, { "epoch": 4.223710772618829, "grad_norm": 7.60090970993042, "learning_rate": 9.628189011884335e-06, "loss": 0.247, "num_input_tokens_seen": 49676512, "step": 23015 }, { "epoch": 4.2246283721783815, "grad_norm": 9.978618621826172, "learning_rate": 9.627885937637236e-06, "loss": 0.2648, "num_input_tokens_seen": 49687552, "step": 23020 }, { "epoch": 4.225545971737933, "grad_norm": 0.0685112252831459, "learning_rate": 9.627582744692036e-06, "loss": 0.0096, "num_input_tokens_seen": 49698848, "step": 23025 }, { "epoch": 4.226463571297486, "grad_norm": 1.8703813552856445, "learning_rate": 9.62727943305651e-06, "loss": 0.1906, "num_input_tokens_seen": 49709344, "step": 23030 }, { "epoch": 4.227381170857038, "grad_norm": 12.500919342041016, "learning_rate": 9.626976002738438e-06, "loss": 0.0978, "num_input_tokens_seen": 49719904, "step": 23035 }, { "epoch": 4.22829877041659, "grad_norm": 0.06144511699676514, "learning_rate": 9.626672453745603e-06, "loss": 0.1714, "num_input_tokens_seen": 49729632, "step": 23040 }, { "epoch": 4.229216369976142, "grad_norm": 0.3971754312515259, "learning_rate": 9.626368786085792e-06, "loss": 0.0913, "num_input_tokens_seen": 49740768, "step": 23045 }, { "epoch": 4.230133969535695, "grad_norm": 0.19836485385894775, "learning_rate": 9.626064999766788e-06, "loss": 0.0284, "num_input_tokens_seen": 49751072, "step": 23050 }, { "epoch": 4.231051569095246, "grad_norm": 5.341311931610107, "learning_rate": 9.62576109479639e-06, "loss": 0.163, "num_input_tokens_seen": 49761984, "step": 23055 }, { "epoch": 4.231969168654799, "grad_norm": 17.832725524902344, "learning_rate": 9.625457071182388e-06, "loss": 0.1562, "num_input_tokens_seen": 49773536, "step": 23060 }, { "epoch": 4.232886768214351, "grad_norm": 9.966115951538086, "learning_rate": 9.625152928932579e-06, "loss": 0.0889, "num_input_tokens_seen": 49784000, "step": 23065 }, { "epoch": 4.233804367773904, "grad_norm": 6.173554420471191, "learning_rate": 9.62484866805477e-06, "loss": 0.1245, "num_input_tokens_seen": 49794048, "step": 23070 }, { "epoch": 4.2347219673334555, "grad_norm": 4.802831649780273, "learning_rate": 9.624544288556757e-06, "loss": 0.1014, "num_input_tokens_seen": 49804864, "step": 23075 }, { "epoch": 4.235639566893008, "grad_norm": 0.9444950819015503, "learning_rate": 9.62423979044635e-06, "loss": 0.3206, "num_input_tokens_seen": 49814944, "step": 23080 }, { "epoch": 4.2365571664525605, "grad_norm": 0.5690320134162903, "learning_rate": 9.623935173731362e-06, "loss": 0.1346, "num_input_tokens_seen": 49825056, "step": 23085 }, { "epoch": 4.237474766012112, "grad_norm": 29.74315643310547, "learning_rate": 9.623630438419602e-06, "loss": 0.2713, "num_input_tokens_seen": 49835616, "step": 23090 }, { "epoch": 4.238392365571665, "grad_norm": 8.756902694702148, "learning_rate": 9.623325584518887e-06, "loss": 0.3009, "num_input_tokens_seen": 49846816, "step": 23095 }, { "epoch": 4.239309965131217, "grad_norm": 0.39760246872901917, "learning_rate": 9.623020612037036e-06, "loss": 0.171, "num_input_tokens_seen": 49857088, "step": 23100 }, { "epoch": 4.240227564690769, "grad_norm": 1.6413058042526245, "learning_rate": 9.622715520981871e-06, "loss": 0.2012, "num_input_tokens_seen": 49867136, "step": 23105 }, { "epoch": 4.241145164250321, "grad_norm": 0.22727219760417938, "learning_rate": 9.622410311361219e-06, "loss": 0.0865, "num_input_tokens_seen": 49877568, "step": 23110 }, { "epoch": 4.242062763809874, "grad_norm": 0.3674411475658417, "learning_rate": 9.622104983182905e-06, "loss": 0.1368, "num_input_tokens_seen": 49888128, "step": 23115 }, { "epoch": 4.242980363369425, "grad_norm": 0.6449320316314697, "learning_rate": 9.621799536454763e-06, "loss": 0.169, "num_input_tokens_seen": 49899264, "step": 23120 }, { "epoch": 4.243897962928978, "grad_norm": 3.515042304992676, "learning_rate": 9.621493971184625e-06, "loss": 0.1539, "num_input_tokens_seen": 49909920, "step": 23125 }, { "epoch": 4.24481556248853, "grad_norm": 0.14629045128822327, "learning_rate": 9.621188287380331e-06, "loss": 0.1314, "num_input_tokens_seen": 49920704, "step": 23130 }, { "epoch": 4.245733162048082, "grad_norm": 4.726999282836914, "learning_rate": 9.620882485049718e-06, "loss": 0.2221, "num_input_tokens_seen": 49931552, "step": 23135 }, { "epoch": 4.2466507616076345, "grad_norm": 4.537502765655518, "learning_rate": 9.620576564200632e-06, "loss": 0.2441, "num_input_tokens_seen": 49942784, "step": 23140 }, { "epoch": 4.247568361167187, "grad_norm": 0.39028608798980713, "learning_rate": 9.620270524840918e-06, "loss": 0.0829, "num_input_tokens_seen": 49953504, "step": 23145 }, { "epoch": 4.248485960726739, "grad_norm": 6.37429141998291, "learning_rate": 9.619964366978426e-06, "loss": 0.1869, "num_input_tokens_seen": 49963456, "step": 23150 }, { "epoch": 4.249403560286291, "grad_norm": 4.451339244842529, "learning_rate": 9.619658090621008e-06, "loss": 0.2684, "num_input_tokens_seen": 49973696, "step": 23155 }, { "epoch": 4.250321159845844, "grad_norm": 11.753779411315918, "learning_rate": 9.61935169577652e-06, "loss": 0.0727, "num_input_tokens_seen": 49984448, "step": 23160 }, { "epoch": 4.251238759405395, "grad_norm": 3.4419734477996826, "learning_rate": 9.61904518245282e-06, "loss": 0.1458, "num_input_tokens_seen": 49995264, "step": 23165 }, { "epoch": 4.252156358964948, "grad_norm": 0.1365334391593933, "learning_rate": 9.61873855065777e-06, "loss": 0.1314, "num_input_tokens_seen": 50005888, "step": 23170 }, { "epoch": 4.2530739585245, "grad_norm": 5.253973960876465, "learning_rate": 9.618431800399236e-06, "loss": 0.0505, "num_input_tokens_seen": 50015552, "step": 23175 }, { "epoch": 4.253991558084052, "grad_norm": 18.135826110839844, "learning_rate": 9.61812493168508e-06, "loss": 0.0616, "num_input_tokens_seen": 50026048, "step": 23180 }, { "epoch": 4.2549091576436044, "grad_norm": 0.15189816057682037, "learning_rate": 9.617817944523181e-06, "loss": 0.1929, "num_input_tokens_seen": 50036448, "step": 23185 }, { "epoch": 4.255826757203157, "grad_norm": 0.13456644117832184, "learning_rate": 9.617510838921407e-06, "loss": 0.2848, "num_input_tokens_seen": 50046688, "step": 23190 }, { "epoch": 4.256744356762709, "grad_norm": 0.10500821471214294, "learning_rate": 9.617203614887639e-06, "loss": 0.0262, "num_input_tokens_seen": 50057248, "step": 23195 }, { "epoch": 4.257661956322261, "grad_norm": 0.13006630539894104, "learning_rate": 9.616896272429752e-06, "loss": 0.0647, "num_input_tokens_seen": 50068576, "step": 23200 }, { "epoch": 4.258579555881814, "grad_norm": 12.698649406433105, "learning_rate": 9.61658881155563e-06, "loss": 0.1821, "num_input_tokens_seen": 50079904, "step": 23205 }, { "epoch": 4.259497155441365, "grad_norm": 22.26877212524414, "learning_rate": 9.61628123227316e-06, "loss": 0.3136, "num_input_tokens_seen": 50091648, "step": 23210 }, { "epoch": 4.260414755000918, "grad_norm": 0.0717819556593895, "learning_rate": 9.615973534590235e-06, "loss": 0.1333, "num_input_tokens_seen": 50102432, "step": 23215 }, { "epoch": 4.26133235456047, "grad_norm": 14.230853080749512, "learning_rate": 9.61566571851474e-06, "loss": 0.2574, "num_input_tokens_seen": 50114400, "step": 23220 }, { "epoch": 4.262249954120022, "grad_norm": 0.7038887739181519, "learning_rate": 9.615357784054572e-06, "loss": 0.0515, "num_input_tokens_seen": 50125184, "step": 23225 }, { "epoch": 4.263167553679574, "grad_norm": 0.06382311880588531, "learning_rate": 9.615049731217632e-06, "loss": 0.1176, "num_input_tokens_seen": 50135840, "step": 23230 }, { "epoch": 4.264085153239127, "grad_norm": 10.513602256774902, "learning_rate": 9.61474156001182e-06, "loss": 0.0163, "num_input_tokens_seen": 50146240, "step": 23235 }, { "epoch": 4.2650027527986785, "grad_norm": 0.2888842523097992, "learning_rate": 9.614433270445036e-06, "loss": 0.1189, "num_input_tokens_seen": 50157664, "step": 23240 }, { "epoch": 4.265920352358231, "grad_norm": 0.12151241302490234, "learning_rate": 9.614124862525192e-06, "loss": 0.0849, "num_input_tokens_seen": 50168768, "step": 23245 }, { "epoch": 4.2668379519177835, "grad_norm": 4.695502281188965, "learning_rate": 9.613816336260198e-06, "loss": 0.185, "num_input_tokens_seen": 50180032, "step": 23250 }, { "epoch": 4.267755551477335, "grad_norm": 0.2095254510641098, "learning_rate": 9.613507691657965e-06, "loss": 0.0907, "num_input_tokens_seen": 50191424, "step": 23255 }, { "epoch": 4.268673151036888, "grad_norm": 10.281052589416504, "learning_rate": 9.613198928726408e-06, "loss": 0.2951, "num_input_tokens_seen": 50202688, "step": 23260 }, { "epoch": 4.26959075059644, "grad_norm": 0.06596467643976212, "learning_rate": 9.612890047473449e-06, "loss": 0.0745, "num_input_tokens_seen": 50213696, "step": 23265 }, { "epoch": 4.270508350155992, "grad_norm": 17.55710220336914, "learning_rate": 9.612581047907011e-06, "loss": 0.1956, "num_input_tokens_seen": 50224192, "step": 23270 }, { "epoch": 4.271425949715544, "grad_norm": 29.019121170043945, "learning_rate": 9.612271930035017e-06, "loss": 0.129, "num_input_tokens_seen": 50235904, "step": 23275 }, { "epoch": 4.272343549275097, "grad_norm": 0.12164571136236191, "learning_rate": 9.611962693865395e-06, "loss": 0.1209, "num_input_tokens_seen": 50246720, "step": 23280 }, { "epoch": 4.273261148834648, "grad_norm": 0.06556221842765808, "learning_rate": 9.61165333940608e-06, "loss": 0.1327, "num_input_tokens_seen": 50257888, "step": 23285 }, { "epoch": 4.274178748394201, "grad_norm": 0.1622520536184311, "learning_rate": 9.611343866665004e-06, "loss": 0.1124, "num_input_tokens_seen": 50268736, "step": 23290 }, { "epoch": 4.275096347953753, "grad_norm": 55.43881607055664, "learning_rate": 9.611034275650104e-06, "loss": 0.1007, "num_input_tokens_seen": 50279776, "step": 23295 }, { "epoch": 4.276013947513305, "grad_norm": 11.238231658935547, "learning_rate": 9.610724566369322e-06, "loss": 0.1602, "num_input_tokens_seen": 50290496, "step": 23300 }, { "epoch": 4.2769315470728575, "grad_norm": 6.456679821014404, "learning_rate": 9.6104147388306e-06, "loss": 0.1958, "num_input_tokens_seen": 50301024, "step": 23305 }, { "epoch": 4.27784914663241, "grad_norm": 27.76593589782715, "learning_rate": 9.610104793041885e-06, "loss": 0.2593, "num_input_tokens_seen": 50312256, "step": 23310 }, { "epoch": 4.278766746191962, "grad_norm": 2.5871493816375732, "learning_rate": 9.609794729011128e-06, "loss": 0.1399, "num_input_tokens_seen": 50322720, "step": 23315 }, { "epoch": 4.279684345751514, "grad_norm": 0.06169911473989487, "learning_rate": 9.609484546746282e-06, "loss": 0.0427, "num_input_tokens_seen": 50334208, "step": 23320 }, { "epoch": 4.280601945311067, "grad_norm": 0.12821491062641144, "learning_rate": 9.6091742462553e-06, "loss": 0.1406, "num_input_tokens_seen": 50344064, "step": 23325 }, { "epoch": 4.281519544870618, "grad_norm": 0.11574982106685638, "learning_rate": 9.608863827546142e-06, "loss": 0.0994, "num_input_tokens_seen": 50354880, "step": 23330 }, { "epoch": 4.282437144430171, "grad_norm": 0.6562226414680481, "learning_rate": 9.60855329062677e-06, "loss": 0.2052, "num_input_tokens_seen": 50366528, "step": 23335 }, { "epoch": 4.283354743989723, "grad_norm": 0.15410295128822327, "learning_rate": 9.60824263550515e-06, "loss": 0.1833, "num_input_tokens_seen": 50377440, "step": 23340 }, { "epoch": 4.284272343549275, "grad_norm": 0.34326016902923584, "learning_rate": 9.607931862189246e-06, "loss": 0.0164, "num_input_tokens_seen": 50388224, "step": 23345 }, { "epoch": 4.285189943108827, "grad_norm": 36.45830154418945, "learning_rate": 9.607620970687032e-06, "loss": 0.2494, "num_input_tokens_seen": 50397984, "step": 23350 }, { "epoch": 4.28610754266838, "grad_norm": 0.0907263457775116, "learning_rate": 9.607309961006484e-06, "loss": 0.2845, "num_input_tokens_seen": 50409568, "step": 23355 }, { "epoch": 4.2870251422279315, "grad_norm": 0.36234042048454285, "learning_rate": 9.606998833155574e-06, "loss": 0.071, "num_input_tokens_seen": 50420576, "step": 23360 }, { "epoch": 4.287942741787484, "grad_norm": 28.212974548339844, "learning_rate": 9.606687587142284e-06, "loss": 0.1113, "num_input_tokens_seen": 50431392, "step": 23365 }, { "epoch": 4.2888603413470365, "grad_norm": 3.3729326725006104, "learning_rate": 9.606376222974599e-06, "loss": 0.1083, "num_input_tokens_seen": 50441056, "step": 23370 }, { "epoch": 4.289777940906588, "grad_norm": 36.324867248535156, "learning_rate": 9.606064740660501e-06, "loss": 0.1757, "num_input_tokens_seen": 50453024, "step": 23375 }, { "epoch": 4.290695540466141, "grad_norm": 0.7261828780174255, "learning_rate": 9.60575314020798e-06, "loss": 0.0654, "num_input_tokens_seen": 50463616, "step": 23380 }, { "epoch": 4.291613140025693, "grad_norm": 46.04784393310547, "learning_rate": 9.605441421625032e-06, "loss": 0.0788, "num_input_tokens_seen": 50473632, "step": 23385 }, { "epoch": 4.292530739585245, "grad_norm": 0.0938330739736557, "learning_rate": 9.605129584919649e-06, "loss": 0.0213, "num_input_tokens_seen": 50483808, "step": 23390 }, { "epoch": 4.293448339144797, "grad_norm": 0.030813485383987427, "learning_rate": 9.604817630099827e-06, "loss": 0.0811, "num_input_tokens_seen": 50494784, "step": 23395 }, { "epoch": 4.29436593870435, "grad_norm": 14.536201477050781, "learning_rate": 9.604505557173573e-06, "loss": 0.3053, "num_input_tokens_seen": 50505344, "step": 23400 }, { "epoch": 4.295283538263901, "grad_norm": 0.14227180182933807, "learning_rate": 9.604193366148887e-06, "loss": 0.2326, "num_input_tokens_seen": 50515680, "step": 23405 }, { "epoch": 4.296201137823454, "grad_norm": 4.284005165100098, "learning_rate": 9.603881057033775e-06, "loss": 0.0996, "num_input_tokens_seen": 50526592, "step": 23410 }, { "epoch": 4.297118737383006, "grad_norm": 0.26511794328689575, "learning_rate": 9.60356862983625e-06, "loss": 0.085, "num_input_tokens_seen": 50536704, "step": 23415 }, { "epoch": 4.298036336942558, "grad_norm": 0.20455268025398254, "learning_rate": 9.603256084564325e-06, "loss": 0.2458, "num_input_tokens_seen": 50548000, "step": 23420 }, { "epoch": 4.2989539365021106, "grad_norm": 25.635709762573242, "learning_rate": 9.602943421226017e-06, "loss": 0.1047, "num_input_tokens_seen": 50559616, "step": 23425 }, { "epoch": 4.299871536061663, "grad_norm": 23.344327926635742, "learning_rate": 9.602630639829343e-06, "loss": 0.2214, "num_input_tokens_seen": 50569312, "step": 23430 }, { "epoch": 4.300789135621215, "grad_norm": 28.42580795288086, "learning_rate": 9.602317740382325e-06, "loss": 0.2718, "num_input_tokens_seen": 50579328, "step": 23435 }, { "epoch": 4.301706735180767, "grad_norm": 0.09524093568325043, "learning_rate": 9.602004722892993e-06, "loss": 0.139, "num_input_tokens_seen": 50589760, "step": 23440 }, { "epoch": 4.30262433474032, "grad_norm": 0.7190080285072327, "learning_rate": 9.60169158736937e-06, "loss": 0.0148, "num_input_tokens_seen": 50600608, "step": 23445 }, { "epoch": 4.303541934299871, "grad_norm": 25.54874610900879, "learning_rate": 9.60137833381949e-06, "loss": 0.1001, "num_input_tokens_seen": 50610688, "step": 23450 }, { "epoch": 4.304459533859424, "grad_norm": 0.09018006175756454, "learning_rate": 9.601064962251387e-06, "loss": 0.2555, "num_input_tokens_seen": 50621728, "step": 23455 }, { "epoch": 4.305377133418976, "grad_norm": 0.6665626764297485, "learning_rate": 9.6007514726731e-06, "loss": 0.1237, "num_input_tokens_seen": 50631712, "step": 23460 }, { "epoch": 4.306294732978528, "grad_norm": 0.0914514809846878, "learning_rate": 9.600437865092667e-06, "loss": 0.2316, "num_input_tokens_seen": 50642528, "step": 23465 }, { "epoch": 4.3072123325380804, "grad_norm": 1.6120169162750244, "learning_rate": 9.600124139518134e-06, "loss": 0.1459, "num_input_tokens_seen": 50652704, "step": 23470 }, { "epoch": 4.308129932097633, "grad_norm": 0.019702473655343056, "learning_rate": 9.599810295957545e-06, "loss": 0.2941, "num_input_tokens_seen": 50663712, "step": 23475 }, { "epoch": 4.309047531657185, "grad_norm": 32.26804733276367, "learning_rate": 9.599496334418952e-06, "loss": 0.2223, "num_input_tokens_seen": 50674784, "step": 23480 }, { "epoch": 4.309965131216737, "grad_norm": 27.648256301879883, "learning_rate": 9.599182254910407e-06, "loss": 0.0879, "num_input_tokens_seen": 50684352, "step": 23485 }, { "epoch": 4.31088273077629, "grad_norm": 4.292042255401611, "learning_rate": 9.598868057439965e-06, "loss": 0.3024, "num_input_tokens_seen": 50695584, "step": 23490 }, { "epoch": 4.311800330335841, "grad_norm": 9.268238067626953, "learning_rate": 9.598553742015685e-06, "loss": 0.0867, "num_input_tokens_seen": 50706400, "step": 23495 }, { "epoch": 4.312717929895394, "grad_norm": 0.2291639745235443, "learning_rate": 9.598239308645627e-06, "loss": 0.0958, "num_input_tokens_seen": 50717376, "step": 23500 }, { "epoch": 4.313635529454946, "grad_norm": 0.547892153263092, "learning_rate": 9.59792475733786e-06, "loss": 0.2557, "num_input_tokens_seen": 50728000, "step": 23505 }, { "epoch": 4.314553129014498, "grad_norm": 72.72742462158203, "learning_rate": 9.597610088100446e-06, "loss": 0.3466, "num_input_tokens_seen": 50738432, "step": 23510 }, { "epoch": 4.31547072857405, "grad_norm": 3.832723379135132, "learning_rate": 9.59729530094146e-06, "loss": 0.0385, "num_input_tokens_seen": 50749536, "step": 23515 }, { "epoch": 4.316388328133603, "grad_norm": 0.05185819789767265, "learning_rate": 9.596980395868976e-06, "loss": 0.0679, "num_input_tokens_seen": 50759840, "step": 23520 }, { "epoch": 4.3173059276931545, "grad_norm": 16.28123664855957, "learning_rate": 9.59666537289107e-06, "loss": 0.3419, "num_input_tokens_seen": 50770016, "step": 23525 }, { "epoch": 4.318223527252707, "grad_norm": 0.37377026677131653, "learning_rate": 9.59635023201582e-06, "loss": 0.0394, "num_input_tokens_seen": 50781952, "step": 23530 }, { "epoch": 4.3191411268122595, "grad_norm": 0.11116161942481995, "learning_rate": 9.59603497325131e-06, "loss": 0.0233, "num_input_tokens_seen": 50792960, "step": 23535 }, { "epoch": 4.320058726371811, "grad_norm": 1.9385249614715576, "learning_rate": 9.595719596605626e-06, "loss": 0.2013, "num_input_tokens_seen": 50803360, "step": 23540 }, { "epoch": 4.320976325931364, "grad_norm": 0.05711859464645386, "learning_rate": 9.595404102086858e-06, "loss": 0.1129, "num_input_tokens_seen": 50812672, "step": 23545 }, { "epoch": 4.321893925490916, "grad_norm": 0.1028052344918251, "learning_rate": 9.595088489703096e-06, "loss": 0.0787, "num_input_tokens_seen": 50823296, "step": 23550 }, { "epoch": 4.322811525050468, "grad_norm": 0.11301324516534805, "learning_rate": 9.594772759462436e-06, "loss": 0.1096, "num_input_tokens_seen": 50833888, "step": 23555 }, { "epoch": 4.32372912461002, "grad_norm": 12.741293907165527, "learning_rate": 9.594456911372978e-06, "loss": 0.3006, "num_input_tokens_seen": 50843904, "step": 23560 }, { "epoch": 4.324646724169573, "grad_norm": 0.05926654487848282, "learning_rate": 9.594140945442819e-06, "loss": 0.0025, "num_input_tokens_seen": 50852480, "step": 23565 }, { "epoch": 4.325564323729124, "grad_norm": 0.03981003537774086, "learning_rate": 9.593824861680065e-06, "loss": 0.0443, "num_input_tokens_seen": 50863232, "step": 23570 }, { "epoch": 4.326481923288677, "grad_norm": 0.742175281047821, "learning_rate": 9.593508660092824e-06, "loss": 0.0878, "num_input_tokens_seen": 50873536, "step": 23575 }, { "epoch": 4.327399522848229, "grad_norm": 0.02881467714905739, "learning_rate": 9.593192340689204e-06, "loss": 0.0062, "num_input_tokens_seen": 50883360, "step": 23580 }, { "epoch": 4.328317122407781, "grad_norm": 0.05013616755604744, "learning_rate": 9.592875903477321e-06, "loss": 0.1076, "num_input_tokens_seen": 50892768, "step": 23585 }, { "epoch": 4.3292347219673335, "grad_norm": 1.8770495653152466, "learning_rate": 9.592559348465289e-06, "loss": 0.3035, "num_input_tokens_seen": 50903136, "step": 23590 }, { "epoch": 4.330152321526886, "grad_norm": 17.84748077392578, "learning_rate": 9.592242675661227e-06, "loss": 0.0391, "num_input_tokens_seen": 50913216, "step": 23595 }, { "epoch": 4.331069921086438, "grad_norm": 0.1061304435133934, "learning_rate": 9.591925885073257e-06, "loss": 0.2575, "num_input_tokens_seen": 50924128, "step": 23600 }, { "epoch": 4.33198752064599, "grad_norm": 0.0752595067024231, "learning_rate": 9.591608976709505e-06, "loss": 0.0578, "num_input_tokens_seen": 50933952, "step": 23605 }, { "epoch": 4.332905120205543, "grad_norm": 0.060467395931482315, "learning_rate": 9.5912919505781e-06, "loss": 0.1732, "num_input_tokens_seen": 50945792, "step": 23610 }, { "epoch": 4.333822719765094, "grad_norm": 0.18559721112251282, "learning_rate": 9.590974806687173e-06, "loss": 0.0186, "num_input_tokens_seen": 50956928, "step": 23615 }, { "epoch": 4.334740319324647, "grad_norm": 22.18149185180664, "learning_rate": 9.590657545044856e-06, "loss": 0.2409, "num_input_tokens_seen": 50968064, "step": 23620 }, { "epoch": 4.335657918884199, "grad_norm": 26.075639724731445, "learning_rate": 9.590340165659288e-06, "loss": 0.0911, "num_input_tokens_seen": 50978336, "step": 23625 }, { "epoch": 4.336575518443751, "grad_norm": 9.872845649719238, "learning_rate": 9.59002266853861e-06, "loss": 0.415, "num_input_tokens_seen": 50989536, "step": 23630 }, { "epoch": 4.337493118003303, "grad_norm": 0.0495581328868866, "learning_rate": 9.589705053690963e-06, "loss": 0.1563, "num_input_tokens_seen": 51001728, "step": 23635 }, { "epoch": 4.338410717562856, "grad_norm": 12.824967384338379, "learning_rate": 9.589387321124496e-06, "loss": 0.3695, "num_input_tokens_seen": 51014720, "step": 23640 }, { "epoch": 4.3393283171224075, "grad_norm": 20.054346084594727, "learning_rate": 9.589069470847358e-06, "loss": 0.0761, "num_input_tokens_seen": 51025536, "step": 23645 }, { "epoch": 4.34024591668196, "grad_norm": 0.15329614281654358, "learning_rate": 9.588751502867699e-06, "loss": 0.086, "num_input_tokens_seen": 51034752, "step": 23650 }, { "epoch": 4.3411635162415125, "grad_norm": 1.012352705001831, "learning_rate": 9.588433417193677e-06, "loss": 0.2044, "num_input_tokens_seen": 51044640, "step": 23655 }, { "epoch": 4.342081115801064, "grad_norm": 13.355831146240234, "learning_rate": 9.588115213833447e-06, "loss": 0.0094, "num_input_tokens_seen": 51054880, "step": 23660 }, { "epoch": 4.342998715360617, "grad_norm": 5.888176441192627, "learning_rate": 9.587796892795175e-06, "loss": 0.2934, "num_input_tokens_seen": 51064384, "step": 23665 }, { "epoch": 4.343916314920169, "grad_norm": 1.5330110788345337, "learning_rate": 9.587478454087022e-06, "loss": 0.1038, "num_input_tokens_seen": 51074784, "step": 23670 }, { "epoch": 4.344833914479721, "grad_norm": 0.33735841512680054, "learning_rate": 9.587159897717157e-06, "loss": 0.1694, "num_input_tokens_seen": 51086688, "step": 23675 }, { "epoch": 4.345751514039273, "grad_norm": 31.705463409423828, "learning_rate": 9.586841223693751e-06, "loss": 0.229, "num_input_tokens_seen": 51097600, "step": 23680 }, { "epoch": 4.346669113598826, "grad_norm": 10.662971496582031, "learning_rate": 9.586522432024974e-06, "loss": 0.3114, "num_input_tokens_seen": 51108832, "step": 23685 }, { "epoch": 4.347586713158377, "grad_norm": 60.13887405395508, "learning_rate": 9.586203522719007e-06, "loss": 0.0636, "num_input_tokens_seen": 51119872, "step": 23690 }, { "epoch": 4.34850431271793, "grad_norm": 5.488757133483887, "learning_rate": 9.585884495784027e-06, "loss": 0.4723, "num_input_tokens_seen": 51130592, "step": 23695 }, { "epoch": 4.349421912277482, "grad_norm": 0.8088352084159851, "learning_rate": 9.585565351228218e-06, "loss": 0.0083, "num_input_tokens_seen": 51141792, "step": 23700 }, { "epoch": 4.350339511837034, "grad_norm": 0.756000816822052, "learning_rate": 9.585246089059765e-06, "loss": 0.0648, "num_input_tokens_seen": 51152160, "step": 23705 }, { "epoch": 4.3512571113965866, "grad_norm": 0.4289403259754181, "learning_rate": 9.584926709286855e-06, "loss": 0.0274, "num_input_tokens_seen": 51162400, "step": 23710 }, { "epoch": 4.352174710956139, "grad_norm": 28.676855087280273, "learning_rate": 9.584607211917681e-06, "loss": 0.083, "num_input_tokens_seen": 51173600, "step": 23715 }, { "epoch": 4.353092310515691, "grad_norm": 4.9079909324646, "learning_rate": 9.584287596960437e-06, "loss": 0.2302, "num_input_tokens_seen": 51184512, "step": 23720 }, { "epoch": 4.354009910075243, "grad_norm": 0.18717044591903687, "learning_rate": 9.583967864423323e-06, "loss": 0.2057, "num_input_tokens_seen": 51194848, "step": 23725 }, { "epoch": 4.354927509634796, "grad_norm": 65.63626098632812, "learning_rate": 9.583648014314537e-06, "loss": 0.1538, "num_input_tokens_seen": 51205024, "step": 23730 }, { "epoch": 4.355845109194347, "grad_norm": 0.24254079163074493, "learning_rate": 9.583328046642283e-06, "loss": 0.0142, "num_input_tokens_seen": 51214944, "step": 23735 }, { "epoch": 4.3567627087539, "grad_norm": 0.08990448713302612, "learning_rate": 9.583007961414769e-06, "loss": 0.0851, "num_input_tokens_seen": 51226176, "step": 23740 }, { "epoch": 4.357680308313452, "grad_norm": 2.100212574005127, "learning_rate": 9.582687758640204e-06, "loss": 0.1811, "num_input_tokens_seen": 51237632, "step": 23745 }, { "epoch": 4.358597907873004, "grad_norm": 0.10480672121047974, "learning_rate": 9.582367438326799e-06, "loss": 0.0069, "num_input_tokens_seen": 51247328, "step": 23750 }, { "epoch": 4.3595155074325564, "grad_norm": 8.386570930480957, "learning_rate": 9.58204700048277e-06, "loss": 0.1019, "num_input_tokens_seen": 51256672, "step": 23755 }, { "epoch": 4.360433106992109, "grad_norm": 10.12106990814209, "learning_rate": 9.58172644511634e-06, "loss": 0.1493, "num_input_tokens_seen": 51266656, "step": 23760 }, { "epoch": 4.361350706551661, "grad_norm": 0.039969898760318756, "learning_rate": 9.581405772235726e-06, "loss": 0.2343, "num_input_tokens_seen": 51276544, "step": 23765 }, { "epoch": 4.362268306111213, "grad_norm": 18.225893020629883, "learning_rate": 9.581084981849156e-06, "loss": 0.3093, "num_input_tokens_seen": 51287424, "step": 23770 }, { "epoch": 4.363185905670766, "grad_norm": 3.3079006671905518, "learning_rate": 9.580764073964855e-06, "loss": 0.1355, "num_input_tokens_seen": 51298784, "step": 23775 }, { "epoch": 4.364103505230317, "grad_norm": 0.7168108224868774, "learning_rate": 9.580443048591055e-06, "loss": 0.0044, "num_input_tokens_seen": 51310080, "step": 23780 }, { "epoch": 4.36502110478987, "grad_norm": 0.5340812802314758, "learning_rate": 9.58012190573599e-06, "loss": 0.0617, "num_input_tokens_seen": 51320480, "step": 23785 }, { "epoch": 4.365938704349422, "grad_norm": 9.269046783447266, "learning_rate": 9.579800645407897e-06, "loss": 0.2194, "num_input_tokens_seen": 51331104, "step": 23790 }, { "epoch": 4.366856303908974, "grad_norm": 21.889650344848633, "learning_rate": 9.579479267615016e-06, "loss": 0.1312, "num_input_tokens_seen": 51341984, "step": 23795 }, { "epoch": 4.367773903468526, "grad_norm": 0.8714179396629333, "learning_rate": 9.579157772365589e-06, "loss": 0.0972, "num_input_tokens_seen": 51352672, "step": 23800 }, { "epoch": 4.368691503028079, "grad_norm": 0.13162608444690704, "learning_rate": 9.578836159667861e-06, "loss": 0.1109, "num_input_tokens_seen": 51363264, "step": 23805 }, { "epoch": 4.3696091025876305, "grad_norm": 1.8655799627304077, "learning_rate": 9.578514429530084e-06, "loss": 0.1113, "num_input_tokens_seen": 51374080, "step": 23810 }, { "epoch": 4.370526702147183, "grad_norm": 0.07196254283189774, "learning_rate": 9.578192581960506e-06, "loss": 0.0068, "num_input_tokens_seen": 51385824, "step": 23815 }, { "epoch": 4.3714443017067355, "grad_norm": 11.030750274658203, "learning_rate": 9.577870616967386e-06, "loss": 0.1154, "num_input_tokens_seen": 51396864, "step": 23820 }, { "epoch": 4.372361901266287, "grad_norm": 57.63529586791992, "learning_rate": 9.577548534558979e-06, "loss": 0.2005, "num_input_tokens_seen": 51408544, "step": 23825 }, { "epoch": 4.37327950082584, "grad_norm": 0.020325597375631332, "learning_rate": 9.577226334743546e-06, "loss": 0.0033, "num_input_tokens_seen": 51418368, "step": 23830 }, { "epoch": 4.374197100385392, "grad_norm": 0.017587870359420776, "learning_rate": 9.576904017529351e-06, "loss": 0.0219, "num_input_tokens_seen": 51429632, "step": 23835 }, { "epoch": 4.375114699944944, "grad_norm": 0.3135976195335388, "learning_rate": 9.576581582924663e-06, "loss": 0.0029, "num_input_tokens_seen": 51440352, "step": 23840 }, { "epoch": 4.376032299504496, "grad_norm": 35.31826400756836, "learning_rate": 9.57625903093775e-06, "loss": 0.2928, "num_input_tokens_seen": 51450368, "step": 23845 }, { "epoch": 4.376949899064049, "grad_norm": 0.26258745789527893, "learning_rate": 9.575936361576884e-06, "loss": 0.0024, "num_input_tokens_seen": 51461920, "step": 23850 }, { "epoch": 4.3778674986236, "grad_norm": 0.05553887039422989, "learning_rate": 9.575613574850344e-06, "loss": 0.1469, "num_input_tokens_seen": 51473248, "step": 23855 }, { "epoch": 4.378785098183153, "grad_norm": 14.480026245117188, "learning_rate": 9.575290670766406e-06, "loss": 0.2669, "num_input_tokens_seen": 51484448, "step": 23860 }, { "epoch": 4.379702697742705, "grad_norm": 0.062082644551992416, "learning_rate": 9.574967649333354e-06, "loss": 0.0046, "num_input_tokens_seen": 51494752, "step": 23865 }, { "epoch": 4.380620297302257, "grad_norm": 9.195098876953125, "learning_rate": 9.574644510559472e-06, "loss": 0.0111, "num_input_tokens_seen": 51505472, "step": 23870 }, { "epoch": 4.3815378968618095, "grad_norm": 33.2442626953125, "learning_rate": 9.57432125445305e-06, "loss": 0.2879, "num_input_tokens_seen": 51515424, "step": 23875 }, { "epoch": 4.382455496421362, "grad_norm": 2.239999294281006, "learning_rate": 9.573997881022377e-06, "loss": 0.3382, "num_input_tokens_seen": 51526912, "step": 23880 }, { "epoch": 4.383373095980914, "grad_norm": 0.01854429952800274, "learning_rate": 9.573674390275746e-06, "loss": 0.1217, "num_input_tokens_seen": 51537632, "step": 23885 }, { "epoch": 4.384290695540466, "grad_norm": 4.2342352867126465, "learning_rate": 9.573350782221456e-06, "loss": 0.0071, "num_input_tokens_seen": 51547328, "step": 23890 }, { "epoch": 4.385208295100019, "grad_norm": 13.801365852355957, "learning_rate": 9.573027056867807e-06, "loss": 0.2948, "num_input_tokens_seen": 51558720, "step": 23895 }, { "epoch": 4.38612589465957, "grad_norm": 0.04500025883316994, "learning_rate": 9.5727032142231e-06, "loss": 0.5544, "num_input_tokens_seen": 51570304, "step": 23900 }, { "epoch": 4.387043494219123, "grad_norm": 0.1546841561794281, "learning_rate": 9.572379254295645e-06, "loss": 0.281, "num_input_tokens_seen": 51580768, "step": 23905 }, { "epoch": 4.387961093778675, "grad_norm": 0.09255164861679077, "learning_rate": 9.572055177093747e-06, "loss": 0.0564, "num_input_tokens_seen": 51593088, "step": 23910 }, { "epoch": 4.388878693338227, "grad_norm": 0.3290390074253082, "learning_rate": 9.57173098262572e-06, "loss": 0.1966, "num_input_tokens_seen": 51603328, "step": 23915 }, { "epoch": 4.389796292897779, "grad_norm": 5.365823268890381, "learning_rate": 9.571406670899879e-06, "loss": 0.1294, "num_input_tokens_seen": 51615680, "step": 23920 }, { "epoch": 4.390713892457332, "grad_norm": 0.13019706308841705, "learning_rate": 9.57108224192454e-06, "loss": 0.3182, "num_input_tokens_seen": 51626464, "step": 23925 }, { "epoch": 4.3916314920168835, "grad_norm": 0.12311115860939026, "learning_rate": 9.57075769570803e-06, "loss": 0.0797, "num_input_tokens_seen": 51637600, "step": 23930 }, { "epoch": 4.392549091576436, "grad_norm": 1.2925776243209839, "learning_rate": 9.570433032258666e-06, "loss": 0.0726, "num_input_tokens_seen": 51648416, "step": 23935 }, { "epoch": 4.3934666911359885, "grad_norm": 0.13704945147037506, "learning_rate": 9.57010825158478e-06, "loss": 0.2147, "num_input_tokens_seen": 51658624, "step": 23940 }, { "epoch": 4.39438429069554, "grad_norm": 0.1901206374168396, "learning_rate": 9.569783353694699e-06, "loss": 0.072, "num_input_tokens_seen": 51669120, "step": 23945 }, { "epoch": 4.395301890255093, "grad_norm": 1.7396305799484253, "learning_rate": 9.56945833859676e-06, "loss": 0.007, "num_input_tokens_seen": 51679392, "step": 23950 }, { "epoch": 4.396219489814645, "grad_norm": 9.855169296264648, "learning_rate": 9.569133206299294e-06, "loss": 0.1897, "num_input_tokens_seen": 51690400, "step": 23955 }, { "epoch": 4.397137089374197, "grad_norm": 0.05353483557701111, "learning_rate": 9.568807956810645e-06, "loss": 0.0027, "num_input_tokens_seen": 51701248, "step": 23960 }, { "epoch": 4.398054688933749, "grad_norm": 0.03876363858580589, "learning_rate": 9.56848259013915e-06, "loss": 0.0036, "num_input_tokens_seen": 51712032, "step": 23965 }, { "epoch": 4.398972288493302, "grad_norm": 0.5565535426139832, "learning_rate": 9.568157106293158e-06, "loss": 0.1069, "num_input_tokens_seen": 51722048, "step": 23970 }, { "epoch": 4.399889888052853, "grad_norm": 0.47407862544059753, "learning_rate": 9.567831505281018e-06, "loss": 0.0185, "num_input_tokens_seen": 51734080, "step": 23975 }, { "epoch": 4.400807487612406, "grad_norm": 0.2881547510623932, "learning_rate": 9.567505787111078e-06, "loss": 0.0065, "num_input_tokens_seen": 51744960, "step": 23980 }, { "epoch": 4.401725087171958, "grad_norm": 0.040614400058984756, "learning_rate": 9.567179951791695e-06, "loss": 0.1589, "num_input_tokens_seen": 51755520, "step": 23985 }, { "epoch": 4.40264268673151, "grad_norm": 10.362123489379883, "learning_rate": 9.566853999331222e-06, "loss": 0.3909, "num_input_tokens_seen": 51765952, "step": 23990 }, { "epoch": 4.4035602862910626, "grad_norm": 45.09419250488281, "learning_rate": 9.566527929738023e-06, "loss": 0.4681, "num_input_tokens_seen": 51777152, "step": 23995 }, { "epoch": 4.404477885850615, "grad_norm": 5.2061285972595215, "learning_rate": 9.56620174302046e-06, "loss": 0.0077, "num_input_tokens_seen": 51787552, "step": 24000 }, { "epoch": 4.405395485410167, "grad_norm": 25.79252052307129, "learning_rate": 9.565875439186901e-06, "loss": 0.4114, "num_input_tokens_seen": 51798656, "step": 24005 }, { "epoch": 4.406313084969719, "grad_norm": 0.03532395511865616, "learning_rate": 9.565549018245712e-06, "loss": 0.0436, "num_input_tokens_seen": 51808736, "step": 24010 }, { "epoch": 4.407230684529272, "grad_norm": 0.25143933296203613, "learning_rate": 9.565222480205268e-06, "loss": 0.1076, "num_input_tokens_seen": 51818432, "step": 24015 }, { "epoch": 4.408148284088823, "grad_norm": 0.11803437024354935, "learning_rate": 9.56489582507394e-06, "loss": 0.0919, "num_input_tokens_seen": 51828032, "step": 24020 }, { "epoch": 4.409065883648376, "grad_norm": 0.11863385140895844, "learning_rate": 9.564569052860111e-06, "loss": 0.0144, "num_input_tokens_seen": 51838880, "step": 24025 }, { "epoch": 4.409983483207928, "grad_norm": 76.13721466064453, "learning_rate": 9.56424216357216e-06, "loss": 0.1776, "num_input_tokens_seen": 51850816, "step": 24030 }, { "epoch": 4.41090108276748, "grad_norm": 6.9175028800964355, "learning_rate": 9.56391515721847e-06, "loss": 0.0102, "num_input_tokens_seen": 51861024, "step": 24035 }, { "epoch": 4.4118186823270324, "grad_norm": 0.031384099274873734, "learning_rate": 9.563588033807431e-06, "loss": 0.0038, "num_input_tokens_seen": 51871264, "step": 24040 }, { "epoch": 4.412736281886585, "grad_norm": 0.023467253893613815, "learning_rate": 9.563260793347433e-06, "loss": 0.1929, "num_input_tokens_seen": 51881280, "step": 24045 }, { "epoch": 4.413653881446137, "grad_norm": 14.19885540008545, "learning_rate": 9.562933435846868e-06, "loss": 0.0391, "num_input_tokens_seen": 51892320, "step": 24050 }, { "epoch": 4.414571481005689, "grad_norm": 0.012151972390711308, "learning_rate": 9.56260596131413e-06, "loss": 0.0022, "num_input_tokens_seen": 51902848, "step": 24055 }, { "epoch": 4.415489080565242, "grad_norm": 0.01032968983054161, "learning_rate": 9.562278369757623e-06, "loss": 0.0103, "num_input_tokens_seen": 51913504, "step": 24060 }, { "epoch": 4.416406680124793, "grad_norm": 0.469563364982605, "learning_rate": 9.561950661185744e-06, "loss": 0.0061, "num_input_tokens_seen": 51924672, "step": 24065 }, { "epoch": 4.417324279684346, "grad_norm": 0.013872710056602955, "learning_rate": 9.561622835606903e-06, "loss": 0.1381, "num_input_tokens_seen": 51936032, "step": 24070 }, { "epoch": 4.418241879243898, "grad_norm": 1.7537580728530884, "learning_rate": 9.561294893029504e-06, "loss": 0.0038, "num_input_tokens_seen": 51945952, "step": 24075 }, { "epoch": 4.41915947880345, "grad_norm": 40.843509674072266, "learning_rate": 9.560966833461964e-06, "loss": 0.2213, "num_input_tokens_seen": 51956768, "step": 24080 }, { "epoch": 4.420077078363002, "grad_norm": 17.496320724487305, "learning_rate": 9.56063865691269e-06, "loss": 0.7005, "num_input_tokens_seen": 51967776, "step": 24085 }, { "epoch": 4.420994677922555, "grad_norm": 0.5615138411521912, "learning_rate": 9.560310363390105e-06, "loss": 0.2055, "num_input_tokens_seen": 51978336, "step": 24090 }, { "epoch": 4.4219122774821065, "grad_norm": 0.08631449937820435, "learning_rate": 9.559981952902626e-06, "loss": 0.1062, "num_input_tokens_seen": 51989632, "step": 24095 }, { "epoch": 4.422829877041659, "grad_norm": 17.98162269592285, "learning_rate": 9.559653425458677e-06, "loss": 0.1526, "num_input_tokens_seen": 51999904, "step": 24100 }, { "epoch": 4.4237474766012115, "grad_norm": 6.259000301361084, "learning_rate": 9.559324781066686e-06, "loss": 0.0658, "num_input_tokens_seen": 52008928, "step": 24105 }, { "epoch": 4.424665076160763, "grad_norm": 27.584409713745117, "learning_rate": 9.558996019735078e-06, "loss": 0.1889, "num_input_tokens_seen": 52020896, "step": 24110 }, { "epoch": 4.425582675720316, "grad_norm": 3.9257543087005615, "learning_rate": 9.55866714147229e-06, "loss": 0.3319, "num_input_tokens_seen": 52030240, "step": 24115 }, { "epoch": 4.426500275279868, "grad_norm": 14.318163871765137, "learning_rate": 9.558338146286756e-06, "loss": 0.2038, "num_input_tokens_seen": 52040288, "step": 24120 }, { "epoch": 4.42741787483942, "grad_norm": 1.6339516639709473, "learning_rate": 9.55800903418691e-06, "loss": 0.0506, "num_input_tokens_seen": 52050592, "step": 24125 }, { "epoch": 4.428335474398972, "grad_norm": 0.5370038151741028, "learning_rate": 9.5576798051812e-06, "loss": 0.1002, "num_input_tokens_seen": 52060384, "step": 24130 }, { "epoch": 4.429253073958525, "grad_norm": 0.0894952118396759, "learning_rate": 9.557350459278065e-06, "loss": 0.0296, "num_input_tokens_seen": 52071712, "step": 24135 }, { "epoch": 4.430170673518076, "grad_norm": 0.35233384370803833, "learning_rate": 9.557020996485954e-06, "loss": 0.1114, "num_input_tokens_seen": 52083936, "step": 24140 }, { "epoch": 4.431088273077629, "grad_norm": 0.045547597110271454, "learning_rate": 9.556691416813317e-06, "loss": 0.0157, "num_input_tokens_seen": 52094112, "step": 24145 }, { "epoch": 4.432005872637181, "grad_norm": 0.23354670405387878, "learning_rate": 9.556361720268609e-06, "loss": 0.4493, "num_input_tokens_seen": 52105248, "step": 24150 }, { "epoch": 4.432923472196733, "grad_norm": 0.7705771923065186, "learning_rate": 9.556031906860283e-06, "loss": 0.0323, "num_input_tokens_seen": 52115968, "step": 24155 }, { "epoch": 4.4338410717562855, "grad_norm": 10.85600471496582, "learning_rate": 9.555701976596801e-06, "loss": 0.2072, "num_input_tokens_seen": 52127264, "step": 24160 }, { "epoch": 4.434758671315838, "grad_norm": 5.139575958251953, "learning_rate": 9.555371929486621e-06, "loss": 0.0248, "num_input_tokens_seen": 52137920, "step": 24165 }, { "epoch": 4.43567627087539, "grad_norm": 29.180274963378906, "learning_rate": 9.555041765538215e-06, "loss": 0.1383, "num_input_tokens_seen": 52148928, "step": 24170 }, { "epoch": 4.436593870434942, "grad_norm": 0.040412552654743195, "learning_rate": 9.554711484760046e-06, "loss": 0.0528, "num_input_tokens_seen": 52161280, "step": 24175 }, { "epoch": 4.437511469994495, "grad_norm": 1.6008273363113403, "learning_rate": 9.554381087160586e-06, "loss": 0.0648, "num_input_tokens_seen": 52171360, "step": 24180 }, { "epoch": 4.438429069554046, "grad_norm": 0.05658550560474396, "learning_rate": 9.55405057274831e-06, "loss": 0.1532, "num_input_tokens_seen": 52181056, "step": 24185 }, { "epoch": 4.439346669113599, "grad_norm": 1.9811887741088867, "learning_rate": 9.553719941531695e-06, "loss": 0.1953, "num_input_tokens_seen": 52191840, "step": 24190 }, { "epoch": 4.440264268673151, "grad_norm": 0.1405249536037445, "learning_rate": 9.553389193519221e-06, "loss": 0.1816, "num_input_tokens_seen": 52202816, "step": 24195 }, { "epoch": 4.441181868232703, "grad_norm": 5.063675880432129, "learning_rate": 9.553058328719372e-06, "loss": 0.1865, "num_input_tokens_seen": 52212416, "step": 24200 }, { "epoch": 4.442099467792255, "grad_norm": 45.98286819458008, "learning_rate": 9.552727347140634e-06, "loss": 0.221, "num_input_tokens_seen": 52223392, "step": 24205 }, { "epoch": 4.443017067351808, "grad_norm": 2.658015727996826, "learning_rate": 9.552396248791494e-06, "loss": 0.2107, "num_input_tokens_seen": 52235392, "step": 24210 }, { "epoch": 4.4439346669113595, "grad_norm": 19.006486892700195, "learning_rate": 9.552065033680446e-06, "loss": 0.2709, "num_input_tokens_seen": 52246272, "step": 24215 }, { "epoch": 4.444852266470912, "grad_norm": 0.6802879571914673, "learning_rate": 9.551733701815985e-06, "loss": 0.1119, "num_input_tokens_seen": 52256960, "step": 24220 }, { "epoch": 4.4457698660304645, "grad_norm": 7.2316436767578125, "learning_rate": 9.55140225320661e-06, "loss": 0.0926, "num_input_tokens_seen": 52267392, "step": 24225 }, { "epoch": 4.446687465590016, "grad_norm": 0.3110688030719757, "learning_rate": 9.55107068786082e-06, "loss": 0.0113, "num_input_tokens_seen": 52277568, "step": 24230 }, { "epoch": 4.447605065149569, "grad_norm": 0.5820779204368591, "learning_rate": 9.550739005787122e-06, "loss": 0.0303, "num_input_tokens_seen": 52288064, "step": 24235 }, { "epoch": 4.448522664709121, "grad_norm": 0.09493785351514816, "learning_rate": 9.55040720699402e-06, "loss": 0.1535, "num_input_tokens_seen": 52299040, "step": 24240 }, { "epoch": 4.449440264268673, "grad_norm": 0.09960992634296417, "learning_rate": 9.550075291490026e-06, "loss": 0.2181, "num_input_tokens_seen": 52308320, "step": 24245 }, { "epoch": 4.450357863828225, "grad_norm": 15.735551834106445, "learning_rate": 9.549743259283653e-06, "loss": 0.0865, "num_input_tokens_seen": 52318368, "step": 24250 }, { "epoch": 4.451275463387778, "grad_norm": 0.37650519609451294, "learning_rate": 9.549411110383418e-06, "loss": 0.2086, "num_input_tokens_seen": 52329664, "step": 24255 }, { "epoch": 4.452193062947329, "grad_norm": 0.03371273726224899, "learning_rate": 9.549078844797837e-06, "loss": 0.0921, "num_input_tokens_seen": 52340448, "step": 24260 }, { "epoch": 4.453110662506882, "grad_norm": 1.1854249238967896, "learning_rate": 9.548746462535434e-06, "loss": 0.2464, "num_input_tokens_seen": 52351328, "step": 24265 }, { "epoch": 4.454028262066434, "grad_norm": 38.95380783081055, "learning_rate": 9.548413963604736e-06, "loss": 0.3153, "num_input_tokens_seen": 52361920, "step": 24270 }, { "epoch": 4.454945861625986, "grad_norm": 0.687324047088623, "learning_rate": 9.548081348014268e-06, "loss": 0.0091, "num_input_tokens_seen": 52372928, "step": 24275 }, { "epoch": 4.4558634611855386, "grad_norm": 0.14442533254623413, "learning_rate": 9.547748615772563e-06, "loss": 0.0054, "num_input_tokens_seen": 52384000, "step": 24280 }, { "epoch": 4.456781060745091, "grad_norm": 5.165969371795654, "learning_rate": 9.547415766888152e-06, "loss": 0.3042, "num_input_tokens_seen": 52393600, "step": 24285 }, { "epoch": 4.457698660304643, "grad_norm": 5.047612190246582, "learning_rate": 9.547082801369578e-06, "loss": 0.3786, "num_input_tokens_seen": 52404416, "step": 24290 }, { "epoch": 4.458616259864195, "grad_norm": 2.9709599018096924, "learning_rate": 9.546749719225376e-06, "loss": 0.2139, "num_input_tokens_seen": 52416416, "step": 24295 }, { "epoch": 4.459533859423748, "grad_norm": 0.646722137928009, "learning_rate": 9.54641652046409e-06, "loss": 0.0318, "num_input_tokens_seen": 52426336, "step": 24300 }, { "epoch": 4.460451458983299, "grad_norm": 0.22912219166755676, "learning_rate": 9.546083205094268e-06, "loss": 0.1075, "num_input_tokens_seen": 52437536, "step": 24305 }, { "epoch": 4.461369058542852, "grad_norm": 9.404534339904785, "learning_rate": 9.545749773124455e-06, "loss": 0.4014, "num_input_tokens_seen": 52448768, "step": 24310 }, { "epoch": 4.462286658102404, "grad_norm": 0.9003527760505676, "learning_rate": 9.545416224563207e-06, "loss": 0.1325, "num_input_tokens_seen": 52459968, "step": 24315 }, { "epoch": 4.463204257661956, "grad_norm": 0.19725026190280914, "learning_rate": 9.545082559419078e-06, "loss": 0.1977, "num_input_tokens_seen": 52471200, "step": 24320 }, { "epoch": 4.4641218572215084, "grad_norm": 26.8006534576416, "learning_rate": 9.544748777700626e-06, "loss": 0.0741, "num_input_tokens_seen": 52481952, "step": 24325 }, { "epoch": 4.465039456781061, "grad_norm": 14.675177574157715, "learning_rate": 9.54441487941641e-06, "loss": 0.1028, "num_input_tokens_seen": 52493056, "step": 24330 }, { "epoch": 4.465957056340613, "grad_norm": 2.7636215686798096, "learning_rate": 9.544080864574995e-06, "loss": 0.1765, "num_input_tokens_seen": 52503936, "step": 24335 }, { "epoch": 4.466874655900165, "grad_norm": 7.338534355163574, "learning_rate": 9.543746733184952e-06, "loss": 0.1324, "num_input_tokens_seen": 52515488, "step": 24340 }, { "epoch": 4.467792255459718, "grad_norm": 0.08078055083751678, "learning_rate": 9.543412485254845e-06, "loss": 0.0929, "num_input_tokens_seen": 52525664, "step": 24345 }, { "epoch": 4.468709855019269, "grad_norm": 3.948747396469116, "learning_rate": 9.54307812079325e-06, "loss": 0.1009, "num_input_tokens_seen": 52537280, "step": 24350 }, { "epoch": 4.469627454578822, "grad_norm": 0.24978694319725037, "learning_rate": 9.542743639808743e-06, "loss": 0.086, "num_input_tokens_seen": 52548576, "step": 24355 }, { "epoch": 4.470545054138374, "grad_norm": 10.033140182495117, "learning_rate": 9.5424090423099e-06, "loss": 0.3653, "num_input_tokens_seen": 52560320, "step": 24360 }, { "epoch": 4.471462653697926, "grad_norm": 33.11885452270508, "learning_rate": 9.542074328305307e-06, "loss": 0.0938, "num_input_tokens_seen": 52570976, "step": 24365 }, { "epoch": 4.472380253257478, "grad_norm": 0.05421167612075806, "learning_rate": 9.541739497803546e-06, "loss": 0.1555, "num_input_tokens_seen": 52582496, "step": 24370 }, { "epoch": 4.473297852817031, "grad_norm": 8.277716636657715, "learning_rate": 9.541404550813207e-06, "loss": 0.106, "num_input_tokens_seen": 52593408, "step": 24375 }, { "epoch": 4.4742154523765825, "grad_norm": 3.674628257751465, "learning_rate": 9.54106948734288e-06, "loss": 0.2761, "num_input_tokens_seen": 52604256, "step": 24380 }, { "epoch": 4.475133051936135, "grad_norm": 18.0859317779541, "learning_rate": 9.540734307401158e-06, "loss": 0.3364, "num_input_tokens_seen": 52613760, "step": 24385 }, { "epoch": 4.4760506514956875, "grad_norm": 0.15633761882781982, "learning_rate": 9.540399010996639e-06, "loss": 0.0642, "num_input_tokens_seen": 52624992, "step": 24390 }, { "epoch": 4.476968251055239, "grad_norm": 59.81486511230469, "learning_rate": 9.54006359813792e-06, "loss": 0.2261, "num_input_tokens_seen": 52636704, "step": 24395 }, { "epoch": 4.477885850614792, "grad_norm": 0.18224868178367615, "learning_rate": 9.539728068833608e-06, "loss": 0.009, "num_input_tokens_seen": 52647808, "step": 24400 }, { "epoch": 4.478803450174344, "grad_norm": 0.09482885152101517, "learning_rate": 9.539392423092309e-06, "loss": 0.0417, "num_input_tokens_seen": 52657312, "step": 24405 }, { "epoch": 4.479721049733896, "grad_norm": 3.8530938625335693, "learning_rate": 9.539056660922627e-06, "loss": 0.2266, "num_input_tokens_seen": 52667360, "step": 24410 }, { "epoch": 4.480638649293448, "grad_norm": 0.04211662709712982, "learning_rate": 9.538720782333178e-06, "loss": 0.0048, "num_input_tokens_seen": 52677952, "step": 24415 }, { "epoch": 4.481556248853001, "grad_norm": 0.6970040202140808, "learning_rate": 9.538384787332572e-06, "loss": 0.0993, "num_input_tokens_seen": 52689408, "step": 24420 }, { "epoch": 4.482473848412552, "grad_norm": 0.051374584436416626, "learning_rate": 9.538048675929434e-06, "loss": 0.0862, "num_input_tokens_seen": 52700000, "step": 24425 }, { "epoch": 4.483391447972105, "grad_norm": 0.14450418949127197, "learning_rate": 9.53771244813238e-06, "loss": 0.0792, "num_input_tokens_seen": 52711520, "step": 24430 }, { "epoch": 4.484309047531657, "grad_norm": 40.62688064575195, "learning_rate": 9.537376103950034e-06, "loss": 0.2565, "num_input_tokens_seen": 52722400, "step": 24435 }, { "epoch": 4.485226647091209, "grad_norm": 0.056356582790613174, "learning_rate": 9.537039643391025e-06, "loss": 0.2133, "num_input_tokens_seen": 52733152, "step": 24440 }, { "epoch": 4.4861442466507615, "grad_norm": 20.265727996826172, "learning_rate": 9.536703066463976e-06, "loss": 0.1542, "num_input_tokens_seen": 52743712, "step": 24445 }, { "epoch": 4.487061846210314, "grad_norm": 0.06497462093830109, "learning_rate": 9.536366373177529e-06, "loss": 0.1573, "num_input_tokens_seen": 52754304, "step": 24450 }, { "epoch": 4.487979445769866, "grad_norm": 0.06535030901432037, "learning_rate": 9.536029563540314e-06, "loss": 0.176, "num_input_tokens_seen": 52765472, "step": 24455 }, { "epoch": 4.488897045329418, "grad_norm": 55.520450592041016, "learning_rate": 9.535692637560972e-06, "loss": 0.042, "num_input_tokens_seen": 52777120, "step": 24460 }, { "epoch": 4.489814644888971, "grad_norm": 0.5222662091255188, "learning_rate": 9.535355595248142e-06, "loss": 0.016, "num_input_tokens_seen": 52788480, "step": 24465 }, { "epoch": 4.490732244448522, "grad_norm": 0.39840108156204224, "learning_rate": 9.53501843661047e-06, "loss": 0.008, "num_input_tokens_seen": 52798464, "step": 24470 }, { "epoch": 4.491649844008075, "grad_norm": 10.698315620422363, "learning_rate": 9.534681161656606e-06, "loss": 0.1157, "num_input_tokens_seen": 52810048, "step": 24475 }, { "epoch": 4.492567443567627, "grad_norm": 0.3402295708656311, "learning_rate": 9.534343770395196e-06, "loss": 0.0767, "num_input_tokens_seen": 52820448, "step": 24480 }, { "epoch": 4.493485043127179, "grad_norm": 0.03685485199093819, "learning_rate": 9.534006262834896e-06, "loss": 0.1066, "num_input_tokens_seen": 52832096, "step": 24485 }, { "epoch": 4.494402642686731, "grad_norm": 0.11976426094770432, "learning_rate": 9.533668638984363e-06, "loss": 0.0775, "num_input_tokens_seen": 52842976, "step": 24490 }, { "epoch": 4.495320242246284, "grad_norm": 0.10855798423290253, "learning_rate": 9.533330898852256e-06, "loss": 0.0025, "num_input_tokens_seen": 52851488, "step": 24495 }, { "epoch": 4.4962378418058355, "grad_norm": 12.535348892211914, "learning_rate": 9.532993042447238e-06, "loss": 0.2373, "num_input_tokens_seen": 52862112, "step": 24500 }, { "epoch": 4.497155441365388, "grad_norm": 0.893043041229248, "learning_rate": 9.532655069777972e-06, "loss": 0.0935, "num_input_tokens_seen": 52874080, "step": 24505 }, { "epoch": 4.4980730409249405, "grad_norm": 61.823543548583984, "learning_rate": 9.532316980853132e-06, "loss": 0.1798, "num_input_tokens_seen": 52885728, "step": 24510 }, { "epoch": 4.498990640484492, "grad_norm": 49.853065490722656, "learning_rate": 9.531978775681383e-06, "loss": 0.2621, "num_input_tokens_seen": 52896800, "step": 24515 }, { "epoch": 4.499908240044045, "grad_norm": 18.59262466430664, "learning_rate": 9.531640454271403e-06, "loss": 0.4636, "num_input_tokens_seen": 52907744, "step": 24520 }, { "epoch": 4.500825839603597, "grad_norm": 0.05534692853689194, "learning_rate": 9.53130201663187e-06, "loss": 0.1052, "num_input_tokens_seen": 52918784, "step": 24525 }, { "epoch": 4.50174343916315, "grad_norm": 0.1349812000989914, "learning_rate": 9.530963462771461e-06, "loss": 0.0031, "num_input_tokens_seen": 52929920, "step": 24530 }, { "epoch": 4.502661038722701, "grad_norm": 57.557090759277344, "learning_rate": 9.530624792698862e-06, "loss": 0.1917, "num_input_tokens_seen": 52939968, "step": 24535 }, { "epoch": 4.503578638282254, "grad_norm": 0.0407065823674202, "learning_rate": 9.53028600642276e-06, "loss": 0.0041, "num_input_tokens_seen": 52951296, "step": 24540 }, { "epoch": 4.504496237841806, "grad_norm": 10.902474403381348, "learning_rate": 9.529947103951843e-06, "loss": 0.2716, "num_input_tokens_seen": 52962592, "step": 24545 }, { "epoch": 4.505413837401358, "grad_norm": 0.23990055918693542, "learning_rate": 9.529608085294803e-06, "loss": 0.2213, "num_input_tokens_seen": 52972288, "step": 24550 }, { "epoch": 4.50633143696091, "grad_norm": 6.12394380569458, "learning_rate": 9.529268950460335e-06, "loss": 0.1899, "num_input_tokens_seen": 52983360, "step": 24555 }, { "epoch": 4.507249036520463, "grad_norm": 0.11797326058149338, "learning_rate": 9.528929699457138e-06, "loss": 0.1128, "num_input_tokens_seen": 52994048, "step": 24560 }, { "epoch": 4.5081666360800146, "grad_norm": 9.738866806030273, "learning_rate": 9.528590332293917e-06, "loss": 0.194, "num_input_tokens_seen": 53004800, "step": 24565 }, { "epoch": 4.509084235639567, "grad_norm": 0.17423559725284576, "learning_rate": 9.52825084897937e-06, "loss": 0.0949, "num_input_tokens_seen": 53015712, "step": 24570 }, { "epoch": 4.51000183519912, "grad_norm": 1.9038403034210205, "learning_rate": 9.527911249522207e-06, "loss": 0.1054, "num_input_tokens_seen": 53026208, "step": 24575 }, { "epoch": 4.510919434758671, "grad_norm": 0.1399168223142624, "learning_rate": 9.527571533931137e-06, "loss": 0.0971, "num_input_tokens_seen": 53037184, "step": 24580 }, { "epoch": 4.511837034318224, "grad_norm": 0.113319031894207, "learning_rate": 9.527231702214876e-06, "loss": 0.1327, "num_input_tokens_seen": 53047904, "step": 24585 }, { "epoch": 4.512754633877776, "grad_norm": 0.05952514335513115, "learning_rate": 9.52689175438214e-06, "loss": 0.0743, "num_input_tokens_seen": 53059072, "step": 24590 }, { "epoch": 4.513672233437328, "grad_norm": 0.6829928755760193, "learning_rate": 9.526551690441643e-06, "loss": 0.1627, "num_input_tokens_seen": 53069568, "step": 24595 }, { "epoch": 4.51458983299688, "grad_norm": 3.8541173934936523, "learning_rate": 9.526211510402112e-06, "loss": 0.541, "num_input_tokens_seen": 53080608, "step": 24600 }, { "epoch": 4.515507432556433, "grad_norm": 0.40178608894348145, "learning_rate": 9.525871214272272e-06, "loss": 0.0936, "num_input_tokens_seen": 53091392, "step": 24605 }, { "epoch": 4.5164250321159845, "grad_norm": 0.16392596065998077, "learning_rate": 9.525530802060847e-06, "loss": 0.0251, "num_input_tokens_seen": 53100832, "step": 24610 }, { "epoch": 4.517342631675537, "grad_norm": 0.09092940390110016, "learning_rate": 9.525190273776574e-06, "loss": 0.1178, "num_input_tokens_seen": 53112960, "step": 24615 }, { "epoch": 4.5182602312350895, "grad_norm": 0.0842198058962822, "learning_rate": 9.52484962942818e-06, "loss": 0.0042, "num_input_tokens_seen": 53123904, "step": 24620 }, { "epoch": 4.519177830794641, "grad_norm": 6.576432228088379, "learning_rate": 9.52450886902441e-06, "loss": 0.3183, "num_input_tokens_seen": 53134656, "step": 24625 }, { "epoch": 4.520095430354194, "grad_norm": 21.705291748046875, "learning_rate": 9.524167992573998e-06, "loss": 0.1278, "num_input_tokens_seen": 53145024, "step": 24630 }, { "epoch": 4.521013029913746, "grad_norm": 6.368873119354248, "learning_rate": 9.52382700008569e-06, "loss": 0.3857, "num_input_tokens_seen": 53156064, "step": 24635 }, { "epoch": 4.521930629473298, "grad_norm": 2.8577919006347656, "learning_rate": 9.523485891568229e-06, "loss": 0.0902, "num_input_tokens_seen": 53167680, "step": 24640 }, { "epoch": 4.52284822903285, "grad_norm": 4.148054599761963, "learning_rate": 9.523144667030366e-06, "loss": 0.271, "num_input_tokens_seen": 53177696, "step": 24645 }, { "epoch": 4.523765828592403, "grad_norm": 0.18204885721206665, "learning_rate": 9.522803326480853e-06, "loss": 0.098, "num_input_tokens_seen": 53188224, "step": 24650 }, { "epoch": 4.524683428151954, "grad_norm": 8.054116249084473, "learning_rate": 9.522461869928445e-06, "loss": 0.1011, "num_input_tokens_seen": 53200064, "step": 24655 }, { "epoch": 4.525601027711507, "grad_norm": 4.172760009765625, "learning_rate": 9.522120297381898e-06, "loss": 0.155, "num_input_tokens_seen": 53211520, "step": 24660 }, { "epoch": 4.526518627271059, "grad_norm": 17.61050796508789, "learning_rate": 9.521778608849973e-06, "loss": 0.25, "num_input_tokens_seen": 53222336, "step": 24665 }, { "epoch": 4.527436226830611, "grad_norm": 2.0656747817993164, "learning_rate": 9.521436804341438e-06, "loss": 0.1114, "num_input_tokens_seen": 53233632, "step": 24670 }, { "epoch": 4.5283538263901635, "grad_norm": 0.21283839643001556, "learning_rate": 9.521094883865055e-06, "loss": 0.1358, "num_input_tokens_seen": 53244576, "step": 24675 }, { "epoch": 4.529271425949716, "grad_norm": 1.9370906352996826, "learning_rate": 9.520752847429595e-06, "loss": 0.2418, "num_input_tokens_seen": 53255904, "step": 24680 }, { "epoch": 4.530189025509268, "grad_norm": 19.513978958129883, "learning_rate": 9.520410695043832e-06, "loss": 0.2562, "num_input_tokens_seen": 53267232, "step": 24685 }, { "epoch": 4.53110662506882, "grad_norm": 0.43846380710601807, "learning_rate": 9.52006842671654e-06, "loss": 0.138, "num_input_tokens_seen": 53278304, "step": 24690 }, { "epoch": 4.532024224628373, "grad_norm": 16.021915435791016, "learning_rate": 9.519726042456499e-06, "loss": 0.1189, "num_input_tokens_seen": 53290048, "step": 24695 }, { "epoch": 4.532941824187924, "grad_norm": 0.10220944136381149, "learning_rate": 9.519383542272488e-06, "loss": 0.03, "num_input_tokens_seen": 53301184, "step": 24700 }, { "epoch": 4.533859423747477, "grad_norm": 50.058895111083984, "learning_rate": 9.519040926173295e-06, "loss": 0.1431, "num_input_tokens_seen": 53311872, "step": 24705 }, { "epoch": 4.534777023307029, "grad_norm": 0.23544436693191528, "learning_rate": 9.518698194167706e-06, "loss": 0.128, "num_input_tokens_seen": 53323008, "step": 24710 }, { "epoch": 4.535694622866581, "grad_norm": 4.254037380218506, "learning_rate": 9.518355346264511e-06, "loss": 0.2164, "num_input_tokens_seen": 53333184, "step": 24715 }, { "epoch": 4.536612222426133, "grad_norm": 22.741514205932617, "learning_rate": 9.518012382472505e-06, "loss": 0.0836, "num_input_tokens_seen": 53346336, "step": 24720 }, { "epoch": 4.537529821985686, "grad_norm": 0.16671106219291687, "learning_rate": 9.517669302800483e-06, "loss": 0.1615, "num_input_tokens_seen": 53355776, "step": 24725 }, { "epoch": 4.5384474215452375, "grad_norm": 0.0536336749792099, "learning_rate": 9.517326107257245e-06, "loss": 0.0544, "num_input_tokens_seen": 53365280, "step": 24730 }, { "epoch": 4.53936502110479, "grad_norm": 5.320530414581299, "learning_rate": 9.516982795851594e-06, "loss": 0.1059, "num_input_tokens_seen": 53377120, "step": 24735 }, { "epoch": 4.5402826206643425, "grad_norm": 4.157164573669434, "learning_rate": 9.516639368592335e-06, "loss": 0.1765, "num_input_tokens_seen": 53388128, "step": 24740 }, { "epoch": 4.541200220223894, "grad_norm": 0.061118874698877335, "learning_rate": 9.516295825488278e-06, "loss": 0.2553, "num_input_tokens_seen": 53398784, "step": 24745 }, { "epoch": 4.542117819783447, "grad_norm": 0.0810529813170433, "learning_rate": 9.51595216654823e-06, "loss": 0.1443, "num_input_tokens_seen": 53408352, "step": 24750 }, { "epoch": 4.543035419342999, "grad_norm": 0.5521316528320312, "learning_rate": 9.51560839178101e-06, "loss": 0.3391, "num_input_tokens_seen": 53419264, "step": 24755 }, { "epoch": 4.543953018902551, "grad_norm": 6.987216949462891, "learning_rate": 9.515264501195431e-06, "loss": 0.2193, "num_input_tokens_seen": 53429664, "step": 24760 }, { "epoch": 4.544870618462103, "grad_norm": 0.14587293565273285, "learning_rate": 9.514920494800318e-06, "loss": 0.1711, "num_input_tokens_seen": 53439328, "step": 24765 }, { "epoch": 4.545788218021656, "grad_norm": 44.60710906982422, "learning_rate": 9.51457637260449e-06, "loss": 0.2213, "num_input_tokens_seen": 53449696, "step": 24770 }, { "epoch": 4.546705817581207, "grad_norm": 0.8607567548751831, "learning_rate": 9.514232134616777e-06, "loss": 0.3421, "num_input_tokens_seen": 53461152, "step": 24775 }, { "epoch": 4.54762341714076, "grad_norm": 4.685725688934326, "learning_rate": 9.513887780846004e-06, "loss": 0.1834, "num_input_tokens_seen": 53471136, "step": 24780 }, { "epoch": 4.548541016700312, "grad_norm": 0.4688757658004761, "learning_rate": 9.513543311301007e-06, "loss": 0.287, "num_input_tokens_seen": 53483392, "step": 24785 }, { "epoch": 4.549458616259864, "grad_norm": 0.14851343631744385, "learning_rate": 9.513198725990618e-06, "loss": 0.2139, "num_input_tokens_seen": 53494816, "step": 24790 }, { "epoch": 4.5503762158194165, "grad_norm": 14.701760292053223, "learning_rate": 9.512854024923678e-06, "loss": 0.2122, "num_input_tokens_seen": 53505728, "step": 24795 }, { "epoch": 4.551293815378969, "grad_norm": 5.872395038604736, "learning_rate": 9.512509208109026e-06, "loss": 0.308, "num_input_tokens_seen": 53516672, "step": 24800 }, { "epoch": 4.552211414938521, "grad_norm": 6.651060104370117, "learning_rate": 9.512164275555507e-06, "loss": 0.3038, "num_input_tokens_seen": 53528512, "step": 24805 }, { "epoch": 4.553129014498073, "grad_norm": 0.5337440371513367, "learning_rate": 9.511819227271965e-06, "loss": 0.221, "num_input_tokens_seen": 53538880, "step": 24810 }, { "epoch": 4.554046614057626, "grad_norm": 1.6921855211257935, "learning_rate": 9.511474063267255e-06, "loss": 0.2426, "num_input_tokens_seen": 53549344, "step": 24815 }, { "epoch": 4.554964213617177, "grad_norm": 22.021595001220703, "learning_rate": 9.511128783550228e-06, "loss": 0.1755, "num_input_tokens_seen": 53558528, "step": 24820 }, { "epoch": 4.55588181317673, "grad_norm": 0.624494194984436, "learning_rate": 9.510783388129737e-06, "loss": 0.1021, "num_input_tokens_seen": 53570208, "step": 24825 }, { "epoch": 4.556799412736282, "grad_norm": 2.0579280853271484, "learning_rate": 9.510437877014645e-06, "loss": 0.2526, "num_input_tokens_seen": 53580032, "step": 24830 }, { "epoch": 4.557717012295834, "grad_norm": 0.18325749039649963, "learning_rate": 9.510092250213811e-06, "loss": 0.1024, "num_input_tokens_seen": 53591040, "step": 24835 }, { "epoch": 4.558634611855386, "grad_norm": 0.39695656299591064, "learning_rate": 9.509746507736101e-06, "loss": 0.0146, "num_input_tokens_seen": 53602592, "step": 24840 }, { "epoch": 4.559552211414939, "grad_norm": 3.1862170696258545, "learning_rate": 9.509400649590383e-06, "loss": 0.4276, "num_input_tokens_seen": 53612480, "step": 24845 }, { "epoch": 4.560469810974491, "grad_norm": 15.149975776672363, "learning_rate": 9.509054675785528e-06, "loss": 0.1827, "num_input_tokens_seen": 53622144, "step": 24850 }, { "epoch": 4.561387410534043, "grad_norm": 3.634336233139038, "learning_rate": 9.508708586330407e-06, "loss": 0.0179, "num_input_tokens_seen": 53633664, "step": 24855 }, { "epoch": 4.562305010093596, "grad_norm": 0.23449745774269104, "learning_rate": 9.508362381233898e-06, "loss": 0.0111, "num_input_tokens_seen": 53644928, "step": 24860 }, { "epoch": 4.563222609653147, "grad_norm": 6.580470561981201, "learning_rate": 9.508016060504883e-06, "loss": 0.3081, "num_input_tokens_seen": 53655744, "step": 24865 }, { "epoch": 4.5641402092127, "grad_norm": 4.797910213470459, "learning_rate": 9.507669624152242e-06, "loss": 0.1833, "num_input_tokens_seen": 53666560, "step": 24870 }, { "epoch": 4.565057808772252, "grad_norm": 0.2974550127983093, "learning_rate": 9.507323072184864e-06, "loss": 0.1802, "num_input_tokens_seen": 53677568, "step": 24875 }, { "epoch": 4.565975408331804, "grad_norm": 10.187708854675293, "learning_rate": 9.506976404611632e-06, "loss": 0.109, "num_input_tokens_seen": 53689216, "step": 24880 }, { "epoch": 4.566893007891356, "grad_norm": 0.43725845217704773, "learning_rate": 9.506629621441442e-06, "loss": 0.0877, "num_input_tokens_seen": 53700032, "step": 24885 }, { "epoch": 4.567810607450909, "grad_norm": 0.7023571729660034, "learning_rate": 9.506282722683186e-06, "loss": 0.0112, "num_input_tokens_seen": 53710784, "step": 24890 }, { "epoch": 4.5687282070104605, "grad_norm": 0.1704610139131546, "learning_rate": 9.505935708345762e-06, "loss": 0.0516, "num_input_tokens_seen": 53720352, "step": 24895 }, { "epoch": 4.569645806570013, "grad_norm": 0.19590941071510315, "learning_rate": 9.505588578438073e-06, "loss": 0.1937, "num_input_tokens_seen": 53730560, "step": 24900 }, { "epoch": 4.5705634061295655, "grad_norm": 0.3401220142841339, "learning_rate": 9.505241332969016e-06, "loss": 0.0068, "num_input_tokens_seen": 53741408, "step": 24905 }, { "epoch": 4.571481005689117, "grad_norm": 17.505279541015625, "learning_rate": 9.504893971947503e-06, "loss": 0.0689, "num_input_tokens_seen": 53752640, "step": 24910 }, { "epoch": 4.57239860524867, "grad_norm": 0.15023624897003174, "learning_rate": 9.504546495382443e-06, "loss": 0.3325, "num_input_tokens_seen": 53763072, "step": 24915 }, { "epoch": 4.573316204808222, "grad_norm": 0.07611633092164993, "learning_rate": 9.504198903282746e-06, "loss": 0.1228, "num_input_tokens_seen": 53773184, "step": 24920 }, { "epoch": 4.574233804367774, "grad_norm": 0.9655165076255798, "learning_rate": 9.503851195657328e-06, "loss": 0.0098, "num_input_tokens_seen": 53782688, "step": 24925 }, { "epoch": 4.575151403927326, "grad_norm": 0.7536416053771973, "learning_rate": 9.503503372515107e-06, "loss": 0.3915, "num_input_tokens_seen": 53794208, "step": 24930 }, { "epoch": 4.576069003486879, "grad_norm": 0.06723442673683167, "learning_rate": 9.503155433865003e-06, "loss": 0.0448, "num_input_tokens_seen": 53803360, "step": 24935 }, { "epoch": 4.57698660304643, "grad_norm": 0.037335991859436035, "learning_rate": 9.502807379715943e-06, "loss": 0.0916, "num_input_tokens_seen": 53814848, "step": 24940 }, { "epoch": 4.577904202605983, "grad_norm": 30.229537963867188, "learning_rate": 9.502459210076853e-06, "loss": 0.0724, "num_input_tokens_seen": 53825408, "step": 24945 }, { "epoch": 4.578821802165535, "grad_norm": 0.42704540491104126, "learning_rate": 9.50211092495666e-06, "loss": 0.2725, "num_input_tokens_seen": 53836992, "step": 24950 }, { "epoch": 4.579739401725087, "grad_norm": 12.24023151397705, "learning_rate": 9.501762524364301e-06, "loss": 0.1595, "num_input_tokens_seen": 53847488, "step": 24955 }, { "epoch": 4.5806570012846395, "grad_norm": 0.04701755568385124, "learning_rate": 9.50141400830871e-06, "loss": 0.1365, "num_input_tokens_seen": 53858368, "step": 24960 }, { "epoch": 4.581574600844192, "grad_norm": 24.10062599182129, "learning_rate": 9.501065376798828e-06, "loss": 0.1519, "num_input_tokens_seen": 53869888, "step": 24965 }, { "epoch": 4.582492200403744, "grad_norm": 0.3890707790851593, "learning_rate": 9.500716629843594e-06, "loss": 0.0056, "num_input_tokens_seen": 53881632, "step": 24970 }, { "epoch": 4.583409799963296, "grad_norm": 0.044165000319480896, "learning_rate": 9.500367767451952e-06, "loss": 0.0028, "num_input_tokens_seen": 53891808, "step": 24975 }, { "epoch": 4.584327399522849, "grad_norm": 6.516854286193848, "learning_rate": 9.500018789632855e-06, "loss": 0.0772, "num_input_tokens_seen": 53903136, "step": 24980 }, { "epoch": 4.5852449990824, "grad_norm": 0.11607764661312103, "learning_rate": 9.499669696395248e-06, "loss": 0.0251, "num_input_tokens_seen": 53914592, "step": 24985 }, { "epoch": 4.586162598641953, "grad_norm": 0.06316720694303513, "learning_rate": 9.499320487748087e-06, "loss": 0.0737, "num_input_tokens_seen": 53924224, "step": 24990 }, { "epoch": 4.587080198201505, "grad_norm": 0.16560006141662598, "learning_rate": 9.49897116370033e-06, "loss": 0.0228, "num_input_tokens_seen": 53936736, "step": 24995 }, { "epoch": 4.587997797761057, "grad_norm": 0.10379546135663986, "learning_rate": 9.498621724260934e-06, "loss": 0.228, "num_input_tokens_seen": 53948352, "step": 25000 }, { "epoch": 4.588915397320609, "grad_norm": 0.09103091806173325, "learning_rate": 9.498272169438865e-06, "loss": 0.0991, "num_input_tokens_seen": 53959136, "step": 25005 }, { "epoch": 4.589832996880162, "grad_norm": 0.3433803617954254, "learning_rate": 9.497922499243085e-06, "loss": 0.0613, "num_input_tokens_seen": 53971648, "step": 25010 }, { "epoch": 4.5907505964397135, "grad_norm": 0.02402600087225437, "learning_rate": 9.497572713682565e-06, "loss": 0.0185, "num_input_tokens_seen": 53981632, "step": 25015 }, { "epoch": 4.591668195999266, "grad_norm": 3.2946248054504395, "learning_rate": 9.497222812766276e-06, "loss": 0.1065, "num_input_tokens_seen": 53992640, "step": 25020 }, { "epoch": 4.5925857955588185, "grad_norm": 1.3408492803573608, "learning_rate": 9.49687279650319e-06, "loss": 0.2892, "num_input_tokens_seen": 54003072, "step": 25025 }, { "epoch": 4.59350339511837, "grad_norm": 0.040414582937955856, "learning_rate": 9.496522664902288e-06, "loss": 0.1749, "num_input_tokens_seen": 54013600, "step": 25030 }, { "epoch": 4.594420994677923, "grad_norm": 0.10126058757305145, "learning_rate": 9.496172417972547e-06, "loss": 0.2645, "num_input_tokens_seen": 54023360, "step": 25035 }, { "epoch": 4.595338594237475, "grad_norm": 1.1947681903839111, "learning_rate": 9.495822055722953e-06, "loss": 0.0741, "num_input_tokens_seen": 54034880, "step": 25040 }, { "epoch": 4.596256193797027, "grad_norm": 4.6442670822143555, "learning_rate": 9.495471578162492e-06, "loss": 0.1933, "num_input_tokens_seen": 54046976, "step": 25045 }, { "epoch": 4.597173793356579, "grad_norm": 4.92830228805542, "learning_rate": 9.495120985300152e-06, "loss": 0.3282, "num_input_tokens_seen": 54057824, "step": 25050 }, { "epoch": 4.598091392916132, "grad_norm": 0.02789267711341381, "learning_rate": 9.494770277144925e-06, "loss": 0.13, "num_input_tokens_seen": 54069376, "step": 25055 }, { "epoch": 4.599008992475683, "grad_norm": 4.428612232208252, "learning_rate": 9.494419453705806e-06, "loss": 0.2024, "num_input_tokens_seen": 54079360, "step": 25060 }, { "epoch": 4.599926592035236, "grad_norm": 0.9355136752128601, "learning_rate": 9.494068514991794e-06, "loss": 0.2042, "num_input_tokens_seen": 54089664, "step": 25065 }, { "epoch": 4.600844191594788, "grad_norm": 0.11751634627580643, "learning_rate": 9.493717461011891e-06, "loss": 0.0066, "num_input_tokens_seen": 54101120, "step": 25070 }, { "epoch": 4.60176179115434, "grad_norm": 3.5936663150787354, "learning_rate": 9.493366291775098e-06, "loss": 0.1224, "num_input_tokens_seen": 54110816, "step": 25075 }, { "epoch": 4.6026793907138925, "grad_norm": 0.4662827253341675, "learning_rate": 9.493015007290424e-06, "loss": 0.1688, "num_input_tokens_seen": 54121568, "step": 25080 }, { "epoch": 4.603596990273445, "grad_norm": 14.671904563903809, "learning_rate": 9.49266360756688e-06, "loss": 0.1225, "num_input_tokens_seen": 54133120, "step": 25085 }, { "epoch": 4.604514589832997, "grad_norm": 9.915040969848633, "learning_rate": 9.492312092613476e-06, "loss": 0.3073, "num_input_tokens_seen": 54145024, "step": 25090 }, { "epoch": 4.605432189392549, "grad_norm": 7.030444145202637, "learning_rate": 9.49196046243923e-06, "loss": 0.3016, "num_input_tokens_seen": 54155296, "step": 25095 }, { "epoch": 4.606349788952102, "grad_norm": 6.4551825523376465, "learning_rate": 9.49160871705316e-06, "loss": 0.0552, "num_input_tokens_seen": 54164576, "step": 25100 }, { "epoch": 4.607267388511653, "grad_norm": 3.3730907440185547, "learning_rate": 9.491256856464288e-06, "loss": 0.1605, "num_input_tokens_seen": 54174688, "step": 25105 }, { "epoch": 4.608184988071206, "grad_norm": 0.38216841220855713, "learning_rate": 9.490904880681638e-06, "loss": 0.1069, "num_input_tokens_seen": 54185088, "step": 25110 }, { "epoch": 4.609102587630758, "grad_norm": 0.6470818519592285, "learning_rate": 9.490552789714238e-06, "loss": 0.0691, "num_input_tokens_seen": 54195648, "step": 25115 }, { "epoch": 4.61002018719031, "grad_norm": 0.08382438868284225, "learning_rate": 9.490200583571119e-06, "loss": 0.0461, "num_input_tokens_seen": 54205952, "step": 25120 }, { "epoch": 4.610937786749862, "grad_norm": 5.3367390632629395, "learning_rate": 9.489848262261314e-06, "loss": 0.0989, "num_input_tokens_seen": 54217408, "step": 25125 }, { "epoch": 4.611855386309415, "grad_norm": 0.06956414133310318, "learning_rate": 9.48949582579386e-06, "loss": 0.1638, "num_input_tokens_seen": 54228192, "step": 25130 }, { "epoch": 4.612772985868967, "grad_norm": 0.13298603892326355, "learning_rate": 9.489143274177797e-06, "loss": 0.0065, "num_input_tokens_seen": 54239744, "step": 25135 }, { "epoch": 4.613690585428519, "grad_norm": 0.1281926929950714, "learning_rate": 9.488790607422165e-06, "loss": 0.0166, "num_input_tokens_seen": 54250208, "step": 25140 }, { "epoch": 4.614608184988072, "grad_norm": 0.06342872977256775, "learning_rate": 9.488437825536012e-06, "loss": 0.0497, "num_input_tokens_seen": 54261472, "step": 25145 }, { "epoch": 4.615525784547623, "grad_norm": 0.3844238817691803, "learning_rate": 9.488084928528385e-06, "loss": 0.0272, "num_input_tokens_seen": 54272160, "step": 25150 }, { "epoch": 4.616443384107176, "grad_norm": 0.02469441294670105, "learning_rate": 9.487731916408334e-06, "loss": 0.1186, "num_input_tokens_seen": 54281504, "step": 25155 }, { "epoch": 4.617360983666728, "grad_norm": 36.20651626586914, "learning_rate": 9.487378789184915e-06, "loss": 0.2588, "num_input_tokens_seen": 54292896, "step": 25160 }, { "epoch": 4.61827858322628, "grad_norm": 6.436038017272949, "learning_rate": 9.487025546867187e-06, "loss": 0.1494, "num_input_tokens_seen": 54303936, "step": 25165 }, { "epoch": 4.619196182785832, "grad_norm": 0.3815179169178009, "learning_rate": 9.486672189464206e-06, "loss": 0.0497, "num_input_tokens_seen": 54315648, "step": 25170 }, { "epoch": 4.620113782345385, "grad_norm": 0.0649825781583786, "learning_rate": 9.48631871698504e-06, "loss": 0.1644, "num_input_tokens_seen": 54325568, "step": 25175 }, { "epoch": 4.6210313819049365, "grad_norm": 10.804323196411133, "learning_rate": 9.485965129438748e-06, "loss": 0.1269, "num_input_tokens_seen": 54336128, "step": 25180 }, { "epoch": 4.621948981464489, "grad_norm": 0.12724514305591583, "learning_rate": 9.485611426834405e-06, "loss": 0.2024, "num_input_tokens_seen": 54346848, "step": 25185 }, { "epoch": 4.6228665810240415, "grad_norm": 1.2116239070892334, "learning_rate": 9.48525760918108e-06, "loss": 0.067, "num_input_tokens_seen": 54357280, "step": 25190 }, { "epoch": 4.623784180583593, "grad_norm": 0.039537034928798676, "learning_rate": 9.484903676487852e-06, "loss": 0.1613, "num_input_tokens_seen": 54368384, "step": 25195 }, { "epoch": 4.624701780143146, "grad_norm": 20.436479568481445, "learning_rate": 9.484549628763793e-06, "loss": 0.344, "num_input_tokens_seen": 54378912, "step": 25200 }, { "epoch": 4.625619379702698, "grad_norm": 1.085242748260498, "learning_rate": 9.484195466017986e-06, "loss": 0.1085, "num_input_tokens_seen": 54389952, "step": 25205 }, { "epoch": 4.62653697926225, "grad_norm": 0.33556726574897766, "learning_rate": 9.483841188259516e-06, "loss": 0.1369, "num_input_tokens_seen": 54399808, "step": 25210 }, { "epoch": 4.627454578821802, "grad_norm": 19.373929977416992, "learning_rate": 9.483486795497469e-06, "loss": 0.4347, "num_input_tokens_seen": 54409888, "step": 25215 }, { "epoch": 4.628372178381355, "grad_norm": 7.748358726501465, "learning_rate": 9.483132287740934e-06, "loss": 0.1015, "num_input_tokens_seen": 54420416, "step": 25220 }, { "epoch": 4.629289777940906, "grad_norm": 0.250840961933136, "learning_rate": 9.482777664999005e-06, "loss": 0.2688, "num_input_tokens_seen": 54430944, "step": 25225 }, { "epoch": 4.630207377500459, "grad_norm": 11.076003074645996, "learning_rate": 9.482422927280775e-06, "loss": 0.3103, "num_input_tokens_seen": 54441472, "step": 25230 }, { "epoch": 4.631124977060011, "grad_norm": 0.13756826519966125, "learning_rate": 9.482068074595345e-06, "loss": 0.0332, "num_input_tokens_seen": 54452832, "step": 25235 }, { "epoch": 4.632042576619563, "grad_norm": 0.07481387257575989, "learning_rate": 9.481713106951816e-06, "loss": 0.1731, "num_input_tokens_seen": 54463104, "step": 25240 }, { "epoch": 4.6329601761791155, "grad_norm": 0.062067028135061264, "learning_rate": 9.48135802435929e-06, "loss": 0.0074, "num_input_tokens_seen": 54472288, "step": 25245 }, { "epoch": 4.633877775738668, "grad_norm": 0.040423352271318436, "learning_rate": 9.481002826826878e-06, "loss": 0.1967, "num_input_tokens_seen": 54482688, "step": 25250 }, { "epoch": 4.63479537529822, "grad_norm": 17.61248016357422, "learning_rate": 9.480647514363689e-06, "loss": 0.1361, "num_input_tokens_seen": 54493248, "step": 25255 }, { "epoch": 4.635712974857772, "grad_norm": 18.03791618347168, "learning_rate": 9.480292086978835e-06, "loss": 0.1987, "num_input_tokens_seen": 54503680, "step": 25260 }, { "epoch": 4.636630574417325, "grad_norm": 82.99226379394531, "learning_rate": 9.47993654468143e-06, "loss": 0.2602, "num_input_tokens_seen": 54515072, "step": 25265 }, { "epoch": 4.637548173976876, "grad_norm": 14.446859359741211, "learning_rate": 9.4795808874806e-06, "loss": 0.119, "num_input_tokens_seen": 54526368, "step": 25270 }, { "epoch": 4.638465773536429, "grad_norm": 0.29026123881340027, "learning_rate": 9.47922511538546e-06, "loss": 0.0046, "num_input_tokens_seen": 54537888, "step": 25275 }, { "epoch": 4.639383373095981, "grad_norm": 2.2374281883239746, "learning_rate": 9.478869228405138e-06, "loss": 0.0514, "num_input_tokens_seen": 54548192, "step": 25280 }, { "epoch": 4.640300972655533, "grad_norm": 6.1182074546813965, "learning_rate": 9.478513226548765e-06, "loss": 0.1829, "num_input_tokens_seen": 54558432, "step": 25285 }, { "epoch": 4.641218572215085, "grad_norm": 0.3813277781009674, "learning_rate": 9.478157109825466e-06, "loss": 0.0052, "num_input_tokens_seen": 54569152, "step": 25290 }, { "epoch": 4.642136171774638, "grad_norm": 0.11885568499565125, "learning_rate": 9.47780087824438e-06, "loss": 0.0037, "num_input_tokens_seen": 54579328, "step": 25295 }, { "epoch": 4.6430537713341895, "grad_norm": 0.9428800344467163, "learning_rate": 9.477444531814639e-06, "loss": 0.2978, "num_input_tokens_seen": 54590464, "step": 25300 }, { "epoch": 4.643971370893742, "grad_norm": 0.33799782395362854, "learning_rate": 9.477088070545386e-06, "loss": 0.0083, "num_input_tokens_seen": 54601920, "step": 25305 }, { "epoch": 4.6448889704532945, "grad_norm": 0.20239394903182983, "learning_rate": 9.476731494445762e-06, "loss": 0.0072, "num_input_tokens_seen": 54612672, "step": 25310 }, { "epoch": 4.645806570012846, "grad_norm": 0.05238522216677666, "learning_rate": 9.476374803524915e-06, "loss": 0.0999, "num_input_tokens_seen": 54622464, "step": 25315 }, { "epoch": 4.646724169572399, "grad_norm": 0.09220141917467117, "learning_rate": 9.476017997791991e-06, "loss": 0.0028, "num_input_tokens_seen": 54633856, "step": 25320 }, { "epoch": 4.647641769131951, "grad_norm": 14.989142417907715, "learning_rate": 9.475661077256144e-06, "loss": 0.2011, "num_input_tokens_seen": 54644224, "step": 25325 }, { "epoch": 4.648559368691503, "grad_norm": 10.540220260620117, "learning_rate": 9.475304041926525e-06, "loss": 0.4392, "num_input_tokens_seen": 54654560, "step": 25330 }, { "epoch": 4.649476968251055, "grad_norm": 0.03535496070981026, "learning_rate": 9.474946891812295e-06, "loss": 0.1105, "num_input_tokens_seen": 54666208, "step": 25335 }, { "epoch": 4.650394567810608, "grad_norm": 0.5392913222312927, "learning_rate": 9.474589626922612e-06, "loss": 0.1564, "num_input_tokens_seen": 54677088, "step": 25340 }, { "epoch": 4.651312167370159, "grad_norm": 5.42704963684082, "learning_rate": 9.47423224726664e-06, "loss": 0.272, "num_input_tokens_seen": 54687584, "step": 25345 }, { "epoch": 4.652229766929712, "grad_norm": 0.08600087463855743, "learning_rate": 9.473874752853544e-06, "loss": 0.0753, "num_input_tokens_seen": 54696480, "step": 25350 }, { "epoch": 4.653147366489264, "grad_norm": 0.018105916678905487, "learning_rate": 9.473517143692497e-06, "loss": 0.4246, "num_input_tokens_seen": 54707328, "step": 25355 }, { "epoch": 4.654064966048816, "grad_norm": 4.986767292022705, "learning_rate": 9.473159419792668e-06, "loss": 0.3868, "num_input_tokens_seen": 54718368, "step": 25360 }, { "epoch": 4.6549825656083685, "grad_norm": 0.08270084112882614, "learning_rate": 9.472801581163232e-06, "loss": 0.1628, "num_input_tokens_seen": 54728800, "step": 25365 }, { "epoch": 4.655900165167921, "grad_norm": 23.587797164916992, "learning_rate": 9.472443627813369e-06, "loss": 0.1713, "num_input_tokens_seen": 54740128, "step": 25370 }, { "epoch": 4.656817764727473, "grad_norm": 1.791684865951538, "learning_rate": 9.472085559752256e-06, "loss": 0.1503, "num_input_tokens_seen": 54750720, "step": 25375 }, { "epoch": 4.657735364287025, "grad_norm": 0.17221927642822266, "learning_rate": 9.471727376989081e-06, "loss": 0.1266, "num_input_tokens_seen": 54761984, "step": 25380 }, { "epoch": 4.658652963846578, "grad_norm": 0.02003832533955574, "learning_rate": 9.47136907953303e-06, "loss": 0.0209, "num_input_tokens_seen": 54772320, "step": 25385 }, { "epoch": 4.659570563406129, "grad_norm": 19.8448543548584, "learning_rate": 9.47101066739329e-06, "loss": 0.0768, "num_input_tokens_seen": 54783456, "step": 25390 }, { "epoch": 4.660488162965682, "grad_norm": 0.41986119747161865, "learning_rate": 9.470652140579057e-06, "loss": 0.1661, "num_input_tokens_seen": 54793888, "step": 25395 }, { "epoch": 4.661405762525234, "grad_norm": 0.0182194747030735, "learning_rate": 9.470293499099526e-06, "loss": 0.0017, "num_input_tokens_seen": 54805408, "step": 25400 }, { "epoch": 4.662323362084786, "grad_norm": 28.195093154907227, "learning_rate": 9.469934742963896e-06, "loss": 0.4268, "num_input_tokens_seen": 54815776, "step": 25405 }, { "epoch": 4.663240961644338, "grad_norm": 0.032690830528736115, "learning_rate": 9.469575872181366e-06, "loss": 0.0037, "num_input_tokens_seen": 54828064, "step": 25410 }, { "epoch": 4.664158561203891, "grad_norm": 0.03238856792449951, "learning_rate": 9.469216886761142e-06, "loss": 0.1596, "num_input_tokens_seen": 54835744, "step": 25415 }, { "epoch": 4.665076160763443, "grad_norm": 0.16973765194416046, "learning_rate": 9.468857786712434e-06, "loss": 0.0589, "num_input_tokens_seen": 54847072, "step": 25420 }, { "epoch": 4.665993760322995, "grad_norm": 0.4439946711063385, "learning_rate": 9.468498572044447e-06, "loss": 0.007, "num_input_tokens_seen": 54858560, "step": 25425 }, { "epoch": 4.666911359882548, "grad_norm": 0.10029865801334381, "learning_rate": 9.468139242766397e-06, "loss": 0.1876, "num_input_tokens_seen": 54867008, "step": 25430 }, { "epoch": 4.667828959442099, "grad_norm": 20.150196075439453, "learning_rate": 9.467779798887502e-06, "loss": 0.3281, "num_input_tokens_seen": 54878720, "step": 25435 }, { "epoch": 4.668746559001652, "grad_norm": 9.348651885986328, "learning_rate": 9.467420240416978e-06, "loss": 0.1707, "num_input_tokens_seen": 54888864, "step": 25440 }, { "epoch": 4.669664158561204, "grad_norm": 9.59784984588623, "learning_rate": 9.46706056736405e-06, "loss": 0.2107, "num_input_tokens_seen": 54899296, "step": 25445 }, { "epoch": 4.670581758120756, "grad_norm": 0.07701339572668076, "learning_rate": 9.466700779737942e-06, "loss": 0.0032, "num_input_tokens_seen": 54911392, "step": 25450 }, { "epoch": 4.671499357680308, "grad_norm": 6.551478862762451, "learning_rate": 9.466340877547882e-06, "loss": 0.5737, "num_input_tokens_seen": 54922144, "step": 25455 }, { "epoch": 4.672416957239861, "grad_norm": 9.747894287109375, "learning_rate": 9.465980860803098e-06, "loss": 0.1787, "num_input_tokens_seen": 54931680, "step": 25460 }, { "epoch": 4.6733345567994125, "grad_norm": 9.591092109680176, "learning_rate": 9.46562072951283e-06, "loss": 0.1697, "num_input_tokens_seen": 54942656, "step": 25465 }, { "epoch": 4.674252156358965, "grad_norm": 0.2321164608001709, "learning_rate": 9.465260483686309e-06, "loss": 0.0649, "num_input_tokens_seen": 54955040, "step": 25470 }, { "epoch": 4.6751697559185175, "grad_norm": 0.06387350708246231, "learning_rate": 9.46490012333278e-06, "loss": 0.0025, "num_input_tokens_seen": 54966208, "step": 25475 }, { "epoch": 4.676087355478069, "grad_norm": 0.42843931913375854, "learning_rate": 9.46453964846148e-06, "loss": 0.0246, "num_input_tokens_seen": 54976608, "step": 25480 }, { "epoch": 4.677004955037622, "grad_norm": 1.4477527141571045, "learning_rate": 9.464179059081657e-06, "loss": 0.1825, "num_input_tokens_seen": 54986848, "step": 25485 }, { "epoch": 4.677922554597174, "grad_norm": 10.457273483276367, "learning_rate": 9.463818355202562e-06, "loss": 0.0743, "num_input_tokens_seen": 54997920, "step": 25490 }, { "epoch": 4.678840154156726, "grad_norm": 0.17094704508781433, "learning_rate": 9.463457536833443e-06, "loss": 0.0033, "num_input_tokens_seen": 55008576, "step": 25495 }, { "epoch": 4.679757753716278, "grad_norm": 14.642302513122559, "learning_rate": 9.463096603983557e-06, "loss": 0.2615, "num_input_tokens_seen": 55020704, "step": 25500 }, { "epoch": 4.680675353275831, "grad_norm": 56.044471740722656, "learning_rate": 9.46273555666216e-06, "loss": 0.3608, "num_input_tokens_seen": 55032096, "step": 25505 }, { "epoch": 4.681592952835382, "grad_norm": 18.880403518676758, "learning_rate": 9.462374394878513e-06, "loss": 0.1348, "num_input_tokens_seen": 55042912, "step": 25510 }, { "epoch": 4.682510552394935, "grad_norm": 36.53056335449219, "learning_rate": 9.462013118641878e-06, "loss": 0.1097, "num_input_tokens_seen": 55053888, "step": 25515 }, { "epoch": 4.683428151954487, "grad_norm": 10.631155014038086, "learning_rate": 9.461651727961523e-06, "loss": 0.3962, "num_input_tokens_seen": 55063584, "step": 25520 }, { "epoch": 4.684345751514039, "grad_norm": 0.21901023387908936, "learning_rate": 9.461290222846716e-06, "loss": 0.1585, "num_input_tokens_seen": 55074592, "step": 25525 }, { "epoch": 4.6852633510735915, "grad_norm": 0.119644895195961, "learning_rate": 9.460928603306728e-06, "loss": 0.0644, "num_input_tokens_seen": 55085120, "step": 25530 }, { "epoch": 4.686180950633144, "grad_norm": 3.9372072219848633, "learning_rate": 9.460566869350835e-06, "loss": 0.3016, "num_input_tokens_seen": 55097024, "step": 25535 }, { "epoch": 4.687098550192696, "grad_norm": 0.11463363468647003, "learning_rate": 9.460205020988316e-06, "loss": 0.0969, "num_input_tokens_seen": 55107168, "step": 25540 }, { "epoch": 4.688016149752248, "grad_norm": 0.1819506287574768, "learning_rate": 9.459843058228451e-06, "loss": 0.1055, "num_input_tokens_seen": 55118368, "step": 25545 }, { "epoch": 4.688933749311801, "grad_norm": 0.05958584323525429, "learning_rate": 9.459480981080523e-06, "loss": 0.2934, "num_input_tokens_seen": 55128320, "step": 25550 }, { "epoch": 4.689851348871352, "grad_norm": 16.650148391723633, "learning_rate": 9.459118789553818e-06, "loss": 0.1915, "num_input_tokens_seen": 55139136, "step": 25555 }, { "epoch": 4.690768948430905, "grad_norm": 24.602100372314453, "learning_rate": 9.45875648365763e-06, "loss": 0.0667, "num_input_tokens_seen": 55149376, "step": 25560 }, { "epoch": 4.691686547990457, "grad_norm": 3.536714553833008, "learning_rate": 9.458394063401249e-06, "loss": 0.0596, "num_input_tokens_seen": 55158976, "step": 25565 }, { "epoch": 4.692604147550009, "grad_norm": 0.0702250525355339, "learning_rate": 9.458031528793968e-06, "loss": 0.0495, "num_input_tokens_seen": 55169568, "step": 25570 }, { "epoch": 4.693521747109561, "grad_norm": 0.363816499710083, "learning_rate": 9.457668879845088e-06, "loss": 0.1299, "num_input_tokens_seen": 55180800, "step": 25575 }, { "epoch": 4.694439346669114, "grad_norm": 15.615225791931152, "learning_rate": 9.457306116563909e-06, "loss": 0.1721, "num_input_tokens_seen": 55191264, "step": 25580 }, { "epoch": 4.6953569462286655, "grad_norm": 0.06351540982723236, "learning_rate": 9.456943238959738e-06, "loss": 0.0381, "num_input_tokens_seen": 55202656, "step": 25585 }, { "epoch": 4.696274545788218, "grad_norm": 7.931920528411865, "learning_rate": 9.45658024704188e-06, "loss": 0.2951, "num_input_tokens_seen": 55213184, "step": 25590 }, { "epoch": 4.6971921453477705, "grad_norm": 6.428482532501221, "learning_rate": 9.456217140819645e-06, "loss": 0.2032, "num_input_tokens_seen": 55224928, "step": 25595 }, { "epoch": 4.698109744907322, "grad_norm": 14.0687255859375, "learning_rate": 9.45585392030235e-06, "loss": 0.1054, "num_input_tokens_seen": 55236832, "step": 25600 }, { "epoch": 4.699027344466875, "grad_norm": 1.0918309688568115, "learning_rate": 9.455490585499304e-06, "loss": 0.2118, "num_input_tokens_seen": 55246624, "step": 25605 }, { "epoch": 4.699944944026427, "grad_norm": 0.5665529370307922, "learning_rate": 9.455127136419832e-06, "loss": 0.1214, "num_input_tokens_seen": 55257856, "step": 25610 }, { "epoch": 4.700862543585979, "grad_norm": 6.499648571014404, "learning_rate": 9.454763573073253e-06, "loss": 0.3107, "num_input_tokens_seen": 55268800, "step": 25615 }, { "epoch": 4.701780143145531, "grad_norm": 11.316855430603027, "learning_rate": 9.454399895468893e-06, "loss": 0.1642, "num_input_tokens_seen": 55278784, "step": 25620 }, { "epoch": 4.702697742705084, "grad_norm": 14.649163246154785, "learning_rate": 9.454036103616078e-06, "loss": 0.1749, "num_input_tokens_seen": 55288448, "step": 25625 }, { "epoch": 4.703615342264635, "grad_norm": 1.1534305810928345, "learning_rate": 9.453672197524142e-06, "loss": 0.1622, "num_input_tokens_seen": 55297856, "step": 25630 }, { "epoch": 4.704532941824188, "grad_norm": 18.936885833740234, "learning_rate": 9.453308177202416e-06, "loss": 0.4016, "num_input_tokens_seen": 55308320, "step": 25635 }, { "epoch": 4.70545054138374, "grad_norm": 7.562952995300293, "learning_rate": 9.452944042660238e-06, "loss": 0.1997, "num_input_tokens_seen": 55317824, "step": 25640 }, { "epoch": 4.706368140943292, "grad_norm": 6.311835765838623, "learning_rate": 9.452579793906945e-06, "loss": 0.1122, "num_input_tokens_seen": 55329216, "step": 25645 }, { "epoch": 4.7072857405028445, "grad_norm": 0.15973061323165894, "learning_rate": 9.452215430951883e-06, "loss": 0.2761, "num_input_tokens_seen": 55340864, "step": 25650 }, { "epoch": 4.708203340062397, "grad_norm": 28.2310848236084, "learning_rate": 9.451850953804393e-06, "loss": 0.1287, "num_input_tokens_seen": 55352640, "step": 25655 }, { "epoch": 4.709120939621949, "grad_norm": 0.24692402780056, "learning_rate": 9.451486362473829e-06, "loss": 0.0064, "num_input_tokens_seen": 55363328, "step": 25660 }, { "epoch": 4.710038539181501, "grad_norm": 0.8200125098228455, "learning_rate": 9.451121656969537e-06, "loss": 0.0664, "num_input_tokens_seen": 55372416, "step": 25665 }, { "epoch": 4.710956138741054, "grad_norm": 0.12224438786506653, "learning_rate": 9.450756837300873e-06, "loss": 0.0941, "num_input_tokens_seen": 55382752, "step": 25670 }, { "epoch": 4.711873738300605, "grad_norm": 0.13381874561309814, "learning_rate": 9.450391903477196e-06, "loss": 0.0798, "num_input_tokens_seen": 55393248, "step": 25675 }, { "epoch": 4.712791337860158, "grad_norm": 23.995445251464844, "learning_rate": 9.450026855507861e-06, "loss": 0.0248, "num_input_tokens_seen": 55403680, "step": 25680 }, { "epoch": 4.71370893741971, "grad_norm": 1.5148674249649048, "learning_rate": 9.449661693402237e-06, "loss": 0.1697, "num_input_tokens_seen": 55413664, "step": 25685 }, { "epoch": 4.714626536979262, "grad_norm": 7.3228840827941895, "learning_rate": 9.449296417169685e-06, "loss": 0.1473, "num_input_tokens_seen": 55424032, "step": 25690 }, { "epoch": 4.715544136538814, "grad_norm": 7.215498924255371, "learning_rate": 9.448931026819577e-06, "loss": 0.0743, "num_input_tokens_seen": 55435232, "step": 25695 }, { "epoch": 4.716461736098367, "grad_norm": 74.89575958251953, "learning_rate": 9.448565522361282e-06, "loss": 0.3516, "num_input_tokens_seen": 55445696, "step": 25700 }, { "epoch": 4.717379335657919, "grad_norm": 0.19239750504493713, "learning_rate": 9.448199903804178e-06, "loss": 0.147, "num_input_tokens_seen": 55457024, "step": 25705 }, { "epoch": 4.718296935217471, "grad_norm": 0.9030780792236328, "learning_rate": 9.44783417115764e-06, "loss": 0.161, "num_input_tokens_seen": 55467744, "step": 25710 }, { "epoch": 4.719214534777024, "grad_norm": 0.026946546509861946, "learning_rate": 9.447468324431049e-06, "loss": 0.0824, "num_input_tokens_seen": 55479488, "step": 25715 }, { "epoch": 4.720132134336575, "grad_norm": 4.207733631134033, "learning_rate": 9.447102363633787e-06, "loss": 0.2995, "num_input_tokens_seen": 55490528, "step": 25720 }, { "epoch": 4.721049733896128, "grad_norm": 32.028079986572266, "learning_rate": 9.446736288775242e-06, "loss": 0.1294, "num_input_tokens_seen": 55502080, "step": 25725 }, { "epoch": 4.72196733345568, "grad_norm": 0.06631498783826828, "learning_rate": 9.446370099864803e-06, "loss": 0.1081, "num_input_tokens_seen": 55512160, "step": 25730 }, { "epoch": 4.722884933015232, "grad_norm": 0.08300282061100006, "learning_rate": 9.446003796911864e-06, "loss": 0.2062, "num_input_tokens_seen": 55523392, "step": 25735 }, { "epoch": 4.723802532574784, "grad_norm": 23.714353561401367, "learning_rate": 9.445637379925816e-06, "loss": 0.0452, "num_input_tokens_seen": 55534656, "step": 25740 }, { "epoch": 4.724720132134337, "grad_norm": 0.16816744208335876, "learning_rate": 9.445270848916061e-06, "loss": 0.1229, "num_input_tokens_seen": 55545952, "step": 25745 }, { "epoch": 4.7256377316938885, "grad_norm": 10.803869247436523, "learning_rate": 9.444904203891999e-06, "loss": 0.0887, "num_input_tokens_seen": 55557920, "step": 25750 }, { "epoch": 4.726555331253441, "grad_norm": 10.987812995910645, "learning_rate": 9.44453744486303e-06, "loss": 0.2335, "num_input_tokens_seen": 55568416, "step": 25755 }, { "epoch": 4.7274729308129935, "grad_norm": 0.5817030072212219, "learning_rate": 9.444170571838566e-06, "loss": 0.1625, "num_input_tokens_seen": 55579264, "step": 25760 }, { "epoch": 4.728390530372545, "grad_norm": 0.1027279868721962, "learning_rate": 9.443803584828016e-06, "loss": 0.0763, "num_input_tokens_seen": 55589440, "step": 25765 }, { "epoch": 4.729308129932098, "grad_norm": 0.08669397979974747, "learning_rate": 9.443436483840788e-06, "loss": 0.0421, "num_input_tokens_seen": 55601120, "step": 25770 }, { "epoch": 4.73022572949165, "grad_norm": 13.064998626708984, "learning_rate": 9.443069268886304e-06, "loss": 0.2912, "num_input_tokens_seen": 55610752, "step": 25775 }, { "epoch": 4.731143329051202, "grad_norm": 0.15460853278636932, "learning_rate": 9.442701939973978e-06, "loss": 0.1791, "num_input_tokens_seen": 55621216, "step": 25780 }, { "epoch": 4.732060928610754, "grad_norm": 28.561872482299805, "learning_rate": 9.442334497113233e-06, "loss": 0.0589, "num_input_tokens_seen": 55633088, "step": 25785 }, { "epoch": 4.732978528170307, "grad_norm": 0.0759604275226593, "learning_rate": 9.441966940313493e-06, "loss": 0.1707, "num_input_tokens_seen": 55644352, "step": 25790 }, { "epoch": 4.733896127729858, "grad_norm": 0.0653078630566597, "learning_rate": 9.441599269584185e-06, "loss": 0.1099, "num_input_tokens_seen": 55655328, "step": 25795 }, { "epoch": 4.734813727289411, "grad_norm": 33.771751403808594, "learning_rate": 9.44123148493474e-06, "loss": 0.1658, "num_input_tokens_seen": 55665632, "step": 25800 }, { "epoch": 4.735731326848963, "grad_norm": 40.489898681640625, "learning_rate": 9.44086358637459e-06, "loss": 0.2452, "num_input_tokens_seen": 55676928, "step": 25805 }, { "epoch": 4.736648926408515, "grad_norm": 2.115757465362549, "learning_rate": 9.440495573913174e-06, "loss": 0.1256, "num_input_tokens_seen": 55688800, "step": 25810 }, { "epoch": 4.7375665259680675, "grad_norm": 0.0935039147734642, "learning_rate": 9.440127447559926e-06, "loss": 0.0544, "num_input_tokens_seen": 55700160, "step": 25815 }, { "epoch": 4.73848412552762, "grad_norm": 0.24015803635120392, "learning_rate": 9.439759207324292e-06, "loss": 0.0917, "num_input_tokens_seen": 55711136, "step": 25820 }, { "epoch": 4.739401725087172, "grad_norm": 0.05820830166339874, "learning_rate": 9.439390853215716e-06, "loss": 0.2063, "num_input_tokens_seen": 55722880, "step": 25825 }, { "epoch": 4.740319324646724, "grad_norm": 0.06837176531553268, "learning_rate": 9.439022385243643e-06, "loss": 0.1072, "num_input_tokens_seen": 55734144, "step": 25830 }, { "epoch": 4.741236924206277, "grad_norm": 0.486105740070343, "learning_rate": 9.438653803417526e-06, "loss": 0.0945, "num_input_tokens_seen": 55746176, "step": 25835 }, { "epoch": 4.742154523765828, "grad_norm": 9.523035049438477, "learning_rate": 9.438285107746819e-06, "loss": 0.1243, "num_input_tokens_seen": 55756608, "step": 25840 }, { "epoch": 4.743072123325381, "grad_norm": 7.278800964355469, "learning_rate": 9.437916298240979e-06, "loss": 0.4973, "num_input_tokens_seen": 55767808, "step": 25845 }, { "epoch": 4.743989722884933, "grad_norm": 8.647076606750488, "learning_rate": 9.437547374909462e-06, "loss": 0.1846, "num_input_tokens_seen": 55778368, "step": 25850 }, { "epoch": 4.744907322444485, "grad_norm": 0.29223671555519104, "learning_rate": 9.437178337761733e-06, "loss": 0.1908, "num_input_tokens_seen": 55790144, "step": 25855 }, { "epoch": 4.745824922004037, "grad_norm": 0.09575367718935013, "learning_rate": 9.436809186807257e-06, "loss": 0.0089, "num_input_tokens_seen": 55801152, "step": 25860 }, { "epoch": 4.74674252156359, "grad_norm": 2.894073486328125, "learning_rate": 9.436439922055502e-06, "loss": 0.1458, "num_input_tokens_seen": 55811936, "step": 25865 }, { "epoch": 4.7476601211231415, "grad_norm": 15.625991821289062, "learning_rate": 9.436070543515939e-06, "loss": 0.2034, "num_input_tokens_seen": 55821888, "step": 25870 }, { "epoch": 4.748577720682694, "grad_norm": 0.04123656824231148, "learning_rate": 9.43570105119804e-06, "loss": 0.0241, "num_input_tokens_seen": 55832960, "step": 25875 }, { "epoch": 4.7494953202422465, "grad_norm": 0.18740519881248474, "learning_rate": 9.435331445111285e-06, "loss": 0.0054, "num_input_tokens_seen": 55844608, "step": 25880 }, { "epoch": 4.750412919801798, "grad_norm": 0.019443659111857414, "learning_rate": 9.434961725265153e-06, "loss": 0.0017, "num_input_tokens_seen": 55855328, "step": 25885 }, { "epoch": 4.751330519361351, "grad_norm": 6.601075649261475, "learning_rate": 9.434591891669125e-06, "loss": 0.2591, "num_input_tokens_seen": 55866400, "step": 25890 }, { "epoch": 4.752248118920903, "grad_norm": 0.3300332725048065, "learning_rate": 9.43422194433269e-06, "loss": 0.0038, "num_input_tokens_seen": 55877376, "step": 25895 }, { "epoch": 4.753165718480455, "grad_norm": 0.0690932646393776, "learning_rate": 9.433851883265334e-06, "loss": 0.0458, "num_input_tokens_seen": 55887648, "step": 25900 }, { "epoch": 4.754083318040007, "grad_norm": 42.23427963256836, "learning_rate": 9.433481708476548e-06, "loss": 0.1902, "num_input_tokens_seen": 55898656, "step": 25905 }, { "epoch": 4.75500091759956, "grad_norm": 0.13231715559959412, "learning_rate": 9.433111419975828e-06, "loss": 0.01, "num_input_tokens_seen": 55910016, "step": 25910 }, { "epoch": 4.755918517159111, "grad_norm": 0.09240362793207169, "learning_rate": 9.432741017772671e-06, "loss": 0.1198, "num_input_tokens_seen": 55921472, "step": 25915 }, { "epoch": 4.756836116718664, "grad_norm": 0.049897219985723495, "learning_rate": 9.432370501876577e-06, "loss": 0.0034, "num_input_tokens_seen": 55931808, "step": 25920 }, { "epoch": 4.757753716278216, "grad_norm": 0.11940499395132065, "learning_rate": 9.431999872297048e-06, "loss": 0.152, "num_input_tokens_seen": 55942336, "step": 25925 }, { "epoch": 4.758671315837768, "grad_norm": 5.215487957000732, "learning_rate": 9.431629129043593e-06, "loss": 0.2826, "num_input_tokens_seen": 55952736, "step": 25930 }, { "epoch": 4.7595889153973205, "grad_norm": 0.4318558871746063, "learning_rate": 9.431258272125718e-06, "loss": 0.0935, "num_input_tokens_seen": 55963680, "step": 25935 }, { "epoch": 4.760506514956873, "grad_norm": 5.250555992126465, "learning_rate": 9.430887301552936e-06, "loss": 0.0306, "num_input_tokens_seen": 55974080, "step": 25940 }, { "epoch": 4.761424114516425, "grad_norm": 8.65074634552002, "learning_rate": 9.430516217334762e-06, "loss": 0.2568, "num_input_tokens_seen": 55985344, "step": 25945 }, { "epoch": 4.762341714075977, "grad_norm": 3.5891613960266113, "learning_rate": 9.430145019480715e-06, "loss": 0.0757, "num_input_tokens_seen": 55996192, "step": 25950 }, { "epoch": 4.76325931363553, "grad_norm": 23.770305633544922, "learning_rate": 9.429773708000314e-06, "loss": 0.1195, "num_input_tokens_seen": 56006240, "step": 25955 }, { "epoch": 4.764176913195081, "grad_norm": 0.07809089124202728, "learning_rate": 9.429402282903082e-06, "loss": 0.1034, "num_input_tokens_seen": 56016576, "step": 25960 }, { "epoch": 4.765094512754634, "grad_norm": 0.07793329656124115, "learning_rate": 9.429030744198547e-06, "loss": 0.1239, "num_input_tokens_seen": 56026240, "step": 25965 }, { "epoch": 4.766012112314186, "grad_norm": 0.13272172212600708, "learning_rate": 9.428659091896237e-06, "loss": 0.0712, "num_input_tokens_seen": 56036128, "step": 25970 }, { "epoch": 4.766929711873738, "grad_norm": 16.26955795288086, "learning_rate": 9.428287326005687e-06, "loss": 0.2288, "num_input_tokens_seen": 56047520, "step": 25975 }, { "epoch": 4.76784731143329, "grad_norm": 0.04799269884824753, "learning_rate": 9.427915446536428e-06, "loss": 0.3425, "num_input_tokens_seen": 56057472, "step": 25980 }, { "epoch": 4.768764910992843, "grad_norm": 0.05205145850777626, "learning_rate": 9.427543453498003e-06, "loss": 0.1159, "num_input_tokens_seen": 56068736, "step": 25985 }, { "epoch": 4.769682510552395, "grad_norm": 0.0805225595831871, "learning_rate": 9.427171346899949e-06, "loss": 0.1214, "num_input_tokens_seen": 56080416, "step": 25990 }, { "epoch": 4.770600110111947, "grad_norm": 0.5228779315948486, "learning_rate": 9.426799126751811e-06, "loss": 0.0285, "num_input_tokens_seen": 56091136, "step": 25995 }, { "epoch": 4.7715177096715, "grad_norm": 0.15707585215568542, "learning_rate": 9.426426793063136e-06, "loss": 0.0044, "num_input_tokens_seen": 56100832, "step": 26000 }, { "epoch": 4.772435309231051, "grad_norm": 0.03209733963012695, "learning_rate": 9.426054345843476e-06, "loss": 0.244, "num_input_tokens_seen": 56112032, "step": 26005 }, { "epoch": 4.773352908790604, "grad_norm": 0.7504227757453918, "learning_rate": 9.42568178510238e-06, "loss": 0.1572, "num_input_tokens_seen": 56122880, "step": 26010 }, { "epoch": 4.774270508350156, "grad_norm": 0.9457300305366516, "learning_rate": 9.425309110849407e-06, "loss": 0.1184, "num_input_tokens_seen": 56134208, "step": 26015 }, { "epoch": 4.775188107909708, "grad_norm": 22.60230255126953, "learning_rate": 9.424936323094111e-06, "loss": 0.2003, "num_input_tokens_seen": 56144448, "step": 26020 }, { "epoch": 4.77610570746926, "grad_norm": 0.1284276694059372, "learning_rate": 9.42456342184606e-06, "loss": 0.1076, "num_input_tokens_seen": 56154816, "step": 26025 }, { "epoch": 4.777023307028813, "grad_norm": 5.700422286987305, "learning_rate": 9.424190407114812e-06, "loss": 0.2254, "num_input_tokens_seen": 56164320, "step": 26030 }, { "epoch": 4.7779409065883645, "grad_norm": 0.11270678043365479, "learning_rate": 9.42381727890994e-06, "loss": 0.313, "num_input_tokens_seen": 56175136, "step": 26035 }, { "epoch": 4.778858506147917, "grad_norm": 14.40765380859375, "learning_rate": 9.423444037241007e-06, "loss": 0.3023, "num_input_tokens_seen": 56185600, "step": 26040 }, { "epoch": 4.7797761057074695, "grad_norm": 0.3723125457763672, "learning_rate": 9.423070682117592e-06, "loss": 0.1607, "num_input_tokens_seen": 56196896, "step": 26045 }, { "epoch": 4.780693705267021, "grad_norm": 8.475874900817871, "learning_rate": 9.42269721354927e-06, "loss": 0.0859, "num_input_tokens_seen": 56207872, "step": 26050 }, { "epoch": 4.781611304826574, "grad_norm": 80.83881378173828, "learning_rate": 9.422323631545618e-06, "loss": 0.0319, "num_input_tokens_seen": 56220000, "step": 26055 }, { "epoch": 4.782528904386126, "grad_norm": 0.33009424805641174, "learning_rate": 9.42194993611622e-06, "loss": 0.3266, "num_input_tokens_seen": 56230784, "step": 26060 }, { "epoch": 4.783446503945678, "grad_norm": 0.039389293640851974, "learning_rate": 9.421576127270658e-06, "loss": 0.0039, "num_input_tokens_seen": 56241728, "step": 26065 }, { "epoch": 4.78436410350523, "grad_norm": 0.8372045755386353, "learning_rate": 9.421202205018522e-06, "loss": 0.0175, "num_input_tokens_seen": 56253984, "step": 26070 }, { "epoch": 4.785281703064783, "grad_norm": 1.4073127508163452, "learning_rate": 9.420828169369403e-06, "loss": 0.3247, "num_input_tokens_seen": 56265408, "step": 26075 }, { "epoch": 4.786199302624334, "grad_norm": 0.20896275341510773, "learning_rate": 9.42045402033289e-06, "loss": 0.1773, "num_input_tokens_seen": 56276896, "step": 26080 }, { "epoch": 4.787116902183887, "grad_norm": 15.9902982711792, "learning_rate": 9.420079757918585e-06, "loss": 0.0703, "num_input_tokens_seen": 56287232, "step": 26085 }, { "epoch": 4.788034501743439, "grad_norm": 27.85875129699707, "learning_rate": 9.419705382136084e-06, "loss": 0.1261, "num_input_tokens_seen": 56298048, "step": 26090 }, { "epoch": 4.788952101302991, "grad_norm": 29.556272506713867, "learning_rate": 9.41933089299499e-06, "loss": 0.237, "num_input_tokens_seen": 56308512, "step": 26095 }, { "epoch": 4.7898697008625435, "grad_norm": 7.020510673522949, "learning_rate": 9.418956290504908e-06, "loss": 0.1675, "num_input_tokens_seen": 56319520, "step": 26100 }, { "epoch": 4.790787300422096, "grad_norm": 0.30766111612319946, "learning_rate": 9.418581574675445e-06, "loss": 0.2503, "num_input_tokens_seen": 56330560, "step": 26105 }, { "epoch": 4.791704899981648, "grad_norm": 0.874790370464325, "learning_rate": 9.418206745516213e-06, "loss": 0.1489, "num_input_tokens_seen": 56340224, "step": 26110 }, { "epoch": 4.7926224995412, "grad_norm": 0.32637032866477966, "learning_rate": 9.417831803036826e-06, "loss": 0.0936, "num_input_tokens_seen": 56349664, "step": 26115 }, { "epoch": 4.793540099100753, "grad_norm": 2.9965951442718506, "learning_rate": 9.4174567472469e-06, "loss": 0.1027, "num_input_tokens_seen": 56360064, "step": 26120 }, { "epoch": 4.794457698660304, "grad_norm": 1.3385193347930908, "learning_rate": 9.417081578156055e-06, "loss": 0.0948, "num_input_tokens_seen": 56370496, "step": 26125 }, { "epoch": 4.795375298219857, "grad_norm": 0.46051251888275146, "learning_rate": 9.416706295773914e-06, "loss": 0.0633, "num_input_tokens_seen": 56381376, "step": 26130 }, { "epoch": 4.796292897779409, "grad_norm": 11.223549842834473, "learning_rate": 9.416330900110102e-06, "loss": 0.2846, "num_input_tokens_seen": 56391840, "step": 26135 }, { "epoch": 4.797210497338961, "grad_norm": 0.09073437750339508, "learning_rate": 9.415955391174245e-06, "loss": 0.0985, "num_input_tokens_seen": 56404352, "step": 26140 }, { "epoch": 4.798128096898513, "grad_norm": 11.113865852355957, "learning_rate": 9.415579768975979e-06, "loss": 0.1184, "num_input_tokens_seen": 56414752, "step": 26145 }, { "epoch": 4.799045696458066, "grad_norm": 1.9059667587280273, "learning_rate": 9.415204033524934e-06, "loss": 0.0618, "num_input_tokens_seen": 56425152, "step": 26150 }, { "epoch": 4.7999632960176175, "grad_norm": 0.4040699899196625, "learning_rate": 9.41482818483075e-06, "loss": 0.0859, "num_input_tokens_seen": 56436192, "step": 26155 }, { "epoch": 4.80088089557717, "grad_norm": 6.568210601806641, "learning_rate": 9.414452222903064e-06, "loss": 0.1393, "num_input_tokens_seen": 56447072, "step": 26160 }, { "epoch": 4.8017984951367225, "grad_norm": 15.041221618652344, "learning_rate": 9.41407614775152e-06, "loss": 0.1655, "num_input_tokens_seen": 56458432, "step": 26165 }, { "epoch": 4.802716094696274, "grad_norm": 0.19072631001472473, "learning_rate": 9.413699959385762e-06, "loss": 0.2304, "num_input_tokens_seen": 56469632, "step": 26170 }, { "epoch": 4.803633694255827, "grad_norm": 7.193053245544434, "learning_rate": 9.413323657815444e-06, "loss": 0.214, "num_input_tokens_seen": 56480704, "step": 26175 }, { "epoch": 4.804551293815379, "grad_norm": 0.2645215094089508, "learning_rate": 9.412947243050213e-06, "loss": 0.2472, "num_input_tokens_seen": 56490976, "step": 26180 }, { "epoch": 4.805468893374931, "grad_norm": 0.07218215614557266, "learning_rate": 9.412570715099725e-06, "loss": 0.1806, "num_input_tokens_seen": 56502208, "step": 26185 }, { "epoch": 4.806386492934483, "grad_norm": 40.399017333984375, "learning_rate": 9.412194073973637e-06, "loss": 0.2476, "num_input_tokens_seen": 56513024, "step": 26190 }, { "epoch": 4.807304092494036, "grad_norm": 0.10416270047426224, "learning_rate": 9.411817319681608e-06, "loss": 0.1052, "num_input_tokens_seen": 56524064, "step": 26195 }, { "epoch": 4.808221692053587, "grad_norm": 0.17778198421001434, "learning_rate": 9.411440452233305e-06, "loss": 0.1547, "num_input_tokens_seen": 56534400, "step": 26200 }, { "epoch": 4.80913929161314, "grad_norm": 1.991815447807312, "learning_rate": 9.41106347163839e-06, "loss": 0.3268, "num_input_tokens_seen": 56546592, "step": 26205 }, { "epoch": 4.810056891172692, "grad_norm": 0.43028849363327026, "learning_rate": 9.410686377906532e-06, "loss": 0.0162, "num_input_tokens_seen": 56556992, "step": 26210 }, { "epoch": 4.810974490732244, "grad_norm": 0.5080344080924988, "learning_rate": 9.410309171047407e-06, "loss": 0.0239, "num_input_tokens_seen": 56567680, "step": 26215 }, { "epoch": 4.8118920902917965, "grad_norm": 76.41572570800781, "learning_rate": 9.409931851070687e-06, "loss": 0.1307, "num_input_tokens_seen": 56578880, "step": 26220 }, { "epoch": 4.812809689851349, "grad_norm": 9.650664329528809, "learning_rate": 9.40955441798605e-06, "loss": 0.1227, "num_input_tokens_seen": 56589568, "step": 26225 }, { "epoch": 4.813727289410901, "grad_norm": 0.059194035828113556, "learning_rate": 9.409176871803175e-06, "loss": 0.0745, "num_input_tokens_seen": 56600032, "step": 26230 }, { "epoch": 4.814644888970453, "grad_norm": 0.16237398982048035, "learning_rate": 9.408799212531745e-06, "loss": 0.0065, "num_input_tokens_seen": 56611040, "step": 26235 }, { "epoch": 4.815562488530006, "grad_norm": 0.06039385870099068, "learning_rate": 9.40842144018145e-06, "loss": 0.0077, "num_input_tokens_seen": 56621632, "step": 26240 }, { "epoch": 4.816480088089557, "grad_norm": 55.69379806518555, "learning_rate": 9.408043554761979e-06, "loss": 0.2714, "num_input_tokens_seen": 56632864, "step": 26245 }, { "epoch": 4.81739768764911, "grad_norm": 9.385635375976562, "learning_rate": 9.40766555628302e-06, "loss": 0.4592, "num_input_tokens_seen": 56644128, "step": 26250 }, { "epoch": 4.818315287208662, "grad_norm": 0.04237307608127594, "learning_rate": 9.407287444754275e-06, "loss": 0.0413, "num_input_tokens_seen": 56655424, "step": 26255 }, { "epoch": 4.819232886768214, "grad_norm": 0.8724440932273865, "learning_rate": 9.406909220185435e-06, "loss": 0.1037, "num_input_tokens_seen": 56666176, "step": 26260 }, { "epoch": 4.820150486327766, "grad_norm": 0.342090904712677, "learning_rate": 9.406530882586202e-06, "loss": 0.1025, "num_input_tokens_seen": 56677664, "step": 26265 }, { "epoch": 4.821068085887319, "grad_norm": 6.813841342926025, "learning_rate": 9.406152431966283e-06, "loss": 0.1977, "num_input_tokens_seen": 56688352, "step": 26270 }, { "epoch": 4.821985685446871, "grad_norm": 6.245283126831055, "learning_rate": 9.405773868335384e-06, "loss": 0.1096, "num_input_tokens_seen": 56699552, "step": 26275 }, { "epoch": 4.822903285006423, "grad_norm": 13.428279876708984, "learning_rate": 9.405395191703212e-06, "loss": 0.0846, "num_input_tokens_seen": 56710144, "step": 26280 }, { "epoch": 4.823820884565976, "grad_norm": 15.225489616394043, "learning_rate": 9.40501640207948e-06, "loss": 0.1847, "num_input_tokens_seen": 56720064, "step": 26285 }, { "epoch": 4.824738484125527, "grad_norm": 47.76399230957031, "learning_rate": 9.404637499473907e-06, "loss": 0.1917, "num_input_tokens_seen": 56729120, "step": 26290 }, { "epoch": 4.82565608368508, "grad_norm": 9.852595329284668, "learning_rate": 9.404258483896208e-06, "loss": 0.1931, "num_input_tokens_seen": 56739392, "step": 26295 }, { "epoch": 4.826573683244632, "grad_norm": 0.10493435710668564, "learning_rate": 9.403879355356104e-06, "loss": 0.0136, "num_input_tokens_seen": 56750016, "step": 26300 }, { "epoch": 4.827491282804184, "grad_norm": 0.15862511098384857, "learning_rate": 9.403500113863321e-06, "loss": 0.3373, "num_input_tokens_seen": 56760736, "step": 26305 }, { "epoch": 4.828408882363736, "grad_norm": 7.769266605377197, "learning_rate": 9.403120759427583e-06, "loss": 0.1354, "num_input_tokens_seen": 56771072, "step": 26310 }, { "epoch": 4.829326481923289, "grad_norm": 0.046874262392520905, "learning_rate": 9.402741292058625e-06, "loss": 0.211, "num_input_tokens_seen": 56782976, "step": 26315 }, { "epoch": 4.8302440814828405, "grad_norm": 0.09970642626285553, "learning_rate": 9.402361711766175e-06, "loss": 0.0031, "num_input_tokens_seen": 56792256, "step": 26320 }, { "epoch": 4.831161681042393, "grad_norm": 12.1920747756958, "learning_rate": 9.401982018559969e-06, "loss": 0.2425, "num_input_tokens_seen": 56802624, "step": 26325 }, { "epoch": 4.8320792806019455, "grad_norm": 0.598638653755188, "learning_rate": 9.401602212449748e-06, "loss": 0.0051, "num_input_tokens_seen": 56813760, "step": 26330 }, { "epoch": 4.832996880161497, "grad_norm": 6.848821640014648, "learning_rate": 9.401222293445252e-06, "loss": 0.1331, "num_input_tokens_seen": 56824512, "step": 26335 }, { "epoch": 4.83391447972105, "grad_norm": 11.174210548400879, "learning_rate": 9.400842261556225e-06, "loss": 0.0672, "num_input_tokens_seen": 56835520, "step": 26340 }, { "epoch": 4.834832079280602, "grad_norm": 113.7739486694336, "learning_rate": 9.400462116792415e-06, "loss": 0.0412, "num_input_tokens_seen": 56845280, "step": 26345 }, { "epoch": 4.835749678840154, "grad_norm": 27.843093872070312, "learning_rate": 9.400081859163572e-06, "loss": 0.086, "num_input_tokens_seen": 56856928, "step": 26350 }, { "epoch": 4.836667278399706, "grad_norm": 25.68032455444336, "learning_rate": 9.399701488679447e-06, "loss": 0.0977, "num_input_tokens_seen": 56867968, "step": 26355 }, { "epoch": 4.837584877959259, "grad_norm": 0.10026613622903824, "learning_rate": 9.3993210053498e-06, "loss": 0.2189, "num_input_tokens_seen": 56876864, "step": 26360 }, { "epoch": 4.83850247751881, "grad_norm": 0.3068980574607849, "learning_rate": 9.398940409184387e-06, "loss": 0.0085, "num_input_tokens_seen": 56888160, "step": 26365 }, { "epoch": 4.839420077078363, "grad_norm": 0.8589603900909424, "learning_rate": 9.398559700192969e-06, "loss": 0.1298, "num_input_tokens_seen": 56900032, "step": 26370 }, { "epoch": 4.840337676637915, "grad_norm": 48.254390716552734, "learning_rate": 9.398178878385313e-06, "loss": 0.0592, "num_input_tokens_seen": 56909152, "step": 26375 }, { "epoch": 4.841255276197467, "grad_norm": 0.23427288234233856, "learning_rate": 9.397797943771184e-06, "loss": 0.1516, "num_input_tokens_seen": 56918912, "step": 26380 }, { "epoch": 4.8421728757570195, "grad_norm": 63.27155303955078, "learning_rate": 9.397416896360354e-06, "loss": 0.1788, "num_input_tokens_seen": 56929664, "step": 26385 }, { "epoch": 4.843090475316572, "grad_norm": 0.07933708280324936, "learning_rate": 9.397035736162598e-06, "loss": 0.0036, "num_input_tokens_seen": 56941408, "step": 26390 }, { "epoch": 4.844008074876124, "grad_norm": 7.584558010101318, "learning_rate": 9.396654463187689e-06, "loss": 0.2485, "num_input_tokens_seen": 56951712, "step": 26395 }, { "epoch": 4.844925674435676, "grad_norm": 0.09239543229341507, "learning_rate": 9.396273077445406e-06, "loss": 0.2465, "num_input_tokens_seen": 56962752, "step": 26400 }, { "epoch": 4.845843273995229, "grad_norm": 6.862573623657227, "learning_rate": 9.395891578945535e-06, "loss": 0.3746, "num_input_tokens_seen": 56973632, "step": 26405 }, { "epoch": 4.84676087355478, "grad_norm": 0.14271898567676544, "learning_rate": 9.395509967697856e-06, "loss": 0.1202, "num_input_tokens_seen": 56983712, "step": 26410 }, { "epoch": 4.847678473114333, "grad_norm": 0.03805718943476677, "learning_rate": 9.39512824371216e-06, "loss": 0.2462, "num_input_tokens_seen": 56994816, "step": 26415 }, { "epoch": 4.848596072673885, "grad_norm": 2.985833168029785, "learning_rate": 9.394746406998234e-06, "loss": 0.0598, "num_input_tokens_seen": 57004672, "step": 26420 }, { "epoch": 4.849513672233437, "grad_norm": 0.06250132620334625, "learning_rate": 9.394364457565876e-06, "loss": 0.0546, "num_input_tokens_seen": 57015488, "step": 26425 }, { "epoch": 4.850431271792989, "grad_norm": 0.08474311977624893, "learning_rate": 9.39398239542488e-06, "loss": 0.0844, "num_input_tokens_seen": 57025472, "step": 26430 }, { "epoch": 4.851348871352542, "grad_norm": 1.4453208446502686, "learning_rate": 9.393600220585044e-06, "loss": 0.0659, "num_input_tokens_seen": 57036192, "step": 26435 }, { "epoch": 4.8522664709120935, "grad_norm": 0.047301895916461945, "learning_rate": 9.393217933056173e-06, "loss": 0.0028, "num_input_tokens_seen": 57047200, "step": 26440 }, { "epoch": 4.853184070471646, "grad_norm": 0.09579753875732422, "learning_rate": 9.392835532848071e-06, "loss": 0.0505, "num_input_tokens_seen": 57057376, "step": 26445 }, { "epoch": 4.8541016700311985, "grad_norm": 38.14703369140625, "learning_rate": 9.392453019970547e-06, "loss": 0.2256, "num_input_tokens_seen": 57068704, "step": 26450 }, { "epoch": 4.85501926959075, "grad_norm": 0.04057004302740097, "learning_rate": 9.392070394433408e-06, "loss": 0.0116, "num_input_tokens_seen": 57079904, "step": 26455 }, { "epoch": 4.855936869150303, "grad_norm": 0.021984674036502838, "learning_rate": 9.391687656246473e-06, "loss": 0.0946, "num_input_tokens_seen": 57091552, "step": 26460 }, { "epoch": 4.856854468709855, "grad_norm": 2.0331871509552, "learning_rate": 9.391304805419554e-06, "loss": 0.0922, "num_input_tokens_seen": 57102528, "step": 26465 }, { "epoch": 4.857772068269407, "grad_norm": 0.035588983446359634, "learning_rate": 9.390921841962475e-06, "loss": 0.1065, "num_input_tokens_seen": 57113984, "step": 26470 }, { "epoch": 4.858689667828959, "grad_norm": 18.458356857299805, "learning_rate": 9.390538765885055e-06, "loss": 0.1658, "num_input_tokens_seen": 57124768, "step": 26475 }, { "epoch": 4.859607267388512, "grad_norm": 8.51984691619873, "learning_rate": 9.39015557719712e-06, "loss": 0.3964, "num_input_tokens_seen": 57135936, "step": 26480 }, { "epoch": 4.860524866948063, "grad_norm": 0.4404771625995636, "learning_rate": 9.389772275908499e-06, "loss": 0.0035, "num_input_tokens_seen": 57145696, "step": 26485 }, { "epoch": 4.861442466507616, "grad_norm": 33.29496383666992, "learning_rate": 9.389388862029023e-06, "loss": 0.2282, "num_input_tokens_seen": 57156480, "step": 26490 }, { "epoch": 4.862360066067168, "grad_norm": 21.448890686035156, "learning_rate": 9.389005335568524e-06, "loss": 0.2179, "num_input_tokens_seen": 57166880, "step": 26495 }, { "epoch": 4.86327766562672, "grad_norm": 1.8934262990951538, "learning_rate": 9.388621696536842e-06, "loss": 0.1901, "num_input_tokens_seen": 57176768, "step": 26500 }, { "epoch": 4.8641952651862725, "grad_norm": 0.05373063310980797, "learning_rate": 9.388237944943814e-06, "loss": 0.1279, "num_input_tokens_seen": 57187680, "step": 26505 }, { "epoch": 4.865112864745825, "grad_norm": 0.05000840499997139, "learning_rate": 9.387854080799286e-06, "loss": 0.0097, "num_input_tokens_seen": 57198816, "step": 26510 }, { "epoch": 4.866030464305377, "grad_norm": 0.05602341517806053, "learning_rate": 9.3874701041131e-06, "loss": 0.1231, "num_input_tokens_seen": 57209216, "step": 26515 }, { "epoch": 4.866948063864929, "grad_norm": 0.12465459853410721, "learning_rate": 9.387086014895105e-06, "loss": 0.1093, "num_input_tokens_seen": 57220064, "step": 26520 }, { "epoch": 4.867865663424482, "grad_norm": 0.14082059264183044, "learning_rate": 9.386701813155155e-06, "loss": 0.2182, "num_input_tokens_seen": 57231232, "step": 26525 }, { "epoch": 4.868783262984033, "grad_norm": 45.34250259399414, "learning_rate": 9.3863174989031e-06, "loss": 0.2383, "num_input_tokens_seen": 57242144, "step": 26530 }, { "epoch": 4.869700862543586, "grad_norm": 27.178531646728516, "learning_rate": 9.3859330721488e-06, "loss": 0.055, "num_input_tokens_seen": 57253856, "step": 26535 }, { "epoch": 4.870618462103138, "grad_norm": 0.1285216212272644, "learning_rate": 9.385548532902115e-06, "loss": 0.1461, "num_input_tokens_seen": 57264768, "step": 26540 }, { "epoch": 4.87153606166269, "grad_norm": 5.314609527587891, "learning_rate": 9.385163881172907e-06, "loss": 0.0821, "num_input_tokens_seen": 57278080, "step": 26545 }, { "epoch": 4.872453661222242, "grad_norm": 28.333738327026367, "learning_rate": 9.384779116971042e-06, "loss": 0.0898, "num_input_tokens_seen": 57287200, "step": 26550 }, { "epoch": 4.873371260781795, "grad_norm": 0.23003657162189484, "learning_rate": 9.384394240306388e-06, "loss": 0.234, "num_input_tokens_seen": 57297184, "step": 26555 }, { "epoch": 4.874288860341347, "grad_norm": 0.07038197666406631, "learning_rate": 9.384009251188816e-06, "loss": 0.0761, "num_input_tokens_seen": 57307488, "step": 26560 }, { "epoch": 4.875206459900899, "grad_norm": 0.15799608826637268, "learning_rate": 9.383624149628202e-06, "loss": 0.0058, "num_input_tokens_seen": 57318848, "step": 26565 }, { "epoch": 4.876124059460452, "grad_norm": 22.24369239807129, "learning_rate": 9.383238935634424e-06, "loss": 0.2333, "num_input_tokens_seen": 57329312, "step": 26570 }, { "epoch": 4.877041659020003, "grad_norm": 0.027092214673757553, "learning_rate": 9.38285360921736e-06, "loss": 0.0832, "num_input_tokens_seen": 57339360, "step": 26575 }, { "epoch": 4.877959258579556, "grad_norm": 31.902202606201172, "learning_rate": 9.382468170386894e-06, "loss": 0.1721, "num_input_tokens_seen": 57350400, "step": 26580 }, { "epoch": 4.878876858139108, "grad_norm": 0.3227378726005554, "learning_rate": 9.38208261915291e-06, "loss": 0.3346, "num_input_tokens_seen": 57360768, "step": 26585 }, { "epoch": 4.87979445769866, "grad_norm": 0.029634511098265648, "learning_rate": 9.3816969555253e-06, "loss": 0.1299, "num_input_tokens_seen": 57371808, "step": 26590 }, { "epoch": 4.880712057258212, "grad_norm": 11.04957103729248, "learning_rate": 9.381311179513954e-06, "loss": 0.0574, "num_input_tokens_seen": 57382208, "step": 26595 }, { "epoch": 4.881629656817765, "grad_norm": 0.6786595582962036, "learning_rate": 9.380925291128767e-06, "loss": 0.0585, "num_input_tokens_seen": 57392448, "step": 26600 }, { "epoch": 4.8825472563773165, "grad_norm": 15.227014541625977, "learning_rate": 9.380539290379634e-06, "loss": 0.3588, "num_input_tokens_seen": 57403616, "step": 26605 }, { "epoch": 4.883464855936869, "grad_norm": 0.15898334980010986, "learning_rate": 9.380153177276459e-06, "loss": 0.2257, "num_input_tokens_seen": 57414976, "step": 26610 }, { "epoch": 4.8843824554964215, "grad_norm": 0.13523828983306885, "learning_rate": 9.379766951829144e-06, "loss": 0.0517, "num_input_tokens_seen": 57425792, "step": 26615 }, { "epoch": 4.885300055055973, "grad_norm": 0.13871784508228302, "learning_rate": 9.379380614047594e-06, "loss": 0.0046, "num_input_tokens_seen": 57436448, "step": 26620 }, { "epoch": 4.886217654615526, "grad_norm": 0.3992937207221985, "learning_rate": 9.378994163941719e-06, "loss": 0.2264, "num_input_tokens_seen": 57446080, "step": 26625 }, { "epoch": 4.887135254175078, "grad_norm": 0.2790718078613281, "learning_rate": 9.37860760152143e-06, "loss": 0.2529, "num_input_tokens_seen": 57457248, "step": 26630 }, { "epoch": 4.88805285373463, "grad_norm": 0.1536167412996292, "learning_rate": 9.378220926796641e-06, "loss": 0.1247, "num_input_tokens_seen": 57467040, "step": 26635 }, { "epoch": 4.888970453294182, "grad_norm": 23.299406051635742, "learning_rate": 9.377834139777274e-06, "loss": 0.2165, "num_input_tokens_seen": 57478144, "step": 26640 }, { "epoch": 4.889888052853735, "grad_norm": 0.1775747835636139, "learning_rate": 9.377447240473245e-06, "loss": 0.011, "num_input_tokens_seen": 57488864, "step": 26645 }, { "epoch": 4.890805652413286, "grad_norm": 0.2602931559085846, "learning_rate": 9.377060228894478e-06, "loss": 0.1757, "num_input_tokens_seen": 57499424, "step": 26650 }, { "epoch": 4.891723251972839, "grad_norm": 0.2851066291332245, "learning_rate": 9.3766731050509e-06, "loss": 0.1569, "num_input_tokens_seen": 57510016, "step": 26655 }, { "epoch": 4.892640851532391, "grad_norm": 4.859748840332031, "learning_rate": 9.376285868952441e-06, "loss": 0.407, "num_input_tokens_seen": 57521216, "step": 26660 }, { "epoch": 4.893558451091943, "grad_norm": 6.634129047393799, "learning_rate": 9.375898520609032e-06, "loss": 0.1037, "num_input_tokens_seen": 57530976, "step": 26665 }, { "epoch": 4.8944760506514955, "grad_norm": 6.881369113922119, "learning_rate": 9.375511060030606e-06, "loss": 0.0868, "num_input_tokens_seen": 57541632, "step": 26670 }, { "epoch": 4.895393650211048, "grad_norm": 0.18223729729652405, "learning_rate": 9.375123487227104e-06, "loss": 0.2365, "num_input_tokens_seen": 57552896, "step": 26675 }, { "epoch": 4.8963112497706, "grad_norm": 39.452972412109375, "learning_rate": 9.374735802208468e-06, "loss": 0.3249, "num_input_tokens_seen": 57563104, "step": 26680 }, { "epoch": 4.897228849330152, "grad_norm": 42.72172927856445, "learning_rate": 9.374348004984635e-06, "loss": 0.0483, "num_input_tokens_seen": 57573344, "step": 26685 }, { "epoch": 4.898146448889705, "grad_norm": 10.951295852661133, "learning_rate": 9.373960095565558e-06, "loss": 0.2406, "num_input_tokens_seen": 57583424, "step": 26690 }, { "epoch": 4.899064048449256, "grad_norm": 0.07027152180671692, "learning_rate": 9.373572073961182e-06, "loss": 0.1236, "num_input_tokens_seen": 57594272, "step": 26695 }, { "epoch": 4.899981648008809, "grad_norm": 0.13177552819252014, "learning_rate": 9.37318394018146e-06, "loss": 0.0505, "num_input_tokens_seen": 57606400, "step": 26700 }, { "epoch": 4.900899247568361, "grad_norm": 0.07093830406665802, "learning_rate": 9.37279569423635e-06, "loss": 0.2004, "num_input_tokens_seen": 57617664, "step": 26705 }, { "epoch": 4.901816847127913, "grad_norm": 1.363570213317871, "learning_rate": 9.372407336135807e-06, "loss": 0.0059, "num_input_tokens_seen": 57627168, "step": 26710 }, { "epoch": 4.902734446687465, "grad_norm": 1.1190887689590454, "learning_rate": 9.372018865889792e-06, "loss": 0.0975, "num_input_tokens_seen": 57637856, "step": 26715 }, { "epoch": 4.903652046247018, "grad_norm": 25.548433303833008, "learning_rate": 9.371630283508269e-06, "loss": 0.1239, "num_input_tokens_seen": 57648928, "step": 26720 }, { "epoch": 4.90456964580657, "grad_norm": 36.06340026855469, "learning_rate": 9.371241589001205e-06, "loss": 0.0663, "num_input_tokens_seen": 57660096, "step": 26725 }, { "epoch": 4.905487245366122, "grad_norm": 0.14900365471839905, "learning_rate": 9.370852782378567e-06, "loss": 0.177, "num_input_tokens_seen": 57672096, "step": 26730 }, { "epoch": 4.9064048449256745, "grad_norm": 0.7830213904380798, "learning_rate": 9.370463863650333e-06, "loss": 0.0215, "num_input_tokens_seen": 57683040, "step": 26735 }, { "epoch": 4.907322444485227, "grad_norm": 0.2752332091331482, "learning_rate": 9.370074832826473e-06, "loss": 0.2408, "num_input_tokens_seen": 57693920, "step": 26740 }, { "epoch": 4.908240044044779, "grad_norm": 0.3075062930583954, "learning_rate": 9.369685689916965e-06, "loss": 0.0678, "num_input_tokens_seen": 57704800, "step": 26745 }, { "epoch": 4.909157643604331, "grad_norm": 0.16068029403686523, "learning_rate": 9.369296434931794e-06, "loss": 0.3804, "num_input_tokens_seen": 57715552, "step": 26750 }, { "epoch": 4.910075243163884, "grad_norm": 13.25813102722168, "learning_rate": 9.36890706788094e-06, "loss": 0.4614, "num_input_tokens_seen": 57727168, "step": 26755 }, { "epoch": 4.910992842723435, "grad_norm": 8.042285919189453, "learning_rate": 9.36851758877439e-06, "loss": 0.3934, "num_input_tokens_seen": 57737280, "step": 26760 }, { "epoch": 4.911910442282988, "grad_norm": 4.927129745483398, "learning_rate": 9.368127997622135e-06, "loss": 0.0755, "num_input_tokens_seen": 57747200, "step": 26765 }, { "epoch": 4.91282804184254, "grad_norm": 37.617652893066406, "learning_rate": 9.367738294434167e-06, "loss": 0.2237, "num_input_tokens_seen": 57757568, "step": 26770 }, { "epoch": 4.913745641402092, "grad_norm": 9.274492263793945, "learning_rate": 9.367348479220481e-06, "loss": 0.2371, "num_input_tokens_seen": 57768896, "step": 26775 }, { "epoch": 4.914663240961644, "grad_norm": 0.1653280109167099, "learning_rate": 9.366958551991077e-06, "loss": 0.1061, "num_input_tokens_seen": 57778976, "step": 26780 }, { "epoch": 4.915580840521197, "grad_norm": 9.32227897644043, "learning_rate": 9.366568512755952e-06, "loss": 0.0973, "num_input_tokens_seen": 57790336, "step": 26785 }, { "epoch": 4.9164984400807485, "grad_norm": 0.4086892008781433, "learning_rate": 9.366178361525114e-06, "loss": 0.258, "num_input_tokens_seen": 57801216, "step": 26790 }, { "epoch": 4.917416039640301, "grad_norm": 1.2682249546051025, "learning_rate": 9.365788098308566e-06, "loss": 0.1338, "num_input_tokens_seen": 57811232, "step": 26795 }, { "epoch": 4.918333639199854, "grad_norm": 0.1244756355881691, "learning_rate": 9.36539772311632e-06, "loss": 0.1887, "num_input_tokens_seen": 57821728, "step": 26800 }, { "epoch": 4.919251238759405, "grad_norm": 0.34134745597839355, "learning_rate": 9.36500723595839e-06, "loss": 0.0056, "num_input_tokens_seen": 57832832, "step": 26805 }, { "epoch": 4.920168838318958, "grad_norm": 9.708321571350098, "learning_rate": 9.364616636844788e-06, "loss": 0.439, "num_input_tokens_seen": 57844160, "step": 26810 }, { "epoch": 4.92108643787851, "grad_norm": 0.208861842751503, "learning_rate": 9.364225925785533e-06, "loss": 0.0052, "num_input_tokens_seen": 57855584, "step": 26815 }, { "epoch": 4.922004037438062, "grad_norm": 1.8099355697631836, "learning_rate": 9.363835102790649e-06, "loss": 0.1241, "num_input_tokens_seen": 57864736, "step": 26820 }, { "epoch": 4.922921636997614, "grad_norm": 0.1955425888299942, "learning_rate": 9.363444167870158e-06, "loss": 0.0643, "num_input_tokens_seen": 57876480, "step": 26825 }, { "epoch": 4.923839236557167, "grad_norm": 0.25334152579307556, "learning_rate": 9.363053121034084e-06, "loss": 0.0064, "num_input_tokens_seen": 57888384, "step": 26830 }, { "epoch": 4.924756836116718, "grad_norm": 20.396163940429688, "learning_rate": 9.362661962292464e-06, "loss": 0.1752, "num_input_tokens_seen": 57899200, "step": 26835 }, { "epoch": 4.925674435676271, "grad_norm": 0.2699410319328308, "learning_rate": 9.362270691655322e-06, "loss": 0.0079, "num_input_tokens_seen": 57909984, "step": 26840 }, { "epoch": 4.9265920352358235, "grad_norm": 0.10796865075826645, "learning_rate": 9.361879309132699e-06, "loss": 0.1197, "num_input_tokens_seen": 57921184, "step": 26845 }, { "epoch": 4.927509634795375, "grad_norm": 26.909032821655273, "learning_rate": 9.361487814734633e-06, "loss": 0.5127, "num_input_tokens_seen": 57930144, "step": 26850 }, { "epoch": 4.928427234354928, "grad_norm": 138.51063537597656, "learning_rate": 9.361096208471163e-06, "loss": 0.1999, "num_input_tokens_seen": 57941216, "step": 26855 }, { "epoch": 4.92934483391448, "grad_norm": 17.325355529785156, "learning_rate": 9.360704490352334e-06, "loss": 0.0177, "num_input_tokens_seen": 57952512, "step": 26860 }, { "epoch": 4.930262433474032, "grad_norm": 7.8841872215271, "learning_rate": 9.360312660388196e-06, "loss": 0.2385, "num_input_tokens_seen": 57962560, "step": 26865 }, { "epoch": 4.931180033033584, "grad_norm": 0.4270418882369995, "learning_rate": 9.359920718588793e-06, "loss": 0.0825, "num_input_tokens_seen": 57974944, "step": 26870 }, { "epoch": 4.932097632593137, "grad_norm": 0.08185942471027374, "learning_rate": 9.359528664964183e-06, "loss": 0.063, "num_input_tokens_seen": 57985760, "step": 26875 }, { "epoch": 4.933015232152688, "grad_norm": 0.19387313723564148, "learning_rate": 9.359136499524418e-06, "loss": 0.129, "num_input_tokens_seen": 57996544, "step": 26880 }, { "epoch": 4.933932831712241, "grad_norm": 0.051646336913108826, "learning_rate": 9.35874422227956e-06, "loss": 0.0994, "num_input_tokens_seen": 58008032, "step": 26885 }, { "epoch": 4.934850431271793, "grad_norm": 0.07280320674180984, "learning_rate": 9.358351833239666e-06, "loss": 0.1303, "num_input_tokens_seen": 58019584, "step": 26890 }, { "epoch": 4.935768030831345, "grad_norm": 0.09618134796619415, "learning_rate": 9.357959332414803e-06, "loss": 0.0066, "num_input_tokens_seen": 58031552, "step": 26895 }, { "epoch": 4.9366856303908975, "grad_norm": 0.3368220031261444, "learning_rate": 9.357566719815036e-06, "loss": 0.195, "num_input_tokens_seen": 58043424, "step": 26900 }, { "epoch": 4.93760322995045, "grad_norm": 32.97353744506836, "learning_rate": 9.357173995450438e-06, "loss": 0.2193, "num_input_tokens_seen": 58055168, "step": 26905 }, { "epoch": 4.938520829510002, "grad_norm": 0.1399231255054474, "learning_rate": 9.356781159331078e-06, "loss": 0.0537, "num_input_tokens_seen": 58066336, "step": 26910 }, { "epoch": 4.939438429069554, "grad_norm": 5.782751560211182, "learning_rate": 9.356388211467037e-06, "loss": 0.2702, "num_input_tokens_seen": 58077312, "step": 26915 }, { "epoch": 4.940356028629107, "grad_norm": 0.5270522832870483, "learning_rate": 9.355995151868387e-06, "loss": 0.1139, "num_input_tokens_seen": 58088064, "step": 26920 }, { "epoch": 4.941273628188658, "grad_norm": 0.16830392181873322, "learning_rate": 9.355601980545215e-06, "loss": 0.2813, "num_input_tokens_seen": 58098432, "step": 26925 }, { "epoch": 4.942191227748211, "grad_norm": 5.2640461921691895, "learning_rate": 9.3552086975076e-06, "loss": 0.0496, "num_input_tokens_seen": 58108384, "step": 26930 }, { "epoch": 4.943108827307763, "grad_norm": 0.135401651263237, "learning_rate": 9.354815302765634e-06, "loss": 0.0118, "num_input_tokens_seen": 58118080, "step": 26935 }, { "epoch": 4.944026426867315, "grad_norm": 0.2283869832754135, "learning_rate": 9.354421796329405e-06, "loss": 0.2103, "num_input_tokens_seen": 58128864, "step": 26940 }, { "epoch": 4.944944026426867, "grad_norm": 0.2884003221988678, "learning_rate": 9.354028178209005e-06, "loss": 0.1988, "num_input_tokens_seen": 58138720, "step": 26945 }, { "epoch": 4.94586162598642, "grad_norm": 0.8426826000213623, "learning_rate": 9.35363444841453e-06, "loss": 0.2436, "num_input_tokens_seen": 58149664, "step": 26950 }, { "epoch": 4.9467792255459715, "grad_norm": 4.7244062423706055, "learning_rate": 9.35324060695608e-06, "loss": 0.2085, "num_input_tokens_seen": 58158784, "step": 26955 }, { "epoch": 4.947696825105524, "grad_norm": 27.002574920654297, "learning_rate": 9.352846653843755e-06, "loss": 0.1244, "num_input_tokens_seen": 58168768, "step": 26960 }, { "epoch": 4.9486144246650765, "grad_norm": 45.169334411621094, "learning_rate": 9.352452589087658e-06, "loss": 0.1095, "num_input_tokens_seen": 58179424, "step": 26965 }, { "epoch": 4.949532024224628, "grad_norm": 13.148152351379395, "learning_rate": 9.3520584126979e-06, "loss": 0.0902, "num_input_tokens_seen": 58190528, "step": 26970 }, { "epoch": 4.950449623784181, "grad_norm": 0.3677327632904053, "learning_rate": 9.351664124684587e-06, "loss": 0.1112, "num_input_tokens_seen": 58201536, "step": 26975 }, { "epoch": 4.951367223343733, "grad_norm": 0.069620780646801, "learning_rate": 9.351269725057834e-06, "loss": 0.2267, "num_input_tokens_seen": 58211968, "step": 26980 }, { "epoch": 4.952284822903285, "grad_norm": 0.8025585412979126, "learning_rate": 9.350875213827757e-06, "loss": 0.0155, "num_input_tokens_seen": 58222624, "step": 26985 }, { "epoch": 4.953202422462837, "grad_norm": 0.2983609139919281, "learning_rate": 9.350480591004474e-06, "loss": 0.0147, "num_input_tokens_seen": 58232640, "step": 26990 }, { "epoch": 4.95412002202239, "grad_norm": 0.040414925664663315, "learning_rate": 9.350085856598107e-06, "loss": 0.0835, "num_input_tokens_seen": 58243200, "step": 26995 }, { "epoch": 4.955037621581941, "grad_norm": 0.22341445088386536, "learning_rate": 9.34969101061878e-06, "loss": 0.0043, "num_input_tokens_seen": 58254112, "step": 27000 }, { "epoch": 4.955955221141494, "grad_norm": 0.05902956798672676, "learning_rate": 9.349296053076617e-06, "loss": 0.1132, "num_input_tokens_seen": 58264224, "step": 27005 }, { "epoch": 4.956872820701046, "grad_norm": 0.38791966438293457, "learning_rate": 9.348900983981754e-06, "loss": 0.0111, "num_input_tokens_seen": 58273504, "step": 27010 }, { "epoch": 4.957790420260598, "grad_norm": 0.009978687390685081, "learning_rate": 9.348505803344322e-06, "loss": 0.0974, "num_input_tokens_seen": 58284928, "step": 27015 }, { "epoch": 4.9587080198201505, "grad_norm": 8.74555492401123, "learning_rate": 9.348110511174453e-06, "loss": 0.2227, "num_input_tokens_seen": 58295552, "step": 27020 }, { "epoch": 4.959625619379703, "grad_norm": 5.541821479797363, "learning_rate": 9.347715107482289e-06, "loss": 0.3399, "num_input_tokens_seen": 58308480, "step": 27025 }, { "epoch": 4.960543218939255, "grad_norm": 0.9632958769798279, "learning_rate": 9.347319592277971e-06, "loss": 0.0108, "num_input_tokens_seen": 58318656, "step": 27030 }, { "epoch": 4.961460818498807, "grad_norm": 0.026840683072805405, "learning_rate": 9.346923965571644e-06, "loss": 0.1698, "num_input_tokens_seen": 58329472, "step": 27035 }, { "epoch": 4.96237841805836, "grad_norm": 0.21372613310813904, "learning_rate": 9.346528227373453e-06, "loss": 0.0449, "num_input_tokens_seen": 58339744, "step": 27040 }, { "epoch": 4.963296017617911, "grad_norm": 15.464970588684082, "learning_rate": 9.346132377693549e-06, "loss": 0.4559, "num_input_tokens_seen": 58351168, "step": 27045 }, { "epoch": 4.964213617177464, "grad_norm": 6.766055583953857, "learning_rate": 9.345736416542087e-06, "loss": 0.1016, "num_input_tokens_seen": 58360576, "step": 27050 }, { "epoch": 4.965131216737016, "grad_norm": 19.739444732666016, "learning_rate": 9.345340343929222e-06, "loss": 0.1848, "num_input_tokens_seen": 58369824, "step": 27055 }, { "epoch": 4.966048816296568, "grad_norm": 28.914152145385742, "learning_rate": 9.34494415986511e-06, "loss": 0.1018, "num_input_tokens_seen": 58381120, "step": 27060 }, { "epoch": 4.96696641585612, "grad_norm": 0.05376090109348297, "learning_rate": 9.344547864359915e-06, "loss": 0.2159, "num_input_tokens_seen": 58391360, "step": 27065 }, { "epoch": 4.967884015415673, "grad_norm": 1.340243935585022, "learning_rate": 9.344151457423801e-06, "loss": 0.0082, "num_input_tokens_seen": 58400800, "step": 27070 }, { "epoch": 4.9688016149752245, "grad_norm": 1.1194512844085693, "learning_rate": 9.343754939066934e-06, "loss": 0.1323, "num_input_tokens_seen": 58411872, "step": 27075 }, { "epoch": 4.969719214534777, "grad_norm": 21.79458999633789, "learning_rate": 9.343358309299484e-06, "loss": 0.18, "num_input_tokens_seen": 58421952, "step": 27080 }, { "epoch": 4.97063681409433, "grad_norm": 0.19888752698898315, "learning_rate": 9.342961568131627e-06, "loss": 0.0433, "num_input_tokens_seen": 58434080, "step": 27085 }, { "epoch": 4.971554413653881, "grad_norm": 71.80441284179688, "learning_rate": 9.342564715573536e-06, "loss": 0.1429, "num_input_tokens_seen": 58443648, "step": 27090 }, { "epoch": 4.972472013213434, "grad_norm": 8.137468338012695, "learning_rate": 9.342167751635392e-06, "loss": 0.2189, "num_input_tokens_seen": 58455680, "step": 27095 }, { "epoch": 4.973389612772986, "grad_norm": 13.864805221557617, "learning_rate": 9.341770676327372e-06, "loss": 0.1185, "num_input_tokens_seen": 58467200, "step": 27100 }, { "epoch": 4.974307212332538, "grad_norm": 0.7953340411186218, "learning_rate": 9.341373489659667e-06, "loss": 0.0058, "num_input_tokens_seen": 58476928, "step": 27105 }, { "epoch": 4.97522481189209, "grad_norm": 0.039754197001457214, "learning_rate": 9.340976191642458e-06, "loss": 0.0034, "num_input_tokens_seen": 58487360, "step": 27110 }, { "epoch": 4.976142411451643, "grad_norm": 26.729692459106445, "learning_rate": 9.340578782285938e-06, "loss": 0.1381, "num_input_tokens_seen": 58498112, "step": 27115 }, { "epoch": 4.977060011011194, "grad_norm": 0.0958600640296936, "learning_rate": 9.3401812616003e-06, "loss": 0.3762, "num_input_tokens_seen": 58507776, "step": 27120 }, { "epoch": 4.977977610570747, "grad_norm": 0.09848857671022415, "learning_rate": 9.33978362959574e-06, "loss": 0.1205, "num_input_tokens_seen": 58519168, "step": 27125 }, { "epoch": 4.9788952101302995, "grad_norm": 0.029375175014138222, "learning_rate": 9.339385886282453e-06, "loss": 0.0912, "num_input_tokens_seen": 58529312, "step": 27130 }, { "epoch": 4.979812809689851, "grad_norm": 0.022603509947657585, "learning_rate": 9.338988031670645e-06, "loss": 0.1214, "num_input_tokens_seen": 58538720, "step": 27135 }, { "epoch": 4.980730409249404, "grad_norm": 2.025036573410034, "learning_rate": 9.33859006577052e-06, "loss": 0.1964, "num_input_tokens_seen": 58548640, "step": 27140 }, { "epoch": 4.981648008808956, "grad_norm": 14.095205307006836, "learning_rate": 9.338191988592282e-06, "loss": 0.256, "num_input_tokens_seen": 58560384, "step": 27145 }, { "epoch": 4.982565608368508, "grad_norm": 0.02798950858414173, "learning_rate": 9.337793800146145e-06, "loss": 0.5281, "num_input_tokens_seen": 58571168, "step": 27150 }, { "epoch": 4.98348320792806, "grad_norm": 16.56488609313965, "learning_rate": 9.33739550044232e-06, "loss": 0.1282, "num_input_tokens_seen": 58582656, "step": 27155 }, { "epoch": 4.984400807487613, "grad_norm": 7.819151878356934, "learning_rate": 9.33699708949102e-06, "loss": 0.3588, "num_input_tokens_seen": 58593888, "step": 27160 }, { "epoch": 4.985318407047164, "grad_norm": 6.376404285430908, "learning_rate": 9.336598567302469e-06, "loss": 0.1331, "num_input_tokens_seen": 58605632, "step": 27165 }, { "epoch": 4.986236006606717, "grad_norm": 35.91206741333008, "learning_rate": 9.336199933886885e-06, "loss": 0.0543, "num_input_tokens_seen": 58617152, "step": 27170 }, { "epoch": 4.987153606166269, "grad_norm": 26.71863555908203, "learning_rate": 9.335801189254495e-06, "loss": 0.1467, "num_input_tokens_seen": 58628864, "step": 27175 }, { "epoch": 4.988071205725821, "grad_norm": 0.04814611375331879, "learning_rate": 9.335402333415522e-06, "loss": 0.28, "num_input_tokens_seen": 58639168, "step": 27180 }, { "epoch": 4.9889888052853735, "grad_norm": 0.06311027705669403, "learning_rate": 9.3350033663802e-06, "loss": 0.0544, "num_input_tokens_seen": 58650208, "step": 27185 }, { "epoch": 4.989906404844926, "grad_norm": 1.821596622467041, "learning_rate": 9.33460428815876e-06, "loss": 0.2622, "num_input_tokens_seen": 58660992, "step": 27190 }, { "epoch": 4.990824004404478, "grad_norm": 2.703378200531006, "learning_rate": 9.334205098761436e-06, "loss": 0.3193, "num_input_tokens_seen": 58671680, "step": 27195 }, { "epoch": 4.99174160396403, "grad_norm": 0.14109158515930176, "learning_rate": 9.33380579819847e-06, "loss": 0.2562, "num_input_tokens_seen": 58682272, "step": 27200 }, { "epoch": 4.992659203523583, "grad_norm": 5.67425012588501, "learning_rate": 9.333406386480103e-06, "loss": 0.1096, "num_input_tokens_seen": 58693088, "step": 27205 }, { "epoch": 4.993576803083134, "grad_norm": 1.1832174062728882, "learning_rate": 9.333006863616577e-06, "loss": 0.0107, "num_input_tokens_seen": 58703552, "step": 27210 }, { "epoch": 4.994494402642687, "grad_norm": 5.887487411499023, "learning_rate": 9.332607229618142e-06, "loss": 0.1484, "num_input_tokens_seen": 58714752, "step": 27215 }, { "epoch": 4.995412002202239, "grad_norm": 4.274848461151123, "learning_rate": 9.332207484495046e-06, "loss": 0.2114, "num_input_tokens_seen": 58726112, "step": 27220 }, { "epoch": 4.996329601761791, "grad_norm": 0.1349128931760788, "learning_rate": 9.33180762825754e-06, "loss": 0.138, "num_input_tokens_seen": 58736512, "step": 27225 }, { "epoch": 4.997247201321343, "grad_norm": 66.01850128173828, "learning_rate": 9.331407660915886e-06, "loss": 0.1247, "num_input_tokens_seen": 58748096, "step": 27230 }, { "epoch": 4.998164800880896, "grad_norm": 0.1079307496547699, "learning_rate": 9.331007582480336e-06, "loss": 0.1739, "num_input_tokens_seen": 58758816, "step": 27235 }, { "epoch": 4.9990824004404475, "grad_norm": 0.1338498890399933, "learning_rate": 9.330607392961153e-06, "loss": 0.0033, "num_input_tokens_seen": 58768736, "step": 27240 }, { "epoch": 5.0, "grad_norm": 0.13740283250808716, "learning_rate": 9.330207092368604e-06, "loss": 0.0893, "num_input_tokens_seen": 58777232, "step": 27245 }, { "epoch": 5.0009175995595525, "grad_norm": 0.34879276156425476, "learning_rate": 9.329806680712954e-06, "loss": 0.0244, "num_input_tokens_seen": 58788112, "step": 27250 }, { "epoch": 5.001835199119104, "grad_norm": 0.0436086431145668, "learning_rate": 9.329406158004473e-06, "loss": 0.0023, "num_input_tokens_seen": 58798992, "step": 27255 }, { "epoch": 5.002752798678657, "grad_norm": 16.994783401489258, "learning_rate": 9.329005524253435e-06, "loss": 0.2655, "num_input_tokens_seen": 58809808, "step": 27260 }, { "epoch": 5.003670398238209, "grad_norm": 9.170504570007324, "learning_rate": 9.328604779470115e-06, "loss": 0.1088, "num_input_tokens_seen": 58819472, "step": 27265 }, { "epoch": 5.004587997797761, "grad_norm": 0.3507433831691742, "learning_rate": 9.328203923664789e-06, "loss": 0.0567, "num_input_tokens_seen": 58829552, "step": 27270 }, { "epoch": 5.005505597357313, "grad_norm": 11.57200813293457, "learning_rate": 9.327802956847741e-06, "loss": 0.2444, "num_input_tokens_seen": 58840848, "step": 27275 }, { "epoch": 5.006423196916866, "grad_norm": 0.04740903154015541, "learning_rate": 9.327401879029257e-06, "loss": 0.002, "num_input_tokens_seen": 58851664, "step": 27280 }, { "epoch": 5.007340796476417, "grad_norm": 45.78585433959961, "learning_rate": 9.327000690219619e-06, "loss": 0.2439, "num_input_tokens_seen": 58862800, "step": 27285 }, { "epoch": 5.00825839603597, "grad_norm": 0.028373437002301216, "learning_rate": 9.326599390429119e-06, "loss": 0.1322, "num_input_tokens_seen": 58872784, "step": 27290 }, { "epoch": 5.009175995595522, "grad_norm": 47.570613861083984, "learning_rate": 9.326197979668052e-06, "loss": 0.0364, "num_input_tokens_seen": 58882896, "step": 27295 }, { "epoch": 5.010093595155074, "grad_norm": 0.0930088683962822, "learning_rate": 9.325796457946712e-06, "loss": 0.0062, "num_input_tokens_seen": 58893776, "step": 27300 }, { "epoch": 5.0110111947146265, "grad_norm": 11.046996116638184, "learning_rate": 9.325394825275396e-06, "loss": 0.1432, "num_input_tokens_seen": 58905040, "step": 27305 }, { "epoch": 5.011928794274179, "grad_norm": 20.115589141845703, "learning_rate": 9.324993081664407e-06, "loss": 0.1078, "num_input_tokens_seen": 58915696, "step": 27310 }, { "epoch": 5.012846393833731, "grad_norm": 0.6115995645523071, "learning_rate": 9.324591227124049e-06, "loss": 0.1392, "num_input_tokens_seen": 58926192, "step": 27315 }, { "epoch": 5.013763993393283, "grad_norm": 0.4854693114757538, "learning_rate": 9.32418926166463e-06, "loss": 0.0952, "num_input_tokens_seen": 58938224, "step": 27320 }, { "epoch": 5.014681592952836, "grad_norm": 7.037130355834961, "learning_rate": 9.323787185296456e-06, "loss": 0.091, "num_input_tokens_seen": 58950256, "step": 27325 }, { "epoch": 5.015599192512387, "grad_norm": 0.049002423882484436, "learning_rate": 9.323384998029842e-06, "loss": 0.0247, "num_input_tokens_seen": 58961040, "step": 27330 }, { "epoch": 5.01651679207194, "grad_norm": 0.33472955226898193, "learning_rate": 9.322982699875104e-06, "loss": 0.1015, "num_input_tokens_seen": 58973104, "step": 27335 }, { "epoch": 5.017434391631492, "grad_norm": 0.14963805675506592, "learning_rate": 9.32258029084256e-06, "loss": 0.3595, "num_input_tokens_seen": 58984688, "step": 27340 }, { "epoch": 5.018351991191044, "grad_norm": 1.3012325763702393, "learning_rate": 9.322177770942532e-06, "loss": 0.0093, "num_input_tokens_seen": 58995632, "step": 27345 }, { "epoch": 5.019269590750596, "grad_norm": 0.10591322928667068, "learning_rate": 9.32177514018534e-06, "loss": 0.0028, "num_input_tokens_seen": 59006128, "step": 27350 }, { "epoch": 5.020187190310149, "grad_norm": 3.4823575019836426, "learning_rate": 9.321372398581315e-06, "loss": 0.3194, "num_input_tokens_seen": 59016528, "step": 27355 }, { "epoch": 5.0211047898697005, "grad_norm": 8.639378547668457, "learning_rate": 9.320969546140786e-06, "loss": 0.0066, "num_input_tokens_seen": 59026480, "step": 27360 }, { "epoch": 5.022022389429253, "grad_norm": 0.023026829585433006, "learning_rate": 9.320566582874085e-06, "loss": 0.4882, "num_input_tokens_seen": 59037040, "step": 27365 }, { "epoch": 5.022939988988806, "grad_norm": 0.058540135622024536, "learning_rate": 9.320163508791546e-06, "loss": 0.0081, "num_input_tokens_seen": 59048816, "step": 27370 }, { "epoch": 5.023857588548357, "grad_norm": 0.18078161776065826, "learning_rate": 9.319760323903511e-06, "loss": 0.1427, "num_input_tokens_seen": 59060240, "step": 27375 }, { "epoch": 5.02477518810791, "grad_norm": 2.442337989807129, "learning_rate": 9.319357028220319e-06, "loss": 0.1479, "num_input_tokens_seen": 59071056, "step": 27380 }, { "epoch": 5.025692787667462, "grad_norm": 0.17263545095920563, "learning_rate": 9.318953621752312e-06, "loss": 0.038, "num_input_tokens_seen": 59082448, "step": 27385 }, { "epoch": 5.026610387227014, "grad_norm": 6.239565849304199, "learning_rate": 9.318550104509838e-06, "loss": 0.1633, "num_input_tokens_seen": 59093328, "step": 27390 }, { "epoch": 5.027527986786566, "grad_norm": 0.08828794956207275, "learning_rate": 9.318146476503249e-06, "loss": 0.0667, "num_input_tokens_seen": 59104336, "step": 27395 }, { "epoch": 5.028445586346119, "grad_norm": 0.2381272315979004, "learning_rate": 9.317742737742894e-06, "loss": 0.2095, "num_input_tokens_seen": 59115152, "step": 27400 }, { "epoch": 5.02936318590567, "grad_norm": 0.07903048396110535, "learning_rate": 9.317338888239129e-06, "loss": 0.1001, "num_input_tokens_seen": 59125520, "step": 27405 }, { "epoch": 5.030280785465223, "grad_norm": 0.051028069108724594, "learning_rate": 9.316934928002313e-06, "loss": 0.0041, "num_input_tokens_seen": 59135824, "step": 27410 }, { "epoch": 5.0311983850247755, "grad_norm": 0.05156579986214638, "learning_rate": 9.316530857042807e-06, "loss": 0.0732, "num_input_tokens_seen": 59145744, "step": 27415 }, { "epoch": 5.032115984584327, "grad_norm": 0.04113437607884407, "learning_rate": 9.316126675370975e-06, "loss": 0.0026, "num_input_tokens_seen": 59156784, "step": 27420 }, { "epoch": 5.03303358414388, "grad_norm": 0.06581993401050568, "learning_rate": 9.315722382997184e-06, "loss": 0.0039, "num_input_tokens_seen": 59168656, "step": 27425 }, { "epoch": 5.033951183703432, "grad_norm": 0.39860743284225464, "learning_rate": 9.315317979931802e-06, "loss": 0.119, "num_input_tokens_seen": 59178928, "step": 27430 }, { "epoch": 5.034868783262984, "grad_norm": 0.9604637622833252, "learning_rate": 9.314913466185201e-06, "loss": 0.1239, "num_input_tokens_seen": 59189936, "step": 27435 }, { "epoch": 5.035786382822536, "grad_norm": 0.06304062157869339, "learning_rate": 9.314508841767757e-06, "loss": 0.1045, "num_input_tokens_seen": 59201136, "step": 27440 }, { "epoch": 5.036703982382089, "grad_norm": 0.07820513844490051, "learning_rate": 9.31410410668985e-06, "loss": 0.1628, "num_input_tokens_seen": 59212400, "step": 27445 }, { "epoch": 5.03762158194164, "grad_norm": 0.021523715928196907, "learning_rate": 9.313699260961856e-06, "loss": 0.1612, "num_input_tokens_seen": 59222928, "step": 27450 }, { "epoch": 5.038539181501193, "grad_norm": 13.463720321655273, "learning_rate": 9.313294304594164e-06, "loss": 0.2744, "num_input_tokens_seen": 59234320, "step": 27455 }, { "epoch": 5.039456781060745, "grad_norm": 0.05997820198535919, "learning_rate": 9.312889237597158e-06, "loss": 0.2478, "num_input_tokens_seen": 59245104, "step": 27460 }, { "epoch": 5.040374380620297, "grad_norm": 0.07100880891084671, "learning_rate": 9.312484059981226e-06, "loss": 0.2382, "num_input_tokens_seen": 59257008, "step": 27465 }, { "epoch": 5.0412919801798495, "grad_norm": 0.33115100860595703, "learning_rate": 9.312078771756763e-06, "loss": 0.0028, "num_input_tokens_seen": 59267888, "step": 27470 }, { "epoch": 5.042209579739402, "grad_norm": 0.3349256217479706, "learning_rate": 9.311673372934162e-06, "loss": 0.0036, "num_input_tokens_seen": 59278736, "step": 27475 }, { "epoch": 5.043127179298954, "grad_norm": 4.7066426277160645, "learning_rate": 9.311267863523821e-06, "loss": 0.3464, "num_input_tokens_seen": 59290192, "step": 27480 }, { "epoch": 5.044044778858506, "grad_norm": 0.0878133624792099, "learning_rate": 9.310862243536142e-06, "loss": 0.0671, "num_input_tokens_seen": 59301072, "step": 27485 }, { "epoch": 5.044962378418059, "grad_norm": 0.34428781270980835, "learning_rate": 9.310456512981526e-06, "loss": 0.0252, "num_input_tokens_seen": 59311568, "step": 27490 }, { "epoch": 5.04587997797761, "grad_norm": 0.05453091487288475, "learning_rate": 9.310050671870383e-06, "loss": 0.1061, "num_input_tokens_seen": 59322064, "step": 27495 }, { "epoch": 5.046797577537163, "grad_norm": 21.03411293029785, "learning_rate": 9.30964472021312e-06, "loss": 0.1041, "num_input_tokens_seen": 59333264, "step": 27500 }, { "epoch": 5.047715177096715, "grad_norm": 0.16736923158168793, "learning_rate": 9.309238658020148e-06, "loss": 0.0986, "num_input_tokens_seen": 59343856, "step": 27505 }, { "epoch": 5.048632776656267, "grad_norm": 0.31741011142730713, "learning_rate": 9.308832485301885e-06, "loss": 0.0426, "num_input_tokens_seen": 59354736, "step": 27510 }, { "epoch": 5.049550376215819, "grad_norm": 0.09986025094985962, "learning_rate": 9.308426202068746e-06, "loss": 0.1219, "num_input_tokens_seen": 59366640, "step": 27515 }, { "epoch": 5.050467975775372, "grad_norm": 0.1455574780702591, "learning_rate": 9.308019808331153e-06, "loss": 0.0557, "num_input_tokens_seen": 59377360, "step": 27520 }, { "epoch": 5.0513855753349235, "grad_norm": 0.12894009053707123, "learning_rate": 9.307613304099527e-06, "loss": 0.1044, "num_input_tokens_seen": 59386640, "step": 27525 }, { "epoch": 5.052303174894476, "grad_norm": 0.04734230786561966, "learning_rate": 9.307206689384298e-06, "loss": 0.121, "num_input_tokens_seen": 59397488, "step": 27530 }, { "epoch": 5.0532207744540285, "grad_norm": 0.03153955191373825, "learning_rate": 9.30679996419589e-06, "loss": 0.0019, "num_input_tokens_seen": 59408464, "step": 27535 }, { "epoch": 5.05413837401358, "grad_norm": 0.06581044942140579, "learning_rate": 9.306393128544741e-06, "loss": 0.0039, "num_input_tokens_seen": 59419696, "step": 27540 }, { "epoch": 5.055055973573133, "grad_norm": 1.9428001642227173, "learning_rate": 9.305986182441282e-06, "loss": 0.0521, "num_input_tokens_seen": 59431856, "step": 27545 }, { "epoch": 5.055973573132685, "grad_norm": 0.08420220762491226, "learning_rate": 9.305579125895949e-06, "loss": 0.1206, "num_input_tokens_seen": 59442128, "step": 27550 }, { "epoch": 5.056891172692237, "grad_norm": 0.014914716593921185, "learning_rate": 9.305171958919185e-06, "loss": 0.2654, "num_input_tokens_seen": 59452848, "step": 27555 }, { "epoch": 5.057808772251789, "grad_norm": 7.772786617279053, "learning_rate": 9.304764681521435e-06, "loss": 0.3074, "num_input_tokens_seen": 59464304, "step": 27560 }, { "epoch": 5.058726371811342, "grad_norm": 0.03372615575790405, "learning_rate": 9.304357293713141e-06, "loss": 0.0035, "num_input_tokens_seen": 59475824, "step": 27565 }, { "epoch": 5.059643971370893, "grad_norm": 0.21671460568904877, "learning_rate": 9.303949795504755e-06, "loss": 0.0101, "num_input_tokens_seen": 59486768, "step": 27570 }, { "epoch": 5.060561570930446, "grad_norm": 0.5297439694404602, "learning_rate": 9.303542186906724e-06, "loss": 0.1584, "num_input_tokens_seen": 59498256, "step": 27575 }, { "epoch": 5.061479170489998, "grad_norm": 2.625704765319824, "learning_rate": 9.303134467929508e-06, "loss": 0.1806, "num_input_tokens_seen": 59507792, "step": 27580 }, { "epoch": 5.06239677004955, "grad_norm": 0.028638890013098717, "learning_rate": 9.302726638583563e-06, "loss": 0.0056, "num_input_tokens_seen": 59519280, "step": 27585 }, { "epoch": 5.0633143696091025, "grad_norm": 36.2432861328125, "learning_rate": 9.302318698879346e-06, "loss": 0.2513, "num_input_tokens_seen": 59530544, "step": 27590 }, { "epoch": 5.064231969168655, "grad_norm": 0.04168487712740898, "learning_rate": 9.301910648827325e-06, "loss": 0.0716, "num_input_tokens_seen": 59540240, "step": 27595 }, { "epoch": 5.065149568728207, "grad_norm": 16.218353271484375, "learning_rate": 9.30150248843796e-06, "loss": 0.1042, "num_input_tokens_seen": 59551344, "step": 27600 }, { "epoch": 5.066067168287759, "grad_norm": 0.09676475822925568, "learning_rate": 9.301094217721727e-06, "loss": 0.3525, "num_input_tokens_seen": 59562736, "step": 27605 }, { "epoch": 5.066984767847312, "grad_norm": 0.14166924357414246, "learning_rate": 9.30068583668909e-06, "loss": 0.0029, "num_input_tokens_seen": 59573392, "step": 27610 }, { "epoch": 5.067902367406863, "grad_norm": 0.5301834344863892, "learning_rate": 9.300277345350528e-06, "loss": 0.0048, "num_input_tokens_seen": 59583472, "step": 27615 }, { "epoch": 5.068819966966416, "grad_norm": 0.04400104284286499, "learning_rate": 9.299868743716518e-06, "loss": 0.1724, "num_input_tokens_seen": 59593776, "step": 27620 }, { "epoch": 5.069737566525968, "grad_norm": 5.305747032165527, "learning_rate": 9.299460031797537e-06, "loss": 0.1581, "num_input_tokens_seen": 59603504, "step": 27625 }, { "epoch": 5.07065516608552, "grad_norm": 0.155680850148201, "learning_rate": 9.29905120960407e-06, "loss": 0.224, "num_input_tokens_seen": 59615408, "step": 27630 }, { "epoch": 5.071572765645072, "grad_norm": 1.1958401203155518, "learning_rate": 9.298642277146603e-06, "loss": 0.0832, "num_input_tokens_seen": 59626704, "step": 27635 }, { "epoch": 5.072490365204625, "grad_norm": 0.1768520027399063, "learning_rate": 9.298233234435625e-06, "loss": 0.1236, "num_input_tokens_seen": 59639152, "step": 27640 }, { "epoch": 5.0734079647641765, "grad_norm": 0.6855888962745667, "learning_rate": 9.297824081481625e-06, "loss": 0.1106, "num_input_tokens_seen": 59650064, "step": 27645 }, { "epoch": 5.074325564323729, "grad_norm": 0.03951036185026169, "learning_rate": 9.297414818295098e-06, "loss": 0.1036, "num_input_tokens_seen": 59660560, "step": 27650 }, { "epoch": 5.075243163883282, "grad_norm": 0.05984797328710556, "learning_rate": 9.297005444886542e-06, "loss": 0.1453, "num_input_tokens_seen": 59671280, "step": 27655 }, { "epoch": 5.076160763442833, "grad_norm": 0.08192166686058044, "learning_rate": 9.296595961266456e-06, "loss": 0.1294, "num_input_tokens_seen": 59681456, "step": 27660 }, { "epoch": 5.077078363002386, "grad_norm": 0.031382832676172256, "learning_rate": 9.296186367445343e-06, "loss": 0.1045, "num_input_tokens_seen": 59693232, "step": 27665 }, { "epoch": 5.077995962561938, "grad_norm": 0.042051609605550766, "learning_rate": 9.295776663433707e-06, "loss": 0.3376, "num_input_tokens_seen": 59703376, "step": 27670 }, { "epoch": 5.07891356212149, "grad_norm": 0.04780379310250282, "learning_rate": 9.295366849242058e-06, "loss": 0.2048, "num_input_tokens_seen": 59714512, "step": 27675 }, { "epoch": 5.079831161681042, "grad_norm": 0.08932933211326599, "learning_rate": 9.294956924880907e-06, "loss": 0.1019, "num_input_tokens_seen": 59724816, "step": 27680 }, { "epoch": 5.080748761240595, "grad_norm": 15.230380058288574, "learning_rate": 9.294546890360768e-06, "loss": 0.0951, "num_input_tokens_seen": 59736048, "step": 27685 }, { "epoch": 5.081666360800146, "grad_norm": 0.03974435478448868, "learning_rate": 9.294136745692155e-06, "loss": 0.0042, "num_input_tokens_seen": 59747280, "step": 27690 }, { "epoch": 5.082583960359699, "grad_norm": 0.03064950928092003, "learning_rate": 9.293726490885591e-06, "loss": 0.0573, "num_input_tokens_seen": 59757712, "step": 27695 }, { "epoch": 5.0835015599192515, "grad_norm": 56.75859451293945, "learning_rate": 9.293316125951597e-06, "loss": 0.205, "num_input_tokens_seen": 59769232, "step": 27700 }, { "epoch": 5.084419159478803, "grad_norm": 4.958972454071045, "learning_rate": 9.292905650900699e-06, "loss": 0.2135, "num_input_tokens_seen": 59780464, "step": 27705 }, { "epoch": 5.085336759038356, "grad_norm": 0.02748049609363079, "learning_rate": 9.292495065743424e-06, "loss": 0.08, "num_input_tokens_seen": 59791504, "step": 27710 }, { "epoch": 5.086254358597908, "grad_norm": 0.16937300562858582, "learning_rate": 9.292084370490304e-06, "loss": 0.0033, "num_input_tokens_seen": 59800688, "step": 27715 }, { "epoch": 5.08717195815746, "grad_norm": 87.15380859375, "learning_rate": 9.29167356515187e-06, "loss": 0.3003, "num_input_tokens_seen": 59810896, "step": 27720 }, { "epoch": 5.088089557717012, "grad_norm": 130.35166931152344, "learning_rate": 9.291262649738663e-06, "loss": 0.0826, "num_input_tokens_seen": 59821840, "step": 27725 }, { "epoch": 5.089007157276565, "grad_norm": 0.08546663820743561, "learning_rate": 9.29085162426122e-06, "loss": 0.0019, "num_input_tokens_seen": 59832144, "step": 27730 }, { "epoch": 5.089924756836116, "grad_norm": 0.2959466576576233, "learning_rate": 9.29044048873008e-06, "loss": 0.0047, "num_input_tokens_seen": 59842768, "step": 27735 }, { "epoch": 5.090842356395669, "grad_norm": 0.08712073415517807, "learning_rate": 9.290029243155793e-06, "loss": 0.0492, "num_input_tokens_seen": 59853872, "step": 27740 }, { "epoch": 5.091759955955221, "grad_norm": 0.3174402415752411, "learning_rate": 9.289617887548905e-06, "loss": 0.1366, "num_input_tokens_seen": 59865776, "step": 27745 }, { "epoch": 5.092677555514773, "grad_norm": 0.08252664655447006, "learning_rate": 9.289206421919966e-06, "loss": 0.1849, "num_input_tokens_seen": 59876784, "step": 27750 }, { "epoch": 5.0935951550743255, "grad_norm": 0.022872472181916237, "learning_rate": 9.28879484627953e-06, "loss": 0.0041, "num_input_tokens_seen": 59886064, "step": 27755 }, { "epoch": 5.094512754633878, "grad_norm": 6.375261306762695, "learning_rate": 9.288383160638154e-06, "loss": 0.4391, "num_input_tokens_seen": 59896784, "step": 27760 }, { "epoch": 5.09543035419343, "grad_norm": 0.2003912776708603, "learning_rate": 9.287971365006396e-06, "loss": 0.0757, "num_input_tokens_seen": 59907792, "step": 27765 }, { "epoch": 5.096347953752982, "grad_norm": 8.806075096130371, "learning_rate": 9.287559459394818e-06, "loss": 0.183, "num_input_tokens_seen": 59918480, "step": 27770 }, { "epoch": 5.097265553312535, "grad_norm": 50.628318786621094, "learning_rate": 9.287147443813985e-06, "loss": 0.1669, "num_input_tokens_seen": 59929360, "step": 27775 }, { "epoch": 5.098183152872086, "grad_norm": 0.02608020044863224, "learning_rate": 9.286735318274464e-06, "loss": 0.0014, "num_input_tokens_seen": 59939792, "step": 27780 }, { "epoch": 5.099100752431639, "grad_norm": 0.6326534748077393, "learning_rate": 9.286323082786828e-06, "loss": 0.0096, "num_input_tokens_seen": 59950864, "step": 27785 }, { "epoch": 5.100018351991191, "grad_norm": 18.416942596435547, "learning_rate": 9.285910737361645e-06, "loss": 0.2328, "num_input_tokens_seen": 59961136, "step": 27790 }, { "epoch": 5.100935951550743, "grad_norm": 56.420310974121094, "learning_rate": 9.285498282009497e-06, "loss": 0.258, "num_input_tokens_seen": 59971920, "step": 27795 }, { "epoch": 5.101853551110295, "grad_norm": 0.1854235827922821, "learning_rate": 9.285085716740958e-06, "loss": 0.1121, "num_input_tokens_seen": 59982576, "step": 27800 }, { "epoch": 5.102771150669848, "grad_norm": 0.08218955993652344, "learning_rate": 9.284673041566613e-06, "loss": 0.0961, "num_input_tokens_seen": 59992976, "step": 27805 }, { "epoch": 5.1036887502293995, "grad_norm": 0.12151291966438293, "learning_rate": 9.284260256497044e-06, "loss": 0.0071, "num_input_tokens_seen": 60004624, "step": 27810 }, { "epoch": 5.104606349788952, "grad_norm": 0.3323342502117157, "learning_rate": 9.28384736154284e-06, "loss": 0.1468, "num_input_tokens_seen": 60015984, "step": 27815 }, { "epoch": 5.1055239493485045, "grad_norm": 0.04208502173423767, "learning_rate": 9.283434356714591e-06, "loss": 0.2195, "num_input_tokens_seen": 60028176, "step": 27820 }, { "epoch": 5.106441548908056, "grad_norm": 32.72900390625, "learning_rate": 9.28302124202289e-06, "loss": 0.2718, "num_input_tokens_seen": 60038320, "step": 27825 }, { "epoch": 5.107359148467609, "grad_norm": 15.701876640319824, "learning_rate": 9.28260801747833e-06, "loss": 0.1028, "num_input_tokens_seen": 60048336, "step": 27830 }, { "epoch": 5.108276748027161, "grad_norm": 0.029039807617664337, "learning_rate": 9.282194683091512e-06, "loss": 0.094, "num_input_tokens_seen": 60059888, "step": 27835 }, { "epoch": 5.109194347586713, "grad_norm": 5.646650314331055, "learning_rate": 9.281781238873038e-06, "loss": 0.2459, "num_input_tokens_seen": 60071056, "step": 27840 }, { "epoch": 5.110111947146265, "grad_norm": 0.0836092010140419, "learning_rate": 9.281367684833512e-06, "loss": 0.1142, "num_input_tokens_seen": 60082288, "step": 27845 }, { "epoch": 5.111029546705818, "grad_norm": 7.915777206420898, "learning_rate": 9.280954020983541e-06, "loss": 0.1992, "num_input_tokens_seen": 60092176, "step": 27850 }, { "epoch": 5.111947146265369, "grad_norm": 0.03653063625097275, "learning_rate": 9.280540247333732e-06, "loss": 0.0024, "num_input_tokens_seen": 60103696, "step": 27855 }, { "epoch": 5.112864745824922, "grad_norm": 0.2813667058944702, "learning_rate": 9.280126363894701e-06, "loss": 0.2294, "num_input_tokens_seen": 60114864, "step": 27860 }, { "epoch": 5.113782345384474, "grad_norm": 0.16016089916229248, "learning_rate": 9.279712370677062e-06, "loss": 0.1406, "num_input_tokens_seen": 60125520, "step": 27865 }, { "epoch": 5.114699944944026, "grad_norm": 7.8367462158203125, "learning_rate": 9.279298267691436e-06, "loss": 0.1014, "num_input_tokens_seen": 60136400, "step": 27870 }, { "epoch": 5.1156175445035785, "grad_norm": 0.08397334069013596, "learning_rate": 9.27888405494844e-06, "loss": 0.0114, "num_input_tokens_seen": 60147536, "step": 27875 }, { "epoch": 5.116535144063131, "grad_norm": 0.11452952027320862, "learning_rate": 9.2784697324587e-06, "loss": 0.0038, "num_input_tokens_seen": 60157680, "step": 27880 }, { "epoch": 5.117452743622683, "grad_norm": 11.693483352661133, "learning_rate": 9.27805530023284e-06, "loss": 0.2926, "num_input_tokens_seen": 60168944, "step": 27885 }, { "epoch": 5.118370343182235, "grad_norm": 11.25756549835205, "learning_rate": 9.277640758281494e-06, "loss": 0.1082, "num_input_tokens_seen": 60179824, "step": 27890 }, { "epoch": 5.119287942741788, "grad_norm": 1.1671329736709595, "learning_rate": 9.277226106615293e-06, "loss": 0.203, "num_input_tokens_seen": 60190960, "step": 27895 }, { "epoch": 5.120205542301339, "grad_norm": 0.8544614911079407, "learning_rate": 9.276811345244869e-06, "loss": 0.0055, "num_input_tokens_seen": 60202256, "step": 27900 }, { "epoch": 5.121123141860892, "grad_norm": 32.09827423095703, "learning_rate": 9.276396474180864e-06, "loss": 0.0664, "num_input_tokens_seen": 60211856, "step": 27905 }, { "epoch": 5.122040741420444, "grad_norm": 0.36921849846839905, "learning_rate": 9.27598149343392e-06, "loss": 0.0724, "num_input_tokens_seen": 60222352, "step": 27910 }, { "epoch": 5.122958340979996, "grad_norm": 17.57756233215332, "learning_rate": 9.275566403014673e-06, "loss": 0.1091, "num_input_tokens_seen": 60233584, "step": 27915 }, { "epoch": 5.123875940539548, "grad_norm": 8.393569946289062, "learning_rate": 9.275151202933776e-06, "loss": 0.1946, "num_input_tokens_seen": 60242320, "step": 27920 }, { "epoch": 5.124793540099101, "grad_norm": 0.34162119030952454, "learning_rate": 9.274735893201878e-06, "loss": 0.1117, "num_input_tokens_seen": 60253296, "step": 27925 }, { "epoch": 5.1257111396586525, "grad_norm": 7.109851360321045, "learning_rate": 9.274320473829628e-06, "loss": 0.3348, "num_input_tokens_seen": 60265456, "step": 27930 }, { "epoch": 5.126628739218205, "grad_norm": 0.08064606785774231, "learning_rate": 9.273904944827684e-06, "loss": 0.003, "num_input_tokens_seen": 60274512, "step": 27935 }, { "epoch": 5.127546338777758, "grad_norm": 17.962764739990234, "learning_rate": 9.2734893062067e-06, "loss": 0.0434, "num_input_tokens_seen": 60285680, "step": 27940 }, { "epoch": 5.128463938337309, "grad_norm": 0.07590044289827347, "learning_rate": 9.27307355797734e-06, "loss": 0.0854, "num_input_tokens_seen": 60296624, "step": 27945 }, { "epoch": 5.129381537896862, "grad_norm": 0.05124666169285774, "learning_rate": 9.272657700150264e-06, "loss": 0.0636, "num_input_tokens_seen": 60308336, "step": 27950 }, { "epoch": 5.130299137456414, "grad_norm": 0.091291643679142, "learning_rate": 9.272241732736144e-06, "loss": 0.0989, "num_input_tokens_seen": 60319696, "step": 27955 }, { "epoch": 5.131216737015966, "grad_norm": 0.35918325185775757, "learning_rate": 9.271825655745642e-06, "loss": 0.0117, "num_input_tokens_seen": 60330288, "step": 27960 }, { "epoch": 5.132134336575518, "grad_norm": 0.07237672060728073, "learning_rate": 9.271409469189432e-06, "loss": 0.0059, "num_input_tokens_seen": 60340848, "step": 27965 }, { "epoch": 5.133051936135071, "grad_norm": 0.06213391199707985, "learning_rate": 9.270993173078192e-06, "loss": 0.1222, "num_input_tokens_seen": 60352560, "step": 27970 }, { "epoch": 5.133969535694622, "grad_norm": 34.79912185668945, "learning_rate": 9.270576767422594e-06, "loss": 0.232, "num_input_tokens_seen": 60361680, "step": 27975 }, { "epoch": 5.134887135254175, "grad_norm": 4.99588680267334, "learning_rate": 9.270160252233322e-06, "loss": 0.0842, "num_input_tokens_seen": 60372144, "step": 27980 }, { "epoch": 5.1358047348137275, "grad_norm": 5.744509220123291, "learning_rate": 9.269743627521057e-06, "loss": 0.1991, "num_input_tokens_seen": 60383536, "step": 27985 }, { "epoch": 5.136722334373279, "grad_norm": 0.024289147928357124, "learning_rate": 9.269326893296486e-06, "loss": 0.0027, "num_input_tokens_seen": 60394416, "step": 27990 }, { "epoch": 5.137639933932832, "grad_norm": 0.12924644351005554, "learning_rate": 9.268910049570297e-06, "loss": 0.0051, "num_input_tokens_seen": 60405904, "step": 27995 }, { "epoch": 5.138557533492384, "grad_norm": 0.08445384353399277, "learning_rate": 9.268493096353181e-06, "loss": 0.0029, "num_input_tokens_seen": 60418064, "step": 28000 }, { "epoch": 5.139475133051936, "grad_norm": 0.05545461177825928, "learning_rate": 9.268076033655832e-06, "loss": 0.1428, "num_input_tokens_seen": 60429776, "step": 28005 }, { "epoch": 5.140392732611488, "grad_norm": 0.07844256609678268, "learning_rate": 9.26765886148895e-06, "loss": 0.002, "num_input_tokens_seen": 60440624, "step": 28010 }, { "epoch": 5.141310332171041, "grad_norm": 0.005533692892640829, "learning_rate": 9.267241579863232e-06, "loss": 0.0045, "num_input_tokens_seen": 60450256, "step": 28015 }, { "epoch": 5.142227931730593, "grad_norm": 0.21385011076927185, "learning_rate": 9.266824188789378e-06, "loss": 0.0036, "num_input_tokens_seen": 60461488, "step": 28020 }, { "epoch": 5.143145531290145, "grad_norm": 0.07662972807884216, "learning_rate": 9.2664066882781e-06, "loss": 0.0014, "num_input_tokens_seen": 60471440, "step": 28025 }, { "epoch": 5.144063130849697, "grad_norm": 0.15014424920082092, "learning_rate": 9.265989078340101e-06, "loss": 0.0442, "num_input_tokens_seen": 60482672, "step": 28030 }, { "epoch": 5.14498073040925, "grad_norm": 8.963907241821289, "learning_rate": 9.265571358986094e-06, "loss": 0.0778, "num_input_tokens_seen": 60493424, "step": 28035 }, { "epoch": 5.1458983299688015, "grad_norm": 0.04887004941701889, "learning_rate": 9.265153530226794e-06, "loss": 0.0021, "num_input_tokens_seen": 60504400, "step": 28040 }, { "epoch": 5.146815929528354, "grad_norm": 0.019645048305392265, "learning_rate": 9.264735592072915e-06, "loss": 0.0062, "num_input_tokens_seen": 60515536, "step": 28045 }, { "epoch": 5.1477335290879065, "grad_norm": 0.022910749539732933, "learning_rate": 9.264317544535178e-06, "loss": 0.0656, "num_input_tokens_seen": 60525488, "step": 28050 }, { "epoch": 5.148651128647458, "grad_norm": 0.015176993794739246, "learning_rate": 9.263899387624305e-06, "loss": 0.0133, "num_input_tokens_seen": 60538512, "step": 28055 }, { "epoch": 5.149568728207011, "grad_norm": 0.033320799469947815, "learning_rate": 9.26348112135102e-06, "loss": 0.1944, "num_input_tokens_seen": 60549072, "step": 28060 }, { "epoch": 5.150486327766563, "grad_norm": 0.12809622287750244, "learning_rate": 9.263062745726054e-06, "loss": 0.271, "num_input_tokens_seen": 60560272, "step": 28065 }, { "epoch": 5.151403927326115, "grad_norm": 0.12572315335273743, "learning_rate": 9.262644260760137e-06, "loss": 0.0019, "num_input_tokens_seen": 60570480, "step": 28070 }, { "epoch": 5.152321526885667, "grad_norm": 0.022938691079616547, "learning_rate": 9.262225666463998e-06, "loss": 0.0747, "num_input_tokens_seen": 60582928, "step": 28075 }, { "epoch": 5.15323912644522, "grad_norm": 19.68353271484375, "learning_rate": 9.261806962848379e-06, "loss": 0.3537, "num_input_tokens_seen": 60594704, "step": 28080 }, { "epoch": 5.154156726004771, "grad_norm": 0.1852390021085739, "learning_rate": 9.261388149924015e-06, "loss": 0.0011, "num_input_tokens_seen": 60604464, "step": 28085 }, { "epoch": 5.155074325564324, "grad_norm": 0.009860371239483356, "learning_rate": 9.26096922770165e-06, "loss": 0.1416, "num_input_tokens_seen": 60615056, "step": 28090 }, { "epoch": 5.155991925123876, "grad_norm": 3.5425312519073486, "learning_rate": 9.260550196192027e-06, "loss": 0.1593, "num_input_tokens_seen": 60625776, "step": 28095 }, { "epoch": 5.156909524683428, "grad_norm": 0.030124539509415627, "learning_rate": 9.260131055405897e-06, "loss": 0.002, "num_input_tokens_seen": 60636944, "step": 28100 }, { "epoch": 5.1578271242429805, "grad_norm": 0.04260767996311188, "learning_rate": 9.259711805354006e-06, "loss": 0.0017, "num_input_tokens_seen": 60647536, "step": 28105 }, { "epoch": 5.158744723802533, "grad_norm": 0.02388264611363411, "learning_rate": 9.25929244604711e-06, "loss": 0.0022, "num_input_tokens_seen": 60658736, "step": 28110 }, { "epoch": 5.159662323362085, "grad_norm": 0.022367751225829124, "learning_rate": 9.258872977495964e-06, "loss": 0.1966, "num_input_tokens_seen": 60668848, "step": 28115 }, { "epoch": 5.160579922921637, "grad_norm": 0.15118145942687988, "learning_rate": 9.258453399711327e-06, "loss": 0.1111, "num_input_tokens_seen": 60679184, "step": 28120 }, { "epoch": 5.16149752248119, "grad_norm": 0.1342422366142273, "learning_rate": 9.25803371270396e-06, "loss": 0.241, "num_input_tokens_seen": 60690160, "step": 28125 }, { "epoch": 5.162415122040741, "grad_norm": 12.45328140258789, "learning_rate": 9.257613916484628e-06, "loss": 0.1969, "num_input_tokens_seen": 60700880, "step": 28130 }, { "epoch": 5.163332721600294, "grad_norm": 0.36346590518951416, "learning_rate": 9.257194011064097e-06, "loss": 0.0699, "num_input_tokens_seen": 60710768, "step": 28135 }, { "epoch": 5.164250321159846, "grad_norm": 0.6913465857505798, "learning_rate": 9.256773996453139e-06, "loss": 0.0075, "num_input_tokens_seen": 60720720, "step": 28140 }, { "epoch": 5.165167920719398, "grad_norm": 9.192731857299805, "learning_rate": 9.256353872662524e-06, "loss": 0.1359, "num_input_tokens_seen": 60731312, "step": 28145 }, { "epoch": 5.16608552027895, "grad_norm": 0.048382263630628586, "learning_rate": 9.25593363970303e-06, "loss": 0.0932, "num_input_tokens_seen": 60741968, "step": 28150 }, { "epoch": 5.167003119838503, "grad_norm": 1.6545023918151855, "learning_rate": 9.255513297585434e-06, "loss": 0.0053, "num_input_tokens_seen": 60752112, "step": 28155 }, { "epoch": 5.1679207193980545, "grad_norm": 0.016795113682746887, "learning_rate": 9.255092846320517e-06, "loss": 0.0024, "num_input_tokens_seen": 60762800, "step": 28160 }, { "epoch": 5.168838318957607, "grad_norm": 0.02848973497748375, "learning_rate": 9.254672285919064e-06, "loss": 0.2212, "num_input_tokens_seen": 60773392, "step": 28165 }, { "epoch": 5.1697559185171595, "grad_norm": 5.837393760681152, "learning_rate": 9.25425161639186e-06, "loss": 0.2046, "num_input_tokens_seen": 60785104, "step": 28170 }, { "epoch": 5.170673518076711, "grad_norm": 0.015068771317601204, "learning_rate": 9.253830837749695e-06, "loss": 0.2622, "num_input_tokens_seen": 60795760, "step": 28175 }, { "epoch": 5.171591117636264, "grad_norm": 0.17194931209087372, "learning_rate": 9.253409950003363e-06, "loss": 0.0028, "num_input_tokens_seen": 60806320, "step": 28180 }, { "epoch": 5.172508717195816, "grad_norm": 0.6274782419204712, "learning_rate": 9.252988953163658e-06, "loss": 0.123, "num_input_tokens_seen": 60817584, "step": 28185 }, { "epoch": 5.173426316755368, "grad_norm": 0.08967345207929611, "learning_rate": 9.252567847241378e-06, "loss": 0.129, "num_input_tokens_seen": 60828528, "step": 28190 }, { "epoch": 5.17434391631492, "grad_norm": 25.9775390625, "learning_rate": 9.252146632247323e-06, "loss": 0.1053, "num_input_tokens_seen": 60839184, "step": 28195 }, { "epoch": 5.175261515874473, "grad_norm": 0.10846471786499023, "learning_rate": 9.251725308192299e-06, "loss": 0.1589, "num_input_tokens_seen": 60850960, "step": 28200 }, { "epoch": 5.176179115434024, "grad_norm": 0.017174743115901947, "learning_rate": 9.251303875087108e-06, "loss": 0.1397, "num_input_tokens_seen": 60861904, "step": 28205 }, { "epoch": 5.177096714993577, "grad_norm": 42.42960739135742, "learning_rate": 9.250882332942562e-06, "loss": 0.2595, "num_input_tokens_seen": 60873008, "step": 28210 }, { "epoch": 5.178014314553129, "grad_norm": 6.924376010894775, "learning_rate": 9.250460681769473e-06, "loss": 0.0908, "num_input_tokens_seen": 60883088, "step": 28215 }, { "epoch": 5.178931914112681, "grad_norm": 0.11356101930141449, "learning_rate": 9.250038921578655e-06, "loss": 0.0834, "num_input_tokens_seen": 60893712, "step": 28220 }, { "epoch": 5.179849513672234, "grad_norm": 0.0502614751458168, "learning_rate": 9.249617052380926e-06, "loss": 0.0051, "num_input_tokens_seen": 60904720, "step": 28225 }, { "epoch": 5.180767113231786, "grad_norm": 14.034026145935059, "learning_rate": 9.249195074187105e-06, "loss": 0.2226, "num_input_tokens_seen": 60916400, "step": 28230 }, { "epoch": 5.181684712791338, "grad_norm": 0.01982760988175869, "learning_rate": 9.248772987008015e-06, "loss": 0.23, "num_input_tokens_seen": 60927632, "step": 28235 }, { "epoch": 5.18260231235089, "grad_norm": 0.33719250559806824, "learning_rate": 9.248350790854486e-06, "loss": 0.0066, "num_input_tokens_seen": 60938320, "step": 28240 }, { "epoch": 5.183519911910443, "grad_norm": 153.67782592773438, "learning_rate": 9.24792848573734e-06, "loss": 0.1371, "num_input_tokens_seen": 60949552, "step": 28245 }, { "epoch": 5.184437511469994, "grad_norm": 5.89522647857666, "learning_rate": 9.247506071667416e-06, "loss": 0.1307, "num_input_tokens_seen": 60960208, "step": 28250 }, { "epoch": 5.185355111029547, "grad_norm": 32.4122200012207, "learning_rate": 9.247083548655542e-06, "loss": 0.0522, "num_input_tokens_seen": 60971568, "step": 28255 }, { "epoch": 5.186272710589099, "grad_norm": 0.27495449781417847, "learning_rate": 9.246660916712557e-06, "loss": 0.0031, "num_input_tokens_seen": 60981808, "step": 28260 }, { "epoch": 5.187190310148651, "grad_norm": 18.087900161743164, "learning_rate": 9.246238175849302e-06, "loss": 0.3082, "num_input_tokens_seen": 60991728, "step": 28265 }, { "epoch": 5.1881079097082035, "grad_norm": 10.770936965942383, "learning_rate": 9.245815326076619e-06, "loss": 0.1875, "num_input_tokens_seen": 61002128, "step": 28270 }, { "epoch": 5.189025509267756, "grad_norm": 9.527936935424805, "learning_rate": 9.245392367405353e-06, "loss": 0.2207, "num_input_tokens_seen": 61014736, "step": 28275 }, { "epoch": 5.189943108827308, "grad_norm": 4.8698225021362305, "learning_rate": 9.244969299846352e-06, "loss": 0.0657, "num_input_tokens_seen": 61025936, "step": 28280 }, { "epoch": 5.19086070838686, "grad_norm": 0.19016440212726593, "learning_rate": 9.244546123410468e-06, "loss": 0.1176, "num_input_tokens_seen": 61035920, "step": 28285 }, { "epoch": 5.191778307946413, "grad_norm": 0.046827077865600586, "learning_rate": 9.244122838108554e-06, "loss": 0.0648, "num_input_tokens_seen": 61045296, "step": 28290 }, { "epoch": 5.192695907505964, "grad_norm": 0.05073472857475281, "learning_rate": 9.243699443951469e-06, "loss": 0.0253, "num_input_tokens_seen": 61055856, "step": 28295 }, { "epoch": 5.193613507065517, "grad_norm": 0.03121901862323284, "learning_rate": 9.243275940950067e-06, "loss": 0.0076, "num_input_tokens_seen": 61067376, "step": 28300 }, { "epoch": 5.194531106625069, "grad_norm": 4.4184770584106445, "learning_rate": 9.242852329115215e-06, "loss": 0.1623, "num_input_tokens_seen": 61079056, "step": 28305 }, { "epoch": 5.195448706184621, "grad_norm": 0.027857402339577675, "learning_rate": 9.24242860845778e-06, "loss": 0.1192, "num_input_tokens_seen": 61090448, "step": 28310 }, { "epoch": 5.196366305744173, "grad_norm": 0.9388145804405212, "learning_rate": 9.242004778988622e-06, "loss": 0.0411, "num_input_tokens_seen": 61101264, "step": 28315 }, { "epoch": 5.197283905303726, "grad_norm": 3.2447593212127686, "learning_rate": 9.241580840718617e-06, "loss": 0.2237, "num_input_tokens_seen": 61111888, "step": 28320 }, { "epoch": 5.1982015048632775, "grad_norm": 64.76506042480469, "learning_rate": 9.241156793658638e-06, "loss": 0.135, "num_input_tokens_seen": 61122352, "step": 28325 }, { "epoch": 5.19911910442283, "grad_norm": 0.10569998621940613, "learning_rate": 9.240732637819559e-06, "loss": 0.0413, "num_input_tokens_seen": 61133232, "step": 28330 }, { "epoch": 5.2000367039823825, "grad_norm": 1.8902831077575684, "learning_rate": 9.240308373212261e-06, "loss": 0.0062, "num_input_tokens_seen": 61144016, "step": 28335 }, { "epoch": 5.200954303541934, "grad_norm": 17.692211151123047, "learning_rate": 9.239883999847626e-06, "loss": 0.0984, "num_input_tokens_seen": 61154960, "step": 28340 }, { "epoch": 5.201871903101487, "grad_norm": 0.1785038709640503, "learning_rate": 9.239459517736537e-06, "loss": 0.0071, "num_input_tokens_seen": 61167088, "step": 28345 }, { "epoch": 5.202789502661039, "grad_norm": 13.35816764831543, "learning_rate": 9.239034926889882e-06, "loss": 0.0955, "num_input_tokens_seen": 61177232, "step": 28350 }, { "epoch": 5.203707102220591, "grad_norm": 0.10894611477851868, "learning_rate": 9.238610227318551e-06, "loss": 0.0022, "num_input_tokens_seen": 61188304, "step": 28355 }, { "epoch": 5.204624701780143, "grad_norm": 0.06046316400170326, "learning_rate": 9.238185419033438e-06, "loss": 0.1892, "num_input_tokens_seen": 61197744, "step": 28360 }, { "epoch": 5.205542301339696, "grad_norm": 0.03395800665020943, "learning_rate": 9.237760502045436e-06, "loss": 0.2546, "num_input_tokens_seen": 61208656, "step": 28365 }, { "epoch": 5.206459900899247, "grad_norm": 5.261996269226074, "learning_rate": 9.237335476365447e-06, "loss": 0.0118, "num_input_tokens_seen": 61220592, "step": 28370 }, { "epoch": 5.2073775004588, "grad_norm": 0.029702093452215195, "learning_rate": 9.236910342004367e-06, "loss": 0.2534, "num_input_tokens_seen": 61231120, "step": 28375 }, { "epoch": 5.208295100018352, "grad_norm": 0.04355503246188164, "learning_rate": 9.236485098973107e-06, "loss": 0.1461, "num_input_tokens_seen": 61241200, "step": 28380 }, { "epoch": 5.209212699577904, "grad_norm": 0.16844870150089264, "learning_rate": 9.236059747282569e-06, "loss": 0.0036, "num_input_tokens_seen": 61252624, "step": 28385 }, { "epoch": 5.2101302991374565, "grad_norm": 16.314804077148438, "learning_rate": 9.235634286943663e-06, "loss": 0.2029, "num_input_tokens_seen": 61262032, "step": 28390 }, { "epoch": 5.211047898697009, "grad_norm": 6.316625118255615, "learning_rate": 9.235208717967301e-06, "loss": 0.1279, "num_input_tokens_seen": 61272144, "step": 28395 }, { "epoch": 5.211965498256561, "grad_norm": 0.16453880071640015, "learning_rate": 9.234783040364402e-06, "loss": 0.2325, "num_input_tokens_seen": 61284048, "step": 28400 }, { "epoch": 5.212883097816113, "grad_norm": 16.351829528808594, "learning_rate": 9.23435725414588e-06, "loss": 0.1698, "num_input_tokens_seen": 61295184, "step": 28405 }, { "epoch": 5.213800697375666, "grad_norm": 1.30169677734375, "learning_rate": 9.233931359322658e-06, "loss": 0.0033, "num_input_tokens_seen": 61307312, "step": 28410 }, { "epoch": 5.214718296935217, "grad_norm": 0.061293844133615494, "learning_rate": 9.233505355905658e-06, "loss": 0.1441, "num_input_tokens_seen": 61318800, "step": 28415 }, { "epoch": 5.21563589649477, "grad_norm": 2.9851841926574707, "learning_rate": 9.233079243905806e-06, "loss": 0.007, "num_input_tokens_seen": 61328752, "step": 28420 }, { "epoch": 5.216553496054322, "grad_norm": 0.27693861722946167, "learning_rate": 9.232653023334033e-06, "loss": 0.0037, "num_input_tokens_seen": 61339856, "step": 28425 }, { "epoch": 5.217471095613874, "grad_norm": 0.11065651476383209, "learning_rate": 9.23222669420127e-06, "loss": 0.002, "num_input_tokens_seen": 61350800, "step": 28430 }, { "epoch": 5.218388695173426, "grad_norm": 35.393890380859375, "learning_rate": 9.231800256518451e-06, "loss": 0.1541, "num_input_tokens_seen": 61361680, "step": 28435 }, { "epoch": 5.219306294732979, "grad_norm": 0.08782190829515457, "learning_rate": 9.231373710296516e-06, "loss": 0.1361, "num_input_tokens_seen": 61371888, "step": 28440 }, { "epoch": 5.2202238942925305, "grad_norm": 0.020471271127462387, "learning_rate": 9.230947055546402e-06, "loss": 0.0035, "num_input_tokens_seen": 61382800, "step": 28445 }, { "epoch": 5.221141493852083, "grad_norm": 7.109040260314941, "learning_rate": 9.230520292279053e-06, "loss": 0.17, "num_input_tokens_seen": 61394352, "step": 28450 }, { "epoch": 5.2220590934116355, "grad_norm": 0.14028093218803406, "learning_rate": 9.230093420505415e-06, "loss": 0.2011, "num_input_tokens_seen": 61404624, "step": 28455 }, { "epoch": 5.222976692971187, "grad_norm": 0.20226795971393585, "learning_rate": 9.229666440236438e-06, "loss": 0.0908, "num_input_tokens_seen": 61415984, "step": 28460 }, { "epoch": 5.22389429253074, "grad_norm": 0.03592048957943916, "learning_rate": 9.229239351483072e-06, "loss": 0.0136, "num_input_tokens_seen": 61427120, "step": 28465 }, { "epoch": 5.224811892090292, "grad_norm": 0.18044857680797577, "learning_rate": 9.22881215425627e-06, "loss": 0.143, "num_input_tokens_seen": 61437264, "step": 28470 }, { "epoch": 5.225729491649844, "grad_norm": 4.7750630378723145, "learning_rate": 9.22838484856699e-06, "loss": 0.3121, "num_input_tokens_seen": 61448080, "step": 28475 }, { "epoch": 5.226647091209396, "grad_norm": 0.08600176125764847, "learning_rate": 9.227957434426196e-06, "loss": 0.0015, "num_input_tokens_seen": 61459056, "step": 28480 }, { "epoch": 5.227564690768949, "grad_norm": 0.33785557746887207, "learning_rate": 9.227529911844844e-06, "loss": 0.3295, "num_input_tokens_seen": 61469552, "step": 28485 }, { "epoch": 5.2284822903285, "grad_norm": 0.09173158556222916, "learning_rate": 9.227102280833901e-06, "loss": 0.357, "num_input_tokens_seen": 61480432, "step": 28490 }, { "epoch": 5.229399889888053, "grad_norm": 0.0584137849509716, "learning_rate": 9.226674541404337e-06, "loss": 0.0272, "num_input_tokens_seen": 61490896, "step": 28495 }, { "epoch": 5.230317489447605, "grad_norm": 10.455331802368164, "learning_rate": 9.22624669356712e-06, "loss": 0.1076, "num_input_tokens_seen": 61501872, "step": 28500 }, { "epoch": 5.231235089007157, "grad_norm": 0.5511296391487122, "learning_rate": 9.225818737333227e-06, "loss": 0.1334, "num_input_tokens_seen": 61512912, "step": 28505 }, { "epoch": 5.23215268856671, "grad_norm": 0.042560264468193054, "learning_rate": 9.225390672713635e-06, "loss": 0.0034, "num_input_tokens_seen": 61522800, "step": 28510 }, { "epoch": 5.233070288126262, "grad_norm": 10.999274253845215, "learning_rate": 9.224962499719317e-06, "loss": 0.1734, "num_input_tokens_seen": 61533136, "step": 28515 }, { "epoch": 5.233987887685814, "grad_norm": 68.65274047851562, "learning_rate": 9.224534218361261e-06, "loss": 0.2075, "num_input_tokens_seen": 61542672, "step": 28520 }, { "epoch": 5.234905487245366, "grad_norm": 0.376729816198349, "learning_rate": 9.22410582865045e-06, "loss": 0.1485, "num_input_tokens_seen": 61551888, "step": 28525 }, { "epoch": 5.235823086804919, "grad_norm": 1.5450023412704468, "learning_rate": 9.22367733059787e-06, "loss": 0.0441, "num_input_tokens_seen": 61563120, "step": 28530 }, { "epoch": 5.23674068636447, "grad_norm": 0.16305118799209595, "learning_rate": 9.223248724214513e-06, "loss": 0.0039, "num_input_tokens_seen": 61573136, "step": 28535 }, { "epoch": 5.237658285924023, "grad_norm": 0.16813890635967255, "learning_rate": 9.222820009511373e-06, "loss": 0.0055, "num_input_tokens_seen": 61584784, "step": 28540 }, { "epoch": 5.238575885483575, "grad_norm": 0.0974215641617775, "learning_rate": 9.222391186499442e-06, "loss": 0.0061, "num_input_tokens_seen": 61594896, "step": 28545 }, { "epoch": 5.239493485043127, "grad_norm": 0.0431380458176136, "learning_rate": 9.221962255189723e-06, "loss": 0.0014, "num_input_tokens_seen": 61606128, "step": 28550 }, { "epoch": 5.2404110846026795, "grad_norm": 0.052486225962638855, "learning_rate": 9.221533215593214e-06, "loss": 0.2213, "num_input_tokens_seen": 61616560, "step": 28555 }, { "epoch": 5.241328684162232, "grad_norm": 0.04349887743592262, "learning_rate": 9.221104067720923e-06, "loss": 0.479, "num_input_tokens_seen": 61628208, "step": 28560 }, { "epoch": 5.242246283721784, "grad_norm": 1.282472848892212, "learning_rate": 9.220674811583855e-06, "loss": 0.188, "num_input_tokens_seen": 61639888, "step": 28565 }, { "epoch": 5.243163883281336, "grad_norm": 1.3605989217758179, "learning_rate": 9.220245447193016e-06, "loss": 0.0055, "num_input_tokens_seen": 61651024, "step": 28570 }, { "epoch": 5.244081482840889, "grad_norm": 0.044119708240032196, "learning_rate": 9.219815974559425e-06, "loss": 0.0038, "num_input_tokens_seen": 61661840, "step": 28575 }, { "epoch": 5.24499908240044, "grad_norm": 0.026950495317578316, "learning_rate": 9.219386393694094e-06, "loss": 0.2368, "num_input_tokens_seen": 61672336, "step": 28580 }, { "epoch": 5.245916681959993, "grad_norm": 0.1134415864944458, "learning_rate": 9.218956704608042e-06, "loss": 0.1048, "num_input_tokens_seen": 61682160, "step": 28585 }, { "epoch": 5.246834281519545, "grad_norm": 33.96921920776367, "learning_rate": 9.218526907312289e-06, "loss": 0.3272, "num_input_tokens_seen": 61693552, "step": 28590 }, { "epoch": 5.247751881079097, "grad_norm": 53.223304748535156, "learning_rate": 9.218097001817857e-06, "loss": 0.1215, "num_input_tokens_seen": 61705616, "step": 28595 }, { "epoch": 5.248669480638649, "grad_norm": 0.09248464554548264, "learning_rate": 9.217666988135777e-06, "loss": 0.1302, "num_input_tokens_seen": 61714992, "step": 28600 }, { "epoch": 5.249587080198202, "grad_norm": 129.27651977539062, "learning_rate": 9.217236866277072e-06, "loss": 0.1643, "num_input_tokens_seen": 61725104, "step": 28605 }, { "epoch": 5.2505046797577535, "grad_norm": 0.1299320012331009, "learning_rate": 9.216806636252782e-06, "loss": 0.2461, "num_input_tokens_seen": 61734768, "step": 28610 }, { "epoch": 5.251422279317306, "grad_norm": 0.16184638440608978, "learning_rate": 9.216376298073935e-06, "loss": 0.2033, "num_input_tokens_seen": 61744976, "step": 28615 }, { "epoch": 5.2523398788768585, "grad_norm": 1.1078901290893555, "learning_rate": 9.21594585175157e-06, "loss": 0.0042, "num_input_tokens_seen": 61756400, "step": 28620 }, { "epoch": 5.25325747843641, "grad_norm": 0.08340781927108765, "learning_rate": 9.21551529729673e-06, "loss": 0.0948, "num_input_tokens_seen": 61766704, "step": 28625 }, { "epoch": 5.254175077995963, "grad_norm": 0.2699241042137146, "learning_rate": 9.215084634720455e-06, "loss": 0.1364, "num_input_tokens_seen": 61777968, "step": 28630 }, { "epoch": 5.255092677555515, "grad_norm": 0.046515658497810364, "learning_rate": 9.214653864033791e-06, "loss": 0.1948, "num_input_tokens_seen": 61788400, "step": 28635 }, { "epoch": 5.256010277115067, "grad_norm": 0.1449914127588272, "learning_rate": 9.21422298524779e-06, "loss": 0.049, "num_input_tokens_seen": 61799792, "step": 28640 }, { "epoch": 5.256927876674619, "grad_norm": 0.733224630355835, "learning_rate": 9.213791998373498e-06, "loss": 0.0059, "num_input_tokens_seen": 61809744, "step": 28645 }, { "epoch": 5.257845476234172, "grad_norm": 21.62721061706543, "learning_rate": 9.213360903421973e-06, "loss": 0.1836, "num_input_tokens_seen": 61820784, "step": 28650 }, { "epoch": 5.258763075793723, "grad_norm": 0.04004598408937454, "learning_rate": 9.212929700404272e-06, "loss": 0.0023, "num_input_tokens_seen": 61830992, "step": 28655 }, { "epoch": 5.259680675353276, "grad_norm": 22.84467887878418, "learning_rate": 9.212498389331452e-06, "loss": 0.1691, "num_input_tokens_seen": 61841648, "step": 28660 }, { "epoch": 5.260598274912828, "grad_norm": 0.17581091821193695, "learning_rate": 9.212066970214579e-06, "loss": 0.0078, "num_input_tokens_seen": 61852080, "step": 28665 }, { "epoch": 5.26151587447238, "grad_norm": 0.057616230100393295, "learning_rate": 9.211635443064714e-06, "loss": 0.1272, "num_input_tokens_seen": 61864720, "step": 28670 }, { "epoch": 5.2624334740319325, "grad_norm": 0.03946683928370476, "learning_rate": 9.21120380789293e-06, "loss": 0.1774, "num_input_tokens_seen": 61875888, "step": 28675 }, { "epoch": 5.263351073591485, "grad_norm": 8.39029312133789, "learning_rate": 9.210772064710293e-06, "loss": 0.2341, "num_input_tokens_seen": 61886768, "step": 28680 }, { "epoch": 5.264268673151037, "grad_norm": 1.6753977537155151, "learning_rate": 9.21034021352788e-06, "loss": 0.0034, "num_input_tokens_seen": 61897840, "step": 28685 }, { "epoch": 5.265186272710589, "grad_norm": 0.12613144516944885, "learning_rate": 9.209908254356765e-06, "loss": 0.078, "num_input_tokens_seen": 61909328, "step": 28690 }, { "epoch": 5.266103872270142, "grad_norm": 51.43113327026367, "learning_rate": 9.209476187208029e-06, "loss": 0.1218, "num_input_tokens_seen": 61919952, "step": 28695 }, { "epoch": 5.267021471829693, "grad_norm": 0.14409928023815155, "learning_rate": 9.209044012092752e-06, "loss": 0.0017, "num_input_tokens_seen": 61930512, "step": 28700 }, { "epoch": 5.267939071389246, "grad_norm": 25.73630142211914, "learning_rate": 9.208611729022019e-06, "loss": 0.2705, "num_input_tokens_seen": 61939984, "step": 28705 }, { "epoch": 5.268856670948798, "grad_norm": 0.05697886273264885, "learning_rate": 9.208179338006919e-06, "loss": 0.1932, "num_input_tokens_seen": 61951312, "step": 28710 }, { "epoch": 5.26977427050835, "grad_norm": 0.019304655492305756, "learning_rate": 9.20774683905854e-06, "loss": 0.0391, "num_input_tokens_seen": 61962128, "step": 28715 }, { "epoch": 5.270691870067902, "grad_norm": 7.833621025085449, "learning_rate": 9.207314232187978e-06, "loss": 0.1898, "num_input_tokens_seen": 61974000, "step": 28720 }, { "epoch": 5.271609469627455, "grad_norm": 0.11892123520374298, "learning_rate": 9.206881517406325e-06, "loss": 0.0051, "num_input_tokens_seen": 61983088, "step": 28725 }, { "epoch": 5.2725270691870065, "grad_norm": 0.01877511478960514, "learning_rate": 9.206448694724682e-06, "loss": 0.1295, "num_input_tokens_seen": 61992304, "step": 28730 }, { "epoch": 5.273444668746559, "grad_norm": 0.03904787451028824, "learning_rate": 9.206015764154149e-06, "loss": 0.0028, "num_input_tokens_seen": 62002768, "step": 28735 }, { "epoch": 5.2743622683061115, "grad_norm": 15.62590217590332, "learning_rate": 9.205582725705831e-06, "loss": 0.0968, "num_input_tokens_seen": 62013424, "step": 28740 }, { "epoch": 5.275279867865663, "grad_norm": 0.06679652631282806, "learning_rate": 9.205149579390833e-06, "loss": 0.0036, "num_input_tokens_seen": 62025040, "step": 28745 }, { "epoch": 5.276197467425216, "grad_norm": 1.0630435943603516, "learning_rate": 9.204716325220266e-06, "loss": 0.1276, "num_input_tokens_seen": 62034864, "step": 28750 }, { "epoch": 5.277115066984768, "grad_norm": 0.14734092354774475, "learning_rate": 9.204282963205242e-06, "loss": 0.1271, "num_input_tokens_seen": 62046800, "step": 28755 }, { "epoch": 5.27803266654432, "grad_norm": 6.658567428588867, "learning_rate": 9.203849493356875e-06, "loss": 0.235, "num_input_tokens_seen": 62057360, "step": 28760 }, { "epoch": 5.278950266103872, "grad_norm": 22.572053909301758, "learning_rate": 9.203415915686287e-06, "loss": 0.3943, "num_input_tokens_seen": 62068048, "step": 28765 }, { "epoch": 5.279867865663425, "grad_norm": 26.91941261291504, "learning_rate": 9.202982230204594e-06, "loss": 0.0092, "num_input_tokens_seen": 62079248, "step": 28770 }, { "epoch": 5.280785465222976, "grad_norm": 44.05988693237305, "learning_rate": 9.20254843692292e-06, "loss": 0.1105, "num_input_tokens_seen": 62090096, "step": 28775 }, { "epoch": 5.281703064782529, "grad_norm": 0.019026529043912888, "learning_rate": 9.202114535852392e-06, "loss": 0.0195, "num_input_tokens_seen": 62101104, "step": 28780 }, { "epoch": 5.282620664342081, "grad_norm": 7.7994184494018555, "learning_rate": 9.201680527004139e-06, "loss": 0.5625, "num_input_tokens_seen": 62112176, "step": 28785 }, { "epoch": 5.283538263901633, "grad_norm": 0.1456713080406189, "learning_rate": 9.201246410389293e-06, "loss": 0.0102, "num_input_tokens_seen": 62123120, "step": 28790 }, { "epoch": 5.284455863461186, "grad_norm": 37.73915481567383, "learning_rate": 9.200812186018987e-06, "loss": 0.205, "num_input_tokens_seen": 62134224, "step": 28795 }, { "epoch": 5.285373463020738, "grad_norm": 6.44993257522583, "learning_rate": 9.20037785390436e-06, "loss": 0.3608, "num_input_tokens_seen": 62144624, "step": 28800 }, { "epoch": 5.28629106258029, "grad_norm": 6.465346813201904, "learning_rate": 9.19994341405655e-06, "loss": 0.1061, "num_input_tokens_seen": 62154320, "step": 28805 }, { "epoch": 5.287208662139842, "grad_norm": 13.335904121398926, "learning_rate": 9.199508866486701e-06, "loss": 0.0584, "num_input_tokens_seen": 62165008, "step": 28810 }, { "epoch": 5.288126261699395, "grad_norm": 0.1058557853102684, "learning_rate": 9.19907421120596e-06, "loss": 0.2437, "num_input_tokens_seen": 62176144, "step": 28815 }, { "epoch": 5.289043861258946, "grad_norm": 11.803646087646484, "learning_rate": 9.198639448225472e-06, "loss": 0.1379, "num_input_tokens_seen": 62188240, "step": 28820 }, { "epoch": 5.289961460818499, "grad_norm": 0.03168758749961853, "learning_rate": 9.198204577556388e-06, "loss": 0.0672, "num_input_tokens_seen": 62199408, "step": 28825 }, { "epoch": 5.290879060378051, "grad_norm": 0.5017791390419006, "learning_rate": 9.197769599209867e-06, "loss": 0.2205, "num_input_tokens_seen": 62209520, "step": 28830 }, { "epoch": 5.291796659937603, "grad_norm": 0.07213135063648224, "learning_rate": 9.19733451319706e-06, "loss": 0.003, "num_input_tokens_seen": 62220176, "step": 28835 }, { "epoch": 5.2927142594971555, "grad_norm": 0.0803065299987793, "learning_rate": 9.196899319529126e-06, "loss": 0.114, "num_input_tokens_seen": 62231664, "step": 28840 }, { "epoch": 5.293631859056708, "grad_norm": 2.4759697914123535, "learning_rate": 9.19646401821723e-06, "loss": 0.3896, "num_input_tokens_seen": 62242352, "step": 28845 }, { "epoch": 5.29454945861626, "grad_norm": 19.623451232910156, "learning_rate": 9.196028609272538e-06, "loss": 0.2567, "num_input_tokens_seen": 62253328, "step": 28850 }, { "epoch": 5.295467058175812, "grad_norm": 0.8014490604400635, "learning_rate": 9.195593092706214e-06, "loss": 0.084, "num_input_tokens_seen": 62263088, "step": 28855 }, { "epoch": 5.296384657735365, "grad_norm": 22.991518020629883, "learning_rate": 9.19515746852943e-06, "loss": 0.2009, "num_input_tokens_seen": 62273968, "step": 28860 }, { "epoch": 5.297302257294916, "grad_norm": 0.05609649419784546, "learning_rate": 9.194721736753358e-06, "loss": 0.3895, "num_input_tokens_seen": 62284720, "step": 28865 }, { "epoch": 5.298219856854469, "grad_norm": 12.523056983947754, "learning_rate": 9.194285897389175e-06, "loss": 0.2055, "num_input_tokens_seen": 62294928, "step": 28870 }, { "epoch": 5.299137456414021, "grad_norm": 10.245582580566406, "learning_rate": 9.19384995044806e-06, "loss": 0.1941, "num_input_tokens_seen": 62306960, "step": 28875 }, { "epoch": 5.300055055973573, "grad_norm": 0.09678089618682861, "learning_rate": 9.193413895941192e-06, "loss": 0.0048, "num_input_tokens_seen": 62318384, "step": 28880 }, { "epoch": 5.300972655533125, "grad_norm": 1.3580355644226074, "learning_rate": 9.192977733879758e-06, "loss": 0.3403, "num_input_tokens_seen": 62329936, "step": 28885 }, { "epoch": 5.301890255092678, "grad_norm": 0.04942060634493828, "learning_rate": 9.192541464274944e-06, "loss": 0.0291, "num_input_tokens_seen": 62340368, "step": 28890 }, { "epoch": 5.3028078546522295, "grad_norm": 0.054738208651542664, "learning_rate": 9.192105087137938e-06, "loss": 0.1884, "num_input_tokens_seen": 62348656, "step": 28895 }, { "epoch": 5.303725454211782, "grad_norm": 3.2033698558807373, "learning_rate": 9.191668602479935e-06, "loss": 0.006, "num_input_tokens_seen": 62359760, "step": 28900 }, { "epoch": 5.3046430537713345, "grad_norm": 1.1441305875778198, "learning_rate": 9.191232010312128e-06, "loss": 0.3211, "num_input_tokens_seen": 62370256, "step": 28905 }, { "epoch": 5.305560653330886, "grad_norm": 6.676259994506836, "learning_rate": 9.190795310645716e-06, "loss": 0.1344, "num_input_tokens_seen": 62380272, "step": 28910 }, { "epoch": 5.306478252890439, "grad_norm": 0.15168853104114532, "learning_rate": 9.190358503491901e-06, "loss": 0.0036, "num_input_tokens_seen": 62390256, "step": 28915 }, { "epoch": 5.307395852449991, "grad_norm": 8.311594009399414, "learning_rate": 9.189921588861883e-06, "loss": 0.2761, "num_input_tokens_seen": 62400944, "step": 28920 }, { "epoch": 5.308313452009543, "grad_norm": 0.035916686058044434, "learning_rate": 9.18948456676687e-06, "loss": 0.0029, "num_input_tokens_seen": 62412688, "step": 28925 }, { "epoch": 5.309231051569095, "grad_norm": 0.885026216506958, "learning_rate": 9.189047437218072e-06, "loss": 0.2229, "num_input_tokens_seen": 62423568, "step": 28930 }, { "epoch": 5.310148651128648, "grad_norm": 0.15426324307918549, "learning_rate": 9.1886102002267e-06, "loss": 0.1179, "num_input_tokens_seen": 62434960, "step": 28935 }, { "epoch": 5.311066250688199, "grad_norm": 0.13947099447250366, "learning_rate": 9.188172855803966e-06, "loss": 0.0077, "num_input_tokens_seen": 62445392, "step": 28940 }, { "epoch": 5.311983850247752, "grad_norm": 20.96347427368164, "learning_rate": 9.187735403961091e-06, "loss": 0.1244, "num_input_tokens_seen": 62456432, "step": 28945 }, { "epoch": 5.312901449807304, "grad_norm": 0.11157233268022537, "learning_rate": 9.187297844709293e-06, "loss": 0.2543, "num_input_tokens_seen": 62467344, "step": 28950 }, { "epoch": 5.313819049366856, "grad_norm": 0.14121674001216888, "learning_rate": 9.186860178059794e-06, "loss": 0.1456, "num_input_tokens_seen": 62478160, "step": 28955 }, { "epoch": 5.3147366489264085, "grad_norm": 43.734439849853516, "learning_rate": 9.186422404023822e-06, "loss": 0.236, "num_input_tokens_seen": 62488528, "step": 28960 }, { "epoch": 5.315654248485961, "grad_norm": 0.13442571461200714, "learning_rate": 9.185984522612602e-06, "loss": 0.2693, "num_input_tokens_seen": 62498384, "step": 28965 }, { "epoch": 5.316571848045513, "grad_norm": 0.033660780638456345, "learning_rate": 9.185546533837368e-06, "loss": 0.208, "num_input_tokens_seen": 62508400, "step": 28970 }, { "epoch": 5.317489447605065, "grad_norm": 4.163565158843994, "learning_rate": 9.185108437709354e-06, "loss": 0.0949, "num_input_tokens_seen": 62520336, "step": 28975 }, { "epoch": 5.318407047164618, "grad_norm": 0.1376950740814209, "learning_rate": 9.184670234239792e-06, "loss": 0.1306, "num_input_tokens_seen": 62531376, "step": 28980 }, { "epoch": 5.319324646724169, "grad_norm": 7.373912811279297, "learning_rate": 9.184231923439924e-06, "loss": 0.2013, "num_input_tokens_seen": 62542256, "step": 28985 }, { "epoch": 5.320242246283722, "grad_norm": 0.140781432390213, "learning_rate": 9.183793505320996e-06, "loss": 0.1083, "num_input_tokens_seen": 62553776, "step": 28990 }, { "epoch": 5.321159845843274, "grad_norm": 0.2326153814792633, "learning_rate": 9.183354979894244e-06, "loss": 0.1911, "num_input_tokens_seen": 62564464, "step": 28995 }, { "epoch": 5.322077445402826, "grad_norm": 0.1704491376876831, "learning_rate": 9.182916347170923e-06, "loss": 0.1646, "num_input_tokens_seen": 62574768, "step": 29000 }, { "epoch": 5.322995044962378, "grad_norm": 0.18645665049552917, "learning_rate": 9.182477607162281e-06, "loss": 0.1611, "num_input_tokens_seen": 62584720, "step": 29005 }, { "epoch": 5.323912644521931, "grad_norm": 0.12979991734027863, "learning_rate": 9.18203875987957e-06, "loss": 0.0824, "num_input_tokens_seen": 62595760, "step": 29010 }, { "epoch": 5.3248302440814825, "grad_norm": 9.46983814239502, "learning_rate": 9.181599805334045e-06, "loss": 0.1956, "num_input_tokens_seen": 62606480, "step": 29015 }, { "epoch": 5.325747843641035, "grad_norm": 0.06786897778511047, "learning_rate": 9.181160743536968e-06, "loss": 0.0365, "num_input_tokens_seen": 62618384, "step": 29020 }, { "epoch": 5.3266654432005875, "grad_norm": 3.3958261013031006, "learning_rate": 9.180721574499598e-06, "loss": 0.0055, "num_input_tokens_seen": 62629968, "step": 29025 }, { "epoch": 5.327583042760139, "grad_norm": 0.11720237135887146, "learning_rate": 9.180282298233197e-06, "loss": 0.0527, "num_input_tokens_seen": 62640464, "step": 29030 }, { "epoch": 5.328500642319692, "grad_norm": 0.21375444531440735, "learning_rate": 9.179842914749035e-06, "loss": 0.0158, "num_input_tokens_seen": 62652080, "step": 29035 }, { "epoch": 5.329418241879244, "grad_norm": 0.04061446711421013, "learning_rate": 9.17940342405838e-06, "loss": 0.1012, "num_input_tokens_seen": 62662160, "step": 29040 }, { "epoch": 5.330335841438796, "grad_norm": 0.15979474782943726, "learning_rate": 9.178963826172506e-06, "loss": 0.1396, "num_input_tokens_seen": 62674128, "step": 29045 }, { "epoch": 5.331253440998348, "grad_norm": 7.404269218444824, "learning_rate": 9.178524121102687e-06, "loss": 0.104, "num_input_tokens_seen": 62684848, "step": 29050 }, { "epoch": 5.332171040557901, "grad_norm": 2.919203519821167, "learning_rate": 9.178084308860199e-06, "loss": 0.0698, "num_input_tokens_seen": 62695632, "step": 29055 }, { "epoch": 5.333088640117452, "grad_norm": 10.384854316711426, "learning_rate": 9.177644389456324e-06, "loss": 0.1174, "num_input_tokens_seen": 62707760, "step": 29060 }, { "epoch": 5.334006239677005, "grad_norm": 0.16420356929302216, "learning_rate": 9.177204362902345e-06, "loss": 0.2442, "num_input_tokens_seen": 62719184, "step": 29065 }, { "epoch": 5.334923839236557, "grad_norm": 0.0611240528523922, "learning_rate": 9.17676422920955e-06, "loss": 0.2998, "num_input_tokens_seen": 62732016, "step": 29070 }, { "epoch": 5.335841438796109, "grad_norm": 0.1606963723897934, "learning_rate": 9.176323988389224e-06, "loss": 0.0466, "num_input_tokens_seen": 62741136, "step": 29075 }, { "epoch": 5.336759038355662, "grad_norm": 0.53505939245224, "learning_rate": 9.17588364045266e-06, "loss": 0.1284, "num_input_tokens_seen": 62753616, "step": 29080 }, { "epoch": 5.337676637915214, "grad_norm": 39.547855377197266, "learning_rate": 9.175443185411155e-06, "loss": 0.0887, "num_input_tokens_seen": 62764208, "step": 29085 }, { "epoch": 5.338594237474766, "grad_norm": 0.1040208488702774, "learning_rate": 9.175002623276e-06, "loss": 0.1844, "num_input_tokens_seen": 62775504, "step": 29090 }, { "epoch": 5.339511837034318, "grad_norm": 0.23597177863121033, "learning_rate": 9.174561954058503e-06, "loss": 0.3694, "num_input_tokens_seen": 62786960, "step": 29095 }, { "epoch": 5.340429436593871, "grad_norm": 0.14153613150119781, "learning_rate": 9.174121177769959e-06, "loss": 0.0205, "num_input_tokens_seen": 62798032, "step": 29100 }, { "epoch": 5.341347036153422, "grad_norm": 0.12290380150079727, "learning_rate": 9.173680294421678e-06, "loss": 0.0653, "num_input_tokens_seen": 62808240, "step": 29105 }, { "epoch": 5.342264635712975, "grad_norm": 0.7614112496376038, "learning_rate": 9.173239304024964e-06, "loss": 0.2475, "num_input_tokens_seen": 62820240, "step": 29110 }, { "epoch": 5.343182235272527, "grad_norm": 3.70967960357666, "learning_rate": 9.17279820659113e-06, "loss": 0.213, "num_input_tokens_seen": 62829744, "step": 29115 }, { "epoch": 5.344099834832079, "grad_norm": 0.5057421922683716, "learning_rate": 9.17235700213149e-06, "loss": 0.1053, "num_input_tokens_seen": 62840816, "step": 29120 }, { "epoch": 5.3450174343916315, "grad_norm": 34.55794906616211, "learning_rate": 9.171915690657359e-06, "loss": 0.0204, "num_input_tokens_seen": 62852272, "step": 29125 }, { "epoch": 5.345935033951184, "grad_norm": 0.07195896655321121, "learning_rate": 9.171474272180057e-06, "loss": 0.066, "num_input_tokens_seen": 62863152, "step": 29130 }, { "epoch": 5.346852633510736, "grad_norm": 0.1825709193944931, "learning_rate": 9.171032746710905e-06, "loss": 0.0024, "num_input_tokens_seen": 62873200, "step": 29135 }, { "epoch": 5.347770233070288, "grad_norm": 0.6078130602836609, "learning_rate": 9.170591114261226e-06, "loss": 0.1372, "num_input_tokens_seen": 62883344, "step": 29140 }, { "epoch": 5.348687832629841, "grad_norm": 35.95907974243164, "learning_rate": 9.170149374842352e-06, "loss": 0.5575, "num_input_tokens_seen": 62894704, "step": 29145 }, { "epoch": 5.349605432189392, "grad_norm": 0.8735979795455933, "learning_rate": 9.169707528465606e-06, "loss": 0.1422, "num_input_tokens_seen": 62905840, "step": 29150 }, { "epoch": 5.350523031748945, "grad_norm": 0.7624432444572449, "learning_rate": 9.169265575142328e-06, "loss": 0.0031, "num_input_tokens_seen": 62916432, "step": 29155 }, { "epoch": 5.351440631308497, "grad_norm": 10.114401817321777, "learning_rate": 9.168823514883846e-06, "loss": 0.1934, "num_input_tokens_seen": 62927536, "step": 29160 }, { "epoch": 5.352358230868049, "grad_norm": 0.31293192505836487, "learning_rate": 9.168381347701505e-06, "loss": 0.1096, "num_input_tokens_seen": 62937424, "step": 29165 }, { "epoch": 5.353275830427601, "grad_norm": 0.20890000462532043, "learning_rate": 9.167939073606642e-06, "loss": 0.2858, "num_input_tokens_seen": 62947952, "step": 29170 }, { "epoch": 5.354193429987154, "grad_norm": 10.560715675354004, "learning_rate": 9.1674966926106e-06, "loss": 0.0802, "num_input_tokens_seen": 62958704, "step": 29175 }, { "epoch": 5.3551110295467055, "grad_norm": 0.5410689115524292, "learning_rate": 9.16705420472473e-06, "loss": 0.0056, "num_input_tokens_seen": 62970288, "step": 29180 }, { "epoch": 5.356028629106258, "grad_norm": 15.05921459197998, "learning_rate": 9.166611609960375e-06, "loss": 0.2691, "num_input_tokens_seen": 62982032, "step": 29185 }, { "epoch": 5.3569462286658105, "grad_norm": 1.928705096244812, "learning_rate": 9.166168908328891e-06, "loss": 0.0741, "num_input_tokens_seen": 62993968, "step": 29190 }, { "epoch": 5.357863828225362, "grad_norm": 1.291312336921692, "learning_rate": 9.16572609984163e-06, "loss": 0.1555, "num_input_tokens_seen": 63004080, "step": 29195 }, { "epoch": 5.358781427784915, "grad_norm": 6.045060634613037, "learning_rate": 9.165283184509953e-06, "loss": 0.0833, "num_input_tokens_seen": 63014480, "step": 29200 }, { "epoch": 5.359699027344467, "grad_norm": 0.10798867791891098, "learning_rate": 9.164840162345216e-06, "loss": 0.2182, "num_input_tokens_seen": 63025040, "step": 29205 }, { "epoch": 5.360616626904019, "grad_norm": 0.16799360513687134, "learning_rate": 9.164397033358787e-06, "loss": 0.1144, "num_input_tokens_seen": 63035344, "step": 29210 }, { "epoch": 5.361534226463571, "grad_norm": 5.8855366706848145, "learning_rate": 9.163953797562026e-06, "loss": 0.1347, "num_input_tokens_seen": 63045552, "step": 29215 }, { "epoch": 5.362451826023124, "grad_norm": 12.90312385559082, "learning_rate": 9.163510454966304e-06, "loss": 0.1642, "num_input_tokens_seen": 63056752, "step": 29220 }, { "epoch": 5.363369425582675, "grad_norm": 0.38207751512527466, "learning_rate": 9.16306700558299e-06, "loss": 0.2399, "num_input_tokens_seen": 63068976, "step": 29225 }, { "epoch": 5.364287025142228, "grad_norm": 0.22113148868083954, "learning_rate": 9.162623449423463e-06, "loss": 0.0087, "num_input_tokens_seen": 63079632, "step": 29230 }, { "epoch": 5.36520462470178, "grad_norm": 29.087867736816406, "learning_rate": 9.162179786499093e-06, "loss": 0.164, "num_input_tokens_seen": 63088880, "step": 29235 }, { "epoch": 5.366122224261332, "grad_norm": 1.2697068452835083, "learning_rate": 9.161736016821264e-06, "loss": 0.0904, "num_input_tokens_seen": 63099056, "step": 29240 }, { "epoch": 5.3670398238208845, "grad_norm": 13.955648422241211, "learning_rate": 9.161292140401354e-06, "loss": 0.1371, "num_input_tokens_seen": 63109840, "step": 29245 }, { "epoch": 5.367957423380437, "grad_norm": 0.7523242831230164, "learning_rate": 9.160848157250752e-06, "loss": 0.0039, "num_input_tokens_seen": 63120368, "step": 29250 }, { "epoch": 5.368875022939989, "grad_norm": 0.04138696938753128, "learning_rate": 9.160404067380843e-06, "loss": 0.323, "num_input_tokens_seen": 63131600, "step": 29255 }, { "epoch": 5.369792622499541, "grad_norm": 0.49961337447166443, "learning_rate": 9.159959870803018e-06, "loss": 0.0855, "num_input_tokens_seen": 63142064, "step": 29260 }, { "epoch": 5.370710222059094, "grad_norm": 0.36388325691223145, "learning_rate": 9.15951556752867e-06, "loss": 0.1664, "num_input_tokens_seen": 63153136, "step": 29265 }, { "epoch": 5.371627821618645, "grad_norm": 5.431521892547607, "learning_rate": 9.159071157569193e-06, "loss": 0.2997, "num_input_tokens_seen": 63163696, "step": 29270 }, { "epoch": 5.372545421178198, "grad_norm": 0.22536161541938782, "learning_rate": 9.158626640935987e-06, "loss": 0.0396, "num_input_tokens_seen": 63176144, "step": 29275 }, { "epoch": 5.37346302073775, "grad_norm": 27.491151809692383, "learning_rate": 9.158182017640453e-06, "loss": 0.198, "num_input_tokens_seen": 63186320, "step": 29280 }, { "epoch": 5.374380620297302, "grad_norm": 0.29528144001960754, "learning_rate": 9.157737287693997e-06, "loss": 0.1895, "num_input_tokens_seen": 63197712, "step": 29285 }, { "epoch": 5.375298219856854, "grad_norm": 0.17058397829532623, "learning_rate": 9.157292451108022e-06, "loss": 0.0519, "num_input_tokens_seen": 63209392, "step": 29290 }, { "epoch": 5.376215819416407, "grad_norm": 0.04724545776844025, "learning_rate": 9.156847507893937e-06, "loss": 0.204, "num_input_tokens_seen": 63220176, "step": 29295 }, { "epoch": 5.3771334189759585, "grad_norm": 11.564254760742188, "learning_rate": 9.156402458063158e-06, "loss": 0.1802, "num_input_tokens_seen": 63230832, "step": 29300 }, { "epoch": 5.378051018535511, "grad_norm": 0.5515941977500916, "learning_rate": 9.155957301627098e-06, "loss": 0.071, "num_input_tokens_seen": 63240752, "step": 29305 }, { "epoch": 5.3789686180950635, "grad_norm": 8.065489768981934, "learning_rate": 9.155512038597174e-06, "loss": 0.1005, "num_input_tokens_seen": 63250992, "step": 29310 }, { "epoch": 5.379886217654615, "grad_norm": 37.53623580932617, "learning_rate": 9.155066668984806e-06, "loss": 0.0823, "num_input_tokens_seen": 63262224, "step": 29315 }, { "epoch": 5.380803817214168, "grad_norm": 15.132834434509277, "learning_rate": 9.154621192801419e-06, "loss": 0.2994, "num_input_tokens_seen": 63274160, "step": 29320 }, { "epoch": 5.38172141677372, "grad_norm": 18.289745330810547, "learning_rate": 9.154175610058437e-06, "loss": 0.1175, "num_input_tokens_seen": 63285104, "step": 29325 }, { "epoch": 5.382639016333272, "grad_norm": 4.9769206047058105, "learning_rate": 9.153729920767288e-06, "loss": 0.2224, "num_input_tokens_seen": 63295888, "step": 29330 }, { "epoch": 5.383556615892824, "grad_norm": 0.06488224118947983, "learning_rate": 9.153284124939405e-06, "loss": 0.1264, "num_input_tokens_seen": 63306768, "step": 29335 }, { "epoch": 5.384474215452377, "grad_norm": 64.70860290527344, "learning_rate": 9.15283822258622e-06, "loss": 0.1682, "num_input_tokens_seen": 63316528, "step": 29340 }, { "epoch": 5.385391815011928, "grad_norm": 25.19695281982422, "learning_rate": 9.152392213719173e-06, "loss": 0.2155, "num_input_tokens_seen": 63327472, "step": 29345 }, { "epoch": 5.386309414571481, "grad_norm": 24.382831573486328, "learning_rate": 9.1519460983497e-06, "loss": 0.1103, "num_input_tokens_seen": 63337200, "step": 29350 }, { "epoch": 5.387227014131033, "grad_norm": 0.0885150209069252, "learning_rate": 9.151499876489244e-06, "loss": 0.0049, "num_input_tokens_seen": 63349456, "step": 29355 }, { "epoch": 5.388144613690585, "grad_norm": 0.03348487243056297, "learning_rate": 9.151053548149253e-06, "loss": 0.1643, "num_input_tokens_seen": 63361040, "step": 29360 }, { "epoch": 5.389062213250138, "grad_norm": 11.443076133728027, "learning_rate": 9.15060711334117e-06, "loss": 0.0917, "num_input_tokens_seen": 63371376, "step": 29365 }, { "epoch": 5.38997981280969, "grad_norm": 97.56034088134766, "learning_rate": 9.150160572076447e-06, "loss": 0.0874, "num_input_tokens_seen": 63381104, "step": 29370 }, { "epoch": 5.390897412369242, "grad_norm": 3.1648688316345215, "learning_rate": 9.149713924366539e-06, "loss": 0.0138, "num_input_tokens_seen": 63391376, "step": 29375 }, { "epoch": 5.391815011928794, "grad_norm": 17.664762496948242, "learning_rate": 9.1492671702229e-06, "loss": 0.2095, "num_input_tokens_seen": 63402896, "step": 29380 }, { "epoch": 5.392732611488347, "grad_norm": 0.28528720140457153, "learning_rate": 9.148820309656988e-06, "loss": 0.0037, "num_input_tokens_seen": 63412688, "step": 29385 }, { "epoch": 5.393650211047898, "grad_norm": 1.0943759679794312, "learning_rate": 9.148373342680265e-06, "loss": 0.1101, "num_input_tokens_seen": 63423760, "step": 29390 }, { "epoch": 5.394567810607451, "grad_norm": 0.3990880846977234, "learning_rate": 9.147926269304195e-06, "loss": 0.1174, "num_input_tokens_seen": 63434576, "step": 29395 }, { "epoch": 5.395485410167003, "grad_norm": 0.6056625843048096, "learning_rate": 9.147479089540247e-06, "loss": 0.0168, "num_input_tokens_seen": 63446384, "step": 29400 }, { "epoch": 5.396403009726555, "grad_norm": 0.022248461842536926, "learning_rate": 9.147031803399887e-06, "loss": 0.0302, "num_input_tokens_seen": 63457680, "step": 29405 }, { "epoch": 5.3973206092861075, "grad_norm": 0.0459069199860096, "learning_rate": 9.146584410894588e-06, "loss": 0.0619, "num_input_tokens_seen": 63468880, "step": 29410 }, { "epoch": 5.39823820884566, "grad_norm": 3.751187801361084, "learning_rate": 9.146136912035825e-06, "loss": 0.1106, "num_input_tokens_seen": 63481904, "step": 29415 }, { "epoch": 5.399155808405212, "grad_norm": 0.10261189192533493, "learning_rate": 9.145689306835077e-06, "loss": 0.1856, "num_input_tokens_seen": 63493392, "step": 29420 }, { "epoch": 5.400073407964764, "grad_norm": 0.10081475228071213, "learning_rate": 9.145241595303824e-06, "loss": 0.1626, "num_input_tokens_seen": 63505104, "step": 29425 }, { "epoch": 5.400991007524317, "grad_norm": 0.04263227432966232, "learning_rate": 9.14479377745355e-06, "loss": 0.0025, "num_input_tokens_seen": 63515184, "step": 29430 }, { "epoch": 5.401908607083868, "grad_norm": 14.131169319152832, "learning_rate": 9.144345853295736e-06, "loss": 0.5458, "num_input_tokens_seen": 63527088, "step": 29435 }, { "epoch": 5.402826206643421, "grad_norm": 0.025455337017774582, "learning_rate": 9.143897822841877e-06, "loss": 0.002, "num_input_tokens_seen": 63537744, "step": 29440 }, { "epoch": 5.403743806202973, "grad_norm": 0.02408747933804989, "learning_rate": 9.143449686103459e-06, "loss": 0.0121, "num_input_tokens_seen": 63548400, "step": 29445 }, { "epoch": 5.404661405762525, "grad_norm": 0.16360418498516083, "learning_rate": 9.143001443091979e-06, "loss": 0.2263, "num_input_tokens_seen": 63559248, "step": 29450 }, { "epoch": 5.405579005322077, "grad_norm": 33.55685043334961, "learning_rate": 9.142553093818934e-06, "loss": 0.0974, "num_input_tokens_seen": 63569968, "step": 29455 }, { "epoch": 5.40649660488163, "grad_norm": 0.02625812031328678, "learning_rate": 9.14210463829582e-06, "loss": 0.011, "num_input_tokens_seen": 63580432, "step": 29460 }, { "epoch": 5.4074142044411815, "grad_norm": 0.5181043744087219, "learning_rate": 9.141656076534144e-06, "loss": 0.1933, "num_input_tokens_seen": 63592304, "step": 29465 }, { "epoch": 5.408331804000734, "grad_norm": 5.453309535980225, "learning_rate": 9.14120740854541e-06, "loss": 0.4925, "num_input_tokens_seen": 63603344, "step": 29470 }, { "epoch": 5.4092494035602865, "grad_norm": 36.43525695800781, "learning_rate": 9.140758634341123e-06, "loss": 0.018, "num_input_tokens_seen": 63614096, "step": 29475 }, { "epoch": 5.410167003119838, "grad_norm": 1.3889483213424683, "learning_rate": 9.140309753932794e-06, "loss": 0.0074, "num_input_tokens_seen": 63624144, "step": 29480 }, { "epoch": 5.411084602679391, "grad_norm": 0.6221736669540405, "learning_rate": 9.139860767331937e-06, "loss": 0.1328, "num_input_tokens_seen": 63635312, "step": 29485 }, { "epoch": 5.412002202238943, "grad_norm": 36.292354583740234, "learning_rate": 9.139411674550067e-06, "loss": 0.3848, "num_input_tokens_seen": 63647120, "step": 29490 }, { "epoch": 5.412919801798495, "grad_norm": 0.19228951632976532, "learning_rate": 9.138962475598703e-06, "loss": 0.0614, "num_input_tokens_seen": 63657680, "step": 29495 }, { "epoch": 5.413837401358047, "grad_norm": 24.4726505279541, "learning_rate": 9.138513170489366e-06, "loss": 0.1425, "num_input_tokens_seen": 63668784, "step": 29500 }, { "epoch": 5.4147550009176, "grad_norm": 20.19101905822754, "learning_rate": 9.138063759233581e-06, "loss": 0.1004, "num_input_tokens_seen": 63680432, "step": 29505 }, { "epoch": 5.415672600477151, "grad_norm": 0.2559973895549774, "learning_rate": 9.137614241842876e-06, "loss": 0.1404, "num_input_tokens_seen": 63691248, "step": 29510 }, { "epoch": 5.416590200036704, "grad_norm": 0.028464429080486298, "learning_rate": 9.137164618328776e-06, "loss": 0.0032, "num_input_tokens_seen": 63702704, "step": 29515 }, { "epoch": 5.417507799596256, "grad_norm": 12.081185340881348, "learning_rate": 9.136714888702816e-06, "loss": 0.3138, "num_input_tokens_seen": 63713168, "step": 29520 }, { "epoch": 5.418425399155808, "grad_norm": 0.06046811491250992, "learning_rate": 9.136265052976529e-06, "loss": 0.2124, "num_input_tokens_seen": 63724944, "step": 29525 }, { "epoch": 5.4193429987153605, "grad_norm": 0.2658141255378723, "learning_rate": 9.135815111161456e-06, "loss": 0.0922, "num_input_tokens_seen": 63734768, "step": 29530 }, { "epoch": 5.420260598274913, "grad_norm": 0.057273417711257935, "learning_rate": 9.135365063269134e-06, "loss": 0.0035, "num_input_tokens_seen": 63746000, "step": 29535 }, { "epoch": 5.421178197834465, "grad_norm": 0.025556491687893867, "learning_rate": 9.134914909311109e-06, "loss": 0.005, "num_input_tokens_seen": 63755408, "step": 29540 }, { "epoch": 5.422095797394017, "grad_norm": 0.10594923794269562, "learning_rate": 9.134464649298923e-06, "loss": 0.2, "num_input_tokens_seen": 63767408, "step": 29545 }, { "epoch": 5.42301339695357, "grad_norm": 0.016594817861914635, "learning_rate": 9.134014283244129e-06, "loss": 0.2562, "num_input_tokens_seen": 63778864, "step": 29550 }, { "epoch": 5.423930996513121, "grad_norm": 6.792561054229736, "learning_rate": 9.133563811158275e-06, "loss": 0.0116, "num_input_tokens_seen": 63790416, "step": 29555 }, { "epoch": 5.424848596072674, "grad_norm": 0.1793321818113327, "learning_rate": 9.133113233052914e-06, "loss": 0.123, "num_input_tokens_seen": 63800720, "step": 29560 }, { "epoch": 5.425766195632226, "grad_norm": 0.3457157015800476, "learning_rate": 9.132662548939606e-06, "loss": 0.004, "num_input_tokens_seen": 63811824, "step": 29565 }, { "epoch": 5.426683795191778, "grad_norm": 8.015119552612305, "learning_rate": 9.13221175882991e-06, "loss": 0.3679, "num_input_tokens_seen": 63821872, "step": 29570 }, { "epoch": 5.42760139475133, "grad_norm": 1.8623058795928955, "learning_rate": 9.131760862735383e-06, "loss": 0.3003, "num_input_tokens_seen": 63832912, "step": 29575 }, { "epoch": 5.428518994310883, "grad_norm": 0.06871352344751358, "learning_rate": 9.131309860667596e-06, "loss": 0.0506, "num_input_tokens_seen": 63843888, "step": 29580 }, { "epoch": 5.4294365938704345, "grad_norm": 0.5699175000190735, "learning_rate": 9.130858752638114e-06, "loss": 0.079, "num_input_tokens_seen": 63854320, "step": 29585 }, { "epoch": 5.430354193429987, "grad_norm": 21.758995056152344, "learning_rate": 9.130407538658506e-06, "loss": 0.6354, "num_input_tokens_seen": 63865264, "step": 29590 }, { "epoch": 5.4312717929895395, "grad_norm": 119.28683471679688, "learning_rate": 9.129956218740348e-06, "loss": 0.2029, "num_input_tokens_seen": 63875984, "step": 29595 }, { "epoch": 5.432189392549091, "grad_norm": 0.22068321704864502, "learning_rate": 9.129504792895211e-06, "loss": 0.0242, "num_input_tokens_seen": 63886640, "step": 29600 }, { "epoch": 5.433106992108644, "grad_norm": 0.09035173803567886, "learning_rate": 9.12905326113468e-06, "loss": 0.2132, "num_input_tokens_seen": 63897328, "step": 29605 }, { "epoch": 5.434024591668196, "grad_norm": 7.2476606369018555, "learning_rate": 9.12860162347033e-06, "loss": 0.2235, "num_input_tokens_seen": 63909264, "step": 29610 }, { "epoch": 5.434942191227748, "grad_norm": 8.369373321533203, "learning_rate": 9.128149879913749e-06, "loss": 0.3331, "num_input_tokens_seen": 63920048, "step": 29615 }, { "epoch": 5.4358597907873, "grad_norm": 1.208448886871338, "learning_rate": 9.127698030476518e-06, "loss": 0.106, "num_input_tokens_seen": 63930896, "step": 29620 }, { "epoch": 5.436777390346853, "grad_norm": 16.277542114257812, "learning_rate": 9.127246075170232e-06, "loss": 0.167, "num_input_tokens_seen": 63941456, "step": 29625 }, { "epoch": 5.437694989906404, "grad_norm": 0.9120864272117615, "learning_rate": 9.126794014006482e-06, "loss": 0.0067, "num_input_tokens_seen": 63952880, "step": 29630 }, { "epoch": 5.438612589465957, "grad_norm": 24.502405166625977, "learning_rate": 9.12634184699686e-06, "loss": 0.1892, "num_input_tokens_seen": 63964048, "step": 29635 }, { "epoch": 5.439530189025509, "grad_norm": 0.08094003796577454, "learning_rate": 9.125889574152964e-06, "loss": 0.1192, "num_input_tokens_seen": 63974576, "step": 29640 }, { "epoch": 5.440447788585061, "grad_norm": 0.04095366224646568, "learning_rate": 9.125437195486397e-06, "loss": 0.0177, "num_input_tokens_seen": 63985648, "step": 29645 }, { "epoch": 5.441365388144614, "grad_norm": 41.40134048461914, "learning_rate": 9.12498471100876e-06, "loss": 0.0182, "num_input_tokens_seen": 63996240, "step": 29650 }, { "epoch": 5.442282987704166, "grad_norm": 0.9513952136039734, "learning_rate": 9.124532120731656e-06, "loss": 0.0049, "num_input_tokens_seen": 64007664, "step": 29655 }, { "epoch": 5.443200587263718, "grad_norm": 34.17547607421875, "learning_rate": 9.124079424666696e-06, "loss": 0.0408, "num_input_tokens_seen": 64018096, "step": 29660 }, { "epoch": 5.44411818682327, "grad_norm": 0.1429167091846466, "learning_rate": 9.123626622825492e-06, "loss": 0.2914, "num_input_tokens_seen": 64027152, "step": 29665 }, { "epoch": 5.445035786382823, "grad_norm": 4.379701137542725, "learning_rate": 9.123173715219656e-06, "loss": 0.2216, "num_input_tokens_seen": 64037072, "step": 29670 }, { "epoch": 5.445953385942374, "grad_norm": 0.05735870078206062, "learning_rate": 9.122720701860804e-06, "loss": 0.2182, "num_input_tokens_seen": 64048560, "step": 29675 }, { "epoch": 5.446870985501927, "grad_norm": 0.09435179084539413, "learning_rate": 9.122267582760555e-06, "loss": 0.1059, "num_input_tokens_seen": 64058992, "step": 29680 }, { "epoch": 5.447788585061479, "grad_norm": 0.1279316544532776, "learning_rate": 9.121814357930533e-06, "loss": 0.1994, "num_input_tokens_seen": 64069168, "step": 29685 }, { "epoch": 5.448706184621031, "grad_norm": 0.4381127953529358, "learning_rate": 9.121361027382358e-06, "loss": 0.1156, "num_input_tokens_seen": 64080464, "step": 29690 }, { "epoch": 5.4496237841805835, "grad_norm": 0.19184570014476776, "learning_rate": 9.120907591127663e-06, "loss": 0.1269, "num_input_tokens_seen": 64092208, "step": 29695 }, { "epoch": 5.450541383740136, "grad_norm": 0.07841689884662628, "learning_rate": 9.120454049178075e-06, "loss": 0.1158, "num_input_tokens_seen": 64103472, "step": 29700 }, { "epoch": 5.451458983299688, "grad_norm": 15.49651050567627, "learning_rate": 9.120000401545226e-06, "loss": 0.4229, "num_input_tokens_seen": 64114576, "step": 29705 }, { "epoch": 5.45237658285924, "grad_norm": 4.559532642364502, "learning_rate": 9.11954664824075e-06, "loss": 0.0054, "num_input_tokens_seen": 64126096, "step": 29710 }, { "epoch": 5.453294182418793, "grad_norm": 0.1392979919910431, "learning_rate": 9.119092789276292e-06, "loss": 0.0023, "num_input_tokens_seen": 64136336, "step": 29715 }, { "epoch": 5.454211781978344, "grad_norm": 0.061008527874946594, "learning_rate": 9.118638824663483e-06, "loss": 0.0033, "num_input_tokens_seen": 64146928, "step": 29720 }, { "epoch": 5.455129381537897, "grad_norm": 0.11351518332958221, "learning_rate": 9.118184754413975e-06, "loss": 0.0028, "num_input_tokens_seen": 64157680, "step": 29725 }, { "epoch": 5.456046981097449, "grad_norm": 12.407520294189453, "learning_rate": 9.11773057853941e-06, "loss": 0.1832, "num_input_tokens_seen": 64167696, "step": 29730 }, { "epoch": 5.456964580657001, "grad_norm": 8.09295654296875, "learning_rate": 9.117276297051437e-06, "loss": 0.0106, "num_input_tokens_seen": 64179088, "step": 29735 }, { "epoch": 5.457882180216553, "grad_norm": 0.1816888302564621, "learning_rate": 9.116821909961708e-06, "loss": 0.3415, "num_input_tokens_seen": 64188432, "step": 29740 }, { "epoch": 5.458799779776106, "grad_norm": 0.17017623782157898, "learning_rate": 9.116367417281877e-06, "loss": 0.3069, "num_input_tokens_seen": 64199472, "step": 29745 }, { "epoch": 5.4597173793356575, "grad_norm": 1.2519588470458984, "learning_rate": 9.115912819023602e-06, "loss": 0.2365, "num_input_tokens_seen": 64210448, "step": 29750 }, { "epoch": 5.46063497889521, "grad_norm": 0.42591527104377747, "learning_rate": 9.115458115198544e-06, "loss": 0.0996, "num_input_tokens_seen": 64221200, "step": 29755 }, { "epoch": 5.4615525784547625, "grad_norm": 0.08846604079008102, "learning_rate": 9.115003305818362e-06, "loss": 0.0021, "num_input_tokens_seen": 64232112, "step": 29760 }, { "epoch": 5.462470178014314, "grad_norm": 0.026013903319835663, "learning_rate": 9.114548390894723e-06, "loss": 0.0722, "num_input_tokens_seen": 64242192, "step": 29765 }, { "epoch": 5.463387777573867, "grad_norm": 0.021570855751633644, "learning_rate": 9.114093370439294e-06, "loss": 0.0031, "num_input_tokens_seen": 64253456, "step": 29770 }, { "epoch": 5.464305377133419, "grad_norm": 2.092790365219116, "learning_rate": 9.113638244463749e-06, "loss": 0.1414, "num_input_tokens_seen": 64263600, "step": 29775 }, { "epoch": 5.465222976692971, "grad_norm": 0.01212972030043602, "learning_rate": 9.113183012979756e-06, "loss": 0.3103, "num_input_tokens_seen": 64274128, "step": 29780 }, { "epoch": 5.466140576252523, "grad_norm": 6.554244518280029, "learning_rate": 9.112727675998993e-06, "loss": 0.1603, "num_input_tokens_seen": 64284240, "step": 29785 }, { "epoch": 5.467058175812076, "grad_norm": 0.03036525286734104, "learning_rate": 9.11227223353314e-06, "loss": 0.0304, "num_input_tokens_seen": 64294064, "step": 29790 }, { "epoch": 5.467975775371627, "grad_norm": 0.7334669232368469, "learning_rate": 9.11181668559388e-06, "loss": 0.0033, "num_input_tokens_seen": 64305232, "step": 29795 }, { "epoch": 5.46889337493118, "grad_norm": 0.2573608160018921, "learning_rate": 9.111361032192894e-06, "loss": 0.0038, "num_input_tokens_seen": 64315984, "step": 29800 }, { "epoch": 5.469810974490732, "grad_norm": 25.171979904174805, "learning_rate": 9.110905273341869e-06, "loss": 0.3431, "num_input_tokens_seen": 64326224, "step": 29805 }, { "epoch": 5.470728574050284, "grad_norm": 0.10623924434185028, "learning_rate": 9.110449409052492e-06, "loss": 0.1823, "num_input_tokens_seen": 64336528, "step": 29810 }, { "epoch": 5.4716461736098365, "grad_norm": 0.04203924909234047, "learning_rate": 9.109993439336462e-06, "loss": 0.2623, "num_input_tokens_seen": 64347696, "step": 29815 }, { "epoch": 5.472563773169389, "grad_norm": 0.13616324961185455, "learning_rate": 9.10953736420547e-06, "loss": 0.0872, "num_input_tokens_seen": 64357904, "step": 29820 }, { "epoch": 5.473481372728941, "grad_norm": 0.6270551085472107, "learning_rate": 9.109081183671212e-06, "loss": 0.0679, "num_input_tokens_seen": 64369200, "step": 29825 }, { "epoch": 5.474398972288493, "grad_norm": 34.32249069213867, "learning_rate": 9.108624897745391e-06, "loss": 0.0106, "num_input_tokens_seen": 64380560, "step": 29830 }, { "epoch": 5.475316571848046, "grad_norm": 8.744987487792969, "learning_rate": 9.10816850643971e-06, "loss": 0.36, "num_input_tokens_seen": 64391536, "step": 29835 }, { "epoch": 5.476234171407597, "grad_norm": 0.030120013281702995, "learning_rate": 9.107712009765872e-06, "loss": 0.2579, "num_input_tokens_seen": 64401808, "step": 29840 }, { "epoch": 5.47715177096715, "grad_norm": 13.417220115661621, "learning_rate": 9.10725540773559e-06, "loss": 0.2804, "num_input_tokens_seen": 64411376, "step": 29845 }, { "epoch": 5.478069370526702, "grad_norm": 0.3229110836982727, "learning_rate": 9.106798700360571e-06, "loss": 0.0816, "num_input_tokens_seen": 64422608, "step": 29850 }, { "epoch": 5.478986970086254, "grad_norm": 0.046310558915138245, "learning_rate": 9.106341887652531e-06, "loss": 0.325, "num_input_tokens_seen": 64433840, "step": 29855 }, { "epoch": 5.479904569645806, "grad_norm": 0.09025723487138748, "learning_rate": 9.105884969623184e-06, "loss": 0.0637, "num_input_tokens_seen": 64444752, "step": 29860 }, { "epoch": 5.480822169205359, "grad_norm": 1.5786497592926025, "learning_rate": 9.105427946284251e-06, "loss": 0.1252, "num_input_tokens_seen": 64454768, "step": 29865 }, { "epoch": 5.4817397687649105, "grad_norm": 5.361443519592285, "learning_rate": 9.104970817647456e-06, "loss": 0.1498, "num_input_tokens_seen": 64466192, "step": 29870 }, { "epoch": 5.482657368324463, "grad_norm": 0.10826615989208221, "learning_rate": 9.104513583724522e-06, "loss": 0.1714, "num_input_tokens_seen": 64477904, "step": 29875 }, { "epoch": 5.4835749678840155, "grad_norm": 0.09446028620004654, "learning_rate": 9.104056244527173e-06, "loss": 0.0693, "num_input_tokens_seen": 64489200, "step": 29880 }, { "epoch": 5.484492567443567, "grad_norm": 0.13048362731933594, "learning_rate": 9.103598800067144e-06, "loss": 0.1534, "num_input_tokens_seen": 64501200, "step": 29885 }, { "epoch": 5.48541016700312, "grad_norm": 0.453415185213089, "learning_rate": 9.103141250356166e-06, "loss": 0.0117, "num_input_tokens_seen": 64511920, "step": 29890 }, { "epoch": 5.486327766562672, "grad_norm": 8.739190101623535, "learning_rate": 9.102683595405973e-06, "loss": 0.1876, "num_input_tokens_seen": 64522032, "step": 29895 }, { "epoch": 5.487245366122224, "grad_norm": 0.12722031772136688, "learning_rate": 9.102225835228306e-06, "loss": 0.0043, "num_input_tokens_seen": 64532688, "step": 29900 }, { "epoch": 5.488162965681776, "grad_norm": 0.24668405950069427, "learning_rate": 9.101767969834903e-06, "loss": 0.0077, "num_input_tokens_seen": 64544496, "step": 29905 }, { "epoch": 5.489080565241329, "grad_norm": 0.2164325714111328, "learning_rate": 9.101309999237509e-06, "loss": 0.0888, "num_input_tokens_seen": 64555312, "step": 29910 }, { "epoch": 5.48999816480088, "grad_norm": 0.09823514521121979, "learning_rate": 9.100851923447871e-06, "loss": 0.0027, "num_input_tokens_seen": 64564752, "step": 29915 }, { "epoch": 5.490915764360433, "grad_norm": 0.48543989658355713, "learning_rate": 9.100393742477736e-06, "loss": 0.241, "num_input_tokens_seen": 64574800, "step": 29920 }, { "epoch": 5.4918333639199854, "grad_norm": 8.687843322753906, "learning_rate": 9.099935456338856e-06, "loss": 0.1886, "num_input_tokens_seen": 64585008, "step": 29925 }, { "epoch": 5.492750963479537, "grad_norm": 0.046201497316360474, "learning_rate": 9.099477065042986e-06, "loss": 0.2458, "num_input_tokens_seen": 64595536, "step": 29930 }, { "epoch": 5.49366856303909, "grad_norm": 33.06683349609375, "learning_rate": 9.099018568601884e-06, "loss": 0.3509, "num_input_tokens_seen": 64606896, "step": 29935 }, { "epoch": 5.494586162598642, "grad_norm": 26.229679107666016, "learning_rate": 9.098559967027308e-06, "loss": 0.2779, "num_input_tokens_seen": 64617328, "step": 29940 }, { "epoch": 5.495503762158194, "grad_norm": 27.094505310058594, "learning_rate": 9.09810126033102e-06, "loss": 0.2239, "num_input_tokens_seen": 64628464, "step": 29945 }, { "epoch": 5.496421361717746, "grad_norm": 70.81146240234375, "learning_rate": 9.097642448524788e-06, "loss": 0.0436, "num_input_tokens_seen": 64638928, "step": 29950 }, { "epoch": 5.497338961277299, "grad_norm": 11.968539237976074, "learning_rate": 9.097183531620377e-06, "loss": 0.1436, "num_input_tokens_seen": 64649648, "step": 29955 }, { "epoch": 5.498256560836851, "grad_norm": 0.40070101618766785, "learning_rate": 9.09672450962956e-06, "loss": 0.0044, "num_input_tokens_seen": 64659216, "step": 29960 }, { "epoch": 5.499174160396403, "grad_norm": 4.99365758895874, "learning_rate": 9.096265382564105e-06, "loss": 0.2573, "num_input_tokens_seen": 64669648, "step": 29965 }, { "epoch": 5.500091759955955, "grad_norm": 0.15361660718917847, "learning_rate": 9.095806150435796e-06, "loss": 0.0869, "num_input_tokens_seen": 64679600, "step": 29970 }, { "epoch": 5.501009359515507, "grad_norm": 0.30648574233055115, "learning_rate": 9.095346813256404e-06, "loss": 0.2247, "num_input_tokens_seen": 64691376, "step": 29975 }, { "epoch": 5.5019269590750595, "grad_norm": 24.86008644104004, "learning_rate": 9.094887371037713e-06, "loss": 0.1214, "num_input_tokens_seen": 64702192, "step": 29980 }, { "epoch": 5.502844558634612, "grad_norm": 7.423140525817871, "learning_rate": 9.09442782379151e-06, "loss": 0.2909, "num_input_tokens_seen": 64712496, "step": 29985 }, { "epoch": 5.503762158194164, "grad_norm": 0.8715084195137024, "learning_rate": 9.093968171529578e-06, "loss": 0.0067, "num_input_tokens_seen": 64722544, "step": 29990 }, { "epoch": 5.504679757753716, "grad_norm": 0.04811154678463936, "learning_rate": 9.093508414263708e-06, "loss": 0.0057, "num_input_tokens_seen": 64733104, "step": 29995 }, { "epoch": 5.505597357313269, "grad_norm": 1.5736067295074463, "learning_rate": 9.09304855200569e-06, "loss": 0.0088, "num_input_tokens_seen": 64744112, "step": 30000 }, { "epoch": 5.50651495687282, "grad_norm": 0.02328583039343357, "learning_rate": 9.09258858476732e-06, "loss": 0.3845, "num_input_tokens_seen": 64754736, "step": 30005 }, { "epoch": 5.507432556432373, "grad_norm": 0.24097146093845367, "learning_rate": 9.0921285125604e-06, "loss": 0.2777, "num_input_tokens_seen": 64765072, "step": 30010 }, { "epoch": 5.508350155991925, "grad_norm": 6.4157185554504395, "learning_rate": 9.091668335396721e-06, "loss": 0.0081, "num_input_tokens_seen": 64775280, "step": 30015 }, { "epoch": 5.509267755551477, "grad_norm": 0.12925583124160767, "learning_rate": 9.091208053288093e-06, "loss": 0.0503, "num_input_tokens_seen": 64785872, "step": 30020 }, { "epoch": 5.510185355111029, "grad_norm": 0.07312936335802078, "learning_rate": 9.090747666246319e-06, "loss": 0.2264, "num_input_tokens_seen": 64796944, "step": 30025 }, { "epoch": 5.511102954670582, "grad_norm": 0.05911799520254135, "learning_rate": 9.090287174283208e-06, "loss": 0.0035, "num_input_tokens_seen": 64807984, "step": 30030 }, { "epoch": 5.5120205542301335, "grad_norm": 0.05077512934803963, "learning_rate": 9.08982657741057e-06, "loss": 0.1488, "num_input_tokens_seen": 64817584, "step": 30035 }, { "epoch": 5.512938153789686, "grad_norm": 5.287816047668457, "learning_rate": 9.089365875640217e-06, "loss": 0.2668, "num_input_tokens_seen": 64827664, "step": 30040 }, { "epoch": 5.5138557533492385, "grad_norm": 6.214962005615234, "learning_rate": 9.088905068983968e-06, "loss": 0.1933, "num_input_tokens_seen": 64838672, "step": 30045 }, { "epoch": 5.51477335290879, "grad_norm": 2.0521366596221924, "learning_rate": 9.088444157453643e-06, "loss": 0.1073, "num_input_tokens_seen": 64850576, "step": 30050 }, { "epoch": 5.515690952468343, "grad_norm": 0.0380445271730423, "learning_rate": 9.08798314106106e-06, "loss": 0.0155, "num_input_tokens_seen": 64861168, "step": 30055 }, { "epoch": 5.516608552027895, "grad_norm": 0.05409475788474083, "learning_rate": 9.087522019818048e-06, "loss": 0.0316, "num_input_tokens_seen": 64872784, "step": 30060 }, { "epoch": 5.517526151587447, "grad_norm": 8.167729377746582, "learning_rate": 9.087060793736428e-06, "loss": 0.1785, "num_input_tokens_seen": 64883632, "step": 30065 }, { "epoch": 5.518443751146999, "grad_norm": 4.038292407989502, "learning_rate": 9.086599462828034e-06, "loss": 0.1536, "num_input_tokens_seen": 64894064, "step": 30070 }, { "epoch": 5.519361350706552, "grad_norm": 0.13003486394882202, "learning_rate": 9.0861380271047e-06, "loss": 0.074, "num_input_tokens_seen": 64905872, "step": 30075 }, { "epoch": 5.520278950266103, "grad_norm": 0.08614163845777512, "learning_rate": 9.085676486578256e-06, "loss": 0.1006, "num_input_tokens_seen": 64916016, "step": 30080 }, { "epoch": 5.521196549825656, "grad_norm": 11.332477569580078, "learning_rate": 9.085214841260542e-06, "loss": 0.141, "num_input_tokens_seen": 64927312, "step": 30085 }, { "epoch": 5.522114149385208, "grad_norm": 0.12406893819570541, "learning_rate": 9.0847530911634e-06, "loss": 0.0513, "num_input_tokens_seen": 64936784, "step": 30090 }, { "epoch": 5.52303174894476, "grad_norm": 0.17068712413311005, "learning_rate": 9.084291236298671e-06, "loss": 0.1877, "num_input_tokens_seen": 64947376, "step": 30095 }, { "epoch": 5.5239493485043125, "grad_norm": 1.1356779336929321, "learning_rate": 9.083829276678202e-06, "loss": 0.0043, "num_input_tokens_seen": 64957968, "step": 30100 }, { "epoch": 5.524866948063865, "grad_norm": 23.060543060302734, "learning_rate": 9.083367212313843e-06, "loss": 0.168, "num_input_tokens_seen": 64969072, "step": 30105 }, { "epoch": 5.525784547623417, "grad_norm": 6.122862815856934, "learning_rate": 9.082905043217443e-06, "loss": 0.337, "num_input_tokens_seen": 64981136, "step": 30110 }, { "epoch": 5.526702147182969, "grad_norm": 0.0835273414850235, "learning_rate": 9.082442769400854e-06, "loss": 0.2283, "num_input_tokens_seen": 64992464, "step": 30115 }, { "epoch": 5.527619746742522, "grad_norm": 0.2168460190296173, "learning_rate": 9.081980390875938e-06, "loss": 0.125, "num_input_tokens_seen": 65003120, "step": 30120 }, { "epoch": 5.528537346302074, "grad_norm": 4.566126346588135, "learning_rate": 9.08151790765455e-06, "loss": 0.194, "num_input_tokens_seen": 65013456, "step": 30125 }, { "epoch": 5.529454945861626, "grad_norm": 0.3642832338809967, "learning_rate": 9.081055319748555e-06, "loss": 0.144, "num_input_tokens_seen": 65024688, "step": 30130 }, { "epoch": 5.530372545421178, "grad_norm": 0.10887758433818817, "learning_rate": 9.080592627169815e-06, "loss": 0.1473, "num_input_tokens_seen": 65034960, "step": 30135 }, { "epoch": 5.531290144980731, "grad_norm": 0.11270695179700851, "learning_rate": 9.080129829930199e-06, "loss": 0.1006, "num_input_tokens_seen": 65046704, "step": 30140 }, { "epoch": 5.532207744540282, "grad_norm": 32.76130294799805, "learning_rate": 9.079666928041577e-06, "loss": 0.1532, "num_input_tokens_seen": 65056688, "step": 30145 }, { "epoch": 5.533125344099835, "grad_norm": 98.28623962402344, "learning_rate": 9.07920392151582e-06, "loss": 0.1457, "num_input_tokens_seen": 65065840, "step": 30150 }, { "epoch": 5.534042943659387, "grad_norm": 0.5276376008987427, "learning_rate": 9.078740810364806e-06, "loss": 0.0072, "num_input_tokens_seen": 65076624, "step": 30155 }, { "epoch": 5.534960543218939, "grad_norm": 0.2571928799152374, "learning_rate": 9.07827759460041e-06, "loss": 0.1033, "num_input_tokens_seen": 65088816, "step": 30160 }, { "epoch": 5.5358781427784916, "grad_norm": 0.39157983660697937, "learning_rate": 9.077814274234516e-06, "loss": 0.0709, "num_input_tokens_seen": 65099248, "step": 30165 }, { "epoch": 5.536795742338044, "grad_norm": 0.21319912374019623, "learning_rate": 9.077350849279005e-06, "loss": 0.1765, "num_input_tokens_seen": 65109968, "step": 30170 }, { "epoch": 5.537713341897596, "grad_norm": 0.05191747844219208, "learning_rate": 9.076887319745763e-06, "loss": 0.0631, "num_input_tokens_seen": 65120976, "step": 30175 }, { "epoch": 5.538630941457148, "grad_norm": 0.3271487355232239, "learning_rate": 9.07642368564668e-06, "loss": 0.2389, "num_input_tokens_seen": 65131408, "step": 30180 }, { "epoch": 5.539548541016701, "grad_norm": 0.8074063062667847, "learning_rate": 9.075959946993649e-06, "loss": 0.1866, "num_input_tokens_seen": 65142448, "step": 30185 }, { "epoch": 5.540466140576252, "grad_norm": 97.6225357055664, "learning_rate": 9.075496103798562e-06, "loss": 0.0262, "num_input_tokens_seen": 65153008, "step": 30190 }, { "epoch": 5.541383740135805, "grad_norm": 9.27142333984375, "learning_rate": 9.075032156073316e-06, "loss": 0.2294, "num_input_tokens_seen": 65164272, "step": 30195 }, { "epoch": 5.542301339695357, "grad_norm": 0.16474595665931702, "learning_rate": 9.074568103829812e-06, "loss": 0.1083, "num_input_tokens_seen": 65174320, "step": 30200 }, { "epoch": 5.543218939254909, "grad_norm": 0.051553938537836075, "learning_rate": 9.07410394707995e-06, "loss": 0.0317, "num_input_tokens_seen": 65185744, "step": 30205 }, { "epoch": 5.5441365388144614, "grad_norm": 0.1942657083272934, "learning_rate": 9.073639685835636e-06, "loss": 0.0097, "num_input_tokens_seen": 65197008, "step": 30210 }, { "epoch": 5.545054138374014, "grad_norm": 0.01851368136703968, "learning_rate": 9.073175320108779e-06, "loss": 0.0049, "num_input_tokens_seen": 65208176, "step": 30215 }, { "epoch": 5.545971737933566, "grad_norm": 0.6350447535514832, "learning_rate": 9.072710849911287e-06, "loss": 0.4662, "num_input_tokens_seen": 65217904, "step": 30220 }, { "epoch": 5.546889337493118, "grad_norm": 0.04425080493092537, "learning_rate": 9.072246275255073e-06, "loss": 0.0016, "num_input_tokens_seen": 65228464, "step": 30225 }, { "epoch": 5.547806937052671, "grad_norm": 78.726806640625, "learning_rate": 9.071781596152054e-06, "loss": 0.2378, "num_input_tokens_seen": 65239952, "step": 30230 }, { "epoch": 5.548724536612222, "grad_norm": 1.5547078847885132, "learning_rate": 9.071316812614147e-06, "loss": 0.0335, "num_input_tokens_seen": 65250192, "step": 30235 }, { "epoch": 5.549642136171775, "grad_norm": 0.052545540034770966, "learning_rate": 9.070851924653275e-06, "loss": 0.1091, "num_input_tokens_seen": 65261712, "step": 30240 }, { "epoch": 5.550559735731327, "grad_norm": 0.15222999453544617, "learning_rate": 9.07038693228136e-06, "loss": 0.1364, "num_input_tokens_seen": 65272112, "step": 30245 }, { "epoch": 5.551477335290879, "grad_norm": 0.7071388363838196, "learning_rate": 9.06992183551033e-06, "loss": 0.0845, "num_input_tokens_seen": 65282320, "step": 30250 }, { "epoch": 5.552394934850431, "grad_norm": 0.04840220883488655, "learning_rate": 9.06945663435211e-06, "loss": 0.0608, "num_input_tokens_seen": 65292336, "step": 30255 }, { "epoch": 5.553312534409984, "grad_norm": 0.054461851716041565, "learning_rate": 9.068991328818637e-06, "loss": 0.0034, "num_input_tokens_seen": 65302832, "step": 30260 }, { "epoch": 5.5542301339695355, "grad_norm": 52.66769027709961, "learning_rate": 9.068525918921841e-06, "loss": 0.378, "num_input_tokens_seen": 65312976, "step": 30265 }, { "epoch": 5.555147733529088, "grad_norm": 43.99406814575195, "learning_rate": 9.068060404673663e-06, "loss": 0.2716, "num_input_tokens_seen": 65324144, "step": 30270 }, { "epoch": 5.5560653330886405, "grad_norm": 0.02130798064172268, "learning_rate": 9.067594786086038e-06, "loss": 0.004, "num_input_tokens_seen": 65336048, "step": 30275 }, { "epoch": 5.556982932648192, "grad_norm": 0.24439433217048645, "learning_rate": 9.067129063170912e-06, "loss": 0.163, "num_input_tokens_seen": 65346992, "step": 30280 }, { "epoch": 5.557900532207745, "grad_norm": 20.209280014038086, "learning_rate": 9.066663235940229e-06, "loss": 0.2033, "num_input_tokens_seen": 65358672, "step": 30285 }, { "epoch": 5.558818131767297, "grad_norm": 0.5857483148574829, "learning_rate": 9.066197304405936e-06, "loss": 0.1878, "num_input_tokens_seen": 65369232, "step": 30290 }, { "epoch": 5.559735731326849, "grad_norm": 0.04279262199997902, "learning_rate": 9.065731268579985e-06, "loss": 0.0021, "num_input_tokens_seen": 65380144, "step": 30295 }, { "epoch": 5.560653330886401, "grad_norm": 0.15403999388217926, "learning_rate": 9.065265128474327e-06, "loss": 0.0046, "num_input_tokens_seen": 65391376, "step": 30300 }, { "epoch": 5.561570930445954, "grad_norm": 0.04175664857029915, "learning_rate": 9.064798884100921e-06, "loss": 0.0009, "num_input_tokens_seen": 65403184, "step": 30305 }, { "epoch": 5.562488530005505, "grad_norm": 23.440895080566406, "learning_rate": 9.064332535471723e-06, "loss": 0.106, "num_input_tokens_seen": 65413360, "step": 30310 }, { "epoch": 5.563406129565058, "grad_norm": 0.1721128225326538, "learning_rate": 9.063866082598694e-06, "loss": 0.2278, "num_input_tokens_seen": 65424048, "step": 30315 }, { "epoch": 5.56432372912461, "grad_norm": 0.8467010259628296, "learning_rate": 9.063399525493798e-06, "loss": 0.0629, "num_input_tokens_seen": 65434352, "step": 30320 }, { "epoch": 5.565241328684162, "grad_norm": 0.5511311292648315, "learning_rate": 9.062932864169003e-06, "loss": 0.0626, "num_input_tokens_seen": 65445456, "step": 30325 }, { "epoch": 5.5661589282437145, "grad_norm": 35.3405647277832, "learning_rate": 9.062466098636277e-06, "loss": 0.032, "num_input_tokens_seen": 65456016, "step": 30330 }, { "epoch": 5.567076527803267, "grad_norm": 0.1129029244184494, "learning_rate": 9.061999228907592e-06, "loss": 0.0537, "num_input_tokens_seen": 65466736, "step": 30335 }, { "epoch": 5.567994127362819, "grad_norm": 0.06796469539403915, "learning_rate": 9.061532254994922e-06, "loss": 0.0025, "num_input_tokens_seen": 65476304, "step": 30340 }, { "epoch": 5.568911726922371, "grad_norm": 84.80520629882812, "learning_rate": 9.061065176910244e-06, "loss": 0.1204, "num_input_tokens_seen": 65487024, "step": 30345 }, { "epoch": 5.569829326481924, "grad_norm": 0.03533284738659859, "learning_rate": 9.06059799466554e-06, "loss": 0.003, "num_input_tokens_seen": 65498288, "step": 30350 }, { "epoch": 5.570746926041475, "grad_norm": 7.051278591156006, "learning_rate": 9.060130708272788e-06, "loss": 0.2963, "num_input_tokens_seen": 65509328, "step": 30355 }, { "epoch": 5.571664525601028, "grad_norm": 11.25738525390625, "learning_rate": 9.059663317743976e-06, "loss": 0.4371, "num_input_tokens_seen": 65520272, "step": 30360 }, { "epoch": 5.57258212516058, "grad_norm": 0.019085366278886795, "learning_rate": 9.059195823091094e-06, "loss": 0.0252, "num_input_tokens_seen": 65531568, "step": 30365 }, { "epoch": 5.573499724720132, "grad_norm": 2.3402750492095947, "learning_rate": 9.058728224326129e-06, "loss": 0.2476, "num_input_tokens_seen": 65542704, "step": 30370 }, { "epoch": 5.574417324279684, "grad_norm": 5.554444313049316, "learning_rate": 9.058260521461075e-06, "loss": 0.2741, "num_input_tokens_seen": 65554032, "step": 30375 }, { "epoch": 5.575334923839237, "grad_norm": 0.8316598534584045, "learning_rate": 9.05779271450793e-06, "loss": 0.1336, "num_input_tokens_seen": 65564656, "step": 30380 }, { "epoch": 5.5762525233987885, "grad_norm": 25.016382217407227, "learning_rate": 9.05732480347869e-06, "loss": 0.2192, "num_input_tokens_seen": 65574384, "step": 30385 }, { "epoch": 5.577170122958341, "grad_norm": 0.2692806124687195, "learning_rate": 9.056856788385358e-06, "loss": 0.2078, "num_input_tokens_seen": 65585680, "step": 30390 }, { "epoch": 5.5780877225178935, "grad_norm": 0.2856689691543579, "learning_rate": 9.056388669239934e-06, "loss": 0.057, "num_input_tokens_seen": 65597136, "step": 30395 }, { "epoch": 5.579005322077445, "grad_norm": 5.280974388122559, "learning_rate": 9.055920446054432e-06, "loss": 0.0406, "num_input_tokens_seen": 65607600, "step": 30400 }, { "epoch": 5.579922921636998, "grad_norm": 0.08633895963430405, "learning_rate": 9.055452118840852e-06, "loss": 0.0033, "num_input_tokens_seen": 65618992, "step": 30405 }, { "epoch": 5.58084052119655, "grad_norm": 0.04341244325041771, "learning_rate": 9.054983687611213e-06, "loss": 0.0031, "num_input_tokens_seen": 65630448, "step": 30410 }, { "epoch": 5.581758120756102, "grad_norm": 0.030551517382264137, "learning_rate": 9.054515152377528e-06, "loss": 0.1637, "num_input_tokens_seen": 65640144, "step": 30415 }, { "epoch": 5.582675720315654, "grad_norm": 0.01448536105453968, "learning_rate": 9.054046513151813e-06, "loss": 0.2375, "num_input_tokens_seen": 65651856, "step": 30420 }, { "epoch": 5.583593319875207, "grad_norm": 0.07451522350311279, "learning_rate": 9.053577769946088e-06, "loss": 0.0083, "num_input_tokens_seen": 65662160, "step": 30425 }, { "epoch": 5.584510919434758, "grad_norm": 21.652706146240234, "learning_rate": 9.053108922772374e-06, "loss": 0.0898, "num_input_tokens_seen": 65673712, "step": 30430 }, { "epoch": 5.585428518994311, "grad_norm": 30.531307220458984, "learning_rate": 9.052639971642699e-06, "loss": 0.301, "num_input_tokens_seen": 65684016, "step": 30435 }, { "epoch": 5.586346118553863, "grad_norm": 0.09267120808362961, "learning_rate": 9.052170916569088e-06, "loss": 0.0927, "num_input_tokens_seen": 65695088, "step": 30440 }, { "epoch": 5.587263718113415, "grad_norm": 6.615311145782471, "learning_rate": 9.051701757563575e-06, "loss": 0.2433, "num_input_tokens_seen": 65706224, "step": 30445 }, { "epoch": 5.5881813176729676, "grad_norm": 0.02477540634572506, "learning_rate": 9.051232494638191e-06, "loss": 0.1297, "num_input_tokens_seen": 65718064, "step": 30450 }, { "epoch": 5.58909891723252, "grad_norm": 24.37436866760254, "learning_rate": 9.050763127804973e-06, "loss": 0.0705, "num_input_tokens_seen": 65726512, "step": 30455 }, { "epoch": 5.590016516792072, "grad_norm": 0.020763924345374107, "learning_rate": 9.050293657075959e-06, "loss": 0.002, "num_input_tokens_seen": 65737584, "step": 30460 }, { "epoch": 5.590934116351624, "grad_norm": 9.876019477844238, "learning_rate": 9.04982408246319e-06, "loss": 0.4244, "num_input_tokens_seen": 65747888, "step": 30465 }, { "epoch": 5.591851715911177, "grad_norm": 0.31950995326042175, "learning_rate": 9.04935440397871e-06, "loss": 0.0693, "num_input_tokens_seen": 65758416, "step": 30470 }, { "epoch": 5.592769315470728, "grad_norm": 8.811104774475098, "learning_rate": 9.048884621634563e-06, "loss": 0.2808, "num_input_tokens_seen": 65768112, "step": 30475 }, { "epoch": 5.593686915030281, "grad_norm": 0.03396891802549362, "learning_rate": 9.048414735442804e-06, "loss": 0.1092, "num_input_tokens_seen": 65779760, "step": 30480 }, { "epoch": 5.594604514589833, "grad_norm": 0.7287931442260742, "learning_rate": 9.047944745415481e-06, "loss": 0.0043, "num_input_tokens_seen": 65789072, "step": 30485 }, { "epoch": 5.595522114149385, "grad_norm": 0.3468591272830963, "learning_rate": 9.047474651564646e-06, "loss": 0.0803, "num_input_tokens_seen": 65800816, "step": 30490 }, { "epoch": 5.5964397137089374, "grad_norm": 0.30473023653030396, "learning_rate": 9.047004453902364e-06, "loss": 0.2356, "num_input_tokens_seen": 65811728, "step": 30495 }, { "epoch": 5.59735731326849, "grad_norm": 0.11648283153772354, "learning_rate": 9.046534152440687e-06, "loss": 0.1194, "num_input_tokens_seen": 65822512, "step": 30500 }, { "epoch": 5.598274912828042, "grad_norm": 0.16096577048301697, "learning_rate": 9.04606374719168e-06, "loss": 0.2332, "num_input_tokens_seen": 65831440, "step": 30505 }, { "epoch": 5.599192512387594, "grad_norm": 1.1819368600845337, "learning_rate": 9.04559323816741e-06, "loss": 0.0643, "num_input_tokens_seen": 65842160, "step": 30510 }, { "epoch": 5.600110111947147, "grad_norm": 0.23073424398899078, "learning_rate": 9.045122625379944e-06, "loss": 0.1542, "num_input_tokens_seen": 65852528, "step": 30515 }, { "epoch": 5.601027711506698, "grad_norm": 0.249049112200737, "learning_rate": 9.04465190884135e-06, "loss": 0.0061, "num_input_tokens_seen": 65864016, "step": 30520 }, { "epoch": 5.601945311066251, "grad_norm": 0.07335641235113144, "learning_rate": 9.044181088563705e-06, "loss": 0.1153, "num_input_tokens_seen": 65874864, "step": 30525 }, { "epoch": 5.602862910625803, "grad_norm": 30.923179626464844, "learning_rate": 9.043710164559083e-06, "loss": 0.0559, "num_input_tokens_seen": 65886928, "step": 30530 }, { "epoch": 5.603780510185355, "grad_norm": 10.126801490783691, "learning_rate": 9.043239136839562e-06, "loss": 0.3123, "num_input_tokens_seen": 65898128, "step": 30535 }, { "epoch": 5.604698109744907, "grad_norm": 0.1898404061794281, "learning_rate": 9.042768005417225e-06, "loss": 0.011, "num_input_tokens_seen": 65909872, "step": 30540 }, { "epoch": 5.60561570930446, "grad_norm": 19.48233413696289, "learning_rate": 9.042296770304151e-06, "loss": 0.0945, "num_input_tokens_seen": 65920528, "step": 30545 }, { "epoch": 5.6065333088640115, "grad_norm": 0.09429707378149033, "learning_rate": 9.041825431512433e-06, "loss": 0.0035, "num_input_tokens_seen": 65930544, "step": 30550 }, { "epoch": 5.607450908423564, "grad_norm": 0.5116870403289795, "learning_rate": 9.041353989054156e-06, "loss": 0.0599, "num_input_tokens_seen": 65941392, "step": 30555 }, { "epoch": 5.6083685079831165, "grad_norm": 0.24976404011249542, "learning_rate": 9.040882442941412e-06, "loss": 0.4297, "num_input_tokens_seen": 65951824, "step": 30560 }, { "epoch": 5.609286107542668, "grad_norm": 0.1634344905614853, "learning_rate": 9.040410793186297e-06, "loss": 0.007, "num_input_tokens_seen": 65962736, "step": 30565 }, { "epoch": 5.610203707102221, "grad_norm": 0.13905420899391174, "learning_rate": 9.039939039800907e-06, "loss": 0.0248, "num_input_tokens_seen": 65974000, "step": 30570 }, { "epoch": 5.611121306661773, "grad_norm": 0.05864843353629112, "learning_rate": 9.039467182797342e-06, "loss": 0.0016, "num_input_tokens_seen": 65984528, "step": 30575 }, { "epoch": 5.612038906221325, "grad_norm": 0.036970075219869614, "learning_rate": 9.038995222187703e-06, "loss": 0.0703, "num_input_tokens_seen": 65994960, "step": 30580 }, { "epoch": 5.612956505780877, "grad_norm": 0.021255185827612877, "learning_rate": 9.038523157984099e-06, "loss": 0.0718, "num_input_tokens_seen": 66006704, "step": 30585 }, { "epoch": 5.61387410534043, "grad_norm": 0.011649318039417267, "learning_rate": 9.038050990198633e-06, "loss": 0.0532, "num_input_tokens_seen": 66019088, "step": 30590 }, { "epoch": 5.614791704899981, "grad_norm": 0.016475219279527664, "learning_rate": 9.037578718843418e-06, "loss": 0.1872, "num_input_tokens_seen": 66031216, "step": 30595 }, { "epoch": 5.615709304459534, "grad_norm": 39.68974685668945, "learning_rate": 9.037106343930566e-06, "loss": 0.0614, "num_input_tokens_seen": 66042512, "step": 30600 }, { "epoch": 5.616626904019086, "grad_norm": 0.027357103303074837, "learning_rate": 9.036633865472195e-06, "loss": 0.1347, "num_input_tokens_seen": 66053200, "step": 30605 }, { "epoch": 5.617544503578638, "grad_norm": 6.44368839263916, "learning_rate": 9.03616128348042e-06, "loss": 0.1004, "num_input_tokens_seen": 66064272, "step": 30610 }, { "epoch": 5.6184621031381905, "grad_norm": 0.09353803843259811, "learning_rate": 9.035688597967364e-06, "loss": 0.1949, "num_input_tokens_seen": 66075184, "step": 30615 }, { "epoch": 5.619379702697743, "grad_norm": 0.16259349882602692, "learning_rate": 9.03521580894515e-06, "loss": 0.1298, "num_input_tokens_seen": 66086352, "step": 30620 }, { "epoch": 5.620297302257295, "grad_norm": 0.025255851447582245, "learning_rate": 9.034742916425905e-06, "loss": 0.001, "num_input_tokens_seen": 66097200, "step": 30625 }, { "epoch": 5.621214901816847, "grad_norm": 0.18549585342407227, "learning_rate": 9.034269920421758e-06, "loss": 0.0949, "num_input_tokens_seen": 66108048, "step": 30630 }, { "epoch": 5.6221325013764, "grad_norm": 22.859546661376953, "learning_rate": 9.03379682094484e-06, "loss": 0.2185, "num_input_tokens_seen": 66119664, "step": 30635 }, { "epoch": 5.623050100935951, "grad_norm": 0.30993571877479553, "learning_rate": 9.033323618007283e-06, "loss": 0.4328, "num_input_tokens_seen": 66130672, "step": 30640 }, { "epoch": 5.623967700495504, "grad_norm": 0.3229001462459564, "learning_rate": 9.032850311621229e-06, "loss": 0.0454, "num_input_tokens_seen": 66141264, "step": 30645 }, { "epoch": 5.624885300055056, "grad_norm": 0.11507243663072586, "learning_rate": 9.032376901798814e-06, "loss": 0.2135, "num_input_tokens_seen": 66153168, "step": 30650 }, { "epoch": 5.625802899614608, "grad_norm": 0.9258742928504944, "learning_rate": 9.03190338855218e-06, "loss": 0.31, "num_input_tokens_seen": 66163792, "step": 30655 }, { "epoch": 5.62672049917416, "grad_norm": 0.16031932830810547, "learning_rate": 9.031429771893473e-06, "loss": 0.0856, "num_input_tokens_seen": 66174352, "step": 30660 }, { "epoch": 5.627638098733713, "grad_norm": 0.08062110841274261, "learning_rate": 9.030956051834842e-06, "loss": 0.3564, "num_input_tokens_seen": 66185712, "step": 30665 }, { "epoch": 5.6285556982932645, "grad_norm": 1.3042978048324585, "learning_rate": 9.030482228388436e-06, "loss": 0.2491, "num_input_tokens_seen": 66196176, "step": 30670 }, { "epoch": 5.629473297852817, "grad_norm": 0.12739165127277374, "learning_rate": 9.030008301566406e-06, "loss": 0.0054, "num_input_tokens_seen": 66207312, "step": 30675 }, { "epoch": 5.6303908974123695, "grad_norm": 0.09303514659404755, "learning_rate": 9.029534271380912e-06, "loss": 0.0052, "num_input_tokens_seen": 66218352, "step": 30680 }, { "epoch": 5.631308496971921, "grad_norm": 0.12067326158285141, "learning_rate": 9.029060137844106e-06, "loss": 0.0448, "num_input_tokens_seen": 66229520, "step": 30685 }, { "epoch": 5.632226096531474, "grad_norm": 5.280059337615967, "learning_rate": 9.028585900968152e-06, "loss": 0.4135, "num_input_tokens_seen": 66240432, "step": 30690 }, { "epoch": 5.633143696091026, "grad_norm": 30.7103271484375, "learning_rate": 9.028111560765214e-06, "loss": 0.0789, "num_input_tokens_seen": 66252976, "step": 30695 }, { "epoch": 5.634061295650578, "grad_norm": 0.13907666504383087, "learning_rate": 9.027637117247459e-06, "loss": 0.1213, "num_input_tokens_seen": 66264176, "step": 30700 }, { "epoch": 5.63497889521013, "grad_norm": 0.11376135051250458, "learning_rate": 9.027162570427052e-06, "loss": 0.0686, "num_input_tokens_seen": 66276080, "step": 30705 }, { "epoch": 5.635896494769683, "grad_norm": 0.13171467185020447, "learning_rate": 9.026687920316168e-06, "loss": 0.1798, "num_input_tokens_seen": 66285744, "step": 30710 }, { "epoch": 5.636814094329234, "grad_norm": 10.874724388122559, "learning_rate": 9.026213166926977e-06, "loss": 0.0286, "num_input_tokens_seen": 66296496, "step": 30715 }, { "epoch": 5.637731693888787, "grad_norm": 1.6038148403167725, "learning_rate": 9.025738310271663e-06, "loss": 0.1376, "num_input_tokens_seen": 66308144, "step": 30720 }, { "epoch": 5.638649293448339, "grad_norm": 24.947322845458984, "learning_rate": 9.025263350362397e-06, "loss": 0.0696, "num_input_tokens_seen": 66319248, "step": 30725 }, { "epoch": 5.639566893007891, "grad_norm": 1.7539112567901611, "learning_rate": 9.024788287211365e-06, "loss": 0.062, "num_input_tokens_seen": 66330192, "step": 30730 }, { "epoch": 5.6404844925674436, "grad_norm": 22.52704429626465, "learning_rate": 9.024313120830754e-06, "loss": 0.1109, "num_input_tokens_seen": 66340752, "step": 30735 }, { "epoch": 5.641402092126996, "grad_norm": 10.142600059509277, "learning_rate": 9.023837851232746e-06, "loss": 0.1778, "num_input_tokens_seen": 66351536, "step": 30740 }, { "epoch": 5.642319691686548, "grad_norm": 51.17525100708008, "learning_rate": 9.023362478429533e-06, "loss": 0.274, "num_input_tokens_seen": 66362128, "step": 30745 }, { "epoch": 5.6432372912461, "grad_norm": 0.40259650349617004, "learning_rate": 9.02288700243331e-06, "loss": 0.1545, "num_input_tokens_seen": 66373904, "step": 30750 }, { "epoch": 5.644154890805653, "grad_norm": 6.8537797927856445, "learning_rate": 9.02241142325627e-06, "loss": 0.0621, "num_input_tokens_seen": 66384336, "step": 30755 }, { "epoch": 5.645072490365204, "grad_norm": 6.062689304351807, "learning_rate": 9.02193574091061e-06, "loss": 0.1173, "num_input_tokens_seen": 66395536, "step": 30760 }, { "epoch": 5.645990089924757, "grad_norm": 0.1328095942735672, "learning_rate": 9.021459955408532e-06, "loss": 0.0521, "num_input_tokens_seen": 66405968, "step": 30765 }, { "epoch": 5.646907689484309, "grad_norm": 11.860091209411621, "learning_rate": 9.020984066762239e-06, "loss": 0.1836, "num_input_tokens_seen": 66415024, "step": 30770 }, { "epoch": 5.647825289043861, "grad_norm": 0.08014735579490662, "learning_rate": 9.020508074983939e-06, "loss": 0.0374, "num_input_tokens_seen": 66425296, "step": 30775 }, { "epoch": 5.6487428886034134, "grad_norm": 10.753229141235352, "learning_rate": 9.020031980085835e-06, "loss": 0.3101, "num_input_tokens_seen": 66436432, "step": 30780 }, { "epoch": 5.649660488162966, "grad_norm": 45.134220123291016, "learning_rate": 9.019555782080143e-06, "loss": 0.0189, "num_input_tokens_seen": 66447696, "step": 30785 }, { "epoch": 5.650578087722518, "grad_norm": 35.6851806640625, "learning_rate": 9.019079480979074e-06, "loss": 0.2527, "num_input_tokens_seen": 66459024, "step": 30790 }, { "epoch": 5.65149568728207, "grad_norm": 1.0853787660598755, "learning_rate": 9.018603076794845e-06, "loss": 0.0046, "num_input_tokens_seen": 66469680, "step": 30795 }, { "epoch": 5.652413286841623, "grad_norm": 0.09991669654846191, "learning_rate": 9.018126569539675e-06, "loss": 0.1105, "num_input_tokens_seen": 66481008, "step": 30800 }, { "epoch": 5.653330886401174, "grad_norm": 0.482485294342041, "learning_rate": 9.017649959225787e-06, "loss": 0.0058, "num_input_tokens_seen": 66491632, "step": 30805 }, { "epoch": 5.654248485960727, "grad_norm": 16.479782104492188, "learning_rate": 9.017173245865404e-06, "loss": 0.1445, "num_input_tokens_seen": 66501840, "step": 30810 }, { "epoch": 5.655166085520279, "grad_norm": 143.450439453125, "learning_rate": 9.016696429470753e-06, "loss": 0.0764, "num_input_tokens_seen": 66512656, "step": 30815 }, { "epoch": 5.656083685079831, "grad_norm": 0.14855384826660156, "learning_rate": 9.016219510054064e-06, "loss": 0.0969, "num_input_tokens_seen": 66523120, "step": 30820 }, { "epoch": 5.657001284639383, "grad_norm": 13.773694038391113, "learning_rate": 9.01574248762757e-06, "loss": 0.0484, "num_input_tokens_seen": 66533360, "step": 30825 }, { "epoch": 5.657918884198936, "grad_norm": 0.4401271343231201, "learning_rate": 9.015265362203505e-06, "loss": 0.0131, "num_input_tokens_seen": 66544208, "step": 30830 }, { "epoch": 5.6588364837584875, "grad_norm": 0.04944837838411331, "learning_rate": 9.014788133794105e-06, "loss": 0.3089, "num_input_tokens_seen": 66554896, "step": 30835 }, { "epoch": 5.65975408331804, "grad_norm": 0.04192923754453659, "learning_rate": 9.014310802411613e-06, "loss": 0.018, "num_input_tokens_seen": 66566064, "step": 30840 }, { "epoch": 5.6606716828775925, "grad_norm": 0.10796792805194855, "learning_rate": 9.01383336806827e-06, "loss": 0.1151, "num_input_tokens_seen": 66577104, "step": 30845 }, { "epoch": 5.661589282437144, "grad_norm": 0.040279656648635864, "learning_rate": 9.013355830776323e-06, "loss": 0.1182, "num_input_tokens_seen": 66587696, "step": 30850 }, { "epoch": 5.662506881996697, "grad_norm": 0.19243910908699036, "learning_rate": 9.012878190548018e-06, "loss": 0.006, "num_input_tokens_seen": 66596688, "step": 30855 }, { "epoch": 5.663424481556249, "grad_norm": 0.019397705793380737, "learning_rate": 9.012400447395607e-06, "loss": 0.0498, "num_input_tokens_seen": 66607472, "step": 30860 }, { "epoch": 5.664342081115801, "grad_norm": 0.1675458401441574, "learning_rate": 9.011922601331345e-06, "loss": 0.4943, "num_input_tokens_seen": 66618320, "step": 30865 }, { "epoch": 5.665259680675353, "grad_norm": 0.017937565222382545, "learning_rate": 9.011444652367483e-06, "loss": 0.0006, "num_input_tokens_seen": 66628400, "step": 30870 }, { "epoch": 5.666177280234906, "grad_norm": 0.10095537453889847, "learning_rate": 9.010966600516284e-06, "loss": 0.0687, "num_input_tokens_seen": 66639248, "step": 30875 }, { "epoch": 5.667094879794457, "grad_norm": 0.04106781259179115, "learning_rate": 9.010488445790008e-06, "loss": 0.1087, "num_input_tokens_seen": 66648656, "step": 30880 }, { "epoch": 5.66801247935401, "grad_norm": 0.04324505478143692, "learning_rate": 9.010010188200922e-06, "loss": 0.4252, "num_input_tokens_seen": 66658672, "step": 30885 }, { "epoch": 5.668930078913562, "grad_norm": 0.2010403871536255, "learning_rate": 9.009531827761286e-06, "loss": 0.0172, "num_input_tokens_seen": 66669552, "step": 30890 }, { "epoch": 5.669847678473114, "grad_norm": 22.952667236328125, "learning_rate": 9.009053364483374e-06, "loss": 0.2285, "num_input_tokens_seen": 66681072, "step": 30895 }, { "epoch": 5.6707652780326665, "grad_norm": 7.279782295227051, "learning_rate": 9.008574798379457e-06, "loss": 0.2138, "num_input_tokens_seen": 66691984, "step": 30900 }, { "epoch": 5.671682877592219, "grad_norm": 7.001574516296387, "learning_rate": 9.008096129461808e-06, "loss": 0.2056, "num_input_tokens_seen": 66702448, "step": 30905 }, { "epoch": 5.672600477151771, "grad_norm": 2.851252555847168, "learning_rate": 9.007617357742707e-06, "loss": 0.3518, "num_input_tokens_seen": 66713104, "step": 30910 }, { "epoch": 5.673518076711323, "grad_norm": 1.005842685699463, "learning_rate": 9.00713848323443e-06, "loss": 0.2395, "num_input_tokens_seen": 66724656, "step": 30915 }, { "epoch": 5.674435676270876, "grad_norm": 10.581319808959961, "learning_rate": 9.006659505949264e-06, "loss": 0.1313, "num_input_tokens_seen": 66736272, "step": 30920 }, { "epoch": 5.675353275830427, "grad_norm": 14.856578826904297, "learning_rate": 9.00618042589949e-06, "loss": 0.1515, "num_input_tokens_seen": 66747184, "step": 30925 }, { "epoch": 5.67627087538998, "grad_norm": 5.839112281799316, "learning_rate": 9.005701243097397e-06, "loss": 0.1111, "num_input_tokens_seen": 66758192, "step": 30930 }, { "epoch": 5.677188474949532, "grad_norm": 0.15955115854740143, "learning_rate": 9.005221957555274e-06, "loss": 0.1798, "num_input_tokens_seen": 66769008, "step": 30935 }, { "epoch": 5.678106074509084, "grad_norm": 0.36798742413520813, "learning_rate": 9.004742569285418e-06, "loss": 0.1299, "num_input_tokens_seen": 66779600, "step": 30940 }, { "epoch": 5.679023674068636, "grad_norm": 0.895107090473175, "learning_rate": 9.00426307830012e-06, "loss": 0.0983, "num_input_tokens_seen": 66790224, "step": 30945 }, { "epoch": 5.679941273628189, "grad_norm": 38.0841178894043, "learning_rate": 9.003783484611681e-06, "loss": 0.1213, "num_input_tokens_seen": 66801584, "step": 30950 }, { "epoch": 5.6808588731877405, "grad_norm": 0.06317780166864395, "learning_rate": 9.0033037882324e-06, "loss": 0.0071, "num_input_tokens_seen": 66813552, "step": 30955 }, { "epoch": 5.681776472747293, "grad_norm": 0.6535272598266602, "learning_rate": 9.002823989174582e-06, "loss": 0.0107, "num_input_tokens_seen": 66824688, "step": 30960 }, { "epoch": 5.6826940723068455, "grad_norm": 20.601520538330078, "learning_rate": 9.002344087450535e-06, "loss": 0.0491, "num_input_tokens_seen": 66834960, "step": 30965 }, { "epoch": 5.683611671866397, "grad_norm": 0.052487555891275406, "learning_rate": 9.00186408307256e-06, "loss": 0.0044, "num_input_tokens_seen": 66844880, "step": 30970 }, { "epoch": 5.68452927142595, "grad_norm": 0.017890384420752525, "learning_rate": 9.001383976052977e-06, "loss": 0.1307, "num_input_tokens_seen": 66855408, "step": 30975 }, { "epoch": 5.685446870985502, "grad_norm": 0.1140037551522255, "learning_rate": 9.000903766404097e-06, "loss": 0.1412, "num_input_tokens_seen": 66866224, "step": 30980 }, { "epoch": 5.686364470545054, "grad_norm": 3.7015202045440674, "learning_rate": 9.000423454138235e-06, "loss": 0.1924, "num_input_tokens_seen": 66877552, "step": 30985 }, { "epoch": 5.687282070104606, "grad_norm": 0.530907154083252, "learning_rate": 8.999943039267711e-06, "loss": 0.1086, "num_input_tokens_seen": 66889680, "step": 30990 }, { "epoch": 5.688199669664159, "grad_norm": 0.1332254558801651, "learning_rate": 8.999462521804849e-06, "loss": 0.1932, "num_input_tokens_seen": 66900112, "step": 30995 }, { "epoch": 5.68911726922371, "grad_norm": 0.017587967216968536, "learning_rate": 8.998981901761971e-06, "loss": 0.3886, "num_input_tokens_seen": 66911312, "step": 31000 }, { "epoch": 5.690034868783263, "grad_norm": 0.03983381763100624, "learning_rate": 8.998501179151405e-06, "loss": 0.0752, "num_input_tokens_seen": 66922128, "step": 31005 }, { "epoch": 5.690952468342815, "grad_norm": 36.946922302246094, "learning_rate": 8.998020353985481e-06, "loss": 0.0158, "num_input_tokens_seen": 66932592, "step": 31010 }, { "epoch": 5.691870067902367, "grad_norm": 0.043566394597291946, "learning_rate": 8.997539426276532e-06, "loss": 0.002, "num_input_tokens_seen": 66943632, "step": 31015 }, { "epoch": 5.6927876674619196, "grad_norm": 0.08598367869853973, "learning_rate": 8.997058396036891e-06, "loss": 0.0728, "num_input_tokens_seen": 66955024, "step": 31020 }, { "epoch": 5.693705267021472, "grad_norm": 0.08019941300153732, "learning_rate": 8.996577263278897e-06, "loss": 0.2175, "num_input_tokens_seen": 66965936, "step": 31025 }, { "epoch": 5.694622866581024, "grad_norm": 8.803245544433594, "learning_rate": 8.99609602801489e-06, "loss": 0.1369, "num_input_tokens_seen": 66976816, "step": 31030 }, { "epoch": 5.695540466140576, "grad_norm": 0.10752604156732559, "learning_rate": 8.995614690257216e-06, "loss": 0.082, "num_input_tokens_seen": 66987248, "step": 31035 }, { "epoch": 5.696458065700129, "grad_norm": 42.51823043823242, "learning_rate": 8.995133250018215e-06, "loss": 0.0472, "num_input_tokens_seen": 66997776, "step": 31040 }, { "epoch": 5.69737566525968, "grad_norm": 0.03179153427481651, "learning_rate": 8.994651707310241e-06, "loss": 0.0293, "num_input_tokens_seen": 67008400, "step": 31045 }, { "epoch": 5.698293264819233, "grad_norm": 0.0657179206609726, "learning_rate": 8.994170062145639e-06, "loss": 0.1325, "num_input_tokens_seen": 67019760, "step": 31050 }, { "epoch": 5.699210864378785, "grad_norm": 0.06091896817088127, "learning_rate": 8.993688314536766e-06, "loss": 0.0011, "num_input_tokens_seen": 67031088, "step": 31055 }, { "epoch": 5.700128463938337, "grad_norm": 0.06495974212884903, "learning_rate": 8.993206464495979e-06, "loss": 0.1636, "num_input_tokens_seen": 67042352, "step": 31060 }, { "epoch": 5.7010460634978894, "grad_norm": 0.20533417165279388, "learning_rate": 8.992724512035632e-06, "loss": 0.1161, "num_input_tokens_seen": 67053136, "step": 31065 }, { "epoch": 5.701963663057442, "grad_norm": 0.5625013709068298, "learning_rate": 8.992242457168091e-06, "loss": 0.0076, "num_input_tokens_seen": 67064048, "step": 31070 }, { "epoch": 5.702881262616994, "grad_norm": 65.09622955322266, "learning_rate": 8.991760299905718e-06, "loss": 0.1201, "num_input_tokens_seen": 67075632, "step": 31075 }, { "epoch": 5.703798862176546, "grad_norm": 0.043272484093904495, "learning_rate": 8.991278040260882e-06, "loss": 0.1348, "num_input_tokens_seen": 67086992, "step": 31080 }, { "epoch": 5.704716461736099, "grad_norm": 0.11682938039302826, "learning_rate": 8.990795678245949e-06, "loss": 0.003, "num_input_tokens_seen": 67097904, "step": 31085 }, { "epoch": 5.70563406129565, "grad_norm": 0.06399870663881302, "learning_rate": 8.990313213873291e-06, "loss": 0.4362, "num_input_tokens_seen": 67106896, "step": 31090 }, { "epoch": 5.706551660855203, "grad_norm": 51.07589340209961, "learning_rate": 8.989830647155285e-06, "loss": 0.0111, "num_input_tokens_seen": 67118416, "step": 31095 }, { "epoch": 5.707469260414755, "grad_norm": 10.078742027282715, "learning_rate": 8.989347978104305e-06, "loss": 0.1094, "num_input_tokens_seen": 67129104, "step": 31100 }, { "epoch": 5.708386859974307, "grad_norm": 0.9018994569778442, "learning_rate": 8.988865206732733e-06, "loss": 0.1497, "num_input_tokens_seen": 67140528, "step": 31105 }, { "epoch": 5.709304459533859, "grad_norm": 0.07358510792255402, "learning_rate": 8.988382333052951e-06, "loss": 0.0955, "num_input_tokens_seen": 67150800, "step": 31110 }, { "epoch": 5.710222059093412, "grad_norm": 1.4112651348114014, "learning_rate": 8.987899357077343e-06, "loss": 0.0034, "num_input_tokens_seen": 67162000, "step": 31115 }, { "epoch": 5.7111396586529635, "grad_norm": 0.1410091519355774, "learning_rate": 8.987416278818298e-06, "loss": 0.1044, "num_input_tokens_seen": 67171184, "step": 31120 }, { "epoch": 5.712057258212516, "grad_norm": 10.093505859375, "learning_rate": 8.986933098288205e-06, "loss": 0.174, "num_input_tokens_seen": 67181584, "step": 31125 }, { "epoch": 5.7129748577720685, "grad_norm": 13.061795234680176, "learning_rate": 8.986449815499456e-06, "loss": 0.1181, "num_input_tokens_seen": 67192112, "step": 31130 }, { "epoch": 5.71389245733162, "grad_norm": 18.2398624420166, "learning_rate": 8.98596643046445e-06, "loss": 0.1114, "num_input_tokens_seen": 67202800, "step": 31135 }, { "epoch": 5.714810056891173, "grad_norm": 8.884442329406738, "learning_rate": 8.985482943195581e-06, "loss": 0.3403, "num_input_tokens_seen": 67213680, "step": 31140 }, { "epoch": 5.715727656450725, "grad_norm": 56.0846061706543, "learning_rate": 8.984999353705252e-06, "loss": 0.2375, "num_input_tokens_seen": 67225552, "step": 31145 }, { "epoch": 5.716645256010277, "grad_norm": 0.03253186494112015, "learning_rate": 8.984515662005865e-06, "loss": 0.239, "num_input_tokens_seen": 67235056, "step": 31150 }, { "epoch": 5.717562855569829, "grad_norm": 0.11179307103157043, "learning_rate": 8.984031868109828e-06, "loss": 0.2061, "num_input_tokens_seen": 67245552, "step": 31155 }, { "epoch": 5.718480455129382, "grad_norm": 11.091896057128906, "learning_rate": 8.983547972029547e-06, "loss": 0.2222, "num_input_tokens_seen": 67257168, "step": 31160 }, { "epoch": 5.719398054688933, "grad_norm": 1.8119301795959473, "learning_rate": 8.983063973777436e-06, "loss": 0.1238, "num_input_tokens_seen": 67266544, "step": 31165 }, { "epoch": 5.720315654248486, "grad_norm": 6.277852535247803, "learning_rate": 8.982579873365906e-06, "loss": 0.0873, "num_input_tokens_seen": 67276784, "step": 31170 }, { "epoch": 5.721233253808038, "grad_norm": 48.518096923828125, "learning_rate": 8.982095670807376e-06, "loss": 0.1367, "num_input_tokens_seen": 67286576, "step": 31175 }, { "epoch": 5.72215085336759, "grad_norm": 0.045171402394771576, "learning_rate": 8.981611366114263e-06, "loss": 0.0051, "num_input_tokens_seen": 67298224, "step": 31180 }, { "epoch": 5.7230684529271425, "grad_norm": 0.3602280616760254, "learning_rate": 8.981126959298988e-06, "loss": 0.0029, "num_input_tokens_seen": 67309136, "step": 31185 }, { "epoch": 5.723986052486695, "grad_norm": 8.216277122497559, "learning_rate": 8.980642450373977e-06, "loss": 0.0063, "num_input_tokens_seen": 67320016, "step": 31190 }, { "epoch": 5.7249036520462475, "grad_norm": 5.513883113861084, "learning_rate": 8.98015783935166e-06, "loss": 0.1443, "num_input_tokens_seen": 67331056, "step": 31195 }, { "epoch": 5.725821251605799, "grad_norm": 0.21134021878242493, "learning_rate": 8.97967312624446e-06, "loss": 0.0818, "num_input_tokens_seen": 67342480, "step": 31200 }, { "epoch": 5.726738851165352, "grad_norm": 10.535643577575684, "learning_rate": 8.979188311064812e-06, "loss": 0.1492, "num_input_tokens_seen": 67353392, "step": 31205 }, { "epoch": 5.727656450724904, "grad_norm": 0.871564507484436, "learning_rate": 8.978703393825152e-06, "loss": 0.0028, "num_input_tokens_seen": 67363184, "step": 31210 }, { "epoch": 5.728574050284456, "grad_norm": 59.454959869384766, "learning_rate": 8.978218374537917e-06, "loss": 0.0282, "num_input_tokens_seen": 67373936, "step": 31215 }, { "epoch": 5.729491649844008, "grad_norm": 0.11607356369495392, "learning_rate": 8.977733253215545e-06, "loss": 0.0975, "num_input_tokens_seen": 67383920, "step": 31220 }, { "epoch": 5.730409249403561, "grad_norm": 9.39270305633545, "learning_rate": 8.97724802987048e-06, "loss": 0.0062, "num_input_tokens_seen": 67394288, "step": 31225 }, { "epoch": 5.731326848963112, "grad_norm": 0.0845961719751358, "learning_rate": 8.97676270451517e-06, "loss": 0.1586, "num_input_tokens_seen": 67404912, "step": 31230 }, { "epoch": 5.732244448522665, "grad_norm": 9.539677619934082, "learning_rate": 8.976277277162055e-06, "loss": 0.4497, "num_input_tokens_seen": 67415472, "step": 31235 }, { "epoch": 5.733162048082217, "grad_norm": 12.865145683288574, "learning_rate": 8.975791747823595e-06, "loss": 0.0217, "num_input_tokens_seen": 67426896, "step": 31240 }, { "epoch": 5.734079647641769, "grad_norm": 0.06483416259288788, "learning_rate": 8.975306116512236e-06, "loss": 0.0783, "num_input_tokens_seen": 67438512, "step": 31245 }, { "epoch": 5.7349972472013215, "grad_norm": 28.034366607666016, "learning_rate": 8.974820383240439e-06, "loss": 0.2636, "num_input_tokens_seen": 67449232, "step": 31250 }, { "epoch": 5.735914846760874, "grad_norm": 33.78131103515625, "learning_rate": 8.974334548020657e-06, "loss": 0.1526, "num_input_tokens_seen": 67459984, "step": 31255 }, { "epoch": 5.736832446320426, "grad_norm": 14.25334644317627, "learning_rate": 8.973848610865354e-06, "loss": 0.1886, "num_input_tokens_seen": 67471696, "step": 31260 }, { "epoch": 5.737750045879978, "grad_norm": 3.670199155807495, "learning_rate": 8.973362571786993e-06, "loss": 0.0037, "num_input_tokens_seen": 67483088, "step": 31265 }, { "epoch": 5.738667645439531, "grad_norm": 0.20050278306007385, "learning_rate": 8.972876430798041e-06, "loss": 0.124, "num_input_tokens_seen": 67493680, "step": 31270 }, { "epoch": 5.739585244999082, "grad_norm": 0.0977022722363472, "learning_rate": 8.972390187910966e-06, "loss": 0.2114, "num_input_tokens_seen": 67504880, "step": 31275 }, { "epoch": 5.740502844558635, "grad_norm": 0.09796828031539917, "learning_rate": 8.971903843138238e-06, "loss": 0.0435, "num_input_tokens_seen": 67515184, "step": 31280 }, { "epoch": 5.741420444118187, "grad_norm": 0.07854470610618591, "learning_rate": 8.971417396492333e-06, "loss": 0.0342, "num_input_tokens_seen": 67526640, "step": 31285 }, { "epoch": 5.742338043677739, "grad_norm": 0.3871440887451172, "learning_rate": 8.970930847985727e-06, "loss": 0.1886, "num_input_tokens_seen": 67536528, "step": 31290 }, { "epoch": 5.743255643237291, "grad_norm": 0.04047681763768196, "learning_rate": 8.970444197630898e-06, "loss": 0.1275, "num_input_tokens_seen": 67547152, "step": 31295 }, { "epoch": 5.744173242796844, "grad_norm": 0.08327042311429977, "learning_rate": 8.969957445440332e-06, "loss": 0.1143, "num_input_tokens_seen": 67556944, "step": 31300 }, { "epoch": 5.7450908423563956, "grad_norm": 0.1236468181014061, "learning_rate": 8.969470591426507e-06, "loss": 0.0947, "num_input_tokens_seen": 67566864, "step": 31305 }, { "epoch": 5.746008441915948, "grad_norm": 10.417633056640625, "learning_rate": 8.968983635601916e-06, "loss": 0.191, "num_input_tokens_seen": 67576816, "step": 31310 }, { "epoch": 5.746926041475501, "grad_norm": 0.051824480295181274, "learning_rate": 8.968496577979045e-06, "loss": 0.1078, "num_input_tokens_seen": 67586448, "step": 31315 }, { "epoch": 5.747843641035052, "grad_norm": 0.03622710332274437, "learning_rate": 8.968009418570388e-06, "loss": 0.1338, "num_input_tokens_seen": 67597264, "step": 31320 }, { "epoch": 5.748761240594605, "grad_norm": 0.019709942862391472, "learning_rate": 8.967522157388439e-06, "loss": 0.2265, "num_input_tokens_seen": 67608752, "step": 31325 }, { "epoch": 5.749678840154157, "grad_norm": 12.343653678894043, "learning_rate": 8.967034794445695e-06, "loss": 0.1775, "num_input_tokens_seen": 67619920, "step": 31330 }, { "epoch": 5.750596439713709, "grad_norm": 89.38876342773438, "learning_rate": 8.966547329754658e-06, "loss": 0.2281, "num_input_tokens_seen": 67631184, "step": 31335 }, { "epoch": 5.751514039273261, "grad_norm": 0.03405490145087242, "learning_rate": 8.966059763327828e-06, "loss": 0.1538, "num_input_tokens_seen": 67641200, "step": 31340 }, { "epoch": 5.752431638832814, "grad_norm": 1.8950345516204834, "learning_rate": 8.965572095177714e-06, "loss": 0.102, "num_input_tokens_seen": 67651664, "step": 31345 }, { "epoch": 5.7533492383923655, "grad_norm": 0.029432538896799088, "learning_rate": 8.96508432531682e-06, "loss": 0.0824, "num_input_tokens_seen": 67661904, "step": 31350 }, { "epoch": 5.754266837951918, "grad_norm": 1.0916414260864258, "learning_rate": 8.964596453757659e-06, "loss": 0.0029, "num_input_tokens_seen": 67670512, "step": 31355 }, { "epoch": 5.7551844375114705, "grad_norm": 31.402063369750977, "learning_rate": 8.964108480512744e-06, "loss": 0.1388, "num_input_tokens_seen": 67680656, "step": 31360 }, { "epoch": 5.756102037071022, "grad_norm": 0.107747383415699, "learning_rate": 8.963620405594591e-06, "loss": 0.2284, "num_input_tokens_seen": 67691408, "step": 31365 }, { "epoch": 5.757019636630575, "grad_norm": 0.0663238912820816, "learning_rate": 8.963132229015716e-06, "loss": 0.2711, "num_input_tokens_seen": 67701584, "step": 31370 }, { "epoch": 5.757937236190127, "grad_norm": 0.29902365803718567, "learning_rate": 8.962643950788643e-06, "loss": 0.005, "num_input_tokens_seen": 67710864, "step": 31375 }, { "epoch": 5.758854835749679, "grad_norm": 7.051577568054199, "learning_rate": 8.962155570925892e-06, "loss": 0.2187, "num_input_tokens_seen": 67721520, "step": 31380 }, { "epoch": 5.759772435309231, "grad_norm": 0.1036536917090416, "learning_rate": 8.961667089439993e-06, "loss": 0.0105, "num_input_tokens_seen": 67732208, "step": 31385 }, { "epoch": 5.760690034868784, "grad_norm": 0.13663198053836823, "learning_rate": 8.961178506343473e-06, "loss": 0.0811, "num_input_tokens_seen": 67743408, "step": 31390 }, { "epoch": 5.761607634428335, "grad_norm": 0.059858303517103195, "learning_rate": 8.960689821648864e-06, "loss": 0.1101, "num_input_tokens_seen": 67754352, "step": 31395 }, { "epoch": 5.762525233987888, "grad_norm": 0.09227102994918823, "learning_rate": 8.960201035368698e-06, "loss": 0.0263, "num_input_tokens_seen": 67765840, "step": 31400 }, { "epoch": 5.76344283354744, "grad_norm": 0.11895830184221268, "learning_rate": 8.959712147515515e-06, "loss": 0.3698, "num_input_tokens_seen": 67777488, "step": 31405 }, { "epoch": 5.764360433106992, "grad_norm": 0.10856369137763977, "learning_rate": 8.959223158101852e-06, "loss": 0.2722, "num_input_tokens_seen": 67787920, "step": 31410 }, { "epoch": 5.7652780326665445, "grad_norm": 12.998234748840332, "learning_rate": 8.95873406714025e-06, "loss": 0.1737, "num_input_tokens_seen": 67799504, "step": 31415 }, { "epoch": 5.766195632226097, "grad_norm": 0.20379973948001862, "learning_rate": 8.958244874643255e-06, "loss": 0.1671, "num_input_tokens_seen": 67810640, "step": 31420 }, { "epoch": 5.767113231785649, "grad_norm": 0.18912284076213837, "learning_rate": 8.957755580623416e-06, "loss": 0.1181, "num_input_tokens_seen": 67821040, "step": 31425 }, { "epoch": 5.768030831345201, "grad_norm": 0.21747225522994995, "learning_rate": 8.957266185093279e-06, "loss": 0.1221, "num_input_tokens_seen": 67832784, "step": 31430 }, { "epoch": 5.768948430904754, "grad_norm": 11.915511131286621, "learning_rate": 8.956776688065397e-06, "loss": 0.1725, "num_input_tokens_seen": 67842480, "step": 31435 }, { "epoch": 5.769866030464305, "grad_norm": 6.206552982330322, "learning_rate": 8.956287089552325e-06, "loss": 0.4075, "num_input_tokens_seen": 67853360, "step": 31440 }, { "epoch": 5.770783630023858, "grad_norm": 0.10264518111944199, "learning_rate": 8.955797389566622e-06, "loss": 0.1699, "num_input_tokens_seen": 67864944, "step": 31445 }, { "epoch": 5.77170122958341, "grad_norm": 14.149754524230957, "learning_rate": 8.955307588120847e-06, "loss": 0.0141, "num_input_tokens_seen": 67876080, "step": 31450 }, { "epoch": 5.772618829142962, "grad_norm": 1.203224778175354, "learning_rate": 8.954817685227561e-06, "loss": 0.1204, "num_input_tokens_seen": 67886672, "step": 31455 }, { "epoch": 5.773536428702514, "grad_norm": 15.799782752990723, "learning_rate": 8.954327680899333e-06, "loss": 0.1722, "num_input_tokens_seen": 67897936, "step": 31460 }, { "epoch": 5.774454028262067, "grad_norm": 0.13184067606925964, "learning_rate": 8.953837575148726e-06, "loss": 0.0069, "num_input_tokens_seen": 67908368, "step": 31465 }, { "epoch": 5.7753716278216185, "grad_norm": 0.08643735200166702, "learning_rate": 8.953347367988314e-06, "loss": 0.2297, "num_input_tokens_seen": 67918288, "step": 31470 }, { "epoch": 5.776289227381171, "grad_norm": 42.424007415771484, "learning_rate": 8.95285705943067e-06, "loss": 0.0852, "num_input_tokens_seen": 67928400, "step": 31475 }, { "epoch": 5.7772068269407235, "grad_norm": 11.70337963104248, "learning_rate": 8.952366649488368e-06, "loss": 0.0172, "num_input_tokens_seen": 67940048, "step": 31480 }, { "epoch": 5.778124426500275, "grad_norm": 0.5334041714668274, "learning_rate": 8.951876138173988e-06, "loss": 0.0187, "num_input_tokens_seen": 67951024, "step": 31485 }, { "epoch": 5.779042026059828, "grad_norm": 0.16141366958618164, "learning_rate": 8.951385525500109e-06, "loss": 0.0045, "num_input_tokens_seen": 67961744, "step": 31490 }, { "epoch": 5.77995962561938, "grad_norm": 22.00267791748047, "learning_rate": 8.950894811479317e-06, "loss": 0.1407, "num_input_tokens_seen": 67974064, "step": 31495 }, { "epoch": 5.780877225178932, "grad_norm": 0.12307039648294449, "learning_rate": 8.950403996124193e-06, "loss": 0.1356, "num_input_tokens_seen": 67984048, "step": 31500 }, { "epoch": 5.781794824738484, "grad_norm": 2.534884214401245, "learning_rate": 8.949913079447333e-06, "loss": 0.548, "num_input_tokens_seen": 67994864, "step": 31505 }, { "epoch": 5.782712424298037, "grad_norm": 29.147565841674805, "learning_rate": 8.949422061461322e-06, "loss": 0.1443, "num_input_tokens_seen": 68005744, "step": 31510 }, { "epoch": 5.783630023857588, "grad_norm": 0.07156211882829666, "learning_rate": 8.948930942178756e-06, "loss": 0.0196, "num_input_tokens_seen": 68016976, "step": 31515 }, { "epoch": 5.784547623417141, "grad_norm": 0.05965101346373558, "learning_rate": 8.948439721612232e-06, "loss": 0.0071, "num_input_tokens_seen": 68027888, "step": 31520 }, { "epoch": 5.785465222976693, "grad_norm": 0.16215839982032776, "learning_rate": 8.94794839977435e-06, "loss": 0.0038, "num_input_tokens_seen": 68038672, "step": 31525 }, { "epoch": 5.786382822536245, "grad_norm": 12.03934383392334, "learning_rate": 8.94745697667771e-06, "loss": 0.3143, "num_input_tokens_seen": 68049648, "step": 31530 }, { "epoch": 5.7873004220957975, "grad_norm": 0.2547158896923065, "learning_rate": 8.946965452334915e-06, "loss": 0.0039, "num_input_tokens_seen": 68059536, "step": 31535 }, { "epoch": 5.78821802165535, "grad_norm": 44.88242721557617, "learning_rate": 8.946473826758574e-06, "loss": 0.0235, "num_input_tokens_seen": 68069840, "step": 31540 }, { "epoch": 5.789135621214902, "grad_norm": 6.4767584800720215, "learning_rate": 8.945982099961297e-06, "loss": 0.2294, "num_input_tokens_seen": 68081488, "step": 31545 }, { "epoch": 5.790053220774454, "grad_norm": 0.05962943285703659, "learning_rate": 8.945490271955693e-06, "loss": 0.1976, "num_input_tokens_seen": 68093072, "step": 31550 }, { "epoch": 5.790970820334007, "grad_norm": 12.867923736572266, "learning_rate": 8.944998342754379e-06, "loss": 0.0142, "num_input_tokens_seen": 68104368, "step": 31555 }, { "epoch": 5.791888419893558, "grad_norm": 25.885181427001953, "learning_rate": 8.944506312369971e-06, "loss": 0.0984, "num_input_tokens_seen": 68115088, "step": 31560 }, { "epoch": 5.792806019453111, "grad_norm": 0.031562238931655884, "learning_rate": 8.944014180815093e-06, "loss": 0.171, "num_input_tokens_seen": 68124880, "step": 31565 }, { "epoch": 5.793723619012663, "grad_norm": 0.02702207677066326, "learning_rate": 8.943521948102361e-06, "loss": 0.0918, "num_input_tokens_seen": 68136304, "step": 31570 }, { "epoch": 5.794641218572215, "grad_norm": 0.047938209027051926, "learning_rate": 8.943029614244404e-06, "loss": 0.0018, "num_input_tokens_seen": 68148336, "step": 31575 }, { "epoch": 5.795558818131767, "grad_norm": 16.407304763793945, "learning_rate": 8.942537179253848e-06, "loss": 0.1026, "num_input_tokens_seen": 68159376, "step": 31580 }, { "epoch": 5.79647641769132, "grad_norm": 0.33116671442985535, "learning_rate": 8.942044643143323e-06, "loss": 0.1859, "num_input_tokens_seen": 68170416, "step": 31585 }, { "epoch": 5.797394017250872, "grad_norm": 0.10263919830322266, "learning_rate": 8.941552005925463e-06, "loss": 0.0036, "num_input_tokens_seen": 68181008, "step": 31590 }, { "epoch": 5.798311616810424, "grad_norm": 0.40885767340660095, "learning_rate": 8.941059267612903e-06, "loss": 0.0024, "num_input_tokens_seen": 68190960, "step": 31595 }, { "epoch": 5.799229216369977, "grad_norm": 7.6157755851745605, "learning_rate": 8.94056642821828e-06, "loss": 0.2273, "num_input_tokens_seen": 68201456, "step": 31600 }, { "epoch": 5.800146815929528, "grad_norm": 0.6131685972213745, "learning_rate": 8.940073487754236e-06, "loss": 0.0048, "num_input_tokens_seen": 68213296, "step": 31605 }, { "epoch": 5.801064415489081, "grad_norm": 0.20808091759681702, "learning_rate": 8.939580446233416e-06, "loss": 0.012, "num_input_tokens_seen": 68223888, "step": 31610 }, { "epoch": 5.801982015048633, "grad_norm": 0.07479389011859894, "learning_rate": 8.93908730366846e-06, "loss": 0.125, "num_input_tokens_seen": 68234864, "step": 31615 }, { "epoch": 5.802899614608185, "grad_norm": 0.01636206917464733, "learning_rate": 8.938594060072022e-06, "loss": 0.1226, "num_input_tokens_seen": 68245872, "step": 31620 }, { "epoch": 5.803817214167737, "grad_norm": 0.10770533233880997, "learning_rate": 8.93810071545675e-06, "loss": 0.3366, "num_input_tokens_seen": 68256080, "step": 31625 }, { "epoch": 5.80473481372729, "grad_norm": 0.6548166275024414, "learning_rate": 8.937607269835298e-06, "loss": 0.0038, "num_input_tokens_seen": 68266224, "step": 31630 }, { "epoch": 5.8056524132868415, "grad_norm": 0.051182568073272705, "learning_rate": 8.937113723220322e-06, "loss": 0.2783, "num_input_tokens_seen": 68277424, "step": 31635 }, { "epoch": 5.806570012846394, "grad_norm": 0.14794187247753143, "learning_rate": 8.93662007562448e-06, "loss": 0.084, "num_input_tokens_seen": 68287824, "step": 31640 }, { "epoch": 5.8074876124059465, "grad_norm": 0.16350431740283966, "learning_rate": 8.936126327060438e-06, "loss": 0.2918, "num_input_tokens_seen": 68297040, "step": 31645 }, { "epoch": 5.808405211965498, "grad_norm": 0.9875724911689758, "learning_rate": 8.935632477540852e-06, "loss": 0.0041, "num_input_tokens_seen": 68308880, "step": 31650 }, { "epoch": 5.809322811525051, "grad_norm": 0.03346380218863487, "learning_rate": 8.935138527078394e-06, "loss": 0.0911, "num_input_tokens_seen": 68319216, "step": 31655 }, { "epoch": 5.810240411084603, "grad_norm": 0.13953205943107605, "learning_rate": 8.934644475685732e-06, "loss": 0.1327, "num_input_tokens_seen": 68331280, "step": 31660 }, { "epoch": 5.811158010644155, "grad_norm": 0.0145075973123312, "learning_rate": 8.934150323375537e-06, "loss": 0.2816, "num_input_tokens_seen": 68341712, "step": 31665 }, { "epoch": 5.812075610203707, "grad_norm": 0.13033151626586914, "learning_rate": 8.933656070160483e-06, "loss": 0.2915, "num_input_tokens_seen": 68353072, "step": 31670 }, { "epoch": 5.81299320976326, "grad_norm": 18.13225746154785, "learning_rate": 8.933161716053249e-06, "loss": 0.2295, "num_input_tokens_seen": 68363344, "step": 31675 }, { "epoch": 5.813910809322811, "grad_norm": 0.12074199318885803, "learning_rate": 8.932667261066511e-06, "loss": 0.2836, "num_input_tokens_seen": 68375312, "step": 31680 }, { "epoch": 5.814828408882364, "grad_norm": 11.354260444641113, "learning_rate": 8.932172705212954e-06, "loss": 0.0783, "num_input_tokens_seen": 68385712, "step": 31685 }, { "epoch": 5.815746008441916, "grad_norm": 1.275866985321045, "learning_rate": 8.931678048505263e-06, "loss": 0.0297, "num_input_tokens_seen": 68396656, "step": 31690 }, { "epoch": 5.816663608001468, "grad_norm": 30.82210922241211, "learning_rate": 8.931183290956121e-06, "loss": 0.0859, "num_input_tokens_seen": 68407056, "step": 31695 }, { "epoch": 5.8175812075610205, "grad_norm": 0.08427371084690094, "learning_rate": 8.930688432578221e-06, "loss": 0.1544, "num_input_tokens_seen": 68416688, "step": 31700 }, { "epoch": 5.818498807120573, "grad_norm": 84.1693115234375, "learning_rate": 8.930193473384255e-06, "loss": 0.2734, "num_input_tokens_seen": 68428400, "step": 31705 }, { "epoch": 5.819416406680125, "grad_norm": 6.181521892547607, "learning_rate": 8.929698413386918e-06, "loss": 0.1631, "num_input_tokens_seen": 68438928, "step": 31710 }, { "epoch": 5.820334006239677, "grad_norm": 0.3346767723560333, "learning_rate": 8.929203252598907e-06, "loss": 0.1611, "num_input_tokens_seen": 68450032, "step": 31715 }, { "epoch": 5.82125160579923, "grad_norm": 1.5787657499313354, "learning_rate": 8.928707991032923e-06, "loss": 0.0045, "num_input_tokens_seen": 68461296, "step": 31720 }, { "epoch": 5.822169205358781, "grad_norm": 0.30657580494880676, "learning_rate": 8.928212628701667e-06, "loss": 0.0041, "num_input_tokens_seen": 68473008, "step": 31725 }, { "epoch": 5.823086804918334, "grad_norm": 0.04236256331205368, "learning_rate": 8.927717165617844e-06, "loss": 0.3812, "num_input_tokens_seen": 68483088, "step": 31730 }, { "epoch": 5.824004404477886, "grad_norm": 0.05411320552229881, "learning_rate": 8.927221601794165e-06, "loss": 0.1152, "num_input_tokens_seen": 68494160, "step": 31735 }, { "epoch": 5.824922004037438, "grad_norm": 2.866969585418701, "learning_rate": 8.926725937243337e-06, "loss": 0.1875, "num_input_tokens_seen": 68505328, "step": 31740 }, { "epoch": 5.82583960359699, "grad_norm": 0.40087220072746277, "learning_rate": 8.926230171978076e-06, "loss": 0.0704, "num_input_tokens_seen": 68516560, "step": 31745 }, { "epoch": 5.826757203156543, "grad_norm": 0.5813639163970947, "learning_rate": 8.925734306011096e-06, "loss": 0.0772, "num_input_tokens_seen": 68527184, "step": 31750 }, { "epoch": 5.8276748027160945, "grad_norm": 0.16652937233448029, "learning_rate": 8.925238339355115e-06, "loss": 0.0029, "num_input_tokens_seen": 68537168, "step": 31755 }, { "epoch": 5.828592402275647, "grad_norm": 0.025834666565060616, "learning_rate": 8.924742272022855e-06, "loss": 0.1058, "num_input_tokens_seen": 68548528, "step": 31760 }, { "epoch": 5.8295100018351995, "grad_norm": 3.838944673538208, "learning_rate": 8.924246104027036e-06, "loss": 0.0296, "num_input_tokens_seen": 68559824, "step": 31765 }, { "epoch": 5.830427601394751, "grad_norm": 7.730770111083984, "learning_rate": 8.92374983538039e-06, "loss": 0.122, "num_input_tokens_seen": 68571024, "step": 31770 }, { "epoch": 5.831345200954304, "grad_norm": 56.82780838012695, "learning_rate": 8.92325346609564e-06, "loss": 0.209, "num_input_tokens_seen": 68581520, "step": 31775 }, { "epoch": 5.832262800513856, "grad_norm": 0.09158734232187271, "learning_rate": 8.92275699618552e-06, "loss": 0.1808, "num_input_tokens_seen": 68591600, "step": 31780 }, { "epoch": 5.833180400073408, "grad_norm": 7.5915422439575195, "learning_rate": 8.922260425662762e-06, "loss": 0.2445, "num_input_tokens_seen": 68601968, "step": 31785 }, { "epoch": 5.83409799963296, "grad_norm": 0.08913777023553848, "learning_rate": 8.921763754540102e-06, "loss": 0.0027, "num_input_tokens_seen": 68612016, "step": 31790 }, { "epoch": 5.835015599192513, "grad_norm": 0.03516131266951561, "learning_rate": 8.921266982830282e-06, "loss": 0.0017, "num_input_tokens_seen": 68622672, "step": 31795 }, { "epoch": 5.835933198752064, "grad_norm": 20.137094497680664, "learning_rate": 8.92077011054604e-06, "loss": 0.2389, "num_input_tokens_seen": 68633840, "step": 31800 }, { "epoch": 5.836850798311617, "grad_norm": 0.05870705097913742, "learning_rate": 8.920273137700121e-06, "loss": 0.0642, "num_input_tokens_seen": 68644592, "step": 31805 }, { "epoch": 5.837768397871169, "grad_norm": 0.07224953174591064, "learning_rate": 8.919776064305274e-06, "loss": 0.0239, "num_input_tokens_seen": 68655184, "step": 31810 }, { "epoch": 5.838685997430721, "grad_norm": 0.38794592022895813, "learning_rate": 8.919278890374243e-06, "loss": 0.0029, "num_input_tokens_seen": 68665840, "step": 31815 }, { "epoch": 5.8396035969902735, "grad_norm": 0.09180091321468353, "learning_rate": 8.918781615919785e-06, "loss": 0.1717, "num_input_tokens_seen": 68675632, "step": 31820 }, { "epoch": 5.840521196549826, "grad_norm": 0.18989545106887817, "learning_rate": 8.918284240954653e-06, "loss": 0.0108, "num_input_tokens_seen": 68685456, "step": 31825 }, { "epoch": 5.841438796109378, "grad_norm": 11.383578300476074, "learning_rate": 8.9177867654916e-06, "loss": 0.379, "num_input_tokens_seen": 68695280, "step": 31830 }, { "epoch": 5.84235639566893, "grad_norm": 0.08540976047515869, "learning_rate": 8.91728918954339e-06, "loss": 0.2581, "num_input_tokens_seen": 68705488, "step": 31835 }, { "epoch": 5.843273995228483, "grad_norm": 0.4485798180103302, "learning_rate": 8.916791513122784e-06, "loss": 0.0046, "num_input_tokens_seen": 68716720, "step": 31840 }, { "epoch": 5.844191594788034, "grad_norm": 0.08237172663211823, "learning_rate": 8.916293736242546e-06, "loss": 0.0834, "num_input_tokens_seen": 68727344, "step": 31845 }, { "epoch": 5.845109194347587, "grad_norm": 0.216994047164917, "learning_rate": 8.915795858915444e-06, "loss": 0.3425, "num_input_tokens_seen": 68737456, "step": 31850 }, { "epoch": 5.846026793907139, "grad_norm": 0.23856322467327118, "learning_rate": 8.915297881154246e-06, "loss": 0.4096, "num_input_tokens_seen": 68748592, "step": 31855 }, { "epoch": 5.846944393466691, "grad_norm": 0.6666396856307983, "learning_rate": 8.914799802971725e-06, "loss": 0.0384, "num_input_tokens_seen": 68760112, "step": 31860 }, { "epoch": 5.847861993026243, "grad_norm": 0.06311637908220291, "learning_rate": 8.914301624380657e-06, "loss": 0.1189, "num_input_tokens_seen": 68770864, "step": 31865 }, { "epoch": 5.848779592585796, "grad_norm": 0.16150815784931183, "learning_rate": 8.91380334539382e-06, "loss": 0.108, "num_input_tokens_seen": 68781488, "step": 31870 }, { "epoch": 5.849697192145348, "grad_norm": 0.056146394461393356, "learning_rate": 8.913304966023993e-06, "loss": 0.2761, "num_input_tokens_seen": 68790992, "step": 31875 }, { "epoch": 5.8506147917049, "grad_norm": 0.30732354521751404, "learning_rate": 8.912806486283956e-06, "loss": 0.0034, "num_input_tokens_seen": 68800240, "step": 31880 }, { "epoch": 5.851532391264453, "grad_norm": 112.59532928466797, "learning_rate": 8.9123079061865e-06, "loss": 0.0967, "num_input_tokens_seen": 68810192, "step": 31885 }, { "epoch": 5.852449990824004, "grad_norm": 18.270465850830078, "learning_rate": 8.911809225744407e-06, "loss": 0.0089, "num_input_tokens_seen": 68821264, "step": 31890 }, { "epoch": 5.853367590383557, "grad_norm": 14.08195686340332, "learning_rate": 8.911310444970473e-06, "loss": 0.1029, "num_input_tokens_seen": 68832656, "step": 31895 }, { "epoch": 5.854285189943109, "grad_norm": 0.04017465561628342, "learning_rate": 8.910811563877486e-06, "loss": 0.005, "num_input_tokens_seen": 68845008, "step": 31900 }, { "epoch": 5.855202789502661, "grad_norm": 0.1088244840502739, "learning_rate": 8.910312582478245e-06, "loss": 0.0829, "num_input_tokens_seen": 68857296, "step": 31905 }, { "epoch": 5.856120389062213, "grad_norm": 6.1307454109191895, "learning_rate": 8.909813500785546e-06, "loss": 0.1779, "num_input_tokens_seen": 68868368, "step": 31910 }, { "epoch": 5.857037988621766, "grad_norm": 0.0496232770383358, "learning_rate": 8.90931431881219e-06, "loss": 0.2365, "num_input_tokens_seen": 68879312, "step": 31915 }, { "epoch": 5.8579555881813175, "grad_norm": 0.19166304171085358, "learning_rate": 8.90881503657098e-06, "loss": 0.0074, "num_input_tokens_seen": 68888528, "step": 31920 }, { "epoch": 5.85887318774087, "grad_norm": 6.432265281677246, "learning_rate": 8.908315654074724e-06, "loss": 0.0891, "num_input_tokens_seen": 68898992, "step": 31925 }, { "epoch": 5.8597907873004225, "grad_norm": 0.039928361773490906, "learning_rate": 8.907816171336229e-06, "loss": 0.0471, "num_input_tokens_seen": 68910224, "step": 31930 }, { "epoch": 5.860708386859974, "grad_norm": 0.4842046797275543, "learning_rate": 8.907316588368305e-06, "loss": 0.0036, "num_input_tokens_seen": 68920784, "step": 31935 }, { "epoch": 5.861625986419527, "grad_norm": 0.07506497204303741, "learning_rate": 8.906816905183766e-06, "loss": 0.1879, "num_input_tokens_seen": 68929680, "step": 31940 }, { "epoch": 5.862543585979079, "grad_norm": 11.013386726379395, "learning_rate": 8.90631712179543e-06, "loss": 0.1902, "num_input_tokens_seen": 68940400, "step": 31945 }, { "epoch": 5.863461185538631, "grad_norm": 40.11905288696289, "learning_rate": 8.905817238216112e-06, "loss": 0.5584, "num_input_tokens_seen": 68951952, "step": 31950 }, { "epoch": 5.864378785098183, "grad_norm": 43.9194221496582, "learning_rate": 8.905317254458636e-06, "loss": 0.024, "num_input_tokens_seen": 68963248, "step": 31955 }, { "epoch": 5.865296384657736, "grad_norm": 13.524542808532715, "learning_rate": 8.904817170535825e-06, "loss": 0.3098, "num_input_tokens_seen": 68974800, "step": 31960 }, { "epoch": 5.866213984217287, "grad_norm": 17.00539207458496, "learning_rate": 8.904316986460506e-06, "loss": 0.4477, "num_input_tokens_seen": 68984880, "step": 31965 }, { "epoch": 5.86713158377684, "grad_norm": 0.05522933229804039, "learning_rate": 8.903816702245507e-06, "loss": 0.0058, "num_input_tokens_seen": 68995504, "step": 31970 }, { "epoch": 5.868049183336392, "grad_norm": 2.1003329753875732, "learning_rate": 8.90331631790366e-06, "loss": 0.0192, "num_input_tokens_seen": 69006160, "step": 31975 }, { "epoch": 5.868966782895944, "grad_norm": 0.33314695954322815, "learning_rate": 8.902815833447799e-06, "loss": 0.1227, "num_input_tokens_seen": 69017712, "step": 31980 }, { "epoch": 5.8698843824554965, "grad_norm": 0.049551017582416534, "learning_rate": 8.90231524889076e-06, "loss": 0.2259, "num_input_tokens_seen": 69029232, "step": 31985 }, { "epoch": 5.870801982015049, "grad_norm": 75.79684448242188, "learning_rate": 8.901814564245385e-06, "loss": 0.2737, "num_input_tokens_seen": 69040112, "step": 31990 }, { "epoch": 5.871719581574601, "grad_norm": 28.934497833251953, "learning_rate": 8.901313779524512e-06, "loss": 0.1073, "num_input_tokens_seen": 69051376, "step": 31995 }, { "epoch": 5.872637181134153, "grad_norm": 0.06726974248886108, "learning_rate": 8.900812894740986e-06, "loss": 0.0155, "num_input_tokens_seen": 69062608, "step": 32000 }, { "epoch": 5.873554780693706, "grad_norm": 0.3653315007686615, "learning_rate": 8.900311909907658e-06, "loss": 0.0031, "num_input_tokens_seen": 69073168, "step": 32005 }, { "epoch": 5.874472380253257, "grad_norm": 0.042770255357027054, "learning_rate": 8.899810825037373e-06, "loss": 0.145, "num_input_tokens_seen": 69084400, "step": 32010 }, { "epoch": 5.87538997981281, "grad_norm": 21.354982376098633, "learning_rate": 8.899309640142984e-06, "loss": 0.175, "num_input_tokens_seen": 69095792, "step": 32015 }, { "epoch": 5.876307579372362, "grad_norm": 0.062228649854660034, "learning_rate": 8.898808355237346e-06, "loss": 0.2742, "num_input_tokens_seen": 69107536, "step": 32020 }, { "epoch": 5.877225178931914, "grad_norm": 32.252323150634766, "learning_rate": 8.898306970333316e-06, "loss": 0.0147, "num_input_tokens_seen": 69118352, "step": 32025 }, { "epoch": 5.878142778491466, "grad_norm": 0.23113784193992615, "learning_rate": 8.897805485443756e-06, "loss": 0.0695, "num_input_tokens_seen": 69128784, "step": 32030 }, { "epoch": 5.879060378051019, "grad_norm": 0.06329655647277832, "learning_rate": 8.897303900581525e-06, "loss": 0.198, "num_input_tokens_seen": 69138064, "step": 32035 }, { "epoch": 5.8799779776105705, "grad_norm": 27.50393295288086, "learning_rate": 8.89680221575949e-06, "loss": 0.3337, "num_input_tokens_seen": 69148112, "step": 32040 }, { "epoch": 5.880895577170123, "grad_norm": 5.0691447257995605, "learning_rate": 8.896300430990516e-06, "loss": 0.0061, "num_input_tokens_seen": 69160464, "step": 32045 }, { "epoch": 5.8818131767296755, "grad_norm": 0.06668804585933685, "learning_rate": 8.895798546287477e-06, "loss": 0.0152, "num_input_tokens_seen": 69172240, "step": 32050 }, { "epoch": 5.882730776289227, "grad_norm": 2.2179102897644043, "learning_rate": 8.895296561663241e-06, "loss": 0.1082, "num_input_tokens_seen": 69183376, "step": 32055 }, { "epoch": 5.88364837584878, "grad_norm": 8.227094650268555, "learning_rate": 8.894794477130686e-06, "loss": 0.2167, "num_input_tokens_seen": 69192560, "step": 32060 }, { "epoch": 5.884565975408332, "grad_norm": 0.0901767909526825, "learning_rate": 8.89429229270269e-06, "loss": 0.3135, "num_input_tokens_seen": 69202256, "step": 32065 }, { "epoch": 5.885483574967884, "grad_norm": 9.233527183532715, "learning_rate": 8.893790008392132e-06, "loss": 0.0133, "num_input_tokens_seen": 69213424, "step": 32070 }, { "epoch": 5.886401174527436, "grad_norm": 1.7280542850494385, "learning_rate": 8.893287624211896e-06, "loss": 0.0081, "num_input_tokens_seen": 69224496, "step": 32075 }, { "epoch": 5.887318774086989, "grad_norm": 0.04542730003595352, "learning_rate": 8.892785140174864e-06, "loss": 0.1969, "num_input_tokens_seen": 69236048, "step": 32080 }, { "epoch": 5.88823637364654, "grad_norm": 82.18380737304688, "learning_rate": 8.892282556293928e-06, "loss": 0.1941, "num_input_tokens_seen": 69246704, "step": 32085 }, { "epoch": 5.889153973206093, "grad_norm": 0.1692356914281845, "learning_rate": 8.891779872581976e-06, "loss": 0.0887, "num_input_tokens_seen": 69257680, "step": 32090 }, { "epoch": 5.890071572765645, "grad_norm": 0.7114604115486145, "learning_rate": 8.891277089051902e-06, "loss": 0.0066, "num_input_tokens_seen": 69268976, "step": 32095 }, { "epoch": 5.890989172325197, "grad_norm": 0.05707436054944992, "learning_rate": 8.890774205716602e-06, "loss": 0.0034, "num_input_tokens_seen": 69280336, "step": 32100 }, { "epoch": 5.8919067718847495, "grad_norm": 1.4557859897613525, "learning_rate": 8.890271222588974e-06, "loss": 0.0053, "num_input_tokens_seen": 69291856, "step": 32105 }, { "epoch": 5.892824371444302, "grad_norm": 0.1062036007642746, "learning_rate": 8.88976813968192e-06, "loss": 0.107, "num_input_tokens_seen": 69301968, "step": 32110 }, { "epoch": 5.893741971003854, "grad_norm": 25.070425033569336, "learning_rate": 8.889264957008339e-06, "loss": 0.1997, "num_input_tokens_seen": 69311472, "step": 32115 }, { "epoch": 5.894659570563406, "grad_norm": 4.310213088989258, "learning_rate": 8.888761674581141e-06, "loss": 0.1609, "num_input_tokens_seen": 69323632, "step": 32120 }, { "epoch": 5.895577170122959, "grad_norm": 0.17235296964645386, "learning_rate": 8.888258292413233e-06, "loss": 0.0889, "num_input_tokens_seen": 69333680, "step": 32125 }, { "epoch": 5.89649476968251, "grad_norm": 23.277429580688477, "learning_rate": 8.887754810517525e-06, "loss": 0.0624, "num_input_tokens_seen": 69343184, "step": 32130 }, { "epoch": 5.897412369242063, "grad_norm": 0.08517672121524811, "learning_rate": 8.887251228906934e-06, "loss": 0.0046, "num_input_tokens_seen": 69353360, "step": 32135 }, { "epoch": 5.898329968801615, "grad_norm": 23.577489852905273, "learning_rate": 8.886747547594372e-06, "loss": 0.1661, "num_input_tokens_seen": 69364784, "step": 32140 }, { "epoch": 5.899247568361167, "grad_norm": 67.30699157714844, "learning_rate": 8.88624376659276e-06, "loss": 0.026, "num_input_tokens_seen": 69376624, "step": 32145 }, { "epoch": 5.900165167920719, "grad_norm": 0.09819413721561432, "learning_rate": 8.885739885915018e-06, "loss": 0.025, "num_input_tokens_seen": 69386416, "step": 32150 }, { "epoch": 5.901082767480272, "grad_norm": 0.23412923514842987, "learning_rate": 8.88523590557407e-06, "loss": 0.1156, "num_input_tokens_seen": 69396752, "step": 32155 }, { "epoch": 5.902000367039824, "grad_norm": 0.5408881902694702, "learning_rate": 8.884731825582844e-06, "loss": 0.0028, "num_input_tokens_seen": 69406096, "step": 32160 }, { "epoch": 5.902917966599376, "grad_norm": 0.0696612223982811, "learning_rate": 8.884227645954267e-06, "loss": 0.0939, "num_input_tokens_seen": 69416368, "step": 32165 }, { "epoch": 5.903835566158929, "grad_norm": 0.03045862913131714, "learning_rate": 8.883723366701268e-06, "loss": 0.1127, "num_input_tokens_seen": 69426576, "step": 32170 }, { "epoch": 5.90475316571848, "grad_norm": 0.04677632078528404, "learning_rate": 8.883218987836788e-06, "loss": 0.2517, "num_input_tokens_seen": 69436304, "step": 32175 }, { "epoch": 5.905670765278033, "grad_norm": 0.2053629755973816, "learning_rate": 8.882714509373758e-06, "loss": 0.002, "num_input_tokens_seen": 69447728, "step": 32180 }, { "epoch": 5.906588364837585, "grad_norm": 0.07870300859212875, "learning_rate": 8.882209931325118e-06, "loss": 0.0021, "num_input_tokens_seen": 69457840, "step": 32185 }, { "epoch": 5.907505964397137, "grad_norm": 0.9747955203056335, "learning_rate": 8.88170525370381e-06, "loss": 0.0228, "num_input_tokens_seen": 69469296, "step": 32190 }, { "epoch": 5.908423563956689, "grad_norm": 0.06537698209285736, "learning_rate": 8.881200476522778e-06, "loss": 0.0587, "num_input_tokens_seen": 69479376, "step": 32195 }, { "epoch": 5.909341163516242, "grad_norm": 0.016772381961345673, "learning_rate": 8.88069559979497e-06, "loss": 0.0023, "num_input_tokens_seen": 69490320, "step": 32200 }, { "epoch": 5.9102587630757935, "grad_norm": 0.014121784828603268, "learning_rate": 8.880190623533334e-06, "loss": 0.0761, "num_input_tokens_seen": 69501520, "step": 32205 }, { "epoch": 5.911176362635346, "grad_norm": 0.009577852673828602, "learning_rate": 8.879685547750823e-06, "loss": 0.0029, "num_input_tokens_seen": 69510960, "step": 32210 }, { "epoch": 5.9120939621948985, "grad_norm": 0.04204529523849487, "learning_rate": 8.87918037246039e-06, "loss": 0.3105, "num_input_tokens_seen": 69521520, "step": 32215 }, { "epoch": 5.91301156175445, "grad_norm": 0.02474026381969452, "learning_rate": 8.878675097674991e-06, "loss": 0.0687, "num_input_tokens_seen": 69532528, "step": 32220 }, { "epoch": 5.913929161314003, "grad_norm": 27.475801467895508, "learning_rate": 8.87816972340759e-06, "loss": 0.4371, "num_input_tokens_seen": 69543408, "step": 32225 }, { "epoch": 5.914846760873555, "grad_norm": 0.35637620091438293, "learning_rate": 8.877664249671145e-06, "loss": 0.0026, "num_input_tokens_seen": 69554640, "step": 32230 }, { "epoch": 5.915764360433107, "grad_norm": 0.029365571215748787, "learning_rate": 8.877158676478622e-06, "loss": 0.1423, "num_input_tokens_seen": 69565200, "step": 32235 }, { "epoch": 5.916681959992659, "grad_norm": 10.862005233764648, "learning_rate": 8.876653003842987e-06, "loss": 0.3684, "num_input_tokens_seen": 69575888, "step": 32240 }, { "epoch": 5.917599559552212, "grad_norm": 1.5400925874710083, "learning_rate": 8.876147231777212e-06, "loss": 0.1208, "num_input_tokens_seen": 69586224, "step": 32245 }, { "epoch": 5.918517159111763, "grad_norm": 0.14219990372657776, "learning_rate": 8.875641360294267e-06, "loss": 0.1247, "num_input_tokens_seen": 69597008, "step": 32250 }, { "epoch": 5.919434758671316, "grad_norm": 0.037717171013355255, "learning_rate": 8.87513538940713e-06, "loss": 0.0058, "num_input_tokens_seen": 69606512, "step": 32255 }, { "epoch": 5.920352358230868, "grad_norm": 0.1597435623407364, "learning_rate": 8.874629319128774e-06, "loss": 0.0988, "num_input_tokens_seen": 69616784, "step": 32260 }, { "epoch": 5.92126995779042, "grad_norm": 33.696041107177734, "learning_rate": 8.87412314947218e-06, "loss": 0.3809, "num_input_tokens_seen": 69627312, "step": 32265 }, { "epoch": 5.9221875573499725, "grad_norm": 0.16713209450244904, "learning_rate": 8.873616880450335e-06, "loss": 0.0804, "num_input_tokens_seen": 69638000, "step": 32270 }, { "epoch": 5.923105156909525, "grad_norm": 4.305044651031494, "learning_rate": 8.873110512076218e-06, "loss": 0.1192, "num_input_tokens_seen": 69649264, "step": 32275 }, { "epoch": 5.924022756469077, "grad_norm": 0.07633037865161896, "learning_rate": 8.87260404436282e-06, "loss": 0.2495, "num_input_tokens_seen": 69660112, "step": 32280 }, { "epoch": 5.924940356028629, "grad_norm": 8.515779495239258, "learning_rate": 8.87209747732313e-06, "loss": 0.1391, "num_input_tokens_seen": 69671536, "step": 32285 }, { "epoch": 5.925857955588182, "grad_norm": 5.34775447845459, "learning_rate": 8.871590810970143e-06, "loss": 0.1393, "num_input_tokens_seen": 69682608, "step": 32290 }, { "epoch": 5.926775555147733, "grad_norm": 0.16620612144470215, "learning_rate": 8.871084045316849e-06, "loss": 0.2535, "num_input_tokens_seen": 69694064, "step": 32295 }, { "epoch": 5.927693154707286, "grad_norm": 0.36818936467170715, "learning_rate": 8.87057718037625e-06, "loss": 0.2827, "num_input_tokens_seen": 69704304, "step": 32300 }, { "epoch": 5.928610754266838, "grad_norm": 16.48908805847168, "learning_rate": 8.870070216161346e-06, "loss": 0.284, "num_input_tokens_seen": 69715280, "step": 32305 }, { "epoch": 5.92952835382639, "grad_norm": 0.14274878799915314, "learning_rate": 8.869563152685139e-06, "loss": 0.0035, "num_input_tokens_seen": 69725840, "step": 32310 }, { "epoch": 5.930445953385942, "grad_norm": 0.24155056476593018, "learning_rate": 8.869055989960633e-06, "loss": 0.0109, "num_input_tokens_seen": 69736720, "step": 32315 }, { "epoch": 5.931363552945495, "grad_norm": 6.360698699951172, "learning_rate": 8.868548728000838e-06, "loss": 0.1868, "num_input_tokens_seen": 69747888, "step": 32320 }, { "epoch": 5.9322811525050465, "grad_norm": 0.255706250667572, "learning_rate": 8.868041366818762e-06, "loss": 0.0045, "num_input_tokens_seen": 69757296, "step": 32325 }, { "epoch": 5.933198752064599, "grad_norm": 8.475006103515625, "learning_rate": 8.867533906427424e-06, "loss": 0.2829, "num_input_tokens_seen": 69767600, "step": 32330 }, { "epoch": 5.9341163516241515, "grad_norm": 2.1218366622924805, "learning_rate": 8.867026346839832e-06, "loss": 0.0085, "num_input_tokens_seen": 69777936, "step": 32335 }, { "epoch": 5.935033951183703, "grad_norm": 0.14965525269508362, "learning_rate": 8.866518688069008e-06, "loss": 0.0043, "num_input_tokens_seen": 69788944, "step": 32340 }, { "epoch": 5.935951550743256, "grad_norm": 0.07712744921445847, "learning_rate": 8.866010930127974e-06, "loss": 0.1669, "num_input_tokens_seen": 69801392, "step": 32345 }, { "epoch": 5.936869150302808, "grad_norm": 0.07554560154676437, "learning_rate": 8.865503073029751e-06, "loss": 0.2032, "num_input_tokens_seen": 69813520, "step": 32350 }, { "epoch": 5.93778674986236, "grad_norm": 0.048636697232723236, "learning_rate": 8.864995116787363e-06, "loss": 0.1002, "num_input_tokens_seen": 69825008, "step": 32355 }, { "epoch": 5.938704349421912, "grad_norm": 5.228460788726807, "learning_rate": 8.864487061413842e-06, "loss": 0.1836, "num_input_tokens_seen": 69836368, "step": 32360 }, { "epoch": 5.939621948981465, "grad_norm": 2.212637424468994, "learning_rate": 8.863978906922219e-06, "loss": 0.0073, "num_input_tokens_seen": 69847280, "step": 32365 }, { "epoch": 5.940539548541016, "grad_norm": 1.1194180250167847, "learning_rate": 8.863470653325523e-06, "loss": 0.1188, "num_input_tokens_seen": 69859024, "step": 32370 }, { "epoch": 5.941457148100569, "grad_norm": 0.04571658745408058, "learning_rate": 8.862962300636793e-06, "loss": 0.0983, "num_input_tokens_seen": 69869744, "step": 32375 }, { "epoch": 5.942374747660121, "grad_norm": 69.17476654052734, "learning_rate": 8.862453848869067e-06, "loss": 0.1255, "num_input_tokens_seen": 69881232, "step": 32380 }, { "epoch": 5.943292347219673, "grad_norm": 0.1225489154458046, "learning_rate": 8.861945298035389e-06, "loss": 0.1537, "num_input_tokens_seen": 69891600, "step": 32385 }, { "epoch": 5.9442099467792255, "grad_norm": 0.12847979366779327, "learning_rate": 8.861436648148796e-06, "loss": 0.0375, "num_input_tokens_seen": 69901104, "step": 32390 }, { "epoch": 5.945127546338778, "grad_norm": 0.10490462183952332, "learning_rate": 8.86092789922234e-06, "loss": 0.004, "num_input_tokens_seen": 69911888, "step": 32395 }, { "epoch": 5.94604514589833, "grad_norm": 0.7723609805107117, "learning_rate": 8.860419051269064e-06, "loss": 0.0247, "num_input_tokens_seen": 69923056, "step": 32400 }, { "epoch": 5.946962745457882, "grad_norm": 0.02770000882446766, "learning_rate": 8.859910104302025e-06, "loss": 0.2316, "num_input_tokens_seen": 69933552, "step": 32405 }, { "epoch": 5.947880345017435, "grad_norm": 13.62104320526123, "learning_rate": 8.859401058334274e-06, "loss": 0.1803, "num_input_tokens_seen": 69944432, "step": 32410 }, { "epoch": 5.948797944576986, "grad_norm": 0.1275041401386261, "learning_rate": 8.858891913378866e-06, "loss": 0.0102, "num_input_tokens_seen": 69954992, "step": 32415 }, { "epoch": 5.949715544136539, "grad_norm": 10.604561805725098, "learning_rate": 8.858382669448863e-06, "loss": 0.1542, "num_input_tokens_seen": 69966000, "step": 32420 }, { "epoch": 5.950633143696091, "grad_norm": 0.0389799065887928, "learning_rate": 8.857873326557325e-06, "loss": 0.0014, "num_input_tokens_seen": 69976400, "step": 32425 }, { "epoch": 5.951550743255643, "grad_norm": 0.029162853956222534, "learning_rate": 8.857363884717314e-06, "loss": 0.0134, "num_input_tokens_seen": 69987536, "step": 32430 }, { "epoch": 5.952468342815195, "grad_norm": 0.1623055785894394, "learning_rate": 8.856854343941896e-06, "loss": 0.0012, "num_input_tokens_seen": 69998928, "step": 32435 }, { "epoch": 5.953385942374748, "grad_norm": 0.12966956198215485, "learning_rate": 8.856344704244143e-06, "loss": 0.1152, "num_input_tokens_seen": 70008144, "step": 32440 }, { "epoch": 5.9543035419343, "grad_norm": 0.18434731662273407, "learning_rate": 8.855834965637127e-06, "loss": 0.3647, "num_input_tokens_seen": 70018960, "step": 32445 }, { "epoch": 5.955221141493852, "grad_norm": 0.06974612921476364, "learning_rate": 8.855325128133918e-06, "loss": 0.0019, "num_input_tokens_seen": 70030672, "step": 32450 }, { "epoch": 5.956138741053405, "grad_norm": 6.903306007385254, "learning_rate": 8.854815191747596e-06, "loss": 0.1993, "num_input_tokens_seen": 70043088, "step": 32455 }, { "epoch": 5.957056340612956, "grad_norm": 93.88909912109375, "learning_rate": 8.854305156491239e-06, "loss": 0.2152, "num_input_tokens_seen": 70054608, "step": 32460 }, { "epoch": 5.957973940172509, "grad_norm": 0.5628851652145386, "learning_rate": 8.853795022377927e-06, "loss": 0.0024, "num_input_tokens_seen": 70065296, "step": 32465 }, { "epoch": 5.958891539732061, "grad_norm": 48.91729736328125, "learning_rate": 8.853284789420746e-06, "loss": 0.2112, "num_input_tokens_seen": 70076304, "step": 32470 }, { "epoch": 5.959809139291613, "grad_norm": 5.293886661529541, "learning_rate": 8.852774457632782e-06, "loss": 0.3507, "num_input_tokens_seen": 70088080, "step": 32475 }, { "epoch": 5.960726738851165, "grad_norm": 0.05428225174546242, "learning_rate": 8.852264027027124e-06, "loss": 0.2005, "num_input_tokens_seen": 70098960, "step": 32480 }, { "epoch": 5.961644338410718, "grad_norm": 0.028041185811161995, "learning_rate": 8.851753497616866e-06, "loss": 0.0241, "num_input_tokens_seen": 70110160, "step": 32485 }, { "epoch": 5.9625619379702695, "grad_norm": 0.14485007524490356, "learning_rate": 8.851242869415099e-06, "loss": 0.0067, "num_input_tokens_seen": 70122352, "step": 32490 }, { "epoch": 5.963479537529822, "grad_norm": 0.07398030906915665, "learning_rate": 8.850732142434921e-06, "loss": 0.0048, "num_input_tokens_seen": 70133232, "step": 32495 }, { "epoch": 5.9643971370893745, "grad_norm": 0.13939309120178223, "learning_rate": 8.850221316689432e-06, "loss": 0.0358, "num_input_tokens_seen": 70144592, "step": 32500 }, { "epoch": 5.965314736648926, "grad_norm": 0.15954819321632385, "learning_rate": 8.849710392191735e-06, "loss": 0.0855, "num_input_tokens_seen": 70156400, "step": 32505 }, { "epoch": 5.966232336208479, "grad_norm": 4.581618309020996, "learning_rate": 8.84919936895493e-06, "loss": 0.1625, "num_input_tokens_seen": 70166320, "step": 32510 }, { "epoch": 5.967149935768031, "grad_norm": 74.63755798339844, "learning_rate": 8.84868824699213e-06, "loss": 0.4465, "num_input_tokens_seen": 70178000, "step": 32515 }, { "epoch": 5.968067535327583, "grad_norm": 6.4055891036987305, "learning_rate": 8.84817702631644e-06, "loss": 0.3406, "num_input_tokens_seen": 70189520, "step": 32520 }, { "epoch": 5.968985134887135, "grad_norm": 0.6504117250442505, "learning_rate": 8.847665706940973e-06, "loss": 0.0842, "num_input_tokens_seen": 70200752, "step": 32525 }, { "epoch": 5.969902734446688, "grad_norm": 0.4935556948184967, "learning_rate": 8.847154288878845e-06, "loss": 0.1094, "num_input_tokens_seen": 70211568, "step": 32530 }, { "epoch": 5.970820334006239, "grad_norm": 0.1240197941660881, "learning_rate": 8.84664277214317e-06, "loss": 0.1389, "num_input_tokens_seen": 70222640, "step": 32535 }, { "epoch": 5.971737933565792, "grad_norm": 0.15583202242851257, "learning_rate": 8.84613115674707e-06, "loss": 0.0043, "num_input_tokens_seen": 70232048, "step": 32540 }, { "epoch": 5.972655533125344, "grad_norm": 0.05511702969670296, "learning_rate": 8.84561944270367e-06, "loss": 0.0047, "num_input_tokens_seen": 70242288, "step": 32545 }, { "epoch": 5.973573132684896, "grad_norm": 2.3850231170654297, "learning_rate": 8.845107630026088e-06, "loss": 0.0082, "num_input_tokens_seen": 70253456, "step": 32550 }, { "epoch": 5.9744907322444485, "grad_norm": 103.27020263671875, "learning_rate": 8.844595718727457e-06, "loss": 0.1595, "num_input_tokens_seen": 70264400, "step": 32555 }, { "epoch": 5.975408331804001, "grad_norm": 0.049411602318286896, "learning_rate": 8.844083708820903e-06, "loss": 0.1564, "num_input_tokens_seen": 70275600, "step": 32560 }, { "epoch": 5.976325931363553, "grad_norm": 55.78664779663086, "learning_rate": 8.84357160031956e-06, "loss": 0.4464, "num_input_tokens_seen": 70286608, "step": 32565 }, { "epoch": 5.977243530923105, "grad_norm": 7.046534538269043, "learning_rate": 8.843059393236563e-06, "loss": 0.0939, "num_input_tokens_seen": 70297328, "step": 32570 }, { "epoch": 5.978161130482658, "grad_norm": 0.04853501170873642, "learning_rate": 8.842547087585047e-06, "loss": 0.0906, "num_input_tokens_seen": 70308368, "step": 32575 }, { "epoch": 5.979078730042209, "grad_norm": 0.034600526094436646, "learning_rate": 8.842034683378156e-06, "loss": 0.109, "num_input_tokens_seen": 70316976, "step": 32580 }, { "epoch": 5.979996329601762, "grad_norm": 0.3231644928455353, "learning_rate": 8.841522180629029e-06, "loss": 0.0971, "num_input_tokens_seen": 70327568, "step": 32585 }, { "epoch": 5.980913929161314, "grad_norm": 1.3703328371047974, "learning_rate": 8.84100957935081e-06, "loss": 0.1259, "num_input_tokens_seen": 70338640, "step": 32590 }, { "epoch": 5.981831528720866, "grad_norm": 0.07370159029960632, "learning_rate": 8.840496879556651e-06, "loss": 0.0034, "num_input_tokens_seen": 70348784, "step": 32595 }, { "epoch": 5.982749128280418, "grad_norm": 56.747806549072266, "learning_rate": 8.839984081259699e-06, "loss": 0.1523, "num_input_tokens_seen": 70358896, "step": 32600 }, { "epoch": 5.983666727839971, "grad_norm": 11.839159965515137, "learning_rate": 8.839471184473105e-06, "loss": 0.1048, "num_input_tokens_seen": 70369520, "step": 32605 }, { "epoch": 5.9845843273995225, "grad_norm": 0.04582863301038742, "learning_rate": 8.838958189210028e-06, "loss": 0.0029, "num_input_tokens_seen": 70380496, "step": 32610 }, { "epoch": 5.985501926959075, "grad_norm": 0.14769165217876434, "learning_rate": 8.838445095483622e-06, "loss": 0.2455, "num_input_tokens_seen": 70390896, "step": 32615 }, { "epoch": 5.9864195265186275, "grad_norm": 0.09744371473789215, "learning_rate": 8.837931903307047e-06, "loss": 0.2255, "num_input_tokens_seen": 70401136, "step": 32620 }, { "epoch": 5.987337126078179, "grad_norm": 5.641452312469482, "learning_rate": 8.83741861269347e-06, "loss": 0.1425, "num_input_tokens_seen": 70412720, "step": 32625 }, { "epoch": 5.988254725637732, "grad_norm": 23.321971893310547, "learning_rate": 8.836905223656052e-06, "loss": 0.1631, "num_input_tokens_seen": 70423792, "step": 32630 }, { "epoch": 5.989172325197284, "grad_norm": 0.18536566197872162, "learning_rate": 8.83639173620796e-06, "loss": 0.0673, "num_input_tokens_seen": 70432976, "step": 32635 }, { "epoch": 5.990089924756836, "grad_norm": 0.36247819662094116, "learning_rate": 8.835878150362368e-06, "loss": 0.2587, "num_input_tokens_seen": 70442832, "step": 32640 }, { "epoch": 5.991007524316388, "grad_norm": 29.12080955505371, "learning_rate": 8.835364466132443e-06, "loss": 0.2044, "num_input_tokens_seen": 70454544, "step": 32645 }, { "epoch": 5.991925123875941, "grad_norm": 0.06109551712870598, "learning_rate": 8.834850683531367e-06, "loss": 0.0058, "num_input_tokens_seen": 70463984, "step": 32650 }, { "epoch": 5.992842723435492, "grad_norm": 3.05892276763916, "learning_rate": 8.834336802572314e-06, "loss": 0.1038, "num_input_tokens_seen": 70474800, "step": 32655 }, { "epoch": 5.993760322995045, "grad_norm": 166.60446166992188, "learning_rate": 8.833822823268466e-06, "loss": 0.1618, "num_input_tokens_seen": 70485456, "step": 32660 }, { "epoch": 5.994677922554597, "grad_norm": 1.7264610528945923, "learning_rate": 8.833308745633001e-06, "loss": 0.2639, "num_input_tokens_seen": 70496656, "step": 32665 }, { "epoch": 5.995595522114149, "grad_norm": 0.40273815393447876, "learning_rate": 8.83279456967911e-06, "loss": 0.0471, "num_input_tokens_seen": 70507376, "step": 32670 }, { "epoch": 5.9965131216737015, "grad_norm": 0.10813534259796143, "learning_rate": 8.832280295419978e-06, "loss": 0.0782, "num_input_tokens_seen": 70517968, "step": 32675 }, { "epoch": 5.997430721233254, "grad_norm": 10.707539558410645, "learning_rate": 8.831765922868796e-06, "loss": 0.1182, "num_input_tokens_seen": 70529168, "step": 32680 }, { "epoch": 5.998348320792806, "grad_norm": 4.502449035644531, "learning_rate": 8.831251452038757e-06, "loss": 0.1854, "num_input_tokens_seen": 70539600, "step": 32685 }, { "epoch": 5.999265920352358, "grad_norm": 0.6314659118652344, "learning_rate": 8.830736882943057e-06, "loss": 0.0933, "num_input_tokens_seen": 70550224, "step": 32690 }, { "epoch": 6.0, "eval_loss": 0.1661907583475113, "eval_runtime": 178.8561, "eval_samples_per_second": 30.466, "eval_steps_per_second": 7.621, "num_input_tokens_seen": 70557920, "step": 32694 }, { "epoch": 6.000183519911911, "grad_norm": 0.06949122250080109, "learning_rate": 8.83022221559489e-06, "loss": 0.1376, "num_input_tokens_seen": 70559840, "step": 32695 }, { "epoch": 6.001101119471462, "grad_norm": 16.394176483154297, "learning_rate": 8.829707450007463e-06, "loss": 0.0866, "num_input_tokens_seen": 70568320, "step": 32700 }, { "epoch": 6.002018719031015, "grad_norm": 0.5063822865486145, "learning_rate": 8.829192586193972e-06, "loss": 0.0103, "num_input_tokens_seen": 70578048, "step": 32705 }, { "epoch": 6.002936318590567, "grad_norm": 0.13694529235363007, "learning_rate": 8.82867762416763e-06, "loss": 0.0058, "num_input_tokens_seen": 70590304, "step": 32710 }, { "epoch": 6.003853918150119, "grad_norm": 14.945273399353027, "learning_rate": 8.828162563941638e-06, "loss": 0.0521, "num_input_tokens_seen": 70601280, "step": 32715 }, { "epoch": 6.004771517709671, "grad_norm": 0.23131045699119568, "learning_rate": 8.827647405529209e-06, "loss": 0.046, "num_input_tokens_seen": 70611776, "step": 32720 }, { "epoch": 6.005689117269224, "grad_norm": 0.32205888628959656, "learning_rate": 8.827132148943557e-06, "loss": 0.0051, "num_input_tokens_seen": 70622624, "step": 32725 }, { "epoch": 6.006606716828776, "grad_norm": 0.21114592254161835, "learning_rate": 8.826616794197898e-06, "loss": 0.0355, "num_input_tokens_seen": 70633408, "step": 32730 }, { "epoch": 6.007524316388328, "grad_norm": 0.0223365630954504, "learning_rate": 8.82610134130545e-06, "loss": 0.0096, "num_input_tokens_seen": 70644032, "step": 32735 }, { "epoch": 6.008441915947881, "grad_norm": 0.5782767534255981, "learning_rate": 8.825585790279429e-06, "loss": 0.0198, "num_input_tokens_seen": 70653472, "step": 32740 }, { "epoch": 6.009359515507432, "grad_norm": 0.015458768233656883, "learning_rate": 8.825070141133064e-06, "loss": 0.0811, "num_input_tokens_seen": 70664128, "step": 32745 }, { "epoch": 6.010277115066985, "grad_norm": 2.931281089782715, "learning_rate": 8.824554393879578e-06, "loss": 0.0044, "num_input_tokens_seen": 70673792, "step": 32750 }, { "epoch": 6.011194714626537, "grad_norm": 47.33933639526367, "learning_rate": 8.824038548532199e-06, "loss": 0.175, "num_input_tokens_seen": 70685120, "step": 32755 }, { "epoch": 6.012112314186089, "grad_norm": 0.21488399803638458, "learning_rate": 8.823522605104157e-06, "loss": 0.004, "num_input_tokens_seen": 70696352, "step": 32760 }, { "epoch": 6.013029913745641, "grad_norm": 0.20856873691082, "learning_rate": 8.823006563608687e-06, "loss": 0.0016, "num_input_tokens_seen": 70706944, "step": 32765 }, { "epoch": 6.013947513305194, "grad_norm": 0.11702760308980942, "learning_rate": 8.822490424059025e-06, "loss": 0.31, "num_input_tokens_seen": 70718944, "step": 32770 }, { "epoch": 6.0148651128647455, "grad_norm": 0.07717490196228027, "learning_rate": 8.821974186468405e-06, "loss": 0.0022, "num_input_tokens_seen": 70729184, "step": 32775 }, { "epoch": 6.015782712424298, "grad_norm": 0.024529416114091873, "learning_rate": 8.821457850850073e-06, "loss": 0.1794, "num_input_tokens_seen": 70739904, "step": 32780 }, { "epoch": 6.0167003119838505, "grad_norm": 7.289487838745117, "learning_rate": 8.820941417217269e-06, "loss": 0.1926, "num_input_tokens_seen": 70751008, "step": 32785 }, { "epoch": 6.017617911543402, "grad_norm": 23.30786895751953, "learning_rate": 8.82042488558324e-06, "loss": 0.1209, "num_input_tokens_seen": 70762176, "step": 32790 }, { "epoch": 6.018535511102955, "grad_norm": 0.17611321806907654, "learning_rate": 8.819908255961234e-06, "loss": 0.0018, "num_input_tokens_seen": 70772640, "step": 32795 }, { "epoch": 6.019453110662507, "grad_norm": 0.9105533361434937, "learning_rate": 8.8193915283645e-06, "loss": 0.0601, "num_input_tokens_seen": 70784064, "step": 32800 }, { "epoch": 6.020370710222059, "grad_norm": 0.2119373381137848, "learning_rate": 8.818874702806294e-06, "loss": 0.1405, "num_input_tokens_seen": 70795072, "step": 32805 }, { "epoch": 6.021288309781611, "grad_norm": 0.07801855355501175, "learning_rate": 8.81835777929987e-06, "loss": 0.1859, "num_input_tokens_seen": 70805760, "step": 32810 }, { "epoch": 6.022205909341164, "grad_norm": 0.05388723313808441, "learning_rate": 8.817840757858487e-06, "loss": 0.1541, "num_input_tokens_seen": 70815424, "step": 32815 }, { "epoch": 6.023123508900715, "grad_norm": 0.4858800768852234, "learning_rate": 8.817323638495408e-06, "loss": 0.004, "num_input_tokens_seen": 70826592, "step": 32820 }, { "epoch": 6.024041108460268, "grad_norm": 46.40989303588867, "learning_rate": 8.81680642122389e-06, "loss": 0.1098, "num_input_tokens_seen": 70838368, "step": 32825 }, { "epoch": 6.02495870801982, "grad_norm": 0.024610918015241623, "learning_rate": 8.816289106057206e-06, "loss": 0.0018, "num_input_tokens_seen": 70848096, "step": 32830 }, { "epoch": 6.025876307579372, "grad_norm": 0.08376172184944153, "learning_rate": 8.815771693008619e-06, "loss": 0.0033, "num_input_tokens_seen": 70858368, "step": 32835 }, { "epoch": 6.0267939071389245, "grad_norm": 4.589264869689941, "learning_rate": 8.815254182091403e-06, "loss": 0.1732, "num_input_tokens_seen": 70867808, "step": 32840 }, { "epoch": 6.027711506698477, "grad_norm": 0.06614966690540314, "learning_rate": 8.81473657331883e-06, "loss": 0.0962, "num_input_tokens_seen": 70878272, "step": 32845 }, { "epoch": 6.028629106258029, "grad_norm": 0.05710214376449585, "learning_rate": 8.814218866704176e-06, "loss": 0.2433, "num_input_tokens_seen": 70889888, "step": 32850 }, { "epoch": 6.029546705817581, "grad_norm": 0.02709774300456047, "learning_rate": 8.813701062260722e-06, "loss": 0.1022, "num_input_tokens_seen": 70901536, "step": 32855 }, { "epoch": 6.030464305377134, "grad_norm": 2.1661882400512695, "learning_rate": 8.813183160001743e-06, "loss": 0.13, "num_input_tokens_seen": 70911648, "step": 32860 }, { "epoch": 6.031381904936685, "grad_norm": 5.9951090812683105, "learning_rate": 8.812665159940527e-06, "loss": 0.206, "num_input_tokens_seen": 70921856, "step": 32865 }, { "epoch": 6.032299504496238, "grad_norm": 0.04238414391875267, "learning_rate": 8.812147062090361e-06, "loss": 0.0091, "num_input_tokens_seen": 70931872, "step": 32870 }, { "epoch": 6.03321710405579, "grad_norm": 0.13397786021232605, "learning_rate": 8.811628866464529e-06, "loss": 0.1913, "num_input_tokens_seen": 70943008, "step": 32875 }, { "epoch": 6.034134703615342, "grad_norm": 8.577743530273438, "learning_rate": 8.811110573076324e-06, "loss": 0.169, "num_input_tokens_seen": 70954976, "step": 32880 }, { "epoch": 6.035052303174894, "grad_norm": 7.044325351715088, "learning_rate": 8.81059218193904e-06, "loss": 0.1053, "num_input_tokens_seen": 70965920, "step": 32885 }, { "epoch": 6.035969902734447, "grad_norm": 9.33340072631836, "learning_rate": 8.810073693065973e-06, "loss": 0.0049, "num_input_tokens_seen": 70977088, "step": 32890 }, { "epoch": 6.0368875022939985, "grad_norm": 0.07202594727277756, "learning_rate": 8.809555106470418e-06, "loss": 0.0029, "num_input_tokens_seen": 70987872, "step": 32895 }, { "epoch": 6.037805101853551, "grad_norm": 6.9678425788879395, "learning_rate": 8.809036422165681e-06, "loss": 0.1365, "num_input_tokens_seen": 70997664, "step": 32900 }, { "epoch": 6.0387227014131035, "grad_norm": 0.04636405408382416, "learning_rate": 8.808517640165064e-06, "loss": 0.0065, "num_input_tokens_seen": 71007744, "step": 32905 }, { "epoch": 6.039640300972655, "grad_norm": 2.895158290863037, "learning_rate": 8.807998760481871e-06, "loss": 0.0037, "num_input_tokens_seen": 71019584, "step": 32910 }, { "epoch": 6.040557900532208, "grad_norm": 5.072793006896973, "learning_rate": 8.807479783129411e-06, "loss": 0.2311, "num_input_tokens_seen": 71030272, "step": 32915 }, { "epoch": 6.04147550009176, "grad_norm": 7.58828067779541, "learning_rate": 8.806960708120997e-06, "loss": 0.175, "num_input_tokens_seen": 71042016, "step": 32920 }, { "epoch": 6.042393099651312, "grad_norm": 0.2660251259803772, "learning_rate": 8.806441535469941e-06, "loss": 0.1056, "num_input_tokens_seen": 71052608, "step": 32925 }, { "epoch": 6.043310699210864, "grad_norm": 0.14664171636104584, "learning_rate": 8.80592226518956e-06, "loss": 0.0153, "num_input_tokens_seen": 71064224, "step": 32930 }, { "epoch": 6.044228298770417, "grad_norm": 0.025157053023576736, "learning_rate": 8.80540289729317e-06, "loss": 0.0096, "num_input_tokens_seen": 71074816, "step": 32935 }, { "epoch": 6.045145898329968, "grad_norm": 0.14804723858833313, "learning_rate": 8.804883431794094e-06, "loss": 0.1831, "num_input_tokens_seen": 71085824, "step": 32940 }, { "epoch": 6.046063497889521, "grad_norm": 0.08708903193473816, "learning_rate": 8.804363868705654e-06, "loss": 0.2734, "num_input_tokens_seen": 71096096, "step": 32945 }, { "epoch": 6.046981097449073, "grad_norm": 0.12146369367837906, "learning_rate": 8.803844208041177e-06, "loss": 0.0034, "num_input_tokens_seen": 71106560, "step": 32950 }, { "epoch": 6.047898697008625, "grad_norm": 0.14187157154083252, "learning_rate": 8.803324449813992e-06, "loss": 0.1003, "num_input_tokens_seen": 71116960, "step": 32955 }, { "epoch": 6.0488162965681775, "grad_norm": 10.22118091583252, "learning_rate": 8.80280459403743e-06, "loss": 0.1273, "num_input_tokens_seen": 71127552, "step": 32960 }, { "epoch": 6.04973389612773, "grad_norm": 0.10574638843536377, "learning_rate": 8.802284640724825e-06, "loss": 0.0699, "num_input_tokens_seen": 71137600, "step": 32965 }, { "epoch": 6.050651495687282, "grad_norm": 0.25086960196495056, "learning_rate": 8.80176458988951e-06, "loss": 0.1213, "num_input_tokens_seen": 71149024, "step": 32970 }, { "epoch": 6.051569095246834, "grad_norm": 0.055969443172216415, "learning_rate": 8.801244441544828e-06, "loss": 0.0647, "num_input_tokens_seen": 71159328, "step": 32975 }, { "epoch": 6.052486694806387, "grad_norm": 0.06937850266695023, "learning_rate": 8.800724195704114e-06, "loss": 0.0034, "num_input_tokens_seen": 71167712, "step": 32980 }, { "epoch": 6.053404294365938, "grad_norm": 4.732209205627441, "learning_rate": 8.800203852380718e-06, "loss": 0.0897, "num_input_tokens_seen": 71178304, "step": 32985 }, { "epoch": 6.054321893925491, "grad_norm": 10.421548843383789, "learning_rate": 8.79968341158798e-06, "loss": 0.2785, "num_input_tokens_seen": 71189952, "step": 32990 }, { "epoch": 6.055239493485043, "grad_norm": 1.1279547214508057, "learning_rate": 8.799162873339253e-06, "loss": 0.0101, "num_input_tokens_seen": 71200288, "step": 32995 }, { "epoch": 6.056157093044595, "grad_norm": 0.0973685160279274, "learning_rate": 8.798642237647888e-06, "loss": 0.1286, "num_input_tokens_seen": 71211392, "step": 33000 }, { "epoch": 6.057074692604147, "grad_norm": 0.060599181801080704, "learning_rate": 8.798121504527235e-06, "loss": 0.0054, "num_input_tokens_seen": 71221984, "step": 33005 }, { "epoch": 6.0579922921637, "grad_norm": 0.2910236418247223, "learning_rate": 8.797600673990652e-06, "loss": 0.0857, "num_input_tokens_seen": 71233056, "step": 33010 }, { "epoch": 6.058909891723252, "grad_norm": 0.04044916108250618, "learning_rate": 8.797079746051497e-06, "loss": 0.1668, "num_input_tokens_seen": 71244000, "step": 33015 }, { "epoch": 6.059827491282804, "grad_norm": 35.128456115722656, "learning_rate": 8.796558720723133e-06, "loss": 0.269, "num_input_tokens_seen": 71254528, "step": 33020 }, { "epoch": 6.060745090842357, "grad_norm": 43.88581085205078, "learning_rate": 8.796037598018923e-06, "loss": 0.1247, "num_input_tokens_seen": 71265376, "step": 33025 }, { "epoch": 6.061662690401908, "grad_norm": 36.007774353027344, "learning_rate": 8.79551637795223e-06, "loss": 0.0441, "num_input_tokens_seen": 71276128, "step": 33030 }, { "epoch": 6.062580289961461, "grad_norm": 0.2129846215248108, "learning_rate": 8.794995060536424e-06, "loss": 0.0874, "num_input_tokens_seen": 71286912, "step": 33035 }, { "epoch": 6.063497889521013, "grad_norm": 0.25777533650398254, "learning_rate": 8.794473645784878e-06, "loss": 0.0538, "num_input_tokens_seen": 71295392, "step": 33040 }, { "epoch": 6.064415489080565, "grad_norm": 0.07442299276590347, "learning_rate": 8.793952133710962e-06, "loss": 0.056, "num_input_tokens_seen": 71306240, "step": 33045 }, { "epoch": 6.065333088640117, "grad_norm": 62.33656311035156, "learning_rate": 8.793430524328055e-06, "loss": 0.1053, "num_input_tokens_seen": 71317888, "step": 33050 }, { "epoch": 6.06625068819967, "grad_norm": 0.19960260391235352, "learning_rate": 8.792908817649534e-06, "loss": 0.193, "num_input_tokens_seen": 71327680, "step": 33055 }, { "epoch": 6.0671682877592215, "grad_norm": 5.22976016998291, "learning_rate": 8.792387013688781e-06, "loss": 0.138, "num_input_tokens_seen": 71337536, "step": 33060 }, { "epoch": 6.068085887318774, "grad_norm": 0.16570645570755005, "learning_rate": 8.791865112459178e-06, "loss": 0.0022, "num_input_tokens_seen": 71348768, "step": 33065 }, { "epoch": 6.0690034868783265, "grad_norm": 0.033510662615299225, "learning_rate": 8.791343113974112e-06, "loss": 0.0027, "num_input_tokens_seen": 71359360, "step": 33070 }, { "epoch": 6.069921086437878, "grad_norm": 0.02625473216176033, "learning_rate": 8.790821018246971e-06, "loss": 0.005, "num_input_tokens_seen": 71369184, "step": 33075 }, { "epoch": 6.070838685997431, "grad_norm": 0.14278990030288696, "learning_rate": 8.790298825291145e-06, "loss": 0.0032, "num_input_tokens_seen": 71379456, "step": 33080 }, { "epoch": 6.071756285556983, "grad_norm": 0.07047523558139801, "learning_rate": 8.789776535120032e-06, "loss": 0.0029, "num_input_tokens_seen": 71389280, "step": 33085 }, { "epoch": 6.072673885116535, "grad_norm": 0.018428195267915726, "learning_rate": 8.789254147747022e-06, "loss": 0.0752, "num_input_tokens_seen": 71399776, "step": 33090 }, { "epoch": 6.073591484676087, "grad_norm": 0.08718814700841904, "learning_rate": 8.788731663185516e-06, "loss": 0.3152, "num_input_tokens_seen": 71410944, "step": 33095 }, { "epoch": 6.07450908423564, "grad_norm": 0.06087939813733101, "learning_rate": 8.788209081448916e-06, "loss": 0.1571, "num_input_tokens_seen": 71422560, "step": 33100 }, { "epoch": 6.075426683795191, "grad_norm": 0.06693320721387863, "learning_rate": 8.787686402550622e-06, "loss": 0.125, "num_input_tokens_seen": 71433376, "step": 33105 }, { "epoch": 6.076344283354744, "grad_norm": 0.1769997477531433, "learning_rate": 8.787163626504043e-06, "loss": 0.2333, "num_input_tokens_seen": 71444480, "step": 33110 }, { "epoch": 6.077261882914296, "grad_norm": 0.07469597458839417, "learning_rate": 8.786640753322588e-06, "loss": 0.2055, "num_input_tokens_seen": 71455264, "step": 33115 }, { "epoch": 6.078179482473848, "grad_norm": 0.2186829298734665, "learning_rate": 8.786117783019666e-06, "loss": 0.0042, "num_input_tokens_seen": 71467008, "step": 33120 }, { "epoch": 6.0790970820334005, "grad_norm": 79.38052368164062, "learning_rate": 8.78559471560869e-06, "loss": 0.0536, "num_input_tokens_seen": 71477600, "step": 33125 }, { "epoch": 6.080014681592953, "grad_norm": 0.025808531790971756, "learning_rate": 8.785071551103075e-06, "loss": 0.0043, "num_input_tokens_seen": 71489280, "step": 33130 }, { "epoch": 6.080932281152505, "grad_norm": 0.37099751830101013, "learning_rate": 8.784548289516244e-06, "loss": 0.0041, "num_input_tokens_seen": 71499712, "step": 33135 }, { "epoch": 6.081849880712057, "grad_norm": 0.025754358619451523, "learning_rate": 8.784024930861614e-06, "loss": 0.0883, "num_input_tokens_seen": 71510784, "step": 33140 }, { "epoch": 6.08276748027161, "grad_norm": 0.05744581297039986, "learning_rate": 8.78350147515261e-06, "loss": 0.0026, "num_input_tokens_seen": 71519872, "step": 33145 }, { "epoch": 6.083685079831161, "grad_norm": 6.982203960418701, "learning_rate": 8.782977922402656e-06, "loss": 0.0061, "num_input_tokens_seen": 71530368, "step": 33150 }, { "epoch": 6.084602679390714, "grad_norm": 0.015142971649765968, "learning_rate": 8.782454272625181e-06, "loss": 0.1511, "num_input_tokens_seen": 71541568, "step": 33155 }, { "epoch": 6.085520278950266, "grad_norm": 0.09893719851970673, "learning_rate": 8.781930525833617e-06, "loss": 0.3301, "num_input_tokens_seen": 71553280, "step": 33160 }, { "epoch": 6.086437878509818, "grad_norm": 6.599518299102783, "learning_rate": 8.781406682041395e-06, "loss": 0.115, "num_input_tokens_seen": 71562144, "step": 33165 }, { "epoch": 6.08735547806937, "grad_norm": 0.07741014659404755, "learning_rate": 8.780882741261954e-06, "loss": 0.0017, "num_input_tokens_seen": 71572640, "step": 33170 }, { "epoch": 6.088273077628923, "grad_norm": 0.544040322303772, "learning_rate": 8.780358703508732e-06, "loss": 0.0059, "num_input_tokens_seen": 71583680, "step": 33175 }, { "epoch": 6.0891906771884745, "grad_norm": 0.033742718398571014, "learning_rate": 8.779834568795165e-06, "loss": 0.1171, "num_input_tokens_seen": 71594304, "step": 33180 }, { "epoch": 6.090108276748027, "grad_norm": 0.04251980409026146, "learning_rate": 8.779310337134702e-06, "loss": 0.2864, "num_input_tokens_seen": 71604032, "step": 33185 }, { "epoch": 6.0910258763075795, "grad_norm": 22.579071044921875, "learning_rate": 8.778786008540785e-06, "loss": 0.1219, "num_input_tokens_seen": 71615808, "step": 33190 }, { "epoch": 6.091943475867131, "grad_norm": 0.5581523776054382, "learning_rate": 8.778261583026864e-06, "loss": 0.0044, "num_input_tokens_seen": 71626144, "step": 33195 }, { "epoch": 6.092861075426684, "grad_norm": 0.0797581821680069, "learning_rate": 8.777737060606388e-06, "loss": 0.0098, "num_input_tokens_seen": 71636832, "step": 33200 }, { "epoch": 6.093778674986236, "grad_norm": 0.44249486923217773, "learning_rate": 8.777212441292811e-06, "loss": 0.006, "num_input_tokens_seen": 71647968, "step": 33205 }, { "epoch": 6.094696274545788, "grad_norm": 5.99365234375, "learning_rate": 8.776687725099591e-06, "loss": 0.1304, "num_input_tokens_seen": 71660512, "step": 33210 }, { "epoch": 6.09561387410534, "grad_norm": 0.06274454295635223, "learning_rate": 8.776162912040183e-06, "loss": 0.0026, "num_input_tokens_seen": 71671904, "step": 33215 }, { "epoch": 6.096531473664893, "grad_norm": 0.13127683103084564, "learning_rate": 8.775638002128048e-06, "loss": 0.0865, "num_input_tokens_seen": 71682848, "step": 33220 }, { "epoch": 6.097449073224444, "grad_norm": 0.0417015515267849, "learning_rate": 8.775112995376653e-06, "loss": 0.0988, "num_input_tokens_seen": 71694432, "step": 33225 }, { "epoch": 6.098366672783997, "grad_norm": 0.07530663907527924, "learning_rate": 8.774587891799457e-06, "loss": 0.0017, "num_input_tokens_seen": 71704064, "step": 33230 }, { "epoch": 6.099284272343549, "grad_norm": 39.65349197387695, "learning_rate": 8.774062691409934e-06, "loss": 0.1657, "num_input_tokens_seen": 71714784, "step": 33235 }, { "epoch": 6.100201871903101, "grad_norm": 0.051143378019332886, "learning_rate": 8.773537394221552e-06, "loss": 0.0022, "num_input_tokens_seen": 71724480, "step": 33240 }, { "epoch": 6.1011194714626535, "grad_norm": 0.08418294042348862, "learning_rate": 8.773012000247784e-06, "loss": 0.0019, "num_input_tokens_seen": 71735872, "step": 33245 }, { "epoch": 6.102037071022206, "grad_norm": 0.03848769888281822, "learning_rate": 8.772486509502105e-06, "loss": 0.1544, "num_input_tokens_seen": 71747936, "step": 33250 }, { "epoch": 6.102954670581759, "grad_norm": 7.221470355987549, "learning_rate": 8.771960921997995e-06, "loss": 0.1435, "num_input_tokens_seen": 71759712, "step": 33255 }, { "epoch": 6.10387227014131, "grad_norm": 0.10664816200733185, "learning_rate": 8.771435237748932e-06, "loss": 0.2209, "num_input_tokens_seen": 71769984, "step": 33260 }, { "epoch": 6.104789869700863, "grad_norm": 15.635700225830078, "learning_rate": 8.770909456768401e-06, "loss": 0.1397, "num_input_tokens_seen": 71780608, "step": 33265 }, { "epoch": 6.105707469260415, "grad_norm": 0.13883574306964874, "learning_rate": 8.770383579069888e-06, "loss": 0.1798, "num_input_tokens_seen": 71790464, "step": 33270 }, { "epoch": 6.106625068819967, "grad_norm": 0.07462166994810104, "learning_rate": 8.769857604666878e-06, "loss": 0.0034, "num_input_tokens_seen": 71801408, "step": 33275 }, { "epoch": 6.107542668379519, "grad_norm": 4.296675682067871, "learning_rate": 8.769331533572864e-06, "loss": 0.0891, "num_input_tokens_seen": 71812448, "step": 33280 }, { "epoch": 6.108460267939072, "grad_norm": 0.02194291353225708, "learning_rate": 8.768805365801338e-06, "loss": 0.1126, "num_input_tokens_seen": 71823360, "step": 33285 }, { "epoch": 6.109377867498623, "grad_norm": 0.10008560866117477, "learning_rate": 8.768279101365796e-06, "loss": 0.1121, "num_input_tokens_seen": 71833888, "step": 33290 }, { "epoch": 6.110295467058176, "grad_norm": 0.05981845036149025, "learning_rate": 8.767752740279736e-06, "loss": 0.1206, "num_input_tokens_seen": 71844928, "step": 33295 }, { "epoch": 6.1112130666177285, "grad_norm": 72.00000762939453, "learning_rate": 8.767226282556656e-06, "loss": 0.1429, "num_input_tokens_seen": 71856320, "step": 33300 }, { "epoch": 6.11213066617728, "grad_norm": 0.14230060577392578, "learning_rate": 8.76669972821006e-06, "loss": 0.0019, "num_input_tokens_seen": 71867232, "step": 33305 }, { "epoch": 6.113048265736833, "grad_norm": 0.07576403021812439, "learning_rate": 8.766173077253456e-06, "loss": 0.1705, "num_input_tokens_seen": 71878720, "step": 33310 }, { "epoch": 6.113965865296385, "grad_norm": 0.029033590108156204, "learning_rate": 8.765646329700348e-06, "loss": 0.0211, "num_input_tokens_seen": 71890208, "step": 33315 }, { "epoch": 6.114883464855937, "grad_norm": 0.03292291238903999, "learning_rate": 8.765119485564248e-06, "loss": 0.0016, "num_input_tokens_seen": 71900224, "step": 33320 }, { "epoch": 6.115801064415489, "grad_norm": 0.018362555652856827, "learning_rate": 8.76459254485867e-06, "loss": 0.2243, "num_input_tokens_seen": 71911072, "step": 33325 }, { "epoch": 6.116718663975042, "grad_norm": 0.040582966059446335, "learning_rate": 8.764065507597125e-06, "loss": 0.1571, "num_input_tokens_seen": 71921088, "step": 33330 }, { "epoch": 6.117636263534593, "grad_norm": 0.08601649105548859, "learning_rate": 8.763538373793136e-06, "loss": 0.0023, "num_input_tokens_seen": 71931616, "step": 33335 }, { "epoch": 6.118553863094146, "grad_norm": 0.058773212134838104, "learning_rate": 8.763011143460221e-06, "loss": 0.0259, "num_input_tokens_seen": 71943392, "step": 33340 }, { "epoch": 6.119471462653698, "grad_norm": 7.03880500793457, "learning_rate": 8.762483816611901e-06, "loss": 0.1573, "num_input_tokens_seen": 71954176, "step": 33345 }, { "epoch": 6.12038906221325, "grad_norm": 0.06471416354179382, "learning_rate": 8.761956393261703e-06, "loss": 0.0024, "num_input_tokens_seen": 71964992, "step": 33350 }, { "epoch": 6.1213066617728025, "grad_norm": 0.1531161218881607, "learning_rate": 8.761428873423155e-06, "loss": 0.1345, "num_input_tokens_seen": 71974848, "step": 33355 }, { "epoch": 6.122224261332355, "grad_norm": 12.00192642211914, "learning_rate": 8.760901257109784e-06, "loss": 0.1927, "num_input_tokens_seen": 71985472, "step": 33360 }, { "epoch": 6.123141860891907, "grad_norm": 0.06442525237798691, "learning_rate": 8.760373544335125e-06, "loss": 0.0033, "num_input_tokens_seen": 71997024, "step": 33365 }, { "epoch": 6.124059460451459, "grad_norm": 0.06060895696282387, "learning_rate": 8.759845735112714e-06, "loss": 0.3165, "num_input_tokens_seen": 72008512, "step": 33370 }, { "epoch": 6.124977060011012, "grad_norm": 0.1178504005074501, "learning_rate": 8.759317829456086e-06, "loss": 0.2587, "num_input_tokens_seen": 72019488, "step": 33375 }, { "epoch": 6.125894659570563, "grad_norm": 5.854964256286621, "learning_rate": 8.758789827378782e-06, "loss": 0.1677, "num_input_tokens_seen": 72030336, "step": 33380 }, { "epoch": 6.126812259130116, "grad_norm": 0.38760846853256226, "learning_rate": 8.758261728894345e-06, "loss": 0.0682, "num_input_tokens_seen": 72041888, "step": 33385 }, { "epoch": 6.127729858689668, "grad_norm": 0.205656960606575, "learning_rate": 8.757733534016319e-06, "loss": 0.0039, "num_input_tokens_seen": 72052800, "step": 33390 }, { "epoch": 6.12864745824922, "grad_norm": 0.3889963924884796, "learning_rate": 8.757205242758252e-06, "loss": 0.1229, "num_input_tokens_seen": 72064032, "step": 33395 }, { "epoch": 6.129565057808772, "grad_norm": 0.04014979675412178, "learning_rate": 8.756676855133694e-06, "loss": 0.0957, "num_input_tokens_seen": 72075072, "step": 33400 }, { "epoch": 6.130482657368325, "grad_norm": 0.14821437001228333, "learning_rate": 8.756148371156197e-06, "loss": 0.0023, "num_input_tokens_seen": 72085792, "step": 33405 }, { "epoch": 6.1314002569278765, "grad_norm": 0.1497359573841095, "learning_rate": 8.755619790839316e-06, "loss": 0.1174, "num_input_tokens_seen": 72096928, "step": 33410 }, { "epoch": 6.132317856487429, "grad_norm": 0.07148201763629913, "learning_rate": 8.755091114196608e-06, "loss": 0.0051, "num_input_tokens_seen": 72108128, "step": 33415 }, { "epoch": 6.1332354560469815, "grad_norm": 0.12187417596578598, "learning_rate": 8.754562341241631e-06, "loss": 0.1069, "num_input_tokens_seen": 72119200, "step": 33420 }, { "epoch": 6.134153055606533, "grad_norm": 0.1526930183172226, "learning_rate": 8.75403347198795e-06, "loss": 0.0912, "num_input_tokens_seen": 72129184, "step": 33425 }, { "epoch": 6.135070655166086, "grad_norm": 0.05914589762687683, "learning_rate": 8.75350450644913e-06, "loss": 0.0027, "num_input_tokens_seen": 72141312, "step": 33430 }, { "epoch": 6.135988254725638, "grad_norm": 26.662803649902344, "learning_rate": 8.752975444638737e-06, "loss": 0.0085, "num_input_tokens_seen": 72153120, "step": 33435 }, { "epoch": 6.13690585428519, "grad_norm": 0.0640711560845375, "learning_rate": 8.752446286570337e-06, "loss": 0.1899, "num_input_tokens_seen": 72162976, "step": 33440 }, { "epoch": 6.137823453844742, "grad_norm": 0.04956089332699776, "learning_rate": 8.751917032257508e-06, "loss": 0.1454, "num_input_tokens_seen": 72172192, "step": 33445 }, { "epoch": 6.138741053404295, "grad_norm": 0.41451358795166016, "learning_rate": 8.751387681713822e-06, "loss": 0.0019, "num_input_tokens_seen": 72183712, "step": 33450 }, { "epoch": 6.139658652963846, "grad_norm": 8.247995376586914, "learning_rate": 8.750858234952856e-06, "loss": 0.1354, "num_input_tokens_seen": 72194720, "step": 33455 }, { "epoch": 6.140576252523399, "grad_norm": 0.1144271194934845, "learning_rate": 8.750328691988191e-06, "loss": 0.0018, "num_input_tokens_seen": 72204992, "step": 33460 }, { "epoch": 6.141493852082951, "grad_norm": 0.011425462551414967, "learning_rate": 8.749799052833405e-06, "loss": 0.0985, "num_input_tokens_seen": 72216128, "step": 33465 }, { "epoch": 6.142411451642503, "grad_norm": 6.410295009613037, "learning_rate": 8.749269317502085e-06, "loss": 0.2598, "num_input_tokens_seen": 72226528, "step": 33470 }, { "epoch": 6.1433290512020555, "grad_norm": 13.724102020263672, "learning_rate": 8.74873948600782e-06, "loss": 0.1666, "num_input_tokens_seen": 72237280, "step": 33475 }, { "epoch": 6.144246650761608, "grad_norm": 0.0944366455078125, "learning_rate": 8.748209558364195e-06, "loss": 0.1887, "num_input_tokens_seen": 72247488, "step": 33480 }, { "epoch": 6.14516425032116, "grad_norm": 23.5617733001709, "learning_rate": 8.747679534584806e-06, "loss": 0.0577, "num_input_tokens_seen": 72258880, "step": 33485 }, { "epoch": 6.146081849880712, "grad_norm": 0.24743086099624634, "learning_rate": 8.747149414683245e-06, "loss": 0.0053, "num_input_tokens_seen": 72268544, "step": 33490 }, { "epoch": 6.146999449440265, "grad_norm": 0.019858520478010178, "learning_rate": 8.746619198673108e-06, "loss": 0.0023, "num_input_tokens_seen": 72279232, "step": 33495 }, { "epoch": 6.147917048999816, "grad_norm": 0.16479888558387756, "learning_rate": 8.746088886567996e-06, "loss": 0.0037, "num_input_tokens_seen": 72290208, "step": 33500 }, { "epoch": 6.148834648559369, "grad_norm": 9.152706146240234, "learning_rate": 8.745558478381508e-06, "loss": 0.0937, "num_input_tokens_seen": 72300384, "step": 33505 }, { "epoch": 6.149752248118921, "grad_norm": 6.997534275054932, "learning_rate": 8.745027974127253e-06, "loss": 0.1561, "num_input_tokens_seen": 72311456, "step": 33510 }, { "epoch": 6.150669847678473, "grad_norm": 0.13939550518989563, "learning_rate": 8.744497373818834e-06, "loss": 0.0026, "num_input_tokens_seen": 72322112, "step": 33515 }, { "epoch": 6.151587447238025, "grad_norm": 16.227811813354492, "learning_rate": 8.74396667746986e-06, "loss": 0.0066, "num_input_tokens_seen": 72333280, "step": 33520 }, { "epoch": 6.152505046797578, "grad_norm": 0.04243754222989082, "learning_rate": 8.743435885093945e-06, "loss": 0.0018, "num_input_tokens_seen": 72344096, "step": 33525 }, { "epoch": 6.1534226463571295, "grad_norm": 0.020457131788134575, "learning_rate": 8.742904996704699e-06, "loss": 0.0014, "num_input_tokens_seen": 72354656, "step": 33530 }, { "epoch": 6.154340245916682, "grad_norm": 0.730107843875885, "learning_rate": 8.742374012315742e-06, "loss": 0.1268, "num_input_tokens_seen": 72366016, "step": 33535 }, { "epoch": 6.155257845476235, "grad_norm": 0.21323558688163757, "learning_rate": 8.741842931940692e-06, "loss": 0.0973, "num_input_tokens_seen": 72377088, "step": 33540 }, { "epoch": 6.156175445035786, "grad_norm": 0.037094105035066605, "learning_rate": 8.74131175559317e-06, "loss": 0.0684, "num_input_tokens_seen": 72388768, "step": 33545 }, { "epoch": 6.157093044595339, "grad_norm": 0.03519145026803017, "learning_rate": 8.7407804832868e-06, "loss": 0.227, "num_input_tokens_seen": 72400032, "step": 33550 }, { "epoch": 6.158010644154891, "grad_norm": 0.014523176476359367, "learning_rate": 8.740249115035206e-06, "loss": 0.0009, "num_input_tokens_seen": 72411808, "step": 33555 }, { "epoch": 6.158928243714443, "grad_norm": 0.03145330771803856, "learning_rate": 8.739717650852023e-06, "loss": 0.2773, "num_input_tokens_seen": 72423808, "step": 33560 }, { "epoch": 6.159845843273995, "grad_norm": 0.018741564825177193, "learning_rate": 8.739186090750875e-06, "loss": 0.1541, "num_input_tokens_seen": 72433504, "step": 33565 }, { "epoch": 6.160763442833548, "grad_norm": 0.051223546266555786, "learning_rate": 8.738654434745402e-06, "loss": 0.0134, "num_input_tokens_seen": 72444608, "step": 33570 }, { "epoch": 6.161681042393099, "grad_norm": 0.5238342881202698, "learning_rate": 8.738122682849235e-06, "loss": 0.108, "num_input_tokens_seen": 72454944, "step": 33575 }, { "epoch": 6.162598641952652, "grad_norm": 0.023086749017238617, "learning_rate": 8.737590835076015e-06, "loss": 0.0519, "num_input_tokens_seen": 72466112, "step": 33580 }, { "epoch": 6.1635162415122045, "grad_norm": 0.2037954032421112, "learning_rate": 8.737058891439383e-06, "loss": 0.0023, "num_input_tokens_seen": 72476480, "step": 33585 }, { "epoch": 6.164433841071756, "grad_norm": 0.7442108392715454, "learning_rate": 8.736526851952982e-06, "loss": 0.1094, "num_input_tokens_seen": 72486592, "step": 33590 }, { "epoch": 6.165351440631309, "grad_norm": 0.06439310312271118, "learning_rate": 8.735994716630457e-06, "loss": 0.005, "num_input_tokens_seen": 72495968, "step": 33595 }, { "epoch": 6.166269040190861, "grad_norm": 24.964397430419922, "learning_rate": 8.735462485485462e-06, "loss": 0.1197, "num_input_tokens_seen": 72507296, "step": 33600 }, { "epoch": 6.167186639750413, "grad_norm": 0.08483894169330597, "learning_rate": 8.73493015853164e-06, "loss": 0.0016, "num_input_tokens_seen": 72518624, "step": 33605 }, { "epoch": 6.168104239309965, "grad_norm": 0.04176715016365051, "learning_rate": 8.73439773578265e-06, "loss": 0.002, "num_input_tokens_seen": 72528576, "step": 33610 }, { "epoch": 6.169021838869518, "grad_norm": 0.09813941270112991, "learning_rate": 8.733865217252144e-06, "loss": 0.0162, "num_input_tokens_seen": 72539712, "step": 33615 }, { "epoch": 6.169939438429069, "grad_norm": 0.02067381702363491, "learning_rate": 8.733332602953784e-06, "loss": 0.0012, "num_input_tokens_seen": 72549984, "step": 33620 }, { "epoch": 6.170857037988622, "grad_norm": 8.220541000366211, "learning_rate": 8.73279989290123e-06, "loss": 0.0999, "num_input_tokens_seen": 72560160, "step": 33625 }, { "epoch": 6.171774637548174, "grad_norm": 0.01744610257446766, "learning_rate": 8.732267087108142e-06, "loss": 0.0033, "num_input_tokens_seen": 72570976, "step": 33630 }, { "epoch": 6.172692237107726, "grad_norm": 0.014204474166035652, "learning_rate": 8.731734185588186e-06, "loss": 0.1328, "num_input_tokens_seen": 72582048, "step": 33635 }, { "epoch": 6.1736098366672785, "grad_norm": 0.03190046176314354, "learning_rate": 8.731201188355035e-06, "loss": 0.001, "num_input_tokens_seen": 72592960, "step": 33640 }, { "epoch": 6.174527436226831, "grad_norm": 0.01739444024860859, "learning_rate": 8.730668095422354e-06, "loss": 0.0012, "num_input_tokens_seen": 72604448, "step": 33645 }, { "epoch": 6.175445035786383, "grad_norm": 0.12181146442890167, "learning_rate": 8.73013490680382e-06, "loss": 0.1186, "num_input_tokens_seen": 72613952, "step": 33650 }, { "epoch": 6.176362635345935, "grad_norm": 0.06039256975054741, "learning_rate": 8.729601622513107e-06, "loss": 0.0019, "num_input_tokens_seen": 72625088, "step": 33655 }, { "epoch": 6.177280234905488, "grad_norm": 158.2217559814453, "learning_rate": 8.729068242563892e-06, "loss": 0.1549, "num_input_tokens_seen": 72636128, "step": 33660 }, { "epoch": 6.178197834465039, "grad_norm": 0.007262030616402626, "learning_rate": 8.728534766969856e-06, "loss": 0.0029, "num_input_tokens_seen": 72646976, "step": 33665 }, { "epoch": 6.179115434024592, "grad_norm": 0.01202051155269146, "learning_rate": 8.728001195744682e-06, "loss": 0.0061, "num_input_tokens_seen": 72657344, "step": 33670 }, { "epoch": 6.180033033584144, "grad_norm": 37.47972869873047, "learning_rate": 8.727467528902055e-06, "loss": 0.233, "num_input_tokens_seen": 72669344, "step": 33675 }, { "epoch": 6.180950633143696, "grad_norm": 0.1918877512216568, "learning_rate": 8.726933766455663e-06, "loss": 0.1139, "num_input_tokens_seen": 72681536, "step": 33680 }, { "epoch": 6.181868232703248, "grad_norm": 0.13083918392658234, "learning_rate": 8.726399908419196e-06, "loss": 0.179, "num_input_tokens_seen": 72691904, "step": 33685 }, { "epoch": 6.182785832262801, "grad_norm": 0.10247623920440674, "learning_rate": 8.725865954806348e-06, "loss": 0.1138, "num_input_tokens_seen": 72703488, "step": 33690 }, { "epoch": 6.1837034318223525, "grad_norm": 0.04623454436659813, "learning_rate": 8.725331905630811e-06, "loss": 0.1106, "num_input_tokens_seen": 72713312, "step": 33695 }, { "epoch": 6.184621031381905, "grad_norm": 0.03930873051285744, "learning_rate": 8.724797760906285e-06, "loss": 0.1045, "num_input_tokens_seen": 72723968, "step": 33700 }, { "epoch": 6.1855386309414575, "grad_norm": 0.011640067212283611, "learning_rate": 8.72426352064647e-06, "loss": 0.0017, "num_input_tokens_seen": 72735616, "step": 33705 }, { "epoch": 6.186456230501009, "grad_norm": 13.29677677154541, "learning_rate": 8.723729184865068e-06, "loss": 0.218, "num_input_tokens_seen": 72746304, "step": 33710 }, { "epoch": 6.187373830060562, "grad_norm": 0.07179474830627441, "learning_rate": 8.723194753575782e-06, "loss": 0.0625, "num_input_tokens_seen": 72757312, "step": 33715 }, { "epoch": 6.188291429620114, "grad_norm": 19.823333740234375, "learning_rate": 8.722660226792324e-06, "loss": 0.4167, "num_input_tokens_seen": 72769504, "step": 33720 }, { "epoch": 6.189209029179666, "grad_norm": 11.271867752075195, "learning_rate": 8.722125604528398e-06, "loss": 0.119, "num_input_tokens_seen": 72777856, "step": 33725 }, { "epoch": 6.190126628739218, "grad_norm": 0.031095033511519432, "learning_rate": 8.72159088679772e-06, "loss": 0.0026, "num_input_tokens_seen": 72788640, "step": 33730 }, { "epoch": 6.191044228298771, "grad_norm": 0.5242334604263306, "learning_rate": 8.721056073614002e-06, "loss": 0.0022, "num_input_tokens_seen": 72799840, "step": 33735 }, { "epoch": 6.191961827858322, "grad_norm": 0.023548385128378868, "learning_rate": 8.720521164990964e-06, "loss": 0.1484, "num_input_tokens_seen": 72810752, "step": 33740 }, { "epoch": 6.192879427417875, "grad_norm": 0.042808305472135544, "learning_rate": 8.719986160942326e-06, "loss": 0.136, "num_input_tokens_seen": 72821024, "step": 33745 }, { "epoch": 6.193797026977427, "grad_norm": 0.04307422786951065, "learning_rate": 8.719451061481808e-06, "loss": 0.0041, "num_input_tokens_seen": 72831296, "step": 33750 }, { "epoch": 6.194714626536979, "grad_norm": 0.07844580709934235, "learning_rate": 8.718915866623134e-06, "loss": 0.0284, "num_input_tokens_seen": 72841536, "step": 33755 }, { "epoch": 6.1956322260965315, "grad_norm": 0.08677133917808533, "learning_rate": 8.718380576380032e-06, "loss": 0.1144, "num_input_tokens_seen": 72852672, "step": 33760 }, { "epoch": 6.196549825656084, "grad_norm": 0.059626463800668716, "learning_rate": 8.717845190766229e-06, "loss": 0.0024, "num_input_tokens_seen": 72865056, "step": 33765 }, { "epoch": 6.197467425215636, "grad_norm": 0.08400388807058334, "learning_rate": 8.717309709795463e-06, "loss": 0.1269, "num_input_tokens_seen": 72874592, "step": 33770 }, { "epoch": 6.198385024775188, "grad_norm": 0.07965948432683945, "learning_rate": 8.716774133481462e-06, "loss": 0.011, "num_input_tokens_seen": 72886752, "step": 33775 }, { "epoch": 6.199302624334741, "grad_norm": 0.07689938694238663, "learning_rate": 8.716238461837964e-06, "loss": 0.1237, "num_input_tokens_seen": 72897696, "step": 33780 }, { "epoch": 6.200220223894292, "grad_norm": 0.02507350593805313, "learning_rate": 8.715702694878712e-06, "loss": 0.006, "num_input_tokens_seen": 72908960, "step": 33785 }, { "epoch": 6.201137823453845, "grad_norm": 0.3315722644329071, "learning_rate": 8.715166832617444e-06, "loss": 0.002, "num_input_tokens_seen": 72920000, "step": 33790 }, { "epoch": 6.202055423013397, "grad_norm": 0.0218144990503788, "learning_rate": 8.714630875067901e-06, "loss": 0.0025, "num_input_tokens_seen": 72930976, "step": 33795 }, { "epoch": 6.202973022572949, "grad_norm": 0.07663176953792572, "learning_rate": 8.714094822243837e-06, "loss": 0.1767, "num_input_tokens_seen": 72942368, "step": 33800 }, { "epoch": 6.203890622132501, "grad_norm": 0.017446134239435196, "learning_rate": 8.713558674158997e-06, "loss": 0.0008, "num_input_tokens_seen": 72954112, "step": 33805 }, { "epoch": 6.204808221692054, "grad_norm": 0.03938189521431923, "learning_rate": 8.713022430827132e-06, "loss": 0.3309, "num_input_tokens_seen": 72965696, "step": 33810 }, { "epoch": 6.2057258212516055, "grad_norm": 32.399539947509766, "learning_rate": 8.712486092261997e-06, "loss": 0.0332, "num_input_tokens_seen": 72974880, "step": 33815 }, { "epoch": 6.206643420811158, "grad_norm": 7.484116077423096, "learning_rate": 8.711949658477346e-06, "loss": 0.3317, "num_input_tokens_seen": 72985504, "step": 33820 }, { "epoch": 6.207561020370711, "grad_norm": 0.032356053590774536, "learning_rate": 8.711413129486938e-06, "loss": 0.1695, "num_input_tokens_seen": 72996640, "step": 33825 }, { "epoch": 6.208478619930262, "grad_norm": 0.011667806655168533, "learning_rate": 8.710876505304538e-06, "loss": 0.0017, "num_input_tokens_seen": 73006752, "step": 33830 }, { "epoch": 6.209396219489815, "grad_norm": 0.03637714311480522, "learning_rate": 8.710339785943906e-06, "loss": 0.3432, "num_input_tokens_seen": 73017664, "step": 33835 }, { "epoch": 6.210313819049367, "grad_norm": 0.021753616631031036, "learning_rate": 8.70980297141881e-06, "loss": 0.0018, "num_input_tokens_seen": 73028480, "step": 33840 }, { "epoch": 6.211231418608919, "grad_norm": 0.03634488955140114, "learning_rate": 8.709266061743015e-06, "loss": 0.0899, "num_input_tokens_seen": 73039680, "step": 33845 }, { "epoch": 6.212149018168471, "grad_norm": 0.030251378193497658, "learning_rate": 8.708729056930297e-06, "loss": 0.0017, "num_input_tokens_seen": 73049664, "step": 33850 }, { "epoch": 6.213066617728024, "grad_norm": 86.66400146484375, "learning_rate": 8.708191956994425e-06, "loss": 0.2186, "num_input_tokens_seen": 73060512, "step": 33855 }, { "epoch": 6.213984217287575, "grad_norm": 0.22078071534633636, "learning_rate": 8.707654761949178e-06, "loss": 0.1526, "num_input_tokens_seen": 73071072, "step": 33860 }, { "epoch": 6.214901816847128, "grad_norm": 6.8126444816589355, "learning_rate": 8.707117471808332e-06, "loss": 0.0979, "num_input_tokens_seen": 73081984, "step": 33865 }, { "epoch": 6.2158194164066805, "grad_norm": 0.025427915155887604, "learning_rate": 8.706580086585667e-06, "loss": 0.0012, "num_input_tokens_seen": 73092704, "step": 33870 }, { "epoch": 6.216737015966232, "grad_norm": 1.5275592803955078, "learning_rate": 8.70604260629497e-06, "loss": 0.0122, "num_input_tokens_seen": 73103808, "step": 33875 }, { "epoch": 6.217654615525785, "grad_norm": 0.033607110381126404, "learning_rate": 8.705505030950022e-06, "loss": 0.2938, "num_input_tokens_seen": 73115872, "step": 33880 }, { "epoch": 6.218572215085337, "grad_norm": 0.05492207407951355, "learning_rate": 8.704967360564614e-06, "loss": 0.0082, "num_input_tokens_seen": 73126272, "step": 33885 }, { "epoch": 6.219489814644889, "grad_norm": 0.039291802793741226, "learning_rate": 8.704429595152535e-06, "loss": 0.1473, "num_input_tokens_seen": 73138240, "step": 33890 }, { "epoch": 6.220407414204441, "grad_norm": 10.601533889770508, "learning_rate": 8.703891734727578e-06, "loss": 0.1189, "num_input_tokens_seen": 73148736, "step": 33895 }, { "epoch": 6.221325013763994, "grad_norm": 0.1626083105802536, "learning_rate": 8.70335377930354e-06, "loss": 0.0038, "num_input_tokens_seen": 73160000, "step": 33900 }, { "epoch": 6.222242613323545, "grad_norm": 0.28747671842575073, "learning_rate": 8.702815728894216e-06, "loss": 0.0139, "num_input_tokens_seen": 73170304, "step": 33905 }, { "epoch": 6.223160212883098, "grad_norm": 0.1795155555009842, "learning_rate": 8.702277583513408e-06, "loss": 0.0056, "num_input_tokens_seen": 73181216, "step": 33910 }, { "epoch": 6.22407781244265, "grad_norm": 37.01534652709961, "learning_rate": 8.701739343174916e-06, "loss": 0.0056, "num_input_tokens_seen": 73192640, "step": 33915 }, { "epoch": 6.224995412002202, "grad_norm": 20.50184440612793, "learning_rate": 8.70120100789255e-06, "loss": 0.0923, "num_input_tokens_seen": 73204224, "step": 33920 }, { "epoch": 6.2259130115617545, "grad_norm": 0.02358447201550007, "learning_rate": 8.700662577680113e-06, "loss": 0.0171, "num_input_tokens_seen": 73215328, "step": 33925 }, { "epoch": 6.226830611121307, "grad_norm": 7.682004451751709, "learning_rate": 8.700124052551415e-06, "loss": 0.2913, "num_input_tokens_seen": 73225664, "step": 33930 }, { "epoch": 6.227748210680859, "grad_norm": 0.03751619905233383, "learning_rate": 8.69958543252027e-06, "loss": 0.0473, "num_input_tokens_seen": 73236832, "step": 33935 }, { "epoch": 6.228665810240411, "grad_norm": 0.02155756950378418, "learning_rate": 8.699046717600494e-06, "loss": 0.0018, "num_input_tokens_seen": 73248512, "step": 33940 }, { "epoch": 6.229583409799964, "grad_norm": 4.261490345001221, "learning_rate": 8.698507907805903e-06, "loss": 0.2428, "num_input_tokens_seen": 73259296, "step": 33945 }, { "epoch": 6.230501009359515, "grad_norm": 20.6876220703125, "learning_rate": 8.697969003150314e-06, "loss": 0.1292, "num_input_tokens_seen": 73271392, "step": 33950 }, { "epoch": 6.231418608919068, "grad_norm": 0.11780782788991928, "learning_rate": 8.697430003647554e-06, "loss": 0.128, "num_input_tokens_seen": 73281120, "step": 33955 }, { "epoch": 6.23233620847862, "grad_norm": 0.008839238435029984, "learning_rate": 8.696890909311443e-06, "loss": 0.0015, "num_input_tokens_seen": 73292192, "step": 33960 }, { "epoch": 6.233253808038172, "grad_norm": 0.046445902436971664, "learning_rate": 8.69635172015581e-06, "loss": 0.0187, "num_input_tokens_seen": 73303680, "step": 33965 }, { "epoch": 6.234171407597724, "grad_norm": 0.022340035066008568, "learning_rate": 8.695812436194487e-06, "loss": 0.0069, "num_input_tokens_seen": 73314624, "step": 33970 }, { "epoch": 6.235089007157277, "grad_norm": 10.61821174621582, "learning_rate": 8.6952730574413e-06, "loss": 0.1685, "num_input_tokens_seen": 73325760, "step": 33975 }, { "epoch": 6.2360066067168285, "grad_norm": 0.05184514820575714, "learning_rate": 8.694733583910089e-06, "loss": 0.1513, "num_input_tokens_seen": 73336896, "step": 33980 }, { "epoch": 6.236924206276381, "grad_norm": 35.555458068847656, "learning_rate": 8.694194015614686e-06, "loss": 0.1458, "num_input_tokens_seen": 73347936, "step": 33985 }, { "epoch": 6.2378418058359335, "grad_norm": 0.12285355478525162, "learning_rate": 8.693654352568932e-06, "loss": 0.0822, "num_input_tokens_seen": 73359200, "step": 33990 }, { "epoch": 6.238759405395485, "grad_norm": 0.07383987307548523, "learning_rate": 8.693114594786667e-06, "loss": 0.1386, "num_input_tokens_seen": 73370880, "step": 33995 }, { "epoch": 6.239677004955038, "grad_norm": 0.1000271588563919, "learning_rate": 8.692574742281739e-06, "loss": 0.0017, "num_input_tokens_seen": 73382272, "step": 34000 }, { "epoch": 6.24059460451459, "grad_norm": 2.6653714179992676, "learning_rate": 8.692034795067991e-06, "loss": 0.0065, "num_input_tokens_seen": 73391264, "step": 34005 }, { "epoch": 6.241512204074142, "grad_norm": 0.08758469671010971, "learning_rate": 8.691494753159272e-06, "loss": 0.0036, "num_input_tokens_seen": 73403296, "step": 34010 }, { "epoch": 6.242429803633694, "grad_norm": 0.08387909829616547, "learning_rate": 8.690954616569434e-06, "loss": 0.0021, "num_input_tokens_seen": 73413600, "step": 34015 }, { "epoch": 6.243347403193247, "grad_norm": 0.09931585937738419, "learning_rate": 8.69041438531233e-06, "loss": 0.1414, "num_input_tokens_seen": 73424544, "step": 34020 }, { "epoch": 6.244265002752798, "grad_norm": 0.015689684078097343, "learning_rate": 8.68987405940182e-06, "loss": 0.0011, "num_input_tokens_seen": 73435584, "step": 34025 }, { "epoch": 6.245182602312351, "grad_norm": 0.04877912625670433, "learning_rate": 8.689333638851756e-06, "loss": 0.1966, "num_input_tokens_seen": 73444672, "step": 34030 }, { "epoch": 6.246100201871903, "grad_norm": 0.01996789500117302, "learning_rate": 8.688793123676002e-06, "loss": 0.2913, "num_input_tokens_seen": 73455680, "step": 34035 }, { "epoch": 6.247017801431455, "grad_norm": 0.3285498023033142, "learning_rate": 8.688252513888423e-06, "loss": 0.3053, "num_input_tokens_seen": 73467584, "step": 34040 }, { "epoch": 6.2479354009910075, "grad_norm": 0.025066953152418137, "learning_rate": 8.68771180950288e-06, "loss": 0.1022, "num_input_tokens_seen": 73479072, "step": 34045 }, { "epoch": 6.24885300055056, "grad_norm": 0.019341906532645226, "learning_rate": 8.687171010533249e-06, "loss": 0.0889, "num_input_tokens_seen": 73489376, "step": 34050 }, { "epoch": 6.249770600110112, "grad_norm": 1.149757981300354, "learning_rate": 8.686630116993395e-06, "loss": 0.0034, "num_input_tokens_seen": 73500736, "step": 34055 }, { "epoch": 6.250688199669664, "grad_norm": 45.476402282714844, "learning_rate": 8.686089128897191e-06, "loss": 0.0414, "num_input_tokens_seen": 73511040, "step": 34060 }, { "epoch": 6.251605799229217, "grad_norm": 7.310128211975098, "learning_rate": 8.685548046258514e-06, "loss": 0.0346, "num_input_tokens_seen": 73520608, "step": 34065 }, { "epoch": 6.252523398788768, "grad_norm": 0.43791666626930237, "learning_rate": 8.68500686909124e-06, "loss": 0.0028, "num_input_tokens_seen": 73531264, "step": 34070 }, { "epoch": 6.253440998348321, "grad_norm": 0.022022059187293053, "learning_rate": 8.684465597409255e-06, "loss": 0.0035, "num_input_tokens_seen": 73542016, "step": 34075 }, { "epoch": 6.254358597907873, "grad_norm": 0.4839261472225189, "learning_rate": 8.683924231226436e-06, "loss": 0.0943, "num_input_tokens_seen": 73551584, "step": 34080 }, { "epoch": 6.255276197467425, "grad_norm": 0.547883152961731, "learning_rate": 8.68338277055667e-06, "loss": 0.2181, "num_input_tokens_seen": 73563232, "step": 34085 }, { "epoch": 6.256193797026977, "grad_norm": 8.511922836303711, "learning_rate": 8.682841215413844e-06, "loss": 0.1572, "num_input_tokens_seen": 73574752, "step": 34090 }, { "epoch": 6.25711139658653, "grad_norm": 6.3106913566589355, "learning_rate": 8.68229956581185e-06, "loss": 0.011, "num_input_tokens_seen": 73586496, "step": 34095 }, { "epoch": 6.2580289961460815, "grad_norm": 0.04722864553332329, "learning_rate": 8.681757821764578e-06, "loss": 0.0017, "num_input_tokens_seen": 73596576, "step": 34100 }, { "epoch": 6.258946595705634, "grad_norm": 0.16267439723014832, "learning_rate": 8.681215983285924e-06, "loss": 0.0022, "num_input_tokens_seen": 73606080, "step": 34105 }, { "epoch": 6.259864195265187, "grad_norm": 0.1599685400724411, "learning_rate": 8.680674050389787e-06, "loss": 0.0017, "num_input_tokens_seen": 73617088, "step": 34110 }, { "epoch": 6.260781794824738, "grad_norm": 0.02675412781536579, "learning_rate": 8.680132023090065e-06, "loss": 0.0356, "num_input_tokens_seen": 73627680, "step": 34115 }, { "epoch": 6.261699394384291, "grad_norm": 74.97274780273438, "learning_rate": 8.679589901400657e-06, "loss": 0.1016, "num_input_tokens_seen": 73638016, "step": 34120 }, { "epoch": 6.262616993943843, "grad_norm": 0.020780373364686966, "learning_rate": 8.679047685335474e-06, "loss": 0.0006, "num_input_tokens_seen": 73649216, "step": 34125 }, { "epoch": 6.263534593503395, "grad_norm": 8.888347625732422, "learning_rate": 8.67850537490842e-06, "loss": 0.2546, "num_input_tokens_seen": 73659552, "step": 34130 }, { "epoch": 6.264452193062947, "grad_norm": 0.05345912650227547, "learning_rate": 8.677962970133403e-06, "loss": 0.1137, "num_input_tokens_seen": 73670976, "step": 34135 }, { "epoch": 6.2653697926225, "grad_norm": 0.12068822234869003, "learning_rate": 8.677420471024336e-06, "loss": 0.1453, "num_input_tokens_seen": 73683232, "step": 34140 }, { "epoch": 6.266287392182051, "grad_norm": 0.008612050674855709, "learning_rate": 8.676877877595135e-06, "loss": 0.0046, "num_input_tokens_seen": 73693248, "step": 34145 }, { "epoch": 6.267204991741604, "grad_norm": 0.032011307775974274, "learning_rate": 8.676335189859712e-06, "loss": 0.1326, "num_input_tokens_seen": 73704096, "step": 34150 }, { "epoch": 6.2681225913011565, "grad_norm": 0.03865218162536621, "learning_rate": 8.675792407831994e-06, "loss": 0.0019, "num_input_tokens_seen": 73714304, "step": 34155 }, { "epoch": 6.269040190860708, "grad_norm": 5.470060348510742, "learning_rate": 8.675249531525894e-06, "loss": 0.1235, "num_input_tokens_seen": 73725376, "step": 34160 }, { "epoch": 6.269957790420261, "grad_norm": 0.14874696731567383, "learning_rate": 8.67470656095534e-06, "loss": 0.0076, "num_input_tokens_seen": 73735840, "step": 34165 }, { "epoch": 6.270875389979813, "grad_norm": 0.058671630918979645, "learning_rate": 8.67416349613426e-06, "loss": 0.1168, "num_input_tokens_seen": 73747488, "step": 34170 }, { "epoch": 6.271792989539365, "grad_norm": 0.06922242790460587, "learning_rate": 8.673620337076578e-06, "loss": 0.0056, "num_input_tokens_seen": 73757568, "step": 34175 }, { "epoch": 6.272710589098917, "grad_norm": 0.038801830261945724, "learning_rate": 8.67307708379623e-06, "loss": 0.0074, "num_input_tokens_seen": 73768096, "step": 34180 }, { "epoch": 6.27362818865847, "grad_norm": 0.10182622820138931, "learning_rate": 8.672533736307148e-06, "loss": 0.063, "num_input_tokens_seen": 73779424, "step": 34185 }, { "epoch": 6.274545788218021, "grad_norm": 0.056395888328552246, "learning_rate": 8.671990294623265e-06, "loss": 0.0941, "num_input_tokens_seen": 73789728, "step": 34190 }, { "epoch": 6.275463387777574, "grad_norm": 0.07039429247379303, "learning_rate": 8.671446758758523e-06, "loss": 0.076, "num_input_tokens_seen": 73799968, "step": 34195 }, { "epoch": 6.276380987337126, "grad_norm": 0.020427238196134567, "learning_rate": 8.670903128726862e-06, "loss": 0.056, "num_input_tokens_seen": 73810912, "step": 34200 }, { "epoch": 6.277298586896678, "grad_norm": 0.03314514458179474, "learning_rate": 8.670359404542226e-06, "loss": 0.0075, "num_input_tokens_seen": 73821792, "step": 34205 }, { "epoch": 6.2782161864562305, "grad_norm": 0.04168044403195381, "learning_rate": 8.669815586218559e-06, "loss": 0.001, "num_input_tokens_seen": 73832928, "step": 34210 }, { "epoch": 6.279133786015783, "grad_norm": 2.864065647125244, "learning_rate": 8.669271673769811e-06, "loss": 0.0976, "num_input_tokens_seen": 73843232, "step": 34215 }, { "epoch": 6.280051385575335, "grad_norm": 11.65116024017334, "learning_rate": 8.66872766720993e-06, "loss": 0.0983, "num_input_tokens_seen": 73853856, "step": 34220 }, { "epoch": 6.280968985134887, "grad_norm": 0.042688410729169846, "learning_rate": 8.66818356655287e-06, "loss": 0.0026, "num_input_tokens_seen": 73865280, "step": 34225 }, { "epoch": 6.28188658469444, "grad_norm": 118.80702209472656, "learning_rate": 8.667639371812588e-06, "loss": 0.0554, "num_input_tokens_seen": 73876096, "step": 34230 }, { "epoch": 6.282804184253991, "grad_norm": 0.016980871558189392, "learning_rate": 8.66709508300304e-06, "loss": 0.0013, "num_input_tokens_seen": 73885888, "step": 34235 }, { "epoch": 6.283721783813544, "grad_norm": 0.018997177481651306, "learning_rate": 8.666550700138187e-06, "loss": 0.1073, "num_input_tokens_seen": 73896160, "step": 34240 }, { "epoch": 6.284639383373096, "grad_norm": 5.150221824645996, "learning_rate": 8.66600622323199e-06, "loss": 0.242, "num_input_tokens_seen": 73906944, "step": 34245 }, { "epoch": 6.285556982932648, "grad_norm": 14.872992515563965, "learning_rate": 8.665461652298416e-06, "loss": 0.1018, "num_input_tokens_seen": 73919136, "step": 34250 }, { "epoch": 6.2864745824922, "grad_norm": 0.11050546914339066, "learning_rate": 8.664916987351432e-06, "loss": 0.0014, "num_input_tokens_seen": 73930464, "step": 34255 }, { "epoch": 6.287392182051753, "grad_norm": 9.997719764709473, "learning_rate": 8.664372228405007e-06, "loss": 0.2865, "num_input_tokens_seen": 73939808, "step": 34260 }, { "epoch": 6.2883097816113045, "grad_norm": 0.023525379598140717, "learning_rate": 8.663827375473114e-06, "loss": 0.0016, "num_input_tokens_seen": 73950304, "step": 34265 }, { "epoch": 6.289227381170857, "grad_norm": 0.023351430892944336, "learning_rate": 8.663282428569728e-06, "loss": 0.1231, "num_input_tokens_seen": 73961760, "step": 34270 }, { "epoch": 6.2901449807304095, "grad_norm": 0.057180557399988174, "learning_rate": 8.662737387708822e-06, "loss": 0.0019, "num_input_tokens_seen": 73972640, "step": 34275 }, { "epoch": 6.291062580289961, "grad_norm": 0.04018879309296608, "learning_rate": 8.662192252904381e-06, "loss": 0.0025, "num_input_tokens_seen": 73983040, "step": 34280 }, { "epoch": 6.291980179849514, "grad_norm": 0.04494345560669899, "learning_rate": 8.661647024170385e-06, "loss": 0.0017, "num_input_tokens_seen": 73995072, "step": 34285 }, { "epoch": 6.292897779409066, "grad_norm": 0.050980545580387115, "learning_rate": 8.661101701520817e-06, "loss": 0.0016, "num_input_tokens_seen": 74004736, "step": 34290 }, { "epoch": 6.293815378968618, "grad_norm": 15.141953468322754, "learning_rate": 8.660556284969665e-06, "loss": 0.0911, "num_input_tokens_seen": 74015296, "step": 34295 }, { "epoch": 6.29473297852817, "grad_norm": 0.05851507559418678, "learning_rate": 8.660010774530917e-06, "loss": 0.2422, "num_input_tokens_seen": 74025024, "step": 34300 }, { "epoch": 6.295650578087723, "grad_norm": 0.10913292318582535, "learning_rate": 8.659465170218565e-06, "loss": 0.0057, "num_input_tokens_seen": 74035264, "step": 34305 }, { "epoch": 6.296568177647274, "grad_norm": 41.02510452270508, "learning_rate": 8.658919472046606e-06, "loss": 0.0119, "num_input_tokens_seen": 74046880, "step": 34310 }, { "epoch": 6.297485777206827, "grad_norm": 0.03812826797366142, "learning_rate": 8.658373680029029e-06, "loss": 0.1909, "num_input_tokens_seen": 74055840, "step": 34315 }, { "epoch": 6.298403376766379, "grad_norm": 0.777815043926239, "learning_rate": 8.65782779417984e-06, "loss": 0.0285, "num_input_tokens_seen": 74065920, "step": 34320 }, { "epoch": 6.299320976325931, "grad_norm": 0.06430672109127045, "learning_rate": 8.657281814513037e-06, "loss": 0.3409, "num_input_tokens_seen": 74077024, "step": 34325 }, { "epoch": 6.3002385758854835, "grad_norm": 0.26389777660369873, "learning_rate": 8.65673574104262e-06, "loss": 0.2318, "num_input_tokens_seen": 74086912, "step": 34330 }, { "epoch": 6.301156175445036, "grad_norm": 3.123117446899414, "learning_rate": 8.656189573782602e-06, "loss": 0.0667, "num_input_tokens_seen": 74097408, "step": 34335 }, { "epoch": 6.302073775004588, "grad_norm": 0.07741282880306244, "learning_rate": 8.655643312746988e-06, "loss": 0.0027, "num_input_tokens_seen": 74109312, "step": 34340 }, { "epoch": 6.30299137456414, "grad_norm": 19.525449752807617, "learning_rate": 8.65509695794979e-06, "loss": 0.1171, "num_input_tokens_seen": 74120128, "step": 34345 }, { "epoch": 6.303908974123693, "grad_norm": 0.18451082706451416, "learning_rate": 8.654550509405018e-06, "loss": 0.1419, "num_input_tokens_seen": 74130208, "step": 34350 }, { "epoch": 6.304826573683244, "grad_norm": 0.03719450905919075, "learning_rate": 8.65400396712669e-06, "loss": 0.1502, "num_input_tokens_seen": 74141600, "step": 34355 }, { "epoch": 6.305744173242797, "grad_norm": 0.07666297256946564, "learning_rate": 8.653457331128823e-06, "loss": 0.0547, "num_input_tokens_seen": 74153088, "step": 34360 }, { "epoch": 6.306661772802349, "grad_norm": 10.676076889038086, "learning_rate": 8.652910601425438e-06, "loss": 0.1239, "num_input_tokens_seen": 74164384, "step": 34365 }, { "epoch": 6.307579372361901, "grad_norm": 0.0662873163819313, "learning_rate": 8.652363778030558e-06, "loss": 0.1005, "num_input_tokens_seen": 74176576, "step": 34370 }, { "epoch": 6.308496971921453, "grad_norm": 0.033684004098176956, "learning_rate": 8.651816860958207e-06, "loss": 0.0021, "num_input_tokens_seen": 74187680, "step": 34375 }, { "epoch": 6.309414571481006, "grad_norm": 0.20542903244495392, "learning_rate": 8.651269850222414e-06, "loss": 0.0042, "num_input_tokens_seen": 74197440, "step": 34380 }, { "epoch": 6.3103321710405575, "grad_norm": 0.46596279740333557, "learning_rate": 8.650722745837208e-06, "loss": 0.2568, "num_input_tokens_seen": 74208416, "step": 34385 }, { "epoch": 6.31124977060011, "grad_norm": 0.23164094984531403, "learning_rate": 8.650175547816621e-06, "loss": 0.0033, "num_input_tokens_seen": 74218400, "step": 34390 }, { "epoch": 6.312167370159663, "grad_norm": 0.09787441790103912, "learning_rate": 8.649628256174689e-06, "loss": 0.0448, "num_input_tokens_seen": 74227776, "step": 34395 }, { "epoch": 6.313084969719214, "grad_norm": 0.17705541849136353, "learning_rate": 8.64908087092545e-06, "loss": 0.1236, "num_input_tokens_seen": 74238016, "step": 34400 }, { "epoch": 6.314002569278767, "grad_norm": 0.026108268648386, "learning_rate": 8.648533392082941e-06, "loss": 0.1557, "num_input_tokens_seen": 74248320, "step": 34405 }, { "epoch": 6.314920168838319, "grad_norm": 0.07614798843860626, "learning_rate": 8.647985819661204e-06, "loss": 0.0033, "num_input_tokens_seen": 74259104, "step": 34410 }, { "epoch": 6.315837768397871, "grad_norm": 0.045319195836782455, "learning_rate": 8.647438153674286e-06, "loss": 0.0178, "num_input_tokens_seen": 74269696, "step": 34415 }, { "epoch": 6.316755367957423, "grad_norm": 0.02564156800508499, "learning_rate": 8.646890394136232e-06, "loss": 0.1452, "num_input_tokens_seen": 74280064, "step": 34420 }, { "epoch": 6.317672967516976, "grad_norm": 0.32225969433784485, "learning_rate": 8.646342541061093e-06, "loss": 0.1583, "num_input_tokens_seen": 74291456, "step": 34425 }, { "epoch": 6.318590567076527, "grad_norm": 13.31888198852539, "learning_rate": 8.645794594462918e-06, "loss": 0.108, "num_input_tokens_seen": 74302048, "step": 34430 }, { "epoch": 6.31950816663608, "grad_norm": 0.05323462933301926, "learning_rate": 8.645246554355761e-06, "loss": 0.0025, "num_input_tokens_seen": 74311712, "step": 34435 }, { "epoch": 6.3204257661956325, "grad_norm": 0.09389448910951614, "learning_rate": 8.64469842075368e-06, "loss": 0.1425, "num_input_tokens_seen": 74322880, "step": 34440 }, { "epoch": 6.321343365755184, "grad_norm": 0.04116087034344673, "learning_rate": 8.644150193670735e-06, "loss": 0.0861, "num_input_tokens_seen": 74333312, "step": 34445 }, { "epoch": 6.322260965314737, "grad_norm": 6.26202917098999, "learning_rate": 8.643601873120983e-06, "loss": 0.2687, "num_input_tokens_seen": 74344608, "step": 34450 }, { "epoch": 6.323178564874289, "grad_norm": 0.02775406278669834, "learning_rate": 8.643053459118492e-06, "loss": 0.2106, "num_input_tokens_seen": 74355616, "step": 34455 }, { "epoch": 6.324096164433841, "grad_norm": 0.8597356677055359, "learning_rate": 8.642504951677325e-06, "loss": 0.0019, "num_input_tokens_seen": 74367168, "step": 34460 }, { "epoch": 6.325013763993393, "grad_norm": 0.23713986575603485, "learning_rate": 8.64195635081155e-06, "loss": 0.2228, "num_input_tokens_seen": 74378400, "step": 34465 }, { "epoch": 6.325931363552946, "grad_norm": 39.72104263305664, "learning_rate": 8.641407656535242e-06, "loss": 0.1843, "num_input_tokens_seen": 74388320, "step": 34470 }, { "epoch": 6.326848963112497, "grad_norm": 0.08074882626533508, "learning_rate": 8.64085886886247e-06, "loss": 0.2148, "num_input_tokens_seen": 74399168, "step": 34475 }, { "epoch": 6.32776656267205, "grad_norm": 0.041775867342948914, "learning_rate": 8.640309987807311e-06, "loss": 0.1594, "num_input_tokens_seen": 74409472, "step": 34480 }, { "epoch": 6.328684162231602, "grad_norm": 0.16738566756248474, "learning_rate": 8.639761013383842e-06, "loss": 0.0042, "num_input_tokens_seen": 74420064, "step": 34485 }, { "epoch": 6.329601761791154, "grad_norm": 0.10081353038549423, "learning_rate": 8.639211945606146e-06, "loss": 0.0027, "num_input_tokens_seen": 74430112, "step": 34490 }, { "epoch": 6.3305193613507065, "grad_norm": 0.07738696783781052, "learning_rate": 8.638662784488302e-06, "loss": 0.1571, "num_input_tokens_seen": 74441824, "step": 34495 }, { "epoch": 6.331436960910259, "grad_norm": 0.1916603296995163, "learning_rate": 8.638113530044397e-06, "loss": 0.0022, "num_input_tokens_seen": 74452832, "step": 34500 }, { "epoch": 6.332354560469811, "grad_norm": 15.534010887145996, "learning_rate": 8.63756418228852e-06, "loss": 0.309, "num_input_tokens_seen": 74463392, "step": 34505 }, { "epoch": 6.333272160029363, "grad_norm": 5.275966167449951, "learning_rate": 8.637014741234758e-06, "loss": 0.3572, "num_input_tokens_seen": 74473632, "step": 34510 }, { "epoch": 6.334189759588916, "grad_norm": 0.10216835141181946, "learning_rate": 8.636465206897207e-06, "loss": 0.2561, "num_input_tokens_seen": 74484352, "step": 34515 }, { "epoch": 6.335107359148467, "grad_norm": 0.2769050598144531, "learning_rate": 8.635915579289957e-06, "loss": 0.1329, "num_input_tokens_seen": 74494784, "step": 34520 }, { "epoch": 6.33602495870802, "grad_norm": 0.4943059980869293, "learning_rate": 8.635365858427107e-06, "loss": 0.0704, "num_input_tokens_seen": 74504992, "step": 34525 }, { "epoch": 6.336942558267572, "grad_norm": 0.029278814792633057, "learning_rate": 8.634816044322759e-06, "loss": 0.1054, "num_input_tokens_seen": 74516448, "step": 34530 }, { "epoch": 6.337860157827124, "grad_norm": 0.5072154998779297, "learning_rate": 8.634266136991011e-06, "loss": 0.0042, "num_input_tokens_seen": 74527968, "step": 34535 }, { "epoch": 6.338777757386676, "grad_norm": 0.27648574113845825, "learning_rate": 8.633716136445971e-06, "loss": 0.0063, "num_input_tokens_seen": 74539040, "step": 34540 }, { "epoch": 6.339695356946229, "grad_norm": 7.1120123863220215, "learning_rate": 8.633166042701744e-06, "loss": 0.0952, "num_input_tokens_seen": 74549728, "step": 34545 }, { "epoch": 6.3406129565057805, "grad_norm": 18.228044509887695, "learning_rate": 8.632615855772439e-06, "loss": 0.1655, "num_input_tokens_seen": 74558816, "step": 34550 }, { "epoch": 6.341530556065333, "grad_norm": 0.0433671697974205, "learning_rate": 8.632065575672166e-06, "loss": 0.0877, "num_input_tokens_seen": 74570240, "step": 34555 }, { "epoch": 6.3424481556248855, "grad_norm": 0.11690399795770645, "learning_rate": 8.631515202415041e-06, "loss": 0.1061, "num_input_tokens_seen": 74581248, "step": 34560 }, { "epoch": 6.343365755184437, "grad_norm": 0.0660136267542839, "learning_rate": 8.630964736015179e-06, "loss": 0.0065, "num_input_tokens_seen": 74590336, "step": 34565 }, { "epoch": 6.34428335474399, "grad_norm": 0.13102160394191742, "learning_rate": 8.630414176486699e-06, "loss": 0.0039, "num_input_tokens_seen": 74601504, "step": 34570 }, { "epoch": 6.345200954303542, "grad_norm": 40.16535568237305, "learning_rate": 8.629863523843722e-06, "loss": 0.3274, "num_input_tokens_seen": 74612544, "step": 34575 }, { "epoch": 6.346118553863094, "grad_norm": 2.7313899993896484, "learning_rate": 8.629312778100371e-06, "loss": 0.1589, "num_input_tokens_seen": 74622208, "step": 34580 }, { "epoch": 6.347036153422646, "grad_norm": 8.780875205993652, "learning_rate": 8.628761939270774e-06, "loss": 0.1269, "num_input_tokens_seen": 74634016, "step": 34585 }, { "epoch": 6.347953752982199, "grad_norm": 43.60552978515625, "learning_rate": 8.628211007369056e-06, "loss": 0.0349, "num_input_tokens_seen": 74644928, "step": 34590 }, { "epoch": 6.34887135254175, "grad_norm": 0.026456862688064575, "learning_rate": 8.62765998240935e-06, "loss": 0.0143, "num_input_tokens_seen": 74656960, "step": 34595 }, { "epoch": 6.349788952101303, "grad_norm": 28.366661071777344, "learning_rate": 8.627108864405784e-06, "loss": 0.0522, "num_input_tokens_seen": 74667232, "step": 34600 }, { "epoch": 6.350706551660855, "grad_norm": 0.847540557384491, "learning_rate": 8.6265576533725e-06, "loss": 0.3067, "num_input_tokens_seen": 74678144, "step": 34605 }, { "epoch": 6.351624151220407, "grad_norm": 18.332475662231445, "learning_rate": 8.626006349323633e-06, "loss": 0.3003, "num_input_tokens_seen": 74688416, "step": 34610 }, { "epoch": 6.3525417507799595, "grad_norm": 68.59867858886719, "learning_rate": 8.625454952273323e-06, "loss": 0.164, "num_input_tokens_seen": 74699520, "step": 34615 }, { "epoch": 6.353459350339512, "grad_norm": 0.8063937425613403, "learning_rate": 8.624903462235713e-06, "loss": 0.0034, "num_input_tokens_seen": 74710240, "step": 34620 }, { "epoch": 6.354376949899064, "grad_norm": 1.1141865253448486, "learning_rate": 8.624351879224945e-06, "loss": 0.0054, "num_input_tokens_seen": 74720448, "step": 34625 }, { "epoch": 6.355294549458616, "grad_norm": 0.24705801904201508, "learning_rate": 8.62380020325517e-06, "loss": 0.0895, "num_input_tokens_seen": 74730144, "step": 34630 }, { "epoch": 6.356212149018169, "grad_norm": 0.08233857154846191, "learning_rate": 8.623248434340537e-06, "loss": 0.1848, "num_input_tokens_seen": 74742080, "step": 34635 }, { "epoch": 6.35712974857772, "grad_norm": 0.21308667957782745, "learning_rate": 8.622696572495195e-06, "loss": 0.0029, "num_input_tokens_seen": 74752160, "step": 34640 }, { "epoch": 6.358047348137273, "grad_norm": 0.11937617510557175, "learning_rate": 8.622144617733302e-06, "loss": 0.1179, "num_input_tokens_seen": 74762880, "step": 34645 }, { "epoch": 6.358964947696825, "grad_norm": 0.03405369445681572, "learning_rate": 8.621592570069013e-06, "loss": 0.069, "num_input_tokens_seen": 74774272, "step": 34650 }, { "epoch": 6.359882547256377, "grad_norm": 0.05487215146422386, "learning_rate": 8.621040429516488e-06, "loss": 0.0152, "num_input_tokens_seen": 74783328, "step": 34655 }, { "epoch": 6.360800146815929, "grad_norm": 0.12256123870611191, "learning_rate": 8.620488196089888e-06, "loss": 0.0036, "num_input_tokens_seen": 74795296, "step": 34660 }, { "epoch": 6.361717746375482, "grad_norm": 6.388733386993408, "learning_rate": 8.619935869803378e-06, "loss": 0.2351, "num_input_tokens_seen": 74805056, "step": 34665 }, { "epoch": 6.3626353459350335, "grad_norm": 0.06026969105005264, "learning_rate": 8.619383450671121e-06, "loss": 0.1894, "num_input_tokens_seen": 74816896, "step": 34670 }, { "epoch": 6.363552945494586, "grad_norm": 1.0830528736114502, "learning_rate": 8.61883093870729e-06, "loss": 0.197, "num_input_tokens_seen": 74826496, "step": 34675 }, { "epoch": 6.364470545054139, "grad_norm": 0.076905257999897, "learning_rate": 8.618278333926053e-06, "loss": 0.0307, "num_input_tokens_seen": 74837792, "step": 34680 }, { "epoch": 6.36538814461369, "grad_norm": 0.02913903445005417, "learning_rate": 8.617725636341585e-06, "loss": 0.1639, "num_input_tokens_seen": 74847936, "step": 34685 }, { "epoch": 6.366305744173243, "grad_norm": 0.1880713403224945, "learning_rate": 8.61717284596806e-06, "loss": 0.003, "num_input_tokens_seen": 74858784, "step": 34690 }, { "epoch": 6.367223343732795, "grad_norm": 13.546073913574219, "learning_rate": 8.61661996281966e-06, "loss": 0.133, "num_input_tokens_seen": 74870368, "step": 34695 }, { "epoch": 6.368140943292347, "grad_norm": 16.662525177001953, "learning_rate": 8.616066986910561e-06, "loss": 0.1499, "num_input_tokens_seen": 74881440, "step": 34700 }, { "epoch": 6.369058542851899, "grad_norm": 23.327625274658203, "learning_rate": 8.61551391825495e-06, "loss": 0.1607, "num_input_tokens_seen": 74892128, "step": 34705 }, { "epoch": 6.369976142411452, "grad_norm": 0.08524332195520401, "learning_rate": 8.614960756867009e-06, "loss": 0.2427, "num_input_tokens_seen": 74902112, "step": 34710 }, { "epoch": 6.370893741971003, "grad_norm": 0.14193134009838104, "learning_rate": 8.614407502760928e-06, "loss": 0.0017, "num_input_tokens_seen": 74913088, "step": 34715 }, { "epoch": 6.371811341530556, "grad_norm": 0.044314682483673096, "learning_rate": 8.613854155950897e-06, "loss": 0.2876, "num_input_tokens_seen": 74922976, "step": 34720 }, { "epoch": 6.3727289410901085, "grad_norm": 0.19936518371105194, "learning_rate": 8.613300716451107e-06, "loss": 0.1202, "num_input_tokens_seen": 74933856, "step": 34725 }, { "epoch": 6.37364654064966, "grad_norm": 0.26527783274650574, "learning_rate": 8.612747184275753e-06, "loss": 0.1445, "num_input_tokens_seen": 74943744, "step": 34730 }, { "epoch": 6.374564140209213, "grad_norm": 0.9253643751144409, "learning_rate": 8.612193559439035e-06, "loss": 0.0194, "num_input_tokens_seen": 74954304, "step": 34735 }, { "epoch": 6.375481739768765, "grad_norm": 35.40376663208008, "learning_rate": 8.61163984195515e-06, "loss": 0.1574, "num_input_tokens_seen": 74966240, "step": 34740 }, { "epoch": 6.376399339328317, "grad_norm": 0.021140968427062035, "learning_rate": 8.6110860318383e-06, "loss": 0.0262, "num_input_tokens_seen": 74976544, "step": 34745 }, { "epoch": 6.377316938887869, "grad_norm": 0.02835298329591751, "learning_rate": 8.610532129102689e-06, "loss": 0.0043, "num_input_tokens_seen": 74985760, "step": 34750 }, { "epoch": 6.378234538447422, "grad_norm": 0.059093598276376724, "learning_rate": 8.609978133762527e-06, "loss": 0.0021, "num_input_tokens_seen": 74997728, "step": 34755 }, { "epoch": 6.379152138006974, "grad_norm": 0.030966568738222122, "learning_rate": 8.60942404583202e-06, "loss": 0.3624, "num_input_tokens_seen": 75009312, "step": 34760 }, { "epoch": 6.380069737566526, "grad_norm": 0.14962245523929596, "learning_rate": 8.60886986532538e-06, "loss": 0.0195, "num_input_tokens_seen": 75020160, "step": 34765 }, { "epoch": 6.380987337126078, "grad_norm": 0.06759241968393326, "learning_rate": 8.60831559225682e-06, "loss": 0.0809, "num_input_tokens_seen": 75030496, "step": 34770 }, { "epoch": 6.381904936685631, "grad_norm": 0.08965427428483963, "learning_rate": 8.607761226640559e-06, "loss": 0.1541, "num_input_tokens_seen": 75041280, "step": 34775 }, { "epoch": 6.3828225362451825, "grad_norm": 0.055509302765131, "learning_rate": 8.607206768490815e-06, "loss": 0.0023, "num_input_tokens_seen": 75052416, "step": 34780 }, { "epoch": 6.383740135804735, "grad_norm": 0.006895633414387703, "learning_rate": 8.606652217821806e-06, "loss": 0.1282, "num_input_tokens_seen": 75061952, "step": 34785 }, { "epoch": 6.3846577353642875, "grad_norm": 0.05007459223270416, "learning_rate": 8.606097574647759e-06, "loss": 0.1971, "num_input_tokens_seen": 75072320, "step": 34790 }, { "epoch": 6.385575334923839, "grad_norm": 64.0281982421875, "learning_rate": 8.605542838982896e-06, "loss": 0.2663, "num_input_tokens_seen": 75083200, "step": 34795 }, { "epoch": 6.386492934483392, "grad_norm": 0.028890738263726234, "learning_rate": 8.604988010841448e-06, "loss": 0.1763, "num_input_tokens_seen": 75093248, "step": 34800 }, { "epoch": 6.387410534042944, "grad_norm": 0.17328707873821259, "learning_rate": 8.604433090237646e-06, "loss": 0.0022, "num_input_tokens_seen": 75104384, "step": 34805 }, { "epoch": 6.388328133602496, "grad_norm": 48.35887908935547, "learning_rate": 8.60387807718572e-06, "loss": 0.167, "num_input_tokens_seen": 75114720, "step": 34810 }, { "epoch": 6.389245733162048, "grad_norm": 13.810452461242676, "learning_rate": 8.603322971699908e-06, "loss": 0.5979, "num_input_tokens_seen": 75125088, "step": 34815 }, { "epoch": 6.390163332721601, "grad_norm": 0.7054336667060852, "learning_rate": 8.602767773794447e-06, "loss": 0.1324, "num_input_tokens_seen": 75136576, "step": 34820 }, { "epoch": 6.391080932281152, "grad_norm": 0.11551645398139954, "learning_rate": 8.602212483483575e-06, "loss": 0.0024, "num_input_tokens_seen": 75148480, "step": 34825 }, { "epoch": 6.391998531840705, "grad_norm": 0.053545162081718445, "learning_rate": 8.601657100781537e-06, "loss": 0.0118, "num_input_tokens_seen": 75158976, "step": 34830 }, { "epoch": 6.392916131400257, "grad_norm": 0.024066179990768433, "learning_rate": 8.601101625702575e-06, "loss": 0.1082, "num_input_tokens_seen": 75169632, "step": 34835 }, { "epoch": 6.393833730959809, "grad_norm": 22.566844940185547, "learning_rate": 8.60054605826094e-06, "loss": 0.1988, "num_input_tokens_seen": 75180512, "step": 34840 }, { "epoch": 6.3947513305193615, "grad_norm": 0.09830784797668457, "learning_rate": 8.599990398470875e-06, "loss": 0.0068, "num_input_tokens_seen": 75191936, "step": 34845 }, { "epoch": 6.395668930078914, "grad_norm": 6.086650371551514, "learning_rate": 8.599434646346638e-06, "loss": 0.3629, "num_input_tokens_seen": 75203200, "step": 34850 }, { "epoch": 6.396586529638466, "grad_norm": 0.03009163960814476, "learning_rate": 8.598878801902481e-06, "loss": 0.4088, "num_input_tokens_seen": 75213920, "step": 34855 }, { "epoch": 6.397504129198018, "grad_norm": 5.278161525726318, "learning_rate": 8.598322865152661e-06, "loss": 0.2686, "num_input_tokens_seen": 75225504, "step": 34860 }, { "epoch": 6.398421728757571, "grad_norm": 0.142783060669899, "learning_rate": 8.597766836111434e-06, "loss": 0.0476, "num_input_tokens_seen": 75236384, "step": 34865 }, { "epoch": 6.399339328317122, "grad_norm": 0.6602503657341003, "learning_rate": 8.597210714793068e-06, "loss": 0.0094, "num_input_tokens_seen": 75247232, "step": 34870 }, { "epoch": 6.400256927876675, "grad_norm": 0.05436132475733757, "learning_rate": 8.596654501211819e-06, "loss": 0.2375, "num_input_tokens_seen": 75258048, "step": 34875 }, { "epoch": 6.401174527436227, "grad_norm": 84.70658874511719, "learning_rate": 8.596098195381956e-06, "loss": 0.1308, "num_input_tokens_seen": 75269216, "step": 34880 }, { "epoch": 6.402092126995779, "grad_norm": 0.06427206844091415, "learning_rate": 8.595541797317751e-06, "loss": 0.0023, "num_input_tokens_seen": 75281472, "step": 34885 }, { "epoch": 6.403009726555331, "grad_norm": 0.05601794645190239, "learning_rate": 8.594985307033467e-06, "loss": 0.1209, "num_input_tokens_seen": 75292384, "step": 34890 }, { "epoch": 6.403927326114884, "grad_norm": 0.046281442046165466, "learning_rate": 8.594428724543384e-06, "loss": 0.1533, "num_input_tokens_seen": 75302720, "step": 34895 }, { "epoch": 6.4048449256744355, "grad_norm": 47.670570373535156, "learning_rate": 8.593872049861776e-06, "loss": 0.0315, "num_input_tokens_seen": 75311936, "step": 34900 }, { "epoch": 6.405762525233988, "grad_norm": 7.3831257820129395, "learning_rate": 8.593315283002919e-06, "loss": 0.3865, "num_input_tokens_seen": 75324032, "step": 34905 }, { "epoch": 6.4066801247935405, "grad_norm": 0.08781249076128006, "learning_rate": 8.592758423981093e-06, "loss": 0.1173, "num_input_tokens_seen": 75334112, "step": 34910 }, { "epoch": 6.407597724353092, "grad_norm": 13.005316734313965, "learning_rate": 8.592201472810584e-06, "loss": 0.0964, "num_input_tokens_seen": 75344896, "step": 34915 }, { "epoch": 6.408515323912645, "grad_norm": 1.0775198936462402, "learning_rate": 8.591644429505672e-06, "loss": 0.3439, "num_input_tokens_seen": 75356256, "step": 34920 }, { "epoch": 6.409432923472197, "grad_norm": 0.06102947145700455, "learning_rate": 8.591087294080648e-06, "loss": 0.0519, "num_input_tokens_seen": 75367072, "step": 34925 }, { "epoch": 6.410350523031749, "grad_norm": 0.060966119170188904, "learning_rate": 8.590530066549802e-06, "loss": 0.2066, "num_input_tokens_seen": 75377568, "step": 34930 }, { "epoch": 6.411268122591301, "grad_norm": 0.12970322370529175, "learning_rate": 8.589972746927425e-06, "loss": 0.247, "num_input_tokens_seen": 75389024, "step": 34935 }, { "epoch": 6.412185722150854, "grad_norm": 0.12897929549217224, "learning_rate": 8.58941533522781e-06, "loss": 0.0032, "num_input_tokens_seen": 75399264, "step": 34940 }, { "epoch": 6.413103321710405, "grad_norm": 3.4423582553863525, "learning_rate": 8.588857831465252e-06, "loss": 0.0085, "num_input_tokens_seen": 75410528, "step": 34945 }, { "epoch": 6.414020921269958, "grad_norm": 0.13647404313087463, "learning_rate": 8.588300235654055e-06, "loss": 0.0969, "num_input_tokens_seen": 75420832, "step": 34950 }, { "epoch": 6.41493852082951, "grad_norm": 0.23914794623851776, "learning_rate": 8.587742547808519e-06, "loss": 0.1244, "num_input_tokens_seen": 75432512, "step": 34955 }, { "epoch": 6.415856120389062, "grad_norm": 0.13447734713554382, "learning_rate": 8.587184767942946e-06, "loss": 0.1637, "num_input_tokens_seen": 75443744, "step": 34960 }, { "epoch": 6.416773719948615, "grad_norm": 0.0223796796053648, "learning_rate": 8.586626896071643e-06, "loss": 0.1836, "num_input_tokens_seen": 75454304, "step": 34965 }, { "epoch": 6.417691319508167, "grad_norm": 0.3505801856517792, "learning_rate": 8.586068932208922e-06, "loss": 0.1518, "num_input_tokens_seen": 75464832, "step": 34970 }, { "epoch": 6.418608919067719, "grad_norm": 24.772518157958984, "learning_rate": 8.585510876369088e-06, "loss": 0.2213, "num_input_tokens_seen": 75476224, "step": 34975 }, { "epoch": 6.419526518627271, "grad_norm": 0.26694974303245544, "learning_rate": 8.584952728566459e-06, "loss": 0.1178, "num_input_tokens_seen": 75487776, "step": 34980 }, { "epoch": 6.420444118186824, "grad_norm": 0.1395617127418518, "learning_rate": 8.584394488815347e-06, "loss": 0.1168, "num_input_tokens_seen": 75499552, "step": 34985 }, { "epoch": 6.421361717746375, "grad_norm": 0.05189325660467148, "learning_rate": 8.58383615713007e-06, "loss": 0.0021, "num_input_tokens_seen": 75510368, "step": 34990 }, { "epoch": 6.422279317305928, "grad_norm": 0.10829728096723557, "learning_rate": 8.583277733524952e-06, "loss": 0.1314, "num_input_tokens_seen": 75521088, "step": 34995 }, { "epoch": 6.42319691686548, "grad_norm": 0.2625485956668854, "learning_rate": 8.582719218014314e-06, "loss": 0.1032, "num_input_tokens_seen": 75531648, "step": 35000 }, { "epoch": 6.424114516425032, "grad_norm": 0.15989769995212555, "learning_rate": 8.58216061061248e-06, "loss": 0.0019, "num_input_tokens_seen": 75542432, "step": 35005 }, { "epoch": 6.4250321159845845, "grad_norm": 75.16629791259766, "learning_rate": 8.581601911333778e-06, "loss": 0.2458, "num_input_tokens_seen": 75552480, "step": 35010 }, { "epoch": 6.425949715544137, "grad_norm": 0.23048709332942963, "learning_rate": 8.581043120192541e-06, "loss": 0.0026, "num_input_tokens_seen": 75563968, "step": 35015 }, { "epoch": 6.426867315103689, "grad_norm": 0.5013495087623596, "learning_rate": 8.580484237203095e-06, "loss": 0.1702, "num_input_tokens_seen": 75575552, "step": 35020 }, { "epoch": 6.427784914663241, "grad_norm": 0.03932826220989227, "learning_rate": 8.579925262379778e-06, "loss": 0.2923, "num_input_tokens_seen": 75585440, "step": 35025 }, { "epoch": 6.428702514222794, "grad_norm": 1.4788930416107178, "learning_rate": 8.579366195736927e-06, "loss": 0.1261, "num_input_tokens_seen": 75595968, "step": 35030 }, { "epoch": 6.429620113782345, "grad_norm": 0.021920524537563324, "learning_rate": 8.57880703728888e-06, "loss": 0.0829, "num_input_tokens_seen": 75606112, "step": 35035 }, { "epoch": 6.430537713341898, "grad_norm": 0.07731948792934418, "learning_rate": 8.57824778704998e-06, "loss": 0.0382, "num_input_tokens_seen": 75616096, "step": 35040 }, { "epoch": 6.43145531290145, "grad_norm": 0.011442376300692558, "learning_rate": 8.57768844503457e-06, "loss": 0.0804, "num_input_tokens_seen": 75626304, "step": 35045 }, { "epoch": 6.432372912461002, "grad_norm": 0.06832727044820786, "learning_rate": 8.577129011256996e-06, "loss": 0.0036, "num_input_tokens_seen": 75637056, "step": 35050 }, { "epoch": 6.433290512020554, "grad_norm": 0.2871078550815582, "learning_rate": 8.576569485731605e-06, "loss": 0.0029, "num_input_tokens_seen": 75648320, "step": 35055 }, { "epoch": 6.434208111580107, "grad_norm": 0.08037587255239487, "learning_rate": 8.57600986847275e-06, "loss": 0.0024, "num_input_tokens_seen": 75659136, "step": 35060 }, { "epoch": 6.4351257111396585, "grad_norm": 1.3769454956054688, "learning_rate": 8.575450159494787e-06, "loss": 0.0026, "num_input_tokens_seen": 75670016, "step": 35065 }, { "epoch": 6.436043310699211, "grad_norm": 0.039927028119564056, "learning_rate": 8.574890358812066e-06, "loss": 0.2003, "num_input_tokens_seen": 75681152, "step": 35070 }, { "epoch": 6.4369609102587635, "grad_norm": 0.015234604477882385, "learning_rate": 8.57433046643895e-06, "loss": 0.0012, "num_input_tokens_seen": 75692640, "step": 35075 }, { "epoch": 6.437878509818315, "grad_norm": 4.42043924331665, "learning_rate": 8.573770482389799e-06, "loss": 0.0028, "num_input_tokens_seen": 75703168, "step": 35080 }, { "epoch": 6.438796109377868, "grad_norm": 13.064629554748535, "learning_rate": 8.573210406678972e-06, "loss": 0.3949, "num_input_tokens_seen": 75713248, "step": 35085 }, { "epoch": 6.43971370893742, "grad_norm": 0.024077005684375763, "learning_rate": 8.572650239320835e-06, "loss": 0.1156, "num_input_tokens_seen": 75725344, "step": 35090 }, { "epoch": 6.440631308496972, "grad_norm": 0.13809514045715332, "learning_rate": 8.572089980329757e-06, "loss": 0.0502, "num_input_tokens_seen": 75735136, "step": 35095 }, { "epoch": 6.441548908056524, "grad_norm": 22.57853126525879, "learning_rate": 8.571529629720107e-06, "loss": 0.2813, "num_input_tokens_seen": 75745312, "step": 35100 }, { "epoch": 6.442466507616077, "grad_norm": 16.074674606323242, "learning_rate": 8.570969187506257e-06, "loss": 0.2771, "num_input_tokens_seen": 75756608, "step": 35105 }, { "epoch": 6.443384107175628, "grad_norm": 0.07665418833494186, "learning_rate": 8.570408653702582e-06, "loss": 0.1182, "num_input_tokens_seen": 75767680, "step": 35110 }, { "epoch": 6.444301706735181, "grad_norm": 0.0628240630030632, "learning_rate": 8.56984802832346e-06, "loss": 0.3512, "num_input_tokens_seen": 75778752, "step": 35115 }, { "epoch": 6.445219306294733, "grad_norm": 36.53213882446289, "learning_rate": 8.569287311383268e-06, "loss": 0.1158, "num_input_tokens_seen": 75789568, "step": 35120 }, { "epoch": 6.446136905854285, "grad_norm": 0.25791066884994507, "learning_rate": 8.568726502896389e-06, "loss": 0.0033, "num_input_tokens_seen": 75800576, "step": 35125 }, { "epoch": 6.4470545054138375, "grad_norm": 0.10579461604356766, "learning_rate": 8.568165602877206e-06, "loss": 0.1849, "num_input_tokens_seen": 75811552, "step": 35130 }, { "epoch": 6.44797210497339, "grad_norm": 0.09363888204097748, "learning_rate": 8.567604611340104e-06, "loss": 0.0382, "num_input_tokens_seen": 75823488, "step": 35135 }, { "epoch": 6.448889704532942, "grad_norm": 1.2029300928115845, "learning_rate": 8.567043528299474e-06, "loss": 0.0027, "num_input_tokens_seen": 75834368, "step": 35140 }, { "epoch": 6.449807304092494, "grad_norm": 6.2488555908203125, "learning_rate": 8.566482353769708e-06, "loss": 0.1304, "num_input_tokens_seen": 75845760, "step": 35145 }, { "epoch": 6.450724903652047, "grad_norm": 0.03280071169137955, "learning_rate": 8.565921087765195e-06, "loss": 0.0019, "num_input_tokens_seen": 75855552, "step": 35150 }, { "epoch": 6.451642503211598, "grad_norm": 0.11932391673326492, "learning_rate": 8.565359730300332e-06, "loss": 0.0022, "num_input_tokens_seen": 75866560, "step": 35155 }, { "epoch": 6.452560102771151, "grad_norm": 0.327837735414505, "learning_rate": 8.56479828138952e-06, "loss": 0.1117, "num_input_tokens_seen": 75876384, "step": 35160 }, { "epoch": 6.453477702330703, "grad_norm": 0.09507694095373154, "learning_rate": 8.564236741047154e-06, "loss": 0.0022, "num_input_tokens_seen": 75887008, "step": 35165 }, { "epoch": 6.454395301890255, "grad_norm": 8.335799217224121, "learning_rate": 8.56367510928764e-06, "loss": 0.1551, "num_input_tokens_seen": 75898944, "step": 35170 }, { "epoch": 6.455312901449807, "grad_norm": 0.648408055305481, "learning_rate": 8.563113386125385e-06, "loss": 0.1974, "num_input_tokens_seen": 75911200, "step": 35175 }, { "epoch": 6.45623050100936, "grad_norm": 0.06438038498163223, "learning_rate": 8.562551571574793e-06, "loss": 0.1161, "num_input_tokens_seen": 75921920, "step": 35180 }, { "epoch": 6.4571481005689115, "grad_norm": 0.08735982328653336, "learning_rate": 8.561989665650276e-06, "loss": 0.2221, "num_input_tokens_seen": 75932864, "step": 35185 }, { "epoch": 6.458065700128464, "grad_norm": 0.7467524409294128, "learning_rate": 8.561427668366243e-06, "loss": 0.0539, "num_input_tokens_seen": 75943968, "step": 35190 }, { "epoch": 6.4589832996880165, "grad_norm": 0.0934731662273407, "learning_rate": 8.56086557973711e-06, "loss": 0.2157, "num_input_tokens_seen": 75954784, "step": 35195 }, { "epoch": 6.459900899247568, "grad_norm": 0.3810057044029236, "learning_rate": 8.560303399777294e-06, "loss": 0.0056, "num_input_tokens_seen": 75965600, "step": 35200 }, { "epoch": 6.460818498807121, "grad_norm": 51.86098861694336, "learning_rate": 8.559741128501214e-06, "loss": 0.1496, "num_input_tokens_seen": 75976032, "step": 35205 }, { "epoch": 6.461736098366673, "grad_norm": 18.799253463745117, "learning_rate": 8.55917876592329e-06, "loss": 0.1039, "num_input_tokens_seen": 75986848, "step": 35210 }, { "epoch": 6.462653697926225, "grad_norm": 0.3409002125263214, "learning_rate": 8.558616312057948e-06, "loss": 0.19, "num_input_tokens_seen": 75997120, "step": 35215 }, { "epoch": 6.463571297485777, "grad_norm": 0.11662192642688751, "learning_rate": 8.558053766919614e-06, "loss": 0.0041, "num_input_tokens_seen": 76007264, "step": 35220 }, { "epoch": 6.46448889704533, "grad_norm": 0.10630404949188232, "learning_rate": 8.557491130522713e-06, "loss": 0.0021, "num_input_tokens_seen": 76017568, "step": 35225 }, { "epoch": 6.465406496604881, "grad_norm": 0.1420217603445053, "learning_rate": 8.55692840288168e-06, "loss": 0.0026, "num_input_tokens_seen": 76028928, "step": 35230 }, { "epoch": 6.466324096164434, "grad_norm": 0.03704974427819252, "learning_rate": 8.556365584010946e-06, "loss": 0.001, "num_input_tokens_seen": 76040000, "step": 35235 }, { "epoch": 6.467241695723986, "grad_norm": 0.022052213549613953, "learning_rate": 8.555802673924945e-06, "loss": 0.002, "num_input_tokens_seen": 76050144, "step": 35240 }, { "epoch": 6.468159295283538, "grad_norm": 0.05265188217163086, "learning_rate": 8.555239672638119e-06, "loss": 0.1431, "num_input_tokens_seen": 76060448, "step": 35245 }, { "epoch": 6.469076894843091, "grad_norm": 23.779592514038086, "learning_rate": 8.554676580164903e-06, "loss": 0.0096, "num_input_tokens_seen": 76072256, "step": 35250 }, { "epoch": 6.469994494402643, "grad_norm": 0.10088527202606201, "learning_rate": 8.554113396519744e-06, "loss": 0.1483, "num_input_tokens_seen": 76083680, "step": 35255 }, { "epoch": 6.470912093962195, "grad_norm": 0.020596303045749664, "learning_rate": 8.553550121717083e-06, "loss": 0.0009, "num_input_tokens_seen": 76093760, "step": 35260 }, { "epoch": 6.471829693521747, "grad_norm": 86.289794921875, "learning_rate": 8.55298675577137e-06, "loss": 0.211, "num_input_tokens_seen": 76104672, "step": 35265 }, { "epoch": 6.4727472930813, "grad_norm": 0.05577865615487099, "learning_rate": 8.552423298697052e-06, "loss": 0.0964, "num_input_tokens_seen": 76114528, "step": 35270 }, { "epoch": 6.473664892640851, "grad_norm": 14.448527336120605, "learning_rate": 8.551859750508584e-06, "loss": 0.0846, "num_input_tokens_seen": 76124992, "step": 35275 }, { "epoch": 6.474582492200404, "grad_norm": 0.23568861186504364, "learning_rate": 8.551296111220418e-06, "loss": 0.0015, "num_input_tokens_seen": 76135840, "step": 35280 }, { "epoch": 6.475500091759956, "grad_norm": 0.5541033744812012, "learning_rate": 8.55073238084701e-06, "loss": 0.283, "num_input_tokens_seen": 76146528, "step": 35285 }, { "epoch": 6.476417691319508, "grad_norm": 0.014620177447795868, "learning_rate": 8.550168559402819e-06, "loss": 0.0038, "num_input_tokens_seen": 76156064, "step": 35290 }, { "epoch": 6.4773352908790605, "grad_norm": 0.4840647578239441, "learning_rate": 8.549604646902307e-06, "loss": 0.1643, "num_input_tokens_seen": 76165760, "step": 35295 }, { "epoch": 6.478252890438613, "grad_norm": 0.03275398910045624, "learning_rate": 8.549040643359938e-06, "loss": 0.4172, "num_input_tokens_seen": 76175936, "step": 35300 }, { "epoch": 6.479170489998165, "grad_norm": 0.1556641161441803, "learning_rate": 8.548476548790177e-06, "loss": 0.0017, "num_input_tokens_seen": 76186912, "step": 35305 }, { "epoch": 6.480088089557717, "grad_norm": 0.1942114681005478, "learning_rate": 8.547912363207492e-06, "loss": 0.1321, "num_input_tokens_seen": 76196096, "step": 35310 }, { "epoch": 6.48100568911727, "grad_norm": 0.04684874787926674, "learning_rate": 8.547348086626354e-06, "loss": 0.0025, "num_input_tokens_seen": 76206592, "step": 35315 }, { "epoch": 6.481923288676821, "grad_norm": 0.09186289459466934, "learning_rate": 8.546783719061234e-06, "loss": 0.0016, "num_input_tokens_seen": 76218272, "step": 35320 }, { "epoch": 6.482840888236374, "grad_norm": 19.859222412109375, "learning_rate": 8.54621926052661e-06, "loss": 0.1207, "num_input_tokens_seen": 76229888, "step": 35325 }, { "epoch": 6.483758487795926, "grad_norm": 66.4852066040039, "learning_rate": 8.545654711036957e-06, "loss": 0.15, "num_input_tokens_seen": 76240768, "step": 35330 }, { "epoch": 6.484676087355478, "grad_norm": 0.6040542721748352, "learning_rate": 8.545090070606757e-06, "loss": 0.0031, "num_input_tokens_seen": 76250720, "step": 35335 }, { "epoch": 6.48559368691503, "grad_norm": 0.09116031974554062, "learning_rate": 8.544525339250491e-06, "loss": 0.0322, "num_input_tokens_seen": 76261760, "step": 35340 }, { "epoch": 6.486511286474583, "grad_norm": 17.65090560913086, "learning_rate": 8.543960516982643e-06, "loss": 0.1084, "num_input_tokens_seen": 76273600, "step": 35345 }, { "epoch": 6.4874288860341345, "grad_norm": 0.050243642181158066, "learning_rate": 8.5433956038177e-06, "loss": 0.0713, "num_input_tokens_seen": 76284320, "step": 35350 }, { "epoch": 6.488346485593687, "grad_norm": 0.01383553072810173, "learning_rate": 8.542830599770153e-06, "loss": 0.0894, "num_input_tokens_seen": 76294208, "step": 35355 }, { "epoch": 6.4892640851532395, "grad_norm": 118.8952407836914, "learning_rate": 8.542265504854492e-06, "loss": 0.1742, "num_input_tokens_seen": 76304992, "step": 35360 }, { "epoch": 6.490181684712791, "grad_norm": 0.17565278708934784, "learning_rate": 8.541700319085209e-06, "loss": 0.0018, "num_input_tokens_seen": 76316832, "step": 35365 }, { "epoch": 6.491099284272344, "grad_norm": 0.024144841358065605, "learning_rate": 8.541135042476804e-06, "loss": 0.0011, "num_input_tokens_seen": 76328544, "step": 35370 }, { "epoch": 6.492016883831896, "grad_norm": 0.3488900363445282, "learning_rate": 8.54056967504377e-06, "loss": 0.1529, "num_input_tokens_seen": 76339616, "step": 35375 }, { "epoch": 6.492934483391448, "grad_norm": 0.025693634524941444, "learning_rate": 8.540004216800614e-06, "loss": 0.0007, "num_input_tokens_seen": 76350720, "step": 35380 }, { "epoch": 6.493852082951, "grad_norm": 0.006220504175871611, "learning_rate": 8.539438667761836e-06, "loss": 0.1539, "num_input_tokens_seen": 76362464, "step": 35385 }, { "epoch": 6.494769682510553, "grad_norm": 1.7014386653900146, "learning_rate": 8.538873027941943e-06, "loss": 0.12, "num_input_tokens_seen": 76373856, "step": 35390 }, { "epoch": 6.495687282070104, "grad_norm": 0.9431385397911072, "learning_rate": 8.53830729735544e-06, "loss": 0.2151, "num_input_tokens_seen": 76385024, "step": 35395 }, { "epoch": 6.496604881629657, "grad_norm": 0.026742763817310333, "learning_rate": 8.537741476016838e-06, "loss": 0.2077, "num_input_tokens_seen": 76395424, "step": 35400 }, { "epoch": 6.497522481189209, "grad_norm": 13.103536605834961, "learning_rate": 8.537175563940652e-06, "loss": 0.2612, "num_input_tokens_seen": 76407168, "step": 35405 }, { "epoch": 6.498440080748761, "grad_norm": 93.38562774658203, "learning_rate": 8.536609561141394e-06, "loss": 0.0774, "num_input_tokens_seen": 76419456, "step": 35410 }, { "epoch": 6.4993576803083135, "grad_norm": 0.02888774871826172, "learning_rate": 8.536043467633582e-06, "loss": 0.0022, "num_input_tokens_seen": 76429920, "step": 35415 }, { "epoch": 6.500275279867866, "grad_norm": 5.485649108886719, "learning_rate": 8.535477283431736e-06, "loss": 0.1665, "num_input_tokens_seen": 76440768, "step": 35420 }, { "epoch": 6.501192879427418, "grad_norm": 0.12839095294475555, "learning_rate": 8.534911008550378e-06, "loss": 0.2103, "num_input_tokens_seen": 76451296, "step": 35425 }, { "epoch": 6.50211047898697, "grad_norm": 0.028224201872944832, "learning_rate": 8.53434464300403e-06, "loss": 0.3539, "num_input_tokens_seen": 76462592, "step": 35430 }, { "epoch": 6.503028078546523, "grad_norm": 0.01711430586874485, "learning_rate": 8.533778186807217e-06, "loss": 0.029, "num_input_tokens_seen": 76474624, "step": 35435 }, { "epoch": 6.503945678106074, "grad_norm": 10.970905303955078, "learning_rate": 8.533211639974474e-06, "loss": 0.1956, "num_input_tokens_seen": 76485600, "step": 35440 }, { "epoch": 6.504863277665627, "grad_norm": 44.318607330322266, "learning_rate": 8.532645002520328e-06, "loss": 0.2892, "num_input_tokens_seen": 76495360, "step": 35445 }, { "epoch": 6.505780877225179, "grad_norm": 0.05809528008103371, "learning_rate": 8.532078274459313e-06, "loss": 0.1247, "num_input_tokens_seen": 76505504, "step": 35450 }, { "epoch": 6.506698476784731, "grad_norm": 0.0894770547747612, "learning_rate": 8.531511455805964e-06, "loss": 0.0055, "num_input_tokens_seen": 76516192, "step": 35455 }, { "epoch": 6.507616076344283, "grad_norm": 2.9376227855682373, "learning_rate": 8.530944546574818e-06, "loss": 0.1347, "num_input_tokens_seen": 76526784, "step": 35460 }, { "epoch": 6.508533675903836, "grad_norm": 0.14703702926635742, "learning_rate": 8.530377546780417e-06, "loss": 0.0443, "num_input_tokens_seen": 76536896, "step": 35465 }, { "epoch": 6.5094512754633875, "grad_norm": 0.2295311540365219, "learning_rate": 8.529810456437303e-06, "loss": 0.0042, "num_input_tokens_seen": 76546528, "step": 35470 }, { "epoch": 6.51036887502294, "grad_norm": 31.220983505249023, "learning_rate": 8.529243275560025e-06, "loss": 0.0939, "num_input_tokens_seen": 76558016, "step": 35475 }, { "epoch": 6.5112864745824925, "grad_norm": 0.09241321682929993, "learning_rate": 8.528676004163124e-06, "loss": 0.1446, "num_input_tokens_seen": 76568576, "step": 35480 }, { "epoch": 6.512204074142044, "grad_norm": 0.051372017711400986, "learning_rate": 8.528108642261154e-06, "loss": 0.1887, "num_input_tokens_seen": 76579008, "step": 35485 }, { "epoch": 6.513121673701597, "grad_norm": 0.02893211506307125, "learning_rate": 8.527541189868664e-06, "loss": 0.3262, "num_input_tokens_seen": 76590880, "step": 35490 }, { "epoch": 6.514039273261149, "grad_norm": 0.03627318888902664, "learning_rate": 8.526973647000212e-06, "loss": 0.0036, "num_input_tokens_seen": 76602176, "step": 35495 }, { "epoch": 6.514956872820701, "grad_norm": 0.13144226372241974, "learning_rate": 8.52640601367035e-06, "loss": 0.0871, "num_input_tokens_seen": 76612992, "step": 35500 }, { "epoch": 6.515874472380253, "grad_norm": 0.09506073594093323, "learning_rate": 8.52583828989364e-06, "loss": 0.1799, "num_input_tokens_seen": 76623488, "step": 35505 }, { "epoch": 6.516792071939806, "grad_norm": 0.05527747422456741, "learning_rate": 8.525270475684642e-06, "loss": 0.0128, "num_input_tokens_seen": 76633504, "step": 35510 }, { "epoch": 6.517709671499357, "grad_norm": 0.022965213283896446, "learning_rate": 8.52470257105792e-06, "loss": 0.0017, "num_input_tokens_seen": 76642976, "step": 35515 }, { "epoch": 6.51862727105891, "grad_norm": 0.04957602918148041, "learning_rate": 8.52413457602804e-06, "loss": 0.0012, "num_input_tokens_seen": 76654528, "step": 35520 }, { "epoch": 6.519544870618462, "grad_norm": 24.548776626586914, "learning_rate": 8.52356649060957e-06, "loss": 0.1724, "num_input_tokens_seen": 76666208, "step": 35525 }, { "epoch": 6.520462470178014, "grad_norm": 0.11112981289625168, "learning_rate": 8.52299831481708e-06, "loss": 0.0009, "num_input_tokens_seen": 76676416, "step": 35530 }, { "epoch": 6.521380069737567, "grad_norm": 8.95958423614502, "learning_rate": 8.522430048665145e-06, "loss": 0.0064, "num_input_tokens_seen": 76686240, "step": 35535 }, { "epoch": 6.522297669297119, "grad_norm": 0.12500962615013123, "learning_rate": 8.521861692168337e-06, "loss": 0.0009, "num_input_tokens_seen": 76696992, "step": 35540 }, { "epoch": 6.523215268856671, "grad_norm": 0.03208548203110695, "learning_rate": 8.521293245341235e-06, "loss": 0.0078, "num_input_tokens_seen": 76706432, "step": 35545 }, { "epoch": 6.524132868416223, "grad_norm": 0.019856305792927742, "learning_rate": 8.52072470819842e-06, "loss": 0.0018, "num_input_tokens_seen": 76717088, "step": 35550 }, { "epoch": 6.525050467975776, "grad_norm": 0.02221236191689968, "learning_rate": 8.520156080754471e-06, "loss": 0.3492, "num_input_tokens_seen": 76728352, "step": 35555 }, { "epoch": 6.525968067535327, "grad_norm": 46.31663131713867, "learning_rate": 8.519587363023978e-06, "loss": 0.1723, "num_input_tokens_seen": 76739712, "step": 35560 }, { "epoch": 6.52688566709488, "grad_norm": 0.09245456010103226, "learning_rate": 8.51901855502152e-06, "loss": 0.0062, "num_input_tokens_seen": 76750944, "step": 35565 }, { "epoch": 6.527803266654432, "grad_norm": 0.5009512901306152, "learning_rate": 8.518449656761692e-06, "loss": 0.0007, "num_input_tokens_seen": 76762208, "step": 35570 }, { "epoch": 6.528720866213984, "grad_norm": 0.04937846213579178, "learning_rate": 8.517880668259082e-06, "loss": 0.0593, "num_input_tokens_seen": 76772640, "step": 35575 }, { "epoch": 6.5296384657735365, "grad_norm": 12.11181640625, "learning_rate": 8.517311589528286e-06, "loss": 0.1026, "num_input_tokens_seen": 76782752, "step": 35580 }, { "epoch": 6.530556065333089, "grad_norm": 8.51332950592041, "learning_rate": 8.516742420583899e-06, "loss": 0.1323, "num_input_tokens_seen": 76793952, "step": 35585 }, { "epoch": 6.531473664892641, "grad_norm": 0.03481720760464668, "learning_rate": 8.51617316144052e-06, "loss": 0.1054, "num_input_tokens_seen": 76804832, "step": 35590 }, { "epoch": 6.532391264452193, "grad_norm": 0.020561708137392998, "learning_rate": 8.515603812112749e-06, "loss": 0.1648, "num_input_tokens_seen": 76815744, "step": 35595 }, { "epoch": 6.533308864011746, "grad_norm": 0.6876570582389832, "learning_rate": 8.515034372615188e-06, "loss": 0.0054, "num_input_tokens_seen": 76825408, "step": 35600 }, { "epoch": 6.534226463571297, "grad_norm": 0.04171961545944214, "learning_rate": 8.514464842962442e-06, "loss": 0.0007, "num_input_tokens_seen": 76837024, "step": 35605 }, { "epoch": 6.53514406313085, "grad_norm": 7.458317279815674, "learning_rate": 8.513895223169122e-06, "loss": 0.3141, "num_input_tokens_seen": 76847392, "step": 35610 }, { "epoch": 6.536061662690402, "grad_norm": 0.3929857015609741, "learning_rate": 8.513325513249835e-06, "loss": 0.0023, "num_input_tokens_seen": 76857664, "step": 35615 }, { "epoch": 6.536979262249954, "grad_norm": 0.15304696559906006, "learning_rate": 8.512755713219193e-06, "loss": 0.0315, "num_input_tokens_seen": 76867808, "step": 35620 }, { "epoch": 6.537896861809506, "grad_norm": 0.419389545917511, "learning_rate": 8.512185823091812e-06, "loss": 0.1049, "num_input_tokens_seen": 76878272, "step": 35625 }, { "epoch": 6.538814461369059, "grad_norm": 0.06914783269166946, "learning_rate": 8.511615842882307e-06, "loss": 0.0017, "num_input_tokens_seen": 76888256, "step": 35630 }, { "epoch": 6.5397320609286105, "grad_norm": 0.015246836468577385, "learning_rate": 8.511045772605299e-06, "loss": 0.1073, "num_input_tokens_seen": 76898176, "step": 35635 }, { "epoch": 6.540649660488163, "grad_norm": 18.335323333740234, "learning_rate": 8.510475612275409e-06, "loss": 0.1468, "num_input_tokens_seen": 76908704, "step": 35640 }, { "epoch": 6.5415672600477155, "grad_norm": 0.500738799571991, "learning_rate": 8.50990536190726e-06, "loss": 0.1685, "num_input_tokens_seen": 76919520, "step": 35645 }, { "epoch": 6.542484859607267, "grad_norm": 7.225144863128662, "learning_rate": 8.509335021515476e-06, "loss": 0.1134, "num_input_tokens_seen": 76929952, "step": 35650 }, { "epoch": 6.54340245916682, "grad_norm": 59.9366340637207, "learning_rate": 8.50876459111469e-06, "loss": 0.3057, "num_input_tokens_seen": 76940992, "step": 35655 }, { "epoch": 6.544320058726372, "grad_norm": 0.03856822848320007, "learning_rate": 8.50819407071953e-06, "loss": 0.1267, "num_input_tokens_seen": 76951776, "step": 35660 }, { "epoch": 6.545237658285924, "grad_norm": 0.46580594778060913, "learning_rate": 8.50762346034463e-06, "loss": 0.1104, "num_input_tokens_seen": 76962112, "step": 35665 }, { "epoch": 6.546155257845476, "grad_norm": 60.35948181152344, "learning_rate": 8.507052760004626e-06, "loss": 0.2084, "num_input_tokens_seen": 76972352, "step": 35670 }, { "epoch": 6.547072857405029, "grad_norm": 0.03826534003019333, "learning_rate": 8.50648196971415e-06, "loss": 0.0018, "num_input_tokens_seen": 76983072, "step": 35675 }, { "epoch": 6.54799045696458, "grad_norm": 4.910703659057617, "learning_rate": 8.505911089487848e-06, "loss": 0.0026, "num_input_tokens_seen": 76994176, "step": 35680 }, { "epoch": 6.548908056524133, "grad_norm": 13.885068893432617, "learning_rate": 8.505340119340362e-06, "loss": 0.2487, "num_input_tokens_seen": 77005760, "step": 35685 }, { "epoch": 6.549825656083685, "grad_norm": 0.04640265181660652, "learning_rate": 8.504769059286332e-06, "loss": 0.2885, "num_input_tokens_seen": 77017120, "step": 35690 }, { "epoch": 6.550743255643237, "grad_norm": 3.0678398609161377, "learning_rate": 8.504197909340409e-06, "loss": 0.1414, "num_input_tokens_seen": 77027520, "step": 35695 }, { "epoch": 6.5516608552027895, "grad_norm": 0.06007516384124756, "learning_rate": 8.50362666951724e-06, "loss": 0.2455, "num_input_tokens_seen": 77039232, "step": 35700 }, { "epoch": 6.552578454762342, "grad_norm": 25.046302795410156, "learning_rate": 8.503055339831477e-06, "loss": 0.0946, "num_input_tokens_seen": 77048576, "step": 35705 }, { "epoch": 6.553496054321894, "grad_norm": 37.657630920410156, "learning_rate": 8.502483920297774e-06, "loss": 0.193, "num_input_tokens_seen": 77060480, "step": 35710 }, { "epoch": 6.554413653881446, "grad_norm": 159.21685791015625, "learning_rate": 8.501912410930786e-06, "loss": 0.0331, "num_input_tokens_seen": 77071616, "step": 35715 }, { "epoch": 6.555331253440999, "grad_norm": 3.9298083782196045, "learning_rate": 8.501340811745174e-06, "loss": 0.1959, "num_input_tokens_seen": 77081760, "step": 35720 }, { "epoch": 6.55624885300055, "grad_norm": 0.03782803937792778, "learning_rate": 8.500769122755596e-06, "loss": 0.001, "num_input_tokens_seen": 77093376, "step": 35725 }, { "epoch": 6.557166452560103, "grad_norm": 0.5598686933517456, "learning_rate": 8.500197343976714e-06, "loss": 0.1363, "num_input_tokens_seen": 77104352, "step": 35730 }, { "epoch": 6.558084052119655, "grad_norm": 0.12341617047786713, "learning_rate": 8.499625475423197e-06, "loss": 0.0023, "num_input_tokens_seen": 77115232, "step": 35735 }, { "epoch": 6.559001651679207, "grad_norm": 0.02961844764649868, "learning_rate": 8.49905351710971e-06, "loss": 0.1433, "num_input_tokens_seen": 77126048, "step": 35740 }, { "epoch": 6.559919251238759, "grad_norm": 9.240961074829102, "learning_rate": 8.498481469050923e-06, "loss": 0.2954, "num_input_tokens_seen": 77137408, "step": 35745 }, { "epoch": 6.560836850798312, "grad_norm": 84.76765441894531, "learning_rate": 8.49790933126151e-06, "loss": 0.1256, "num_input_tokens_seen": 77148992, "step": 35750 }, { "epoch": 6.5617544503578635, "grad_norm": 0.13920429348945618, "learning_rate": 8.497337103756142e-06, "loss": 0.1058, "num_input_tokens_seen": 77160736, "step": 35755 }, { "epoch": 6.562672049917416, "grad_norm": 12.202836036682129, "learning_rate": 8.496764786549499e-06, "loss": 0.405, "num_input_tokens_seen": 77171968, "step": 35760 }, { "epoch": 6.5635896494769685, "grad_norm": 0.02936377562582493, "learning_rate": 8.496192379656257e-06, "loss": 0.1616, "num_input_tokens_seen": 77181888, "step": 35765 }, { "epoch": 6.56450724903652, "grad_norm": 0.0733199268579483, "learning_rate": 8.4956198830911e-06, "loss": 0.0023, "num_input_tokens_seen": 77192384, "step": 35770 }, { "epoch": 6.565424848596073, "grad_norm": 0.06253254413604736, "learning_rate": 8.49504729686871e-06, "loss": 0.1095, "num_input_tokens_seen": 77202272, "step": 35775 }, { "epoch": 6.566342448155625, "grad_norm": 16.278907775878906, "learning_rate": 8.494474621003776e-06, "loss": 0.2758, "num_input_tokens_seen": 77212608, "step": 35780 }, { "epoch": 6.567260047715177, "grad_norm": 0.045762863010168076, "learning_rate": 8.493901855510983e-06, "loss": 0.1398, "num_input_tokens_seen": 77223104, "step": 35785 }, { "epoch": 6.568177647274729, "grad_norm": 14.313539505004883, "learning_rate": 8.493329000405019e-06, "loss": 0.37, "num_input_tokens_seen": 77233664, "step": 35790 }, { "epoch": 6.569095246834282, "grad_norm": 0.09442233294248581, "learning_rate": 8.492756055700584e-06, "loss": 0.0071, "num_input_tokens_seen": 77243296, "step": 35795 }, { "epoch": 6.570012846393833, "grad_norm": 12.793174743652344, "learning_rate": 8.492183021412368e-06, "loss": 0.1215, "num_input_tokens_seen": 77254496, "step": 35800 }, { "epoch": 6.570930445953386, "grad_norm": 0.3611825704574585, "learning_rate": 8.49160989755507e-06, "loss": 0.0034, "num_input_tokens_seen": 77265184, "step": 35805 }, { "epoch": 6.571848045512938, "grad_norm": 1.090932846069336, "learning_rate": 8.491036684143391e-06, "loss": 0.0792, "num_input_tokens_seen": 77277184, "step": 35810 }, { "epoch": 6.57276564507249, "grad_norm": 15.735355377197266, "learning_rate": 8.490463381192031e-06, "loss": 0.1723, "num_input_tokens_seen": 77287776, "step": 35815 }, { "epoch": 6.573683244632043, "grad_norm": 0.04408983141183853, "learning_rate": 8.489889988715696e-06, "loss": 0.1587, "num_input_tokens_seen": 77298496, "step": 35820 }, { "epoch": 6.574600844191595, "grad_norm": 2.9690442085266113, "learning_rate": 8.48931650672909e-06, "loss": 0.0061, "num_input_tokens_seen": 77309376, "step": 35825 }, { "epoch": 6.575518443751147, "grad_norm": 0.05217493698000908, "learning_rate": 8.488742935246923e-06, "loss": 0.0903, "num_input_tokens_seen": 77320576, "step": 35830 }, { "epoch": 6.576436043310699, "grad_norm": 0.08651449531316757, "learning_rate": 8.488169274283908e-06, "loss": 0.203, "num_input_tokens_seen": 77331040, "step": 35835 }, { "epoch": 6.577353642870252, "grad_norm": 0.07030566781759262, "learning_rate": 8.487595523854758e-06, "loss": 0.0082, "num_input_tokens_seen": 77343200, "step": 35840 }, { "epoch": 6.578271242429803, "grad_norm": 0.07236619293689728, "learning_rate": 8.487021683974186e-06, "loss": 0.0382, "num_input_tokens_seen": 77354816, "step": 35845 }, { "epoch": 6.579188841989356, "grad_norm": 0.24426418542861938, "learning_rate": 8.486447754656912e-06, "loss": 0.3048, "num_input_tokens_seen": 77366304, "step": 35850 }, { "epoch": 6.580106441548908, "grad_norm": 0.10153818130493164, "learning_rate": 8.48587373591766e-06, "loss": 0.2263, "num_input_tokens_seen": 77376448, "step": 35855 }, { "epoch": 6.58102404110846, "grad_norm": 1.3718101978302002, "learning_rate": 8.485299627771146e-06, "loss": 0.0062, "num_input_tokens_seen": 77387488, "step": 35860 }, { "epoch": 6.5819416406680125, "grad_norm": 37.37001419067383, "learning_rate": 8.4847254302321e-06, "loss": 0.2174, "num_input_tokens_seen": 77398176, "step": 35865 }, { "epoch": 6.582859240227565, "grad_norm": 0.05028395354747772, "learning_rate": 8.484151143315247e-06, "loss": 0.1296, "num_input_tokens_seen": 77409536, "step": 35870 }, { "epoch": 6.583776839787117, "grad_norm": 0.02089865505695343, "learning_rate": 8.483576767035318e-06, "loss": 0.0081, "num_input_tokens_seen": 77419584, "step": 35875 }, { "epoch": 6.584694439346669, "grad_norm": 0.07096036523580551, "learning_rate": 8.483002301407042e-06, "loss": 0.0904, "num_input_tokens_seen": 77429536, "step": 35880 }, { "epoch": 6.585612038906222, "grad_norm": 2.2126033306121826, "learning_rate": 8.482427746445156e-06, "loss": 0.0947, "num_input_tokens_seen": 77440736, "step": 35885 }, { "epoch": 6.586529638465773, "grad_norm": 0.09384378045797348, "learning_rate": 8.481853102164397e-06, "loss": 0.0055, "num_input_tokens_seen": 77451392, "step": 35890 }, { "epoch": 6.587447238025326, "grad_norm": 0.2285768836736679, "learning_rate": 8.4812783685795e-06, "loss": 0.008, "num_input_tokens_seen": 77462080, "step": 35895 }, { "epoch": 6.588364837584878, "grad_norm": 0.034847699105739594, "learning_rate": 8.48070354570521e-06, "loss": 0.2827, "num_input_tokens_seen": 77471328, "step": 35900 }, { "epoch": 6.58928243714443, "grad_norm": 0.017814286053180695, "learning_rate": 8.480128633556269e-06, "loss": 0.0157, "num_input_tokens_seen": 77482592, "step": 35905 }, { "epoch": 6.590200036703982, "grad_norm": 0.034944597631692886, "learning_rate": 8.47955363214742e-06, "loss": 0.0928, "num_input_tokens_seen": 77492576, "step": 35910 }, { "epoch": 6.591117636263535, "grad_norm": 35.64570236206055, "learning_rate": 8.478978541493414e-06, "loss": 0.0501, "num_input_tokens_seen": 77503136, "step": 35915 }, { "epoch": 6.5920352358230865, "grad_norm": 5.0996832847595215, "learning_rate": 8.478403361609002e-06, "loss": 0.1787, "num_input_tokens_seen": 77513952, "step": 35920 }, { "epoch": 6.592952835382639, "grad_norm": 0.47362130880355835, "learning_rate": 8.477828092508932e-06, "loss": 0.0039, "num_input_tokens_seen": 77525280, "step": 35925 }, { "epoch": 6.5938704349421915, "grad_norm": 0.08150745928287506, "learning_rate": 8.477252734207965e-06, "loss": 0.2267, "num_input_tokens_seen": 77536416, "step": 35930 }, { "epoch": 6.594788034501743, "grad_norm": 0.05363437905907631, "learning_rate": 8.476677286720853e-06, "loss": 0.0068, "num_input_tokens_seen": 77546304, "step": 35935 }, { "epoch": 6.595705634061296, "grad_norm": 0.2099047601222992, "learning_rate": 8.476101750062357e-06, "loss": 0.3079, "num_input_tokens_seen": 77556864, "step": 35940 }, { "epoch": 6.596623233620848, "grad_norm": 1.1600335836410522, "learning_rate": 8.475526124247238e-06, "loss": 0.0039, "num_input_tokens_seen": 77567040, "step": 35945 }, { "epoch": 6.5975408331804, "grad_norm": 6.3551836013793945, "learning_rate": 8.47495040929026e-06, "loss": 0.1343, "num_input_tokens_seen": 77578656, "step": 35950 }, { "epoch": 6.598458432739952, "grad_norm": 0.06184316799044609, "learning_rate": 8.474374605206191e-06, "loss": 0.1094, "num_input_tokens_seen": 77589056, "step": 35955 }, { "epoch": 6.599376032299505, "grad_norm": 0.028103040531277657, "learning_rate": 8.473798712009798e-06, "loss": 0.003, "num_input_tokens_seen": 77599584, "step": 35960 }, { "epoch": 6.600293631859056, "grad_norm": 0.1701193004846573, "learning_rate": 8.473222729715852e-06, "loss": 0.1454, "num_input_tokens_seen": 77611488, "step": 35965 }, { "epoch": 6.601211231418609, "grad_norm": 0.025829479098320007, "learning_rate": 8.472646658339126e-06, "loss": 0.0041, "num_input_tokens_seen": 77622112, "step": 35970 }, { "epoch": 6.602128830978161, "grad_norm": 0.02309168130159378, "learning_rate": 8.472070497894394e-06, "loss": 0.0135, "num_input_tokens_seen": 77631904, "step": 35975 }, { "epoch": 6.603046430537713, "grad_norm": 0.19532468914985657, "learning_rate": 8.471494248396437e-06, "loss": 0.002, "num_input_tokens_seen": 77642240, "step": 35980 }, { "epoch": 6.6039640300972655, "grad_norm": 0.13456064462661743, "learning_rate": 8.47091790986003e-06, "loss": 0.2987, "num_input_tokens_seen": 77651904, "step": 35985 }, { "epoch": 6.604881629656818, "grad_norm": 0.13443255424499512, "learning_rate": 8.47034148229996e-06, "loss": 0.2104, "num_input_tokens_seen": 77662944, "step": 35990 }, { "epoch": 6.60579922921637, "grad_norm": 10.70816421508789, "learning_rate": 8.469764965731011e-06, "loss": 0.2313, "num_input_tokens_seen": 77674592, "step": 35995 }, { "epoch": 6.606716828775922, "grad_norm": 0.1283530741930008, "learning_rate": 8.469188360167966e-06, "loss": 0.0141, "num_input_tokens_seen": 77686400, "step": 36000 }, { "epoch": 6.607634428335475, "grad_norm": 0.3415726125240326, "learning_rate": 8.468611665625616e-06, "loss": 0.0029, "num_input_tokens_seen": 77697952, "step": 36005 }, { "epoch": 6.608552027895026, "grad_norm": 21.643199920654297, "learning_rate": 8.468034882118753e-06, "loss": 0.0127, "num_input_tokens_seen": 77710400, "step": 36010 }, { "epoch": 6.609469627454579, "grad_norm": 0.021252673119306564, "learning_rate": 8.467458009662173e-06, "loss": 0.0775, "num_input_tokens_seen": 77719584, "step": 36015 }, { "epoch": 6.610387227014131, "grad_norm": 42.385501861572266, "learning_rate": 8.466881048270666e-06, "loss": 0.1251, "num_input_tokens_seen": 77729632, "step": 36020 }, { "epoch": 6.611304826573683, "grad_norm": 0.15022899210453033, "learning_rate": 8.466303997959035e-06, "loss": 0.0014, "num_input_tokens_seen": 77740160, "step": 36025 }, { "epoch": 6.612222426133235, "grad_norm": 0.06910330802202225, "learning_rate": 8.465726858742079e-06, "loss": 0.0441, "num_input_tokens_seen": 77750592, "step": 36030 }, { "epoch": 6.613140025692788, "grad_norm": 0.019354119896888733, "learning_rate": 8.465149630634598e-06, "loss": 0.0007, "num_input_tokens_seen": 77759200, "step": 36035 }, { "epoch": 6.6140576252523395, "grad_norm": 0.012573718093335629, "learning_rate": 8.464572313651401e-06, "loss": 0.1264, "num_input_tokens_seen": 77769792, "step": 36040 }, { "epoch": 6.614975224811892, "grad_norm": 0.07761550694704056, "learning_rate": 8.463994907807294e-06, "loss": 0.0015, "num_input_tokens_seen": 77779104, "step": 36045 }, { "epoch": 6.6158928243714445, "grad_norm": 0.8695674538612366, "learning_rate": 8.463417413117087e-06, "loss": 0.0485, "num_input_tokens_seen": 77790560, "step": 36050 }, { "epoch": 6.616810423930996, "grad_norm": 0.010336354374885559, "learning_rate": 8.46283982959559e-06, "loss": 0.0006, "num_input_tokens_seen": 77801600, "step": 36055 }, { "epoch": 6.617728023490549, "grad_norm": 110.4230728149414, "learning_rate": 8.462262157257618e-06, "loss": 0.1107, "num_input_tokens_seen": 77810560, "step": 36060 }, { "epoch": 6.618645623050101, "grad_norm": 0.09333592653274536, "learning_rate": 8.461684396117989e-06, "loss": 0.266, "num_input_tokens_seen": 77820576, "step": 36065 }, { "epoch": 6.619563222609653, "grad_norm": 3.4226815700531006, "learning_rate": 8.461106546191518e-06, "loss": 0.2451, "num_input_tokens_seen": 77831424, "step": 36070 }, { "epoch": 6.620480822169205, "grad_norm": 1.0219093561172485, "learning_rate": 8.46052860749303e-06, "loss": 0.1078, "num_input_tokens_seen": 77842400, "step": 36075 }, { "epoch": 6.621398421728758, "grad_norm": 0.06620615720748901, "learning_rate": 8.459950580037346e-06, "loss": 0.0879, "num_input_tokens_seen": 77853632, "step": 36080 }, { "epoch": 6.622316021288309, "grad_norm": 0.06375099718570709, "learning_rate": 8.459372463839293e-06, "loss": 0.0042, "num_input_tokens_seen": 77864448, "step": 36085 }, { "epoch": 6.623233620847862, "grad_norm": 0.06163768470287323, "learning_rate": 8.458794258913697e-06, "loss": 0.2952, "num_input_tokens_seen": 77876064, "step": 36090 }, { "epoch": 6.624151220407414, "grad_norm": 4.743185043334961, "learning_rate": 8.45821596527539e-06, "loss": 0.1756, "num_input_tokens_seen": 77886752, "step": 36095 }, { "epoch": 6.625068819966966, "grad_norm": 0.049735136330127716, "learning_rate": 8.457637582939202e-06, "loss": 0.0039, "num_input_tokens_seen": 77897856, "step": 36100 }, { "epoch": 6.625986419526519, "grad_norm": 0.3448864221572876, "learning_rate": 8.45705911191997e-06, "loss": 0.0037, "num_input_tokens_seen": 77909088, "step": 36105 }, { "epoch": 6.626904019086071, "grad_norm": 25.13471794128418, "learning_rate": 8.456480552232528e-06, "loss": 0.1029, "num_input_tokens_seen": 77921184, "step": 36110 }, { "epoch": 6.627821618645623, "grad_norm": 0.23219820857048035, "learning_rate": 8.455901903891719e-06, "loss": 0.0946, "num_input_tokens_seen": 77932160, "step": 36115 }, { "epoch": 6.628739218205175, "grad_norm": 0.1907910406589508, "learning_rate": 8.45532316691238e-06, "loss": 0.0054, "num_input_tokens_seen": 77942912, "step": 36120 }, { "epoch": 6.629656817764728, "grad_norm": 0.4106607735157013, "learning_rate": 8.454744341309359e-06, "loss": 0.3263, "num_input_tokens_seen": 77953504, "step": 36125 }, { "epoch": 6.630574417324279, "grad_norm": 0.09717090427875519, "learning_rate": 8.454165427097499e-06, "loss": 0.2123, "num_input_tokens_seen": 77964096, "step": 36130 }, { "epoch": 6.631492016883832, "grad_norm": 0.0953771248459816, "learning_rate": 8.45358642429165e-06, "loss": 0.2273, "num_input_tokens_seen": 77974208, "step": 36135 }, { "epoch": 6.632409616443384, "grad_norm": 0.0396888330578804, "learning_rate": 8.453007332906662e-06, "loss": 0.1263, "num_input_tokens_seen": 77983232, "step": 36140 }, { "epoch": 6.633327216002936, "grad_norm": 0.13179676234722137, "learning_rate": 8.452428152957386e-06, "loss": 0.1113, "num_input_tokens_seen": 77993696, "step": 36145 }, { "epoch": 6.6342448155624885, "grad_norm": 8.018525123596191, "learning_rate": 8.45184888445868e-06, "loss": 0.216, "num_input_tokens_seen": 78005152, "step": 36150 }, { "epoch": 6.635162415122041, "grad_norm": 10.39730167388916, "learning_rate": 8.451269527425399e-06, "loss": 0.2114, "num_input_tokens_seen": 78014944, "step": 36155 }, { "epoch": 6.636080014681593, "grad_norm": 0.06730829179286957, "learning_rate": 8.450690081872405e-06, "loss": 0.008, "num_input_tokens_seen": 78026944, "step": 36160 }, { "epoch": 6.636997614241145, "grad_norm": 11.97335147857666, "learning_rate": 8.450110547814557e-06, "loss": 0.0762, "num_input_tokens_seen": 78038144, "step": 36165 }, { "epoch": 6.637915213800698, "grad_norm": 0.07649454474449158, "learning_rate": 8.449530925266721e-06, "loss": 0.1162, "num_input_tokens_seen": 78047424, "step": 36170 }, { "epoch": 6.638832813360249, "grad_norm": 0.07674681395292282, "learning_rate": 8.448951214243763e-06, "loss": 0.0042, "num_input_tokens_seen": 78057696, "step": 36175 }, { "epoch": 6.639750412919802, "grad_norm": 0.1070045754313469, "learning_rate": 8.448371414760553e-06, "loss": 0.1895, "num_input_tokens_seen": 78068704, "step": 36180 }, { "epoch": 6.640668012479354, "grad_norm": 13.086508750915527, "learning_rate": 8.44779152683196e-06, "loss": 0.253, "num_input_tokens_seen": 78078752, "step": 36185 }, { "epoch": 6.641585612038906, "grad_norm": 0.08117464929819107, "learning_rate": 8.447211550472858e-06, "loss": 0.0727, "num_input_tokens_seen": 78089952, "step": 36190 }, { "epoch": 6.642503211598458, "grad_norm": 33.98434829711914, "learning_rate": 8.44663148569812e-06, "loss": 0.1232, "num_input_tokens_seen": 78100992, "step": 36195 }, { "epoch": 6.643420811158011, "grad_norm": 0.29102033376693726, "learning_rate": 8.44605133252263e-06, "loss": 0.1647, "num_input_tokens_seen": 78111360, "step": 36200 }, { "epoch": 6.6443384107175625, "grad_norm": 0.09435393661260605, "learning_rate": 8.445471090961262e-06, "loss": 0.0035, "num_input_tokens_seen": 78122112, "step": 36205 }, { "epoch": 6.645256010277115, "grad_norm": 0.1156749501824379, "learning_rate": 8.444890761028902e-06, "loss": 0.0945, "num_input_tokens_seen": 78133792, "step": 36210 }, { "epoch": 6.6461736098366675, "grad_norm": 4.986945152282715, "learning_rate": 8.444310342740432e-06, "loss": 0.4317, "num_input_tokens_seen": 78144032, "step": 36215 }, { "epoch": 6.647091209396219, "grad_norm": 0.08822376281023026, "learning_rate": 8.44372983611074e-06, "loss": 0.1602, "num_input_tokens_seen": 78155104, "step": 36220 }, { "epoch": 6.648008808955772, "grad_norm": 0.08297161757946014, "learning_rate": 8.443149241154716e-06, "loss": 0.0029, "num_input_tokens_seen": 78166368, "step": 36225 }, { "epoch": 6.648926408515324, "grad_norm": 0.18814890086650848, "learning_rate": 8.442568557887248e-06, "loss": 0.2252, "num_input_tokens_seen": 78177600, "step": 36230 }, { "epoch": 6.649844008074876, "grad_norm": 0.7475137114524841, "learning_rate": 8.441987786323234e-06, "loss": 0.361, "num_input_tokens_seen": 78188672, "step": 36235 }, { "epoch": 6.650761607634428, "grad_norm": 0.37857967615127563, "learning_rate": 8.441406926477567e-06, "loss": 0.0423, "num_input_tokens_seen": 78198848, "step": 36240 }, { "epoch": 6.651679207193981, "grad_norm": 16.386484146118164, "learning_rate": 8.440825978365145e-06, "loss": 0.1446, "num_input_tokens_seen": 78208096, "step": 36245 }, { "epoch": 6.652596806753532, "grad_norm": 0.11528607457876205, "learning_rate": 8.440244942000873e-06, "loss": 0.072, "num_input_tokens_seen": 78217248, "step": 36250 }, { "epoch": 6.653514406313085, "grad_norm": 24.16376495361328, "learning_rate": 8.439663817399647e-06, "loss": 0.2346, "num_input_tokens_seen": 78228928, "step": 36255 }, { "epoch": 6.654432005872637, "grad_norm": 0.2044648826122284, "learning_rate": 8.439082604576376e-06, "loss": 0.0041, "num_input_tokens_seen": 78239648, "step": 36260 }, { "epoch": 6.655349605432189, "grad_norm": 0.828539252281189, "learning_rate": 8.438501303545966e-06, "loss": 0.006, "num_input_tokens_seen": 78250080, "step": 36265 }, { "epoch": 6.6562672049917415, "grad_norm": 0.03167039155960083, "learning_rate": 8.437919914323326e-06, "loss": 0.0084, "num_input_tokens_seen": 78261024, "step": 36270 }, { "epoch": 6.657184804551294, "grad_norm": 0.21129058301448822, "learning_rate": 8.43733843692337e-06, "loss": 0.0026, "num_input_tokens_seen": 78271296, "step": 36275 }, { "epoch": 6.658102404110846, "grad_norm": 0.05472542345523834, "learning_rate": 8.43675687136101e-06, "loss": 0.0052, "num_input_tokens_seen": 78281856, "step": 36280 }, { "epoch": 6.659020003670398, "grad_norm": 0.03630317002534866, "learning_rate": 8.436175217651164e-06, "loss": 0.1956, "num_input_tokens_seen": 78292640, "step": 36285 }, { "epoch": 6.659937603229951, "grad_norm": 1.1156140565872192, "learning_rate": 8.435593475808747e-06, "loss": 0.0501, "num_input_tokens_seen": 78302880, "step": 36290 }, { "epoch": 6.660855202789502, "grad_norm": 0.023335501551628113, "learning_rate": 8.435011645848683e-06, "loss": 0.1402, "num_input_tokens_seen": 78313216, "step": 36295 }, { "epoch": 6.661772802349055, "grad_norm": 0.08825031667947769, "learning_rate": 8.434429727785895e-06, "loss": 0.2019, "num_input_tokens_seen": 78323520, "step": 36300 }, { "epoch": 6.662690401908607, "grad_norm": 0.5780316591262817, "learning_rate": 8.433847721635307e-06, "loss": 0.0085, "num_input_tokens_seen": 78333760, "step": 36305 }, { "epoch": 6.663608001468159, "grad_norm": 0.28797516226768494, "learning_rate": 8.433265627411846e-06, "loss": 0.2214, "num_input_tokens_seen": 78345760, "step": 36310 }, { "epoch": 6.664525601027711, "grad_norm": 0.05165529623627663, "learning_rate": 8.432683445130444e-06, "loss": 0.0156, "num_input_tokens_seen": 78357344, "step": 36315 }, { "epoch": 6.665443200587264, "grad_norm": 0.07916511595249176, "learning_rate": 8.432101174806031e-06, "loss": 0.001, "num_input_tokens_seen": 78368832, "step": 36320 }, { "epoch": 6.6663608001468155, "grad_norm": 0.11403213441371918, "learning_rate": 8.431518816453541e-06, "loss": 0.2234, "num_input_tokens_seen": 78380384, "step": 36325 }, { "epoch": 6.667278399706368, "grad_norm": 0.25324010848999023, "learning_rate": 8.430936370087911e-06, "loss": 0.1122, "num_input_tokens_seen": 78391744, "step": 36330 }, { "epoch": 6.6681959992659205, "grad_norm": 0.524118959903717, "learning_rate": 8.430353835724085e-06, "loss": 0.0058, "num_input_tokens_seen": 78402912, "step": 36335 }, { "epoch": 6.669113598825472, "grad_norm": 20.14899444580078, "learning_rate": 8.429771213376996e-06, "loss": 0.2245, "num_input_tokens_seen": 78414368, "step": 36340 }, { "epoch": 6.670031198385025, "grad_norm": 0.02343880943953991, "learning_rate": 8.429188503061593e-06, "loss": 0.169, "num_input_tokens_seen": 78424576, "step": 36345 }, { "epoch": 6.670948797944577, "grad_norm": 0.16501368582248688, "learning_rate": 8.428605704792818e-06, "loss": 0.0033, "num_input_tokens_seen": 78435232, "step": 36350 }, { "epoch": 6.671866397504129, "grad_norm": 0.16623789072036743, "learning_rate": 8.428022818585622e-06, "loss": 0.0019, "num_input_tokens_seen": 78445440, "step": 36355 }, { "epoch": 6.672783997063681, "grad_norm": 0.13785453140735626, "learning_rate": 8.427439844454952e-06, "loss": 0.154, "num_input_tokens_seen": 78456448, "step": 36360 }, { "epoch": 6.673701596623234, "grad_norm": 0.1127178966999054, "learning_rate": 8.426856782415765e-06, "loss": 0.0081, "num_input_tokens_seen": 78467584, "step": 36365 }, { "epoch": 6.674619196182785, "grad_norm": 0.19340986013412476, "learning_rate": 8.426273632483009e-06, "loss": 0.0503, "num_input_tokens_seen": 78479744, "step": 36370 }, { "epoch": 6.675536795742338, "grad_norm": 8.387823104858398, "learning_rate": 8.425690394671646e-06, "loss": 0.2765, "num_input_tokens_seen": 78490464, "step": 36375 }, { "epoch": 6.67645439530189, "grad_norm": 0.2870670557022095, "learning_rate": 8.425107068996635e-06, "loss": 0.211, "num_input_tokens_seen": 78500384, "step": 36380 }, { "epoch": 6.677371994861442, "grad_norm": 0.07186093926429749, "learning_rate": 8.424523655472934e-06, "loss": 0.0016, "num_input_tokens_seen": 78511360, "step": 36385 }, { "epoch": 6.678289594420995, "grad_norm": 0.048606015741825104, "learning_rate": 8.42394015411551e-06, "loss": 0.0362, "num_input_tokens_seen": 78522240, "step": 36390 }, { "epoch": 6.679207193980547, "grad_norm": 28.33457374572754, "learning_rate": 8.423356564939328e-06, "loss": 0.0823, "num_input_tokens_seen": 78533344, "step": 36395 }, { "epoch": 6.680124793540099, "grad_norm": 0.01902305707335472, "learning_rate": 8.422772887959355e-06, "loss": 0.0032, "num_input_tokens_seen": 78544800, "step": 36400 }, { "epoch": 6.681042393099651, "grad_norm": 0.06852978467941284, "learning_rate": 8.422189123190563e-06, "loss": 0.2408, "num_input_tokens_seen": 78555360, "step": 36405 }, { "epoch": 6.681959992659204, "grad_norm": 9.312301635742188, "learning_rate": 8.421605270647924e-06, "loss": 0.0051, "num_input_tokens_seen": 78566368, "step": 36410 }, { "epoch": 6.682877592218755, "grad_norm": 0.1276392787694931, "learning_rate": 8.421021330346412e-06, "loss": 0.1181, "num_input_tokens_seen": 78576608, "step": 36415 }, { "epoch": 6.683795191778308, "grad_norm": 0.05445076897740364, "learning_rate": 8.420437302301005e-06, "loss": 0.002, "num_input_tokens_seen": 78588032, "step": 36420 }, { "epoch": 6.68471279133786, "grad_norm": 0.0791107565164566, "learning_rate": 8.419853186526682e-06, "loss": 0.1513, "num_input_tokens_seen": 78599040, "step": 36425 }, { "epoch": 6.685630390897412, "grad_norm": 0.04651809483766556, "learning_rate": 8.419268983038427e-06, "loss": 0.1295, "num_input_tokens_seen": 78611840, "step": 36430 }, { "epoch": 6.6865479904569645, "grad_norm": 0.4033729135990143, "learning_rate": 8.41868469185122e-06, "loss": 0.0013, "num_input_tokens_seen": 78622496, "step": 36435 }, { "epoch": 6.687465590016517, "grad_norm": 0.05319172143936157, "learning_rate": 8.41810031298005e-06, "loss": 0.139, "num_input_tokens_seen": 78633344, "step": 36440 }, { "epoch": 6.688383189576069, "grad_norm": 0.48972395062446594, "learning_rate": 8.417515846439904e-06, "loss": 0.2798, "num_input_tokens_seen": 78644480, "step": 36445 }, { "epoch": 6.689300789135621, "grad_norm": 0.026153674349188805, "learning_rate": 8.416931292245773e-06, "loss": 0.1724, "num_input_tokens_seen": 78655552, "step": 36450 }, { "epoch": 6.690218388695174, "grad_norm": 39.49845504760742, "learning_rate": 8.416346650412651e-06, "loss": 0.0762, "num_input_tokens_seen": 78666944, "step": 36455 }, { "epoch": 6.691135988254725, "grad_norm": 0.11595521122217178, "learning_rate": 8.415761920955532e-06, "loss": 0.0023, "num_input_tokens_seen": 78678336, "step": 36460 }, { "epoch": 6.692053587814278, "grad_norm": 31.616695404052734, "learning_rate": 8.415177103889413e-06, "loss": 0.0724, "num_input_tokens_seen": 78689248, "step": 36465 }, { "epoch": 6.69297118737383, "grad_norm": 0.17798320949077606, "learning_rate": 8.414592199229297e-06, "loss": 0.1164, "num_input_tokens_seen": 78699744, "step": 36470 }, { "epoch": 6.693888786933382, "grad_norm": 0.005146236624568701, "learning_rate": 8.41400720699018e-06, "loss": 0.0014, "num_input_tokens_seen": 78709536, "step": 36475 }, { "epoch": 6.694806386492934, "grad_norm": 0.13833372294902802, "learning_rate": 8.413422127187072e-06, "loss": 0.0013, "num_input_tokens_seen": 78720064, "step": 36480 }, { "epoch": 6.695723986052487, "grad_norm": 20.742643356323242, "learning_rate": 8.412836959834975e-06, "loss": 0.5334, "num_input_tokens_seen": 78731136, "step": 36485 }, { "epoch": 6.6966415856120385, "grad_norm": 1.6798815727233887, "learning_rate": 8.4122517049489e-06, "loss": 0.0029, "num_input_tokens_seen": 78742432, "step": 36490 }, { "epoch": 6.697559185171591, "grad_norm": 9.245336532592773, "learning_rate": 8.411666362543857e-06, "loss": 0.1612, "num_input_tokens_seen": 78753376, "step": 36495 }, { "epoch": 6.6984767847311435, "grad_norm": 13.654786109924316, "learning_rate": 8.41108093263486e-06, "loss": 0.045, "num_input_tokens_seen": 78764224, "step": 36500 }, { "epoch": 6.699394384290695, "grad_norm": 11.55041217803955, "learning_rate": 8.410495415236923e-06, "loss": 0.0616, "num_input_tokens_seen": 78775360, "step": 36505 }, { "epoch": 6.700311983850248, "grad_norm": 0.025980085134506226, "learning_rate": 8.409909810365064e-06, "loss": 0.0059, "num_input_tokens_seen": 78787168, "step": 36510 }, { "epoch": 6.7012295834098, "grad_norm": 0.11439552903175354, "learning_rate": 8.409324118034304e-06, "loss": 0.0504, "num_input_tokens_seen": 78797824, "step": 36515 }, { "epoch": 6.702147182969352, "grad_norm": 1.3200665712356567, "learning_rate": 8.408738338259663e-06, "loss": 0.1123, "num_input_tokens_seen": 78807360, "step": 36520 }, { "epoch": 6.703064782528904, "grad_norm": 5.296145915985107, "learning_rate": 8.408152471056168e-06, "loss": 0.026, "num_input_tokens_seen": 78818272, "step": 36525 }, { "epoch": 6.703982382088457, "grad_norm": 1.3444794416427612, "learning_rate": 8.407566516438845e-06, "loss": 0.0955, "num_input_tokens_seen": 78829120, "step": 36530 }, { "epoch": 6.704899981648008, "grad_norm": 15.949514389038086, "learning_rate": 8.406980474422721e-06, "loss": 0.0088, "num_input_tokens_seen": 78840064, "step": 36535 }, { "epoch": 6.705817581207561, "grad_norm": 0.10594181716442108, "learning_rate": 8.406394345022828e-06, "loss": 0.0013, "num_input_tokens_seen": 78851840, "step": 36540 }, { "epoch": 6.706735180767113, "grad_norm": 0.17705285549163818, "learning_rate": 8.4058081282542e-06, "loss": 0.0152, "num_input_tokens_seen": 78862080, "step": 36545 }, { "epoch": 6.707652780326665, "grad_norm": 0.04102040454745293, "learning_rate": 8.405221824131873e-06, "loss": 0.0059, "num_input_tokens_seen": 78873728, "step": 36550 }, { "epoch": 6.7085703798862175, "grad_norm": 0.009947119280695915, "learning_rate": 8.404635432670882e-06, "loss": 0.0007, "num_input_tokens_seen": 78882560, "step": 36555 }, { "epoch": 6.70948797944577, "grad_norm": 10.484990119934082, "learning_rate": 8.40404895388627e-06, "loss": 0.2496, "num_input_tokens_seen": 78894592, "step": 36560 }, { "epoch": 6.710405579005322, "grad_norm": 0.06102362275123596, "learning_rate": 8.403462387793077e-06, "loss": 0.2974, "num_input_tokens_seen": 78905472, "step": 36565 }, { "epoch": 6.711323178564874, "grad_norm": 113.80573272705078, "learning_rate": 8.402875734406351e-06, "loss": 0.3127, "num_input_tokens_seen": 78916256, "step": 36570 }, { "epoch": 6.712240778124427, "grad_norm": 0.03902480751276016, "learning_rate": 8.402288993741134e-06, "loss": 0.1106, "num_input_tokens_seen": 78927424, "step": 36575 }, { "epoch": 6.713158377683978, "grad_norm": 0.12393737584352493, "learning_rate": 8.401702165812478e-06, "loss": 0.0024, "num_input_tokens_seen": 78938912, "step": 36580 }, { "epoch": 6.714075977243531, "grad_norm": 5.743107318878174, "learning_rate": 8.401115250635434e-06, "loss": 0.3906, "num_input_tokens_seen": 78949632, "step": 36585 }, { "epoch": 6.714993576803083, "grad_norm": 0.055949531495571136, "learning_rate": 8.400528248225055e-06, "loss": 0.1519, "num_input_tokens_seen": 78959840, "step": 36590 }, { "epoch": 6.715911176362635, "grad_norm": 0.01841874048113823, "learning_rate": 8.399941158596397e-06, "loss": 0.1115, "num_input_tokens_seen": 78972000, "step": 36595 }, { "epoch": 6.716828775922187, "grad_norm": 0.1091148629784584, "learning_rate": 8.399353981764516e-06, "loss": 0.0961, "num_input_tokens_seen": 78982496, "step": 36600 }, { "epoch": 6.71774637548174, "grad_norm": 0.19833479821681976, "learning_rate": 8.398766717744476e-06, "loss": 0.0025, "num_input_tokens_seen": 78992800, "step": 36605 }, { "epoch": 6.7186639750412915, "grad_norm": 9.100557327270508, "learning_rate": 8.398179366551336e-06, "loss": 0.2575, "num_input_tokens_seen": 79002880, "step": 36610 }, { "epoch": 6.719581574600844, "grad_norm": 14.210741996765137, "learning_rate": 8.397591928200163e-06, "loss": 0.2145, "num_input_tokens_seen": 79013472, "step": 36615 }, { "epoch": 6.7204991741603965, "grad_norm": 1.7091374397277832, "learning_rate": 8.397004402706021e-06, "loss": 0.0702, "num_input_tokens_seen": 79024896, "step": 36620 }, { "epoch": 6.721416773719948, "grad_norm": 0.16932837665081024, "learning_rate": 8.396416790083983e-06, "loss": 0.3824, "num_input_tokens_seen": 79036000, "step": 36625 }, { "epoch": 6.722334373279501, "grad_norm": 54.572654724121094, "learning_rate": 8.395829090349118e-06, "loss": 0.0982, "num_input_tokens_seen": 79047200, "step": 36630 }, { "epoch": 6.723251972839053, "grad_norm": 0.3412129282951355, "learning_rate": 8.395241303516499e-06, "loss": 0.104, "num_input_tokens_seen": 79057632, "step": 36635 }, { "epoch": 6.724169572398605, "grad_norm": 0.08538392186164856, "learning_rate": 8.394653429601203e-06, "loss": 0.3094, "num_input_tokens_seen": 79067424, "step": 36640 }, { "epoch": 6.725087171958157, "grad_norm": 0.6157045364379883, "learning_rate": 8.394065468618309e-06, "loss": 0.2547, "num_input_tokens_seen": 79075776, "step": 36645 }, { "epoch": 6.72600477151771, "grad_norm": 11.104801177978516, "learning_rate": 8.393477420582894e-06, "loss": 0.2869, "num_input_tokens_seen": 79086624, "step": 36650 }, { "epoch": 6.726922371077261, "grad_norm": 0.2415815144777298, "learning_rate": 8.392889285510045e-06, "loss": 0.0686, "num_input_tokens_seen": 79097504, "step": 36655 }, { "epoch": 6.727839970636814, "grad_norm": 21.680891036987305, "learning_rate": 8.392301063414843e-06, "loss": 0.1461, "num_input_tokens_seen": 79107200, "step": 36660 }, { "epoch": 6.7287575701963664, "grad_norm": 0.07894774526357651, "learning_rate": 8.391712754312375e-06, "loss": 0.0056, "num_input_tokens_seen": 79118976, "step": 36665 }, { "epoch": 6.729675169755918, "grad_norm": 0.11925162374973297, "learning_rate": 8.391124358217732e-06, "loss": 0.117, "num_input_tokens_seen": 79129216, "step": 36670 }, { "epoch": 6.730592769315471, "grad_norm": 28.26868438720703, "learning_rate": 8.390535875146006e-06, "loss": 0.1525, "num_input_tokens_seen": 79139872, "step": 36675 }, { "epoch": 6.731510368875023, "grad_norm": 5.291963577270508, "learning_rate": 8.38994730511229e-06, "loss": 0.3214, "num_input_tokens_seen": 79150880, "step": 36680 }, { "epoch": 6.732427968434575, "grad_norm": 0.11118785291910172, "learning_rate": 8.38935864813168e-06, "loss": 0.1308, "num_input_tokens_seen": 79161472, "step": 36685 }, { "epoch": 6.733345567994127, "grad_norm": 0.05112723633646965, "learning_rate": 8.388769904219272e-06, "loss": 0.0747, "num_input_tokens_seen": 79172000, "step": 36690 }, { "epoch": 6.73426316755368, "grad_norm": 0.13626186549663544, "learning_rate": 8.388181073390169e-06, "loss": 0.0042, "num_input_tokens_seen": 79181472, "step": 36695 }, { "epoch": 6.735180767113231, "grad_norm": 0.23053473234176636, "learning_rate": 8.387592155659472e-06, "loss": 0.0344, "num_input_tokens_seen": 79191680, "step": 36700 }, { "epoch": 6.736098366672784, "grad_norm": 0.06389371305704117, "learning_rate": 8.387003151042288e-06, "loss": 0.0058, "num_input_tokens_seen": 79202688, "step": 36705 }, { "epoch": 6.737015966232336, "grad_norm": 15.612524032592773, "learning_rate": 8.38641405955372e-06, "loss": 0.1373, "num_input_tokens_seen": 79214400, "step": 36710 }, { "epoch": 6.737933565791888, "grad_norm": 0.0821332260966301, "learning_rate": 8.385824881208881e-06, "loss": 0.1467, "num_input_tokens_seen": 79225056, "step": 36715 }, { "epoch": 6.7388511653514405, "grad_norm": 0.3412477374076843, "learning_rate": 8.385235616022883e-06, "loss": 0.0541, "num_input_tokens_seen": 79236256, "step": 36720 }, { "epoch": 6.739768764910993, "grad_norm": 0.38123759627342224, "learning_rate": 8.384646264010836e-06, "loss": 0.2306, "num_input_tokens_seen": 79248672, "step": 36725 }, { "epoch": 6.740686364470545, "grad_norm": 94.15794372558594, "learning_rate": 8.384056825187859e-06, "loss": 0.0737, "num_input_tokens_seen": 79259808, "step": 36730 }, { "epoch": 6.741603964030097, "grad_norm": 0.07741130143404007, "learning_rate": 8.383467299569068e-06, "loss": 0.1231, "num_input_tokens_seen": 79271520, "step": 36735 }, { "epoch": 6.74252156358965, "grad_norm": 17.162399291992188, "learning_rate": 8.382877687169586e-06, "loss": 0.3454, "num_input_tokens_seen": 79282752, "step": 36740 }, { "epoch": 6.743439163149201, "grad_norm": 0.05765651538968086, "learning_rate": 8.382287988004534e-06, "loss": 0.0017, "num_input_tokens_seen": 79293952, "step": 36745 }, { "epoch": 6.744356762708754, "grad_norm": 0.023252923041582108, "learning_rate": 8.381698202089036e-06, "loss": 0.1148, "num_input_tokens_seen": 79305600, "step": 36750 }, { "epoch": 6.745274362268306, "grad_norm": 0.12801387906074524, "learning_rate": 8.381108329438224e-06, "loss": 0.2349, "num_input_tokens_seen": 79316096, "step": 36755 }, { "epoch": 6.746191961827858, "grad_norm": 13.131575584411621, "learning_rate": 8.38051837006722e-06, "loss": 0.108, "num_input_tokens_seen": 79326592, "step": 36760 }, { "epoch": 6.74710956138741, "grad_norm": 0.04645877704024315, "learning_rate": 8.379928323991162e-06, "loss": 0.0584, "num_input_tokens_seen": 79337088, "step": 36765 }, { "epoch": 6.748027160946963, "grad_norm": 0.07034572958946228, "learning_rate": 8.379338191225177e-06, "loss": 0.0059, "num_input_tokens_seen": 79348832, "step": 36770 }, { "epoch": 6.7489447605065145, "grad_norm": 0.028421301394701004, "learning_rate": 8.378747971784407e-06, "loss": 0.0013, "num_input_tokens_seen": 79360384, "step": 36775 }, { "epoch": 6.749862360066067, "grad_norm": 0.016002001240849495, "learning_rate": 8.378157665683987e-06, "loss": 0.0719, "num_input_tokens_seen": 79371168, "step": 36780 }, { "epoch": 6.7507799596256195, "grad_norm": 0.06475646793842316, "learning_rate": 8.377567272939063e-06, "loss": 0.0028, "num_input_tokens_seen": 79382656, "step": 36785 }, { "epoch": 6.751697559185171, "grad_norm": 9.00094985961914, "learning_rate": 8.376976793564769e-06, "loss": 0.1028, "num_input_tokens_seen": 79392832, "step": 36790 }, { "epoch": 6.752615158744724, "grad_norm": 17.926620483398438, "learning_rate": 8.376386227576254e-06, "loss": 0.0077, "num_input_tokens_seen": 79403680, "step": 36795 }, { "epoch": 6.753532758304276, "grad_norm": 0.025258680805563927, "learning_rate": 8.375795574988667e-06, "loss": 0.2108, "num_input_tokens_seen": 79414720, "step": 36800 }, { "epoch": 6.754450357863828, "grad_norm": 0.23655229806900024, "learning_rate": 8.375204835817155e-06, "loss": 0.0019, "num_input_tokens_seen": 79425792, "step": 36805 }, { "epoch": 6.75536795742338, "grad_norm": 0.11429979652166367, "learning_rate": 8.374614010076869e-06, "loss": 0.0012, "num_input_tokens_seen": 79437920, "step": 36810 }, { "epoch": 6.756285556982933, "grad_norm": 0.5439922213554382, "learning_rate": 8.374023097782963e-06, "loss": 0.0041, "num_input_tokens_seen": 79449408, "step": 36815 }, { "epoch": 6.757203156542484, "grad_norm": 0.008355743251740932, "learning_rate": 8.373432098950595e-06, "loss": 0.103, "num_input_tokens_seen": 79458880, "step": 36820 }, { "epoch": 6.758120756102037, "grad_norm": 0.006742881610989571, "learning_rate": 8.372841013594924e-06, "loss": 0.1128, "num_input_tokens_seen": 79469536, "step": 36825 }, { "epoch": 6.759038355661589, "grad_norm": 0.004573099315166473, "learning_rate": 8.372249841731105e-06, "loss": 0.0095, "num_input_tokens_seen": 79479968, "step": 36830 }, { "epoch": 6.759955955221141, "grad_norm": 0.08757838606834412, "learning_rate": 8.371658583374306e-06, "loss": 0.2953, "num_input_tokens_seen": 79491040, "step": 36835 }, { "epoch": 6.7608735547806935, "grad_norm": 0.07745907455682755, "learning_rate": 8.371067238539692e-06, "loss": 0.1451, "num_input_tokens_seen": 79500576, "step": 36840 }, { "epoch": 6.761791154340246, "grad_norm": 0.06294696778059006, "learning_rate": 8.370475807242425e-06, "loss": 0.0012, "num_input_tokens_seen": 79510528, "step": 36845 }, { "epoch": 6.762708753899798, "grad_norm": 0.2642126977443695, "learning_rate": 8.369884289497678e-06, "loss": 0.2211, "num_input_tokens_seen": 79522080, "step": 36850 }, { "epoch": 6.76362635345935, "grad_norm": 4.807880878448486, "learning_rate": 8.369292685320623e-06, "loss": 0.0032, "num_input_tokens_seen": 79533216, "step": 36855 }, { "epoch": 6.764543953018903, "grad_norm": 25.833995819091797, "learning_rate": 8.368700994726432e-06, "loss": 0.0961, "num_input_tokens_seen": 79544416, "step": 36860 }, { "epoch": 6.765461552578454, "grad_norm": 25.092897415161133, "learning_rate": 8.36810921773028e-06, "loss": 0.3026, "num_input_tokens_seen": 79555264, "step": 36865 }, { "epoch": 6.766379152138007, "grad_norm": 1.7275186777114868, "learning_rate": 8.367517354347347e-06, "loss": 0.1173, "num_input_tokens_seen": 79564960, "step": 36870 }, { "epoch": 6.767296751697559, "grad_norm": 0.17098623514175415, "learning_rate": 8.366925404592814e-06, "loss": 0.0024, "num_input_tokens_seen": 79575008, "step": 36875 }, { "epoch": 6.768214351257111, "grad_norm": 0.14746545255184174, "learning_rate": 8.366333368481862e-06, "loss": 0.001, "num_input_tokens_seen": 79586240, "step": 36880 }, { "epoch": 6.769131950816663, "grad_norm": 0.0631169006228447, "learning_rate": 8.365741246029677e-06, "loss": 0.1386, "num_input_tokens_seen": 79597088, "step": 36885 }, { "epoch": 6.770049550376216, "grad_norm": 0.017647959291934967, "learning_rate": 8.365149037251445e-06, "loss": 0.1755, "num_input_tokens_seen": 79606784, "step": 36890 }, { "epoch": 6.7709671499357675, "grad_norm": 6.100584506988525, "learning_rate": 8.364556742162355e-06, "loss": 0.0042, "num_input_tokens_seen": 79617408, "step": 36895 }, { "epoch": 6.77188474949532, "grad_norm": 16.387296676635742, "learning_rate": 8.363964360777602e-06, "loss": 0.3727, "num_input_tokens_seen": 79629216, "step": 36900 }, { "epoch": 6.7728023490548726, "grad_norm": 0.03632630780339241, "learning_rate": 8.363371893112372e-06, "loss": 0.0918, "num_input_tokens_seen": 79640320, "step": 36905 }, { "epoch": 6.773719948614424, "grad_norm": 0.054037582129240036, "learning_rate": 8.36277933918187e-06, "loss": 0.1351, "num_input_tokens_seen": 79651392, "step": 36910 }, { "epoch": 6.774637548173977, "grad_norm": 0.027371514588594437, "learning_rate": 8.36218669900129e-06, "loss": 0.1294, "num_input_tokens_seen": 79663520, "step": 36915 }, { "epoch": 6.775555147733529, "grad_norm": 0.0038793860003352165, "learning_rate": 8.36159397258583e-06, "loss": 0.1656, "num_input_tokens_seen": 79673760, "step": 36920 }, { "epoch": 6.776472747293081, "grad_norm": 0.7927286028862, "learning_rate": 8.361001159950694e-06, "loss": 0.096, "num_input_tokens_seen": 79684352, "step": 36925 }, { "epoch": 6.777390346852633, "grad_norm": 7.363289833068848, "learning_rate": 8.360408261111088e-06, "loss": 0.0041, "num_input_tokens_seen": 79695808, "step": 36930 }, { "epoch": 6.778307946412186, "grad_norm": 16.047000885009766, "learning_rate": 8.359815276082219e-06, "loss": 0.2727, "num_input_tokens_seen": 79706976, "step": 36935 }, { "epoch": 6.779225545971738, "grad_norm": 0.01953704096376896, "learning_rate": 8.359222204879296e-06, "loss": 0.144, "num_input_tokens_seen": 79716768, "step": 36940 }, { "epoch": 6.78014314553129, "grad_norm": 0.014946707524359226, "learning_rate": 8.358629047517528e-06, "loss": 0.0051, "num_input_tokens_seen": 79727136, "step": 36945 }, { "epoch": 6.7810607450908424, "grad_norm": 22.981950759887695, "learning_rate": 8.358035804012131e-06, "loss": 0.216, "num_input_tokens_seen": 79738848, "step": 36950 }, { "epoch": 6.781978344650395, "grad_norm": 74.98002624511719, "learning_rate": 8.35744247437832e-06, "loss": 0.3323, "num_input_tokens_seen": 79749568, "step": 36955 }, { "epoch": 6.782895944209947, "grad_norm": 0.055786579847335815, "learning_rate": 8.356849058631314e-06, "loss": 0.4365, "num_input_tokens_seen": 79760096, "step": 36960 }, { "epoch": 6.783813543769499, "grad_norm": 0.027780666947364807, "learning_rate": 8.356255556786332e-06, "loss": 0.1649, "num_input_tokens_seen": 79768512, "step": 36965 }, { "epoch": 6.784731143329052, "grad_norm": 0.08064477145671844, "learning_rate": 8.355661968858595e-06, "loss": 0.0024, "num_input_tokens_seen": 79778976, "step": 36970 }, { "epoch": 6.785648742888603, "grad_norm": 0.22857393324375153, "learning_rate": 8.355068294863331e-06, "loss": 0.0956, "num_input_tokens_seen": 79789120, "step": 36975 }, { "epoch": 6.786566342448156, "grad_norm": 0.15051938593387604, "learning_rate": 8.354474534815764e-06, "loss": 0.1571, "num_input_tokens_seen": 79799648, "step": 36980 }, { "epoch": 6.787483942007708, "grad_norm": 163.92034912109375, "learning_rate": 8.353880688731126e-06, "loss": 0.4223, "num_input_tokens_seen": 79810368, "step": 36985 }, { "epoch": 6.78840154156726, "grad_norm": 0.7061366438865662, "learning_rate": 8.353286756624645e-06, "loss": 0.1198, "num_input_tokens_seen": 79821408, "step": 36990 }, { "epoch": 6.789319141126812, "grad_norm": 6.027621269226074, "learning_rate": 8.352692738511556e-06, "loss": 0.1069, "num_input_tokens_seen": 79833184, "step": 36995 }, { "epoch": 6.790236740686365, "grad_norm": 26.102407455444336, "learning_rate": 8.352098634407095e-06, "loss": 0.0631, "num_input_tokens_seen": 79843776, "step": 37000 }, { "epoch": 6.7911543402459165, "grad_norm": 37.97566223144531, "learning_rate": 8.3515044443265e-06, "loss": 0.164, "num_input_tokens_seen": 79855360, "step": 37005 }, { "epoch": 6.792071939805469, "grad_norm": 13.4266357421875, "learning_rate": 8.350910168285008e-06, "loss": 0.1526, "num_input_tokens_seen": 79866656, "step": 37010 }, { "epoch": 6.7929895393650215, "grad_norm": 0.47664862871170044, "learning_rate": 8.350315806297865e-06, "loss": 0.1062, "num_input_tokens_seen": 79877696, "step": 37015 }, { "epoch": 6.793907138924573, "grad_norm": 0.0450863316655159, "learning_rate": 8.349721358380314e-06, "loss": 0.0039, "num_input_tokens_seen": 79887840, "step": 37020 }, { "epoch": 6.794824738484126, "grad_norm": 0.27613282203674316, "learning_rate": 8.349126824547603e-06, "loss": 0.0024, "num_input_tokens_seen": 79898656, "step": 37025 }, { "epoch": 6.795742338043678, "grad_norm": 0.06843593716621399, "learning_rate": 8.348532204814976e-06, "loss": 0.0989, "num_input_tokens_seen": 79909248, "step": 37030 }, { "epoch": 6.79665993760323, "grad_norm": 0.2064128816127777, "learning_rate": 8.347937499197691e-06, "loss": 0.0139, "num_input_tokens_seen": 79919904, "step": 37035 }, { "epoch": 6.797577537162782, "grad_norm": 0.010669839568436146, "learning_rate": 8.347342707710997e-06, "loss": 0.0035, "num_input_tokens_seen": 79929504, "step": 37040 }, { "epoch": 6.798495136722335, "grad_norm": 0.01808079518377781, "learning_rate": 8.34674783037015e-06, "loss": 0.0011, "num_input_tokens_seen": 79939232, "step": 37045 }, { "epoch": 6.799412736281886, "grad_norm": 0.17531532049179077, "learning_rate": 8.346152867190409e-06, "loss": 0.0018, "num_input_tokens_seen": 79949216, "step": 37050 }, { "epoch": 6.800330335841439, "grad_norm": 0.023575808852910995, "learning_rate": 8.345557818187033e-06, "loss": 0.1087, "num_input_tokens_seen": 79959264, "step": 37055 }, { "epoch": 6.801247935400991, "grad_norm": 4.796584129333496, "learning_rate": 8.344962683375284e-06, "loss": 0.2815, "num_input_tokens_seen": 79969056, "step": 37060 }, { "epoch": 6.802165534960543, "grad_norm": 0.06179448589682579, "learning_rate": 8.344367462770426e-06, "loss": 0.0831, "num_input_tokens_seen": 79979616, "step": 37065 }, { "epoch": 6.8030831345200955, "grad_norm": 0.005153504665941, "learning_rate": 8.343772156387725e-06, "loss": 0.0006, "num_input_tokens_seen": 79989728, "step": 37070 }, { "epoch": 6.804000734079648, "grad_norm": 0.30972936749458313, "learning_rate": 8.343176764242452e-06, "loss": 0.0009, "num_input_tokens_seen": 80001472, "step": 37075 }, { "epoch": 6.8049183336392, "grad_norm": 7.957058906555176, "learning_rate": 8.342581286349876e-06, "loss": 0.3137, "num_input_tokens_seen": 80010656, "step": 37080 }, { "epoch": 6.805835933198752, "grad_norm": 116.42086791992188, "learning_rate": 8.34198572272527e-06, "loss": 0.0501, "num_input_tokens_seen": 80021472, "step": 37085 }, { "epoch": 6.806753532758305, "grad_norm": 1.5966322422027588, "learning_rate": 8.341390073383911e-06, "loss": 0.1459, "num_input_tokens_seen": 80032832, "step": 37090 }, { "epoch": 6.807671132317856, "grad_norm": 31.936532974243164, "learning_rate": 8.340794338341075e-06, "loss": 0.401, "num_input_tokens_seen": 80043392, "step": 37095 }, { "epoch": 6.808588731877409, "grad_norm": 0.10471180826425552, "learning_rate": 8.340198517612042e-06, "loss": 0.09, "num_input_tokens_seen": 80053664, "step": 37100 }, { "epoch": 6.809506331436961, "grad_norm": 34.34265899658203, "learning_rate": 8.339602611212093e-06, "loss": 0.262, "num_input_tokens_seen": 80063424, "step": 37105 }, { "epoch": 6.810423930996513, "grad_norm": 0.11689619719982147, "learning_rate": 8.339006619156513e-06, "loss": 0.0017, "num_input_tokens_seen": 80072896, "step": 37110 }, { "epoch": 6.811341530556065, "grad_norm": 0.037567026913166046, "learning_rate": 8.338410541460589e-06, "loss": 0.0006, "num_input_tokens_seen": 80082848, "step": 37115 }, { "epoch": 6.812259130115618, "grad_norm": 0.009349995292723179, "learning_rate": 8.337814378139607e-06, "loss": 0.1484, "num_input_tokens_seen": 80093984, "step": 37120 }, { "epoch": 6.8131767296751695, "grad_norm": 0.016985813155770302, "learning_rate": 8.337218129208862e-06, "loss": 0.0005, "num_input_tokens_seen": 80104576, "step": 37125 }, { "epoch": 6.814094329234722, "grad_norm": 0.0189558956772089, "learning_rate": 8.336621794683643e-06, "loss": 0.0952, "num_input_tokens_seen": 80115968, "step": 37130 }, { "epoch": 6.8150119287942745, "grad_norm": 0.05139314755797386, "learning_rate": 8.336025374579246e-06, "loss": 0.0077, "num_input_tokens_seen": 80127552, "step": 37135 }, { "epoch": 6.815929528353826, "grad_norm": 0.10813727229833603, "learning_rate": 8.335428868910968e-06, "loss": 0.1825, "num_input_tokens_seen": 80139488, "step": 37140 }, { "epoch": 6.816847127913379, "grad_norm": 8.115254402160645, "learning_rate": 8.33483227769411e-06, "loss": 0.1379, "num_input_tokens_seen": 80150432, "step": 37145 }, { "epoch": 6.817764727472931, "grad_norm": 29.581375122070312, "learning_rate": 8.334235600943972e-06, "loss": 0.3201, "num_input_tokens_seen": 80161824, "step": 37150 }, { "epoch": 6.818682327032483, "grad_norm": 0.9958626627922058, "learning_rate": 8.33363883867586e-06, "loss": 0.1395, "num_input_tokens_seen": 80172288, "step": 37155 }, { "epoch": 6.819599926592035, "grad_norm": 0.0515238419175148, "learning_rate": 8.333041990905076e-06, "loss": 0.0907, "num_input_tokens_seen": 80183776, "step": 37160 }, { "epoch": 6.820517526151588, "grad_norm": 0.041746631264686584, "learning_rate": 8.332445057646931e-06, "loss": 0.1632, "num_input_tokens_seen": 80194208, "step": 37165 }, { "epoch": 6.821435125711139, "grad_norm": 0.09800316393375397, "learning_rate": 8.331848038916737e-06, "loss": 0.0015, "num_input_tokens_seen": 80205248, "step": 37170 }, { "epoch": 6.822352725270692, "grad_norm": 1.2957051992416382, "learning_rate": 8.331250934729805e-06, "loss": 0.2192, "num_input_tokens_seen": 80216032, "step": 37175 }, { "epoch": 6.823270324830244, "grad_norm": 0.03254412114620209, "learning_rate": 8.330653745101447e-06, "loss": 0.106, "num_input_tokens_seen": 80226592, "step": 37180 }, { "epoch": 6.824187924389796, "grad_norm": 61.734710693359375, "learning_rate": 8.330056470046983e-06, "loss": 0.117, "num_input_tokens_seen": 80237152, "step": 37185 }, { "epoch": 6.8251055239493486, "grad_norm": 57.55356979370117, "learning_rate": 8.329459109581731e-06, "loss": 0.0297, "num_input_tokens_seen": 80248640, "step": 37190 }, { "epoch": 6.826023123508901, "grad_norm": 0.1776009351015091, "learning_rate": 8.328861663721017e-06, "loss": 0.135, "num_input_tokens_seen": 80258976, "step": 37195 }, { "epoch": 6.826940723068453, "grad_norm": 0.08232339471578598, "learning_rate": 8.328264132480157e-06, "loss": 0.1768, "num_input_tokens_seen": 80269920, "step": 37200 }, { "epoch": 6.827858322628005, "grad_norm": 0.3773849606513977, "learning_rate": 8.32766651587448e-06, "loss": 0.1161, "num_input_tokens_seen": 80281248, "step": 37205 }, { "epoch": 6.828775922187558, "grad_norm": 6.091331481933594, "learning_rate": 8.327068813919317e-06, "loss": 0.2596, "num_input_tokens_seen": 80292576, "step": 37210 }, { "epoch": 6.829693521747109, "grad_norm": 0.03565666079521179, "learning_rate": 8.326471026629994e-06, "loss": 0.0651, "num_input_tokens_seen": 80303360, "step": 37215 }, { "epoch": 6.830611121306662, "grad_norm": 0.8783206343650818, "learning_rate": 8.325873154021844e-06, "loss": 0.0036, "num_input_tokens_seen": 80312864, "step": 37220 }, { "epoch": 6.831528720866214, "grad_norm": 0.019994810223579407, "learning_rate": 8.325275196110202e-06, "loss": 0.2375, "num_input_tokens_seen": 80323520, "step": 37225 }, { "epoch": 6.832446320425766, "grad_norm": 2.1889305114746094, "learning_rate": 8.324677152910406e-06, "loss": 0.0099, "num_input_tokens_seen": 80334688, "step": 37230 }, { "epoch": 6.8333639199853184, "grad_norm": 0.018787644803524017, "learning_rate": 8.324079024437795e-06, "loss": 0.1777, "num_input_tokens_seen": 80345280, "step": 37235 }, { "epoch": 6.834281519544871, "grad_norm": 10.994257926940918, "learning_rate": 8.323480810707707e-06, "loss": 0.2845, "num_input_tokens_seen": 80356896, "step": 37240 }, { "epoch": 6.835199119104423, "grad_norm": 0.01761023886501789, "learning_rate": 8.322882511735489e-06, "loss": 0.1121, "num_input_tokens_seen": 80366816, "step": 37245 }, { "epoch": 6.836116718663975, "grad_norm": 53.76704406738281, "learning_rate": 8.322284127536481e-06, "loss": 0.0293, "num_input_tokens_seen": 80377728, "step": 37250 }, { "epoch": 6.837034318223528, "grad_norm": 0.04475332424044609, "learning_rate": 8.321685658126037e-06, "loss": 0.0477, "num_input_tokens_seen": 80388736, "step": 37255 }, { "epoch": 6.837951917783079, "grad_norm": 0.12442773580551147, "learning_rate": 8.321087103519503e-06, "loss": 0.0758, "num_input_tokens_seen": 80399488, "step": 37260 }, { "epoch": 6.838869517342632, "grad_norm": 0.09280967712402344, "learning_rate": 8.320488463732232e-06, "loss": 0.0018, "num_input_tokens_seen": 80408896, "step": 37265 }, { "epoch": 6.839787116902184, "grad_norm": 0.022610576823353767, "learning_rate": 8.31988973877958e-06, "loss": 0.0035, "num_input_tokens_seen": 80419072, "step": 37270 }, { "epoch": 6.840704716461736, "grad_norm": 21.57099151611328, "learning_rate": 8.319290928676899e-06, "loss": 0.2598, "num_input_tokens_seen": 80429824, "step": 37275 }, { "epoch": 6.841622316021288, "grad_norm": 18.079504013061523, "learning_rate": 8.318692033439553e-06, "loss": 0.1989, "num_input_tokens_seen": 80439808, "step": 37280 }, { "epoch": 6.842539915580841, "grad_norm": 0.43478527665138245, "learning_rate": 8.318093053082898e-06, "loss": 0.1069, "num_input_tokens_seen": 80451072, "step": 37285 }, { "epoch": 6.8434575151403925, "grad_norm": 0.5272278189659119, "learning_rate": 8.317493987622299e-06, "loss": 0.2923, "num_input_tokens_seen": 80462976, "step": 37290 }, { "epoch": 6.844375114699945, "grad_norm": 0.053890153765678406, "learning_rate": 8.316894837073119e-06, "loss": 0.0011, "num_input_tokens_seen": 80473184, "step": 37295 }, { "epoch": 6.8452927142594975, "grad_norm": 0.05724891647696495, "learning_rate": 8.31629560145073e-06, "loss": 0.0635, "num_input_tokens_seen": 80483872, "step": 37300 }, { "epoch": 6.846210313819049, "grad_norm": 0.011058810167014599, "learning_rate": 8.315696280770498e-06, "loss": 0.0024, "num_input_tokens_seen": 80494432, "step": 37305 }, { "epoch": 6.847127913378602, "grad_norm": 0.07378846406936646, "learning_rate": 8.315096875047795e-06, "loss": 0.0297, "num_input_tokens_seen": 80504448, "step": 37310 }, { "epoch": 6.848045512938154, "grad_norm": 0.0715971365571022, "learning_rate": 8.314497384297994e-06, "loss": 0.0318, "num_input_tokens_seen": 80515168, "step": 37315 }, { "epoch": 6.848963112497706, "grad_norm": 0.3973403573036194, "learning_rate": 8.313897808536472e-06, "loss": 0.1256, "num_input_tokens_seen": 80526592, "step": 37320 }, { "epoch": 6.849880712057258, "grad_norm": 62.3547248840332, "learning_rate": 8.31329814777861e-06, "loss": 0.2374, "num_input_tokens_seen": 80537984, "step": 37325 }, { "epoch": 6.850798311616811, "grad_norm": 0.4819134473800659, "learning_rate": 8.312698402039783e-06, "loss": 0.2649, "num_input_tokens_seen": 80549024, "step": 37330 }, { "epoch": 6.851715911176362, "grad_norm": 0.01901417225599289, "learning_rate": 8.312098571335377e-06, "loss": 0.331, "num_input_tokens_seen": 80559456, "step": 37335 }, { "epoch": 6.852633510735915, "grad_norm": 0.6675817966461182, "learning_rate": 8.311498655680777e-06, "loss": 0.1248, "num_input_tokens_seen": 80570624, "step": 37340 }, { "epoch": 6.853551110295467, "grad_norm": 0.10658174753189087, "learning_rate": 8.310898655091368e-06, "loss": 0.0033, "num_input_tokens_seen": 80579808, "step": 37345 }, { "epoch": 6.854468709855019, "grad_norm": 7.957995891571045, "learning_rate": 8.310298569582539e-06, "loss": 0.176, "num_input_tokens_seen": 80590048, "step": 37350 }, { "epoch": 6.8553863094145715, "grad_norm": 0.9710519909858704, "learning_rate": 8.309698399169683e-06, "loss": 0.0033, "num_input_tokens_seen": 80601728, "step": 37355 }, { "epoch": 6.856303908974124, "grad_norm": 22.158599853515625, "learning_rate": 8.309098143868193e-06, "loss": 0.0492, "num_input_tokens_seen": 80612000, "step": 37360 }, { "epoch": 6.857221508533676, "grad_norm": 0.05178473889827728, "learning_rate": 8.308497803693463e-06, "loss": 0.0021, "num_input_tokens_seen": 80623264, "step": 37365 }, { "epoch": 6.858139108093228, "grad_norm": 0.10177899897098541, "learning_rate": 8.307897378660894e-06, "loss": 0.0864, "num_input_tokens_seen": 80632928, "step": 37370 }, { "epoch": 6.859056707652781, "grad_norm": 34.48186492919922, "learning_rate": 8.307296868785882e-06, "loss": 0.1367, "num_input_tokens_seen": 80644064, "step": 37375 }, { "epoch": 6.859974307212332, "grad_norm": 0.025484489277005196, "learning_rate": 8.306696274083833e-06, "loss": 0.0021, "num_input_tokens_seen": 80655328, "step": 37380 }, { "epoch": 6.860891906771885, "grad_norm": 0.16132009029388428, "learning_rate": 8.306095594570149e-06, "loss": 0.0026, "num_input_tokens_seen": 80665792, "step": 37385 }, { "epoch": 6.861809506331437, "grad_norm": 0.01648947037756443, "learning_rate": 8.305494830260237e-06, "loss": 0.1271, "num_input_tokens_seen": 80677472, "step": 37390 }, { "epoch": 6.862727105890989, "grad_norm": 14.489940643310547, "learning_rate": 8.304893981169503e-06, "loss": 0.0119, "num_input_tokens_seen": 80688608, "step": 37395 }, { "epoch": 6.863644705450541, "grad_norm": 6.508580207824707, "learning_rate": 8.304293047313363e-06, "loss": 0.1447, "num_input_tokens_seen": 80699552, "step": 37400 }, { "epoch": 6.864562305010094, "grad_norm": 0.14115041494369507, "learning_rate": 8.303692028707229e-06, "loss": 0.0017, "num_input_tokens_seen": 80710272, "step": 37405 }, { "epoch": 6.8654799045696455, "grad_norm": 0.060356076806783676, "learning_rate": 8.303090925366513e-06, "loss": 0.1086, "num_input_tokens_seen": 80721568, "step": 37410 }, { "epoch": 6.866397504129198, "grad_norm": 0.008910322561860085, "learning_rate": 8.302489737306634e-06, "loss": 0.2259, "num_input_tokens_seen": 80733216, "step": 37415 }, { "epoch": 6.8673151036887505, "grad_norm": 0.21646170318126678, "learning_rate": 8.30188846454301e-06, "loss": 0.1957, "num_input_tokens_seen": 80744160, "step": 37420 }, { "epoch": 6.868232703248302, "grad_norm": 0.05485888943076134, "learning_rate": 8.301287107091067e-06, "loss": 0.1473, "num_input_tokens_seen": 80755072, "step": 37425 }, { "epoch": 6.869150302807855, "grad_norm": 0.07808328419923782, "learning_rate": 8.300685664966226e-06, "loss": 0.0033, "num_input_tokens_seen": 80766240, "step": 37430 }, { "epoch": 6.870067902367407, "grad_norm": 0.11185208708047867, "learning_rate": 8.300084138183913e-06, "loss": 0.0046, "num_input_tokens_seen": 80777792, "step": 37435 }, { "epoch": 6.870985501926959, "grad_norm": 97.81373596191406, "learning_rate": 8.299482526759554e-06, "loss": 0.1125, "num_input_tokens_seen": 80789504, "step": 37440 }, { "epoch": 6.871903101486511, "grad_norm": 0.01842433027923107, "learning_rate": 8.298880830708586e-06, "loss": 0.0217, "num_input_tokens_seen": 80800064, "step": 37445 }, { "epoch": 6.872820701046064, "grad_norm": 0.028148632496595383, "learning_rate": 8.298279050046434e-06, "loss": 0.1548, "num_input_tokens_seen": 80812032, "step": 37450 }, { "epoch": 6.873738300605615, "grad_norm": 0.13526596128940582, "learning_rate": 8.297677184788539e-06, "loss": 0.0015, "num_input_tokens_seen": 80822560, "step": 37455 }, { "epoch": 6.874655900165168, "grad_norm": 0.005000019911676645, "learning_rate": 8.297075234950333e-06, "loss": 0.0664, "num_input_tokens_seen": 80833440, "step": 37460 }, { "epoch": 6.87557349972472, "grad_norm": 13.171899795532227, "learning_rate": 8.29647320054726e-06, "loss": 0.3167, "num_input_tokens_seen": 80844192, "step": 37465 }, { "epoch": 6.876491099284272, "grad_norm": 38.629451751708984, "learning_rate": 8.295871081594755e-06, "loss": 0.2201, "num_input_tokens_seen": 80853504, "step": 37470 }, { "epoch": 6.8774086988438246, "grad_norm": 0.0521196611225605, "learning_rate": 8.295268878108266e-06, "loss": 0.0015, "num_input_tokens_seen": 80864000, "step": 37475 }, { "epoch": 6.878326298403377, "grad_norm": 7.051193714141846, "learning_rate": 8.29466659010324e-06, "loss": 0.11, "num_input_tokens_seen": 80874528, "step": 37480 }, { "epoch": 6.879243897962929, "grad_norm": 0.11361385881900787, "learning_rate": 8.29406421759512e-06, "loss": 0.0022, "num_input_tokens_seen": 80885504, "step": 37485 }, { "epoch": 6.880161497522481, "grad_norm": 0.10716076195240021, "learning_rate": 8.293461760599357e-06, "loss": 0.0024, "num_input_tokens_seen": 80896608, "step": 37490 }, { "epoch": 6.881079097082034, "grad_norm": 0.056540343910455704, "learning_rate": 8.292859219131406e-06, "loss": 0.1964, "num_input_tokens_seen": 80906752, "step": 37495 }, { "epoch": 6.881996696641585, "grad_norm": 0.07504341006278992, "learning_rate": 8.292256593206719e-06, "loss": 0.0023, "num_input_tokens_seen": 80917568, "step": 37500 }, { "epoch": 6.882914296201138, "grad_norm": 0.025514086708426476, "learning_rate": 8.291653882840754e-06, "loss": 0.0045, "num_input_tokens_seen": 80929280, "step": 37505 }, { "epoch": 6.88383189576069, "grad_norm": 0.0771452859044075, "learning_rate": 8.291051088048967e-06, "loss": 0.108, "num_input_tokens_seen": 80939040, "step": 37510 }, { "epoch": 6.884749495320242, "grad_norm": 5.3762898445129395, "learning_rate": 8.290448208846823e-06, "loss": 0.1219, "num_input_tokens_seen": 80949536, "step": 37515 }, { "epoch": 6.8856670948797944, "grad_norm": 0.05002845451235771, "learning_rate": 8.289845245249779e-06, "loss": 0.0013, "num_input_tokens_seen": 80960736, "step": 37520 }, { "epoch": 6.886584694439347, "grad_norm": 0.8166764974594116, "learning_rate": 8.289242197273303e-06, "loss": 0.0245, "num_input_tokens_seen": 80971648, "step": 37525 }, { "epoch": 6.887502293998899, "grad_norm": 0.05117467790842056, "learning_rate": 8.288639064932864e-06, "loss": 0.3461, "num_input_tokens_seen": 80981216, "step": 37530 }, { "epoch": 6.888419893558451, "grad_norm": 17.288557052612305, "learning_rate": 8.28803584824393e-06, "loss": 0.1509, "num_input_tokens_seen": 80991200, "step": 37535 }, { "epoch": 6.889337493118004, "grad_norm": 0.04746602475643158, "learning_rate": 8.287432547221972e-06, "loss": 0.0473, "num_input_tokens_seen": 81001888, "step": 37540 }, { "epoch": 6.890255092677555, "grad_norm": 11.980229377746582, "learning_rate": 8.286829161882463e-06, "loss": 0.3428, "num_input_tokens_seen": 81011680, "step": 37545 }, { "epoch": 6.891172692237108, "grad_norm": 0.12292952835559845, "learning_rate": 8.286225692240883e-06, "loss": 0.0018, "num_input_tokens_seen": 81021920, "step": 37550 }, { "epoch": 6.89209029179666, "grad_norm": 0.04553955793380737, "learning_rate": 8.285622138312705e-06, "loss": 0.1515, "num_input_tokens_seen": 81032544, "step": 37555 }, { "epoch": 6.893007891356212, "grad_norm": 0.015033593401312828, "learning_rate": 8.285018500113413e-06, "loss": 0.2742, "num_input_tokens_seen": 81044128, "step": 37560 }, { "epoch": 6.893925490915764, "grad_norm": 11.840544700622559, "learning_rate": 8.284414777658487e-06, "loss": 0.1034, "num_input_tokens_seen": 81055488, "step": 37565 }, { "epoch": 6.894843090475317, "grad_norm": 0.1936207115650177, "learning_rate": 8.283810970963411e-06, "loss": 0.072, "num_input_tokens_seen": 81065888, "step": 37570 }, { "epoch": 6.8957606900348685, "grad_norm": 0.11506611108779907, "learning_rate": 8.283207080043675e-06, "loss": 0.01, "num_input_tokens_seen": 81075968, "step": 37575 }, { "epoch": 6.896678289594421, "grad_norm": 0.06816890090703964, "learning_rate": 8.282603104914765e-06, "loss": 0.0021, "num_input_tokens_seen": 81087456, "step": 37580 }, { "epoch": 6.8975958891539735, "grad_norm": 0.03592899069190025, "learning_rate": 8.281999045592172e-06, "loss": 0.0039, "num_input_tokens_seen": 81096576, "step": 37585 }, { "epoch": 6.898513488713525, "grad_norm": 0.2208516001701355, "learning_rate": 8.281394902091392e-06, "loss": 0.1733, "num_input_tokens_seen": 81106336, "step": 37590 }, { "epoch": 6.899431088273078, "grad_norm": 0.19086989760398865, "learning_rate": 8.280790674427917e-06, "loss": 0.0029, "num_input_tokens_seen": 81116288, "step": 37595 }, { "epoch": 6.90034868783263, "grad_norm": 0.2769431471824646, "learning_rate": 8.280186362617247e-06, "loss": 0.0057, "num_input_tokens_seen": 81126400, "step": 37600 }, { "epoch": 6.901266287392182, "grad_norm": 0.0061711412854492664, "learning_rate": 8.279581966674881e-06, "loss": 0.1098, "num_input_tokens_seen": 81136960, "step": 37605 }, { "epoch": 6.902183886951734, "grad_norm": 0.08359693735837936, "learning_rate": 8.27897748661632e-06, "loss": 0.0035, "num_input_tokens_seen": 81147488, "step": 37610 }, { "epoch": 6.903101486511287, "grad_norm": 0.6712775230407715, "learning_rate": 8.278372922457067e-06, "loss": 0.09, "num_input_tokens_seen": 81157664, "step": 37615 }, { "epoch": 6.904019086070838, "grad_norm": 0.007099160458892584, "learning_rate": 8.27776827421263e-06, "loss": 0.0023, "num_input_tokens_seen": 81169440, "step": 37620 }, { "epoch": 6.904936685630391, "grad_norm": 0.09257501363754272, "learning_rate": 8.277163541898518e-06, "loss": 0.205, "num_input_tokens_seen": 81180544, "step": 37625 }, { "epoch": 6.905854285189943, "grad_norm": 16.727901458740234, "learning_rate": 8.27655872553024e-06, "loss": 0.2392, "num_input_tokens_seen": 81190816, "step": 37630 }, { "epoch": 6.906771884749495, "grad_norm": 0.012995874509215355, "learning_rate": 8.275953825123308e-06, "loss": 0.0031, "num_input_tokens_seen": 81201568, "step": 37635 }, { "epoch": 6.9076894843090475, "grad_norm": 69.71204376220703, "learning_rate": 8.275348840693241e-06, "loss": 0.1025, "num_input_tokens_seen": 81213248, "step": 37640 }, { "epoch": 6.9086070838686, "grad_norm": 0.05278471112251282, "learning_rate": 8.274743772255549e-06, "loss": 0.0869, "num_input_tokens_seen": 81223232, "step": 37645 }, { "epoch": 6.909524683428152, "grad_norm": 0.06347693502902985, "learning_rate": 8.274138619825756e-06, "loss": 0.2068, "num_input_tokens_seen": 81235040, "step": 37650 }, { "epoch": 6.910442282987704, "grad_norm": 0.04709358885884285, "learning_rate": 8.27353338341938e-06, "loss": 0.0937, "num_input_tokens_seen": 81245312, "step": 37655 }, { "epoch": 6.911359882547257, "grad_norm": 16.06596565246582, "learning_rate": 8.272928063051948e-06, "loss": 0.1271, "num_input_tokens_seen": 81255424, "step": 37660 }, { "epoch": 6.912277482106808, "grad_norm": 0.03457140922546387, "learning_rate": 8.272322658738984e-06, "loss": 0.0888, "num_input_tokens_seen": 81265504, "step": 37665 }, { "epoch": 6.913195081666361, "grad_norm": 0.020701050758361816, "learning_rate": 8.271717170496013e-06, "loss": 0.042, "num_input_tokens_seen": 81276704, "step": 37670 }, { "epoch": 6.914112681225913, "grad_norm": 11.297019958496094, "learning_rate": 8.271111598338571e-06, "loss": 0.1684, "num_input_tokens_seen": 81287840, "step": 37675 }, { "epoch": 6.915030280785465, "grad_norm": 9.790863037109375, "learning_rate": 8.270505942282184e-06, "loss": 0.1459, "num_input_tokens_seen": 81298304, "step": 37680 }, { "epoch": 6.915947880345017, "grad_norm": 8.130621910095215, "learning_rate": 8.269900202342388e-06, "loss": 0.2471, "num_input_tokens_seen": 81308672, "step": 37685 }, { "epoch": 6.91686547990457, "grad_norm": 0.022068239748477936, "learning_rate": 8.269294378534722e-06, "loss": 0.1327, "num_input_tokens_seen": 81319712, "step": 37690 }, { "epoch": 6.9177830794641215, "grad_norm": 0.033974286168813705, "learning_rate": 8.268688470874719e-06, "loss": 0.001, "num_input_tokens_seen": 81330816, "step": 37695 }, { "epoch": 6.918700679023674, "grad_norm": 0.12486885488033295, "learning_rate": 8.268082479377926e-06, "loss": 0.1941, "num_input_tokens_seen": 81341024, "step": 37700 }, { "epoch": 6.9196182785832265, "grad_norm": 0.021722961217164993, "learning_rate": 8.26747640405988e-06, "loss": 0.0012, "num_input_tokens_seen": 81351264, "step": 37705 }, { "epoch": 6.920535878142778, "grad_norm": 0.043528396636247635, "learning_rate": 8.26687024493613e-06, "loss": 0.1172, "num_input_tokens_seen": 81362368, "step": 37710 }, { "epoch": 6.921453477702331, "grad_norm": 0.026093261316418648, "learning_rate": 8.26626400202222e-06, "loss": 0.0936, "num_input_tokens_seen": 81372672, "step": 37715 }, { "epoch": 6.922371077261883, "grad_norm": 8.619020462036133, "learning_rate": 8.2656576753337e-06, "loss": 0.4324, "num_input_tokens_seen": 81383360, "step": 37720 }, { "epoch": 6.923288676821435, "grad_norm": 24.534080505371094, "learning_rate": 8.265051264886124e-06, "loss": 0.1802, "num_input_tokens_seen": 81394496, "step": 37725 }, { "epoch": 6.924206276380987, "grad_norm": 0.10947772860527039, "learning_rate": 8.264444770695043e-06, "loss": 0.1749, "num_input_tokens_seen": 81405408, "step": 37730 }, { "epoch": 6.92512387594054, "grad_norm": 0.07267376035451889, "learning_rate": 8.263838192776014e-06, "loss": 0.0021, "num_input_tokens_seen": 81416576, "step": 37735 }, { "epoch": 6.926041475500091, "grad_norm": 0.034781333059072495, "learning_rate": 8.26323153114459e-06, "loss": 0.1009, "num_input_tokens_seen": 81428096, "step": 37740 }, { "epoch": 6.926959075059644, "grad_norm": 0.03667012229561806, "learning_rate": 8.262624785816338e-06, "loss": 0.1093, "num_input_tokens_seen": 81438784, "step": 37745 }, { "epoch": 6.927876674619196, "grad_norm": 0.40421077609062195, "learning_rate": 8.262017956806818e-06, "loss": 0.1438, "num_input_tokens_seen": 81449760, "step": 37750 }, { "epoch": 6.928794274178748, "grad_norm": 0.7827727198600769, "learning_rate": 8.261411044131591e-06, "loss": 0.0068, "num_input_tokens_seen": 81460672, "step": 37755 }, { "epoch": 6.9297118737383006, "grad_norm": 9.110940933227539, "learning_rate": 8.260804047806226e-06, "loss": 0.1578, "num_input_tokens_seen": 81470912, "step": 37760 }, { "epoch": 6.930629473297853, "grad_norm": 0.776103138923645, "learning_rate": 8.26019696784629e-06, "loss": 0.0022, "num_input_tokens_seen": 81480512, "step": 37765 }, { "epoch": 6.931547072857405, "grad_norm": 0.07233873754739761, "learning_rate": 8.259589804267354e-06, "loss": 0.4057, "num_input_tokens_seen": 81491424, "step": 37770 }, { "epoch": 6.932464672416957, "grad_norm": 0.05500612035393715, "learning_rate": 8.258982557084993e-06, "loss": 0.3328, "num_input_tokens_seen": 81503712, "step": 37775 }, { "epoch": 6.93338227197651, "grad_norm": 16.254058837890625, "learning_rate": 8.258375226314781e-06, "loss": 0.0917, "num_input_tokens_seen": 81514400, "step": 37780 }, { "epoch": 6.934299871536061, "grad_norm": 0.06652895361185074, "learning_rate": 8.257767811972292e-06, "loss": 0.1142, "num_input_tokens_seen": 81524768, "step": 37785 }, { "epoch": 6.935217471095614, "grad_norm": 1.1824699640274048, "learning_rate": 8.25716031407311e-06, "loss": 0.0182, "num_input_tokens_seen": 81535904, "step": 37790 }, { "epoch": 6.936135070655166, "grad_norm": 0.1728016436100006, "learning_rate": 8.256552732632813e-06, "loss": 0.1176, "num_input_tokens_seen": 81546688, "step": 37795 }, { "epoch": 6.937052670214718, "grad_norm": 4.663774490356445, "learning_rate": 8.255945067666987e-06, "loss": 0.1202, "num_input_tokens_seen": 81556768, "step": 37800 }, { "epoch": 6.9379702697742704, "grad_norm": 44.58308410644531, "learning_rate": 8.255337319191215e-06, "loss": 0.1495, "num_input_tokens_seen": 81568576, "step": 37805 }, { "epoch": 6.938887869333823, "grad_norm": 0.06191166117787361, "learning_rate": 8.254729487221086e-06, "loss": 0.1104, "num_input_tokens_seen": 81579488, "step": 37810 }, { "epoch": 6.939805468893375, "grad_norm": 58.60586166381836, "learning_rate": 8.25412157177219e-06, "loss": 0.0306, "num_input_tokens_seen": 81590368, "step": 37815 }, { "epoch": 6.940723068452927, "grad_norm": 0.10357357561588287, "learning_rate": 8.253513572860119e-06, "loss": 0.0024, "num_input_tokens_seen": 81600544, "step": 37820 }, { "epoch": 6.94164066801248, "grad_norm": 0.02056463062763214, "learning_rate": 8.25290549050047e-06, "loss": 0.029, "num_input_tokens_seen": 81611296, "step": 37825 }, { "epoch": 6.942558267572031, "grad_norm": 0.13301299512386322, "learning_rate": 8.252297324708834e-06, "loss": 0.0029, "num_input_tokens_seen": 81622784, "step": 37830 }, { "epoch": 6.943475867131584, "grad_norm": 4.977529048919678, "learning_rate": 8.251689075500811e-06, "loss": 0.0076, "num_input_tokens_seen": 81632576, "step": 37835 }, { "epoch": 6.944393466691136, "grad_norm": 0.027217334136366844, "learning_rate": 8.251080742892005e-06, "loss": 0.2458, "num_input_tokens_seen": 81643392, "step": 37840 }, { "epoch": 6.945311066250688, "grad_norm": 18.733556747436523, "learning_rate": 8.250472326898016e-06, "loss": 0.0426, "num_input_tokens_seen": 81654336, "step": 37845 }, { "epoch": 6.94622866581024, "grad_norm": 0.08993802964687347, "learning_rate": 8.24986382753445e-06, "loss": 0.0017, "num_input_tokens_seen": 81666720, "step": 37850 }, { "epoch": 6.947146265369793, "grad_norm": 0.08269514888525009, "learning_rate": 8.249255244816914e-06, "loss": 0.0873, "num_input_tokens_seen": 81677344, "step": 37855 }, { "epoch": 6.9480638649293445, "grad_norm": 0.00718185119330883, "learning_rate": 8.248646578761016e-06, "loss": 0.0641, "num_input_tokens_seen": 81687520, "step": 37860 }, { "epoch": 6.948981464488897, "grad_norm": 0.1521667242050171, "learning_rate": 8.248037829382369e-06, "loss": 0.0894, "num_input_tokens_seen": 81698624, "step": 37865 }, { "epoch": 6.9498990640484495, "grad_norm": 0.05857277661561966, "learning_rate": 8.247428996696584e-06, "loss": 0.0012, "num_input_tokens_seen": 81708576, "step": 37870 }, { "epoch": 6.950816663608001, "grad_norm": 0.034246090799570084, "learning_rate": 8.24682008071928e-06, "loss": 0.0009, "num_input_tokens_seen": 81719552, "step": 37875 }, { "epoch": 6.951734263167554, "grad_norm": 0.04223771393299103, "learning_rate": 8.246211081466073e-06, "loss": 0.1949, "num_input_tokens_seen": 81730272, "step": 37880 }, { "epoch": 6.952651862727106, "grad_norm": 232.074951171875, "learning_rate": 8.245601998952583e-06, "loss": 0.1429, "num_input_tokens_seen": 81740928, "step": 37885 }, { "epoch": 6.953569462286658, "grad_norm": 14.469982147216797, "learning_rate": 8.244992833194431e-06, "loss": 0.0701, "num_input_tokens_seen": 81752192, "step": 37890 }, { "epoch": 6.95448706184621, "grad_norm": 7.11442232131958, "learning_rate": 8.244383584207244e-06, "loss": 0.3679, "num_input_tokens_seen": 81762176, "step": 37895 }, { "epoch": 6.955404661405763, "grad_norm": 0.09998417645692825, "learning_rate": 8.243774252006643e-06, "loss": 0.104, "num_input_tokens_seen": 81771872, "step": 37900 }, { "epoch": 6.956322260965314, "grad_norm": 0.22006633877754211, "learning_rate": 8.243164836608261e-06, "loss": 0.0015, "num_input_tokens_seen": 81781632, "step": 37905 }, { "epoch": 6.957239860524867, "grad_norm": 0.06456686556339264, "learning_rate": 8.242555338027729e-06, "loss": 0.0024, "num_input_tokens_seen": 81792736, "step": 37910 }, { "epoch": 6.958157460084419, "grad_norm": 0.16366708278656006, "learning_rate": 8.241945756280676e-06, "loss": 0.0024, "num_input_tokens_seen": 81803104, "step": 37915 }, { "epoch": 6.959075059643971, "grad_norm": 0.027462858706712723, "learning_rate": 8.241336091382741e-06, "loss": 0.1718, "num_input_tokens_seen": 81814368, "step": 37920 }, { "epoch": 6.9599926592035235, "grad_norm": 13.403008460998535, "learning_rate": 8.240726343349559e-06, "loss": 0.3368, "num_input_tokens_seen": 81823072, "step": 37925 }, { "epoch": 6.960910258763076, "grad_norm": 0.5822284817695618, "learning_rate": 8.240116512196767e-06, "loss": 0.2008, "num_input_tokens_seen": 81834304, "step": 37930 }, { "epoch": 6.961827858322628, "grad_norm": 0.039704203605651855, "learning_rate": 8.23950659794001e-06, "loss": 0.117, "num_input_tokens_seen": 81844448, "step": 37935 }, { "epoch": 6.96274545788218, "grad_norm": 6.68577241897583, "learning_rate": 8.238896600594928e-06, "loss": 0.238, "num_input_tokens_seen": 81854784, "step": 37940 }, { "epoch": 6.963663057441733, "grad_norm": 0.04059496521949768, "learning_rate": 8.23828652017717e-06, "loss": 0.0078, "num_input_tokens_seen": 81866784, "step": 37945 }, { "epoch": 6.964580657001284, "grad_norm": 0.08373882621526718, "learning_rate": 8.23767635670238e-06, "loss": 0.0138, "num_input_tokens_seen": 81877696, "step": 37950 }, { "epoch": 6.965498256560837, "grad_norm": 0.2720591425895691, "learning_rate": 8.23706611018621e-06, "loss": 0.0035, "num_input_tokens_seen": 81888288, "step": 37955 }, { "epoch": 6.966415856120389, "grad_norm": 0.05536789074540138, "learning_rate": 8.23645578064431e-06, "loss": 0.1189, "num_input_tokens_seen": 81897568, "step": 37960 }, { "epoch": 6.967333455679941, "grad_norm": 0.5334458947181702, "learning_rate": 8.235845368092336e-06, "loss": 0.137, "num_input_tokens_seen": 81908384, "step": 37965 }, { "epoch": 6.968251055239493, "grad_norm": 4.478914737701416, "learning_rate": 8.235234872545946e-06, "loss": 0.3033, "num_input_tokens_seen": 81918112, "step": 37970 }, { "epoch": 6.969168654799046, "grad_norm": 0.2306312918663025, "learning_rate": 8.234624294020792e-06, "loss": 0.1551, "num_input_tokens_seen": 81927360, "step": 37975 }, { "epoch": 6.9700862543585975, "grad_norm": 0.18183785676956177, "learning_rate": 8.23401363253254e-06, "loss": 0.0924, "num_input_tokens_seen": 81939360, "step": 37980 }, { "epoch": 6.97100385391815, "grad_norm": 0.08438777178525925, "learning_rate": 8.23340288809685e-06, "loss": 0.0539, "num_input_tokens_seen": 81949280, "step": 37985 }, { "epoch": 6.9719214534777025, "grad_norm": 0.37972909212112427, "learning_rate": 8.232792060729386e-06, "loss": 0.0389, "num_input_tokens_seen": 81959296, "step": 37990 }, { "epoch": 6.972839053037255, "grad_norm": 0.8363798260688782, "learning_rate": 8.23218115044582e-06, "loss": 0.1005, "num_input_tokens_seen": 81968832, "step": 37995 }, { "epoch": 6.973756652596807, "grad_norm": 0.12537801265716553, "learning_rate": 8.231570157261813e-06, "loss": 0.1787, "num_input_tokens_seen": 81978368, "step": 38000 }, { "epoch": 6.974674252156359, "grad_norm": 0.04403756931424141, "learning_rate": 8.230959081193042e-06, "loss": 0.0691, "num_input_tokens_seen": 81990304, "step": 38005 }, { "epoch": 6.975591851715912, "grad_norm": 0.04318693280220032, "learning_rate": 8.230347922255177e-06, "loss": 0.0201, "num_input_tokens_seen": 82001056, "step": 38010 }, { "epoch": 6.976509451275463, "grad_norm": 6.099183559417725, "learning_rate": 8.229736680463893e-06, "loss": 0.1322, "num_input_tokens_seen": 82012160, "step": 38015 }, { "epoch": 6.977427050835016, "grad_norm": 0.02791297249495983, "learning_rate": 8.229125355834872e-06, "loss": 0.0932, "num_input_tokens_seen": 82023616, "step": 38020 }, { "epoch": 6.978344650394568, "grad_norm": 0.058409687131643295, "learning_rate": 8.22851394838379e-06, "loss": 0.0039, "num_input_tokens_seen": 82034880, "step": 38025 }, { "epoch": 6.97926224995412, "grad_norm": 1.9552792310714722, "learning_rate": 8.227902458126326e-06, "loss": 0.1649, "num_input_tokens_seen": 82044064, "step": 38030 }, { "epoch": 6.980179849513672, "grad_norm": 0.040551140904426575, "learning_rate": 8.22729088507817e-06, "loss": 0.25, "num_input_tokens_seen": 82054496, "step": 38035 }, { "epoch": 6.981097449073225, "grad_norm": 0.04598753899335861, "learning_rate": 8.226679229255001e-06, "loss": 0.0064, "num_input_tokens_seen": 82065696, "step": 38040 }, { "epoch": 6.9820150486327766, "grad_norm": 13.328614234924316, "learning_rate": 8.226067490672514e-06, "loss": 0.0835, "num_input_tokens_seen": 82076384, "step": 38045 }, { "epoch": 6.982932648192329, "grad_norm": 12.22879409790039, "learning_rate": 8.225455669346394e-06, "loss": 0.2326, "num_input_tokens_seen": 82088384, "step": 38050 }, { "epoch": 6.983850247751882, "grad_norm": 0.17852777242660522, "learning_rate": 8.224843765292335e-06, "loss": 0.0028, "num_input_tokens_seen": 82099200, "step": 38055 }, { "epoch": 6.984767847311433, "grad_norm": 0.266806960105896, "learning_rate": 8.224231778526034e-06, "loss": 0.0062, "num_input_tokens_seen": 82109472, "step": 38060 }, { "epoch": 6.985685446870986, "grad_norm": 15.490757942199707, "learning_rate": 8.223619709063182e-06, "loss": 0.4041, "num_input_tokens_seen": 82121024, "step": 38065 }, { "epoch": 6.986603046430538, "grad_norm": 0.034674953669309616, "learning_rate": 8.223007556919482e-06, "loss": 0.0023, "num_input_tokens_seen": 82130208, "step": 38070 }, { "epoch": 6.98752064599009, "grad_norm": 4.523182392120361, "learning_rate": 8.222395322110634e-06, "loss": 0.0035, "num_input_tokens_seen": 82141536, "step": 38075 }, { "epoch": 6.988438245549642, "grad_norm": 0.03558322414755821, "learning_rate": 8.22178300465234e-06, "loss": 0.1141, "num_input_tokens_seen": 82152928, "step": 38080 }, { "epoch": 6.989355845109195, "grad_norm": 0.03440927341580391, "learning_rate": 8.221170604560305e-06, "loss": 0.0024, "num_input_tokens_seen": 82164160, "step": 38085 }, { "epoch": 6.9902734446687464, "grad_norm": 0.02047634683549404, "learning_rate": 8.220558121850235e-06, "loss": 0.002, "num_input_tokens_seen": 82176064, "step": 38090 }, { "epoch": 6.991191044228299, "grad_norm": 0.0739315003156662, "learning_rate": 8.219945556537842e-06, "loss": 0.0025, "num_input_tokens_seen": 82186464, "step": 38095 }, { "epoch": 6.9921086437878515, "grad_norm": 86.78585815429688, "learning_rate": 8.219332908638835e-06, "loss": 0.1689, "num_input_tokens_seen": 82196672, "step": 38100 }, { "epoch": 6.993026243347403, "grad_norm": 30.630516052246094, "learning_rate": 8.21872017816893e-06, "loss": 0.2735, "num_input_tokens_seen": 82207392, "step": 38105 }, { "epoch": 6.993943842906956, "grad_norm": 0.0660352110862732, "learning_rate": 8.21810736514384e-06, "loss": 0.1084, "num_input_tokens_seen": 82217600, "step": 38110 }, { "epoch": 6.994861442466508, "grad_norm": 0.030409669503569603, "learning_rate": 8.217494469579283e-06, "loss": 0.002, "num_input_tokens_seen": 82229248, "step": 38115 }, { "epoch": 6.99577904202606, "grad_norm": 0.13901418447494507, "learning_rate": 8.21688149149098e-06, "loss": 0.1237, "num_input_tokens_seen": 82239904, "step": 38120 }, { "epoch": 6.996696641585612, "grad_norm": 10.54995346069336, "learning_rate": 8.216268430894651e-06, "loss": 0.1294, "num_input_tokens_seen": 82249920, "step": 38125 }, { "epoch": 6.997614241145165, "grad_norm": 0.784257173538208, "learning_rate": 8.215655287806024e-06, "loss": 0.1235, "num_input_tokens_seen": 82260704, "step": 38130 }, { "epoch": 6.998531840704716, "grad_norm": 0.06305359303951263, "learning_rate": 8.215042062240823e-06, "loss": 0.1636, "num_input_tokens_seen": 82270272, "step": 38135 }, { "epoch": 6.999449440264269, "grad_norm": 15.25107479095459, "learning_rate": 8.214428754214774e-06, "loss": 0.1065, "num_input_tokens_seen": 82280640, "step": 38140 }, { "epoch": 7.0003670398238205, "grad_norm": 1.8597261905670166, "learning_rate": 8.213815363743612e-06, "loss": 0.0024, "num_input_tokens_seen": 82290384, "step": 38145 }, { "epoch": 7.001284639383373, "grad_norm": 0.2655431032180786, "learning_rate": 8.213201890843064e-06, "loss": 0.121, "num_input_tokens_seen": 82300944, "step": 38150 }, { "epoch": 7.0022022389429255, "grad_norm": 0.3352510631084442, "learning_rate": 8.212588335528868e-06, "loss": 0.1597, "num_input_tokens_seen": 82311696, "step": 38155 }, { "epoch": 7.003119838502477, "grad_norm": 0.04817551001906395, "learning_rate": 8.21197469781676e-06, "loss": 0.0031, "num_input_tokens_seen": 82324272, "step": 38160 }, { "epoch": 7.00403743806203, "grad_norm": 0.09861569106578827, "learning_rate": 8.211360977722482e-06, "loss": 0.0026, "num_input_tokens_seen": 82335632, "step": 38165 }, { "epoch": 7.004955037621582, "grad_norm": 0.5630620718002319, "learning_rate": 8.210747175261768e-06, "loss": 0.0137, "num_input_tokens_seen": 82346608, "step": 38170 }, { "epoch": 7.005872637181134, "grad_norm": 0.026028649881482124, "learning_rate": 8.210133290450369e-06, "loss": 0.2584, "num_input_tokens_seen": 82358384, "step": 38175 }, { "epoch": 7.006790236740686, "grad_norm": 0.5073089599609375, "learning_rate": 8.209519323304025e-06, "loss": 0.0034, "num_input_tokens_seen": 82369776, "step": 38180 }, { "epoch": 7.007707836300239, "grad_norm": 0.11626428365707397, "learning_rate": 8.208905273838483e-06, "loss": 0.2342, "num_input_tokens_seen": 82379088, "step": 38185 }, { "epoch": 7.00862543585979, "grad_norm": 0.06764933466911316, "learning_rate": 8.208291142069495e-06, "loss": 0.0126, "num_input_tokens_seen": 82390736, "step": 38190 }, { "epoch": 7.009543035419343, "grad_norm": 0.04510713368654251, "learning_rate": 8.207676928012813e-06, "loss": 0.2129, "num_input_tokens_seen": 82402192, "step": 38195 }, { "epoch": 7.010460634978895, "grad_norm": 0.08270546048879623, "learning_rate": 8.207062631684186e-06, "loss": 0.0022, "num_input_tokens_seen": 82413552, "step": 38200 }, { "epoch": 7.011378234538447, "grad_norm": 9.446651458740234, "learning_rate": 8.206448253099377e-06, "loss": 0.3103, "num_input_tokens_seen": 82424720, "step": 38205 }, { "epoch": 7.0122958340979995, "grad_norm": 0.32974767684936523, "learning_rate": 8.205833792274136e-06, "loss": 0.0026, "num_input_tokens_seen": 82436304, "step": 38210 }, { "epoch": 7.013213433657552, "grad_norm": 0.04755587503314018, "learning_rate": 8.20521924922423e-06, "loss": 0.0965, "num_input_tokens_seen": 82447504, "step": 38215 }, { "epoch": 7.014131033217104, "grad_norm": 0.1746625006198883, "learning_rate": 8.204604623965417e-06, "loss": 0.0642, "num_input_tokens_seen": 82457808, "step": 38220 }, { "epoch": 7.015048632776656, "grad_norm": 1.17701256275177, "learning_rate": 8.203989916513462e-06, "loss": 0.0056, "num_input_tokens_seen": 82467728, "step": 38225 }, { "epoch": 7.015966232336209, "grad_norm": 0.29932305216789246, "learning_rate": 8.20337512688413e-06, "loss": 0.0088, "num_input_tokens_seen": 82479056, "step": 38230 }, { "epoch": 7.01688383189576, "grad_norm": 0.022954469546675682, "learning_rate": 8.202760255093192e-06, "loss": 0.087, "num_input_tokens_seen": 82490288, "step": 38235 }, { "epoch": 7.017801431455313, "grad_norm": 8.16012954711914, "learning_rate": 8.202145301156417e-06, "loss": 0.24, "num_input_tokens_seen": 82499760, "step": 38240 }, { "epoch": 7.018719031014865, "grad_norm": 0.16526514291763306, "learning_rate": 8.201530265089579e-06, "loss": 0.183, "num_input_tokens_seen": 82511856, "step": 38245 }, { "epoch": 7.019636630574417, "grad_norm": 0.10536359250545502, "learning_rate": 8.20091514690845e-06, "loss": 0.0025, "num_input_tokens_seen": 82520624, "step": 38250 }, { "epoch": 7.020554230133969, "grad_norm": 0.05947418510913849, "learning_rate": 8.20029994662881e-06, "loss": 0.0016, "num_input_tokens_seen": 82531376, "step": 38255 }, { "epoch": 7.021471829693522, "grad_norm": 6.739100933074951, "learning_rate": 8.199684664266436e-06, "loss": 0.0059, "num_input_tokens_seen": 82542640, "step": 38260 }, { "epoch": 7.0223894292530735, "grad_norm": 0.09344591945409775, "learning_rate": 8.199069299837108e-06, "loss": 0.152, "num_input_tokens_seen": 82552784, "step": 38265 }, { "epoch": 7.023307028812626, "grad_norm": 0.12537536025047302, "learning_rate": 8.198453853356612e-06, "loss": 0.0571, "num_input_tokens_seen": 82564400, "step": 38270 }, { "epoch": 7.0242246283721785, "grad_norm": 0.5830041766166687, "learning_rate": 8.19783832484073e-06, "loss": 0.001, "num_input_tokens_seen": 82575056, "step": 38275 }, { "epoch": 7.02514222793173, "grad_norm": 0.09290331602096558, "learning_rate": 8.197222714305253e-06, "loss": 0.0017, "num_input_tokens_seen": 82585808, "step": 38280 }, { "epoch": 7.026059827491283, "grad_norm": 0.025421038269996643, "learning_rate": 8.196607021765968e-06, "loss": 0.0018, "num_input_tokens_seen": 82597456, "step": 38285 }, { "epoch": 7.026977427050835, "grad_norm": 0.02020939439535141, "learning_rate": 8.195991247238668e-06, "loss": 0.0773, "num_input_tokens_seen": 82608496, "step": 38290 }, { "epoch": 7.027895026610387, "grad_norm": 0.027553144842386246, "learning_rate": 8.195375390739146e-06, "loss": 0.0004, "num_input_tokens_seen": 82618352, "step": 38295 }, { "epoch": 7.028812626169939, "grad_norm": 5.481330871582031, "learning_rate": 8.194759452283196e-06, "loss": 0.3545, "num_input_tokens_seen": 82629168, "step": 38300 }, { "epoch": 7.029730225729492, "grad_norm": 0.05153375118970871, "learning_rate": 8.194143431886619e-06, "loss": 0.0012, "num_input_tokens_seen": 82639696, "step": 38305 }, { "epoch": 7.030647825289044, "grad_norm": 0.5659762620925903, "learning_rate": 8.193527329565211e-06, "loss": 0.0019, "num_input_tokens_seen": 82650512, "step": 38310 }, { "epoch": 7.031565424848596, "grad_norm": 0.09938004612922668, "learning_rate": 8.19291114533478e-06, "loss": 0.0011, "num_input_tokens_seen": 82661488, "step": 38315 }, { "epoch": 7.032483024408148, "grad_norm": 0.33326277136802673, "learning_rate": 8.192294879211124e-06, "loss": 0.2609, "num_input_tokens_seen": 82670608, "step": 38320 }, { "epoch": 7.033400623967701, "grad_norm": 14.779853820800781, "learning_rate": 8.191678531210055e-06, "loss": 0.1143, "num_input_tokens_seen": 82681648, "step": 38325 }, { "epoch": 7.034318223527253, "grad_norm": 0.14576882123947144, "learning_rate": 8.191062101347375e-06, "loss": 0.1174, "num_input_tokens_seen": 82690736, "step": 38330 }, { "epoch": 7.035235823086805, "grad_norm": 0.011062009260058403, "learning_rate": 8.190445589638898e-06, "loss": 0.0011, "num_input_tokens_seen": 82700688, "step": 38335 }, { "epoch": 7.036153422646358, "grad_norm": 0.04185795038938522, "learning_rate": 8.189828996100437e-06, "loss": 0.2451, "num_input_tokens_seen": 82711664, "step": 38340 }, { "epoch": 7.037071022205909, "grad_norm": 0.09721069037914276, "learning_rate": 8.189212320747807e-06, "loss": 0.0982, "num_input_tokens_seen": 82722512, "step": 38345 }, { "epoch": 7.037988621765462, "grad_norm": 0.0176542978733778, "learning_rate": 8.188595563596824e-06, "loss": 0.0985, "num_input_tokens_seen": 82733872, "step": 38350 }, { "epoch": 7.038906221325014, "grad_norm": 0.05224158614873886, "learning_rate": 8.187978724663305e-06, "loss": 0.1135, "num_input_tokens_seen": 82745328, "step": 38355 }, { "epoch": 7.039823820884566, "grad_norm": 0.0035980932880192995, "learning_rate": 8.187361803963074e-06, "loss": 0.0661, "num_input_tokens_seen": 82756720, "step": 38360 }, { "epoch": 7.040741420444118, "grad_norm": 4.432119846343994, "learning_rate": 8.186744801511953e-06, "loss": 0.0096, "num_input_tokens_seen": 82767184, "step": 38365 }, { "epoch": 7.041659020003671, "grad_norm": 0.05967326462268829, "learning_rate": 8.186127717325765e-06, "loss": 0.002, "num_input_tokens_seen": 82778544, "step": 38370 }, { "epoch": 7.0425766195632225, "grad_norm": 0.018810585141181946, "learning_rate": 8.185510551420341e-06, "loss": 0.0015, "num_input_tokens_seen": 82789200, "step": 38375 }, { "epoch": 7.043494219122775, "grad_norm": 0.037823598831892014, "learning_rate": 8.184893303811507e-06, "loss": 0.0462, "num_input_tokens_seen": 82799216, "step": 38380 }, { "epoch": 7.0444118186823275, "grad_norm": 41.1905632019043, "learning_rate": 8.184275974515096e-06, "loss": 0.1504, "num_input_tokens_seen": 82809744, "step": 38385 }, { "epoch": 7.045329418241879, "grad_norm": 0.024449646472930908, "learning_rate": 8.183658563546942e-06, "loss": 0.001, "num_input_tokens_seen": 82820144, "step": 38390 }, { "epoch": 7.046247017801432, "grad_norm": 0.02171342633664608, "learning_rate": 8.18304107092288e-06, "loss": 0.0025, "num_input_tokens_seen": 82830800, "step": 38395 }, { "epoch": 7.047164617360984, "grad_norm": 0.3522546589374542, "learning_rate": 8.182423496658749e-06, "loss": 0.0011, "num_input_tokens_seen": 82840080, "step": 38400 }, { "epoch": 7.048082216920536, "grad_norm": 0.016253873705863953, "learning_rate": 8.181805840770386e-06, "loss": 0.0022, "num_input_tokens_seen": 82849840, "step": 38405 }, { "epoch": 7.048999816480088, "grad_norm": 0.08821461349725723, "learning_rate": 8.181188103273634e-06, "loss": 0.1321, "num_input_tokens_seen": 82860464, "step": 38410 }, { "epoch": 7.049917416039641, "grad_norm": 0.2515413463115692, "learning_rate": 8.18057028418434e-06, "loss": 0.1451, "num_input_tokens_seen": 82872624, "step": 38415 }, { "epoch": 7.050835015599192, "grad_norm": 0.011310391128063202, "learning_rate": 8.179952383518346e-06, "loss": 0.0003, "num_input_tokens_seen": 82884240, "step": 38420 }, { "epoch": 7.051752615158745, "grad_norm": 0.02835441567003727, "learning_rate": 8.1793344012915e-06, "loss": 0.1023, "num_input_tokens_seen": 82894448, "step": 38425 }, { "epoch": 7.052670214718297, "grad_norm": 0.6170671582221985, "learning_rate": 8.178716337519657e-06, "loss": 0.0013, "num_input_tokens_seen": 82905552, "step": 38430 }, { "epoch": 7.053587814277849, "grad_norm": 11.006851196289062, "learning_rate": 8.178098192218666e-06, "loss": 0.0162, "num_input_tokens_seen": 82915984, "step": 38435 }, { "epoch": 7.0545054138374015, "grad_norm": 0.006549495737999678, "learning_rate": 8.177479965404382e-06, "loss": 0.0023, "num_input_tokens_seen": 82926320, "step": 38440 }, { "epoch": 7.055423013396954, "grad_norm": 0.012953070923686028, "learning_rate": 8.176861657092661e-06, "loss": 0.0009, "num_input_tokens_seen": 82936368, "step": 38445 }, { "epoch": 7.056340612956506, "grad_norm": 0.03236011415719986, "learning_rate": 8.176243267299362e-06, "loss": 0.0003, "num_input_tokens_seen": 82945904, "step": 38450 }, { "epoch": 7.057258212516058, "grad_norm": 0.007361624855548143, "learning_rate": 8.175624796040347e-06, "loss": 0.0005, "num_input_tokens_seen": 82957616, "step": 38455 }, { "epoch": 7.058175812075611, "grad_norm": 0.013678056187927723, "learning_rate": 8.175006243331477e-06, "loss": 0.0007, "num_input_tokens_seen": 82968240, "step": 38460 }, { "epoch": 7.059093411635162, "grad_norm": 0.028312139213085175, "learning_rate": 8.174387609188618e-06, "loss": 0.0008, "num_input_tokens_seen": 82979888, "step": 38465 }, { "epoch": 7.060011011194715, "grad_norm": 0.19568926095962524, "learning_rate": 8.173768893627635e-06, "loss": 0.198, "num_input_tokens_seen": 82990160, "step": 38470 }, { "epoch": 7.060928610754267, "grad_norm": 0.0058532594703137875, "learning_rate": 8.173150096664401e-06, "loss": 0.0006, "num_input_tokens_seen": 83000880, "step": 38475 }, { "epoch": 7.061846210313819, "grad_norm": 0.05106458440423012, "learning_rate": 8.172531218314783e-06, "loss": 0.0806, "num_input_tokens_seen": 83010320, "step": 38480 }, { "epoch": 7.062763809873371, "grad_norm": 0.01442707609385252, "learning_rate": 8.17191225859466e-06, "loss": 0.0011, "num_input_tokens_seen": 83020944, "step": 38485 }, { "epoch": 7.063681409432924, "grad_norm": 0.11384464800357819, "learning_rate": 8.171293217519899e-06, "loss": 0.006, "num_input_tokens_seen": 83032272, "step": 38490 }, { "epoch": 7.0645990089924755, "grad_norm": 0.0065459380857646465, "learning_rate": 8.170674095106384e-06, "loss": 0.0978, "num_input_tokens_seen": 83044272, "step": 38495 }, { "epoch": 7.065516608552028, "grad_norm": 58.702598571777344, "learning_rate": 8.170054891369991e-06, "loss": 0.1167, "num_input_tokens_seen": 83055408, "step": 38500 }, { "epoch": 7.0664342081115805, "grad_norm": 0.04500441998243332, "learning_rate": 8.169435606326605e-06, "loss": 0.0007, "num_input_tokens_seen": 83066032, "step": 38505 }, { "epoch": 7.067351807671132, "grad_norm": 0.39840954542160034, "learning_rate": 8.168816239992109e-06, "loss": 0.0168, "num_input_tokens_seen": 83076528, "step": 38510 }, { "epoch": 7.068269407230685, "grad_norm": 0.019089816138148308, "learning_rate": 8.168196792382386e-06, "loss": 0.0021, "num_input_tokens_seen": 83086608, "step": 38515 }, { "epoch": 7.069187006790237, "grad_norm": 0.14516519010066986, "learning_rate": 8.167577263513325e-06, "loss": 0.1475, "num_input_tokens_seen": 83097936, "step": 38520 }, { "epoch": 7.070104606349789, "grad_norm": 1.2882747650146484, "learning_rate": 8.166957653400818e-06, "loss": 0.2674, "num_input_tokens_seen": 83109136, "step": 38525 }, { "epoch": 7.071022205909341, "grad_norm": 0.025800645351409912, "learning_rate": 8.166337962060755e-06, "loss": 0.0023, "num_input_tokens_seen": 83119760, "step": 38530 }, { "epoch": 7.071939805468894, "grad_norm": 159.7135009765625, "learning_rate": 8.16571818950903e-06, "loss": 0.0235, "num_input_tokens_seen": 83130352, "step": 38535 }, { "epoch": 7.072857405028445, "grad_norm": 0.01842787303030491, "learning_rate": 8.165098335761541e-06, "loss": 0.0356, "num_input_tokens_seen": 83141392, "step": 38540 }, { "epoch": 7.073775004587998, "grad_norm": 10.832765579223633, "learning_rate": 8.164478400834184e-06, "loss": 0.2464, "num_input_tokens_seen": 83152016, "step": 38545 }, { "epoch": 7.07469260414755, "grad_norm": 0.00924879964441061, "learning_rate": 8.16385838474286e-06, "loss": 0.104, "num_input_tokens_seen": 83162736, "step": 38550 }, { "epoch": 7.075610203707102, "grad_norm": 0.018672430887818336, "learning_rate": 8.163238287503473e-06, "loss": 0.0042, "num_input_tokens_seen": 83173616, "step": 38555 }, { "epoch": 7.0765278032666545, "grad_norm": 0.17230165004730225, "learning_rate": 8.162618109131928e-06, "loss": 0.0007, "num_input_tokens_seen": 83184944, "step": 38560 }, { "epoch": 7.077445402826207, "grad_norm": 10.79056167602539, "learning_rate": 8.161997849644127e-06, "loss": 0.0938, "num_input_tokens_seen": 83196624, "step": 38565 }, { "epoch": 7.078363002385759, "grad_norm": 0.10161168873310089, "learning_rate": 8.161377509055983e-06, "loss": 0.0023, "num_input_tokens_seen": 83207856, "step": 38570 }, { "epoch": 7.079280601945311, "grad_norm": 0.009725132025778294, "learning_rate": 8.160757087383406e-06, "loss": 0.1255, "num_input_tokens_seen": 83219184, "step": 38575 }, { "epoch": 7.080198201504864, "grad_norm": 0.01779780723154545, "learning_rate": 8.160136584642308e-06, "loss": 0.0167, "num_input_tokens_seen": 83230928, "step": 38580 }, { "epoch": 7.081115801064415, "grad_norm": 0.018109170719981194, "learning_rate": 8.159516000848606e-06, "loss": 0.2264, "num_input_tokens_seen": 83239984, "step": 38585 }, { "epoch": 7.082033400623968, "grad_norm": 65.19976043701172, "learning_rate": 8.158895336018213e-06, "loss": 0.0821, "num_input_tokens_seen": 83251888, "step": 38590 }, { "epoch": 7.08295100018352, "grad_norm": 0.048931222409009933, "learning_rate": 8.158274590167052e-06, "loss": 0.0008, "num_input_tokens_seen": 83263888, "step": 38595 }, { "epoch": 7.083868599743072, "grad_norm": 0.06625372916460037, "learning_rate": 8.157653763311041e-06, "loss": 0.0007, "num_input_tokens_seen": 83274384, "step": 38600 }, { "epoch": 7.084786199302624, "grad_norm": 0.021776404231786728, "learning_rate": 8.157032855466106e-06, "loss": 0.0006, "num_input_tokens_seen": 83284240, "step": 38605 }, { "epoch": 7.085703798862177, "grad_norm": 0.8924374580383301, "learning_rate": 8.156411866648172e-06, "loss": 0.0039, "num_input_tokens_seen": 83294256, "step": 38610 }, { "epoch": 7.086621398421729, "grad_norm": 0.13238157331943512, "learning_rate": 8.155790796873167e-06, "loss": 0.1183, "num_input_tokens_seen": 83306224, "step": 38615 }, { "epoch": 7.087538997981281, "grad_norm": 0.06546897441148758, "learning_rate": 8.155169646157017e-06, "loss": 0.234, "num_input_tokens_seen": 83317200, "step": 38620 }, { "epoch": 7.088456597540834, "grad_norm": 0.02500752918422222, "learning_rate": 8.154548414515655e-06, "loss": 0.037, "num_input_tokens_seen": 83326448, "step": 38625 }, { "epoch": 7.089374197100385, "grad_norm": 0.049415089190006256, "learning_rate": 8.153927101965015e-06, "loss": 0.1615, "num_input_tokens_seen": 83337808, "step": 38630 }, { "epoch": 7.090291796659938, "grad_norm": 0.021963289007544518, "learning_rate": 8.153305708521035e-06, "loss": 0.0002, "num_input_tokens_seen": 83348720, "step": 38635 }, { "epoch": 7.09120939621949, "grad_norm": 0.2979654371738434, "learning_rate": 8.15268423419965e-06, "loss": 0.0018, "num_input_tokens_seen": 83357968, "step": 38640 }, { "epoch": 7.092126995779042, "grad_norm": 181.39137268066406, "learning_rate": 8.1520626790168e-06, "loss": 0.0672, "num_input_tokens_seen": 83368624, "step": 38645 }, { "epoch": 7.093044595338594, "grad_norm": 0.007396049331873655, "learning_rate": 8.151441042988428e-06, "loss": 0.0016, "num_input_tokens_seen": 83380368, "step": 38650 }, { "epoch": 7.093962194898147, "grad_norm": 0.04460005834698677, "learning_rate": 8.150819326130477e-06, "loss": 0.2356, "num_input_tokens_seen": 83389680, "step": 38655 }, { "epoch": 7.0948797944576985, "grad_norm": 1.307515025138855, "learning_rate": 8.150197528458894e-06, "loss": 0.0017, "num_input_tokens_seen": 83400816, "step": 38660 }, { "epoch": 7.095797394017251, "grad_norm": 0.04533524438738823, "learning_rate": 8.149575649989627e-06, "loss": 0.1293, "num_input_tokens_seen": 83412240, "step": 38665 }, { "epoch": 7.0967149935768035, "grad_norm": 0.07168599218130112, "learning_rate": 8.148953690738625e-06, "loss": 0.011, "num_input_tokens_seen": 83425072, "step": 38670 }, { "epoch": 7.097632593136355, "grad_norm": 0.06084795668721199, "learning_rate": 8.148331650721843e-06, "loss": 0.0009, "num_input_tokens_seen": 83435888, "step": 38675 }, { "epoch": 7.098550192695908, "grad_norm": 6.642614364624023, "learning_rate": 8.147709529955233e-06, "loss": 0.0203, "num_input_tokens_seen": 83446864, "step": 38680 }, { "epoch": 7.09946779225546, "grad_norm": 0.009301263839006424, "learning_rate": 8.14708732845475e-06, "loss": 0.1859, "num_input_tokens_seen": 83457616, "step": 38685 }, { "epoch": 7.100385391815012, "grad_norm": 0.005494509823620319, "learning_rate": 8.146465046236357e-06, "loss": 0.0839, "num_input_tokens_seen": 83467312, "step": 38690 }, { "epoch": 7.101302991374564, "grad_norm": 0.08821488171815872, "learning_rate": 8.145842683316013e-06, "loss": 0.0007, "num_input_tokens_seen": 83477392, "step": 38695 }, { "epoch": 7.102220590934117, "grad_norm": 0.020425474271178246, "learning_rate": 8.145220239709676e-06, "loss": 0.2149, "num_input_tokens_seen": 83488720, "step": 38700 }, { "epoch": 7.103138190493668, "grad_norm": 0.13466240465641022, "learning_rate": 8.144597715433316e-06, "loss": 0.0023, "num_input_tokens_seen": 83499312, "step": 38705 }, { "epoch": 7.104055790053221, "grad_norm": 0.034826695919036865, "learning_rate": 8.1439751105029e-06, "loss": 0.0027, "num_input_tokens_seen": 83511056, "step": 38710 }, { "epoch": 7.104973389612773, "grad_norm": 0.11872897297143936, "learning_rate": 8.143352424934394e-06, "loss": 0.0025, "num_input_tokens_seen": 83522384, "step": 38715 }, { "epoch": 7.105890989172325, "grad_norm": 0.058076828718185425, "learning_rate": 8.142729658743771e-06, "loss": 0.001, "num_input_tokens_seen": 83532976, "step": 38720 }, { "epoch": 7.1068085887318775, "grad_norm": 0.5154903531074524, "learning_rate": 8.142106811947002e-06, "loss": 0.1259, "num_input_tokens_seen": 83543568, "step": 38725 }, { "epoch": 7.10772618829143, "grad_norm": 0.00935798604041338, "learning_rate": 8.141483884560063e-06, "loss": 0.2189, "num_input_tokens_seen": 83554320, "step": 38730 }, { "epoch": 7.108643787850982, "grad_norm": 0.028717540204524994, "learning_rate": 8.140860876598931e-06, "loss": 0.0773, "num_input_tokens_seen": 83565328, "step": 38735 }, { "epoch": 7.109561387410534, "grad_norm": 0.0068658869713544846, "learning_rate": 8.140237788079586e-06, "loss": 0.0041, "num_input_tokens_seen": 83575920, "step": 38740 }, { "epoch": 7.110478986970087, "grad_norm": 9.694774627685547, "learning_rate": 8.139614619018011e-06, "loss": 0.1039, "num_input_tokens_seen": 83586096, "step": 38745 }, { "epoch": 7.111396586529638, "grad_norm": 0.658084511756897, "learning_rate": 8.138991369430182e-06, "loss": 0.0022, "num_input_tokens_seen": 83596624, "step": 38750 }, { "epoch": 7.112314186089191, "grad_norm": 0.03477149084210396, "learning_rate": 8.138368039332092e-06, "loss": 0.135, "num_input_tokens_seen": 83607344, "step": 38755 }, { "epoch": 7.113231785648743, "grad_norm": 0.00822758674621582, "learning_rate": 8.137744628739726e-06, "loss": 0.0009, "num_input_tokens_seen": 83618896, "step": 38760 }, { "epoch": 7.114149385208295, "grad_norm": 48.19100570678711, "learning_rate": 8.137121137669072e-06, "loss": 0.0765, "num_input_tokens_seen": 83629072, "step": 38765 }, { "epoch": 7.115066984767847, "grad_norm": 0.02914845012128353, "learning_rate": 8.136497566136126e-06, "loss": 0.1747, "num_input_tokens_seen": 83638800, "step": 38770 }, { "epoch": 7.1159845843274, "grad_norm": 0.11435743421316147, "learning_rate": 8.135873914156875e-06, "loss": 0.1653, "num_input_tokens_seen": 83648720, "step": 38775 }, { "epoch": 7.1169021838869515, "grad_norm": 0.02623840793967247, "learning_rate": 8.13525018174732e-06, "loss": 0.0479, "num_input_tokens_seen": 83659600, "step": 38780 }, { "epoch": 7.117819783446504, "grad_norm": 0.08864232152700424, "learning_rate": 8.134626368923458e-06, "loss": 0.0019, "num_input_tokens_seen": 83669136, "step": 38785 }, { "epoch": 7.1187373830060565, "grad_norm": 0.05727586895227432, "learning_rate": 8.134002475701287e-06, "loss": 0.0341, "num_input_tokens_seen": 83679504, "step": 38790 }, { "epoch": 7.119654982565608, "grad_norm": 0.010640250518918037, "learning_rate": 8.13337850209681e-06, "loss": 0.0921, "num_input_tokens_seen": 83690832, "step": 38795 }, { "epoch": 7.120572582125161, "grad_norm": 0.14432954788208008, "learning_rate": 8.13275444812603e-06, "loss": 0.2776, "num_input_tokens_seen": 83700368, "step": 38800 }, { "epoch": 7.121490181684713, "grad_norm": 0.11905395239591599, "learning_rate": 8.132130313804953e-06, "loss": 0.0031, "num_input_tokens_seen": 83710800, "step": 38805 }, { "epoch": 7.122407781244265, "grad_norm": 0.04114146530628204, "learning_rate": 8.131506099149589e-06, "loss": 0.0009, "num_input_tokens_seen": 83720912, "step": 38810 }, { "epoch": 7.123325380803817, "grad_norm": 0.09924209117889404, "learning_rate": 8.130881804175948e-06, "loss": 0.0006, "num_input_tokens_seen": 83731504, "step": 38815 }, { "epoch": 7.12424298036337, "grad_norm": 0.003990998957306147, "learning_rate": 8.130257428900039e-06, "loss": 0.001, "num_input_tokens_seen": 83742768, "step": 38820 }, { "epoch": 7.125160579922921, "grad_norm": 0.6729706525802612, "learning_rate": 8.129632973337879e-06, "loss": 0.0993, "num_input_tokens_seen": 83754224, "step": 38825 }, { "epoch": 7.126078179482474, "grad_norm": 0.008510622195899487, "learning_rate": 8.129008437505485e-06, "loss": 0.0006, "num_input_tokens_seen": 83764112, "step": 38830 }, { "epoch": 7.126995779042026, "grad_norm": 0.19823899865150452, "learning_rate": 8.128383821418873e-06, "loss": 0.1154, "num_input_tokens_seen": 83774096, "step": 38835 }, { "epoch": 7.127913378601578, "grad_norm": 0.015101250261068344, "learning_rate": 8.127759125094064e-06, "loss": 0.0002, "num_input_tokens_seen": 83786000, "step": 38840 }, { "epoch": 7.1288309781611305, "grad_norm": 0.08314734697341919, "learning_rate": 8.127134348547082e-06, "loss": 0.0051, "num_input_tokens_seen": 83798224, "step": 38845 }, { "epoch": 7.129748577720683, "grad_norm": 0.021464157849550247, "learning_rate": 8.12650949179395e-06, "loss": 0.0005, "num_input_tokens_seen": 83809200, "step": 38850 }, { "epoch": 7.130666177280235, "grad_norm": 0.06837128847837448, "learning_rate": 8.125884554850696e-06, "loss": 0.0758, "num_input_tokens_seen": 83820048, "step": 38855 }, { "epoch": 7.131583776839787, "grad_norm": 88.23151397705078, "learning_rate": 8.125259537733347e-06, "loss": 0.0248, "num_input_tokens_seen": 83831088, "step": 38860 }, { "epoch": 7.13250137639934, "grad_norm": 0.0164782851934433, "learning_rate": 8.124634440457935e-06, "loss": 0.1133, "num_input_tokens_seen": 83841104, "step": 38865 }, { "epoch": 7.133418975958891, "grad_norm": 31.986143112182617, "learning_rate": 8.124009263040491e-06, "loss": 0.0989, "num_input_tokens_seen": 83851664, "step": 38870 }, { "epoch": 7.134336575518444, "grad_norm": 0.03425184637308121, "learning_rate": 8.123384005497053e-06, "loss": 0.0004, "num_input_tokens_seen": 83863888, "step": 38875 }, { "epoch": 7.135254175077996, "grad_norm": 1.5886892080307007, "learning_rate": 8.122758667843655e-06, "loss": 0.0021, "num_input_tokens_seen": 83875344, "step": 38880 }, { "epoch": 7.136171774637548, "grad_norm": 0.035900771617889404, "learning_rate": 8.122133250096337e-06, "loss": 0.0007, "num_input_tokens_seen": 83887312, "step": 38885 }, { "epoch": 7.1370893741971, "grad_norm": 0.7278207540512085, "learning_rate": 8.121507752271142e-06, "loss": 0.1239, "num_input_tokens_seen": 83898992, "step": 38890 }, { "epoch": 7.138006973756653, "grad_norm": 0.2977971136569977, "learning_rate": 8.120882174384109e-06, "loss": 0.001, "num_input_tokens_seen": 83910032, "step": 38895 }, { "epoch": 7.138924573316205, "grad_norm": 0.04008079320192337, "learning_rate": 8.120256516451286e-06, "loss": 0.1257, "num_input_tokens_seen": 83920048, "step": 38900 }, { "epoch": 7.139842172875757, "grad_norm": 0.021047335118055344, "learning_rate": 8.119630778488718e-06, "loss": 0.092, "num_input_tokens_seen": 83930320, "step": 38905 }, { "epoch": 7.14075977243531, "grad_norm": 0.010044347494840622, "learning_rate": 8.119004960512457e-06, "loss": 0.1042, "num_input_tokens_seen": 83940912, "step": 38910 }, { "epoch": 7.141677371994861, "grad_norm": 0.02577705681324005, "learning_rate": 8.118379062538553e-06, "loss": 0.0008, "num_input_tokens_seen": 83951056, "step": 38915 }, { "epoch": 7.142594971554414, "grad_norm": 0.009187142364680767, "learning_rate": 8.117753084583057e-06, "loss": 0.0004, "num_input_tokens_seen": 83962960, "step": 38920 }, { "epoch": 7.143512571113966, "grad_norm": 33.65622329711914, "learning_rate": 8.117127026662028e-06, "loss": 0.1108, "num_input_tokens_seen": 83974000, "step": 38925 }, { "epoch": 7.144430170673518, "grad_norm": 0.005009978078305721, "learning_rate": 8.116500888791523e-06, "loss": 0.0596, "num_input_tokens_seen": 83985168, "step": 38930 }, { "epoch": 7.14534777023307, "grad_norm": 23.487621307373047, "learning_rate": 8.115874670987598e-06, "loss": 0.2921, "num_input_tokens_seen": 83996240, "step": 38935 }, { "epoch": 7.146265369792623, "grad_norm": 0.013531492091715336, "learning_rate": 8.11524837326632e-06, "loss": 0.168, "num_input_tokens_seen": 84006768, "step": 38940 }, { "epoch": 7.1471829693521745, "grad_norm": 0.029547592625021935, "learning_rate": 8.114621995643746e-06, "loss": 0.0291, "num_input_tokens_seen": 84017520, "step": 38945 }, { "epoch": 7.148100568911727, "grad_norm": 0.028656087815761566, "learning_rate": 8.113995538135946e-06, "loss": 0.0268, "num_input_tokens_seen": 84028432, "step": 38950 }, { "epoch": 7.1490181684712795, "grad_norm": 0.015390570275485516, "learning_rate": 8.113369000758988e-06, "loss": 0.0093, "num_input_tokens_seen": 84039408, "step": 38955 }, { "epoch": 7.149935768030831, "grad_norm": 0.006156522314995527, "learning_rate": 8.112742383528939e-06, "loss": 0.0005, "num_input_tokens_seen": 84050352, "step": 38960 }, { "epoch": 7.150853367590384, "grad_norm": 0.018287932500243187, "learning_rate": 8.112115686461873e-06, "loss": 0.2321, "num_input_tokens_seen": 84061488, "step": 38965 }, { "epoch": 7.151770967149936, "grad_norm": 0.3735732436180115, "learning_rate": 8.111488909573863e-06, "loss": 0.0004, "num_input_tokens_seen": 84070960, "step": 38970 }, { "epoch": 7.152688566709488, "grad_norm": 0.00152268644887954, "learning_rate": 8.110862052880983e-06, "loss": 0.0764, "num_input_tokens_seen": 84081936, "step": 38975 }, { "epoch": 7.15360616626904, "grad_norm": 0.006199313793331385, "learning_rate": 8.110235116399315e-06, "loss": 0.0003, "num_input_tokens_seen": 84091216, "step": 38980 }, { "epoch": 7.154523765828593, "grad_norm": 0.008977907709777355, "learning_rate": 8.109608100144935e-06, "loss": 0.0028, "num_input_tokens_seen": 84102224, "step": 38985 }, { "epoch": 7.155441365388144, "grad_norm": 0.0032014648895710707, "learning_rate": 8.108981004133929e-06, "loss": 0.0002, "num_input_tokens_seen": 84113488, "step": 38990 }, { "epoch": 7.156358964947697, "grad_norm": 0.009823990985751152, "learning_rate": 8.108353828382376e-06, "loss": 0.1192, "num_input_tokens_seen": 84124272, "step": 38995 }, { "epoch": 7.157276564507249, "grad_norm": 0.026728853583335876, "learning_rate": 8.107726572906366e-06, "loss": 0.2419, "num_input_tokens_seen": 84134352, "step": 39000 }, { "epoch": 7.158194164066801, "grad_norm": 59.53553771972656, "learning_rate": 8.107099237721987e-06, "loss": 0.1893, "num_input_tokens_seen": 84146288, "step": 39005 }, { "epoch": 7.1591117636263535, "grad_norm": 0.007605449296534061, "learning_rate": 8.106471822845327e-06, "loss": 0.1192, "num_input_tokens_seen": 84158288, "step": 39010 }, { "epoch": 7.160029363185906, "grad_norm": 0.0158257894217968, "learning_rate": 8.10584432829248e-06, "loss": 0.002, "num_input_tokens_seen": 84168880, "step": 39015 }, { "epoch": 7.160946962745458, "grad_norm": 0.07463807612657547, "learning_rate": 8.105216754079538e-06, "loss": 0.0009, "num_input_tokens_seen": 84179376, "step": 39020 }, { "epoch": 7.16186456230501, "grad_norm": 14.515532493591309, "learning_rate": 8.1045891002226e-06, "loss": 0.0031, "num_input_tokens_seen": 84190128, "step": 39025 }, { "epoch": 7.162782161864563, "grad_norm": 0.18085162341594696, "learning_rate": 8.103961366737765e-06, "loss": 0.0007, "num_input_tokens_seen": 84200592, "step": 39030 }, { "epoch": 7.163699761424114, "grad_norm": 0.005311182234436274, "learning_rate": 8.10333355364113e-06, "loss": 0.0002, "num_input_tokens_seen": 84210736, "step": 39035 }, { "epoch": 7.164617360983667, "grad_norm": 31.0074405670166, "learning_rate": 8.102705660948799e-06, "loss": 0.254, "num_input_tokens_seen": 84222320, "step": 39040 }, { "epoch": 7.165534960543219, "grad_norm": 1.0990865230560303, "learning_rate": 8.102077688676875e-06, "loss": 0.0009, "num_input_tokens_seen": 84232272, "step": 39045 }, { "epoch": 7.166452560102771, "grad_norm": 0.11034628003835678, "learning_rate": 8.101449636841468e-06, "loss": 0.001, "num_input_tokens_seen": 84241872, "step": 39050 }, { "epoch": 7.167370159662323, "grad_norm": 0.09945607930421829, "learning_rate": 8.100821505458684e-06, "loss": 0.0008, "num_input_tokens_seen": 84252496, "step": 39055 }, { "epoch": 7.168287759221876, "grad_norm": 0.007458871696144342, "learning_rate": 8.100193294544637e-06, "loss": 0.0003, "num_input_tokens_seen": 84263408, "step": 39060 }, { "epoch": 7.1692053587814275, "grad_norm": 0.007200624328106642, "learning_rate": 8.099565004115432e-06, "loss": 0.0003, "num_input_tokens_seen": 84273616, "step": 39065 }, { "epoch": 7.17012295834098, "grad_norm": 0.051816511899232864, "learning_rate": 8.098936634187193e-06, "loss": 0.1611, "num_input_tokens_seen": 84283312, "step": 39070 }, { "epoch": 7.1710405579005325, "grad_norm": 109.43660736083984, "learning_rate": 8.09830818477603e-06, "loss": 0.0948, "num_input_tokens_seen": 84294576, "step": 39075 }, { "epoch": 7.171958157460084, "grad_norm": 131.45489501953125, "learning_rate": 8.097679655898063e-06, "loss": 0.1587, "num_input_tokens_seen": 84304176, "step": 39080 }, { "epoch": 7.172875757019637, "grad_norm": 0.04702628403902054, "learning_rate": 8.097051047569416e-06, "loss": 0.0137, "num_input_tokens_seen": 84313968, "step": 39085 }, { "epoch": 7.173793356579189, "grad_norm": 0.023601798340678215, "learning_rate": 8.096422359806209e-06, "loss": 0.0006, "num_input_tokens_seen": 84325712, "step": 39090 }, { "epoch": 7.174710956138741, "grad_norm": 0.006723830010741949, "learning_rate": 8.095793592624566e-06, "loss": 0.0006, "num_input_tokens_seen": 84334768, "step": 39095 }, { "epoch": 7.175628555698293, "grad_norm": 0.01763785257935524, "learning_rate": 8.095164746040618e-06, "loss": 0.001, "num_input_tokens_seen": 84346416, "step": 39100 }, { "epoch": 7.176546155257846, "grad_norm": 0.001258147181943059, "learning_rate": 8.094535820070488e-06, "loss": 0.0046, "num_input_tokens_seen": 84356720, "step": 39105 }, { "epoch": 7.177463754817397, "grad_norm": 20.172739028930664, "learning_rate": 8.093906814730313e-06, "loss": 0.2943, "num_input_tokens_seen": 84366192, "step": 39110 }, { "epoch": 7.17838135437695, "grad_norm": 0.03272800147533417, "learning_rate": 8.093277730036221e-06, "loss": 0.1621, "num_input_tokens_seen": 84376784, "step": 39115 }, { "epoch": 7.179298953936502, "grad_norm": 0.02034592255949974, "learning_rate": 8.092648566004352e-06, "loss": 0.1099, "num_input_tokens_seen": 84387760, "step": 39120 }, { "epoch": 7.180216553496054, "grad_norm": 463.06060791015625, "learning_rate": 8.09201932265084e-06, "loss": 0.2424, "num_input_tokens_seen": 84398672, "step": 39125 }, { "epoch": 7.1811341530556065, "grad_norm": 0.003993047401309013, "learning_rate": 8.091389999991824e-06, "loss": 0.0009, "num_input_tokens_seen": 84409808, "step": 39130 }, { "epoch": 7.182051752615159, "grad_norm": 0.02259575016796589, "learning_rate": 8.090760598043444e-06, "loss": 0.1608, "num_input_tokens_seen": 84420752, "step": 39135 }, { "epoch": 7.182969352174711, "grad_norm": 0.4562317430973053, "learning_rate": 8.090131116821846e-06, "loss": 0.2264, "num_input_tokens_seen": 84431216, "step": 39140 }, { "epoch": 7.183886951734263, "grad_norm": 0.006409958936274052, "learning_rate": 8.089501556343175e-06, "loss": 0.0006, "num_input_tokens_seen": 84442928, "step": 39145 }, { "epoch": 7.184804551293816, "grad_norm": 0.4571551978588104, "learning_rate": 8.088871916623577e-06, "loss": 0.0939, "num_input_tokens_seen": 84454224, "step": 39150 }, { "epoch": 7.185722150853367, "grad_norm": 0.9617373943328857, "learning_rate": 8.0882421976792e-06, "loss": 0.0049, "num_input_tokens_seen": 84464464, "step": 39155 }, { "epoch": 7.18663975041292, "grad_norm": 0.13833534717559814, "learning_rate": 8.087612399526201e-06, "loss": 0.088, "num_input_tokens_seen": 84475216, "step": 39160 }, { "epoch": 7.187557349972472, "grad_norm": 0.0031506610102951527, "learning_rate": 8.086982522180726e-06, "loss": 0.053, "num_input_tokens_seen": 84485936, "step": 39165 }, { "epoch": 7.188474949532024, "grad_norm": 0.03803412616252899, "learning_rate": 8.086352565658934e-06, "loss": 0.0002, "num_input_tokens_seen": 84496400, "step": 39170 }, { "epoch": 7.189392549091576, "grad_norm": 0.1219056099653244, "learning_rate": 8.085722529976985e-06, "loss": 0.0003, "num_input_tokens_seen": 84506576, "step": 39175 }, { "epoch": 7.190310148651129, "grad_norm": 35.723575592041016, "learning_rate": 8.085092415151032e-06, "loss": 0.0431, "num_input_tokens_seen": 84517168, "step": 39180 }, { "epoch": 7.191227748210681, "grad_norm": 0.03515179082751274, "learning_rate": 8.084462221197241e-06, "loss": 0.1255, "num_input_tokens_seen": 84527888, "step": 39185 }, { "epoch": 7.192145347770233, "grad_norm": 0.012221084907650948, "learning_rate": 8.083831948131774e-06, "loss": 0.0006, "num_input_tokens_seen": 84538576, "step": 39190 }, { "epoch": 7.193062947329786, "grad_norm": 0.018672535195946693, "learning_rate": 8.0832015959708e-06, "loss": 0.0002, "num_input_tokens_seen": 84549616, "step": 39195 }, { "epoch": 7.193980546889337, "grad_norm": 0.008763376623392105, "learning_rate": 8.082571164730482e-06, "loss": 0.1499, "num_input_tokens_seen": 84560560, "step": 39200 }, { "epoch": 7.19489814644889, "grad_norm": 0.4762893617153168, "learning_rate": 8.08194065442699e-06, "loss": 0.295, "num_input_tokens_seen": 84570448, "step": 39205 }, { "epoch": 7.195815746008442, "grad_norm": 0.05923105403780937, "learning_rate": 8.081310065076497e-06, "loss": 0.0088, "num_input_tokens_seen": 84580816, "step": 39210 }, { "epoch": 7.196733345567994, "grad_norm": 0.03175251558423042, "learning_rate": 8.080679396695177e-06, "loss": 0.0006, "num_input_tokens_seen": 84592144, "step": 39215 }, { "epoch": 7.197650945127546, "grad_norm": 0.047731246799230576, "learning_rate": 8.080048649299203e-06, "loss": 0.0037, "num_input_tokens_seen": 84602352, "step": 39220 }, { "epoch": 7.198568544687099, "grad_norm": 0.13853491842746735, "learning_rate": 8.079417822904759e-06, "loss": 0.1667, "num_input_tokens_seen": 84613904, "step": 39225 }, { "epoch": 7.1994861442466505, "grad_norm": 0.040713097900152206, "learning_rate": 8.078786917528016e-06, "loss": 0.0891, "num_input_tokens_seen": 84624560, "step": 39230 }, { "epoch": 7.200403743806203, "grad_norm": 0.3886054754257202, "learning_rate": 8.078155933185163e-06, "loss": 0.0028, "num_input_tokens_seen": 84635696, "step": 39235 }, { "epoch": 7.2013213433657555, "grad_norm": 0.01668381690979004, "learning_rate": 8.077524869892382e-06, "loss": 0.0012, "num_input_tokens_seen": 84647120, "step": 39240 }, { "epoch": 7.202238942925307, "grad_norm": 0.08926639705896378, "learning_rate": 8.076893727665855e-06, "loss": 0.1138, "num_input_tokens_seen": 84658672, "step": 39245 }, { "epoch": 7.20315654248486, "grad_norm": 0.02517739310860634, "learning_rate": 8.076262506521774e-06, "loss": 0.1659, "num_input_tokens_seen": 84668912, "step": 39250 }, { "epoch": 7.204074142044412, "grad_norm": 19.317373275756836, "learning_rate": 8.075631206476328e-06, "loss": 0.1664, "num_input_tokens_seen": 84679056, "step": 39255 }, { "epoch": 7.204991741603964, "grad_norm": 0.008245447650551796, "learning_rate": 8.074999827545708e-06, "loss": 0.0891, "num_input_tokens_seen": 84689872, "step": 39260 }, { "epoch": 7.205909341163516, "grad_norm": 0.010851309634745121, "learning_rate": 8.074368369746107e-06, "loss": 0.0008, "num_input_tokens_seen": 84701392, "step": 39265 }, { "epoch": 7.206826940723069, "grad_norm": 0.1636255532503128, "learning_rate": 8.073736833093725e-06, "loss": 0.0008, "num_input_tokens_seen": 84711952, "step": 39270 }, { "epoch": 7.20774454028262, "grad_norm": 14.282386779785156, "learning_rate": 8.073105217604754e-06, "loss": 0.2321, "num_input_tokens_seen": 84722256, "step": 39275 }, { "epoch": 7.208662139842173, "grad_norm": 0.03370102122426033, "learning_rate": 8.072473523295398e-06, "loss": 0.001, "num_input_tokens_seen": 84731760, "step": 39280 }, { "epoch": 7.209579739401725, "grad_norm": 0.030376138165593147, "learning_rate": 8.071841750181858e-06, "loss": 0.0023, "num_input_tokens_seen": 84742800, "step": 39285 }, { "epoch": 7.210497338961277, "grad_norm": 1.3002427816390991, "learning_rate": 8.071209898280339e-06, "loss": 0.0033, "num_input_tokens_seen": 84753648, "step": 39290 }, { "epoch": 7.2114149385208295, "grad_norm": 65.60384368896484, "learning_rate": 8.070577967607044e-06, "loss": 0.1437, "num_input_tokens_seen": 84764976, "step": 39295 }, { "epoch": 7.212332538080382, "grad_norm": 0.14438237249851227, "learning_rate": 8.069945958178187e-06, "loss": 0.0299, "num_input_tokens_seen": 84775376, "step": 39300 }, { "epoch": 7.213250137639934, "grad_norm": 0.021192029118537903, "learning_rate": 8.069313870009971e-06, "loss": 0.1178, "num_input_tokens_seen": 84785584, "step": 39305 }, { "epoch": 7.214167737199486, "grad_norm": 0.00620284304022789, "learning_rate": 8.068681703118611e-06, "loss": 0.2703, "num_input_tokens_seen": 84797072, "step": 39310 }, { "epoch": 7.215085336759039, "grad_norm": 15.087959289550781, "learning_rate": 8.068049457520322e-06, "loss": 0.0034, "num_input_tokens_seen": 84808048, "step": 39315 }, { "epoch": 7.21600293631859, "grad_norm": 0.003293028101325035, "learning_rate": 8.06741713323132e-06, "loss": 0.261, "num_input_tokens_seen": 84819120, "step": 39320 }, { "epoch": 7.216920535878143, "grad_norm": 0.16800986230373383, "learning_rate": 8.066784730267822e-06, "loss": 0.0832, "num_input_tokens_seen": 84829456, "step": 39325 }, { "epoch": 7.217838135437695, "grad_norm": 0.03861566632986069, "learning_rate": 8.06615224864605e-06, "loss": 0.0003, "num_input_tokens_seen": 84840048, "step": 39330 }, { "epoch": 7.218755734997247, "grad_norm": 44.6486701965332, "learning_rate": 8.065519688382224e-06, "loss": 0.2931, "num_input_tokens_seen": 84850704, "step": 39335 }, { "epoch": 7.219673334556799, "grad_norm": 0.07056418061256409, "learning_rate": 8.06488704949257e-06, "loss": 0.0028, "num_input_tokens_seen": 84860912, "step": 39340 }, { "epoch": 7.220590934116352, "grad_norm": 0.0482499860227108, "learning_rate": 8.064254331993311e-06, "loss": 0.0013, "num_input_tokens_seen": 84871952, "step": 39345 }, { "epoch": 7.2215085336759035, "grad_norm": 0.11001157015562057, "learning_rate": 8.063621535900679e-06, "loss": 0.0116, "num_input_tokens_seen": 84882256, "step": 39350 }, { "epoch": 7.222426133235456, "grad_norm": 16.90277671813965, "learning_rate": 8.062988661230903e-06, "loss": 0.2688, "num_input_tokens_seen": 84892816, "step": 39355 }, { "epoch": 7.2233437327950085, "grad_norm": 0.7205990552902222, "learning_rate": 8.062355708000215e-06, "loss": 0.0922, "num_input_tokens_seen": 84904528, "step": 39360 }, { "epoch": 7.22426133235456, "grad_norm": 1.1389296054840088, "learning_rate": 8.061722676224848e-06, "loss": 0.0042, "num_input_tokens_seen": 84915568, "step": 39365 }, { "epoch": 7.225178931914113, "grad_norm": 37.81975173950195, "learning_rate": 8.061089565921042e-06, "loss": 0.0948, "num_input_tokens_seen": 84926256, "step": 39370 }, { "epoch": 7.226096531473665, "grad_norm": 131.5319061279297, "learning_rate": 8.060456377105031e-06, "loss": 0.0491, "num_input_tokens_seen": 84937264, "step": 39375 }, { "epoch": 7.227014131033217, "grad_norm": 0.986384391784668, "learning_rate": 8.059823109793058e-06, "loss": 0.0008, "num_input_tokens_seen": 84947504, "step": 39380 }, { "epoch": 7.227931730592769, "grad_norm": 169.74838256835938, "learning_rate": 8.059189764001366e-06, "loss": 0.0134, "num_input_tokens_seen": 84957360, "step": 39385 }, { "epoch": 7.228849330152322, "grad_norm": 0.005004612263292074, "learning_rate": 8.058556339746195e-06, "loss": 0.0002, "num_input_tokens_seen": 84968848, "step": 39390 }, { "epoch": 7.229766929711873, "grad_norm": 0.22239847481250763, "learning_rate": 8.057922837043796e-06, "loss": 0.0008, "num_input_tokens_seen": 84979440, "step": 39395 }, { "epoch": 7.230684529271426, "grad_norm": 0.13186298310756683, "learning_rate": 8.057289255910415e-06, "loss": 0.0954, "num_input_tokens_seen": 84990928, "step": 39400 }, { "epoch": 7.231602128830978, "grad_norm": 0.07824189960956573, "learning_rate": 8.056655596362302e-06, "loss": 0.0003, "num_input_tokens_seen": 85001328, "step": 39405 }, { "epoch": 7.23251972839053, "grad_norm": 0.030641553923487663, "learning_rate": 8.056021858415711e-06, "loss": 0.0009, "num_input_tokens_seen": 85012528, "step": 39410 }, { "epoch": 7.2334373279500825, "grad_norm": 58.061771392822266, "learning_rate": 8.055388042086895e-06, "loss": 0.6704, "num_input_tokens_seen": 85022896, "step": 39415 }, { "epoch": 7.234354927509635, "grad_norm": 0.08233359456062317, "learning_rate": 8.054754147392114e-06, "loss": 0.0004, "num_input_tokens_seen": 85034288, "step": 39420 }, { "epoch": 7.235272527069187, "grad_norm": 0.18229898810386658, "learning_rate": 8.054120174347622e-06, "loss": 0.099, "num_input_tokens_seen": 85044848, "step": 39425 }, { "epoch": 7.236190126628739, "grad_norm": 0.00248822383582592, "learning_rate": 8.05348612296968e-06, "loss": 0.0004, "num_input_tokens_seen": 85054864, "step": 39430 }, { "epoch": 7.237107726188292, "grad_norm": 0.07717051357030869, "learning_rate": 8.052851993274552e-06, "loss": 0.0007, "num_input_tokens_seen": 85066384, "step": 39435 }, { "epoch": 7.238025325747843, "grad_norm": 1.0069891214370728, "learning_rate": 8.052217785278503e-06, "loss": 0.0022, "num_input_tokens_seen": 85077488, "step": 39440 }, { "epoch": 7.238942925307396, "grad_norm": 0.02794305980205536, "learning_rate": 8.051583498997797e-06, "loss": 0.0009, "num_input_tokens_seen": 85088016, "step": 39445 }, { "epoch": 7.239860524866948, "grad_norm": 0.011533543467521667, "learning_rate": 8.050949134448703e-06, "loss": 0.1179, "num_input_tokens_seen": 85098672, "step": 39450 }, { "epoch": 7.2407781244265, "grad_norm": 0.03365049511194229, "learning_rate": 8.050314691647494e-06, "loss": 0.1677, "num_input_tokens_seen": 85110480, "step": 39455 }, { "epoch": 7.241695723986052, "grad_norm": 0.021100904792547226, "learning_rate": 8.04968017061044e-06, "loss": 0.0631, "num_input_tokens_seen": 85122064, "step": 39460 }, { "epoch": 7.242613323545605, "grad_norm": 17.575992584228516, "learning_rate": 8.049045571353816e-06, "loss": 0.2736, "num_input_tokens_seen": 85133264, "step": 39465 }, { "epoch": 7.243530923105157, "grad_norm": 0.07189779728651047, "learning_rate": 8.048410893893898e-06, "loss": 0.1259, "num_input_tokens_seen": 85144688, "step": 39470 }, { "epoch": 7.244448522664709, "grad_norm": 0.5823266506195068, "learning_rate": 8.047776138246968e-06, "loss": 0.0033, "num_input_tokens_seen": 85155600, "step": 39475 }, { "epoch": 7.245366122224262, "grad_norm": 0.05819176882505417, "learning_rate": 8.047141304429301e-06, "loss": 0.0217, "num_input_tokens_seen": 85165744, "step": 39480 }, { "epoch": 7.246283721783813, "grad_norm": 0.11694929748773575, "learning_rate": 8.046506392457183e-06, "loss": 0.0009, "num_input_tokens_seen": 85177552, "step": 39485 }, { "epoch": 7.247201321343366, "grad_norm": 0.31477758288383484, "learning_rate": 8.045871402346897e-06, "loss": 0.0019, "num_input_tokens_seen": 85186992, "step": 39490 }, { "epoch": 7.248118920902918, "grad_norm": 0.007155698258429766, "learning_rate": 8.045236334114732e-06, "loss": 0.1261, "num_input_tokens_seen": 85199024, "step": 39495 }, { "epoch": 7.24903652046247, "grad_norm": 17.941564559936523, "learning_rate": 8.044601187776973e-06, "loss": 0.1714, "num_input_tokens_seen": 85209168, "step": 39500 }, { "epoch": 7.249954120022022, "grad_norm": 0.1065702810883522, "learning_rate": 8.043965963349914e-06, "loss": 0.0074, "num_input_tokens_seen": 85219600, "step": 39505 }, { "epoch": 7.250871719581575, "grad_norm": 0.12928029894828796, "learning_rate": 8.043330660849844e-06, "loss": 0.0995, "num_input_tokens_seen": 85228144, "step": 39510 }, { "epoch": 7.2517893191411265, "grad_norm": 0.10206621885299683, "learning_rate": 8.04269528029306e-06, "loss": 0.1362, "num_input_tokens_seen": 85240048, "step": 39515 }, { "epoch": 7.252706918700679, "grad_norm": 57.800540924072266, "learning_rate": 8.042059821695857e-06, "loss": 0.0976, "num_input_tokens_seen": 85251056, "step": 39520 }, { "epoch": 7.2536245182602315, "grad_norm": 0.0531839057803154, "learning_rate": 8.041424285074535e-06, "loss": 0.0027, "num_input_tokens_seen": 85263344, "step": 39525 }, { "epoch": 7.254542117819783, "grad_norm": 0.026918789371848106, "learning_rate": 8.040788670445394e-06, "loss": 0.0867, "num_input_tokens_seen": 85274704, "step": 39530 }, { "epoch": 7.255459717379336, "grad_norm": 0.12834274768829346, "learning_rate": 8.040152977824736e-06, "loss": 0.0009, "num_input_tokens_seen": 85286320, "step": 39535 }, { "epoch": 7.256377316938888, "grad_norm": 0.04278237000107765, "learning_rate": 8.039517207228867e-06, "loss": 0.0008, "num_input_tokens_seen": 85295984, "step": 39540 }, { "epoch": 7.25729491649844, "grad_norm": 0.05430306866765022, "learning_rate": 8.038881358674092e-06, "loss": 0.0085, "num_input_tokens_seen": 85306832, "step": 39545 }, { "epoch": 7.258212516057992, "grad_norm": 0.0012619440676644444, "learning_rate": 8.038245432176718e-06, "loss": 0.1573, "num_input_tokens_seen": 85317360, "step": 39550 }, { "epoch": 7.259130115617545, "grad_norm": 0.09304967522621155, "learning_rate": 8.037609427753062e-06, "loss": 0.0931, "num_input_tokens_seen": 85327920, "step": 39555 }, { "epoch": 7.260047715177096, "grad_norm": 0.07113432884216309, "learning_rate": 8.036973345419428e-06, "loss": 0.0008, "num_input_tokens_seen": 85339568, "step": 39560 }, { "epoch": 7.260965314736649, "grad_norm": 0.23534086346626282, "learning_rate": 8.036337185192135e-06, "loss": 0.2558, "num_input_tokens_seen": 85350928, "step": 39565 }, { "epoch": 7.261882914296201, "grad_norm": 0.023057399317622185, "learning_rate": 8.0357009470875e-06, "loss": 0.0013, "num_input_tokens_seen": 85361936, "step": 39570 }, { "epoch": 7.262800513855753, "grad_norm": 12.817793846130371, "learning_rate": 8.03506463112184e-06, "loss": 0.2858, "num_input_tokens_seen": 85373200, "step": 39575 }, { "epoch": 7.2637181134153055, "grad_norm": 0.11444313079118729, "learning_rate": 8.034428237311478e-06, "loss": 0.0017, "num_input_tokens_seen": 85383248, "step": 39580 }, { "epoch": 7.264635712974858, "grad_norm": 0.0995156541466713, "learning_rate": 8.033791765672732e-06, "loss": 0.1227, "num_input_tokens_seen": 85395920, "step": 39585 }, { "epoch": 7.26555331253441, "grad_norm": 0.05452390015125275, "learning_rate": 8.033155216221931e-06, "loss": 0.0012, "num_input_tokens_seen": 85406992, "step": 39590 }, { "epoch": 7.266470912093962, "grad_norm": 0.04760563746094704, "learning_rate": 8.0325185889754e-06, "loss": 0.0007, "num_input_tokens_seen": 85417520, "step": 39595 }, { "epoch": 7.267388511653515, "grad_norm": 34.60322952270508, "learning_rate": 8.031881883949467e-06, "loss": 0.1043, "num_input_tokens_seen": 85429424, "step": 39600 }, { "epoch": 7.268306111213066, "grad_norm": 0.03576100990176201, "learning_rate": 8.03124510116046e-06, "loss": 0.0015, "num_input_tokens_seen": 85440464, "step": 39605 }, { "epoch": 7.269223710772619, "grad_norm": 0.06995341181755066, "learning_rate": 8.030608240624717e-06, "loss": 0.1571, "num_input_tokens_seen": 85451088, "step": 39610 }, { "epoch": 7.270141310332171, "grad_norm": 0.027392681688070297, "learning_rate": 8.029971302358568e-06, "loss": 0.0311, "num_input_tokens_seen": 85461104, "step": 39615 }, { "epoch": 7.271058909891723, "grad_norm": 0.005676349624991417, "learning_rate": 8.029334286378352e-06, "loss": 0.0006, "num_input_tokens_seen": 85472016, "step": 39620 }, { "epoch": 7.271976509451275, "grad_norm": 0.32305195927619934, "learning_rate": 8.028697192700407e-06, "loss": 0.0128, "num_input_tokens_seen": 85483056, "step": 39625 }, { "epoch": 7.272894109010828, "grad_norm": 0.03615481033921242, "learning_rate": 8.028060021341074e-06, "loss": 0.0016, "num_input_tokens_seen": 85492944, "step": 39630 }, { "epoch": 7.2738117085703795, "grad_norm": 0.014230937696993351, "learning_rate": 8.027422772316692e-06, "loss": 0.0012, "num_input_tokens_seen": 85504240, "step": 39635 }, { "epoch": 7.274729308129932, "grad_norm": 0.00355530739761889, "learning_rate": 8.026785445643608e-06, "loss": 0.1466, "num_input_tokens_seen": 85513744, "step": 39640 }, { "epoch": 7.2756469076894845, "grad_norm": 0.0118294982239604, "learning_rate": 8.026148041338171e-06, "loss": 0.1883, "num_input_tokens_seen": 85523888, "step": 39645 }, { "epoch": 7.276564507249036, "grad_norm": 0.6516103148460388, "learning_rate": 8.025510559416725e-06, "loss": 0.1119, "num_input_tokens_seen": 85533744, "step": 39650 }, { "epoch": 7.277482106808589, "grad_norm": 0.013514447025954723, "learning_rate": 8.024872999895623e-06, "loss": 0.0408, "num_input_tokens_seen": 85544432, "step": 39655 }, { "epoch": 7.278399706368141, "grad_norm": 0.02355857379734516, "learning_rate": 8.024235362791216e-06, "loss": 0.5842, "num_input_tokens_seen": 85554800, "step": 39660 }, { "epoch": 7.279317305927693, "grad_norm": 0.054235659539699554, "learning_rate": 8.023597648119859e-06, "loss": 0.0009, "num_input_tokens_seen": 85564624, "step": 39665 }, { "epoch": 7.280234905487245, "grad_norm": 128.6490020751953, "learning_rate": 8.02295985589791e-06, "loss": 0.0897, "num_input_tokens_seen": 85573904, "step": 39670 }, { "epoch": 7.281152505046798, "grad_norm": 0.035006046295166016, "learning_rate": 8.022321986141724e-06, "loss": 0.1737, "num_input_tokens_seen": 85586000, "step": 39675 }, { "epoch": 7.282070104606349, "grad_norm": 0.10832570493221283, "learning_rate": 8.021684038867663e-06, "loss": 0.0015, "num_input_tokens_seen": 85597488, "step": 39680 }, { "epoch": 7.282987704165902, "grad_norm": 76.78697967529297, "learning_rate": 8.021046014092091e-06, "loss": 0.0471, "num_input_tokens_seen": 85608848, "step": 39685 }, { "epoch": 7.283905303725454, "grad_norm": 0.04640835151076317, "learning_rate": 8.02040791183137e-06, "loss": 0.0062, "num_input_tokens_seen": 85619152, "step": 39690 }, { "epoch": 7.284822903285006, "grad_norm": 0.05432668328285217, "learning_rate": 8.019769732101868e-06, "loss": 0.0005, "num_input_tokens_seen": 85630352, "step": 39695 }, { "epoch": 7.2857405028445585, "grad_norm": 0.629780650138855, "learning_rate": 8.019131474919953e-06, "loss": 0.2642, "num_input_tokens_seen": 85640208, "step": 39700 }, { "epoch": 7.286658102404111, "grad_norm": 0.033369485288858414, "learning_rate": 8.018493140301994e-06, "loss": 0.3261, "num_input_tokens_seen": 85650416, "step": 39705 }, { "epoch": 7.287575701963663, "grad_norm": 3.9880168437957764, "learning_rate": 8.017854728264363e-06, "loss": 0.116, "num_input_tokens_seen": 85661008, "step": 39710 }, { "epoch": 7.288493301523215, "grad_norm": 0.05842692404985428, "learning_rate": 8.017216238823437e-06, "loss": 0.1013, "num_input_tokens_seen": 85672592, "step": 39715 }, { "epoch": 7.289410901082768, "grad_norm": 0.03621958941221237, "learning_rate": 8.016577671995592e-06, "loss": 0.1299, "num_input_tokens_seen": 85683312, "step": 39720 }, { "epoch": 7.290328500642319, "grad_norm": 0.6774995923042297, "learning_rate": 8.015939027797203e-06, "loss": 0.1261, "num_input_tokens_seen": 85693968, "step": 39725 }, { "epoch": 7.291246100201872, "grad_norm": 0.03886210173368454, "learning_rate": 8.015300306244653e-06, "loss": 0.001, "num_input_tokens_seen": 85705968, "step": 39730 }, { "epoch": 7.292163699761424, "grad_norm": 0.09134116768836975, "learning_rate": 8.014661507354322e-06, "loss": 0.2674, "num_input_tokens_seen": 85715120, "step": 39735 }, { "epoch": 7.293081299320976, "grad_norm": 0.01191716268658638, "learning_rate": 8.014022631142599e-06, "loss": 0.0915, "num_input_tokens_seen": 85726672, "step": 39740 }, { "epoch": 7.293998898880528, "grad_norm": 8.681727409362793, "learning_rate": 8.013383677625864e-06, "loss": 0.1175, "num_input_tokens_seen": 85736656, "step": 39745 }, { "epoch": 7.294916498440081, "grad_norm": 0.4724366366863251, "learning_rate": 8.012744646820509e-06, "loss": 0.0111, "num_input_tokens_seen": 85746800, "step": 39750 }, { "epoch": 7.295834097999633, "grad_norm": 0.0731273964047432, "learning_rate": 8.012105538742922e-06, "loss": 0.1078, "num_input_tokens_seen": 85757936, "step": 39755 }, { "epoch": 7.296751697559185, "grad_norm": 0.2777852714061737, "learning_rate": 8.011466353409498e-06, "loss": 0.0016, "num_input_tokens_seen": 85768496, "step": 39760 }, { "epoch": 7.297669297118738, "grad_norm": 150.9694366455078, "learning_rate": 8.010827090836628e-06, "loss": 0.2805, "num_input_tokens_seen": 85779472, "step": 39765 }, { "epoch": 7.298586896678289, "grad_norm": 0.13567541539669037, "learning_rate": 8.01018775104071e-06, "loss": 0.003, "num_input_tokens_seen": 85790288, "step": 39770 }, { "epoch": 7.299504496237842, "grad_norm": 0.02760034054517746, "learning_rate": 8.00954833403814e-06, "loss": 0.332, "num_input_tokens_seen": 85802320, "step": 39775 }, { "epoch": 7.300422095797394, "grad_norm": 0.03327697888016701, "learning_rate": 8.008908839845321e-06, "loss": 0.088, "num_input_tokens_seen": 85813840, "step": 39780 }, { "epoch": 7.301339695356946, "grad_norm": 0.015609960071742535, "learning_rate": 8.008269268478654e-06, "loss": 0.1732, "num_input_tokens_seen": 85823920, "step": 39785 }, { "epoch": 7.302257294916498, "grad_norm": 0.1036563292145729, "learning_rate": 8.007629619954544e-06, "loss": 0.0013, "num_input_tokens_seen": 85835152, "step": 39790 }, { "epoch": 7.303174894476051, "grad_norm": 0.028678912669420242, "learning_rate": 8.006989894289394e-06, "loss": 0.0004, "num_input_tokens_seen": 85845936, "step": 39795 }, { "epoch": 7.3040924940356025, "grad_norm": 0.16039767861366272, "learning_rate": 8.006350091499613e-06, "loss": 0.0026, "num_input_tokens_seen": 85855312, "step": 39800 }, { "epoch": 7.305010093595155, "grad_norm": 0.025589516386389732, "learning_rate": 8.005710211601613e-06, "loss": 0.1709, "num_input_tokens_seen": 85865936, "step": 39805 }, { "epoch": 7.3059276931547075, "grad_norm": 10.392441749572754, "learning_rate": 8.005070254611802e-06, "loss": 0.1706, "num_input_tokens_seen": 85877456, "step": 39810 }, { "epoch": 7.306845292714259, "grad_norm": 0.06548033654689789, "learning_rate": 8.0044302205466e-06, "loss": 0.1017, "num_input_tokens_seen": 85889008, "step": 39815 }, { "epoch": 7.307762892273812, "grad_norm": 0.00895650964230299, "learning_rate": 8.003790109422417e-06, "loss": 0.1085, "num_input_tokens_seen": 85899664, "step": 39820 }, { "epoch": 7.308680491833364, "grad_norm": 0.0138561325147748, "learning_rate": 8.003149921255673e-06, "loss": 0.0049, "num_input_tokens_seen": 85910064, "step": 39825 }, { "epoch": 7.309598091392916, "grad_norm": 27.071130752563477, "learning_rate": 8.00250965606279e-06, "loss": 0.1967, "num_input_tokens_seen": 85921200, "step": 39830 }, { "epoch": 7.310515690952468, "grad_norm": 1.0680506229400635, "learning_rate": 8.001869313860185e-06, "loss": 0.0031, "num_input_tokens_seen": 85932240, "step": 39835 }, { "epoch": 7.311433290512021, "grad_norm": 0.11089477688074112, "learning_rate": 8.001228894664287e-06, "loss": 0.1673, "num_input_tokens_seen": 85942192, "step": 39840 }, { "epoch": 7.312350890071572, "grad_norm": 0.01679924689233303, "learning_rate": 8.000588398491519e-06, "loss": 0.0004, "num_input_tokens_seen": 85954064, "step": 39845 }, { "epoch": 7.313268489631125, "grad_norm": 26.841970443725586, "learning_rate": 7.999947825358307e-06, "loss": 0.0847, "num_input_tokens_seen": 85964304, "step": 39850 }, { "epoch": 7.314186089190677, "grad_norm": 0.12826131284236908, "learning_rate": 7.999307175281084e-06, "loss": 0.063, "num_input_tokens_seen": 85975760, "step": 39855 }, { "epoch": 7.315103688750229, "grad_norm": 0.031224709004163742, "learning_rate": 7.99866644827628e-06, "loss": 0.3246, "num_input_tokens_seen": 85986320, "step": 39860 }, { "epoch": 7.3160212883097815, "grad_norm": 0.04685518890619278, "learning_rate": 7.998025644360332e-06, "loss": 0.0866, "num_input_tokens_seen": 85997648, "step": 39865 }, { "epoch": 7.316938887869334, "grad_norm": 2.9031341075897217, "learning_rate": 7.99738476354967e-06, "loss": 0.0536, "num_input_tokens_seen": 86008784, "step": 39870 }, { "epoch": 7.317856487428886, "grad_norm": 0.018716705963015556, "learning_rate": 7.996743805860734e-06, "loss": 0.0404, "num_input_tokens_seen": 86019248, "step": 39875 }, { "epoch": 7.318774086988438, "grad_norm": 0.5268092155456543, "learning_rate": 7.996102771309965e-06, "loss": 0.0024, "num_input_tokens_seen": 86030096, "step": 39880 }, { "epoch": 7.319691686547991, "grad_norm": 4.506187915802002, "learning_rate": 7.995461659913803e-06, "loss": 0.2431, "num_input_tokens_seen": 86040944, "step": 39885 }, { "epoch": 7.320609286107542, "grad_norm": 14.882956504821777, "learning_rate": 7.994820471688692e-06, "loss": 0.0191, "num_input_tokens_seen": 86052368, "step": 39890 }, { "epoch": 7.321526885667095, "grad_norm": 0.03257042169570923, "learning_rate": 7.994179206651078e-06, "loss": 0.421, "num_input_tokens_seen": 86062064, "step": 39895 }, { "epoch": 7.322444485226647, "grad_norm": 0.01158080529421568, "learning_rate": 7.993537864817407e-06, "loss": 0.068, "num_input_tokens_seen": 86072208, "step": 39900 }, { "epoch": 7.323362084786199, "grad_norm": 0.06105102226138115, "learning_rate": 7.992896446204131e-06, "loss": 0.2733, "num_input_tokens_seen": 86083504, "step": 39905 }, { "epoch": 7.324279684345751, "grad_norm": 0.05240929499268532, "learning_rate": 7.992254950827698e-06, "loss": 0.0056, "num_input_tokens_seen": 86093520, "step": 39910 }, { "epoch": 7.325197283905304, "grad_norm": 0.015506110154092312, "learning_rate": 7.991613378704564e-06, "loss": 0.0015, "num_input_tokens_seen": 86104144, "step": 39915 }, { "epoch": 7.3261148834648555, "grad_norm": 0.16133736073970795, "learning_rate": 7.990971729851183e-06, "loss": 0.0025, "num_input_tokens_seen": 86113808, "step": 39920 }, { "epoch": 7.327032483024408, "grad_norm": 0.01329719927161932, "learning_rate": 7.990330004284012e-06, "loss": 0.0804, "num_input_tokens_seen": 86123920, "step": 39925 }, { "epoch": 7.3279500825839605, "grad_norm": 0.019292332231998444, "learning_rate": 7.989688202019512e-06, "loss": 0.0927, "num_input_tokens_seen": 86135760, "step": 39930 }, { "epoch": 7.328867682143512, "grad_norm": 0.6196074485778809, "learning_rate": 7.989046323074143e-06, "loss": 0.0024, "num_input_tokens_seen": 86147344, "step": 39935 }, { "epoch": 7.329785281703065, "grad_norm": 0.5671143531799316, "learning_rate": 7.988404367464369e-06, "loss": 0.0056, "num_input_tokens_seen": 86157712, "step": 39940 }, { "epoch": 7.330702881262617, "grad_norm": 0.1467900276184082, "learning_rate": 7.987762335206653e-06, "loss": 0.0809, "num_input_tokens_seen": 86167536, "step": 39945 }, { "epoch": 7.331620480822169, "grad_norm": 0.04630878567695618, "learning_rate": 7.987120226317466e-06, "loss": 0.1226, "num_input_tokens_seen": 86178384, "step": 39950 }, { "epoch": 7.332538080381721, "grad_norm": 0.48969176411628723, "learning_rate": 7.986478040813273e-06, "loss": 0.0019, "num_input_tokens_seen": 86189200, "step": 39955 }, { "epoch": 7.333455679941274, "grad_norm": 0.04863028600811958, "learning_rate": 7.985835778710546e-06, "loss": 0.0006, "num_input_tokens_seen": 86199536, "step": 39960 }, { "epoch": 7.334373279500825, "grad_norm": 0.006543462630361319, "learning_rate": 7.985193440025761e-06, "loss": 0.0235, "num_input_tokens_seen": 86210608, "step": 39965 }, { "epoch": 7.335290879060378, "grad_norm": 0.0123355807736516, "learning_rate": 7.98455102477539e-06, "loss": 0.095, "num_input_tokens_seen": 86221616, "step": 39970 }, { "epoch": 7.33620847861993, "grad_norm": 19.316646575927734, "learning_rate": 7.98390853297591e-06, "loss": 0.0679, "num_input_tokens_seen": 86232688, "step": 39975 }, { "epoch": 7.337126078179482, "grad_norm": 11.214821815490723, "learning_rate": 7.983265964643802e-06, "loss": 0.05, "num_input_tokens_seen": 86243152, "step": 39980 }, { "epoch": 7.3380436777390345, "grad_norm": 0.07445414364337921, "learning_rate": 7.982623319795546e-06, "loss": 0.0064, "num_input_tokens_seen": 86252272, "step": 39985 }, { "epoch": 7.338961277298587, "grad_norm": 0.014530937187373638, "learning_rate": 7.981980598447623e-06, "loss": 0.2976, "num_input_tokens_seen": 86263280, "step": 39990 }, { "epoch": 7.339878876858139, "grad_norm": 0.25050681829452515, "learning_rate": 7.981337800616521e-06, "loss": 0.0065, "num_input_tokens_seen": 86272176, "step": 39995 }, { "epoch": 7.340796476417691, "grad_norm": 40.880165100097656, "learning_rate": 7.980694926318724e-06, "loss": 0.309, "num_input_tokens_seen": 86283056, "step": 40000 }, { "epoch": 7.341714075977244, "grad_norm": 0.12314701825380325, "learning_rate": 7.980051975570721e-06, "loss": 0.1062, "num_input_tokens_seen": 86292816, "step": 40005 }, { "epoch": 7.342631675536795, "grad_norm": 7.969597339630127, "learning_rate": 7.979408948389007e-06, "loss": 0.2872, "num_input_tokens_seen": 86303024, "step": 40010 }, { "epoch": 7.343549275096348, "grad_norm": 0.05028488487005234, "learning_rate": 7.978765844790068e-06, "loss": 0.041, "num_input_tokens_seen": 86314064, "step": 40015 }, { "epoch": 7.3444668746559, "grad_norm": 0.05310416221618652, "learning_rate": 7.978122664790403e-06, "loss": 0.2908, "num_input_tokens_seen": 86324720, "step": 40020 }, { "epoch": 7.345384474215452, "grad_norm": 0.5015154480934143, "learning_rate": 7.977479408406507e-06, "loss": 0.0028, "num_input_tokens_seen": 86336688, "step": 40025 }, { "epoch": 7.346302073775004, "grad_norm": 115.1625747680664, "learning_rate": 7.976836075654879e-06, "loss": 0.0329, "num_input_tokens_seen": 86347792, "step": 40030 }, { "epoch": 7.347219673334557, "grad_norm": 0.3788769245147705, "learning_rate": 7.976192666552018e-06, "loss": 0.1069, "num_input_tokens_seen": 86357232, "step": 40035 }, { "epoch": 7.348137272894109, "grad_norm": 0.1055980771780014, "learning_rate": 7.975549181114429e-06, "loss": 0.0804, "num_input_tokens_seen": 86368048, "step": 40040 }, { "epoch": 7.349054872453661, "grad_norm": 9.258150100708008, "learning_rate": 7.974905619358615e-06, "loss": 0.1134, "num_input_tokens_seen": 86379184, "step": 40045 }, { "epoch": 7.349972472013214, "grad_norm": 16.303438186645508, "learning_rate": 7.974261981301082e-06, "loss": 0.0873, "num_input_tokens_seen": 86390192, "step": 40050 }, { "epoch": 7.350890071572765, "grad_norm": 13.090534210205078, "learning_rate": 7.97361826695834e-06, "loss": 0.01, "num_input_tokens_seen": 86400176, "step": 40055 }, { "epoch": 7.351807671132318, "grad_norm": 0.562396764755249, "learning_rate": 7.972974476346898e-06, "loss": 0.1435, "num_input_tokens_seen": 86410896, "step": 40060 }, { "epoch": 7.35272527069187, "grad_norm": 193.2709503173828, "learning_rate": 7.972330609483266e-06, "loss": 0.1218, "num_input_tokens_seen": 86421552, "step": 40065 }, { "epoch": 7.353642870251422, "grad_norm": 0.2169673889875412, "learning_rate": 7.971686666383963e-06, "loss": 0.0553, "num_input_tokens_seen": 86432816, "step": 40070 }, { "epoch": 7.354560469810974, "grad_norm": 14.884230613708496, "learning_rate": 7.971042647065503e-06, "loss": 0.1825, "num_input_tokens_seen": 86443504, "step": 40075 }, { "epoch": 7.355478069370527, "grad_norm": 3.3626320362091064, "learning_rate": 7.970398551544403e-06, "loss": 0.0045, "num_input_tokens_seen": 86455696, "step": 40080 }, { "epoch": 7.3563956689300785, "grad_norm": 0.0356404148042202, "learning_rate": 7.969754379837184e-06, "loss": 0.0011, "num_input_tokens_seen": 86467280, "step": 40085 }, { "epoch": 7.357313268489631, "grad_norm": 0.09814716130495071, "learning_rate": 7.969110131960368e-06, "loss": 0.0009, "num_input_tokens_seen": 86477584, "step": 40090 }, { "epoch": 7.3582308680491835, "grad_norm": 3.6696531772613525, "learning_rate": 7.968465807930477e-06, "loss": 0.009, "num_input_tokens_seen": 86488496, "step": 40095 }, { "epoch": 7.359148467608735, "grad_norm": 0.040934689342975616, "learning_rate": 7.96782140776404e-06, "loss": 0.0042, "num_input_tokens_seen": 86499280, "step": 40100 }, { "epoch": 7.360066067168288, "grad_norm": 0.012166700325906277, "learning_rate": 7.967176931477583e-06, "loss": 0.0861, "num_input_tokens_seen": 86510320, "step": 40105 }, { "epoch": 7.36098366672784, "grad_norm": 0.01594633422791958, "learning_rate": 7.966532379087639e-06, "loss": 0.0234, "num_input_tokens_seen": 86520496, "step": 40110 }, { "epoch": 7.361901266287392, "grad_norm": 0.0056999921798706055, "learning_rate": 7.965887750610735e-06, "loss": 0.2114, "num_input_tokens_seen": 86532080, "step": 40115 }, { "epoch": 7.362818865846944, "grad_norm": 0.03324554115533829, "learning_rate": 7.965243046063407e-06, "loss": 0.0003, "num_input_tokens_seen": 86542960, "step": 40120 }, { "epoch": 7.363736465406497, "grad_norm": 0.01159948855638504, "learning_rate": 7.964598265462192e-06, "loss": 0.0013, "num_input_tokens_seen": 86554000, "step": 40125 }, { "epoch": 7.364654064966048, "grad_norm": 0.01114640198647976, "learning_rate": 7.963953408823623e-06, "loss": 0.0114, "num_input_tokens_seen": 86565200, "step": 40130 }, { "epoch": 7.365571664525601, "grad_norm": 0.12308640033006668, "learning_rate": 7.963308476164246e-06, "loss": 0.1851, "num_input_tokens_seen": 86575408, "step": 40135 }, { "epoch": 7.366489264085153, "grad_norm": 1.528902530670166, "learning_rate": 7.962663467500597e-06, "loss": 0.1236, "num_input_tokens_seen": 86586288, "step": 40140 }, { "epoch": 7.367406863644705, "grad_norm": 0.004970541223883629, "learning_rate": 7.962018382849224e-06, "loss": 0.1016, "num_input_tokens_seen": 86597680, "step": 40145 }, { "epoch": 7.3683244632042575, "grad_norm": 0.012609078548848629, "learning_rate": 7.961373222226669e-06, "loss": 0.0211, "num_input_tokens_seen": 86608144, "step": 40150 }, { "epoch": 7.36924206276381, "grad_norm": 17.785486221313477, "learning_rate": 7.960727985649481e-06, "loss": 0.0359, "num_input_tokens_seen": 86618576, "step": 40155 }, { "epoch": 7.370159662323362, "grad_norm": 0.025510506704449654, "learning_rate": 7.960082673134208e-06, "loss": 0.0033, "num_input_tokens_seen": 86629712, "step": 40160 }, { "epoch": 7.371077261882914, "grad_norm": 0.01865598000586033, "learning_rate": 7.959437284697403e-06, "loss": 0.0455, "num_input_tokens_seen": 86641680, "step": 40165 }, { "epoch": 7.371994861442467, "grad_norm": 0.06118049845099449, "learning_rate": 7.958791820355619e-06, "loss": 0.0009, "num_input_tokens_seen": 86653232, "step": 40170 }, { "epoch": 7.372912461002018, "grad_norm": 0.00725291995331645, "learning_rate": 7.95814628012541e-06, "loss": 0.1477, "num_input_tokens_seen": 86664880, "step": 40175 }, { "epoch": 7.373830060561571, "grad_norm": 0.03569583222270012, "learning_rate": 7.957500664023332e-06, "loss": 0.0006, "num_input_tokens_seen": 86676496, "step": 40180 }, { "epoch": 7.374747660121123, "grad_norm": 1.4312105178833008, "learning_rate": 7.956854972065948e-06, "loss": 0.1114, "num_input_tokens_seen": 86687792, "step": 40185 }, { "epoch": 7.375665259680676, "grad_norm": 0.019563894718885422, "learning_rate": 7.956209204269815e-06, "loss": 0.0009, "num_input_tokens_seen": 86699504, "step": 40190 }, { "epoch": 7.376582859240227, "grad_norm": 0.06930366158485413, "learning_rate": 7.955563360651499e-06, "loss": 0.1958, "num_input_tokens_seen": 86710448, "step": 40195 }, { "epoch": 7.37750045879978, "grad_norm": 0.034845203161239624, "learning_rate": 7.95491744122756e-06, "loss": 0.0035, "num_input_tokens_seen": 86720400, "step": 40200 }, { "epoch": 7.378418058359332, "grad_norm": 0.022950220853090286, "learning_rate": 7.954271446014572e-06, "loss": 0.1784, "num_input_tokens_seen": 86730352, "step": 40205 }, { "epoch": 7.379335657918884, "grad_norm": 0.06453417986631393, "learning_rate": 7.953625375029099e-06, "loss": 0.0006, "num_input_tokens_seen": 86740880, "step": 40210 }, { "epoch": 7.3802532574784365, "grad_norm": 0.48969244956970215, "learning_rate": 7.952979228287715e-06, "loss": 0.0013, "num_input_tokens_seen": 86751600, "step": 40215 }, { "epoch": 7.381170857037989, "grad_norm": 9.197190284729004, "learning_rate": 7.952333005806987e-06, "loss": 0.2818, "num_input_tokens_seen": 86761936, "step": 40220 }, { "epoch": 7.382088456597541, "grad_norm": 0.06460018455982208, "learning_rate": 7.951686707603495e-06, "loss": 0.2216, "num_input_tokens_seen": 86773008, "step": 40225 }, { "epoch": 7.383006056157093, "grad_norm": 13.853734970092773, "learning_rate": 7.951040333693813e-06, "loss": 0.2405, "num_input_tokens_seen": 86784016, "step": 40230 }, { "epoch": 7.383923655716646, "grad_norm": 0.48641014099121094, "learning_rate": 7.95039388409452e-06, "loss": 0.0024, "num_input_tokens_seen": 86795760, "step": 40235 }, { "epoch": 7.384841255276197, "grad_norm": 0.07326351851224899, "learning_rate": 7.949747358822197e-06, "loss": 0.2394, "num_input_tokens_seen": 86806512, "step": 40240 }, { "epoch": 7.38575885483575, "grad_norm": 0.07773539423942566, "learning_rate": 7.949100757893426e-06, "loss": 0.0011, "num_input_tokens_seen": 86817200, "step": 40245 }, { "epoch": 7.386676454395302, "grad_norm": 0.015343235805630684, "learning_rate": 7.948454081324793e-06, "loss": 0.0014, "num_input_tokens_seen": 86827632, "step": 40250 }, { "epoch": 7.387594053954854, "grad_norm": 0.03168981522321701, "learning_rate": 7.94780732913288e-06, "loss": 0.114, "num_input_tokens_seen": 86837584, "step": 40255 }, { "epoch": 7.388511653514406, "grad_norm": 0.0938049778342247, "learning_rate": 7.947160501334278e-06, "loss": 0.1014, "num_input_tokens_seen": 86849072, "step": 40260 }, { "epoch": 7.389429253073959, "grad_norm": 13.864424705505371, "learning_rate": 7.946513597945577e-06, "loss": 0.1637, "num_input_tokens_seen": 86859952, "step": 40265 }, { "epoch": 7.3903468526335105, "grad_norm": 0.028976446017622948, "learning_rate": 7.945866618983368e-06, "loss": 0.1453, "num_input_tokens_seen": 86869712, "step": 40270 }, { "epoch": 7.391264452193063, "grad_norm": 11.463481903076172, "learning_rate": 7.945219564464249e-06, "loss": 0.2398, "num_input_tokens_seen": 86880720, "step": 40275 }, { "epoch": 7.392182051752616, "grad_norm": 12.945205688476562, "learning_rate": 7.94457243440481e-06, "loss": 0.1292, "num_input_tokens_seen": 86890288, "step": 40280 }, { "epoch": 7.393099651312167, "grad_norm": 246.9935302734375, "learning_rate": 7.943925228821652e-06, "loss": 0.1984, "num_input_tokens_seen": 86901488, "step": 40285 }, { "epoch": 7.39401725087172, "grad_norm": 0.0474872887134552, "learning_rate": 7.943277947731374e-06, "loss": 0.0256, "num_input_tokens_seen": 86912080, "step": 40290 }, { "epoch": 7.394934850431272, "grad_norm": 0.2732612192630768, "learning_rate": 7.94263059115058e-06, "loss": 0.006, "num_input_tokens_seen": 86921936, "step": 40295 }, { "epoch": 7.395852449990824, "grad_norm": 0.11910706013441086, "learning_rate": 7.941983159095872e-06, "loss": 0.0027, "num_input_tokens_seen": 86933424, "step": 40300 }, { "epoch": 7.396770049550376, "grad_norm": 0.2136976420879364, "learning_rate": 7.941335651583856e-06, "loss": 0.2539, "num_input_tokens_seen": 86944464, "step": 40305 }, { "epoch": 7.397687649109929, "grad_norm": 0.10683713108301163, "learning_rate": 7.940688068631136e-06, "loss": 0.0445, "num_input_tokens_seen": 86956336, "step": 40310 }, { "epoch": 7.39860524866948, "grad_norm": 0.28391358256340027, "learning_rate": 7.940040410254328e-06, "loss": 0.0344, "num_input_tokens_seen": 86968528, "step": 40315 }, { "epoch": 7.399522848229033, "grad_norm": 0.01896618865430355, "learning_rate": 7.93939267647004e-06, "loss": 0.0549, "num_input_tokens_seen": 86979664, "step": 40320 }, { "epoch": 7.4004404477885855, "grad_norm": 0.02728462405502796, "learning_rate": 7.938744867294883e-06, "loss": 0.1159, "num_input_tokens_seen": 86990480, "step": 40325 }, { "epoch": 7.401358047348137, "grad_norm": 0.0077684842981398106, "learning_rate": 7.938096982745478e-06, "loss": 0.0993, "num_input_tokens_seen": 87000752, "step": 40330 }, { "epoch": 7.40227564690769, "grad_norm": 0.357391893863678, "learning_rate": 7.937449022838438e-06, "loss": 0.2516, "num_input_tokens_seen": 87010352, "step": 40335 }, { "epoch": 7.403193246467242, "grad_norm": 0.14159120619297028, "learning_rate": 7.93680098759038e-06, "loss": 0.0017, "num_input_tokens_seen": 87020720, "step": 40340 }, { "epoch": 7.404110846026794, "grad_norm": 0.08198591321706772, "learning_rate": 7.936152877017933e-06, "loss": 0.0021, "num_input_tokens_seen": 87030896, "step": 40345 }, { "epoch": 7.405028445586346, "grad_norm": 0.14094780385494232, "learning_rate": 7.935504691137712e-06, "loss": 0.0863, "num_input_tokens_seen": 87041072, "step": 40350 }, { "epoch": 7.405946045145899, "grad_norm": 0.04071710258722305, "learning_rate": 7.934856429966347e-06, "loss": 0.0009, "num_input_tokens_seen": 87051184, "step": 40355 }, { "epoch": 7.40686364470545, "grad_norm": 0.045139189809560776, "learning_rate": 7.934208093520462e-06, "loss": 0.0933, "num_input_tokens_seen": 87061776, "step": 40360 }, { "epoch": 7.407781244265003, "grad_norm": 0.017280079424381256, "learning_rate": 7.933559681816687e-06, "loss": 0.0006, "num_input_tokens_seen": 87071856, "step": 40365 }, { "epoch": 7.408698843824555, "grad_norm": 2.1781556606292725, "learning_rate": 7.932911194871656e-06, "loss": 0.0012, "num_input_tokens_seen": 87082032, "step": 40370 }, { "epoch": 7.409616443384107, "grad_norm": 0.030477674677968025, "learning_rate": 7.932262632701995e-06, "loss": 0.0002, "num_input_tokens_seen": 87093776, "step": 40375 }, { "epoch": 7.4105340429436595, "grad_norm": 0.007980328053236008, "learning_rate": 7.931613995324343e-06, "loss": 0.1038, "num_input_tokens_seen": 87105072, "step": 40380 }, { "epoch": 7.411451642503212, "grad_norm": 0.3463332951068878, "learning_rate": 7.930965282755334e-06, "loss": 0.0007, "num_input_tokens_seen": 87114864, "step": 40385 }, { "epoch": 7.412369242062764, "grad_norm": 0.04386841878294945, "learning_rate": 7.930316495011609e-06, "loss": 0.1447, "num_input_tokens_seen": 87125872, "step": 40390 }, { "epoch": 7.413286841622316, "grad_norm": 1.64165198802948, "learning_rate": 7.92966763210981e-06, "loss": 0.0749, "num_input_tokens_seen": 87136176, "step": 40395 }, { "epoch": 7.414204441181869, "grad_norm": 0.07289594411849976, "learning_rate": 7.929018694066575e-06, "loss": 0.0037, "num_input_tokens_seen": 87146064, "step": 40400 }, { "epoch": 7.41512204074142, "grad_norm": 7.803406715393066, "learning_rate": 7.92836968089855e-06, "loss": 0.2487, "num_input_tokens_seen": 87158096, "step": 40405 }, { "epoch": 7.416039640300973, "grad_norm": 76.34597778320312, "learning_rate": 7.927720592622382e-06, "loss": 0.248, "num_input_tokens_seen": 87168784, "step": 40410 }, { "epoch": 7.416957239860525, "grad_norm": 0.03708821162581444, "learning_rate": 7.927071429254715e-06, "loss": 0.0061, "num_input_tokens_seen": 87179536, "step": 40415 }, { "epoch": 7.417874839420077, "grad_norm": 0.0366135910153389, "learning_rate": 7.926422190812206e-06, "loss": 0.0566, "num_input_tokens_seen": 87189936, "step": 40420 }, { "epoch": 7.418792438979629, "grad_norm": 0.023376088589429855, "learning_rate": 7.925772877311503e-06, "loss": 0.0195, "num_input_tokens_seen": 87201520, "step": 40425 }, { "epoch": 7.419710038539182, "grad_norm": 0.006152731366455555, "learning_rate": 7.92512348876926e-06, "loss": 0.0949, "num_input_tokens_seen": 87212272, "step": 40430 }, { "epoch": 7.4206276380987335, "grad_norm": 10.061423301696777, "learning_rate": 7.924474025202131e-06, "loss": 0.0039, "num_input_tokens_seen": 87223600, "step": 40435 }, { "epoch": 7.421545237658286, "grad_norm": 0.06382148712873459, "learning_rate": 7.923824486626778e-06, "loss": 0.0022, "num_input_tokens_seen": 87234800, "step": 40440 }, { "epoch": 7.4224628372178385, "grad_norm": 0.11529096215963364, "learning_rate": 7.923174873059859e-06, "loss": 0.1516, "num_input_tokens_seen": 87245040, "step": 40445 }, { "epoch": 7.42338043677739, "grad_norm": 0.11021193116903305, "learning_rate": 7.922525184518032e-06, "loss": 0.1221, "num_input_tokens_seen": 87255056, "step": 40450 }, { "epoch": 7.424298036336943, "grad_norm": 0.09035177528858185, "learning_rate": 7.921875421017966e-06, "loss": 0.0166, "num_input_tokens_seen": 87265808, "step": 40455 }, { "epoch": 7.425215635896495, "grad_norm": 30.83956527709961, "learning_rate": 7.921225582576323e-06, "loss": 0.1632, "num_input_tokens_seen": 87277456, "step": 40460 }, { "epoch": 7.426133235456047, "grad_norm": 0.025441044941544533, "learning_rate": 7.920575669209774e-06, "loss": 0.1388, "num_input_tokens_seen": 87288720, "step": 40465 }, { "epoch": 7.427050835015599, "grad_norm": 0.2061479389667511, "learning_rate": 7.91992568093498e-06, "loss": 0.1013, "num_input_tokens_seen": 87299504, "step": 40470 }, { "epoch": 7.427968434575152, "grad_norm": 0.029589679092168808, "learning_rate": 7.919275617768622e-06, "loss": 0.1049, "num_input_tokens_seen": 87309936, "step": 40475 }, { "epoch": 7.428886034134703, "grad_norm": 0.22033381462097168, "learning_rate": 7.918625479727368e-06, "loss": 0.1271, "num_input_tokens_seen": 87320752, "step": 40480 }, { "epoch": 7.429803633694256, "grad_norm": 0.03984259441494942, "learning_rate": 7.917975266827893e-06, "loss": 0.1804, "num_input_tokens_seen": 87333520, "step": 40485 }, { "epoch": 7.430721233253808, "grad_norm": 0.5537673830986023, "learning_rate": 7.917324979086878e-06, "loss": 0.0487, "num_input_tokens_seen": 87343376, "step": 40490 }, { "epoch": 7.43163883281336, "grad_norm": 0.0031799718271940947, "learning_rate": 7.916674616520995e-06, "loss": 0.2674, "num_input_tokens_seen": 87353680, "step": 40495 }, { "epoch": 7.4325564323729125, "grad_norm": 0.0950118824839592, "learning_rate": 7.91602417914693e-06, "loss": 0.2642, "num_input_tokens_seen": 87365040, "step": 40500 }, { "epoch": 7.433474031932465, "grad_norm": 0.2550065219402313, "learning_rate": 7.915373666981364e-06, "loss": 0.13, "num_input_tokens_seen": 87375280, "step": 40505 }, { "epoch": 7.434391631492017, "grad_norm": 0.048575494438409805, "learning_rate": 7.914723080040982e-06, "loss": 0.0037, "num_input_tokens_seen": 87386096, "step": 40510 }, { "epoch": 7.435309231051569, "grad_norm": 0.042363930493593216, "learning_rate": 7.91407241834247e-06, "loss": 0.3086, "num_input_tokens_seen": 87397456, "step": 40515 }, { "epoch": 7.436226830611122, "grad_norm": 0.27686813473701477, "learning_rate": 7.913421681902518e-06, "loss": 0.1265, "num_input_tokens_seen": 87408368, "step": 40520 }, { "epoch": 7.437144430170673, "grad_norm": 41.424354553222656, "learning_rate": 7.912770870737814e-06, "loss": 0.202, "num_input_tokens_seen": 87419696, "step": 40525 }, { "epoch": 7.438062029730226, "grad_norm": 0.04542771354317665, "learning_rate": 7.912119984865052e-06, "loss": 0.0066, "num_input_tokens_seen": 87427856, "step": 40530 }, { "epoch": 7.438979629289778, "grad_norm": 13.447009086608887, "learning_rate": 7.911469024300927e-06, "loss": 0.224, "num_input_tokens_seen": 87439184, "step": 40535 }, { "epoch": 7.43989722884933, "grad_norm": 0.04720582440495491, "learning_rate": 7.910817989062131e-06, "loss": 0.0018, "num_input_tokens_seen": 87450640, "step": 40540 }, { "epoch": 7.440814828408882, "grad_norm": 648.2572631835938, "learning_rate": 7.910166879165367e-06, "loss": 0.1326, "num_input_tokens_seen": 87461104, "step": 40545 }, { "epoch": 7.441732427968435, "grad_norm": 0.04164405167102814, "learning_rate": 7.909515694627333e-06, "loss": 0.0011, "num_input_tokens_seen": 87472176, "step": 40550 }, { "epoch": 7.4426500275279865, "grad_norm": 0.022093424573540688, "learning_rate": 7.908864435464728e-06, "loss": 0.0026, "num_input_tokens_seen": 87483024, "step": 40555 }, { "epoch": 7.443567627087539, "grad_norm": 0.03458612784743309, "learning_rate": 7.908213101694263e-06, "loss": 0.0017, "num_input_tokens_seen": 87491856, "step": 40560 }, { "epoch": 7.444485226647092, "grad_norm": 0.17291168868541718, "learning_rate": 7.907561693332638e-06, "loss": 0.0407, "num_input_tokens_seen": 87503376, "step": 40565 }, { "epoch": 7.445402826206643, "grad_norm": 0.0069449604488909245, "learning_rate": 7.90691021039656e-06, "loss": 0.0012, "num_input_tokens_seen": 87514640, "step": 40570 }, { "epoch": 7.446320425766196, "grad_norm": 0.014331445097923279, "learning_rate": 7.906258652902741e-06, "loss": 0.1177, "num_input_tokens_seen": 87524784, "step": 40575 }, { "epoch": 7.447238025325748, "grad_norm": 0.03828149661421776, "learning_rate": 7.905607020867892e-06, "loss": 0.1076, "num_input_tokens_seen": 87534640, "step": 40580 }, { "epoch": 7.4481556248853, "grad_norm": 0.07774616777896881, "learning_rate": 7.904955314308726e-06, "loss": 0.0008, "num_input_tokens_seen": 87546224, "step": 40585 }, { "epoch": 7.449073224444852, "grad_norm": 0.8856281042098999, "learning_rate": 7.90430353324196e-06, "loss": 0.0014, "num_input_tokens_seen": 87556784, "step": 40590 }, { "epoch": 7.449990824004405, "grad_norm": 0.030406944453716278, "learning_rate": 7.903651677684308e-06, "loss": 0.0985, "num_input_tokens_seen": 87567856, "step": 40595 }, { "epoch": 7.450908423563956, "grad_norm": 0.010793589986860752, "learning_rate": 7.902999747652492e-06, "loss": 0.0008, "num_input_tokens_seen": 87578896, "step": 40600 }, { "epoch": 7.451826023123509, "grad_norm": 0.08663086593151093, "learning_rate": 7.90234774316323e-06, "loss": 0.27, "num_input_tokens_seen": 87588944, "step": 40605 }, { "epoch": 7.4527436226830615, "grad_norm": 0.016764020547270775, "learning_rate": 7.901695664233248e-06, "loss": 0.0027, "num_input_tokens_seen": 87600432, "step": 40610 }, { "epoch": 7.453661222242613, "grad_norm": 0.005059258546680212, "learning_rate": 7.90104351087927e-06, "loss": 0.0005, "num_input_tokens_seen": 87610672, "step": 40615 }, { "epoch": 7.454578821802166, "grad_norm": 74.61815643310547, "learning_rate": 7.90039128311802e-06, "loss": 0.3398, "num_input_tokens_seen": 87621040, "step": 40620 }, { "epoch": 7.455496421361718, "grad_norm": 0.01682448573410511, "learning_rate": 7.899738980966231e-06, "loss": 0.1194, "num_input_tokens_seen": 87630640, "step": 40625 }, { "epoch": 7.45641402092127, "grad_norm": 0.04635811597108841, "learning_rate": 7.89908660444063e-06, "loss": 0.001, "num_input_tokens_seen": 87642160, "step": 40630 }, { "epoch": 7.457331620480822, "grad_norm": 0.1356719583272934, "learning_rate": 7.89843415355795e-06, "loss": 0.0018, "num_input_tokens_seen": 87653168, "step": 40635 }, { "epoch": 7.458249220040375, "grad_norm": 13.493560791015625, "learning_rate": 7.897781628334928e-06, "loss": 0.151, "num_input_tokens_seen": 87663824, "step": 40640 }, { "epoch": 7.459166819599926, "grad_norm": 0.059328317642211914, "learning_rate": 7.897129028788297e-06, "loss": 0.0009, "num_input_tokens_seen": 87674032, "step": 40645 }, { "epoch": 7.460084419159479, "grad_norm": 0.005713314283639193, "learning_rate": 7.896476354934798e-06, "loss": 0.1386, "num_input_tokens_seen": 87685424, "step": 40650 }, { "epoch": 7.461002018719031, "grad_norm": 0.03400533273816109, "learning_rate": 7.895823606791169e-06, "loss": 0.0008, "num_input_tokens_seen": 87697424, "step": 40655 }, { "epoch": 7.461919618278583, "grad_norm": 0.06041005626320839, "learning_rate": 7.895170784374152e-06, "loss": 0.0006, "num_input_tokens_seen": 87708400, "step": 40660 }, { "epoch": 7.4628372178381355, "grad_norm": 0.04555423930287361, "learning_rate": 7.894517887700492e-06, "loss": 0.0013, "num_input_tokens_seen": 87719280, "step": 40665 }, { "epoch": 7.463754817397688, "grad_norm": 0.03986964002251625, "learning_rate": 7.893864916786934e-06, "loss": 0.2203, "num_input_tokens_seen": 87729456, "step": 40670 }, { "epoch": 7.46467241695724, "grad_norm": 0.0626319944858551, "learning_rate": 7.893211871650226e-06, "loss": 0.0009, "num_input_tokens_seen": 87740272, "step": 40675 }, { "epoch": 7.465590016516792, "grad_norm": 0.007413675542920828, "learning_rate": 7.892558752307118e-06, "loss": 0.1503, "num_input_tokens_seen": 87751088, "step": 40680 }, { "epoch": 7.466507616076345, "grad_norm": 0.042893197387456894, "learning_rate": 7.891905558774359e-06, "loss": 0.1641, "num_input_tokens_seen": 87760592, "step": 40685 }, { "epoch": 7.467425215635896, "grad_norm": 0.052811212837696075, "learning_rate": 7.891252291068707e-06, "loss": 0.001, "num_input_tokens_seen": 87771472, "step": 40690 }, { "epoch": 7.468342815195449, "grad_norm": 0.09530438482761383, "learning_rate": 7.890598949206915e-06, "loss": 0.0983, "num_input_tokens_seen": 87783312, "step": 40695 }, { "epoch": 7.469260414755001, "grad_norm": 0.08160915225744247, "learning_rate": 7.889945533205738e-06, "loss": 0.0011, "num_input_tokens_seen": 87793680, "step": 40700 }, { "epoch": 7.470178014314553, "grad_norm": 1.977920651435852, "learning_rate": 7.88929204308194e-06, "loss": 0.0023, "num_input_tokens_seen": 87804240, "step": 40705 }, { "epoch": 7.471095613874105, "grad_norm": 0.01597156934440136, "learning_rate": 7.888638478852275e-06, "loss": 0.1515, "num_input_tokens_seen": 87814608, "step": 40710 }, { "epoch": 7.472013213433658, "grad_norm": 0.010236704722046852, "learning_rate": 7.887984840533514e-06, "loss": 0.1756, "num_input_tokens_seen": 87825744, "step": 40715 }, { "epoch": 7.4729308129932095, "grad_norm": 0.06327258795499802, "learning_rate": 7.887331128142415e-06, "loss": 0.0018, "num_input_tokens_seen": 87837168, "step": 40720 }, { "epoch": 7.473848412552762, "grad_norm": 66.8871841430664, "learning_rate": 7.88667734169575e-06, "loss": 0.2099, "num_input_tokens_seen": 87847952, "step": 40725 }, { "epoch": 7.4747660121123145, "grad_norm": 0.3291025459766388, "learning_rate": 7.886023481210281e-06, "loss": 0.0057, "num_input_tokens_seen": 87858256, "step": 40730 }, { "epoch": 7.475683611671866, "grad_norm": 0.028836874291300774, "learning_rate": 7.885369546702785e-06, "loss": 0.1448, "num_input_tokens_seen": 87869584, "step": 40735 }, { "epoch": 7.476601211231419, "grad_norm": 11.098002433776855, "learning_rate": 7.884715538190034e-06, "loss": 0.1422, "num_input_tokens_seen": 87878960, "step": 40740 }, { "epoch": 7.477518810790971, "grad_norm": 0.01982114650309086, "learning_rate": 7.884061455688797e-06, "loss": 0.0011, "num_input_tokens_seen": 87889360, "step": 40745 }, { "epoch": 7.478436410350523, "grad_norm": 0.3758523762226105, "learning_rate": 7.883407299215856e-06, "loss": 0.1208, "num_input_tokens_seen": 87900784, "step": 40750 }, { "epoch": 7.479354009910075, "grad_norm": 0.041286054998636246, "learning_rate": 7.882753068787984e-06, "loss": 0.0021, "num_input_tokens_seen": 87911664, "step": 40755 }, { "epoch": 7.480271609469628, "grad_norm": 80.44457244873047, "learning_rate": 7.882098764421968e-06, "loss": 0.2292, "num_input_tokens_seen": 87921872, "step": 40760 }, { "epoch": 7.481189209029179, "grad_norm": 17.824466705322266, "learning_rate": 7.881444386134583e-06, "loss": 0.1421, "num_input_tokens_seen": 87932720, "step": 40765 }, { "epoch": 7.482106808588732, "grad_norm": 0.036446452140808105, "learning_rate": 7.880789933942614e-06, "loss": 0.1481, "num_input_tokens_seen": 87944560, "step": 40770 }, { "epoch": 7.483024408148284, "grad_norm": 0.09829321503639221, "learning_rate": 7.88013540786285e-06, "loss": 0.1217, "num_input_tokens_seen": 87954736, "step": 40775 }, { "epoch": 7.483942007707836, "grad_norm": 0.08916877955198288, "learning_rate": 7.879480807912077e-06, "loss": 0.0006, "num_input_tokens_seen": 87965744, "step": 40780 }, { "epoch": 7.4848596072673885, "grad_norm": 0.014637270011007786, "learning_rate": 7.878826134107082e-06, "loss": 0.0967, "num_input_tokens_seen": 87977104, "step": 40785 }, { "epoch": 7.485777206826941, "grad_norm": 0.10725722461938858, "learning_rate": 7.87817138646466e-06, "loss": 0.1416, "num_input_tokens_seen": 87989168, "step": 40790 }, { "epoch": 7.486694806386493, "grad_norm": 0.305402934551239, "learning_rate": 7.877516565001602e-06, "loss": 0.0021, "num_input_tokens_seen": 87999408, "step": 40795 }, { "epoch": 7.487612405946045, "grad_norm": 0.3229624927043915, "learning_rate": 7.876861669734703e-06, "loss": 0.1022, "num_input_tokens_seen": 88010768, "step": 40800 }, { "epoch": 7.488530005505598, "grad_norm": 0.09673959016799927, "learning_rate": 7.876206700680762e-06, "loss": 0.0016, "num_input_tokens_seen": 88021488, "step": 40805 }, { "epoch": 7.489447605065149, "grad_norm": 0.09056004136800766, "learning_rate": 7.875551657856577e-06, "loss": 0.0596, "num_input_tokens_seen": 88031568, "step": 40810 }, { "epoch": 7.490365204624702, "grad_norm": 0.29766401648521423, "learning_rate": 7.87489654127895e-06, "loss": 0.1392, "num_input_tokens_seen": 88041392, "step": 40815 }, { "epoch": 7.491282804184254, "grad_norm": 0.01609654724597931, "learning_rate": 7.87424135096468e-06, "loss": 0.0008, "num_input_tokens_seen": 88052688, "step": 40820 }, { "epoch": 7.492200403743806, "grad_norm": 0.020973747596144676, "learning_rate": 7.873586086930573e-06, "loss": 0.2385, "num_input_tokens_seen": 88064144, "step": 40825 }, { "epoch": 7.493118003303358, "grad_norm": 8.601834297180176, "learning_rate": 7.87293074919344e-06, "loss": 0.3076, "num_input_tokens_seen": 88075600, "step": 40830 }, { "epoch": 7.494035602862911, "grad_norm": 0.019379280507564545, "learning_rate": 7.872275337770084e-06, "loss": 0.1957, "num_input_tokens_seen": 88087984, "step": 40835 }, { "epoch": 7.4949532024224625, "grad_norm": 0.02743236906826496, "learning_rate": 7.871619852677317e-06, "loss": 0.0011, "num_input_tokens_seen": 88098352, "step": 40840 }, { "epoch": 7.495870801982015, "grad_norm": 0.08404617011547089, "learning_rate": 7.870964293931952e-06, "loss": 0.1731, "num_input_tokens_seen": 88109200, "step": 40845 }, { "epoch": 7.496788401541568, "grad_norm": 0.012344694696366787, "learning_rate": 7.870308661550802e-06, "loss": 0.0007, "num_input_tokens_seen": 88120336, "step": 40850 }, { "epoch": 7.497706001101119, "grad_norm": 0.11060093343257904, "learning_rate": 7.869652955550684e-06, "loss": 0.0019, "num_input_tokens_seen": 88131088, "step": 40855 }, { "epoch": 7.498623600660672, "grad_norm": 0.0385175421833992, "learning_rate": 7.868997175948417e-06, "loss": 0.0711, "num_input_tokens_seen": 88143184, "step": 40860 }, { "epoch": 7.499541200220224, "grad_norm": 4.150918006896973, "learning_rate": 7.868341322760815e-06, "loss": 0.0038, "num_input_tokens_seen": 88153584, "step": 40865 }, { "epoch": 7.500458799779776, "grad_norm": 0.07329311221837997, "learning_rate": 7.867685396004704e-06, "loss": 0.2423, "num_input_tokens_seen": 88165264, "step": 40870 }, { "epoch": 7.501376399339328, "grad_norm": 0.303585022687912, "learning_rate": 7.86702939569691e-06, "loss": 0.1326, "num_input_tokens_seen": 88177136, "step": 40875 }, { "epoch": 7.502293998898881, "grad_norm": 0.27251219749450684, "learning_rate": 7.866373321854255e-06, "loss": 0.1618, "num_input_tokens_seen": 88187504, "step": 40880 }, { "epoch": 7.503211598458432, "grad_norm": 0.03091292828321457, "learning_rate": 7.865717174493566e-06, "loss": 0.003, "num_input_tokens_seen": 88197840, "step": 40885 }, { "epoch": 7.504129198017985, "grad_norm": 0.13067227602005005, "learning_rate": 7.865060953631672e-06, "loss": 0.0025, "num_input_tokens_seen": 88207888, "step": 40890 }, { "epoch": 7.5050467975775375, "grad_norm": 0.14251458644866943, "learning_rate": 7.864404659285406e-06, "loss": 0.0004, "num_input_tokens_seen": 88219184, "step": 40895 }, { "epoch": 7.505964397137089, "grad_norm": 0.263863742351532, "learning_rate": 7.8637482914716e-06, "loss": 0.0008, "num_input_tokens_seen": 88228624, "step": 40900 }, { "epoch": 7.506881996696642, "grad_norm": 0.027381276711821556, "learning_rate": 7.863091850207088e-06, "loss": 0.0004, "num_input_tokens_seen": 88238864, "step": 40905 }, { "epoch": 7.507799596256194, "grad_norm": 18.148035049438477, "learning_rate": 7.862435335508709e-06, "loss": 0.0092, "num_input_tokens_seen": 88250192, "step": 40910 }, { "epoch": 7.508717195815746, "grad_norm": 52.23514938354492, "learning_rate": 7.861778747393299e-06, "loss": 0.0107, "num_input_tokens_seen": 88260464, "step": 40915 }, { "epoch": 7.509634795375298, "grad_norm": 78.64407348632812, "learning_rate": 7.8611220858777e-06, "loss": 0.0433, "num_input_tokens_seen": 88271856, "step": 40920 }, { "epoch": 7.510552394934851, "grad_norm": 0.26906701922416687, "learning_rate": 7.860465350978752e-06, "loss": 0.0211, "num_input_tokens_seen": 88282608, "step": 40925 }, { "epoch": 7.511469994494402, "grad_norm": 206.81263732910156, "learning_rate": 7.859808542713304e-06, "loss": 0.1295, "num_input_tokens_seen": 88294000, "step": 40930 }, { "epoch": 7.512387594053955, "grad_norm": 0.016372766345739365, "learning_rate": 7.859151661098197e-06, "loss": 0.1228, "num_input_tokens_seen": 88303984, "step": 40935 }, { "epoch": 7.513305193613507, "grad_norm": 0.017382385209202766, "learning_rate": 7.858494706150282e-06, "loss": 0.2195, "num_input_tokens_seen": 88314416, "step": 40940 }, { "epoch": 7.514222793173059, "grad_norm": 0.23432494699954987, "learning_rate": 7.857837677886406e-06, "loss": 0.0007, "num_input_tokens_seen": 88325872, "step": 40945 }, { "epoch": 7.5151403927326115, "grad_norm": 16.019210815429688, "learning_rate": 7.857180576323425e-06, "loss": 0.1598, "num_input_tokens_seen": 88335888, "step": 40950 }, { "epoch": 7.516057992292164, "grad_norm": 338.14581298828125, "learning_rate": 7.85652340147819e-06, "loss": 0.2577, "num_input_tokens_seen": 88346480, "step": 40955 }, { "epoch": 7.516975591851716, "grad_norm": 0.0276534091681242, "learning_rate": 7.855866153367557e-06, "loss": 0.0006, "num_input_tokens_seen": 88358224, "step": 40960 }, { "epoch": 7.517893191411268, "grad_norm": 0.09287261962890625, "learning_rate": 7.855208832008383e-06, "loss": 0.0935, "num_input_tokens_seen": 88368336, "step": 40965 }, { "epoch": 7.518810790970821, "grad_norm": 0.1277000457048416, "learning_rate": 7.854551437417528e-06, "loss": 0.0006, "num_input_tokens_seen": 88379984, "step": 40970 }, { "epoch": 7.519728390530372, "grad_norm": 0.24874313175678253, "learning_rate": 7.853893969611852e-06, "loss": 0.0007, "num_input_tokens_seen": 88392208, "step": 40975 }, { "epoch": 7.520645990089925, "grad_norm": 0.4299798607826233, "learning_rate": 7.853236428608219e-06, "loss": 0.1052, "num_input_tokens_seen": 88403184, "step": 40980 }, { "epoch": 7.521563589649477, "grad_norm": 0.1986939162015915, "learning_rate": 7.852578814423497e-06, "loss": 0.077, "num_input_tokens_seen": 88413424, "step": 40985 }, { "epoch": 7.522481189209029, "grad_norm": 0.00900818407535553, "learning_rate": 7.851921127074545e-06, "loss": 0.0012, "num_input_tokens_seen": 88424496, "step": 40990 }, { "epoch": 7.523398788768581, "grad_norm": 0.020872527733445168, "learning_rate": 7.851263366578239e-06, "loss": 0.1195, "num_input_tokens_seen": 88435984, "step": 40995 }, { "epoch": 7.524316388328134, "grad_norm": 0.009554049000144005, "learning_rate": 7.850605532951446e-06, "loss": 0.1163, "num_input_tokens_seen": 88446352, "step": 41000 }, { "epoch": 7.5252339878876855, "grad_norm": 0.44702598452568054, "learning_rate": 7.849947626211037e-06, "loss": 0.0593, "num_input_tokens_seen": 88457040, "step": 41005 }, { "epoch": 7.526151587447238, "grad_norm": 0.028549745678901672, "learning_rate": 7.84928964637389e-06, "loss": 0.0048, "num_input_tokens_seen": 88468784, "step": 41010 }, { "epoch": 7.5270691870067905, "grad_norm": 17.703426361083984, "learning_rate": 7.848631593456881e-06, "loss": 0.1401, "num_input_tokens_seen": 88480240, "step": 41015 }, { "epoch": 7.527986786566342, "grad_norm": 0.013055442832410336, "learning_rate": 7.847973467476885e-06, "loss": 0.0647, "num_input_tokens_seen": 88490800, "step": 41020 }, { "epoch": 7.528904386125895, "grad_norm": 0.09892642498016357, "learning_rate": 7.847315268450783e-06, "loss": 0.1072, "num_input_tokens_seen": 88502448, "step": 41025 }, { "epoch": 7.529821985685447, "grad_norm": 36.949562072753906, "learning_rate": 7.84665699639546e-06, "loss": 0.0288, "num_input_tokens_seen": 88513424, "step": 41030 }, { "epoch": 7.530739585244999, "grad_norm": 0.05235086381435394, "learning_rate": 7.845998651327794e-06, "loss": 0.2046, "num_input_tokens_seen": 88524944, "step": 41035 }, { "epoch": 7.531657184804551, "grad_norm": 69.89192199707031, "learning_rate": 7.845340233264675e-06, "loss": 0.0365, "num_input_tokens_seen": 88534576, "step": 41040 }, { "epoch": 7.532574784364104, "grad_norm": 0.005371961742639542, "learning_rate": 7.844681742222989e-06, "loss": 0.0043, "num_input_tokens_seen": 88546448, "step": 41045 }, { "epoch": 7.533492383923655, "grad_norm": 0.005189219955354929, "learning_rate": 7.844023178219624e-06, "loss": 0.0736, "num_input_tokens_seen": 88556752, "step": 41050 }, { "epoch": 7.534409983483208, "grad_norm": 0.0059317974373698235, "learning_rate": 7.843364541271475e-06, "loss": 0.197, "num_input_tokens_seen": 88566480, "step": 41055 }, { "epoch": 7.53532758304276, "grad_norm": 1.6471542119979858, "learning_rate": 7.842705831395429e-06, "loss": 0.1, "num_input_tokens_seen": 88577104, "step": 41060 }, { "epoch": 7.536245182602312, "grad_norm": 0.255943238735199, "learning_rate": 7.842047048608386e-06, "loss": 0.0059, "num_input_tokens_seen": 88587664, "step": 41065 }, { "epoch": 7.5371627821618645, "grad_norm": 0.05732276663184166, "learning_rate": 7.841388192927239e-06, "loss": 0.0014, "num_input_tokens_seen": 88598256, "step": 41070 }, { "epoch": 7.538080381721417, "grad_norm": 0.02347273752093315, "learning_rate": 7.84072926436889e-06, "loss": 0.0007, "num_input_tokens_seen": 88609040, "step": 41075 }, { "epoch": 7.538997981280969, "grad_norm": 0.09420805424451828, "learning_rate": 7.84007026295024e-06, "loss": 0.0015, "num_input_tokens_seen": 88620656, "step": 41080 }, { "epoch": 7.539915580840521, "grad_norm": 0.47544315457344055, "learning_rate": 7.839411188688187e-06, "loss": 0.0931, "num_input_tokens_seen": 88630224, "step": 41085 }, { "epoch": 7.540833180400074, "grad_norm": 0.03200207278132439, "learning_rate": 7.838752041599637e-06, "loss": 0.0002, "num_input_tokens_seen": 88641584, "step": 41090 }, { "epoch": 7.541750779959625, "grad_norm": 0.009785198606550694, "learning_rate": 7.838092821701499e-06, "loss": 0.0011, "num_input_tokens_seen": 88652880, "step": 41095 }, { "epoch": 7.542668379519178, "grad_norm": 0.011074191890656948, "learning_rate": 7.837433529010679e-06, "loss": 0.2511, "num_input_tokens_seen": 88664272, "step": 41100 }, { "epoch": 7.54358597907873, "grad_norm": 0.1937088519334793, "learning_rate": 7.836774163544084e-06, "loss": 0.2109, "num_input_tokens_seen": 88675888, "step": 41105 }, { "epoch": 7.544503578638282, "grad_norm": 0.1107276901602745, "learning_rate": 7.83611472531863e-06, "loss": 0.0025, "num_input_tokens_seen": 88685328, "step": 41110 }, { "epoch": 7.545421178197834, "grad_norm": 9.2453031539917, "learning_rate": 7.835455214351228e-06, "loss": 0.0827, "num_input_tokens_seen": 88697456, "step": 41115 }, { "epoch": 7.546338777757387, "grad_norm": 0.18678227066993713, "learning_rate": 7.834795630658797e-06, "loss": 0.0019, "num_input_tokens_seen": 88708432, "step": 41120 }, { "epoch": 7.5472563773169385, "grad_norm": 0.08803543448448181, "learning_rate": 7.834135974258249e-06, "loss": 0.3514, "num_input_tokens_seen": 88718640, "step": 41125 }, { "epoch": 7.548173976876491, "grad_norm": 134.165283203125, "learning_rate": 7.833476245166507e-06, "loss": 0.3428, "num_input_tokens_seen": 88729456, "step": 41130 }, { "epoch": 7.549091576436044, "grad_norm": 0.43862184882164, "learning_rate": 7.83281644340049e-06, "loss": 0.2536, "num_input_tokens_seen": 88740496, "step": 41135 }, { "epoch": 7.550009175995595, "grad_norm": 0.030832258984446526, "learning_rate": 7.832156568977122e-06, "loss": 0.0393, "num_input_tokens_seen": 88752144, "step": 41140 }, { "epoch": 7.550926775555148, "grad_norm": 17.486587524414062, "learning_rate": 7.831496621913327e-06, "loss": 0.1141, "num_input_tokens_seen": 88763056, "step": 41145 }, { "epoch": 7.5518443751147, "grad_norm": 14.713330268859863, "learning_rate": 7.830836602226032e-06, "loss": 0.1745, "num_input_tokens_seen": 88772688, "step": 41150 }, { "epoch": 7.552761974674252, "grad_norm": 0.037741102278232574, "learning_rate": 7.830176509932167e-06, "loss": 0.1009, "num_input_tokens_seen": 88782736, "step": 41155 }, { "epoch": 7.553679574233804, "grad_norm": 11.237125396728516, "learning_rate": 7.82951634504866e-06, "loss": 0.2526, "num_input_tokens_seen": 88792816, "step": 41160 }, { "epoch": 7.554597173793357, "grad_norm": 0.044575661420822144, "learning_rate": 7.828856107592443e-06, "loss": 0.0925, "num_input_tokens_seen": 88803248, "step": 41165 }, { "epoch": 7.555514773352908, "grad_norm": 0.8650451898574829, "learning_rate": 7.828195797580454e-06, "loss": 0.0037, "num_input_tokens_seen": 88814032, "step": 41170 }, { "epoch": 7.556432372912461, "grad_norm": 1.5593385696411133, "learning_rate": 7.827535415029624e-06, "loss": 0.0625, "num_input_tokens_seen": 88825104, "step": 41175 }, { "epoch": 7.5573499724720135, "grad_norm": 0.10930457711219788, "learning_rate": 7.826874959956891e-06, "loss": 0.0022, "num_input_tokens_seen": 88836304, "step": 41180 }, { "epoch": 7.558267572031565, "grad_norm": 0.16528429090976715, "learning_rate": 7.8262144323792e-06, "loss": 0.1241, "num_input_tokens_seen": 88847920, "step": 41185 }, { "epoch": 7.559185171591118, "grad_norm": 0.1694716364145279, "learning_rate": 7.825553832313486e-06, "loss": 0.0255, "num_input_tokens_seen": 88858832, "step": 41190 }, { "epoch": 7.56010277115067, "grad_norm": 1.1413183212280273, "learning_rate": 7.824893159776698e-06, "loss": 0.0014, "num_input_tokens_seen": 88869904, "step": 41195 }, { "epoch": 7.561020370710222, "grad_norm": 8.207049369812012, "learning_rate": 7.824232414785778e-06, "loss": 0.3458, "num_input_tokens_seen": 88881264, "step": 41200 }, { "epoch": 7.561937970269774, "grad_norm": 0.027375511825084686, "learning_rate": 7.823571597357675e-06, "loss": 0.139, "num_input_tokens_seen": 88890352, "step": 41205 }, { "epoch": 7.562855569829327, "grad_norm": 0.05001646280288696, "learning_rate": 7.822910707509335e-06, "loss": 0.1006, "num_input_tokens_seen": 88900816, "step": 41210 }, { "epoch": 7.563773169388878, "grad_norm": 0.08264632523059845, "learning_rate": 7.82224974525771e-06, "loss": 0.1078, "num_input_tokens_seen": 88911760, "step": 41215 }, { "epoch": 7.564690768948431, "grad_norm": 0.050599049776792526, "learning_rate": 7.821588710619753e-06, "loss": 0.0026, "num_input_tokens_seen": 88923216, "step": 41220 }, { "epoch": 7.565608368507983, "grad_norm": 15.886138916015625, "learning_rate": 7.820927603612421e-06, "loss": 0.3476, "num_input_tokens_seen": 88934128, "step": 41225 }, { "epoch": 7.566525968067535, "grad_norm": 16.409284591674805, "learning_rate": 7.820266424252665e-06, "loss": 0.522, "num_input_tokens_seen": 88944720, "step": 41230 }, { "epoch": 7.5674435676270875, "grad_norm": 0.09627993404865265, "learning_rate": 7.819605172557448e-06, "loss": 0.0815, "num_input_tokens_seen": 88955824, "step": 41235 }, { "epoch": 7.56836116718664, "grad_norm": 0.07093632221221924, "learning_rate": 7.818943848543729e-06, "loss": 0.0516, "num_input_tokens_seen": 88967248, "step": 41240 }, { "epoch": 7.569278766746192, "grad_norm": 0.12643969058990479, "learning_rate": 7.818282452228466e-06, "loss": 0.0742, "num_input_tokens_seen": 88978032, "step": 41245 }, { "epoch": 7.570196366305744, "grad_norm": 0.37560930848121643, "learning_rate": 7.817620983628629e-06, "loss": 0.003, "num_input_tokens_seen": 88989264, "step": 41250 }, { "epoch": 7.571113965865297, "grad_norm": 8.210404396057129, "learning_rate": 7.81695944276118e-06, "loss": 0.108, "num_input_tokens_seen": 88999792, "step": 41255 }, { "epoch": 7.572031565424848, "grad_norm": 9.240010261535645, "learning_rate": 7.816297829643088e-06, "loss": 0.1235, "num_input_tokens_seen": 89010800, "step": 41260 }, { "epoch": 7.572949164984401, "grad_norm": 0.8725377917289734, "learning_rate": 7.815636144291321e-06, "loss": 0.0057, "num_input_tokens_seen": 89020528, "step": 41265 }, { "epoch": 7.573866764543953, "grad_norm": 0.057577021420001984, "learning_rate": 7.81497438672285e-06, "loss": 0.002, "num_input_tokens_seen": 89030128, "step": 41270 }, { "epoch": 7.574784364103505, "grad_norm": 17.670846939086914, "learning_rate": 7.814312556954648e-06, "loss": 0.1492, "num_input_tokens_seen": 89041168, "step": 41275 }, { "epoch": 7.575701963663057, "grad_norm": 0.04224782437086105, "learning_rate": 7.813650655003693e-06, "loss": 0.0626, "num_input_tokens_seen": 89052016, "step": 41280 }, { "epoch": 7.57661956322261, "grad_norm": 0.04109453782439232, "learning_rate": 7.812988680886959e-06, "loss": 0.0027, "num_input_tokens_seen": 89062992, "step": 41285 }, { "epoch": 7.5775371627821615, "grad_norm": 0.10759668797254562, "learning_rate": 7.812326634621424e-06, "loss": 0.0015, "num_input_tokens_seen": 89073584, "step": 41290 }, { "epoch": 7.578454762341714, "grad_norm": 0.05294743552803993, "learning_rate": 7.811664516224069e-06, "loss": 0.0019, "num_input_tokens_seen": 89084432, "step": 41295 }, { "epoch": 7.5793723619012665, "grad_norm": 0.051575709134340286, "learning_rate": 7.811002325711879e-06, "loss": 0.0006, "num_input_tokens_seen": 89094992, "step": 41300 }, { "epoch": 7.580289961460818, "grad_norm": 0.08162835985422134, "learning_rate": 7.810340063101835e-06, "loss": 0.0804, "num_input_tokens_seen": 89106384, "step": 41305 }, { "epoch": 7.581207561020371, "grad_norm": 0.12674668431282043, "learning_rate": 7.809677728410922e-06, "loss": 0.1793, "num_input_tokens_seen": 89116272, "step": 41310 }, { "epoch": 7.582125160579923, "grad_norm": 0.018733564764261246, "learning_rate": 7.809015321656132e-06, "loss": 0.0009, "num_input_tokens_seen": 89128176, "step": 41315 }, { "epoch": 7.583042760139475, "grad_norm": 0.0190513264387846, "learning_rate": 7.808352842854454e-06, "loss": 0.0012, "num_input_tokens_seen": 89139280, "step": 41320 }, { "epoch": 7.583960359699027, "grad_norm": 0.003557730233296752, "learning_rate": 7.807690292022876e-06, "loss": 0.2266, "num_input_tokens_seen": 89149456, "step": 41325 }, { "epoch": 7.58487795925858, "grad_norm": 0.027409827336668968, "learning_rate": 7.807027669178394e-06, "loss": 0.1926, "num_input_tokens_seen": 89160112, "step": 41330 }, { "epoch": 7.585795558818131, "grad_norm": 0.16653947532176971, "learning_rate": 7.806364974338001e-06, "loss": 0.0295, "num_input_tokens_seen": 89171600, "step": 41335 }, { "epoch": 7.586713158377684, "grad_norm": 0.010962390340864658, "learning_rate": 7.805702207518699e-06, "loss": 0.062, "num_input_tokens_seen": 89181104, "step": 41340 }, { "epoch": 7.587630757937236, "grad_norm": 0.009664425626397133, "learning_rate": 7.805039368737483e-06, "loss": 0.0536, "num_input_tokens_seen": 89192848, "step": 41345 }, { "epoch": 7.588548357496788, "grad_norm": 0.01517315860837698, "learning_rate": 7.804376458011354e-06, "loss": 0.0003, "num_input_tokens_seen": 89203568, "step": 41350 }, { "epoch": 7.5894659570563405, "grad_norm": 0.009459203109145164, "learning_rate": 7.803713475357316e-06, "loss": 0.0003, "num_input_tokens_seen": 89215984, "step": 41355 }, { "epoch": 7.590383556615893, "grad_norm": 0.00471070222556591, "learning_rate": 7.803050420792371e-06, "loss": 0.0023, "num_input_tokens_seen": 89227568, "step": 41360 }, { "epoch": 7.591301156175445, "grad_norm": 0.02893548086285591, "learning_rate": 7.802387294333528e-06, "loss": 0.0003, "num_input_tokens_seen": 89238608, "step": 41365 }, { "epoch": 7.592218755734997, "grad_norm": 0.8235442042350769, "learning_rate": 7.801724095997794e-06, "loss": 0.0023, "num_input_tokens_seen": 89250128, "step": 41370 }, { "epoch": 7.59313635529455, "grad_norm": 0.01352489460259676, "learning_rate": 7.801060825802181e-06, "loss": 0.0027, "num_input_tokens_seen": 89259952, "step": 41375 }, { "epoch": 7.594053954854101, "grad_norm": 15.907255172729492, "learning_rate": 7.800397483763697e-06, "loss": 0.2356, "num_input_tokens_seen": 89271984, "step": 41380 }, { "epoch": 7.594971554413654, "grad_norm": 0.021388348191976547, "learning_rate": 7.799734069899357e-06, "loss": 0.1008, "num_input_tokens_seen": 89283376, "step": 41385 }, { "epoch": 7.595889153973206, "grad_norm": 0.02353539504110813, "learning_rate": 7.799070584226179e-06, "loss": 0.0016, "num_input_tokens_seen": 89294896, "step": 41390 }, { "epoch": 7.596806753532758, "grad_norm": 0.17460620403289795, "learning_rate": 7.798407026761178e-06, "loss": 0.0709, "num_input_tokens_seen": 89306416, "step": 41395 }, { "epoch": 7.59772435309231, "grad_norm": 0.00522883515805006, "learning_rate": 7.797743397521376e-06, "loss": 0.0005, "num_input_tokens_seen": 89316560, "step": 41400 }, { "epoch": 7.598641952651863, "grad_norm": 19.184614181518555, "learning_rate": 7.797079696523788e-06, "loss": 0.1412, "num_input_tokens_seen": 89326928, "step": 41405 }, { "epoch": 7.599559552211415, "grad_norm": 0.03483092784881592, "learning_rate": 7.796415923785443e-06, "loss": 0.1494, "num_input_tokens_seen": 89337808, "step": 41410 }, { "epoch": 7.600477151770967, "grad_norm": 0.04384313151240349, "learning_rate": 7.795752079323364e-06, "loss": 0.0983, "num_input_tokens_seen": 89349488, "step": 41415 }, { "epoch": 7.60139475133052, "grad_norm": 25.89078140258789, "learning_rate": 7.795088163154578e-06, "loss": 0.3039, "num_input_tokens_seen": 89359472, "step": 41420 }, { "epoch": 7.602312350890072, "grad_norm": 0.020007485523819923, "learning_rate": 7.794424175296111e-06, "loss": 0.1202, "num_input_tokens_seen": 89371056, "step": 41425 }, { "epoch": 7.603229950449624, "grad_norm": 9.673906326293945, "learning_rate": 7.793760115764995e-06, "loss": 0.4103, "num_input_tokens_seen": 89381008, "step": 41430 }, { "epoch": 7.604147550009176, "grad_norm": 0.05465342104434967, "learning_rate": 7.793095984578263e-06, "loss": 0.0012, "num_input_tokens_seen": 89390896, "step": 41435 }, { "epoch": 7.605065149568729, "grad_norm": 0.017987627536058426, "learning_rate": 7.792431781752946e-06, "loss": 0.002, "num_input_tokens_seen": 89400912, "step": 41440 }, { "epoch": 7.60598274912828, "grad_norm": 0.07438240945339203, "learning_rate": 7.791767507306083e-06, "loss": 0.0022, "num_input_tokens_seen": 89411056, "step": 41445 }, { "epoch": 7.606900348687833, "grad_norm": 0.14000460505485535, "learning_rate": 7.791103161254711e-06, "loss": 0.0008, "num_input_tokens_seen": 89420688, "step": 41450 }, { "epoch": 7.607817948247385, "grad_norm": 0.028559716418385506, "learning_rate": 7.790438743615867e-06, "loss": 0.0605, "num_input_tokens_seen": 89431824, "step": 41455 }, { "epoch": 7.608735547806937, "grad_norm": 29.610605239868164, "learning_rate": 7.789774254406595e-06, "loss": 0.1227, "num_input_tokens_seen": 89443408, "step": 41460 }, { "epoch": 7.6096531473664895, "grad_norm": 0.201140895485878, "learning_rate": 7.789109693643936e-06, "loss": 0.057, "num_input_tokens_seen": 89453072, "step": 41465 }, { "epoch": 7.610570746926042, "grad_norm": 0.039805132895708084, "learning_rate": 7.788445061344938e-06, "loss": 0.1462, "num_input_tokens_seen": 89464816, "step": 41470 }, { "epoch": 7.611488346485594, "grad_norm": 0.049225710332393646, "learning_rate": 7.787780357526646e-06, "loss": 0.0012, "num_input_tokens_seen": 89474448, "step": 41475 }, { "epoch": 7.612405946045146, "grad_norm": 0.048432834446430206, "learning_rate": 7.787115582206105e-06, "loss": 0.1233, "num_input_tokens_seen": 89485360, "step": 41480 }, { "epoch": 7.613323545604699, "grad_norm": 0.7043830156326294, "learning_rate": 7.786450735400373e-06, "loss": 0.0014, "num_input_tokens_seen": 89494672, "step": 41485 }, { "epoch": 7.61424114516425, "grad_norm": 0.005816558375954628, "learning_rate": 7.785785817126497e-06, "loss": 0.1164, "num_input_tokens_seen": 89506672, "step": 41490 }, { "epoch": 7.615158744723803, "grad_norm": 0.03204939886927605, "learning_rate": 7.785120827401531e-06, "loss": 0.0007, "num_input_tokens_seen": 89517008, "step": 41495 }, { "epoch": 7.616076344283355, "grad_norm": 0.0049581993371248245, "learning_rate": 7.784455766242535e-06, "loss": 0.1573, "num_input_tokens_seen": 89527568, "step": 41500 }, { "epoch": 7.616993943842907, "grad_norm": 0.12161446362733841, "learning_rate": 7.783790633666562e-06, "loss": 0.27, "num_input_tokens_seen": 89538992, "step": 41505 }, { "epoch": 7.617911543402459, "grad_norm": 1.0562994480133057, "learning_rate": 7.783125429690675e-06, "loss": 0.1846, "num_input_tokens_seen": 89550576, "step": 41510 }, { "epoch": 7.618829142962012, "grad_norm": 0.2739510238170624, "learning_rate": 7.782460154331932e-06, "loss": 0.0028, "num_input_tokens_seen": 89561072, "step": 41515 }, { "epoch": 7.6197467425215635, "grad_norm": 0.0254259891808033, "learning_rate": 7.7817948076074e-06, "loss": 0.0013, "num_input_tokens_seen": 89570448, "step": 41520 }, { "epoch": 7.620664342081116, "grad_norm": 0.015165110118687153, "learning_rate": 7.781129389534144e-06, "loss": 0.0463, "num_input_tokens_seen": 89580336, "step": 41525 }, { "epoch": 7.6215819416406685, "grad_norm": 0.04416574537754059, "learning_rate": 7.780463900129228e-06, "loss": 0.0016, "num_input_tokens_seen": 89590960, "step": 41530 }, { "epoch": 7.62249954120022, "grad_norm": 10.116344451904297, "learning_rate": 7.779798339409721e-06, "loss": 0.1731, "num_input_tokens_seen": 89603216, "step": 41535 }, { "epoch": 7.623417140759773, "grad_norm": 31.473228454589844, "learning_rate": 7.779132707392695e-06, "loss": 0.161, "num_input_tokens_seen": 89613904, "step": 41540 }, { "epoch": 7.624334740319325, "grad_norm": 0.4665059447288513, "learning_rate": 7.778467004095225e-06, "loss": 0.2422, "num_input_tokens_seen": 89623568, "step": 41545 }, { "epoch": 7.625252339878877, "grad_norm": 0.18607378005981445, "learning_rate": 7.77780122953438e-06, "loss": 0.1101, "num_input_tokens_seen": 89634000, "step": 41550 }, { "epoch": 7.626169939438429, "grad_norm": 0.05076741427183151, "learning_rate": 7.77713538372724e-06, "loss": 0.2891, "num_input_tokens_seen": 89644688, "step": 41555 }, { "epoch": 7.627087538997982, "grad_norm": 0.25378814339637756, "learning_rate": 7.77646946669088e-06, "loss": 0.0057, "num_input_tokens_seen": 89654064, "step": 41560 }, { "epoch": 7.628005138557533, "grad_norm": 2.7907159328460693, "learning_rate": 7.775803478442384e-06, "loss": 0.148, "num_input_tokens_seen": 89665072, "step": 41565 }, { "epoch": 7.628922738117086, "grad_norm": 0.2809208631515503, "learning_rate": 7.77513741899883e-06, "loss": 0.0028, "num_input_tokens_seen": 89676624, "step": 41570 }, { "epoch": 7.629840337676638, "grad_norm": 0.0946793332695961, "learning_rate": 7.7744712883773e-06, "loss": 0.0013, "num_input_tokens_seen": 89687440, "step": 41575 }, { "epoch": 7.63075793723619, "grad_norm": 0.07464827597141266, "learning_rate": 7.773805086594884e-06, "loss": 0.0008, "num_input_tokens_seen": 89698672, "step": 41580 }, { "epoch": 7.6316755367957425, "grad_norm": 0.015848219394683838, "learning_rate": 7.773138813668666e-06, "loss": 0.0923, "num_input_tokens_seen": 89709008, "step": 41585 }, { "epoch": 7.632593136355295, "grad_norm": 14.69428539276123, "learning_rate": 7.772472469615734e-06, "loss": 0.174, "num_input_tokens_seen": 89718512, "step": 41590 }, { "epoch": 7.633510735914847, "grad_norm": 0.2598860561847687, "learning_rate": 7.771806054453182e-06, "loss": 0.0017, "num_input_tokens_seen": 89727088, "step": 41595 }, { "epoch": 7.634428335474399, "grad_norm": 0.06442347168922424, "learning_rate": 7.771139568198101e-06, "loss": 0.1663, "num_input_tokens_seen": 89737712, "step": 41600 }, { "epoch": 7.635345935033952, "grad_norm": 0.010224839672446251, "learning_rate": 7.770473010867582e-06, "loss": 0.0037, "num_input_tokens_seen": 89748016, "step": 41605 }, { "epoch": 7.636263534593503, "grad_norm": 0.011674136854708195, "learning_rate": 7.769806382478728e-06, "loss": 0.0827, "num_input_tokens_seen": 89758448, "step": 41610 }, { "epoch": 7.637181134153056, "grad_norm": 0.03223615512251854, "learning_rate": 7.76913968304863e-06, "loss": 0.0011, "num_input_tokens_seen": 89770000, "step": 41615 }, { "epoch": 7.638098733712608, "grad_norm": 0.004422641359269619, "learning_rate": 7.768472912594392e-06, "loss": 0.0009, "num_input_tokens_seen": 89780496, "step": 41620 }, { "epoch": 7.63901633327216, "grad_norm": 0.02225484512746334, "learning_rate": 7.767806071133116e-06, "loss": 0.113, "num_input_tokens_seen": 89791632, "step": 41625 }, { "epoch": 7.639933932831712, "grad_norm": 0.015535610727965832, "learning_rate": 7.767139158681901e-06, "loss": 0.113, "num_input_tokens_seen": 89802256, "step": 41630 }, { "epoch": 7.640851532391265, "grad_norm": 22.661800384521484, "learning_rate": 7.766472175257857e-06, "loss": 0.0146, "num_input_tokens_seen": 89814128, "step": 41635 }, { "epoch": 7.6417691319508165, "grad_norm": 0.023608071729540825, "learning_rate": 7.76580512087809e-06, "loss": 0.0004, "num_input_tokens_seen": 89825424, "step": 41640 }, { "epoch": 7.642686731510369, "grad_norm": 0.02584453858435154, "learning_rate": 7.765137995559706e-06, "loss": 0.0003, "num_input_tokens_seen": 89837360, "step": 41645 }, { "epoch": 7.6436043310699215, "grad_norm": 56.71527099609375, "learning_rate": 7.76447079931982e-06, "loss": 0.0149, "num_input_tokens_seen": 89849008, "step": 41650 }, { "epoch": 7.644521930629473, "grad_norm": 0.03240178897976875, "learning_rate": 7.76380353217554e-06, "loss": 0.0035, "num_input_tokens_seen": 89859408, "step": 41655 }, { "epoch": 7.645439530189026, "grad_norm": 0.0034398341085761786, "learning_rate": 7.763136194143987e-06, "loss": 0.0004, "num_input_tokens_seen": 89870416, "step": 41660 }, { "epoch": 7.646357129748578, "grad_norm": 3.3382647037506104, "learning_rate": 7.76246878524227e-06, "loss": 0.0061, "num_input_tokens_seen": 89882224, "step": 41665 }, { "epoch": 7.64727472930813, "grad_norm": 0.09106386452913284, "learning_rate": 7.761801305487511e-06, "loss": 0.0004, "num_input_tokens_seen": 89893456, "step": 41670 }, { "epoch": 7.648192328867682, "grad_norm": 0.005132457707077265, "learning_rate": 7.76113375489683e-06, "loss": 0.0038, "num_input_tokens_seen": 89903632, "step": 41675 }, { "epoch": 7.649109928427235, "grad_norm": 0.01071864552795887, "learning_rate": 7.760466133487346e-06, "loss": 0.0014, "num_input_tokens_seen": 89914480, "step": 41680 }, { "epoch": 7.650027527986786, "grad_norm": 0.021730119362473488, "learning_rate": 7.759798441276184e-06, "loss": 0.0004, "num_input_tokens_seen": 89925008, "step": 41685 }, { "epoch": 7.650945127546339, "grad_norm": 9.29947566986084, "learning_rate": 7.75913067828047e-06, "loss": 0.3005, "num_input_tokens_seen": 89936336, "step": 41690 }, { "epoch": 7.651862727105891, "grad_norm": 0.005015181377530098, "learning_rate": 7.75846284451733e-06, "loss": 0.0004, "num_input_tokens_seen": 89946608, "step": 41695 }, { "epoch": 7.652780326665443, "grad_norm": 0.06241711229085922, "learning_rate": 7.757794940003892e-06, "loss": 0.0005, "num_input_tokens_seen": 89956496, "step": 41700 }, { "epoch": 7.653697926224996, "grad_norm": 0.06830520182847977, "learning_rate": 7.757126964757291e-06, "loss": 0.0015, "num_input_tokens_seen": 89966896, "step": 41705 }, { "epoch": 7.654615525784548, "grad_norm": 0.005416124127805233, "learning_rate": 7.756458918794655e-06, "loss": 0.0006, "num_input_tokens_seen": 89975408, "step": 41710 }, { "epoch": 7.6555331253441, "grad_norm": 0.01651584357023239, "learning_rate": 7.755790802133119e-06, "loss": 0.0181, "num_input_tokens_seen": 89985712, "step": 41715 }, { "epoch": 7.656450724903652, "grad_norm": 0.05771799013018608, "learning_rate": 7.75512261478982e-06, "loss": 0.0355, "num_input_tokens_seen": 89996304, "step": 41720 }, { "epoch": 7.657368324463205, "grad_norm": 12.915581703186035, "learning_rate": 7.754454356781898e-06, "loss": 0.1043, "num_input_tokens_seen": 90006192, "step": 41725 }, { "epoch": 7.658285924022756, "grad_norm": 0.04891335219144821, "learning_rate": 7.753786028126488e-06, "loss": 0.0022, "num_input_tokens_seen": 90017872, "step": 41730 }, { "epoch": 7.659203523582309, "grad_norm": 0.22711046040058136, "learning_rate": 7.753117628840736e-06, "loss": 0.1536, "num_input_tokens_seen": 90027984, "step": 41735 }, { "epoch": 7.660121123141861, "grad_norm": 0.036632418632507324, "learning_rate": 7.752449158941785e-06, "loss": 0.0111, "num_input_tokens_seen": 90038320, "step": 41740 }, { "epoch": 7.661038722701413, "grad_norm": 0.007178879342973232, "learning_rate": 7.751780618446778e-06, "loss": 0.4787, "num_input_tokens_seen": 90048912, "step": 41745 }, { "epoch": 7.6619563222609655, "grad_norm": 0.06992900371551514, "learning_rate": 7.751112007372862e-06, "loss": 0.0015, "num_input_tokens_seen": 90059440, "step": 41750 }, { "epoch": 7.662873921820518, "grad_norm": 0.02810017764568329, "learning_rate": 7.750443325737186e-06, "loss": 0.0917, "num_input_tokens_seen": 90070736, "step": 41755 }, { "epoch": 7.66379152138007, "grad_norm": 0.06694148480892181, "learning_rate": 7.749774573556905e-06, "loss": 0.0008, "num_input_tokens_seen": 90080944, "step": 41760 }, { "epoch": 7.664709120939622, "grad_norm": 2.1576430797576904, "learning_rate": 7.749105750849165e-06, "loss": 0.0021, "num_input_tokens_seen": 90092272, "step": 41765 }, { "epoch": 7.665626720499175, "grad_norm": 0.009565969929099083, "learning_rate": 7.748436857631125e-06, "loss": 0.0051, "num_input_tokens_seen": 90102960, "step": 41770 }, { "epoch": 7.666544320058726, "grad_norm": 0.0157192200422287, "learning_rate": 7.747767893919938e-06, "loss": 0.1012, "num_input_tokens_seen": 90114960, "step": 41775 }, { "epoch": 7.667461919618279, "grad_norm": 0.03208860754966736, "learning_rate": 7.747098859732762e-06, "loss": 0.0007, "num_input_tokens_seen": 90126096, "step": 41780 }, { "epoch": 7.668379519177831, "grad_norm": 0.09802132099866867, "learning_rate": 7.74642975508676e-06, "loss": 0.0592, "num_input_tokens_seen": 90137072, "step": 41785 }, { "epoch": 7.669297118737383, "grad_norm": 1.9869242906570435, "learning_rate": 7.74576057999909e-06, "loss": 0.0007, "num_input_tokens_seen": 90147984, "step": 41790 }, { "epoch": 7.670214718296935, "grad_norm": 0.009752199985086918, "learning_rate": 7.74509133448692e-06, "loss": 0.0005, "num_input_tokens_seen": 90157840, "step": 41795 }, { "epoch": 7.671132317856488, "grad_norm": 0.003969451878219843, "learning_rate": 7.744422018567408e-06, "loss": 0.369, "num_input_tokens_seen": 90168496, "step": 41800 }, { "epoch": 7.6720499174160395, "grad_norm": 1.0720092058181763, "learning_rate": 7.743752632257725e-06, "loss": 0.1571, "num_input_tokens_seen": 90179088, "step": 41805 }, { "epoch": 7.672967516975592, "grad_norm": 0.009652668610215187, "learning_rate": 7.743083175575041e-06, "loss": 0.0003, "num_input_tokens_seen": 90188752, "step": 41810 }, { "epoch": 7.6738851165351445, "grad_norm": 0.03688668832182884, "learning_rate": 7.742413648536524e-06, "loss": 0.179, "num_input_tokens_seen": 90199024, "step": 41815 }, { "epoch": 7.674802716094696, "grad_norm": 0.00479791359975934, "learning_rate": 7.74174405115935e-06, "loss": 0.0864, "num_input_tokens_seen": 90210000, "step": 41820 }, { "epoch": 7.675720315654249, "grad_norm": 0.02089649997651577, "learning_rate": 7.741074383460687e-06, "loss": 0.0006, "num_input_tokens_seen": 90220208, "step": 41825 }, { "epoch": 7.676637915213801, "grad_norm": 0.016850298270583153, "learning_rate": 7.740404645457716e-06, "loss": 0.2478, "num_input_tokens_seen": 90231888, "step": 41830 }, { "epoch": 7.677555514773353, "grad_norm": 0.033731862902641296, "learning_rate": 7.739734837167612e-06, "loss": 0.0006, "num_input_tokens_seen": 90242864, "step": 41835 }, { "epoch": 7.678473114332905, "grad_norm": 0.7216489911079407, "learning_rate": 7.739064958607556e-06, "loss": 0.0986, "num_input_tokens_seen": 90253648, "step": 41840 }, { "epoch": 7.679390713892458, "grad_norm": 0.006722999271005392, "learning_rate": 7.738395009794728e-06, "loss": 0.3129, "num_input_tokens_seen": 90263856, "step": 41845 }, { "epoch": 7.680308313452009, "grad_norm": 24.292858123779297, "learning_rate": 7.737724990746313e-06, "loss": 0.1933, "num_input_tokens_seen": 90274160, "step": 41850 }, { "epoch": 7.681225913011562, "grad_norm": 0.008809464052319527, "learning_rate": 7.737054901479497e-06, "loss": 0.0059, "num_input_tokens_seen": 90286288, "step": 41855 }, { "epoch": 7.682143512571114, "grad_norm": 0.10091877728700638, "learning_rate": 7.736384742011462e-06, "loss": 0.0017, "num_input_tokens_seen": 90297712, "step": 41860 }, { "epoch": 7.683061112130666, "grad_norm": 0.02392512932419777, "learning_rate": 7.7357145123594e-06, "loss": 0.0006, "num_input_tokens_seen": 90309040, "step": 41865 }, { "epoch": 7.6839787116902185, "grad_norm": 0.023311439901590347, "learning_rate": 7.7350442125405e-06, "loss": 0.0017, "num_input_tokens_seen": 90319408, "step": 41870 }, { "epoch": 7.684896311249771, "grad_norm": 186.3410186767578, "learning_rate": 7.734373842571958e-06, "loss": 0.2847, "num_input_tokens_seen": 90330448, "step": 41875 }, { "epoch": 7.685813910809323, "grad_norm": 0.05310845375061035, "learning_rate": 7.733703402470963e-06, "loss": 0.0016, "num_input_tokens_seen": 90340688, "step": 41880 }, { "epoch": 7.686731510368875, "grad_norm": 0.08197770267724991, "learning_rate": 7.733032892254711e-06, "loss": 0.0005, "num_input_tokens_seen": 90350064, "step": 41885 }, { "epoch": 7.687649109928428, "grad_norm": 0.0466156005859375, "learning_rate": 7.732362311940403e-06, "loss": 0.0015, "num_input_tokens_seen": 90360720, "step": 41890 }, { "epoch": 7.688566709487979, "grad_norm": 0.017236361280083656, "learning_rate": 7.731691661545237e-06, "loss": 0.0002, "num_input_tokens_seen": 90371280, "step": 41895 }, { "epoch": 7.689484309047532, "grad_norm": 0.05812960863113403, "learning_rate": 7.731020941086412e-06, "loss": 0.0006, "num_input_tokens_seen": 90381360, "step": 41900 }, { "epoch": 7.690401908607084, "grad_norm": 8.746720314025879, "learning_rate": 7.730350150581134e-06, "loss": 0.1884, "num_input_tokens_seen": 90391824, "step": 41905 }, { "epoch": 7.691319508166636, "grad_norm": 35.24597930908203, "learning_rate": 7.729679290046606e-06, "loss": 0.0859, "num_input_tokens_seen": 90402928, "step": 41910 }, { "epoch": 7.692237107726188, "grad_norm": 0.11628589034080505, "learning_rate": 7.729008359500033e-06, "loss": 0.0737, "num_input_tokens_seen": 90413808, "step": 41915 }, { "epoch": 7.693154707285741, "grad_norm": 0.002355078002437949, "learning_rate": 7.728337358958627e-06, "loss": 0.1069, "num_input_tokens_seen": 90425616, "step": 41920 }, { "epoch": 7.6940723068452925, "grad_norm": 0.008368744514882565, "learning_rate": 7.727666288439595e-06, "loss": 0.0005, "num_input_tokens_seen": 90434128, "step": 41925 }, { "epoch": 7.694989906404845, "grad_norm": 0.02846560999751091, "learning_rate": 7.726995147960153e-06, "loss": 0.1122, "num_input_tokens_seen": 90444784, "step": 41930 }, { "epoch": 7.6959075059643975, "grad_norm": 0.033805154263973236, "learning_rate": 7.726323937537508e-06, "loss": 0.061, "num_input_tokens_seen": 90455952, "step": 41935 }, { "epoch": 7.696825105523949, "grad_norm": 0.8570382595062256, "learning_rate": 7.725652657188883e-06, "loss": 0.002, "num_input_tokens_seen": 90468080, "step": 41940 }, { "epoch": 7.697742705083502, "grad_norm": 0.6579164862632751, "learning_rate": 7.72498130693149e-06, "loss": 0.2743, "num_input_tokens_seen": 90478352, "step": 41945 }, { "epoch": 7.698660304643054, "grad_norm": 0.046253178268671036, "learning_rate": 7.724309886782548e-06, "loss": 0.2319, "num_input_tokens_seen": 90489680, "step": 41950 }, { "epoch": 7.699577904202606, "grad_norm": 0.036069754511117935, "learning_rate": 7.723638396759283e-06, "loss": 0.0124, "num_input_tokens_seen": 90500976, "step": 41955 }, { "epoch": 7.700495503762158, "grad_norm": 0.014430994167923927, "learning_rate": 7.722966836878914e-06, "loss": 0.0008, "num_input_tokens_seen": 90510256, "step": 41960 }, { "epoch": 7.701413103321711, "grad_norm": 0.03304581344127655, "learning_rate": 7.722295207158663e-06, "loss": 0.0021, "num_input_tokens_seen": 90521392, "step": 41965 }, { "epoch": 7.702330702881262, "grad_norm": 0.00814810674637556, "learning_rate": 7.721623507615761e-06, "loss": 0.1349, "num_input_tokens_seen": 90532368, "step": 41970 }, { "epoch": 7.703248302440815, "grad_norm": 0.01382646057754755, "learning_rate": 7.720951738267434e-06, "loss": 0.0015, "num_input_tokens_seen": 90543312, "step": 41975 }, { "epoch": 7.704165902000367, "grad_norm": 0.1015186458826065, "learning_rate": 7.720279899130914e-06, "loss": 0.0014, "num_input_tokens_seen": 90554416, "step": 41980 }, { "epoch": 7.705083501559919, "grad_norm": 0.005476343911141157, "learning_rate": 7.719607990223427e-06, "loss": 0.1754, "num_input_tokens_seen": 90564016, "step": 41985 }, { "epoch": 7.706001101119472, "grad_norm": 38.22763442993164, "learning_rate": 7.718936011562213e-06, "loss": 0.0684, "num_input_tokens_seen": 90574512, "step": 41990 }, { "epoch": 7.706918700679024, "grad_norm": 0.5987325310707092, "learning_rate": 7.718263963164502e-06, "loss": 0.0026, "num_input_tokens_seen": 90585680, "step": 41995 }, { "epoch": 7.707836300238576, "grad_norm": 0.024788279086351395, "learning_rate": 7.717591845047533e-06, "loss": 0.0005, "num_input_tokens_seen": 90596624, "step": 42000 }, { "epoch": 7.708753899798128, "grad_norm": 0.08065635710954666, "learning_rate": 7.716919657228548e-06, "loss": 0.2084, "num_input_tokens_seen": 90606640, "step": 42005 }, { "epoch": 7.709671499357681, "grad_norm": 182.805908203125, "learning_rate": 7.716247399724783e-06, "loss": 0.0143, "num_input_tokens_seen": 90617680, "step": 42010 }, { "epoch": 7.710589098917232, "grad_norm": 0.053535331040620804, "learning_rate": 7.715575072553482e-06, "loss": 0.0004, "num_input_tokens_seen": 90628560, "step": 42015 }, { "epoch": 7.711506698476785, "grad_norm": 0.047367364168167114, "learning_rate": 7.71490267573189e-06, "loss": 0.1636, "num_input_tokens_seen": 90637904, "step": 42020 }, { "epoch": 7.712424298036337, "grad_norm": 0.03812088444828987, "learning_rate": 7.71423020927725e-06, "loss": 0.1266, "num_input_tokens_seen": 90648144, "step": 42025 }, { "epoch": 7.713341897595889, "grad_norm": 0.01861545257270336, "learning_rate": 7.713557673206813e-06, "loss": 0.1305, "num_input_tokens_seen": 90658704, "step": 42030 }, { "epoch": 7.7142594971554415, "grad_norm": 0.01604110561311245, "learning_rate": 7.712885067537827e-06, "loss": 0.1442, "num_input_tokens_seen": 90669456, "step": 42035 }, { "epoch": 7.715177096714994, "grad_norm": 0.01365628931671381, "learning_rate": 7.712212392287546e-06, "loss": 0.001, "num_input_tokens_seen": 90679376, "step": 42040 }, { "epoch": 7.716094696274546, "grad_norm": 0.3336116671562195, "learning_rate": 7.711539647473219e-06, "loss": 0.0007, "num_input_tokens_seen": 90690960, "step": 42045 }, { "epoch": 7.717012295834098, "grad_norm": 0.049585822969675064, "learning_rate": 7.710866833112101e-06, "loss": 0.0005, "num_input_tokens_seen": 90702288, "step": 42050 }, { "epoch": 7.717929895393651, "grad_norm": 0.03348548710346222, "learning_rate": 7.710193949221452e-06, "loss": 0.001, "num_input_tokens_seen": 90713840, "step": 42055 }, { "epoch": 7.718847494953202, "grad_norm": 0.46801987290382385, "learning_rate": 7.709520995818527e-06, "loss": 0.0031, "num_input_tokens_seen": 90724912, "step": 42060 }, { "epoch": 7.719765094512755, "grad_norm": 0.040292903780937195, "learning_rate": 7.70884797292059e-06, "loss": 0.101, "num_input_tokens_seen": 90736240, "step": 42065 }, { "epoch": 7.720682694072307, "grad_norm": 15.12062931060791, "learning_rate": 7.708174880544899e-06, "loss": 0.0926, "num_input_tokens_seen": 90746352, "step": 42070 }, { "epoch": 7.721600293631859, "grad_norm": 0.03259645029902458, "learning_rate": 7.707501718708721e-06, "loss": 0.0031, "num_input_tokens_seen": 90756464, "step": 42075 }, { "epoch": 7.722517893191411, "grad_norm": 0.02360239066183567, "learning_rate": 7.706828487429318e-06, "loss": 0.0771, "num_input_tokens_seen": 90768784, "step": 42080 }, { "epoch": 7.723435492750964, "grad_norm": 0.019921578466892242, "learning_rate": 7.706155186723962e-06, "loss": 0.2775, "num_input_tokens_seen": 90778768, "step": 42085 }, { "epoch": 7.7243530923105155, "grad_norm": 0.03602878376841545, "learning_rate": 7.705481816609918e-06, "loss": 0.0019, "num_input_tokens_seen": 90790128, "step": 42090 }, { "epoch": 7.725270691870068, "grad_norm": 0.004353642463684082, "learning_rate": 7.70480837710446e-06, "loss": 0.0189, "num_input_tokens_seen": 90801040, "step": 42095 }, { "epoch": 7.7261882914296205, "grad_norm": 0.005957040935754776, "learning_rate": 7.704134868224857e-06, "loss": 0.0051, "num_input_tokens_seen": 90811408, "step": 42100 }, { "epoch": 7.727105890989172, "grad_norm": 1.0079079866409302, "learning_rate": 7.703461289988387e-06, "loss": 0.3018, "num_input_tokens_seen": 90822512, "step": 42105 }, { "epoch": 7.728023490548725, "grad_norm": 36.22900390625, "learning_rate": 7.702787642412326e-06, "loss": 0.1451, "num_input_tokens_seen": 90833424, "step": 42110 }, { "epoch": 7.728941090108277, "grad_norm": 0.11874783039093018, "learning_rate": 7.70211392551395e-06, "loss": 0.2131, "num_input_tokens_seen": 90843664, "step": 42115 }, { "epoch": 7.729858689667829, "grad_norm": 0.3050273358821869, "learning_rate": 7.701440139310538e-06, "loss": 0.0012, "num_input_tokens_seen": 90853584, "step": 42120 }, { "epoch": 7.730776289227381, "grad_norm": 0.0392347127199173, "learning_rate": 7.700766283819376e-06, "loss": 0.0181, "num_input_tokens_seen": 90863344, "step": 42125 }, { "epoch": 7.731693888786934, "grad_norm": 4.229085445404053, "learning_rate": 7.700092359057743e-06, "loss": 0.1022, "num_input_tokens_seen": 90873904, "step": 42130 }, { "epoch": 7.732611488346485, "grad_norm": 0.022169174626469612, "learning_rate": 7.699418365042928e-06, "loss": 0.0017, "num_input_tokens_seen": 90884464, "step": 42135 }, { "epoch": 7.733529087906038, "grad_norm": 0.0101614436134696, "learning_rate": 7.698744301792213e-06, "loss": 0.3198, "num_input_tokens_seen": 90895216, "step": 42140 }, { "epoch": 7.73444668746559, "grad_norm": 0.05972062051296234, "learning_rate": 7.69807016932289e-06, "loss": 0.0015, "num_input_tokens_seen": 90907312, "step": 42145 }, { "epoch": 7.735364287025142, "grad_norm": 0.19621697068214417, "learning_rate": 7.697395967652248e-06, "loss": 0.163, "num_input_tokens_seen": 90918768, "step": 42150 }, { "epoch": 7.7362818865846945, "grad_norm": 0.04531009867787361, "learning_rate": 7.696721696797583e-06, "loss": 0.1261, "num_input_tokens_seen": 90929264, "step": 42155 }, { "epoch": 7.737199486144247, "grad_norm": 0.4458160698413849, "learning_rate": 7.696047356776184e-06, "loss": 0.0012, "num_input_tokens_seen": 90940208, "step": 42160 }, { "epoch": 7.738117085703799, "grad_norm": 1.0788391828536987, "learning_rate": 7.69537294760535e-06, "loss": 0.0014, "num_input_tokens_seen": 90951248, "step": 42165 }, { "epoch": 7.739034685263351, "grad_norm": 7.444356918334961, "learning_rate": 7.694698469302373e-06, "loss": 0.133, "num_input_tokens_seen": 90962288, "step": 42170 }, { "epoch": 7.739952284822904, "grad_norm": 16.796405792236328, "learning_rate": 7.694023921884562e-06, "loss": 0.1009, "num_input_tokens_seen": 90973712, "step": 42175 }, { "epoch": 7.740869884382455, "grad_norm": 0.03531863912940025, "learning_rate": 7.693349305369208e-06, "loss": 0.0016, "num_input_tokens_seen": 90983856, "step": 42180 }, { "epoch": 7.741787483942008, "grad_norm": 0.8774412274360657, "learning_rate": 7.692674619773622e-06, "loss": 0.145, "num_input_tokens_seen": 90994544, "step": 42185 }, { "epoch": 7.74270508350156, "grad_norm": 0.06752631813287735, "learning_rate": 7.691999865115106e-06, "loss": 0.0173, "num_input_tokens_seen": 91005168, "step": 42190 }, { "epoch": 7.743622683061112, "grad_norm": 0.031357910484075546, "learning_rate": 7.691325041410962e-06, "loss": 0.0746, "num_input_tokens_seen": 91015856, "step": 42195 }, { "epoch": 7.744540282620664, "grad_norm": 13.293615341186523, "learning_rate": 7.690650148678505e-06, "loss": 0.2987, "num_input_tokens_seen": 91027280, "step": 42200 }, { "epoch": 7.745457882180217, "grad_norm": 0.05884457007050514, "learning_rate": 7.689975186935041e-06, "loss": 0.0012, "num_input_tokens_seen": 91037744, "step": 42205 }, { "epoch": 7.7463754817397685, "grad_norm": 0.23887497186660767, "learning_rate": 7.68930015619788e-06, "loss": 0.0011, "num_input_tokens_seen": 91048176, "step": 42210 }, { "epoch": 7.747293081299321, "grad_norm": 10.159299850463867, "learning_rate": 7.688625056484343e-06, "loss": 0.3026, "num_input_tokens_seen": 91059408, "step": 42215 }, { "epoch": 7.7482106808588735, "grad_norm": 0.03244424983859062, "learning_rate": 7.687949887811736e-06, "loss": 0.0829, "num_input_tokens_seen": 91070096, "step": 42220 }, { "epoch": 7.749128280418425, "grad_norm": 0.11865908652544022, "learning_rate": 7.687274650197383e-06, "loss": 0.0299, "num_input_tokens_seen": 91081904, "step": 42225 }, { "epoch": 7.750045879977978, "grad_norm": 0.1546308994293213, "learning_rate": 7.686599343658598e-06, "loss": 0.1425, "num_input_tokens_seen": 91093744, "step": 42230 }, { "epoch": 7.75096347953753, "grad_norm": 0.015443803742527962, "learning_rate": 7.685923968212704e-06, "loss": 0.0981, "num_input_tokens_seen": 91105520, "step": 42235 }, { "epoch": 7.751881079097082, "grad_norm": 0.04018810763955116, "learning_rate": 7.685248523877025e-06, "loss": 0.0503, "num_input_tokens_seen": 91117296, "step": 42240 }, { "epoch": 7.752798678656634, "grad_norm": 0.058844756335020065, "learning_rate": 7.684573010668884e-06, "loss": 0.1796, "num_input_tokens_seen": 91127632, "step": 42245 }, { "epoch": 7.753716278216187, "grad_norm": 0.10651911050081253, "learning_rate": 7.683897428605603e-06, "loss": 0.0011, "num_input_tokens_seen": 91138736, "step": 42250 }, { "epoch": 7.754633877775738, "grad_norm": 0.3558966815471649, "learning_rate": 7.683221777704512e-06, "loss": 0.0016, "num_input_tokens_seen": 91148720, "step": 42255 }, { "epoch": 7.755551477335291, "grad_norm": 0.11930550634860992, "learning_rate": 7.682546057982943e-06, "loss": 0.0047, "num_input_tokens_seen": 91159440, "step": 42260 }, { "epoch": 7.756469076894843, "grad_norm": 0.09763933718204498, "learning_rate": 7.681870269458226e-06, "loss": 0.0024, "num_input_tokens_seen": 91169648, "step": 42265 }, { "epoch": 7.757386676454395, "grad_norm": 0.044247034937143326, "learning_rate": 7.681194412147691e-06, "loss": 0.0048, "num_input_tokens_seen": 91180816, "step": 42270 }, { "epoch": 7.758304276013948, "grad_norm": 0.034838151186704636, "learning_rate": 7.680518486068677e-06, "loss": 0.0864, "num_input_tokens_seen": 91192432, "step": 42275 }, { "epoch": 7.7592218755735, "grad_norm": 0.41301411390304565, "learning_rate": 7.679842491238517e-06, "loss": 0.0018, "num_input_tokens_seen": 91203856, "step": 42280 }, { "epoch": 7.760139475133052, "grad_norm": 0.38738855719566345, "learning_rate": 7.67916642767455e-06, "loss": 0.1057, "num_input_tokens_seen": 91214704, "step": 42285 }, { "epoch": 7.761057074692604, "grad_norm": 0.09818524867296219, "learning_rate": 7.678490295394116e-06, "loss": 0.0009, "num_input_tokens_seen": 91225936, "step": 42290 }, { "epoch": 7.761974674252157, "grad_norm": 0.06245695427060127, "learning_rate": 7.677814094414557e-06, "loss": 0.002, "num_input_tokens_seen": 91236656, "step": 42295 }, { "epoch": 7.762892273811708, "grad_norm": 0.9997009038925171, "learning_rate": 7.677137824753219e-06, "loss": 0.1292, "num_input_tokens_seen": 91248400, "step": 42300 }, { "epoch": 7.763809873371261, "grad_norm": 0.036881983280181885, "learning_rate": 7.676461486427444e-06, "loss": 0.0023, "num_input_tokens_seen": 91259088, "step": 42305 }, { "epoch": 7.764727472930813, "grad_norm": 0.24863301217556, "learning_rate": 7.67578507945458e-06, "loss": 0.2208, "num_input_tokens_seen": 91271504, "step": 42310 }, { "epoch": 7.765645072490365, "grad_norm": 0.04821319878101349, "learning_rate": 7.675108603851976e-06, "loss": 0.186, "num_input_tokens_seen": 91282640, "step": 42315 }, { "epoch": 7.7665626720499175, "grad_norm": 0.026306426152586937, "learning_rate": 7.67443205963698e-06, "loss": 0.2654, "num_input_tokens_seen": 91292720, "step": 42320 }, { "epoch": 7.76748027160947, "grad_norm": 0.33800455927848816, "learning_rate": 7.673755446826949e-06, "loss": 0.0019, "num_input_tokens_seen": 91303408, "step": 42325 }, { "epoch": 7.768397871169022, "grad_norm": 29.28559112548828, "learning_rate": 7.673078765439235e-06, "loss": 0.005, "num_input_tokens_seen": 91312592, "step": 42330 }, { "epoch": 7.769315470728574, "grad_norm": 65.74406433105469, "learning_rate": 7.672402015491194e-06, "loss": 0.0466, "num_input_tokens_seen": 91325168, "step": 42335 }, { "epoch": 7.770233070288127, "grad_norm": 0.2505784034729004, "learning_rate": 7.67172519700018e-06, "loss": 0.1873, "num_input_tokens_seen": 91336112, "step": 42340 }, { "epoch": 7.771150669847678, "grad_norm": 0.05869584158062935, "learning_rate": 7.671048309983558e-06, "loss": 0.1081, "num_input_tokens_seen": 91346256, "step": 42345 }, { "epoch": 7.772068269407231, "grad_norm": 0.06163399666547775, "learning_rate": 7.670371354458686e-06, "loss": 0.0017, "num_input_tokens_seen": 91356560, "step": 42350 }, { "epoch": 7.772985868966783, "grad_norm": 0.005499068181961775, "learning_rate": 7.669694330442929e-06, "loss": 0.0018, "num_input_tokens_seen": 91367376, "step": 42355 }, { "epoch": 7.773903468526335, "grad_norm": 0.40275731682777405, "learning_rate": 7.669017237953648e-06, "loss": 0.0027, "num_input_tokens_seen": 91377968, "step": 42360 }, { "epoch": 7.774821068085887, "grad_norm": 0.2630036473274231, "learning_rate": 7.668340077008212e-06, "loss": 0.0479, "num_input_tokens_seen": 91388464, "step": 42365 }, { "epoch": 7.77573866764544, "grad_norm": 0.032284848392009735, "learning_rate": 7.667662847623989e-06, "loss": 0.1835, "num_input_tokens_seen": 91399600, "step": 42370 }, { "epoch": 7.7766562672049915, "grad_norm": 0.040907446295022964, "learning_rate": 7.66698554981835e-06, "loss": 0.2025, "num_input_tokens_seen": 91408848, "step": 42375 }, { "epoch": 7.777573866764544, "grad_norm": 7.758481025695801, "learning_rate": 7.666308183608662e-06, "loss": 0.2662, "num_input_tokens_seen": 91418608, "step": 42380 }, { "epoch": 7.7784914663240965, "grad_norm": 0.08589692413806915, "learning_rate": 7.665630749012303e-06, "loss": 0.1291, "num_input_tokens_seen": 91429360, "step": 42385 }, { "epoch": 7.779409065883648, "grad_norm": 0.14579783380031586, "learning_rate": 7.664953246046644e-06, "loss": 0.0616, "num_input_tokens_seen": 91440720, "step": 42390 }, { "epoch": 7.780326665443201, "grad_norm": 0.2144356071949005, "learning_rate": 7.664275674729068e-06, "loss": 0.1576, "num_input_tokens_seen": 91450576, "step": 42395 }, { "epoch": 7.781244265002753, "grad_norm": 15.610208511352539, "learning_rate": 7.663598035076949e-06, "loss": 0.1517, "num_input_tokens_seen": 91461296, "step": 42400 }, { "epoch": 7.782161864562305, "grad_norm": 0.05155835673213005, "learning_rate": 7.662920327107669e-06, "loss": 0.0014, "num_input_tokens_seen": 91472016, "step": 42405 }, { "epoch": 7.783079464121857, "grad_norm": 0.02456563524901867, "learning_rate": 7.66224255083861e-06, "loss": 0.0014, "num_input_tokens_seen": 91483472, "step": 42410 }, { "epoch": 7.78399706368141, "grad_norm": 0.025593621656298637, "learning_rate": 7.661564706287155e-06, "loss": 0.1302, "num_input_tokens_seen": 91494576, "step": 42415 }, { "epoch": 7.784914663240961, "grad_norm": 0.08600803464651108, "learning_rate": 7.66088679347069e-06, "loss": 0.0014, "num_input_tokens_seen": 91505552, "step": 42420 }, { "epoch": 7.785832262800514, "grad_norm": 0.03652937337756157, "learning_rate": 7.660208812406605e-06, "loss": 0.2154, "num_input_tokens_seen": 91516624, "step": 42425 }, { "epoch": 7.786749862360066, "grad_norm": 0.009759826585650444, "learning_rate": 7.659530763112284e-06, "loss": 0.1052, "num_input_tokens_seen": 91527664, "step": 42430 }, { "epoch": 7.787667461919618, "grad_norm": 0.03667585551738739, "learning_rate": 7.658852645605122e-06, "loss": 0.1054, "num_input_tokens_seen": 91538896, "step": 42435 }, { "epoch": 7.7885850614791705, "grad_norm": 0.013005483895540237, "learning_rate": 7.65817445990251e-06, "loss": 0.0013, "num_input_tokens_seen": 91549968, "step": 42440 }, { "epoch": 7.789502661038723, "grad_norm": 0.12549494206905365, "learning_rate": 7.657496206021843e-06, "loss": 0.1264, "num_input_tokens_seen": 91561488, "step": 42445 }, { "epoch": 7.790420260598275, "grad_norm": 0.16054505109786987, "learning_rate": 7.656817883980518e-06, "loss": 0.1915, "num_input_tokens_seen": 91572016, "step": 42450 }, { "epoch": 7.791337860157827, "grad_norm": 26.73975944519043, "learning_rate": 7.656139493795932e-06, "loss": 0.328, "num_input_tokens_seen": 91584048, "step": 42455 }, { "epoch": 7.79225545971738, "grad_norm": 0.1132572591304779, "learning_rate": 7.655461035485483e-06, "loss": 0.0012, "num_input_tokens_seen": 91594608, "step": 42460 }, { "epoch": 7.793173059276931, "grad_norm": 13.709220886230469, "learning_rate": 7.654782509066577e-06, "loss": 0.6029, "num_input_tokens_seen": 91604624, "step": 42465 }, { "epoch": 7.794090658836484, "grad_norm": 25.704286575317383, "learning_rate": 7.654103914556611e-06, "loss": 0.1553, "num_input_tokens_seen": 91614096, "step": 42470 }, { "epoch": 7.795008258396036, "grad_norm": 0.2928048372268677, "learning_rate": 7.653425251972995e-06, "loss": 0.0312, "num_input_tokens_seen": 91624304, "step": 42475 }, { "epoch": 7.795925857955588, "grad_norm": 11.843140602111816, "learning_rate": 7.652746521333132e-06, "loss": 0.127, "num_input_tokens_seen": 91635600, "step": 42480 }, { "epoch": 7.79684345751514, "grad_norm": 0.5609879493713379, "learning_rate": 7.652067722654435e-06, "loss": 0.0151, "num_input_tokens_seen": 91646192, "step": 42485 }, { "epoch": 7.797761057074693, "grad_norm": 0.4298750162124634, "learning_rate": 7.651388855954308e-06, "loss": 0.0252, "num_input_tokens_seen": 91657392, "step": 42490 }, { "epoch": 7.7986786566342445, "grad_norm": 8.922938346862793, "learning_rate": 7.650709921250168e-06, "loss": 0.0058, "num_input_tokens_seen": 91668848, "step": 42495 }, { "epoch": 7.799596256193797, "grad_norm": 0.06259296834468842, "learning_rate": 7.650030918559426e-06, "loss": 0.0911, "num_input_tokens_seen": 91680880, "step": 42500 }, { "epoch": 7.8005138557533495, "grad_norm": 0.061489854007959366, "learning_rate": 7.649351847899498e-06, "loss": 0.005, "num_input_tokens_seen": 91691504, "step": 42505 }, { "epoch": 7.801431455312901, "grad_norm": 24.707199096679688, "learning_rate": 7.648672709287802e-06, "loss": 0.1327, "num_input_tokens_seen": 91702352, "step": 42510 }, { "epoch": 7.802349054872454, "grad_norm": 0.06514665484428406, "learning_rate": 7.647993502741755e-06, "loss": 0.0012, "num_input_tokens_seen": 91712336, "step": 42515 }, { "epoch": 7.803266654432006, "grad_norm": 0.15176086127758026, "learning_rate": 7.64731422827878e-06, "loss": 0.2981, "num_input_tokens_seen": 91724496, "step": 42520 }, { "epoch": 7.804184253991558, "grad_norm": 0.030201105400919914, "learning_rate": 7.646634885916298e-06, "loss": 0.002, "num_input_tokens_seen": 91736048, "step": 42525 }, { "epoch": 7.80510185355111, "grad_norm": 40.44241714477539, "learning_rate": 7.64595547567173e-06, "loss": 0.209, "num_input_tokens_seen": 91746384, "step": 42530 }, { "epoch": 7.806019453110663, "grad_norm": 0.032920025289058685, "learning_rate": 7.645275997562509e-06, "loss": 0.0026, "num_input_tokens_seen": 91756208, "step": 42535 }, { "epoch": 7.806937052670214, "grad_norm": 0.04792527109384537, "learning_rate": 7.644596451606057e-06, "loss": 0.0928, "num_input_tokens_seen": 91766960, "step": 42540 }, { "epoch": 7.807854652229767, "grad_norm": 0.07693357020616531, "learning_rate": 7.643916837819804e-06, "loss": 0.0057, "num_input_tokens_seen": 91778512, "step": 42545 }, { "epoch": 7.808772251789319, "grad_norm": 0.05240185558795929, "learning_rate": 7.643237156221183e-06, "loss": 0.0036, "num_input_tokens_seen": 91790352, "step": 42550 }, { "epoch": 7.809689851348871, "grad_norm": 0.016212506219744682, "learning_rate": 7.642557406827625e-06, "loss": 0.0011, "num_input_tokens_seen": 91800976, "step": 42555 }, { "epoch": 7.810607450908424, "grad_norm": 0.13915544748306274, "learning_rate": 7.641877589656566e-06, "loss": 0.0013, "num_input_tokens_seen": 91810352, "step": 42560 }, { "epoch": 7.811525050467976, "grad_norm": 0.003701132722198963, "learning_rate": 7.64119770472544e-06, "loss": 0.0771, "num_input_tokens_seen": 91821040, "step": 42565 }, { "epoch": 7.812442650027528, "grad_norm": 0.15795552730560303, "learning_rate": 7.640517752051686e-06, "loss": 0.1318, "num_input_tokens_seen": 91833136, "step": 42570 }, { "epoch": 7.81336024958708, "grad_norm": 0.035921938717365265, "learning_rate": 7.639837731652745e-06, "loss": 0.0011, "num_input_tokens_seen": 91843216, "step": 42575 }, { "epoch": 7.814277849146633, "grad_norm": 0.08559644967317581, "learning_rate": 7.639157643546059e-06, "loss": 0.1257, "num_input_tokens_seen": 91853712, "step": 42580 }, { "epoch": 7.815195448706184, "grad_norm": 0.051047202199697495, "learning_rate": 7.638477487749068e-06, "loss": 0.1284, "num_input_tokens_seen": 91864784, "step": 42585 }, { "epoch": 7.816113048265737, "grad_norm": 14.808639526367188, "learning_rate": 7.637797264279218e-06, "loss": 0.2757, "num_input_tokens_seen": 91875856, "step": 42590 }, { "epoch": 7.817030647825289, "grad_norm": 0.5960928201675415, "learning_rate": 7.637116973153958e-06, "loss": 0.1451, "num_input_tokens_seen": 91887536, "step": 42595 }, { "epoch": 7.817948247384841, "grad_norm": 1.4072818756103516, "learning_rate": 7.636436614390734e-06, "loss": 0.0306, "num_input_tokens_seen": 91898096, "step": 42600 }, { "epoch": 7.8188658469443935, "grad_norm": 0.009830586612224579, "learning_rate": 7.635756188006998e-06, "loss": 0.0413, "num_input_tokens_seen": 91909232, "step": 42605 }, { "epoch": 7.819783446503946, "grad_norm": 0.008601970970630646, "learning_rate": 7.6350756940202e-06, "loss": 0.0007, "num_input_tokens_seen": 91920880, "step": 42610 }, { "epoch": 7.820701046063498, "grad_norm": 12.96181583404541, "learning_rate": 7.634395132447793e-06, "loss": 0.1529, "num_input_tokens_seen": 91930576, "step": 42615 }, { "epoch": 7.82161864562305, "grad_norm": 1.2902849912643433, "learning_rate": 7.633714503307236e-06, "loss": 0.1047, "num_input_tokens_seen": 91941200, "step": 42620 }, { "epoch": 7.822536245182603, "grad_norm": 0.0501570887863636, "learning_rate": 7.633033806615982e-06, "loss": 0.0006, "num_input_tokens_seen": 91952272, "step": 42625 }, { "epoch": 7.823453844742154, "grad_norm": 0.012043074704706669, "learning_rate": 7.632353042391493e-06, "loss": 0.0012, "num_input_tokens_seen": 91962512, "step": 42630 }, { "epoch": 7.824371444301707, "grad_norm": 86.77227783203125, "learning_rate": 7.63167221065123e-06, "loss": 0.1952, "num_input_tokens_seen": 91973552, "step": 42635 }, { "epoch": 7.825289043861259, "grad_norm": 28.77225112915039, "learning_rate": 7.63099131141265e-06, "loss": 0.1074, "num_input_tokens_seen": 91983792, "step": 42640 }, { "epoch": 7.826206643420811, "grad_norm": 0.018046066164970398, "learning_rate": 7.630310344693222e-06, "loss": 0.0919, "num_input_tokens_seen": 91994864, "step": 42645 }, { "epoch": 7.827124242980363, "grad_norm": 0.03205544501543045, "learning_rate": 7.62962931051041e-06, "loss": 0.0019, "num_input_tokens_seen": 92004464, "step": 42650 }, { "epoch": 7.828041842539916, "grad_norm": 0.016879355534911156, "learning_rate": 7.628948208881683e-06, "loss": 0.0006, "num_input_tokens_seen": 92015312, "step": 42655 }, { "epoch": 7.8289594420994675, "grad_norm": 0.003666413715109229, "learning_rate": 7.628267039824508e-06, "loss": 0.0004, "num_input_tokens_seen": 92024656, "step": 42660 }, { "epoch": 7.82987704165902, "grad_norm": 0.05113060772418976, "learning_rate": 7.627585803356355e-06, "loss": 0.0006, "num_input_tokens_seen": 92036144, "step": 42665 }, { "epoch": 7.8307946412185725, "grad_norm": 0.18479084968566895, "learning_rate": 7.626904499494702e-06, "loss": 0.1444, "num_input_tokens_seen": 92047536, "step": 42670 }, { "epoch": 7.831712240778124, "grad_norm": 0.008114187978208065, "learning_rate": 7.626223128257018e-06, "loss": 0.1796, "num_input_tokens_seen": 92057808, "step": 42675 }, { "epoch": 7.832629840337677, "grad_norm": 0.04923702031373978, "learning_rate": 7.6255416896607814e-06, "loss": 0.1616, "num_input_tokens_seen": 92068112, "step": 42680 }, { "epoch": 7.833547439897229, "grad_norm": 0.03976472467184067, "learning_rate": 7.62486018372347e-06, "loss": 0.192, "num_input_tokens_seen": 92079568, "step": 42685 }, { "epoch": 7.834465039456781, "grad_norm": 0.015593255870044231, "learning_rate": 7.624178610462563e-06, "loss": 0.064, "num_input_tokens_seen": 92092112, "step": 42690 }, { "epoch": 7.835382639016333, "grad_norm": 0.13590604066848755, "learning_rate": 7.623496969895541e-06, "loss": 0.2192, "num_input_tokens_seen": 92103536, "step": 42695 }, { "epoch": 7.836300238575886, "grad_norm": 0.015021732077002525, "learning_rate": 7.622815262039889e-06, "loss": 0.1348, "num_input_tokens_seen": 92114480, "step": 42700 }, { "epoch": 7.837217838135437, "grad_norm": 0.0693858414888382, "learning_rate": 7.622133486913089e-06, "loss": 0.0004, "num_input_tokens_seen": 92126576, "step": 42705 }, { "epoch": 7.83813543769499, "grad_norm": 0.8989271521568298, "learning_rate": 7.621451644532629e-06, "loss": 0.0087, "num_input_tokens_seen": 92138416, "step": 42710 }, { "epoch": 7.839053037254542, "grad_norm": 0.03442716971039772, "learning_rate": 7.620769734915998e-06, "loss": 0.0007, "num_input_tokens_seen": 92148976, "step": 42715 }, { "epoch": 7.839970636814094, "grad_norm": 0.023506920784711838, "learning_rate": 7.620087758080685e-06, "loss": 0.0009, "num_input_tokens_seen": 92160272, "step": 42720 }, { "epoch": 7.8408882363736465, "grad_norm": 10.976034164428711, "learning_rate": 7.6194057140441825e-06, "loss": 0.1803, "num_input_tokens_seen": 92171056, "step": 42725 }, { "epoch": 7.841805835933199, "grad_norm": 35.42766571044922, "learning_rate": 7.618723602823983e-06, "loss": 0.1225, "num_input_tokens_seen": 92182512, "step": 42730 }, { "epoch": 7.842723435492751, "grad_norm": 0.04020318388938904, "learning_rate": 7.618041424437581e-06, "loss": 0.0006, "num_input_tokens_seen": 92193808, "step": 42735 }, { "epoch": 7.843641035052303, "grad_norm": 0.03150472417473793, "learning_rate": 7.617359178902475e-06, "loss": 0.0013, "num_input_tokens_seen": 92206000, "step": 42740 }, { "epoch": 7.844558634611856, "grad_norm": 0.0030792385805398226, "learning_rate": 7.616676866236161e-06, "loss": 0.0594, "num_input_tokens_seen": 92215408, "step": 42745 }, { "epoch": 7.845476234171407, "grad_norm": 0.7335962057113647, "learning_rate": 7.615994486456142e-06, "loss": 0.0966, "num_input_tokens_seen": 92225392, "step": 42750 }, { "epoch": 7.84639383373096, "grad_norm": 2.951352596282959, "learning_rate": 7.6153120395799185e-06, "loss": 0.0025, "num_input_tokens_seen": 92236336, "step": 42755 }, { "epoch": 7.847311433290512, "grad_norm": 116.02981567382812, "learning_rate": 7.6146295256249944e-06, "loss": 0.044, "num_input_tokens_seen": 92247920, "step": 42760 }, { "epoch": 7.848229032850064, "grad_norm": 0.14639009535312653, "learning_rate": 7.613946944608875e-06, "loss": 0.0795, "num_input_tokens_seen": 92259280, "step": 42765 }, { "epoch": 7.849146632409616, "grad_norm": 0.25066956877708435, "learning_rate": 7.613264296549068e-06, "loss": 0.0186, "num_input_tokens_seen": 92270160, "step": 42770 }, { "epoch": 7.850064231969169, "grad_norm": 24.793502807617188, "learning_rate": 7.612581581463082e-06, "loss": 0.1697, "num_input_tokens_seen": 92281360, "step": 42775 }, { "epoch": 7.8509818315287205, "grad_norm": 0.029870331287384033, "learning_rate": 7.611898799368429e-06, "loss": 0.2569, "num_input_tokens_seen": 92292592, "step": 42780 }, { "epoch": 7.851899431088273, "grad_norm": 0.002855180762708187, "learning_rate": 7.611215950282619e-06, "loss": 0.0005, "num_input_tokens_seen": 92303856, "step": 42785 }, { "epoch": 7.8528170306478255, "grad_norm": 0.029325546696782112, "learning_rate": 7.6105330342231665e-06, "loss": 0.0003, "num_input_tokens_seen": 92314224, "step": 42790 }, { "epoch": 7.853734630207377, "grad_norm": 0.046931419521570206, "learning_rate": 7.609850051207588e-06, "loss": 0.0017, "num_input_tokens_seen": 92324624, "step": 42795 }, { "epoch": 7.85465222976693, "grad_norm": 0.009285204112529755, "learning_rate": 7.609167001253399e-06, "loss": 0.0009, "num_input_tokens_seen": 92334800, "step": 42800 }, { "epoch": 7.855569829326482, "grad_norm": 0.012683778069913387, "learning_rate": 7.608483884378123e-06, "loss": 0.1679, "num_input_tokens_seen": 92344848, "step": 42805 }, { "epoch": 7.856487428886034, "grad_norm": 0.045364558696746826, "learning_rate": 7.607800700599276e-06, "loss": 0.1419, "num_input_tokens_seen": 92356784, "step": 42810 }, { "epoch": 7.857405028445586, "grad_norm": 12.073526382446289, "learning_rate": 7.607117449934384e-06, "loss": 0.3899, "num_input_tokens_seen": 92367472, "step": 42815 }, { "epoch": 7.858322628005139, "grad_norm": 0.5814258456230164, "learning_rate": 7.606434132400968e-06, "loss": 0.0012, "num_input_tokens_seen": 92378192, "step": 42820 }, { "epoch": 7.85924022756469, "grad_norm": 0.1945105791091919, "learning_rate": 7.605750748016558e-06, "loss": 0.0018, "num_input_tokens_seen": 92389008, "step": 42825 }, { "epoch": 7.860157827124243, "grad_norm": 0.13972094655036926, "learning_rate": 7.60506729679868e-06, "loss": 0.0075, "num_input_tokens_seen": 92399888, "step": 42830 }, { "epoch": 7.861075426683795, "grad_norm": 0.0161474347114563, "learning_rate": 7.604383778764863e-06, "loss": 0.1131, "num_input_tokens_seen": 92412240, "step": 42835 }, { "epoch": 7.861993026243347, "grad_norm": 0.034643713384866714, "learning_rate": 7.603700193932638e-06, "loss": 0.0516, "num_input_tokens_seen": 92422256, "step": 42840 }, { "epoch": 7.8629106258029, "grad_norm": 16.152315139770508, "learning_rate": 7.603016542319539e-06, "loss": 0.1352, "num_input_tokens_seen": 92432688, "step": 42845 }, { "epoch": 7.863828225362452, "grad_norm": 11.275456428527832, "learning_rate": 7.602332823943099e-06, "loss": 0.0984, "num_input_tokens_seen": 92443920, "step": 42850 }, { "epoch": 7.864745824922004, "grad_norm": 16.12015151977539, "learning_rate": 7.601649038820857e-06, "loss": 0.0384, "num_input_tokens_seen": 92454928, "step": 42855 }, { "epoch": 7.865663424481556, "grad_norm": 0.02925584465265274, "learning_rate": 7.6009651869703485e-06, "loss": 0.0017, "num_input_tokens_seen": 92466096, "step": 42860 }, { "epoch": 7.866581024041109, "grad_norm": 0.07048110663890839, "learning_rate": 7.600281268409113e-06, "loss": 0.0012, "num_input_tokens_seen": 92476656, "step": 42865 }, { "epoch": 7.86749862360066, "grad_norm": 0.009179706685245037, "learning_rate": 7.599597283154694e-06, "loss": 0.0014, "num_input_tokens_seen": 92487856, "step": 42870 }, { "epoch": 7.868416223160213, "grad_norm": 0.03930594027042389, "learning_rate": 7.598913231224634e-06, "loss": 0.0011, "num_input_tokens_seen": 92498064, "step": 42875 }, { "epoch": 7.869333822719765, "grad_norm": 0.07051383703947067, "learning_rate": 7.598229112636477e-06, "loss": 0.0865, "num_input_tokens_seen": 92508848, "step": 42880 }, { "epoch": 7.870251422279317, "grad_norm": 0.2776312232017517, "learning_rate": 7.59754492740777e-06, "loss": 0.0007, "num_input_tokens_seen": 92519984, "step": 42885 }, { "epoch": 7.8711690218388695, "grad_norm": 0.31524422764778137, "learning_rate": 7.5968606755560625e-06, "loss": 0.0004, "num_input_tokens_seen": 92529584, "step": 42890 }, { "epoch": 7.872086621398422, "grad_norm": 0.009164352901279926, "learning_rate": 7.596176357098904e-06, "loss": 0.0003, "num_input_tokens_seen": 92540240, "step": 42895 }, { "epoch": 7.873004220957974, "grad_norm": 0.01850830763578415, "learning_rate": 7.595491972053843e-06, "loss": 0.0006, "num_input_tokens_seen": 92551440, "step": 42900 }, { "epoch": 7.873921820517526, "grad_norm": 0.027456630021333694, "learning_rate": 7.5948075204384385e-06, "loss": 0.0007, "num_input_tokens_seen": 92561104, "step": 42905 }, { "epoch": 7.874839420077079, "grad_norm": 10.493249893188477, "learning_rate": 7.594123002270239e-06, "loss": 0.3008, "num_input_tokens_seen": 92572176, "step": 42910 }, { "epoch": 7.87575701963663, "grad_norm": 0.003724792506545782, "learning_rate": 7.59343841756681e-06, "loss": 0.0004, "num_input_tokens_seen": 92582576, "step": 42915 }, { "epoch": 7.876674619196183, "grad_norm": 0.0076820338144898415, "learning_rate": 7.592753766345701e-06, "loss": 0.0013, "num_input_tokens_seen": 92594160, "step": 42920 }, { "epoch": 7.877592218755735, "grad_norm": 0.00331143313087523, "learning_rate": 7.592069048624478e-06, "loss": 0.0017, "num_input_tokens_seen": 92605008, "step": 42925 }, { "epoch": 7.878509818315287, "grad_norm": 0.00778093421831727, "learning_rate": 7.5913842644207005e-06, "loss": 0.1413, "num_input_tokens_seen": 92615408, "step": 42930 }, { "epoch": 7.879427417874839, "grad_norm": 0.3885057270526886, "learning_rate": 7.590699413751932e-06, "loss": 0.1637, "num_input_tokens_seen": 92627824, "step": 42935 }, { "epoch": 7.880345017434392, "grad_norm": 0.019096365198493004, "learning_rate": 7.59001449663574e-06, "loss": 0.1288, "num_input_tokens_seen": 92638384, "step": 42940 }, { "epoch": 7.8812626169939435, "grad_norm": 0.02334997057914734, "learning_rate": 7.589329513089692e-06, "loss": 0.1531, "num_input_tokens_seen": 92648080, "step": 42945 }, { "epoch": 7.882180216553496, "grad_norm": 12.71985149383545, "learning_rate": 7.5886444631313525e-06, "loss": 0.161, "num_input_tokens_seen": 92659472, "step": 42950 }, { "epoch": 7.8830978161130485, "grad_norm": 2.089320182800293, "learning_rate": 7.587959346778295e-06, "loss": 0.0991, "num_input_tokens_seen": 92670000, "step": 42955 }, { "epoch": 7.8840154156726, "grad_norm": 0.07321078330278397, "learning_rate": 7.587274164048092e-06, "loss": 0.1332, "num_input_tokens_seen": 92680624, "step": 42960 }, { "epoch": 7.884933015232153, "grad_norm": 0.4598350524902344, "learning_rate": 7.5865889149583176e-06, "loss": 0.0021, "num_input_tokens_seen": 92691472, "step": 42965 }, { "epoch": 7.885850614791705, "grad_norm": 0.01672985404729843, "learning_rate": 7.5859035995265425e-06, "loss": 0.0009, "num_input_tokens_seen": 92702800, "step": 42970 }, { "epoch": 7.886768214351257, "grad_norm": 13.680793762207031, "learning_rate": 7.585218217770351e-06, "loss": 0.0108, "num_input_tokens_seen": 92713264, "step": 42975 }, { "epoch": 7.887685813910809, "grad_norm": 0.02660747803747654, "learning_rate": 7.584532769707319e-06, "loss": 0.2522, "num_input_tokens_seen": 92723856, "step": 42980 }, { "epoch": 7.888603413470362, "grad_norm": 0.011171678081154823, "learning_rate": 7.5838472553550255e-06, "loss": 0.0484, "num_input_tokens_seen": 92734928, "step": 42985 }, { "epoch": 7.889521013029913, "grad_norm": 0.3690696656703949, "learning_rate": 7.5831616747310565e-06, "loss": 0.1638, "num_input_tokens_seen": 92745360, "step": 42990 }, { "epoch": 7.890438612589466, "grad_norm": 0.020433397963643074, "learning_rate": 7.582476027852992e-06, "loss": 0.0006, "num_input_tokens_seen": 92755984, "step": 42995 }, { "epoch": 7.891356212149018, "grad_norm": 0.014242901466786861, "learning_rate": 7.581790314738422e-06, "loss": 0.3832, "num_input_tokens_seen": 92766608, "step": 43000 }, { "epoch": 7.89227381170857, "grad_norm": 0.0151347815990448, "learning_rate": 7.58110453540493e-06, "loss": 0.001, "num_input_tokens_seen": 92777680, "step": 43005 }, { "epoch": 7.8931914112681225, "grad_norm": 13.725008964538574, "learning_rate": 7.5804186898701085e-06, "loss": 0.2049, "num_input_tokens_seen": 92788400, "step": 43010 }, { "epoch": 7.894109010827675, "grad_norm": 0.042348023504018784, "learning_rate": 7.5797327781515475e-06, "loss": 0.0021, "num_input_tokens_seen": 92800496, "step": 43015 }, { "epoch": 7.895026610387227, "grad_norm": 0.04067913442850113, "learning_rate": 7.579046800266836e-06, "loss": 0.1236, "num_input_tokens_seen": 92810992, "step": 43020 }, { "epoch": 7.895944209946779, "grad_norm": 12.379494667053223, "learning_rate": 7.578360756233574e-06, "loss": 0.0135, "num_input_tokens_seen": 92821232, "step": 43025 }, { "epoch": 7.896861809506332, "grad_norm": 0.0826832726597786, "learning_rate": 7.5776746460693525e-06, "loss": 0.0055, "num_input_tokens_seen": 92832112, "step": 43030 }, { "epoch": 7.897779409065883, "grad_norm": 7.589157581329346, "learning_rate": 7.5769884697917726e-06, "loss": 0.2762, "num_input_tokens_seen": 92843376, "step": 43035 }, { "epoch": 7.898697008625436, "grad_norm": 0.08676664531230927, "learning_rate": 7.576302227418433e-06, "loss": 0.1329, "num_input_tokens_seen": 92854224, "step": 43040 }, { "epoch": 7.899614608184988, "grad_norm": 10.666961669921875, "learning_rate": 7.5756159189669325e-06, "loss": 0.1906, "num_input_tokens_seen": 92865008, "step": 43045 }, { "epoch": 7.90053220774454, "grad_norm": 0.01361007709056139, "learning_rate": 7.574929544454877e-06, "loss": 0.0033, "num_input_tokens_seen": 92875504, "step": 43050 }, { "epoch": 7.901449807304092, "grad_norm": 0.03012111410498619, "learning_rate": 7.574243103899869e-06, "loss": 0.1175, "num_input_tokens_seen": 92886384, "step": 43055 }, { "epoch": 7.902367406863645, "grad_norm": 0.042058903723955154, "learning_rate": 7.573556597319516e-06, "loss": 0.0013, "num_input_tokens_seen": 92896496, "step": 43060 }, { "epoch": 7.9032850064231965, "grad_norm": 0.15887026488780975, "learning_rate": 7.572870024731423e-06, "loss": 0.004, "num_input_tokens_seen": 92906384, "step": 43065 }, { "epoch": 7.904202605982749, "grad_norm": 33.466270446777344, "learning_rate": 7.572183386153203e-06, "loss": 0.0103, "num_input_tokens_seen": 92917040, "step": 43070 }, { "epoch": 7.9051202055423015, "grad_norm": 0.09540928155183792, "learning_rate": 7.571496681602464e-06, "loss": 0.004, "num_input_tokens_seen": 92928240, "step": 43075 }, { "epoch": 7.906037805101853, "grad_norm": 0.1852542906999588, "learning_rate": 7.5708099110968214e-06, "loss": 0.0011, "num_input_tokens_seen": 92938832, "step": 43080 }, { "epoch": 7.906955404661406, "grad_norm": 59.05984115600586, "learning_rate": 7.57012307465389e-06, "loss": 0.1437, "num_input_tokens_seen": 92948592, "step": 43085 }, { "epoch": 7.907873004220958, "grad_norm": 0.05925274267792702, "learning_rate": 7.569436172291284e-06, "loss": 0.2349, "num_input_tokens_seen": 92959216, "step": 43090 }, { "epoch": 7.90879060378051, "grad_norm": 0.1227649673819542, "learning_rate": 7.568749204026622e-06, "loss": 0.0005, "num_input_tokens_seen": 92969488, "step": 43095 }, { "epoch": 7.909708203340062, "grad_norm": 0.004704182967543602, "learning_rate": 7.568062169877526e-06, "loss": 0.0004, "num_input_tokens_seen": 92980080, "step": 43100 }, { "epoch": 7.910625802899615, "grad_norm": 28.88698959350586, "learning_rate": 7.567375069861614e-06, "loss": 0.1139, "num_input_tokens_seen": 92989808, "step": 43105 }, { "epoch": 7.911543402459166, "grad_norm": 0.017082681879401207, "learning_rate": 7.56668790399651e-06, "loss": 0.2571, "num_input_tokens_seen": 93001232, "step": 43110 }, { "epoch": 7.912461002018719, "grad_norm": 18.641315460205078, "learning_rate": 7.56600067229984e-06, "loss": 0.2169, "num_input_tokens_seen": 93012528, "step": 43115 }, { "epoch": 7.913378601578271, "grad_norm": 0.009359247051179409, "learning_rate": 7.56531337478923e-06, "loss": 0.2479, "num_input_tokens_seen": 93021424, "step": 43120 }, { "epoch": 7.914296201137823, "grad_norm": 0.05972280725836754, "learning_rate": 7.564626011482308e-06, "loss": 0.0009, "num_input_tokens_seen": 93032432, "step": 43125 }, { "epoch": 7.915213800697376, "grad_norm": 0.03837865591049194, "learning_rate": 7.563938582396704e-06, "loss": 0.0744, "num_input_tokens_seen": 93042608, "step": 43130 }, { "epoch": 7.916131400256928, "grad_norm": 92.03355407714844, "learning_rate": 7.563251087550047e-06, "loss": 0.2498, "num_input_tokens_seen": 93053808, "step": 43135 }, { "epoch": 7.91704899981648, "grad_norm": 0.06963624060153961, "learning_rate": 7.562563526959974e-06, "loss": 0.0012, "num_input_tokens_seen": 93064528, "step": 43140 }, { "epoch": 7.917966599376032, "grad_norm": 0.11412825435400009, "learning_rate": 7.5618759006441175e-06, "loss": 0.0011, "num_input_tokens_seen": 93075504, "step": 43145 }, { "epoch": 7.918884198935585, "grad_norm": 0.0370173342525959, "learning_rate": 7.561188208620116e-06, "loss": 0.0036, "num_input_tokens_seen": 93086608, "step": 43150 }, { "epoch": 7.919801798495136, "grad_norm": 0.01709926128387451, "learning_rate": 7.560500450905605e-06, "loss": 0.0742, "num_input_tokens_seen": 93096816, "step": 43155 }, { "epoch": 7.920719398054689, "grad_norm": 0.1526079773902893, "learning_rate": 7.559812627518226e-06, "loss": 0.002, "num_input_tokens_seen": 93107248, "step": 43160 }, { "epoch": 7.921636997614241, "grad_norm": 0.9603552222251892, "learning_rate": 7.559124738475621e-06, "loss": 0.482, "num_input_tokens_seen": 93118448, "step": 43165 }, { "epoch": 7.922554597173793, "grad_norm": 28.175777435302734, "learning_rate": 7.558436783795432e-06, "loss": 0.1489, "num_input_tokens_seen": 93128752, "step": 43170 }, { "epoch": 7.9234721967333455, "grad_norm": 0.0826973095536232, "learning_rate": 7.557748763495305e-06, "loss": 0.1322, "num_input_tokens_seen": 93139696, "step": 43175 }, { "epoch": 7.924389796292898, "grad_norm": 0.07331506907939911, "learning_rate": 7.557060677592887e-06, "loss": 0.1229, "num_input_tokens_seen": 93149904, "step": 43180 }, { "epoch": 7.92530739585245, "grad_norm": 0.04526376351714134, "learning_rate": 7.556372526105825e-06, "loss": 0.2567, "num_input_tokens_seen": 93160400, "step": 43185 }, { "epoch": 7.926224995412002, "grad_norm": 31.558744430541992, "learning_rate": 7.55568430905177e-06, "loss": 0.163, "num_input_tokens_seen": 93170864, "step": 43190 }, { "epoch": 7.927142594971555, "grad_norm": 0.07242774218320847, "learning_rate": 7.554996026448374e-06, "loss": 0.1323, "num_input_tokens_seen": 93181168, "step": 43195 }, { "epoch": 7.928060194531106, "grad_norm": 0.06499405205249786, "learning_rate": 7.554307678313289e-06, "loss": 0.0871, "num_input_tokens_seen": 93191856, "step": 43200 }, { "epoch": 7.928977794090659, "grad_norm": 0.04414902627468109, "learning_rate": 7.553619264664169e-06, "loss": 0.156, "num_input_tokens_seen": 93202352, "step": 43205 }, { "epoch": 7.929895393650211, "grad_norm": 0.20439524948596954, "learning_rate": 7.552930785518676e-06, "loss": 0.0226, "num_input_tokens_seen": 93212592, "step": 43210 }, { "epoch": 7.930812993209763, "grad_norm": 0.020197467878460884, "learning_rate": 7.552242240894465e-06, "loss": 0.0495, "num_input_tokens_seen": 93224400, "step": 43215 }, { "epoch": 7.931730592769315, "grad_norm": 0.5314685702323914, "learning_rate": 7.551553630809194e-06, "loss": 0.0992, "num_input_tokens_seen": 93235056, "step": 43220 }, { "epoch": 7.932648192328868, "grad_norm": 0.03297233581542969, "learning_rate": 7.550864955280528e-06, "loss": 0.082, "num_input_tokens_seen": 93246768, "step": 43225 }, { "epoch": 7.9335657918884195, "grad_norm": 1.4466444253921509, "learning_rate": 7.5501762143261285e-06, "loss": 0.0945, "num_input_tokens_seen": 93257232, "step": 43230 }, { "epoch": 7.934483391447972, "grad_norm": 0.02100631408393383, "learning_rate": 7.549487407963663e-06, "loss": 0.2037, "num_input_tokens_seen": 93267824, "step": 43235 }, { "epoch": 7.9354009910075245, "grad_norm": 6.68050479888916, "learning_rate": 7.548798536210795e-06, "loss": 0.2355, "num_input_tokens_seen": 93278928, "step": 43240 }, { "epoch": 7.936318590567076, "grad_norm": 0.15492083132266998, "learning_rate": 7.5481095990851975e-06, "loss": 0.0012, "num_input_tokens_seen": 93289008, "step": 43245 }, { "epoch": 7.937236190126629, "grad_norm": 0.25170716643333435, "learning_rate": 7.5474205966045356e-06, "loss": 0.0603, "num_input_tokens_seen": 93300144, "step": 43250 }, { "epoch": 7.938153789686181, "grad_norm": 0.20753994584083557, "learning_rate": 7.546731528786484e-06, "loss": 0.002, "num_input_tokens_seen": 93310416, "step": 43255 }, { "epoch": 7.939071389245733, "grad_norm": 0.5368321537971497, "learning_rate": 7.546042395648716e-06, "loss": 0.1735, "num_input_tokens_seen": 93318672, "step": 43260 }, { "epoch": 7.939988988805285, "grad_norm": 0.013797195628285408, "learning_rate": 7.5453531972089064e-06, "loss": 0.0005, "num_input_tokens_seen": 93330608, "step": 43265 }, { "epoch": 7.940906588364838, "grad_norm": 15.616408348083496, "learning_rate": 7.544663933484733e-06, "loss": 0.0684, "num_input_tokens_seen": 93342320, "step": 43270 }, { "epoch": 7.941824187924389, "grad_norm": 0.019330808892846107, "learning_rate": 7.543974604493873e-06, "loss": 0.0011, "num_input_tokens_seen": 93352816, "step": 43275 }, { "epoch": 7.942741787483942, "grad_norm": 26.39388084411621, "learning_rate": 7.5432852102540055e-06, "loss": 0.0363, "num_input_tokens_seen": 93364496, "step": 43280 }, { "epoch": 7.943659387043494, "grad_norm": 0.033110953867435455, "learning_rate": 7.542595750782817e-06, "loss": 0.1726, "num_input_tokens_seen": 93375280, "step": 43285 }, { "epoch": 7.944576986603046, "grad_norm": 43.90511703491211, "learning_rate": 7.541906226097986e-06, "loss": 0.1145, "num_input_tokens_seen": 93386480, "step": 43290 }, { "epoch": 7.9454945861625985, "grad_norm": 0.07368472963571548, "learning_rate": 7.541216636217201e-06, "loss": 0.0007, "num_input_tokens_seen": 93398000, "step": 43295 }, { "epoch": 7.946412185722151, "grad_norm": 17.567398071289062, "learning_rate": 7.540526981158147e-06, "loss": 0.2519, "num_input_tokens_seen": 93409456, "step": 43300 }, { "epoch": 7.947329785281703, "grad_norm": 8.848380088806152, "learning_rate": 7.539837260938514e-06, "loss": 0.1435, "num_input_tokens_seen": 93420112, "step": 43305 }, { "epoch": 7.948247384841255, "grad_norm": 0.024492790922522545, "learning_rate": 7.539147475575992e-06, "loss": 0.0026, "num_input_tokens_seen": 93430480, "step": 43310 }, { "epoch": 7.949164984400808, "grad_norm": 12.574237823486328, "learning_rate": 7.5384576250882725e-06, "loss": 0.2088, "num_input_tokens_seen": 93442032, "step": 43315 }, { "epoch": 7.950082583960359, "grad_norm": 12.132689476013184, "learning_rate": 7.537767709493049e-06, "loss": 0.0469, "num_input_tokens_seen": 93452848, "step": 43320 }, { "epoch": 7.951000183519912, "grad_norm": 0.024075843393802643, "learning_rate": 7.537077728808018e-06, "loss": 0.1097, "num_input_tokens_seen": 93464336, "step": 43325 }, { "epoch": 7.951917783079464, "grad_norm": 0.17635053396224976, "learning_rate": 7.536387683050874e-06, "loss": 0.0036, "num_input_tokens_seen": 93475120, "step": 43330 }, { "epoch": 7.952835382639016, "grad_norm": 0.28858163952827454, "learning_rate": 7.535697572239318e-06, "loss": 0.0132, "num_input_tokens_seen": 93486480, "step": 43335 }, { "epoch": 7.953752982198568, "grad_norm": 0.024657486006617546, "learning_rate": 7.535007396391047e-06, "loss": 0.0649, "num_input_tokens_seen": 93496720, "step": 43340 }, { "epoch": 7.954670581758121, "grad_norm": 0.09524010121822357, "learning_rate": 7.534317155523767e-06, "loss": 0.0662, "num_input_tokens_seen": 93506160, "step": 43345 }, { "epoch": 7.9555881813176725, "grad_norm": 14.507152557373047, "learning_rate": 7.5336268496551805e-06, "loss": 0.0344, "num_input_tokens_seen": 93516080, "step": 43350 }, { "epoch": 7.956505780877225, "grad_norm": 0.07862704992294312, "learning_rate": 7.5329364788029905e-06, "loss": 0.1177, "num_input_tokens_seen": 93527568, "step": 43355 }, { "epoch": 7.9574233804367775, "grad_norm": 4.898947238922119, "learning_rate": 7.532246042984906e-06, "loss": 0.0037, "num_input_tokens_seen": 93539536, "step": 43360 }, { "epoch": 7.958340979996329, "grad_norm": 0.022278163582086563, "learning_rate": 7.5315555422186335e-06, "loss": 0.003, "num_input_tokens_seen": 93549904, "step": 43365 }, { "epoch": 7.959258579555882, "grad_norm": 0.029079308733344078, "learning_rate": 7.530864976521888e-06, "loss": 0.0005, "num_input_tokens_seen": 93560784, "step": 43370 }, { "epoch": 7.960176179115434, "grad_norm": 0.0229240320622921, "learning_rate": 7.5301743459123755e-06, "loss": 0.0041, "num_input_tokens_seen": 93571664, "step": 43375 }, { "epoch": 7.961093778674986, "grad_norm": 0.01107393391430378, "learning_rate": 7.529483650407815e-06, "loss": 0.0031, "num_input_tokens_seen": 93581936, "step": 43380 }, { "epoch": 7.962011378234538, "grad_norm": 0.016696872189641, "learning_rate": 7.528792890025918e-06, "loss": 0.309, "num_input_tokens_seen": 93592176, "step": 43385 }, { "epoch": 7.962928977794091, "grad_norm": 0.03325160592794418, "learning_rate": 7.5281020647844015e-06, "loss": 0.2444, "num_input_tokens_seen": 93603248, "step": 43390 }, { "epoch": 7.963846577353642, "grad_norm": 367.3630676269531, "learning_rate": 7.527411174700987e-06, "loss": 0.0524, "num_input_tokens_seen": 93614672, "step": 43395 }, { "epoch": 7.964764176913195, "grad_norm": 71.80982971191406, "learning_rate": 7.526720219793393e-06, "loss": 0.028, "num_input_tokens_seen": 93625616, "step": 43400 }, { "epoch": 7.965681776472747, "grad_norm": 0.010506309568881989, "learning_rate": 7.526029200079341e-06, "loss": 0.0008, "num_input_tokens_seen": 93636048, "step": 43405 }, { "epoch": 7.966599376032299, "grad_norm": 0.040350694209337234, "learning_rate": 7.525338115576555e-06, "loss": 0.0045, "num_input_tokens_seen": 93646352, "step": 43410 }, { "epoch": 7.967516975591852, "grad_norm": 23.216304779052734, "learning_rate": 7.524646966302759e-06, "loss": 0.0047, "num_input_tokens_seen": 93657872, "step": 43415 }, { "epoch": 7.968434575151404, "grad_norm": 0.031093338504433632, "learning_rate": 7.523955752275682e-06, "loss": 0.0008, "num_input_tokens_seen": 93667184, "step": 43420 }, { "epoch": 7.969352174710956, "grad_norm": 0.05979648232460022, "learning_rate": 7.523264473513052e-06, "loss": 0.0009, "num_input_tokens_seen": 93677200, "step": 43425 }, { "epoch": 7.970269774270508, "grad_norm": 0.025478366762399673, "learning_rate": 7.5225731300326e-06, "loss": 0.1325, "num_input_tokens_seen": 93687536, "step": 43430 }, { "epoch": 7.971187373830061, "grad_norm": 16.67230796813965, "learning_rate": 7.521881721852056e-06, "loss": 0.0858, "num_input_tokens_seen": 93698672, "step": 43435 }, { "epoch": 7.972104973389612, "grad_norm": 0.08985604345798492, "learning_rate": 7.521190248989154e-06, "loss": 0.0004, "num_input_tokens_seen": 93709072, "step": 43440 }, { "epoch": 7.973022572949165, "grad_norm": 0.0031898333691060543, "learning_rate": 7.52049871146163e-06, "loss": 0.0082, "num_input_tokens_seen": 93720368, "step": 43445 }, { "epoch": 7.973940172508717, "grad_norm": 0.017430957406759262, "learning_rate": 7.51980710928722e-06, "loss": 0.0004, "num_input_tokens_seen": 93730384, "step": 43450 }, { "epoch": 7.974857772068269, "grad_norm": 0.7169990539550781, "learning_rate": 7.519115442483664e-06, "loss": 0.1385, "num_input_tokens_seen": 93741328, "step": 43455 }, { "epoch": 7.9757753716278215, "grad_norm": 0.03345675393939018, "learning_rate": 7.5184237110686995e-06, "loss": 0.2454, "num_input_tokens_seen": 93751728, "step": 43460 }, { "epoch": 7.976692971187374, "grad_norm": 0.014376787468791008, "learning_rate": 7.5177319150600714e-06, "loss": 0.0011, "num_input_tokens_seen": 93763056, "step": 43465 }, { "epoch": 7.977610570746926, "grad_norm": 0.03445200249552727, "learning_rate": 7.517040054475522e-06, "loss": 0.0036, "num_input_tokens_seen": 93774480, "step": 43470 }, { "epoch": 7.978528170306478, "grad_norm": 11.779017448425293, "learning_rate": 7.516348129332794e-06, "loss": 0.3459, "num_input_tokens_seen": 93785040, "step": 43475 }, { "epoch": 7.979445769866031, "grad_norm": 0.0676153376698494, "learning_rate": 7.515656139649639e-06, "loss": 0.1079, "num_input_tokens_seen": 93795472, "step": 43480 }, { "epoch": 7.980363369425582, "grad_norm": 0.48000505566596985, "learning_rate": 7.514964085443801e-06, "loss": 0.0019, "num_input_tokens_seen": 93806832, "step": 43485 }, { "epoch": 7.981280968985135, "grad_norm": 1.5636999607086182, "learning_rate": 7.514271966733034e-06, "loss": 0.0122, "num_input_tokens_seen": 93816176, "step": 43490 }, { "epoch": 7.982198568544687, "grad_norm": 10.875030517578125, "learning_rate": 7.513579783535088e-06, "loss": 0.0548, "num_input_tokens_seen": 93827088, "step": 43495 }, { "epoch": 7.983116168104239, "grad_norm": 0.04056069254875183, "learning_rate": 7.512887535867713e-06, "loss": 0.0011, "num_input_tokens_seen": 93837712, "step": 43500 }, { "epoch": 7.984033767663791, "grad_norm": 0.026277780532836914, "learning_rate": 7.51219522374867e-06, "loss": 0.2517, "num_input_tokens_seen": 93848336, "step": 43505 }, { "epoch": 7.984951367223344, "grad_norm": 32.06534194946289, "learning_rate": 7.511502847195713e-06, "loss": 0.1961, "num_input_tokens_seen": 93860176, "step": 43510 }, { "epoch": 7.9858689667828955, "grad_norm": 0.09770886600017548, "learning_rate": 7.510810406226601e-06, "loss": 0.0012, "num_input_tokens_seen": 93870448, "step": 43515 }, { "epoch": 7.986786566342448, "grad_norm": 0.030024755746126175, "learning_rate": 7.510117900859091e-06, "loss": 0.0165, "num_input_tokens_seen": 93882480, "step": 43520 }, { "epoch": 7.9877041659020005, "grad_norm": 0.04642461612820625, "learning_rate": 7.509425331110949e-06, "loss": 0.2003, "num_input_tokens_seen": 93893904, "step": 43525 }, { "epoch": 7.988621765461552, "grad_norm": 0.028969528153538704, "learning_rate": 7.508732696999937e-06, "loss": 0.0447, "num_input_tokens_seen": 93904688, "step": 43530 }, { "epoch": 7.989539365021105, "grad_norm": 65.44274139404297, "learning_rate": 7.508039998543817e-06, "loss": 0.3173, "num_input_tokens_seen": 93916208, "step": 43535 }, { "epoch": 7.990456964580657, "grad_norm": 0.06295019388198853, "learning_rate": 7.507347235760361e-06, "loss": 0.002, "num_input_tokens_seen": 93927856, "step": 43540 }, { "epoch": 7.991374564140209, "grad_norm": 1.0333197116851807, "learning_rate": 7.5066544086673335e-06, "loss": 0.0012, "num_input_tokens_seen": 93938320, "step": 43545 }, { "epoch": 7.992292163699761, "grad_norm": 0.09668537229299545, "learning_rate": 7.505961517282505e-06, "loss": 0.0064, "num_input_tokens_seen": 93947696, "step": 43550 }, { "epoch": 7.993209763259314, "grad_norm": 0.1408684402704239, "learning_rate": 7.505268561623647e-06, "loss": 0.0011, "num_input_tokens_seen": 93958768, "step": 43555 }, { "epoch": 7.994127362818865, "grad_norm": 31.204214096069336, "learning_rate": 7.504575541708534e-06, "loss": 0.0823, "num_input_tokens_seen": 93970288, "step": 43560 }, { "epoch": 7.995044962378418, "grad_norm": 0.22070330381393433, "learning_rate": 7.503882457554941e-06, "loss": 0.0074, "num_input_tokens_seen": 93981264, "step": 43565 }, { "epoch": 7.99596256193797, "grad_norm": 0.03383176773786545, "learning_rate": 7.503189309180642e-06, "loss": 0.0926, "num_input_tokens_seen": 93993264, "step": 43570 }, { "epoch": 7.996880161497522, "grad_norm": 0.08343906700611115, "learning_rate": 7.502496096603417e-06, "loss": 0.0004, "num_input_tokens_seen": 94004336, "step": 43575 }, { "epoch": 7.9977977610570745, "grad_norm": 0.23038575053215027, "learning_rate": 7.501802819841046e-06, "loss": 0.0045, "num_input_tokens_seen": 94016432, "step": 43580 }, { "epoch": 7.998715360616627, "grad_norm": 67.5416030883789, "learning_rate": 7.50110947891131e-06, "loss": 0.089, "num_input_tokens_seen": 94027472, "step": 43585 }, { "epoch": 7.999632960176179, "grad_norm": 0.010075418278574944, "learning_rate": 7.5004160738319934e-06, "loss": 0.0005, "num_input_tokens_seen": 94037488, "step": 43590 }, { "epoch": 8.0, "eval_loss": 0.2700943350791931, "eval_runtime": 179.2511, "eval_samples_per_second": 30.399, "eval_steps_per_second": 7.604, "num_input_tokens_seen": 94040064, "step": 43592 }, { "epoch": 8.000550559735732, "grad_norm": 0.034453146159648895, "learning_rate": 7.499722604620878e-06, "loss": 0.0042, "num_input_tokens_seen": 94046656, "step": 43595 }, { "epoch": 8.001468159295284, "grad_norm": 18.143516540527344, "learning_rate": 7.4990290712957515e-06, "loss": 0.0026, "num_input_tokens_seen": 94058496, "step": 43600 }, { "epoch": 8.002385758854835, "grad_norm": 0.01982765644788742, "learning_rate": 7.498335473874405e-06, "loss": 0.0004, "num_input_tokens_seen": 94067968, "step": 43605 }, { "epoch": 8.003303358414389, "grad_norm": 0.029795031994581223, "learning_rate": 7.497641812374623e-06, "loss": 0.1443, "num_input_tokens_seen": 94079008, "step": 43610 }, { "epoch": 8.00422095797394, "grad_norm": 0.009539945051074028, "learning_rate": 7.496948086814202e-06, "loss": 0.0005, "num_input_tokens_seen": 94089504, "step": 43615 }, { "epoch": 8.005138557533492, "grad_norm": 0.01401714887470007, "learning_rate": 7.496254297210931e-06, "loss": 0.0708, "num_input_tokens_seen": 94100800, "step": 43620 }, { "epoch": 8.006056157093045, "grad_norm": 0.38106802105903625, "learning_rate": 7.495560443582606e-06, "loss": 0.0014, "num_input_tokens_seen": 94110944, "step": 43625 }, { "epoch": 8.006973756652597, "grad_norm": 0.018536720424890518, "learning_rate": 7.494866525947024e-06, "loss": 0.0004, "num_input_tokens_seen": 94122720, "step": 43630 }, { "epoch": 8.007891356212149, "grad_norm": 0.005600344389677048, "learning_rate": 7.494172544321982e-06, "loss": 0.0004, "num_input_tokens_seen": 94133120, "step": 43635 }, { "epoch": 8.008808955771702, "grad_norm": 1.0351362228393555, "learning_rate": 7.4934784987252805e-06, "loss": 0.002, "num_input_tokens_seen": 94144256, "step": 43640 }, { "epoch": 8.009726555331254, "grad_norm": 0.03948812931776047, "learning_rate": 7.49278438917472e-06, "loss": 0.0005, "num_input_tokens_seen": 94154656, "step": 43645 }, { "epoch": 8.010644154890805, "grad_norm": 0.003268694505095482, "learning_rate": 7.492090215688103e-06, "loss": 0.0003, "num_input_tokens_seen": 94165376, "step": 43650 }, { "epoch": 8.011561754450359, "grad_norm": 0.022221513092517853, "learning_rate": 7.491395978283235e-06, "loss": 0.0003, "num_input_tokens_seen": 94177024, "step": 43655 }, { "epoch": 8.01247935400991, "grad_norm": 98.54961395263672, "learning_rate": 7.4907016769779206e-06, "loss": 0.0743, "num_input_tokens_seen": 94186912, "step": 43660 }, { "epoch": 8.013396953569462, "grad_norm": 0.008406697772443295, "learning_rate": 7.4900073117899686e-06, "loss": 0.0004, "num_input_tokens_seen": 94197888, "step": 43665 }, { "epoch": 8.014314553129015, "grad_norm": 0.22610269486904144, "learning_rate": 7.4893128827371875e-06, "loss": 0.0007, "num_input_tokens_seen": 94210432, "step": 43670 }, { "epoch": 8.015232152688567, "grad_norm": 0.015437650494277477, "learning_rate": 7.48861838983739e-06, "loss": 0.0331, "num_input_tokens_seen": 94221664, "step": 43675 }, { "epoch": 8.016149752248118, "grad_norm": 36.461021423339844, "learning_rate": 7.487923833108388e-06, "loss": 0.1009, "num_input_tokens_seen": 94231552, "step": 43680 }, { "epoch": 8.017067351807672, "grad_norm": 10.602903366088867, "learning_rate": 7.487229212567995e-06, "loss": 0.2169, "num_input_tokens_seen": 94240160, "step": 43685 }, { "epoch": 8.017984951367223, "grad_norm": 0.23174850642681122, "learning_rate": 7.486534528234028e-06, "loss": 0.001, "num_input_tokens_seen": 94250656, "step": 43690 }, { "epoch": 8.018902550926775, "grad_norm": 17.125324249267578, "learning_rate": 7.485839780124303e-06, "loss": 0.0536, "num_input_tokens_seen": 94260640, "step": 43695 }, { "epoch": 8.019820150486328, "grad_norm": 0.028534771874547005, "learning_rate": 7.485144968256641e-06, "loss": 0.0007, "num_input_tokens_seen": 94270656, "step": 43700 }, { "epoch": 8.02073775004588, "grad_norm": 0.05784556269645691, "learning_rate": 7.484450092648863e-06, "loss": 0.1598, "num_input_tokens_seen": 94281888, "step": 43705 }, { "epoch": 8.021655349605432, "grad_norm": 0.015672054141759872, "learning_rate": 7.48375515331879e-06, "loss": 0.0004, "num_input_tokens_seen": 94292288, "step": 43710 }, { "epoch": 8.022572949164985, "grad_norm": 0.012924136593937874, "learning_rate": 7.483060150284247e-06, "loss": 0.0008, "num_input_tokens_seen": 94303104, "step": 43715 }, { "epoch": 8.023490548724537, "grad_norm": 0.7182543277740479, "learning_rate": 7.4823650835630594e-06, "loss": 0.1048, "num_input_tokens_seen": 94313952, "step": 43720 }, { "epoch": 8.024408148284088, "grad_norm": 0.04026654362678528, "learning_rate": 7.481669953173055e-06, "loss": 0.0007, "num_input_tokens_seen": 94324512, "step": 43725 }, { "epoch": 8.025325747843642, "grad_norm": 0.003542982740327716, "learning_rate": 7.480974759132061e-06, "loss": 0.1567, "num_input_tokens_seen": 94335296, "step": 43730 }, { "epoch": 8.026243347403193, "grad_norm": 0.0101110078394413, "learning_rate": 7.480279501457911e-06, "loss": 0.1199, "num_input_tokens_seen": 94345664, "step": 43735 }, { "epoch": 8.027160946962745, "grad_norm": 1.2300785779953003, "learning_rate": 7.479584180168437e-06, "loss": 0.0012, "num_input_tokens_seen": 94357600, "step": 43740 }, { "epoch": 8.028078546522298, "grad_norm": 31.74488067626953, "learning_rate": 7.47888879528147e-06, "loss": 0.3092, "num_input_tokens_seen": 94368352, "step": 43745 }, { "epoch": 8.02899614608185, "grad_norm": 0.019920552149415016, "learning_rate": 7.478193346814848e-06, "loss": 0.0003, "num_input_tokens_seen": 94377536, "step": 43750 }, { "epoch": 8.029913745641402, "grad_norm": 0.011965620331466198, "learning_rate": 7.477497834786408e-06, "loss": 0.0005, "num_input_tokens_seen": 94387360, "step": 43755 }, { "epoch": 8.030831345200955, "grad_norm": 12.241567611694336, "learning_rate": 7.476802259213987e-06, "loss": 0.0273, "num_input_tokens_seen": 94397952, "step": 43760 }, { "epoch": 8.031748944760507, "grad_norm": 0.016245733946561813, "learning_rate": 7.476106620115429e-06, "loss": 0.0168, "num_input_tokens_seen": 94409984, "step": 43765 }, { "epoch": 8.032666544320058, "grad_norm": 0.02227376028895378, "learning_rate": 7.475410917508571e-06, "loss": 0.0028, "num_input_tokens_seen": 94420128, "step": 43770 }, { "epoch": 8.033584143879612, "grad_norm": 0.010949772782623768, "learning_rate": 7.47471515141126e-06, "loss": 0.0825, "num_input_tokens_seen": 94431520, "step": 43775 }, { "epoch": 8.034501743439163, "grad_norm": 0.019413167610764503, "learning_rate": 7.474019321841343e-06, "loss": 0.0007, "num_input_tokens_seen": 94442560, "step": 43780 }, { "epoch": 8.035419342998715, "grad_norm": 0.05041066184639931, "learning_rate": 7.4733234288166625e-06, "loss": 0.2077, "num_input_tokens_seen": 94453792, "step": 43785 }, { "epoch": 8.036336942558268, "grad_norm": 0.08783864229917526, "learning_rate": 7.472627472355071e-06, "loss": 0.1072, "num_input_tokens_seen": 94465280, "step": 43790 }, { "epoch": 8.03725454211782, "grad_norm": 0.08291526883840561, "learning_rate": 7.471931452474414e-06, "loss": 0.0009, "num_input_tokens_seen": 94474368, "step": 43795 }, { "epoch": 8.038172141677371, "grad_norm": 6.159337043762207, "learning_rate": 7.471235369192551e-06, "loss": 0.2058, "num_input_tokens_seen": 94483840, "step": 43800 }, { "epoch": 8.039089741236925, "grad_norm": 0.011978024616837502, "learning_rate": 7.470539222527328e-06, "loss": 0.0008, "num_input_tokens_seen": 94495168, "step": 43805 }, { "epoch": 8.040007340796476, "grad_norm": 0.0270847100764513, "learning_rate": 7.469843012496603e-06, "loss": 0.003, "num_input_tokens_seen": 94507424, "step": 43810 }, { "epoch": 8.040924940356028, "grad_norm": 0.029212942346930504, "learning_rate": 7.469146739118233e-06, "loss": 0.0007, "num_input_tokens_seen": 94518848, "step": 43815 }, { "epoch": 8.041842539915582, "grad_norm": 0.010045140981674194, "learning_rate": 7.468450402410076e-06, "loss": 0.1166, "num_input_tokens_seen": 94529792, "step": 43820 }, { "epoch": 8.042760139475133, "grad_norm": 0.6277073621749878, "learning_rate": 7.467754002389992e-06, "loss": 0.001, "num_input_tokens_seen": 94540288, "step": 43825 }, { "epoch": 8.043677739034685, "grad_norm": 0.02406265400350094, "learning_rate": 7.467057539075842e-06, "loss": 0.1595, "num_input_tokens_seen": 94551520, "step": 43830 }, { "epoch": 8.044595338594238, "grad_norm": 10.394205093383789, "learning_rate": 7.466361012485491e-06, "loss": 0.3258, "num_input_tokens_seen": 94562784, "step": 43835 }, { "epoch": 8.04551293815379, "grad_norm": 3.9575815200805664, "learning_rate": 7.465664422636801e-06, "loss": 0.2245, "num_input_tokens_seen": 94573440, "step": 43840 }, { "epoch": 8.046430537713341, "grad_norm": 0.013648022897541523, "learning_rate": 7.464967769547641e-06, "loss": 0.0012, "num_input_tokens_seen": 94585120, "step": 43845 }, { "epoch": 8.047348137272895, "grad_norm": 0.06954099982976913, "learning_rate": 7.464271053235877e-06, "loss": 0.0032, "num_input_tokens_seen": 94596704, "step": 43850 }, { "epoch": 8.048265736832446, "grad_norm": 0.052026569843292236, "learning_rate": 7.463574273719381e-06, "loss": 0.157, "num_input_tokens_seen": 94607232, "step": 43855 }, { "epoch": 8.049183336391998, "grad_norm": 2.139303684234619, "learning_rate": 7.4628774310160235e-06, "loss": 0.1141, "num_input_tokens_seen": 94619616, "step": 43860 }, { "epoch": 8.050100935951551, "grad_norm": 0.009557739831507206, "learning_rate": 7.462180525143676e-06, "loss": 0.0012, "num_input_tokens_seen": 94629664, "step": 43865 }, { "epoch": 8.051018535511103, "grad_norm": 16.201168060302734, "learning_rate": 7.461483556120214e-06, "loss": 0.0229, "num_input_tokens_seen": 94641824, "step": 43870 }, { "epoch": 8.051936135070655, "grad_norm": 0.053644973784685135, "learning_rate": 7.4607865239635145e-06, "loss": 0.1764, "num_input_tokens_seen": 94652448, "step": 43875 }, { "epoch": 8.052853734630208, "grad_norm": 0.007270834408700466, "learning_rate": 7.4600894286914535e-06, "loss": 0.0006, "num_input_tokens_seen": 94663776, "step": 43880 }, { "epoch": 8.05377133418976, "grad_norm": 0.04552028700709343, "learning_rate": 7.4593922703219126e-06, "loss": 0.0004, "num_input_tokens_seen": 94675232, "step": 43885 }, { "epoch": 8.054688933749311, "grad_norm": 0.03050888143479824, "learning_rate": 7.45869504887277e-06, "loss": 0.0014, "num_input_tokens_seen": 94685824, "step": 43890 }, { "epoch": 8.055606533308865, "grad_norm": 14.446270942687988, "learning_rate": 7.4579977643619104e-06, "loss": 0.138, "num_input_tokens_seen": 94695456, "step": 43895 }, { "epoch": 8.056524132868416, "grad_norm": 0.09585971385240555, "learning_rate": 7.457300416807219e-06, "loss": 0.0015, "num_input_tokens_seen": 94705280, "step": 43900 }, { "epoch": 8.057441732427968, "grad_norm": 0.015764998272061348, "learning_rate": 7.45660300622658e-06, "loss": 0.0012, "num_input_tokens_seen": 94715840, "step": 43905 }, { "epoch": 8.058359331987521, "grad_norm": 10.246371269226074, "learning_rate": 7.455905532637881e-06, "loss": 0.1082, "num_input_tokens_seen": 94726496, "step": 43910 }, { "epoch": 8.059276931547073, "grad_norm": 0.07466606050729752, "learning_rate": 7.455207996059011e-06, "loss": 0.001, "num_input_tokens_seen": 94737632, "step": 43915 }, { "epoch": 8.060194531106625, "grad_norm": 0.05724480375647545, "learning_rate": 7.454510396507861e-06, "loss": 0.001, "num_input_tokens_seen": 94747936, "step": 43920 }, { "epoch": 8.061112130666178, "grad_norm": 0.0253727026283741, "learning_rate": 7.453812734002325e-06, "loss": 0.0005, "num_input_tokens_seen": 94759360, "step": 43925 }, { "epoch": 8.06202973022573, "grad_norm": 0.14941208064556122, "learning_rate": 7.453115008560295e-06, "loss": 0.0462, "num_input_tokens_seen": 94771520, "step": 43930 }, { "epoch": 8.062947329785281, "grad_norm": 0.08907189965248108, "learning_rate": 7.452417220199666e-06, "loss": 0.1536, "num_input_tokens_seen": 94782688, "step": 43935 }, { "epoch": 8.063864929344835, "grad_norm": 0.018759943544864655, "learning_rate": 7.4517193689383364e-06, "loss": 0.0007, "num_input_tokens_seen": 94794496, "step": 43940 }, { "epoch": 8.064782528904386, "grad_norm": 0.23910309374332428, "learning_rate": 7.451021454794204e-06, "loss": 0.001, "num_input_tokens_seen": 94803968, "step": 43945 }, { "epoch": 8.065700128463938, "grad_norm": 0.025422492995858192, "learning_rate": 7.4503234777851716e-06, "loss": 0.0006, "num_input_tokens_seen": 94814656, "step": 43950 }, { "epoch": 8.066617728023491, "grad_norm": 0.054697636514902115, "learning_rate": 7.449625437929139e-06, "loss": 0.0014, "num_input_tokens_seen": 94825920, "step": 43955 }, { "epoch": 8.067535327583043, "grad_norm": 0.017601529136300087, "learning_rate": 7.448927335244012e-06, "loss": 0.0029, "num_input_tokens_seen": 94836736, "step": 43960 }, { "epoch": 8.068452927142594, "grad_norm": 0.016403747722506523, "learning_rate": 7.448229169747692e-06, "loss": 0.0008, "num_input_tokens_seen": 94848128, "step": 43965 }, { "epoch": 8.069370526702148, "grad_norm": 124.85067749023438, "learning_rate": 7.4475309414580896e-06, "loss": 0.2666, "num_input_tokens_seen": 94858912, "step": 43970 }, { "epoch": 8.0702881262617, "grad_norm": 0.041347112506628036, "learning_rate": 7.446832650393112e-06, "loss": 0.0004, "num_input_tokens_seen": 94869984, "step": 43975 }, { "epoch": 8.071205725821251, "grad_norm": 0.010181176476180553, "learning_rate": 7.446134296570669e-06, "loss": 0.001, "num_input_tokens_seen": 94878720, "step": 43980 }, { "epoch": 8.072123325380804, "grad_norm": 65.2428207397461, "learning_rate": 7.4454358800086715e-06, "loss": 0.0767, "num_input_tokens_seen": 94888800, "step": 43985 }, { "epoch": 8.073040924940356, "grad_norm": 0.08803422003984451, "learning_rate": 7.444737400725034e-06, "loss": 0.0003, "num_input_tokens_seen": 94899552, "step": 43990 }, { "epoch": 8.073958524499908, "grad_norm": 0.014633110724389553, "learning_rate": 7.444038858737672e-06, "loss": 0.1383, "num_input_tokens_seen": 94909440, "step": 43995 }, { "epoch": 8.074876124059461, "grad_norm": 0.02914891019463539, "learning_rate": 7.443340254064499e-06, "loss": 0.0005, "num_input_tokens_seen": 94920032, "step": 44000 }, { "epoch": 8.075793723619013, "grad_norm": 0.01114784274250269, "learning_rate": 7.442641586723438e-06, "loss": 0.0004, "num_input_tokens_seen": 94931584, "step": 44005 }, { "epoch": 8.076711323178564, "grad_norm": 0.006208937615156174, "learning_rate": 7.441942856732405e-06, "loss": 0.0002, "num_input_tokens_seen": 94942688, "step": 44010 }, { "epoch": 8.077628922738118, "grad_norm": 0.018559392541646957, "learning_rate": 7.441244064109322e-06, "loss": 0.0004, "num_input_tokens_seen": 94952160, "step": 44015 }, { "epoch": 8.07854652229767, "grad_norm": 0.030773915350437164, "learning_rate": 7.440545208872114e-06, "loss": 0.0002, "num_input_tokens_seen": 94963392, "step": 44020 }, { "epoch": 8.079464121857221, "grad_norm": 0.028623484075069427, "learning_rate": 7.4398462910387016e-06, "loss": 0.0002, "num_input_tokens_seen": 94974944, "step": 44025 }, { "epoch": 8.080381721416774, "grad_norm": 0.08702593296766281, "learning_rate": 7.439147310627014e-06, "loss": 0.141, "num_input_tokens_seen": 94985952, "step": 44030 }, { "epoch": 8.081299320976326, "grad_norm": 0.025523345917463303, "learning_rate": 7.43844826765498e-06, "loss": 0.0003, "num_input_tokens_seen": 94996448, "step": 44035 }, { "epoch": 8.082216920535878, "grad_norm": 0.031248275190591812, "learning_rate": 7.437749162140524e-06, "loss": 0.001, "num_input_tokens_seen": 95008288, "step": 44040 }, { "epoch": 8.083134520095431, "grad_norm": 0.007365142926573753, "learning_rate": 7.437049994101583e-06, "loss": 0.0002, "num_input_tokens_seen": 95019136, "step": 44045 }, { "epoch": 8.084052119654983, "grad_norm": 39.87068176269531, "learning_rate": 7.436350763556085e-06, "loss": 0.065, "num_input_tokens_seen": 95029728, "step": 44050 }, { "epoch": 8.084969719214534, "grad_norm": 0.013399635441601276, "learning_rate": 7.4356514705219664e-06, "loss": 0.0012, "num_input_tokens_seen": 95039840, "step": 44055 }, { "epoch": 8.085887318774088, "grad_norm": 0.08329086750745773, "learning_rate": 7.4349521150171634e-06, "loss": 0.0003, "num_input_tokens_seen": 95050272, "step": 44060 }, { "epoch": 8.08680491833364, "grad_norm": 0.007990296930074692, "learning_rate": 7.434252697059611e-06, "loss": 0.0003, "num_input_tokens_seen": 95062240, "step": 44065 }, { "epoch": 8.08772251789319, "grad_norm": 0.008150323294103146, "learning_rate": 7.433553216667251e-06, "loss": 0.0004, "num_input_tokens_seen": 95073600, "step": 44070 }, { "epoch": 8.088640117452744, "grad_norm": 17.203968048095703, "learning_rate": 7.432853673858021e-06, "loss": 0.1194, "num_input_tokens_seen": 95084448, "step": 44075 }, { "epoch": 8.089557717012296, "grad_norm": 0.09506457298994064, "learning_rate": 7.432154068649867e-06, "loss": 0.0003, "num_input_tokens_seen": 95095392, "step": 44080 }, { "epoch": 8.090475316571847, "grad_norm": 0.004295213147997856, "learning_rate": 7.4314544010607306e-06, "loss": 0.0005, "num_input_tokens_seen": 95106336, "step": 44085 }, { "epoch": 8.0913929161314, "grad_norm": 0.07504522055387497, "learning_rate": 7.430754671108555e-06, "loss": 0.0043, "num_input_tokens_seen": 95117664, "step": 44090 }, { "epoch": 8.092310515690952, "grad_norm": 0.0024345670826733112, "learning_rate": 7.430054878811292e-06, "loss": 0.0617, "num_input_tokens_seen": 95127904, "step": 44095 }, { "epoch": 8.093228115250504, "grad_norm": 0.009483731351792812, "learning_rate": 7.429355024186885e-06, "loss": 0.0006, "num_input_tokens_seen": 95139424, "step": 44100 }, { "epoch": 8.094145714810058, "grad_norm": 0.0029636286199092865, "learning_rate": 7.428655107253288e-06, "loss": 0.0191, "num_input_tokens_seen": 95150624, "step": 44105 }, { "epoch": 8.09506331436961, "grad_norm": 0.02556094527244568, "learning_rate": 7.427955128028452e-06, "loss": 0.0002, "num_input_tokens_seen": 95161728, "step": 44110 }, { "epoch": 8.09598091392916, "grad_norm": 0.047387514263391495, "learning_rate": 7.42725508653033e-06, "loss": 0.0004, "num_input_tokens_seen": 95172064, "step": 44115 }, { "epoch": 8.096898513488714, "grad_norm": 0.03849956393241882, "learning_rate": 7.4265549827768755e-06, "loss": 0.0002, "num_input_tokens_seen": 95183136, "step": 44120 }, { "epoch": 8.097816113048266, "grad_norm": 0.07698637992143631, "learning_rate": 7.425854816786048e-06, "loss": 0.0631, "num_input_tokens_seen": 95194144, "step": 44125 }, { "epoch": 8.098733712607817, "grad_norm": 0.007998914457857609, "learning_rate": 7.425154588575803e-06, "loss": 0.0534, "num_input_tokens_seen": 95204800, "step": 44130 }, { "epoch": 8.09965131216737, "grad_norm": 0.13858802616596222, "learning_rate": 7.424454298164102e-06, "loss": 0.002, "num_input_tokens_seen": 95214240, "step": 44135 }, { "epoch": 8.100568911726922, "grad_norm": 0.004452577326446772, "learning_rate": 7.4237539455689055e-06, "loss": 0.0003, "num_input_tokens_seen": 95226016, "step": 44140 }, { "epoch": 8.101486511286474, "grad_norm": 0.019313719123601913, "learning_rate": 7.423053530808178e-06, "loss": 0.1008, "num_input_tokens_seen": 95235552, "step": 44145 }, { "epoch": 8.102404110846027, "grad_norm": 0.0069656469859182835, "learning_rate": 7.422353053899881e-06, "loss": 0.0003, "num_input_tokens_seen": 95246624, "step": 44150 }, { "epoch": 8.103321710405579, "grad_norm": 0.14737568795681, "learning_rate": 7.421652514861985e-06, "loss": 0.0007, "num_input_tokens_seen": 95257248, "step": 44155 }, { "epoch": 8.10423930996513, "grad_norm": 0.021670576184988022, "learning_rate": 7.420951913712453e-06, "loss": 0.0008, "num_input_tokens_seen": 95268512, "step": 44160 }, { "epoch": 8.105156909524684, "grad_norm": 0.02421768195927143, "learning_rate": 7.420251250469257e-06, "loss": 0.0009, "num_input_tokens_seen": 95278816, "step": 44165 }, { "epoch": 8.106074509084236, "grad_norm": 0.004017329774796963, "learning_rate": 7.419550525150367e-06, "loss": 0.0309, "num_input_tokens_seen": 95287616, "step": 44170 }, { "epoch": 8.106992108643787, "grad_norm": 30.106739044189453, "learning_rate": 7.4188497377737565e-06, "loss": 0.2157, "num_input_tokens_seen": 95297984, "step": 44175 }, { "epoch": 8.10790970820334, "grad_norm": 0.025922134518623352, "learning_rate": 7.4181488883574e-06, "loss": 0.0002, "num_input_tokens_seen": 95308928, "step": 44180 }, { "epoch": 8.108827307762892, "grad_norm": 0.21834920346736908, "learning_rate": 7.417447976919272e-06, "loss": 0.0002, "num_input_tokens_seen": 95319776, "step": 44185 }, { "epoch": 8.109744907322444, "grad_norm": 0.00971047393977642, "learning_rate": 7.4167470034773505e-06, "loss": 0.0002, "num_input_tokens_seen": 95330624, "step": 44190 }, { "epoch": 8.110662506881997, "grad_norm": 0.020571649074554443, "learning_rate": 7.416045968049613e-06, "loss": 0.0001, "num_input_tokens_seen": 95340352, "step": 44195 }, { "epoch": 8.111580106441549, "grad_norm": 0.011459417641162872, "learning_rate": 7.415344870654041e-06, "loss": 0.0003, "num_input_tokens_seen": 95351424, "step": 44200 }, { "epoch": 8.1124977060011, "grad_norm": 0.013754944317042828, "learning_rate": 7.4146437113086164e-06, "loss": 0.0002, "num_input_tokens_seen": 95362240, "step": 44205 }, { "epoch": 8.113415305560654, "grad_norm": 0.014168606139719486, "learning_rate": 7.4139424900313225e-06, "loss": 0.2001, "num_input_tokens_seen": 95372640, "step": 44210 }, { "epoch": 8.114332905120206, "grad_norm": 0.0075896549969911575, "learning_rate": 7.413241206840146e-06, "loss": 0.0003, "num_input_tokens_seen": 95383584, "step": 44215 }, { "epoch": 8.115250504679757, "grad_norm": 0.027624785900115967, "learning_rate": 7.412539861753073e-06, "loss": 0.169, "num_input_tokens_seen": 95394688, "step": 44220 }, { "epoch": 8.11616810423931, "grad_norm": 0.006178934592753649, "learning_rate": 7.41183845478809e-06, "loss": 0.1254, "num_input_tokens_seen": 95405344, "step": 44225 }, { "epoch": 8.117085703798862, "grad_norm": 0.011222713626921177, "learning_rate": 7.411136985963191e-06, "loss": 0.0027, "num_input_tokens_seen": 95414720, "step": 44230 }, { "epoch": 8.118003303358414, "grad_norm": 0.03138534352183342, "learning_rate": 7.410435455296364e-06, "loss": 0.123, "num_input_tokens_seen": 95424544, "step": 44235 }, { "epoch": 8.118920902917967, "grad_norm": 0.014741671271622181, "learning_rate": 7.409733862805603e-06, "loss": 0.0945, "num_input_tokens_seen": 95434880, "step": 44240 }, { "epoch": 8.119838502477519, "grad_norm": 0.012202403508126736, "learning_rate": 7.409032208508904e-06, "loss": 0.0005, "num_input_tokens_seen": 95446112, "step": 44245 }, { "epoch": 8.12075610203707, "grad_norm": 26.178543090820312, "learning_rate": 7.408330492424262e-06, "loss": 0.0857, "num_input_tokens_seen": 95458016, "step": 44250 }, { "epoch": 8.121673701596624, "grad_norm": 0.031435899436473846, "learning_rate": 7.407628714569676e-06, "loss": 0.1099, "num_input_tokens_seen": 95469152, "step": 44255 }, { "epoch": 8.122591301156175, "grad_norm": 0.010393703356385231, "learning_rate": 7.406926874963144e-06, "loss": 0.0002, "num_input_tokens_seen": 95479936, "step": 44260 }, { "epoch": 8.123508900715727, "grad_norm": 0.0031733440700918436, "learning_rate": 7.4062249736226685e-06, "loss": 0.0004, "num_input_tokens_seen": 95490208, "step": 44265 }, { "epoch": 8.12442650027528, "grad_norm": 0.002949150511994958, "learning_rate": 7.405523010566252e-06, "loss": 0.1597, "num_input_tokens_seen": 95500928, "step": 44270 }, { "epoch": 8.125344099834832, "grad_norm": 0.004461838863790035, "learning_rate": 7.404820985811898e-06, "loss": 0.0004, "num_input_tokens_seen": 95511520, "step": 44275 }, { "epoch": 8.126261699394384, "grad_norm": 0.01448050793260336, "learning_rate": 7.404118899377612e-06, "loss": 0.0104, "num_input_tokens_seen": 95523040, "step": 44280 }, { "epoch": 8.127179298953937, "grad_norm": 0.03501834720373154, "learning_rate": 7.403416751281403e-06, "loss": 0.015, "num_input_tokens_seen": 95533952, "step": 44285 }, { "epoch": 8.128096898513489, "grad_norm": 0.006673700176179409, "learning_rate": 7.4027145415412816e-06, "loss": 0.0005, "num_input_tokens_seen": 95544768, "step": 44290 }, { "epoch": 8.12901449807304, "grad_norm": 0.000823758018668741, "learning_rate": 7.402012270175254e-06, "loss": 0.0008, "num_input_tokens_seen": 95556800, "step": 44295 }, { "epoch": 8.129932097632594, "grad_norm": 0.009693583473563194, "learning_rate": 7.401309937201334e-06, "loss": 0.0246, "num_input_tokens_seen": 95567648, "step": 44300 }, { "epoch": 8.130849697192145, "grad_norm": 0.015477074310183525, "learning_rate": 7.400607542637537e-06, "loss": 0.1229, "num_input_tokens_seen": 95579200, "step": 44305 }, { "epoch": 8.131767296751697, "grad_norm": 0.02211151458323002, "learning_rate": 7.3999050865018764e-06, "loss": 0.0677, "num_input_tokens_seen": 95589664, "step": 44310 }, { "epoch": 8.13268489631125, "grad_norm": 0.004764984827488661, "learning_rate": 7.39920256881237e-06, "loss": 0.0566, "num_input_tokens_seen": 95598720, "step": 44315 }, { "epoch": 8.133602495870802, "grad_norm": 0.0108981654047966, "learning_rate": 7.398499989587036e-06, "loss": 0.1741, "num_input_tokens_seen": 95611008, "step": 44320 }, { "epoch": 8.134520095430354, "grad_norm": 70.0684585571289, "learning_rate": 7.3977973488438945e-06, "loss": 0.1886, "num_input_tokens_seen": 95619968, "step": 44325 }, { "epoch": 8.135437694989907, "grad_norm": 0.009413125924766064, "learning_rate": 7.397094646600968e-06, "loss": 0.1011, "num_input_tokens_seen": 95631552, "step": 44330 }, { "epoch": 8.136355294549459, "grad_norm": 0.9544186592102051, "learning_rate": 7.3963918828762785e-06, "loss": 0.0723, "num_input_tokens_seen": 95642624, "step": 44335 }, { "epoch": 8.13727289410901, "grad_norm": 0.596175491809845, "learning_rate": 7.3956890576878515e-06, "loss": 0.0899, "num_input_tokens_seen": 95652128, "step": 44340 }, { "epoch": 8.138190493668564, "grad_norm": 0.006714583840221167, "learning_rate": 7.394986171053713e-06, "loss": 0.1815, "num_input_tokens_seen": 95663808, "step": 44345 }, { "epoch": 8.139108093228115, "grad_norm": 8.557955741882324, "learning_rate": 7.39428322299189e-06, "loss": 0.058, "num_input_tokens_seen": 95674368, "step": 44350 }, { "epoch": 8.140025692787667, "grad_norm": 21.169998168945312, "learning_rate": 7.393580213520415e-06, "loss": 0.0093, "num_input_tokens_seen": 95685632, "step": 44355 }, { "epoch": 8.14094329234722, "grad_norm": 0.36966753005981445, "learning_rate": 7.392877142657316e-06, "loss": 0.0007, "num_input_tokens_seen": 95695936, "step": 44360 }, { "epoch": 8.141860891906772, "grad_norm": 0.006872003898024559, "learning_rate": 7.392174010420628e-06, "loss": 0.0032, "num_input_tokens_seen": 95707008, "step": 44365 }, { "epoch": 8.142778491466323, "grad_norm": 0.3057822585105896, "learning_rate": 7.3914708168283824e-06, "loss": 0.0006, "num_input_tokens_seen": 95715744, "step": 44370 }, { "epoch": 8.143696091025877, "grad_norm": 14.66771125793457, "learning_rate": 7.390767561898617e-06, "loss": 0.1263, "num_input_tokens_seen": 95726240, "step": 44375 }, { "epoch": 8.144613690585429, "grad_norm": 0.0934695452451706, "learning_rate": 7.390064245649371e-06, "loss": 0.0006, "num_input_tokens_seen": 95737088, "step": 44380 }, { "epoch": 8.14553129014498, "grad_norm": 40.99915313720703, "learning_rate": 7.389360868098679e-06, "loss": 0.1509, "num_input_tokens_seen": 95748416, "step": 44385 }, { "epoch": 8.146448889704534, "grad_norm": 13.409151077270508, "learning_rate": 7.3886574292645865e-06, "loss": 0.0058, "num_input_tokens_seen": 95759424, "step": 44390 }, { "epoch": 8.147366489264085, "grad_norm": 0.11847522109746933, "learning_rate": 7.38795392916513e-06, "loss": 0.1445, "num_input_tokens_seen": 95770272, "step": 44395 }, { "epoch": 8.148284088823637, "grad_norm": 0.01435263641178608, "learning_rate": 7.38725036781836e-06, "loss": 0.0005, "num_input_tokens_seen": 95781408, "step": 44400 }, { "epoch": 8.14920168838319, "grad_norm": 0.05236690863966942, "learning_rate": 7.386546745242316e-06, "loss": 0.2543, "num_input_tokens_seen": 95791904, "step": 44405 }, { "epoch": 8.150119287942742, "grad_norm": 0.04059155285358429, "learning_rate": 7.3858430614550455e-06, "loss": 0.0256, "num_input_tokens_seen": 95802304, "step": 44410 }, { "epoch": 8.151036887502293, "grad_norm": 12.333414077758789, "learning_rate": 7.3851393164746e-06, "loss": 0.0162, "num_input_tokens_seen": 95811392, "step": 44415 }, { "epoch": 8.151954487061847, "grad_norm": 0.008679199032485485, "learning_rate": 7.384435510319027e-06, "loss": 0.0015, "num_input_tokens_seen": 95821792, "step": 44420 }, { "epoch": 8.152872086621398, "grad_norm": 364.5002136230469, "learning_rate": 7.383731643006379e-06, "loss": 0.0677, "num_input_tokens_seen": 95831840, "step": 44425 }, { "epoch": 8.15378968618095, "grad_norm": 0.1727205365896225, "learning_rate": 7.383027714554708e-06, "loss": 0.0005, "num_input_tokens_seen": 95843456, "step": 44430 }, { "epoch": 8.154707285740503, "grad_norm": 0.0108822425827384, "learning_rate": 7.38232372498207e-06, "loss": 0.0406, "num_input_tokens_seen": 95854400, "step": 44435 }, { "epoch": 8.155624885300055, "grad_norm": 0.12773948907852173, "learning_rate": 7.381619674306521e-06, "loss": 0.0009, "num_input_tokens_seen": 95865760, "step": 44440 }, { "epoch": 8.156542484859607, "grad_norm": 0.025567878037691116, "learning_rate": 7.380915562546117e-06, "loss": 0.0491, "num_input_tokens_seen": 95876544, "step": 44445 }, { "epoch": 8.15746008441916, "grad_norm": 2.064364433288574, "learning_rate": 7.380211389718921e-06, "loss": 0.12, "num_input_tokens_seen": 95885824, "step": 44450 }, { "epoch": 8.158377683978712, "grad_norm": 0.11903352290391922, "learning_rate": 7.379507155842991e-06, "loss": 0.0006, "num_input_tokens_seen": 95897664, "step": 44455 }, { "epoch": 8.159295283538263, "grad_norm": 0.0613459050655365, "learning_rate": 7.378802860936389e-06, "loss": 0.117, "num_input_tokens_seen": 95908480, "step": 44460 }, { "epoch": 8.160212883097817, "grad_norm": 0.019897671416401863, "learning_rate": 7.378098505017183e-06, "loss": 0.0268, "num_input_tokens_seen": 95917312, "step": 44465 }, { "epoch": 8.161130482657368, "grad_norm": 0.051126737147569656, "learning_rate": 7.377394088103433e-06, "loss": 0.22, "num_input_tokens_seen": 95928800, "step": 44470 }, { "epoch": 8.16204808221692, "grad_norm": 0.030712058767676353, "learning_rate": 7.376689610213212e-06, "loss": 0.1302, "num_input_tokens_seen": 95938720, "step": 44475 }, { "epoch": 8.162965681776473, "grad_norm": 3.5784521102905273, "learning_rate": 7.375985071364585e-06, "loss": 0.0041, "num_input_tokens_seen": 95948480, "step": 44480 }, { "epoch": 8.163883281336025, "grad_norm": 0.0022140119690448046, "learning_rate": 7.375280471575624e-06, "loss": 0.0619, "num_input_tokens_seen": 95960128, "step": 44485 }, { "epoch": 8.164800880895577, "grad_norm": 0.1789519339799881, "learning_rate": 7.3745758108643995e-06, "loss": 0.1143, "num_input_tokens_seen": 95971104, "step": 44490 }, { "epoch": 8.16571848045513, "grad_norm": 0.01875218190252781, "learning_rate": 7.373871089248985e-06, "loss": 0.1629, "num_input_tokens_seen": 95981248, "step": 44495 }, { "epoch": 8.166636080014682, "grad_norm": 0.1016325131058693, "learning_rate": 7.373166306747458e-06, "loss": 0.09, "num_input_tokens_seen": 95992192, "step": 44500 }, { "epoch": 8.167553679574233, "grad_norm": 9.196147918701172, "learning_rate": 7.3724614633778925e-06, "loss": 0.1445, "num_input_tokens_seen": 96002752, "step": 44505 }, { "epoch": 8.168471279133787, "grad_norm": 280.1322326660156, "learning_rate": 7.371756559158367e-06, "loss": 0.0237, "num_input_tokens_seen": 96011872, "step": 44510 }, { "epoch": 8.169388878693338, "grad_norm": 0.163083016872406, "learning_rate": 7.371051594106964e-06, "loss": 0.0408, "num_input_tokens_seen": 96022752, "step": 44515 }, { "epoch": 8.17030647825289, "grad_norm": 0.664059042930603, "learning_rate": 7.37034656824176e-06, "loss": 0.0027, "num_input_tokens_seen": 96033184, "step": 44520 }, { "epoch": 8.171224077812443, "grad_norm": 0.220778226852417, "learning_rate": 7.369641481580841e-06, "loss": 0.0022, "num_input_tokens_seen": 96043712, "step": 44525 }, { "epoch": 8.172141677371995, "grad_norm": 0.9032276272773743, "learning_rate": 7.368936334142289e-06, "loss": 0.0022, "num_input_tokens_seen": 96054592, "step": 44530 }, { "epoch": 8.173059276931546, "grad_norm": 0.07868477702140808, "learning_rate": 7.368231125944193e-06, "loss": 0.0265, "num_input_tokens_seen": 96065664, "step": 44535 }, { "epoch": 8.1739768764911, "grad_norm": 0.008863342925906181, "learning_rate": 7.3675258570046395e-06, "loss": 0.0065, "num_input_tokens_seen": 96076672, "step": 44540 }, { "epoch": 8.174894476050651, "grad_norm": 0.12715698778629303, "learning_rate": 7.366820527341716e-06, "loss": 0.0006, "num_input_tokens_seen": 96087648, "step": 44545 }, { "epoch": 8.175812075610203, "grad_norm": 0.04745474457740784, "learning_rate": 7.366115136973515e-06, "loss": 0.0027, "num_input_tokens_seen": 96096512, "step": 44550 }, { "epoch": 8.176729675169756, "grad_norm": 0.008467479608952999, "learning_rate": 7.365409685918128e-06, "loss": 0.0007, "num_input_tokens_seen": 96106464, "step": 44555 }, { "epoch": 8.177647274729308, "grad_norm": 0.011142192408442497, "learning_rate": 7.364704174193646e-06, "loss": 0.2173, "num_input_tokens_seen": 96118816, "step": 44560 }, { "epoch": 8.17856487428886, "grad_norm": 0.027325302362442017, "learning_rate": 7.363998601818171e-06, "loss": 0.2776, "num_input_tokens_seen": 96127872, "step": 44565 }, { "epoch": 8.179482473848413, "grad_norm": 0.004981834441423416, "learning_rate": 7.363292968809793e-06, "loss": 0.0011, "num_input_tokens_seen": 96139200, "step": 44570 }, { "epoch": 8.180400073407965, "grad_norm": 0.04167953133583069, "learning_rate": 7.362587275186614e-06, "loss": 0.0148, "num_input_tokens_seen": 96147936, "step": 44575 }, { "epoch": 8.181317672967516, "grad_norm": 0.018647177144885063, "learning_rate": 7.361881520966733e-06, "loss": 0.0003, "num_input_tokens_seen": 96158624, "step": 44580 }, { "epoch": 8.18223527252707, "grad_norm": 0.0060557108372449875, "learning_rate": 7.361175706168252e-06, "loss": 0.0013, "num_input_tokens_seen": 96170240, "step": 44585 }, { "epoch": 8.183152872086621, "grad_norm": 0.01193518377840519, "learning_rate": 7.360469830809272e-06, "loss": 0.0001, "num_input_tokens_seen": 96180288, "step": 44590 }, { "epoch": 8.184070471646173, "grad_norm": 0.11859933286905289, "learning_rate": 7.359763894907901e-06, "loss": 0.0151, "num_input_tokens_seen": 96191808, "step": 44595 }, { "epoch": 8.184988071205726, "grad_norm": 0.042924560606479645, "learning_rate": 7.359057898482244e-06, "loss": 0.3838, "num_input_tokens_seen": 96202592, "step": 44600 }, { "epoch": 8.185905670765278, "grad_norm": 0.018592339009046555, "learning_rate": 7.358351841550406e-06, "loss": 0.0002, "num_input_tokens_seen": 96213568, "step": 44605 }, { "epoch": 8.18682327032483, "grad_norm": 0.024336213245987892, "learning_rate": 7.3576457241305e-06, "loss": 0.0003, "num_input_tokens_seen": 96224256, "step": 44610 }, { "epoch": 8.187740869884383, "grad_norm": 0.033523816615343094, "learning_rate": 7.3569395462406335e-06, "loss": 0.0002, "num_input_tokens_seen": 96234752, "step": 44615 }, { "epoch": 8.188658469443935, "grad_norm": 11.501495361328125, "learning_rate": 7.356233307898922e-06, "loss": 0.0026, "num_input_tokens_seen": 96246368, "step": 44620 }, { "epoch": 8.189576069003486, "grad_norm": 0.09695027023553848, "learning_rate": 7.355527009123479e-06, "loss": 0.2097, "num_input_tokens_seen": 96257440, "step": 44625 }, { "epoch": 8.19049366856304, "grad_norm": 123.8773422241211, "learning_rate": 7.354820649932417e-06, "loss": 0.0564, "num_input_tokens_seen": 96268352, "step": 44630 }, { "epoch": 8.191411268122591, "grad_norm": 5.2434773445129395, "learning_rate": 7.354114230343856e-06, "loss": 0.3268, "num_input_tokens_seen": 96278208, "step": 44635 }, { "epoch": 8.192328867682143, "grad_norm": 0.011132885701954365, "learning_rate": 7.3534077503759125e-06, "loss": 0.0006, "num_input_tokens_seen": 96289024, "step": 44640 }, { "epoch": 8.193246467241696, "grad_norm": 0.006583577953279018, "learning_rate": 7.352701210046708e-06, "loss": 0.0133, "num_input_tokens_seen": 96299616, "step": 44645 }, { "epoch": 8.194164066801248, "grad_norm": 0.03918147087097168, "learning_rate": 7.351994609374364e-06, "loss": 0.0006, "num_input_tokens_seen": 96310240, "step": 44650 }, { "epoch": 8.1950816663608, "grad_norm": 0.04366935044527054, "learning_rate": 7.3512879483770035e-06, "loss": 0.0005, "num_input_tokens_seen": 96319808, "step": 44655 }, { "epoch": 8.195999265920353, "grad_norm": 0.0049187373369932175, "learning_rate": 7.350581227072752e-06, "loss": 0.0917, "num_input_tokens_seen": 96330208, "step": 44660 }, { "epoch": 8.196916865479905, "grad_norm": 0.016092946752905846, "learning_rate": 7.349874445479733e-06, "loss": 0.1069, "num_input_tokens_seen": 96340928, "step": 44665 }, { "epoch": 8.197834465039456, "grad_norm": 0.010516636073589325, "learning_rate": 7.349167603616079e-06, "loss": 0.0004, "num_input_tokens_seen": 96351936, "step": 44670 }, { "epoch": 8.19875206459901, "grad_norm": 0.0049637663178145885, "learning_rate": 7.348460701499915e-06, "loss": 0.0015, "num_input_tokens_seen": 96363616, "step": 44675 }, { "epoch": 8.199669664158561, "grad_norm": 0.011679130606353283, "learning_rate": 7.3477537391493745e-06, "loss": 0.0005, "num_input_tokens_seen": 96373984, "step": 44680 }, { "epoch": 8.200587263718113, "grad_norm": 0.025036189705133438, "learning_rate": 7.34704671658259e-06, "loss": 0.0002, "num_input_tokens_seen": 96384640, "step": 44685 }, { "epoch": 8.201504863277666, "grad_norm": 0.002973678056150675, "learning_rate": 7.346339633817694e-06, "loss": 0.0007, "num_input_tokens_seen": 96395616, "step": 44690 }, { "epoch": 8.202422462837218, "grad_norm": 0.08777167648077011, "learning_rate": 7.345632490872821e-06, "loss": 0.0045, "num_input_tokens_seen": 96405344, "step": 44695 }, { "epoch": 8.20334006239677, "grad_norm": 0.051795974373817444, "learning_rate": 7.344925287766114e-06, "loss": 0.0003, "num_input_tokens_seen": 96417376, "step": 44700 }, { "epoch": 8.204257661956323, "grad_norm": 57.253868103027344, "learning_rate": 7.344218024515704e-06, "loss": 0.1689, "num_input_tokens_seen": 96427520, "step": 44705 }, { "epoch": 8.205175261515874, "grad_norm": 0.005884336773306131, "learning_rate": 7.343510701139737e-06, "loss": 0.0681, "num_input_tokens_seen": 96437600, "step": 44710 }, { "epoch": 8.206092861075426, "grad_norm": 0.052943192422389984, "learning_rate": 7.342803317656353e-06, "loss": 0.1598, "num_input_tokens_seen": 96448896, "step": 44715 }, { "epoch": 8.20701046063498, "grad_norm": 21.838878631591797, "learning_rate": 7.342095874083694e-06, "loss": 0.1734, "num_input_tokens_seen": 96459520, "step": 44720 }, { "epoch": 8.207928060194531, "grad_norm": 0.03628145530819893, "learning_rate": 7.341388370439907e-06, "loss": 0.0019, "num_input_tokens_seen": 96470880, "step": 44725 }, { "epoch": 8.208845659754083, "grad_norm": 0.0070478105917572975, "learning_rate": 7.340680806743135e-06, "loss": 0.1131, "num_input_tokens_seen": 96480544, "step": 44730 }, { "epoch": 8.209763259313636, "grad_norm": 0.016831643879413605, "learning_rate": 7.33997318301153e-06, "loss": 0.0008, "num_input_tokens_seen": 96490624, "step": 44735 }, { "epoch": 8.210680858873188, "grad_norm": 0.007526730187237263, "learning_rate": 7.339265499263237e-06, "loss": 0.0006, "num_input_tokens_seen": 96501920, "step": 44740 }, { "epoch": 8.21159845843274, "grad_norm": 28.783889770507812, "learning_rate": 7.338557755516412e-06, "loss": 0.2097, "num_input_tokens_seen": 96513344, "step": 44745 }, { "epoch": 8.212516057992293, "grad_norm": 112.2773208618164, "learning_rate": 7.337849951789204e-06, "loss": 0.0781, "num_input_tokens_seen": 96523072, "step": 44750 }, { "epoch": 8.213433657551844, "grad_norm": 38.11758041381836, "learning_rate": 7.337142088099767e-06, "loss": 0.0884, "num_input_tokens_seen": 96534720, "step": 44755 }, { "epoch": 8.214351257111396, "grad_norm": 0.4336976408958435, "learning_rate": 7.33643416446626e-06, "loss": 0.0023, "num_input_tokens_seen": 96546656, "step": 44760 }, { "epoch": 8.21526885667095, "grad_norm": 0.07354320585727692, "learning_rate": 7.335726180906836e-06, "loss": 0.0003, "num_input_tokens_seen": 96556352, "step": 44765 }, { "epoch": 8.216186456230501, "grad_norm": 44.03877258300781, "learning_rate": 7.335018137439657e-06, "loss": 0.1354, "num_input_tokens_seen": 96567040, "step": 44770 }, { "epoch": 8.217104055790053, "grad_norm": 0.006545218639075756, "learning_rate": 7.33431003408288e-06, "loss": 0.0002, "num_input_tokens_seen": 96576256, "step": 44775 }, { "epoch": 8.218021655349606, "grad_norm": 0.03969196975231171, "learning_rate": 7.333601870854669e-06, "loss": 0.0619, "num_input_tokens_seen": 96586432, "step": 44780 }, { "epoch": 8.218939254909158, "grad_norm": 0.009549052454531193, "learning_rate": 7.332893647773187e-06, "loss": 0.1557, "num_input_tokens_seen": 96598208, "step": 44785 }, { "epoch": 8.21985685446871, "grad_norm": 0.033106040209531784, "learning_rate": 7.332185364856599e-06, "loss": 0.1603, "num_input_tokens_seen": 96609376, "step": 44790 }, { "epoch": 8.220774454028263, "grad_norm": 0.0008629722869955003, "learning_rate": 7.33147702212307e-06, "loss": 0.0004, "num_input_tokens_seen": 96621088, "step": 44795 }, { "epoch": 8.221692053587814, "grad_norm": 0.28592413663864136, "learning_rate": 7.330768619590769e-06, "loss": 0.0557, "num_input_tokens_seen": 96632928, "step": 44800 }, { "epoch": 8.222609653147366, "grad_norm": 0.00487716868519783, "learning_rate": 7.3300601572778655e-06, "loss": 0.0065, "num_input_tokens_seen": 96644928, "step": 44805 }, { "epoch": 8.22352725270692, "grad_norm": 0.011087206192314625, "learning_rate": 7.32935163520253e-06, "loss": 0.1445, "num_input_tokens_seen": 96656480, "step": 44810 }, { "epoch": 8.22444485226647, "grad_norm": 0.025412023067474365, "learning_rate": 7.328643053382937e-06, "loss": 0.0248, "num_input_tokens_seen": 96667968, "step": 44815 }, { "epoch": 8.225362451826022, "grad_norm": 0.023456620052456856, "learning_rate": 7.3279344118372575e-06, "loss": 0.1043, "num_input_tokens_seen": 96679648, "step": 44820 }, { "epoch": 8.226280051385576, "grad_norm": 0.20529362559318542, "learning_rate": 7.327225710583668e-06, "loss": 0.2536, "num_input_tokens_seen": 96690496, "step": 44825 }, { "epoch": 8.227197650945127, "grad_norm": 0.13899309933185577, "learning_rate": 7.326516949640346e-06, "loss": 0.0005, "num_input_tokens_seen": 96700960, "step": 44830 }, { "epoch": 8.228115250504679, "grad_norm": 0.02622869610786438, "learning_rate": 7.3258081290254715e-06, "loss": 0.0049, "num_input_tokens_seen": 96713504, "step": 44835 }, { "epoch": 8.229032850064232, "grad_norm": 0.02714415453374386, "learning_rate": 7.325099248757221e-06, "loss": 0.0003, "num_input_tokens_seen": 96723712, "step": 44840 }, { "epoch": 8.229950449623784, "grad_norm": 0.015482100658118725, "learning_rate": 7.324390308853779e-06, "loss": 0.0003, "num_input_tokens_seen": 96734944, "step": 44845 }, { "epoch": 8.230868049183336, "grad_norm": 0.2902044355869293, "learning_rate": 7.323681309333328e-06, "loss": 0.0006, "num_input_tokens_seen": 96745376, "step": 44850 }, { "epoch": 8.231785648742889, "grad_norm": 0.0925072580575943, "learning_rate": 7.322972250214054e-06, "loss": 0.0004, "num_input_tokens_seen": 96756352, "step": 44855 }, { "epoch": 8.23270324830244, "grad_norm": 0.14563269913196564, "learning_rate": 7.32226313151414e-06, "loss": 0.0918, "num_input_tokens_seen": 96767904, "step": 44860 }, { "epoch": 8.233620847861992, "grad_norm": 0.011240487918257713, "learning_rate": 7.321553953251777e-06, "loss": 0.046, "num_input_tokens_seen": 96779200, "step": 44865 }, { "epoch": 8.234538447421546, "grad_norm": 0.0588148757815361, "learning_rate": 7.320844715445153e-06, "loss": 0.0007, "num_input_tokens_seen": 96789408, "step": 44870 }, { "epoch": 8.235456046981097, "grad_norm": 0.002604707144200802, "learning_rate": 7.32013541811246e-06, "loss": 0.0007, "num_input_tokens_seen": 96800640, "step": 44875 }, { "epoch": 8.236373646540649, "grad_norm": 0.02051105722784996, "learning_rate": 7.319426061271888e-06, "loss": 0.0011, "num_input_tokens_seen": 96812320, "step": 44880 }, { "epoch": 8.237291246100202, "grad_norm": 0.002876527840271592, "learning_rate": 7.318716644941633e-06, "loss": 0.0086, "num_input_tokens_seen": 96822496, "step": 44885 }, { "epoch": 8.238208845659754, "grad_norm": 0.04103131964802742, "learning_rate": 7.318007169139889e-06, "loss": 0.0003, "num_input_tokens_seen": 96833088, "step": 44890 }, { "epoch": 8.239126445219306, "grad_norm": 0.005280924029648304, "learning_rate": 7.317297633884854e-06, "loss": 0.288, "num_input_tokens_seen": 96845248, "step": 44895 }, { "epoch": 8.240044044778859, "grad_norm": 0.010967184789478779, "learning_rate": 7.316588039194726e-06, "loss": 0.0005, "num_input_tokens_seen": 96855904, "step": 44900 }, { "epoch": 8.24096164433841, "grad_norm": 0.004677976947277784, "learning_rate": 7.315878385087707e-06, "loss": 0.0002, "num_input_tokens_seen": 96865536, "step": 44905 }, { "epoch": 8.241879243897962, "grad_norm": 0.010958007536828518, "learning_rate": 7.315168671581995e-06, "loss": 0.0004, "num_input_tokens_seen": 96875488, "step": 44910 }, { "epoch": 8.242796843457516, "grad_norm": 0.006067101377993822, "learning_rate": 7.314458898695794e-06, "loss": 0.0002, "num_input_tokens_seen": 96886848, "step": 44915 }, { "epoch": 8.243714443017067, "grad_norm": 0.011468365788459778, "learning_rate": 7.313749066447311e-06, "loss": 0.0751, "num_input_tokens_seen": 96898560, "step": 44920 }, { "epoch": 8.244632042576619, "grad_norm": 0.001606960198841989, "learning_rate": 7.31303917485475e-06, "loss": 0.0012, "num_input_tokens_seen": 96908352, "step": 44925 }, { "epoch": 8.245549642136172, "grad_norm": 0.003080356866121292, "learning_rate": 7.31232922393632e-06, "loss": 0.0001, "num_input_tokens_seen": 96920768, "step": 44930 }, { "epoch": 8.246467241695724, "grad_norm": 0.01384699996560812, "learning_rate": 7.311619213710227e-06, "loss": 0.0003, "num_input_tokens_seen": 96930816, "step": 44935 }, { "epoch": 8.247384841255275, "grad_norm": 0.010714301839470863, "learning_rate": 7.310909144194685e-06, "loss": 0.0536, "num_input_tokens_seen": 96942112, "step": 44940 }, { "epoch": 8.248302440814829, "grad_norm": 0.01670352928340435, "learning_rate": 7.310199015407906e-06, "loss": 0.0003, "num_input_tokens_seen": 96952480, "step": 44945 }, { "epoch": 8.24922004037438, "grad_norm": 0.004832085222005844, "learning_rate": 7.309488827368102e-06, "loss": 0.1818, "num_input_tokens_seen": 96962208, "step": 44950 }, { "epoch": 8.250137639933932, "grad_norm": 0.003591343527659774, "learning_rate": 7.308778580093489e-06, "loss": 0.0003, "num_input_tokens_seen": 96971648, "step": 44955 }, { "epoch": 8.251055239493486, "grad_norm": 0.013367521576583385, "learning_rate": 7.308068273602283e-06, "loss": 0.0002, "num_input_tokens_seen": 96981920, "step": 44960 }, { "epoch": 8.251972839053037, "grad_norm": 0.1243375763297081, "learning_rate": 7.307357907912702e-06, "loss": 0.0002, "num_input_tokens_seen": 96992896, "step": 44965 }, { "epoch": 8.252890438612589, "grad_norm": 0.008828959427773952, "learning_rate": 7.306647483042969e-06, "loss": 0.0004, "num_input_tokens_seen": 97004032, "step": 44970 }, { "epoch": 8.253808038172142, "grad_norm": 0.015264803543686867, "learning_rate": 7.305936999011303e-06, "loss": 0.1223, "num_input_tokens_seen": 97014592, "step": 44975 }, { "epoch": 8.254725637731694, "grad_norm": 0.002652491210028529, "learning_rate": 7.305226455835926e-06, "loss": 0.0009, "num_input_tokens_seen": 97025632, "step": 44980 }, { "epoch": 8.255643237291245, "grad_norm": 0.12397255748510361, "learning_rate": 7.304515853535062e-06, "loss": 0.0004, "num_input_tokens_seen": 97036448, "step": 44985 }, { "epoch": 8.256560836850799, "grad_norm": 0.003491368144750595, "learning_rate": 7.303805192126939e-06, "loss": 0.0001, "num_input_tokens_seen": 97047584, "step": 44990 }, { "epoch": 8.25747843641035, "grad_norm": 0.0043742540292441845, "learning_rate": 7.303094471629785e-06, "loss": 0.0038, "num_input_tokens_seen": 97057760, "step": 44995 }, { "epoch": 8.258396035969902, "grad_norm": 0.003688404569402337, "learning_rate": 7.302383692061825e-06, "loss": 0.1721, "num_input_tokens_seen": 97068736, "step": 45000 }, { "epoch": 8.259313635529455, "grad_norm": 0.10986195504665375, "learning_rate": 7.301672853441293e-06, "loss": 0.0002, "num_input_tokens_seen": 97078720, "step": 45005 }, { "epoch": 8.260231235089007, "grad_norm": 0.021767383441329002, "learning_rate": 7.300961955786419e-06, "loss": 0.0002, "num_input_tokens_seen": 97090080, "step": 45010 }, { "epoch": 8.261148834648559, "grad_norm": 0.014253795146942139, "learning_rate": 7.300250999115437e-06, "loss": 0.3068, "num_input_tokens_seen": 97101088, "step": 45015 }, { "epoch": 8.262066434208112, "grad_norm": 0.007196096237748861, "learning_rate": 7.299539983446582e-06, "loss": 0.1545, "num_input_tokens_seen": 97111712, "step": 45020 }, { "epoch": 8.262984033767664, "grad_norm": 0.018788591027259827, "learning_rate": 7.29882890879809e-06, "loss": 0.1008, "num_input_tokens_seen": 97121408, "step": 45025 }, { "epoch": 8.263901633327215, "grad_norm": 0.023058416321873665, "learning_rate": 7.298117775188201e-06, "loss": 0.1483, "num_input_tokens_seen": 97132320, "step": 45030 }, { "epoch": 8.264819232886769, "grad_norm": 0.014633637852966785, "learning_rate": 7.29740658263515e-06, "loss": 0.0099, "num_input_tokens_seen": 97143136, "step": 45035 }, { "epoch": 8.26573683244632, "grad_norm": 107.03938293457031, "learning_rate": 7.296695331157184e-06, "loss": 0.0826, "num_input_tokens_seen": 97154720, "step": 45040 }, { "epoch": 8.266654432005872, "grad_norm": 0.2590383291244507, "learning_rate": 7.29598402077254e-06, "loss": 0.0003, "num_input_tokens_seen": 97165024, "step": 45045 }, { "epoch": 8.267572031565425, "grad_norm": 0.0385914146900177, "learning_rate": 7.295272651499465e-06, "loss": 0.0007, "num_input_tokens_seen": 97174560, "step": 45050 }, { "epoch": 8.268489631124977, "grad_norm": 0.005848689004778862, "learning_rate": 7.2945612233562045e-06, "loss": 0.0004, "num_input_tokens_seen": 97185472, "step": 45055 }, { "epoch": 8.269407230684529, "grad_norm": 0.044116731733083725, "learning_rate": 7.293849736361005e-06, "loss": 0.0005, "num_input_tokens_seen": 97196960, "step": 45060 }, { "epoch": 8.270324830244082, "grad_norm": 0.023022174835205078, "learning_rate": 7.293138190532114e-06, "loss": 0.0676, "num_input_tokens_seen": 97208960, "step": 45065 }, { "epoch": 8.271242429803634, "grad_norm": 0.018716108053922653, "learning_rate": 7.292426585887783e-06, "loss": 0.0844, "num_input_tokens_seen": 97219520, "step": 45070 }, { "epoch": 8.272160029363185, "grad_norm": 0.03194461017847061, "learning_rate": 7.291714922446262e-06, "loss": 0.0246, "num_input_tokens_seen": 97231008, "step": 45075 }, { "epoch": 8.273077628922739, "grad_norm": 0.014608390629291534, "learning_rate": 7.291003200225806e-06, "loss": 0.1597, "num_input_tokens_seen": 97241216, "step": 45080 }, { "epoch": 8.27399522848229, "grad_norm": 0.162886843085289, "learning_rate": 7.290291419244669e-06, "loss": 0.1228, "num_input_tokens_seen": 97251072, "step": 45085 }, { "epoch": 8.274912828041842, "grad_norm": 0.1123763918876648, "learning_rate": 7.289579579521106e-06, "loss": 0.0007, "num_input_tokens_seen": 97261984, "step": 45090 }, { "epoch": 8.275830427601395, "grad_norm": 0.10370136797428131, "learning_rate": 7.288867681073375e-06, "loss": 0.0011, "num_input_tokens_seen": 97272320, "step": 45095 }, { "epoch": 8.276748027160947, "grad_norm": 0.021657105535268784, "learning_rate": 7.288155723919735e-06, "loss": 0.0008, "num_input_tokens_seen": 97282880, "step": 45100 }, { "epoch": 8.277665626720498, "grad_norm": 0.0520792230963707, "learning_rate": 7.287443708078448e-06, "loss": 0.0004, "num_input_tokens_seen": 97294112, "step": 45105 }, { "epoch": 8.278583226280052, "grad_norm": 0.005332866217941046, "learning_rate": 7.286731633567775e-06, "loss": 0.1067, "num_input_tokens_seen": 97304576, "step": 45110 }, { "epoch": 8.279500825839603, "grad_norm": 0.014287290163338184, "learning_rate": 7.2860195004059806e-06, "loss": 0.2324, "num_input_tokens_seen": 97315968, "step": 45115 }, { "epoch": 8.280418425399155, "grad_norm": 0.019189275801181793, "learning_rate": 7.285307308611327e-06, "loss": 0.0008, "num_input_tokens_seen": 97326560, "step": 45120 }, { "epoch": 8.281336024958708, "grad_norm": 0.01640421524643898, "learning_rate": 7.2845950582020844e-06, "loss": 0.0002, "num_input_tokens_seen": 97337952, "step": 45125 }, { "epoch": 8.28225362451826, "grad_norm": 0.0025708922185003757, "learning_rate": 7.283882749196519e-06, "loss": 0.1884, "num_input_tokens_seen": 97348512, "step": 45130 }, { "epoch": 8.283171224077812, "grad_norm": 0.03151717036962509, "learning_rate": 7.2831703816129e-06, "loss": 0.0003, "num_input_tokens_seen": 97359264, "step": 45135 }, { "epoch": 8.284088823637365, "grad_norm": 20.49036979675293, "learning_rate": 7.2824579554695e-06, "loss": 0.0803, "num_input_tokens_seen": 97370016, "step": 45140 }, { "epoch": 8.285006423196917, "grad_norm": 0.0529475063085556, "learning_rate": 7.2817454707845914e-06, "loss": 0.0023, "num_input_tokens_seen": 97380992, "step": 45145 }, { "epoch": 8.285924022756468, "grad_norm": 0.055676400661468506, "learning_rate": 7.281032927576448e-06, "loss": 0.0013, "num_input_tokens_seen": 97390656, "step": 45150 }, { "epoch": 8.286841622316022, "grad_norm": 0.0241151861846447, "learning_rate": 7.280320325863344e-06, "loss": 0.1872, "num_input_tokens_seen": 97401792, "step": 45155 }, { "epoch": 8.287759221875573, "grad_norm": 0.21951483190059662, "learning_rate": 7.27960766566356e-06, "loss": 0.2022, "num_input_tokens_seen": 97412096, "step": 45160 }, { "epoch": 8.288676821435125, "grad_norm": 0.03305468708276749, "learning_rate": 7.27889494699537e-06, "loss": 0.0006, "num_input_tokens_seen": 97421088, "step": 45165 }, { "epoch": 8.289594420994678, "grad_norm": 0.04169449955224991, "learning_rate": 7.278182169877057e-06, "loss": 0.0037, "num_input_tokens_seen": 97432160, "step": 45170 }, { "epoch": 8.29051202055423, "grad_norm": 0.04354053735733032, "learning_rate": 7.277469334326903e-06, "loss": 0.001, "num_input_tokens_seen": 97442464, "step": 45175 }, { "epoch": 8.291429620113782, "grad_norm": 3.485407590866089, "learning_rate": 7.276756440363191e-06, "loss": 0.2177, "num_input_tokens_seen": 97454144, "step": 45180 }, { "epoch": 8.292347219673335, "grad_norm": 2.3853254318237305, "learning_rate": 7.276043488004203e-06, "loss": 0.0033, "num_input_tokens_seen": 97465088, "step": 45185 }, { "epoch": 8.293264819232887, "grad_norm": 0.14044535160064697, "learning_rate": 7.275330477268229e-06, "loss": 0.0139, "num_input_tokens_seen": 97476320, "step": 45190 }, { "epoch": 8.294182418792438, "grad_norm": 0.052363041788339615, "learning_rate": 7.274617408173552e-06, "loss": 0.0042, "num_input_tokens_seen": 97486720, "step": 45195 }, { "epoch": 8.295100018351992, "grad_norm": 0.0152974221855402, "learning_rate": 7.273904280738466e-06, "loss": 0.1009, "num_input_tokens_seen": 97497152, "step": 45200 }, { "epoch": 8.296017617911543, "grad_norm": 0.007481144741177559, "learning_rate": 7.27319109498126e-06, "loss": 0.0005, "num_input_tokens_seen": 97508576, "step": 45205 }, { "epoch": 8.296935217471095, "grad_norm": 0.07463689148426056, "learning_rate": 7.2724778509202235e-06, "loss": 0.0004, "num_input_tokens_seen": 97521440, "step": 45210 }, { "epoch": 8.297852817030648, "grad_norm": 0.08848311752080917, "learning_rate": 7.271764548573654e-06, "loss": 0.1167, "num_input_tokens_seen": 97531168, "step": 45215 }, { "epoch": 8.2987704165902, "grad_norm": 0.12947732210159302, "learning_rate": 7.271051187959843e-06, "loss": 0.054, "num_input_tokens_seen": 97541664, "step": 45220 }, { "epoch": 8.299688016149751, "grad_norm": 0.14524072408676147, "learning_rate": 7.270337769097091e-06, "loss": 0.0709, "num_input_tokens_seen": 97551808, "step": 45225 }, { "epoch": 8.300605615709305, "grad_norm": 0.016353975981473923, "learning_rate": 7.269624292003692e-06, "loss": 0.117, "num_input_tokens_seen": 97561408, "step": 45230 }, { "epoch": 8.301523215268857, "grad_norm": 2.144486427307129, "learning_rate": 7.268910756697948e-06, "loss": 0.001, "num_input_tokens_seen": 97571616, "step": 45235 }, { "epoch": 8.302440814828408, "grad_norm": 0.01822775788605213, "learning_rate": 7.268197163198161e-06, "loss": 0.1975, "num_input_tokens_seen": 97583296, "step": 45240 }, { "epoch": 8.303358414387962, "grad_norm": 0.11577978730201721, "learning_rate": 7.26748351152263e-06, "loss": 0.1973, "num_input_tokens_seen": 97592832, "step": 45245 }, { "epoch": 8.304276013947513, "grad_norm": 0.005369880702346563, "learning_rate": 7.266769801689662e-06, "loss": 0.1596, "num_input_tokens_seen": 97604608, "step": 45250 }, { "epoch": 8.305193613507065, "grad_norm": 0.006775634363293648, "learning_rate": 7.266056033717561e-06, "loss": 0.0006, "num_input_tokens_seen": 97616000, "step": 45255 }, { "epoch": 8.306111213066618, "grad_norm": 0.06277228891849518, "learning_rate": 7.265342207624637e-06, "loss": 0.0003, "num_input_tokens_seen": 97627040, "step": 45260 }, { "epoch": 8.30702881262617, "grad_norm": 0.1021810919046402, "learning_rate": 7.264628323429196e-06, "loss": 0.0005, "num_input_tokens_seen": 97637952, "step": 45265 }, { "epoch": 8.307946412185721, "grad_norm": 0.012287084944546223, "learning_rate": 7.263914381149546e-06, "loss": 0.0007, "num_input_tokens_seen": 97649376, "step": 45270 }, { "epoch": 8.308864011745275, "grad_norm": 0.27094295620918274, "learning_rate": 7.263200380804003e-06, "loss": 0.0005, "num_input_tokens_seen": 97660416, "step": 45275 }, { "epoch": 8.309781611304826, "grad_norm": 0.05023888498544693, "learning_rate": 7.2624863224108775e-06, "loss": 0.1085, "num_input_tokens_seen": 97672064, "step": 45280 }, { "epoch": 8.310699210864378, "grad_norm": 0.024542108178138733, "learning_rate": 7.261772205988484e-06, "loss": 0.3065, "num_input_tokens_seen": 97681184, "step": 45285 }, { "epoch": 8.311616810423931, "grad_norm": 0.009410779923200607, "learning_rate": 7.261058031555139e-06, "loss": 0.0028, "num_input_tokens_seen": 97691968, "step": 45290 }, { "epoch": 8.312534409983483, "grad_norm": 64.02912902832031, "learning_rate": 7.260343799129159e-06, "loss": 0.0766, "num_input_tokens_seen": 97704576, "step": 45295 }, { "epoch": 8.313452009543035, "grad_norm": 0.12479942291975021, "learning_rate": 7.259629508728865e-06, "loss": 0.0006, "num_input_tokens_seen": 97716160, "step": 45300 }, { "epoch": 8.314369609102588, "grad_norm": 0.047392118722200394, "learning_rate": 7.258915160372575e-06, "loss": 0.1197, "num_input_tokens_seen": 97727488, "step": 45305 }, { "epoch": 8.31528720866214, "grad_norm": 0.28158944845199585, "learning_rate": 7.2582007540786125e-06, "loss": 0.0009, "num_input_tokens_seen": 97737632, "step": 45310 }, { "epoch": 8.316204808221691, "grad_norm": 0.006396826822310686, "learning_rate": 7.257486289865302e-06, "loss": 0.0008, "num_input_tokens_seen": 97748896, "step": 45315 }, { "epoch": 8.317122407781245, "grad_norm": 0.04474526643753052, "learning_rate": 7.256771767750965e-06, "loss": 0.0005, "num_input_tokens_seen": 97758592, "step": 45320 }, { "epoch": 8.318040007340796, "grad_norm": 14.373668670654297, "learning_rate": 7.256057187753931e-06, "loss": 0.1853, "num_input_tokens_seen": 97768480, "step": 45325 }, { "epoch": 8.318957606900348, "grad_norm": 0.03247574716806412, "learning_rate": 7.255342549892525e-06, "loss": 0.063, "num_input_tokens_seen": 97780960, "step": 45330 }, { "epoch": 8.319875206459901, "grad_norm": 0.025886360555887222, "learning_rate": 7.254627854185081e-06, "loss": 0.0009, "num_input_tokens_seen": 97791200, "step": 45335 }, { "epoch": 8.320792806019453, "grad_norm": 0.14634083211421967, "learning_rate": 7.253913100649926e-06, "loss": 0.0538, "num_input_tokens_seen": 97800448, "step": 45340 }, { "epoch": 8.321710405579005, "grad_norm": 0.12023515999317169, "learning_rate": 7.253198289305391e-06, "loss": 0.0013, "num_input_tokens_seen": 97811296, "step": 45345 }, { "epoch": 8.322628005138558, "grad_norm": 0.10927542299032211, "learning_rate": 7.252483420169813e-06, "loss": 0.0005, "num_input_tokens_seen": 97821952, "step": 45350 }, { "epoch": 8.32354560469811, "grad_norm": 0.006700932513922453, "learning_rate": 7.251768493261527e-06, "loss": 0.0022, "num_input_tokens_seen": 97832704, "step": 45355 }, { "epoch": 8.324463204257661, "grad_norm": 0.019271766766905785, "learning_rate": 7.2510535085988695e-06, "loss": 0.1131, "num_input_tokens_seen": 97843616, "step": 45360 }, { "epoch": 8.325380803817215, "grad_norm": 0.008619244210422039, "learning_rate": 7.250338466200178e-06, "loss": 0.205, "num_input_tokens_seen": 97853504, "step": 45365 }, { "epoch": 8.326298403376766, "grad_norm": 0.03162263706326485, "learning_rate": 7.249623366083793e-06, "loss": 0.0009, "num_input_tokens_seen": 97863424, "step": 45370 }, { "epoch": 8.327216002936318, "grad_norm": 0.08306821435689926, "learning_rate": 7.248908208268055e-06, "loss": 0.1882, "num_input_tokens_seen": 97874304, "step": 45375 }, { "epoch": 8.328133602495871, "grad_norm": 0.015936030074954033, "learning_rate": 7.248192992771306e-06, "loss": 0.0005, "num_input_tokens_seen": 97884288, "step": 45380 }, { "epoch": 8.329051202055423, "grad_norm": 0.008642260916531086, "learning_rate": 7.247477719611893e-06, "loss": 0.0001, "num_input_tokens_seen": 97895968, "step": 45385 }, { "epoch": 8.329968801614974, "grad_norm": 16.372055053710938, "learning_rate": 7.246762388808158e-06, "loss": 0.2383, "num_input_tokens_seen": 97906240, "step": 45390 }, { "epoch": 8.330886401174528, "grad_norm": 0.01613125205039978, "learning_rate": 7.246047000378449e-06, "loss": 0.1049, "num_input_tokens_seen": 97916960, "step": 45395 }, { "epoch": 8.33180400073408, "grad_norm": 0.01781897433102131, "learning_rate": 7.245331554341118e-06, "loss": 0.0004, "num_input_tokens_seen": 97927680, "step": 45400 }, { "epoch": 8.332721600293631, "grad_norm": 0.022273557260632515, "learning_rate": 7.24461605071451e-06, "loss": 0.0018, "num_input_tokens_seen": 97939488, "step": 45405 }, { "epoch": 8.333639199853184, "grad_norm": 0.00257553206756711, "learning_rate": 7.243900489516982e-06, "loss": 0.001, "num_input_tokens_seen": 97950048, "step": 45410 }, { "epoch": 8.334556799412736, "grad_norm": 0.029770124703645706, "learning_rate": 7.2431848707668815e-06, "loss": 0.0006, "num_input_tokens_seen": 97960352, "step": 45415 }, { "epoch": 8.335474398972288, "grad_norm": 0.0126408152282238, "learning_rate": 7.242469194482566e-06, "loss": 0.054, "num_input_tokens_seen": 97972096, "step": 45420 }, { "epoch": 8.336391998531841, "grad_norm": 0.07744420319795609, "learning_rate": 7.241753460682393e-06, "loss": 0.0004, "num_input_tokens_seen": 97980928, "step": 45425 }, { "epoch": 8.337309598091393, "grad_norm": 37.135231018066406, "learning_rate": 7.241037669384716e-06, "loss": 0.2179, "num_input_tokens_seen": 97991136, "step": 45430 }, { "epoch": 8.338227197650944, "grad_norm": 0.022200334817171097, "learning_rate": 7.2403218206078985e-06, "loss": 0.0133, "num_input_tokens_seen": 98001280, "step": 45435 }, { "epoch": 8.339144797210498, "grad_norm": 15.645631790161133, "learning_rate": 7.239605914370297e-06, "loss": 0.1012, "num_input_tokens_seen": 98011648, "step": 45440 }, { "epoch": 8.34006239677005, "grad_norm": 0.019225746393203735, "learning_rate": 7.238889950690275e-06, "loss": 0.1834, "num_input_tokens_seen": 98022304, "step": 45445 }, { "epoch": 8.340979996329601, "grad_norm": 0.16475173830986023, "learning_rate": 7.238173929586196e-06, "loss": 0.1137, "num_input_tokens_seen": 98032320, "step": 45450 }, { "epoch": 8.341897595889154, "grad_norm": 0.024264538660645485, "learning_rate": 7.237457851076424e-06, "loss": 0.0004, "num_input_tokens_seen": 98042624, "step": 45455 }, { "epoch": 8.342815195448706, "grad_norm": 0.015380008146166801, "learning_rate": 7.236741715179327e-06, "loss": 0.0002, "num_input_tokens_seen": 98053792, "step": 45460 }, { "epoch": 8.343732795008258, "grad_norm": 0.07401975989341736, "learning_rate": 7.2360255219132705e-06, "loss": 0.0062, "num_input_tokens_seen": 98064608, "step": 45465 }, { "epoch": 8.344650394567811, "grad_norm": 0.009581921622157097, "learning_rate": 7.235309271296625e-06, "loss": 0.0676, "num_input_tokens_seen": 98075520, "step": 45470 }, { "epoch": 8.345567994127363, "grad_norm": 0.0012452309019863605, "learning_rate": 7.234592963347762e-06, "loss": 0.0624, "num_input_tokens_seen": 98086048, "step": 45475 }, { "epoch": 8.346485593686914, "grad_norm": 0.047379761934280396, "learning_rate": 7.233876598085053e-06, "loss": 0.4261, "num_input_tokens_seen": 98095936, "step": 45480 }, { "epoch": 8.347403193246468, "grad_norm": 0.007244842126965523, "learning_rate": 7.233160175526871e-06, "loss": 0.0006, "num_input_tokens_seen": 98106208, "step": 45485 }, { "epoch": 8.34832079280602, "grad_norm": 0.05059406906366348, "learning_rate": 7.23244369569159e-06, "loss": 0.0009, "num_input_tokens_seen": 98117888, "step": 45490 }, { "epoch": 8.34923839236557, "grad_norm": 27.387432098388672, "learning_rate": 7.23172715859759e-06, "loss": 0.0515, "num_input_tokens_seen": 98129568, "step": 45495 }, { "epoch": 8.350155991925124, "grad_norm": 0.666763186454773, "learning_rate": 7.2310105642632465e-06, "loss": 0.0013, "num_input_tokens_seen": 98140320, "step": 45500 }, { "epoch": 8.351073591484676, "grad_norm": 0.03232164308428764, "learning_rate": 7.23029391270694e-06, "loss": 0.001, "num_input_tokens_seen": 98150912, "step": 45505 }, { "epoch": 8.351991191044227, "grad_norm": 0.03203253820538521, "learning_rate": 7.229577203947051e-06, "loss": 0.0896, "num_input_tokens_seen": 98161888, "step": 45510 }, { "epoch": 8.35290879060378, "grad_norm": 2.3241658210754395, "learning_rate": 7.228860438001962e-06, "loss": 0.0011, "num_input_tokens_seen": 98172416, "step": 45515 }, { "epoch": 8.353826390163333, "grad_norm": 0.035376038402318954, "learning_rate": 7.228143614890058e-06, "loss": 0.0003, "num_input_tokens_seen": 98183744, "step": 45520 }, { "epoch": 8.354743989722884, "grad_norm": 10.220425605773926, "learning_rate": 7.2274267346297235e-06, "loss": 0.1007, "num_input_tokens_seen": 98195232, "step": 45525 }, { "epoch": 8.355661589282438, "grad_norm": 0.21885840594768524, "learning_rate": 7.226709797239344e-06, "loss": 0.0012, "num_input_tokens_seen": 98204864, "step": 45530 }, { "epoch": 8.35657918884199, "grad_norm": 0.06913671642541885, "learning_rate": 7.2259928027373116e-06, "loss": 0.0015, "num_input_tokens_seen": 98216064, "step": 45535 }, { "epoch": 8.35749678840154, "grad_norm": 0.025809045881032944, "learning_rate": 7.225275751142013e-06, "loss": 0.0002, "num_input_tokens_seen": 98227488, "step": 45540 }, { "epoch": 8.358414387961094, "grad_norm": 0.014256144873797894, "learning_rate": 7.22455864247184e-06, "loss": 0.0003, "num_input_tokens_seen": 98238848, "step": 45545 }, { "epoch": 8.359331987520646, "grad_norm": 0.01512068510055542, "learning_rate": 7.223841476745185e-06, "loss": 0.0003, "num_input_tokens_seen": 98249024, "step": 45550 }, { "epoch": 8.360249587080197, "grad_norm": 81.069580078125, "learning_rate": 7.2231242539804425e-06, "loss": 0.0265, "num_input_tokens_seen": 98258432, "step": 45555 }, { "epoch": 8.36116718663975, "grad_norm": 0.009739844128489494, "learning_rate": 7.22240697419601e-06, "loss": 0.144, "num_input_tokens_seen": 98269216, "step": 45560 }, { "epoch": 8.362084786199302, "grad_norm": 0.07489614188671112, "learning_rate": 7.221689637410282e-06, "loss": 0.0005, "num_input_tokens_seen": 98281280, "step": 45565 }, { "epoch": 8.363002385758854, "grad_norm": 55.13713836669922, "learning_rate": 7.220972243641658e-06, "loss": 0.1371, "num_input_tokens_seen": 98292224, "step": 45570 }, { "epoch": 8.363919985318407, "grad_norm": 0.03342360630631447, "learning_rate": 7.220254792908539e-06, "loss": 0.2106, "num_input_tokens_seen": 98302528, "step": 45575 }, { "epoch": 8.364837584877959, "grad_norm": 0.025295324623584747, "learning_rate": 7.219537285229325e-06, "loss": 0.0003, "num_input_tokens_seen": 98313056, "step": 45580 }, { "epoch": 8.36575518443751, "grad_norm": 0.010544453747570515, "learning_rate": 7.21881972062242e-06, "loss": 0.0916, "num_input_tokens_seen": 98324544, "step": 45585 }, { "epoch": 8.366672783997064, "grad_norm": 0.11609766632318497, "learning_rate": 7.218102099106228e-06, "loss": 0.0005, "num_input_tokens_seen": 98334592, "step": 45590 }, { "epoch": 8.367590383556616, "grad_norm": 0.05053750053048134, "learning_rate": 7.217384420699155e-06, "loss": 0.0008, "num_input_tokens_seen": 98344896, "step": 45595 }, { "epoch": 8.368507983116167, "grad_norm": 0.10023750364780426, "learning_rate": 7.2166666854196075e-06, "loss": 0.0092, "num_input_tokens_seen": 98355648, "step": 45600 }, { "epoch": 8.36942558267572, "grad_norm": 16.483972549438477, "learning_rate": 7.215948893285996e-06, "loss": 0.2511, "num_input_tokens_seen": 98366336, "step": 45605 }, { "epoch": 8.370343182235272, "grad_norm": 0.023014958947896957, "learning_rate": 7.215231044316728e-06, "loss": 0.1071, "num_input_tokens_seen": 98377408, "step": 45610 }, { "epoch": 8.371260781794824, "grad_norm": 0.0018459769198670983, "learning_rate": 7.214513138530219e-06, "loss": 0.0004, "num_input_tokens_seen": 98387456, "step": 45615 }, { "epoch": 8.372178381354377, "grad_norm": 17.47804069519043, "learning_rate": 7.21379517594488e-06, "loss": 0.3725, "num_input_tokens_seen": 98398912, "step": 45620 }, { "epoch": 8.373095980913929, "grad_norm": 0.17311003804206848, "learning_rate": 7.213077156579125e-06, "loss": 0.0017, "num_input_tokens_seen": 98408992, "step": 45625 }, { "epoch": 8.37401358047348, "grad_norm": 50.3261833190918, "learning_rate": 7.21235908045137e-06, "loss": 0.109, "num_input_tokens_seen": 98419872, "step": 45630 }, { "epoch": 8.374931180033034, "grad_norm": 0.1668918877840042, "learning_rate": 7.2116409475800356e-06, "loss": 0.0007, "num_input_tokens_seen": 98429952, "step": 45635 }, { "epoch": 8.375848779592586, "grad_norm": 0.014090052805840969, "learning_rate": 7.210922757983536e-06, "loss": 0.0008, "num_input_tokens_seen": 98441216, "step": 45640 }, { "epoch": 8.376766379152137, "grad_norm": 0.010200858116149902, "learning_rate": 7.210204511680296e-06, "loss": 0.0004, "num_input_tokens_seen": 98451616, "step": 45645 }, { "epoch": 8.37768397871169, "grad_norm": 0.05948380380868912, "learning_rate": 7.209486208688736e-06, "loss": 0.0007, "num_input_tokens_seen": 98461536, "step": 45650 }, { "epoch": 8.378601578271242, "grad_norm": 0.00855370331555605, "learning_rate": 7.20876784902728e-06, "loss": 0.0003, "num_input_tokens_seen": 98472800, "step": 45655 }, { "epoch": 8.379519177830794, "grad_norm": 0.09012021869421005, "learning_rate": 7.20804943271435e-06, "loss": 0.1227, "num_input_tokens_seen": 98483008, "step": 45660 }, { "epoch": 8.380436777390347, "grad_norm": 0.028388207778334618, "learning_rate": 7.207330959768375e-06, "loss": 0.1833, "num_input_tokens_seen": 98494528, "step": 45665 }, { "epoch": 8.381354376949899, "grad_norm": 0.05246206372976303, "learning_rate": 7.206612430207782e-06, "loss": 0.0007, "num_input_tokens_seen": 98504704, "step": 45670 }, { "epoch": 8.38227197650945, "grad_norm": 0.2748722434043884, "learning_rate": 7.205893844051e-06, "loss": 0.0005, "num_input_tokens_seen": 98516544, "step": 45675 }, { "epoch": 8.383189576069004, "grad_norm": 0.020442236214876175, "learning_rate": 7.2051752013164585e-06, "loss": 0.0003, "num_input_tokens_seen": 98527872, "step": 45680 }, { "epoch": 8.384107175628555, "grad_norm": 0.360991895198822, "learning_rate": 7.204456502022592e-06, "loss": 0.0013, "num_input_tokens_seen": 98537664, "step": 45685 }, { "epoch": 8.385024775188107, "grad_norm": 34.650962829589844, "learning_rate": 7.2037377461878334e-06, "loss": 0.1629, "num_input_tokens_seen": 98546208, "step": 45690 }, { "epoch": 8.38594237474766, "grad_norm": 0.9706148505210876, "learning_rate": 7.203018933830617e-06, "loss": 0.0019, "num_input_tokens_seen": 98556992, "step": 45695 }, { "epoch": 8.386859974307212, "grad_norm": 0.006045416463166475, "learning_rate": 7.202300064969378e-06, "loss": 0.0045, "num_input_tokens_seen": 98567936, "step": 45700 }, { "epoch": 8.387777573866764, "grad_norm": 0.5574643015861511, "learning_rate": 7.2015811396225574e-06, "loss": 0.0021, "num_input_tokens_seen": 98578592, "step": 45705 }, { "epoch": 8.388695173426317, "grad_norm": 0.004163989331573248, "learning_rate": 7.200862157808593e-06, "loss": 0.1162, "num_input_tokens_seen": 98588608, "step": 45710 }, { "epoch": 8.389612772985869, "grad_norm": 0.004046075511723757, "learning_rate": 7.200143119545922e-06, "loss": 0.0012, "num_input_tokens_seen": 98600128, "step": 45715 }, { "epoch": 8.39053037254542, "grad_norm": 10.225638389587402, "learning_rate": 7.199424024852993e-06, "loss": 0.1905, "num_input_tokens_seen": 98610272, "step": 45720 }, { "epoch": 8.391447972104974, "grad_norm": 2.2054386138916016, "learning_rate": 7.198704873748245e-06, "loss": 0.0475, "num_input_tokens_seen": 98620736, "step": 45725 }, { "epoch": 8.392365571664525, "grad_norm": 16.978622436523438, "learning_rate": 7.197985666250126e-06, "loss": 0.2732, "num_input_tokens_seen": 98632768, "step": 45730 }, { "epoch": 8.393283171224077, "grad_norm": 58.80551528930664, "learning_rate": 7.19726640237708e-06, "loss": 0.135, "num_input_tokens_seen": 98643360, "step": 45735 }, { "epoch": 8.39420077078363, "grad_norm": 0.027130551636219025, "learning_rate": 7.196547082147556e-06, "loss": 0.0011, "num_input_tokens_seen": 98653568, "step": 45740 }, { "epoch": 8.395118370343182, "grad_norm": 0.005118729546666145, "learning_rate": 7.195827705580006e-06, "loss": 0.1255, "num_input_tokens_seen": 98664448, "step": 45745 }, { "epoch": 8.396035969902734, "grad_norm": 0.068135567009449, "learning_rate": 7.195108272692874e-06, "loss": 0.1167, "num_input_tokens_seen": 98675584, "step": 45750 }, { "epoch": 8.396953569462287, "grad_norm": 0.029178446158766747, "learning_rate": 7.194388783504621e-06, "loss": 0.1594, "num_input_tokens_seen": 98686336, "step": 45755 }, { "epoch": 8.397871169021839, "grad_norm": 0.046656008809804916, "learning_rate": 7.193669238033696e-06, "loss": 0.0006, "num_input_tokens_seen": 98696128, "step": 45760 }, { "epoch": 8.39878876858139, "grad_norm": 0.00968528725206852, "learning_rate": 7.192949636298554e-06, "loss": 0.0249, "num_input_tokens_seen": 98708576, "step": 45765 }, { "epoch": 8.399706368140944, "grad_norm": 0.003221517661586404, "learning_rate": 7.192229978317653e-06, "loss": 0.0007, "num_input_tokens_seen": 98720160, "step": 45770 }, { "epoch": 8.400623967700495, "grad_norm": 0.004985852167010307, "learning_rate": 7.191510264109451e-06, "loss": 0.0028, "num_input_tokens_seen": 98731392, "step": 45775 }, { "epoch": 8.401541567260047, "grad_norm": 0.011807238683104515, "learning_rate": 7.190790493692407e-06, "loss": 0.0017, "num_input_tokens_seen": 98742944, "step": 45780 }, { "epoch": 8.4024591668196, "grad_norm": 0.1136540099978447, "learning_rate": 7.1900706670849815e-06, "loss": 0.1129, "num_input_tokens_seen": 98753696, "step": 45785 }, { "epoch": 8.403376766379152, "grad_norm": 0.021734360605478287, "learning_rate": 7.189350784305639e-06, "loss": 0.0003, "num_input_tokens_seen": 98764608, "step": 45790 }, { "epoch": 8.404294365938703, "grad_norm": 0.0722719207406044, "learning_rate": 7.188630845372841e-06, "loss": 0.0653, "num_input_tokens_seen": 98775936, "step": 45795 }, { "epoch": 8.405211965498257, "grad_norm": 0.006648911163210869, "learning_rate": 7.187910850305055e-06, "loss": 0.4408, "num_input_tokens_seen": 98787616, "step": 45800 }, { "epoch": 8.406129565057809, "grad_norm": 0.08765062689781189, "learning_rate": 7.187190799120747e-06, "loss": 0.001, "num_input_tokens_seen": 98799648, "step": 45805 }, { "epoch": 8.40704716461736, "grad_norm": 0.018339749425649643, "learning_rate": 7.186470691838383e-06, "loss": 0.0003, "num_input_tokens_seen": 98811232, "step": 45810 }, { "epoch": 8.407964764176914, "grad_norm": 0.22240564227104187, "learning_rate": 7.1857505284764365e-06, "loss": 0.1819, "num_input_tokens_seen": 98821568, "step": 45815 }, { "epoch": 8.408882363736465, "grad_norm": 0.11222994327545166, "learning_rate": 7.185030309053374e-06, "loss": 0.0035, "num_input_tokens_seen": 98832480, "step": 45820 }, { "epoch": 8.409799963296017, "grad_norm": 0.06617571413516998, "learning_rate": 7.184310033587672e-06, "loss": 0.1729, "num_input_tokens_seen": 98843232, "step": 45825 }, { "epoch": 8.41071756285557, "grad_norm": 0.019117780029773712, "learning_rate": 7.183589702097803e-06, "loss": 0.2625, "num_input_tokens_seen": 98853952, "step": 45830 }, { "epoch": 8.411635162415122, "grad_norm": 0.36693060398101807, "learning_rate": 7.182869314602242e-06, "loss": 0.0047, "num_input_tokens_seen": 98864416, "step": 45835 }, { "epoch": 8.412552761974673, "grad_norm": 0.2146996259689331, "learning_rate": 7.182148871119467e-06, "loss": 0.0007, "num_input_tokens_seen": 98875904, "step": 45840 }, { "epoch": 8.413470361534227, "grad_norm": 17.253890991210938, "learning_rate": 7.181428371667954e-06, "loss": 0.0624, "num_input_tokens_seen": 98886656, "step": 45845 }, { "epoch": 8.414387961093778, "grad_norm": 0.04112398624420166, "learning_rate": 7.180707816266186e-06, "loss": 0.0738, "num_input_tokens_seen": 98898304, "step": 45850 }, { "epoch": 8.41530556065333, "grad_norm": 176.0153350830078, "learning_rate": 7.179987204932641e-06, "loss": 0.015, "num_input_tokens_seen": 98909152, "step": 45855 }, { "epoch": 8.416223160212883, "grad_norm": 0.22569143772125244, "learning_rate": 7.179266537685804e-06, "loss": 0.0011, "num_input_tokens_seen": 98918400, "step": 45860 }, { "epoch": 8.417140759772435, "grad_norm": 0.06277116388082504, "learning_rate": 7.178545814544158e-06, "loss": 0.1536, "num_input_tokens_seen": 98929600, "step": 45865 }, { "epoch": 8.418058359331987, "grad_norm": 1.5006910562515259, "learning_rate": 7.177825035526187e-06, "loss": 0.2522, "num_input_tokens_seen": 98941184, "step": 45870 }, { "epoch": 8.41897595889154, "grad_norm": 0.025813106447458267, "learning_rate": 7.1771042006503784e-06, "loss": 0.0008, "num_input_tokens_seen": 98952000, "step": 45875 }, { "epoch": 8.419893558451092, "grad_norm": 0.2999090254306793, "learning_rate": 7.176383309935224e-06, "loss": 0.0313, "num_input_tokens_seen": 98962080, "step": 45880 }, { "epoch": 8.420811158010643, "grad_norm": 0.06676657497882843, "learning_rate": 7.175662363399208e-06, "loss": 0.0922, "num_input_tokens_seen": 98972928, "step": 45885 }, { "epoch": 8.421728757570197, "grad_norm": 1.8514273166656494, "learning_rate": 7.174941361060826e-06, "loss": 0.0044, "num_input_tokens_seen": 98984512, "step": 45890 }, { "epoch": 8.422646357129748, "grad_norm": 0.014167211949825287, "learning_rate": 7.174220302938569e-06, "loss": 0.1567, "num_input_tokens_seen": 98994720, "step": 45895 }, { "epoch": 8.4235639566893, "grad_norm": 75.47762298583984, "learning_rate": 7.173499189050931e-06, "loss": 0.1565, "num_input_tokens_seen": 99006144, "step": 45900 }, { "epoch": 8.424481556248853, "grad_norm": 0.0869772732257843, "learning_rate": 7.172778019416407e-06, "loss": 0.0271, "num_input_tokens_seen": 99016640, "step": 45905 }, { "epoch": 8.425399155808405, "grad_norm": 0.10268072038888931, "learning_rate": 7.1720567940534945e-06, "loss": 0.0014, "num_input_tokens_seen": 99027040, "step": 45910 }, { "epoch": 8.426316755367957, "grad_norm": 0.10521277785301208, "learning_rate": 7.1713355129806925e-06, "loss": 0.0047, "num_input_tokens_seen": 99039168, "step": 45915 }, { "epoch": 8.42723435492751, "grad_norm": 0.01701580546796322, "learning_rate": 7.170614176216498e-06, "loss": 0.0019, "num_input_tokens_seen": 99049664, "step": 45920 }, { "epoch": 8.428151954487062, "grad_norm": 0.6519230604171753, "learning_rate": 7.169892783779414e-06, "loss": 0.0015, "num_input_tokens_seen": 99060096, "step": 45925 }, { "epoch": 8.429069554046613, "grad_norm": 0.02722974121570587, "learning_rate": 7.1691713356879455e-06, "loss": 0.0004, "num_input_tokens_seen": 99070880, "step": 45930 }, { "epoch": 8.429987153606167, "grad_norm": 0.472830593585968, "learning_rate": 7.168449831960591e-06, "loss": 0.0004, "num_input_tokens_seen": 99082304, "step": 45935 }, { "epoch": 8.430904753165718, "grad_norm": 0.026514440774917603, "learning_rate": 7.167728272615862e-06, "loss": 0.1537, "num_input_tokens_seen": 99092960, "step": 45940 }, { "epoch": 8.43182235272527, "grad_norm": 0.07050129026174545, "learning_rate": 7.1670066576722605e-06, "loss": 0.1401, "num_input_tokens_seen": 99104096, "step": 45945 }, { "epoch": 8.432739952284823, "grad_norm": 1.4255789518356323, "learning_rate": 7.166284987148299e-06, "loss": 0.1304, "num_input_tokens_seen": 99115168, "step": 45950 }, { "epoch": 8.433657551844375, "grad_norm": 0.01148295123130083, "learning_rate": 7.165563261062482e-06, "loss": 0.1628, "num_input_tokens_seen": 99124096, "step": 45955 }, { "epoch": 8.434575151403926, "grad_norm": 23.311031341552734, "learning_rate": 7.164841479433326e-06, "loss": 0.0814, "num_input_tokens_seen": 99135488, "step": 45960 }, { "epoch": 8.43549275096348, "grad_norm": 16.645654678344727, "learning_rate": 7.16411964227934e-06, "loss": 0.1223, "num_input_tokens_seen": 99145312, "step": 45965 }, { "epoch": 8.436410350523031, "grad_norm": 5.988250255584717, "learning_rate": 7.163397749619039e-06, "loss": 0.0122, "num_input_tokens_seen": 99155680, "step": 45970 }, { "epoch": 8.437327950082583, "grad_norm": 0.03829188272356987, "learning_rate": 7.16267580147094e-06, "loss": 0.0002, "num_input_tokens_seen": 99166240, "step": 45975 }, { "epoch": 8.438245549642136, "grad_norm": 0.015077365562319756, "learning_rate": 7.161953797853558e-06, "loss": 0.217, "num_input_tokens_seen": 99177472, "step": 45980 }, { "epoch": 8.439163149201688, "grad_norm": 0.010107683949172497, "learning_rate": 7.161231738785411e-06, "loss": 0.0005, "num_input_tokens_seen": 99188256, "step": 45985 }, { "epoch": 8.44008074876124, "grad_norm": 0.27498114109039307, "learning_rate": 7.160509624285021e-06, "loss": 0.0016, "num_input_tokens_seen": 99200096, "step": 45990 }, { "epoch": 8.440998348320793, "grad_norm": 90.19950866699219, "learning_rate": 7.159787454370906e-06, "loss": 0.2641, "num_input_tokens_seen": 99210624, "step": 45995 }, { "epoch": 8.441915947880345, "grad_norm": 0.06994546949863434, "learning_rate": 7.159065229061592e-06, "loss": 0.064, "num_input_tokens_seen": 99221376, "step": 46000 }, { "epoch": 8.442833547439896, "grad_norm": 0.03183774650096893, "learning_rate": 7.1583429483756e-06, "loss": 0.0003, "num_input_tokens_seen": 99231424, "step": 46005 }, { "epoch": 8.44375114699945, "grad_norm": 0.015798227861523628, "learning_rate": 7.157620612331457e-06, "loss": 0.0535, "num_input_tokens_seen": 99242240, "step": 46010 }, { "epoch": 8.444668746559001, "grad_norm": 0.1409791111946106, "learning_rate": 7.1568982209476875e-06, "loss": 0.0007, "num_input_tokens_seen": 99254880, "step": 46015 }, { "epoch": 8.445586346118553, "grad_norm": 0.10646370053291321, "learning_rate": 7.156175774242824e-06, "loss": 0.1879, "num_input_tokens_seen": 99265344, "step": 46020 }, { "epoch": 8.446503945678106, "grad_norm": 10.856307983398438, "learning_rate": 7.155453272235393e-06, "loss": 0.1384, "num_input_tokens_seen": 99277248, "step": 46025 }, { "epoch": 8.447421545237658, "grad_norm": 0.031113412231206894, "learning_rate": 7.1547307149439264e-06, "loss": 0.0047, "num_input_tokens_seen": 99288384, "step": 46030 }, { "epoch": 8.44833914479721, "grad_norm": 0.03027024120092392, "learning_rate": 7.154008102386955e-06, "loss": 0.0005, "num_input_tokens_seen": 99298528, "step": 46035 }, { "epoch": 8.449256744356763, "grad_norm": 0.024983933195471764, "learning_rate": 7.1532854345830146e-06, "loss": 0.0916, "num_input_tokens_seen": 99309952, "step": 46040 }, { "epoch": 8.450174343916315, "grad_norm": 0.015183055773377419, "learning_rate": 7.152562711550642e-06, "loss": 0.0015, "num_input_tokens_seen": 99319392, "step": 46045 }, { "epoch": 8.451091943475866, "grad_norm": 0.0689360499382019, "learning_rate": 7.15183993330837e-06, "loss": 0.0267, "num_input_tokens_seen": 99329184, "step": 46050 }, { "epoch": 8.45200954303542, "grad_norm": 0.13617034256458282, "learning_rate": 7.151117099874739e-06, "loss": 0.0007, "num_input_tokens_seen": 99340064, "step": 46055 }, { "epoch": 8.452927142594971, "grad_norm": 13.170324325561523, "learning_rate": 7.150394211268288e-06, "loss": 0.0979, "num_input_tokens_seen": 99351872, "step": 46060 }, { "epoch": 8.453844742154523, "grad_norm": 11.808732986450195, "learning_rate": 7.1496712675075595e-06, "loss": 0.2317, "num_input_tokens_seen": 99362112, "step": 46065 }, { "epoch": 8.454762341714076, "grad_norm": 0.02782627008855343, "learning_rate": 7.148948268611094e-06, "loss": 0.2478, "num_input_tokens_seen": 99372608, "step": 46070 }, { "epoch": 8.455679941273628, "grad_norm": 0.0883926972746849, "learning_rate": 7.1482252145974375e-06, "loss": 0.0012, "num_input_tokens_seen": 99382944, "step": 46075 }, { "epoch": 8.45659754083318, "grad_norm": 0.09068052470684052, "learning_rate": 7.1475021054851314e-06, "loss": 0.0508, "num_input_tokens_seen": 99394240, "step": 46080 }, { "epoch": 8.457515140392733, "grad_norm": 0.018735487014055252, "learning_rate": 7.146778941292725e-06, "loss": 0.1744, "num_input_tokens_seen": 99403968, "step": 46085 }, { "epoch": 8.458432739952285, "grad_norm": 0.24615007638931274, "learning_rate": 7.146055722038767e-06, "loss": 0.0008, "num_input_tokens_seen": 99415520, "step": 46090 }, { "epoch": 8.459350339511836, "grad_norm": 0.39326125383377075, "learning_rate": 7.145332447741805e-06, "loss": 0.0014, "num_input_tokens_seen": 99426656, "step": 46095 }, { "epoch": 8.46026793907139, "grad_norm": 0.014107090421020985, "learning_rate": 7.144609118420391e-06, "loss": 0.0224, "num_input_tokens_seen": 99437760, "step": 46100 }, { "epoch": 8.461185538630941, "grad_norm": 4.95442533493042, "learning_rate": 7.143885734093077e-06, "loss": 0.0023, "num_input_tokens_seen": 99449312, "step": 46105 }, { "epoch": 8.462103138190493, "grad_norm": 11.579753875732422, "learning_rate": 7.143162294778418e-06, "loss": 0.2302, "num_input_tokens_seen": 99458976, "step": 46110 }, { "epoch": 8.463020737750046, "grad_norm": 0.009883422404527664, "learning_rate": 7.142438800494965e-06, "loss": 0.0005, "num_input_tokens_seen": 99469312, "step": 46115 }, { "epoch": 8.463938337309598, "grad_norm": 0.005580310244113207, "learning_rate": 7.14171525126128e-06, "loss": 0.0902, "num_input_tokens_seen": 99480736, "step": 46120 }, { "epoch": 8.46485593686915, "grad_norm": 0.000721575110219419, "learning_rate": 7.140991647095916e-06, "loss": 0.0005, "num_input_tokens_seen": 99491968, "step": 46125 }, { "epoch": 8.465773536428703, "grad_norm": 1.8483171463012695, "learning_rate": 7.140267988017435e-06, "loss": 0.0062, "num_input_tokens_seen": 99500832, "step": 46130 }, { "epoch": 8.466691135988254, "grad_norm": 0.022344671189785004, "learning_rate": 7.139544274044398e-06, "loss": 0.0017, "num_input_tokens_seen": 99512032, "step": 46135 }, { "epoch": 8.467608735547808, "grad_norm": 8.668523788452148, "learning_rate": 7.138820505195366e-06, "loss": 0.0409, "num_input_tokens_seen": 99522880, "step": 46140 }, { "epoch": 8.46852633510736, "grad_norm": 16.038047790527344, "learning_rate": 7.138096681488902e-06, "loss": 0.1291, "num_input_tokens_seen": 99533920, "step": 46145 }, { "epoch": 8.469443934666911, "grad_norm": 0.04983551800251007, "learning_rate": 7.137372802943574e-06, "loss": 0.0021, "num_input_tokens_seen": 99543520, "step": 46150 }, { "epoch": 8.470361534226464, "grad_norm": 8.447737693786621, "learning_rate": 7.136648869577945e-06, "loss": 0.0766, "num_input_tokens_seen": 99553696, "step": 46155 }, { "epoch": 8.471279133786016, "grad_norm": 0.10504096001386642, "learning_rate": 7.135924881410583e-06, "loss": 0.2677, "num_input_tokens_seen": 99564640, "step": 46160 }, { "epoch": 8.472196733345568, "grad_norm": 50.50856399536133, "learning_rate": 7.135200838460059e-06, "loss": 0.038, "num_input_tokens_seen": 99574368, "step": 46165 }, { "epoch": 8.473114332905121, "grad_norm": 2.364690065383911, "learning_rate": 7.1344767407449426e-06, "loss": 0.3807, "num_input_tokens_seen": 99584480, "step": 46170 }, { "epoch": 8.474031932464673, "grad_norm": 0.00753212021663785, "learning_rate": 7.133752588283807e-06, "loss": 0.0048, "num_input_tokens_seen": 99594368, "step": 46175 }, { "epoch": 8.474949532024224, "grad_norm": 0.015777798369526863, "learning_rate": 7.133028381095223e-06, "loss": 0.0005, "num_input_tokens_seen": 99605184, "step": 46180 }, { "epoch": 8.475867131583778, "grad_norm": 66.16697692871094, "learning_rate": 7.132304119197768e-06, "loss": 0.1854, "num_input_tokens_seen": 99615648, "step": 46185 }, { "epoch": 8.47678473114333, "grad_norm": 0.22685736417770386, "learning_rate": 7.131579802610016e-06, "loss": 0.0004, "num_input_tokens_seen": 99627104, "step": 46190 }, { "epoch": 8.477702330702881, "grad_norm": 0.010496353730559349, "learning_rate": 7.130855431350546e-06, "loss": 0.0906, "num_input_tokens_seen": 99638496, "step": 46195 }, { "epoch": 8.478619930262434, "grad_norm": 0.1523212492465973, "learning_rate": 7.130131005437937e-06, "loss": 0.0003, "num_input_tokens_seen": 99648096, "step": 46200 }, { "epoch": 8.479537529821986, "grad_norm": 0.011998393572866917, "learning_rate": 7.12940652489077e-06, "loss": 0.0002, "num_input_tokens_seen": 99658912, "step": 46205 }, { "epoch": 8.480455129381538, "grad_norm": 0.9078300595283508, "learning_rate": 7.128681989727625e-06, "loss": 0.0009, "num_input_tokens_seen": 99669568, "step": 46210 }, { "epoch": 8.481372728941091, "grad_norm": 0.6917851567268372, "learning_rate": 7.127957399967086e-06, "loss": 0.0009, "num_input_tokens_seen": 99680096, "step": 46215 }, { "epoch": 8.482290328500643, "grad_norm": 0.019186044111847878, "learning_rate": 7.127232755627739e-06, "loss": 0.0004, "num_input_tokens_seen": 99691328, "step": 46220 }, { "epoch": 8.483207928060194, "grad_norm": 0.0031401049345731735, "learning_rate": 7.126508056728166e-06, "loss": 0.107, "num_input_tokens_seen": 99701728, "step": 46225 }, { "epoch": 8.484125527619748, "grad_norm": 0.029563279822468758, "learning_rate": 7.125783303286959e-06, "loss": 0.1419, "num_input_tokens_seen": 99712288, "step": 46230 }, { "epoch": 8.4850431271793, "grad_norm": 0.03407036140561104, "learning_rate": 7.125058495322706e-06, "loss": 0.0003, "num_input_tokens_seen": 99724896, "step": 46235 }, { "epoch": 8.48596072673885, "grad_norm": 4.313926696777344, "learning_rate": 7.1243336328539944e-06, "loss": 0.0016, "num_input_tokens_seen": 99734560, "step": 46240 }, { "epoch": 8.486878326298404, "grad_norm": 0.012324375100433826, "learning_rate": 7.123608715899418e-06, "loss": 0.2382, "num_input_tokens_seen": 99745120, "step": 46245 }, { "epoch": 8.487795925857956, "grad_norm": 178.98477172851562, "learning_rate": 7.12288374447757e-06, "loss": 0.0176, "num_input_tokens_seen": 99755552, "step": 46250 }, { "epoch": 8.488713525417507, "grad_norm": 0.01625041291117668, "learning_rate": 7.122158718607043e-06, "loss": 0.0004, "num_input_tokens_seen": 99767296, "step": 46255 }, { "epoch": 8.48963112497706, "grad_norm": 0.005847617518156767, "learning_rate": 7.121433638306436e-06, "loss": 0.0007, "num_input_tokens_seen": 99778208, "step": 46260 }, { "epoch": 8.490548724536612, "grad_norm": 0.002578380284830928, "learning_rate": 7.120708503594341e-06, "loss": 0.2563, "num_input_tokens_seen": 99789696, "step": 46265 }, { "epoch": 8.491466324096164, "grad_norm": 41.61427688598633, "learning_rate": 7.119983314489363e-06, "loss": 0.2414, "num_input_tokens_seen": 99799840, "step": 46270 }, { "epoch": 8.492383923655717, "grad_norm": 0.006185933481901884, "learning_rate": 7.119258071010096e-06, "loss": 0.0005, "num_input_tokens_seen": 99811968, "step": 46275 }, { "epoch": 8.493301523215269, "grad_norm": 0.0061216214671730995, "learning_rate": 7.118532773175144e-06, "loss": 0.1819, "num_input_tokens_seen": 99823264, "step": 46280 }, { "epoch": 8.49421912277482, "grad_norm": 0.022458193823695183, "learning_rate": 7.1178074210031116e-06, "loss": 0.0002, "num_input_tokens_seen": 99833568, "step": 46285 }, { "epoch": 8.495136722334374, "grad_norm": 0.004533103667199612, "learning_rate": 7.1170820145126e-06, "loss": 0.0002, "num_input_tokens_seen": 99844896, "step": 46290 }, { "epoch": 8.496054321893926, "grad_norm": 0.0459628626704216, "learning_rate": 7.116356553722217e-06, "loss": 0.1279, "num_input_tokens_seen": 99854944, "step": 46295 }, { "epoch": 8.496971921453477, "grad_norm": 12.452589988708496, "learning_rate": 7.1156310386505665e-06, "loss": 0.0829, "num_input_tokens_seen": 99865952, "step": 46300 }, { "epoch": 8.49788952101303, "grad_norm": 0.06586074084043503, "learning_rate": 7.11490546931626e-06, "loss": 0.0008, "num_input_tokens_seen": 99876352, "step": 46305 }, { "epoch": 8.498807120572582, "grad_norm": 80.07555389404297, "learning_rate": 7.1141798457379055e-06, "loss": 0.0212, "num_input_tokens_seen": 99886688, "step": 46310 }, { "epoch": 8.499724720132134, "grad_norm": 0.015331100672483444, "learning_rate": 7.113454167934115e-06, "loss": 0.0028, "num_input_tokens_seen": 99896384, "step": 46315 }, { "epoch": 8.500642319691687, "grad_norm": 0.01708778366446495, "learning_rate": 7.112728435923502e-06, "loss": 0.0002, "num_input_tokens_seen": 99906464, "step": 46320 }, { "epoch": 8.501559919251239, "grad_norm": 0.049953050911426544, "learning_rate": 7.112002649724676e-06, "loss": 0.0647, "num_input_tokens_seen": 99916896, "step": 46325 }, { "epoch": 8.50247751881079, "grad_norm": 0.08428598940372467, "learning_rate": 7.111276809356258e-06, "loss": 0.1383, "num_input_tokens_seen": 99927072, "step": 46330 }, { "epoch": 8.503395118370344, "grad_norm": 1.1826303005218506, "learning_rate": 7.1105509148368615e-06, "loss": 0.0017, "num_input_tokens_seen": 99937920, "step": 46335 }, { "epoch": 8.504312717929896, "grad_norm": 0.026275748386979103, "learning_rate": 7.109824966185105e-06, "loss": 0.0002, "num_input_tokens_seen": 99947488, "step": 46340 }, { "epoch": 8.505230317489447, "grad_norm": 0.2587890923023224, "learning_rate": 7.109098963419608e-06, "loss": 0.0004, "num_input_tokens_seen": 99957536, "step": 46345 }, { "epoch": 8.506147917049, "grad_norm": 0.005244340281933546, "learning_rate": 7.108372906558991e-06, "loss": 0.0004, "num_input_tokens_seen": 99967488, "step": 46350 }, { "epoch": 8.507065516608552, "grad_norm": 0.026371603831648827, "learning_rate": 7.107646795621876e-06, "loss": 0.0002, "num_input_tokens_seen": 99978944, "step": 46355 }, { "epoch": 8.507983116168104, "grad_norm": 0.00368294445797801, "learning_rate": 7.106920630626889e-06, "loss": 0.0051, "num_input_tokens_seen": 99989472, "step": 46360 }, { "epoch": 8.508900715727657, "grad_norm": 0.019752949476242065, "learning_rate": 7.106194411592652e-06, "loss": 0.1787, "num_input_tokens_seen": 100000672, "step": 46365 }, { "epoch": 8.509818315287209, "grad_norm": 0.2735891044139862, "learning_rate": 7.105468138537793e-06, "loss": 0.1136, "num_input_tokens_seen": 100012096, "step": 46370 }, { "epoch": 8.51073591484676, "grad_norm": 0.01989581622183323, "learning_rate": 7.1047418114809395e-06, "loss": 0.0003, "num_input_tokens_seen": 100023104, "step": 46375 }, { "epoch": 8.511653514406314, "grad_norm": 225.2928924560547, "learning_rate": 7.104015430440719e-06, "loss": 0.0193, "num_input_tokens_seen": 100033568, "step": 46380 }, { "epoch": 8.512571113965866, "grad_norm": 0.004971694201231003, "learning_rate": 7.1032889954357665e-06, "loss": 0.1287, "num_input_tokens_seen": 100043968, "step": 46385 }, { "epoch": 8.513488713525417, "grad_norm": 0.038065869361162186, "learning_rate": 7.102562506484709e-06, "loss": 0.0885, "num_input_tokens_seen": 100055712, "step": 46390 }, { "epoch": 8.51440631308497, "grad_norm": 101.7134017944336, "learning_rate": 7.101835963606183e-06, "loss": 0.2788, "num_input_tokens_seen": 100067168, "step": 46395 }, { "epoch": 8.515323912644522, "grad_norm": 33.89690017700195, "learning_rate": 7.101109366818822e-06, "loss": 0.1633, "num_input_tokens_seen": 100077920, "step": 46400 }, { "epoch": 8.516241512204074, "grad_norm": 0.19439470767974854, "learning_rate": 7.100382716141262e-06, "loss": 0.0005, "num_input_tokens_seen": 100088864, "step": 46405 }, { "epoch": 8.517159111763627, "grad_norm": 0.008213197812438011, "learning_rate": 7.09965601159214e-06, "loss": 0.0008, "num_input_tokens_seen": 100100640, "step": 46410 }, { "epoch": 8.518076711323179, "grad_norm": 157.0406494140625, "learning_rate": 7.098929253190095e-06, "loss": 0.0705, "num_input_tokens_seen": 100111808, "step": 46415 }, { "epoch": 8.51899431088273, "grad_norm": 0.018775124102830887, "learning_rate": 7.09820244095377e-06, "loss": 0.0016, "num_input_tokens_seen": 100121440, "step": 46420 }, { "epoch": 8.519911910442284, "grad_norm": 195.1135711669922, "learning_rate": 7.097475574901802e-06, "loss": 0.0979, "num_input_tokens_seen": 100131520, "step": 46425 }, { "epoch": 8.520829510001835, "grad_norm": 0.01141287013888359, "learning_rate": 7.096748655052837e-06, "loss": 0.0004, "num_input_tokens_seen": 100143552, "step": 46430 }, { "epoch": 8.521747109561387, "grad_norm": 0.0035238165874034166, "learning_rate": 7.0960216814255185e-06, "loss": 0.0004, "num_input_tokens_seen": 100155104, "step": 46435 }, { "epoch": 8.52266470912094, "grad_norm": 1.9238370656967163, "learning_rate": 7.095294654038493e-06, "loss": 0.0008, "num_input_tokens_seen": 100165024, "step": 46440 }, { "epoch": 8.523582308680492, "grad_norm": 0.03893887251615524, "learning_rate": 7.094567572910407e-06, "loss": 0.1068, "num_input_tokens_seen": 100175936, "step": 46445 }, { "epoch": 8.524499908240044, "grad_norm": 0.01287086121737957, "learning_rate": 7.093840438059909e-06, "loss": 0.1192, "num_input_tokens_seen": 100185888, "step": 46450 }, { "epoch": 8.525417507799597, "grad_norm": 0.0012643922818824649, "learning_rate": 7.09311324950565e-06, "loss": 0.0001, "num_input_tokens_seen": 100196192, "step": 46455 }, { "epoch": 8.526335107359149, "grad_norm": 0.1515018194913864, "learning_rate": 7.092386007266279e-06, "loss": 0.0003, "num_input_tokens_seen": 100207360, "step": 46460 }, { "epoch": 8.5272527069187, "grad_norm": 0.01825859397649765, "learning_rate": 7.09165871136045e-06, "loss": 0.1878, "num_input_tokens_seen": 100219616, "step": 46465 }, { "epoch": 8.528170306478254, "grad_norm": 0.03177851065993309, "learning_rate": 7.0909313618068166e-06, "loss": 0.0002, "num_input_tokens_seen": 100231360, "step": 46470 }, { "epoch": 8.529087906037805, "grad_norm": 0.024757135659456253, "learning_rate": 7.090203958624033e-06, "loss": 0.0003, "num_input_tokens_seen": 100243168, "step": 46475 }, { "epoch": 8.530005505597357, "grad_norm": 0.0322745218873024, "learning_rate": 7.08947650183076e-06, "loss": 0.0024, "num_input_tokens_seen": 100254752, "step": 46480 }, { "epoch": 8.53092310515691, "grad_norm": 0.0032432982698082924, "learning_rate": 7.088748991445651e-06, "loss": 0.0003, "num_input_tokens_seen": 100264608, "step": 46485 }, { "epoch": 8.531840704716462, "grad_norm": 0.07132216542959213, "learning_rate": 7.088021427487368e-06, "loss": 0.0286, "num_input_tokens_seen": 100275264, "step": 46490 }, { "epoch": 8.532758304276014, "grad_norm": 0.046369899064302444, "learning_rate": 7.087293809974574e-06, "loss": 0.1536, "num_input_tokens_seen": 100284800, "step": 46495 }, { "epoch": 8.533675903835567, "grad_norm": 222.16891479492188, "learning_rate": 7.086566138925925e-06, "loss": 0.0419, "num_input_tokens_seen": 100296960, "step": 46500 }, { "epoch": 8.534593503395119, "grad_norm": 0.004272899590432644, "learning_rate": 7.085838414360091e-06, "loss": 0.0004, "num_input_tokens_seen": 100308224, "step": 46505 }, { "epoch": 8.53551110295467, "grad_norm": 0.918304443359375, "learning_rate": 7.085110636295733e-06, "loss": 0.1179, "num_input_tokens_seen": 100319904, "step": 46510 }, { "epoch": 8.536428702514224, "grad_norm": 0.016378946602344513, "learning_rate": 7.084382804751519e-06, "loss": 0.0002, "num_input_tokens_seen": 100330016, "step": 46515 }, { "epoch": 8.537346302073775, "grad_norm": 63.54747009277344, "learning_rate": 7.083654919746119e-06, "loss": 0.6071, "num_input_tokens_seen": 100341344, "step": 46520 }, { "epoch": 8.538263901633327, "grad_norm": 0.3880491554737091, "learning_rate": 7.082926981298197e-06, "loss": 0.1319, "num_input_tokens_seen": 100352064, "step": 46525 }, { "epoch": 8.53918150119288, "grad_norm": 0.020174726843833923, "learning_rate": 7.082198989426428e-06, "loss": 0.0774, "num_input_tokens_seen": 100362560, "step": 46530 }, { "epoch": 8.540099100752432, "grad_norm": 0.04425368085503578, "learning_rate": 7.08147094414948e-06, "loss": 0.0012, "num_input_tokens_seen": 100373664, "step": 46535 }, { "epoch": 8.541016700311983, "grad_norm": 0.015176861546933651, "learning_rate": 7.08074284548603e-06, "loss": 0.0004, "num_input_tokens_seen": 100383552, "step": 46540 }, { "epoch": 8.541934299871537, "grad_norm": 0.019266847521066666, "learning_rate": 7.08001469345475e-06, "loss": 0.0042, "num_input_tokens_seen": 100393792, "step": 46545 }, { "epoch": 8.542851899431088, "grad_norm": 0.1798139065504074, "learning_rate": 7.079286488074317e-06, "loss": 0.0012, "num_input_tokens_seen": 100404896, "step": 46550 }, { "epoch": 8.54376949899064, "grad_norm": 73.34202575683594, "learning_rate": 7.078558229363408e-06, "loss": 0.1226, "num_input_tokens_seen": 100415936, "step": 46555 }, { "epoch": 8.544687098550193, "grad_norm": 0.008028523065149784, "learning_rate": 7.077829917340703e-06, "loss": 0.001, "num_input_tokens_seen": 100426144, "step": 46560 }, { "epoch": 8.545604698109745, "grad_norm": 0.0490344800055027, "learning_rate": 7.07710155202488e-06, "loss": 0.0003, "num_input_tokens_seen": 100437536, "step": 46565 }, { "epoch": 8.546522297669297, "grad_norm": 0.034713853150606155, "learning_rate": 7.076373133434621e-06, "loss": 0.0004, "num_input_tokens_seen": 100448800, "step": 46570 }, { "epoch": 8.54743989722885, "grad_norm": 0.006077036261558533, "learning_rate": 7.07564466158861e-06, "loss": 0.1975, "num_input_tokens_seen": 100458624, "step": 46575 }, { "epoch": 8.548357496788402, "grad_norm": 0.5712735056877136, "learning_rate": 7.0749161365055295e-06, "loss": 0.2483, "num_input_tokens_seen": 100468832, "step": 46580 }, { "epoch": 8.549275096347953, "grad_norm": 0.10080208629369736, "learning_rate": 7.074187558204066e-06, "loss": 0.1601, "num_input_tokens_seen": 100478304, "step": 46585 }, { "epoch": 8.550192695907507, "grad_norm": 0.08120542764663696, "learning_rate": 7.073458926702907e-06, "loss": 0.0016, "num_input_tokens_seen": 100488384, "step": 46590 }, { "epoch": 8.551110295467058, "grad_norm": 0.10044311732053757, "learning_rate": 7.07273024202074e-06, "loss": 0.001, "num_input_tokens_seen": 100500576, "step": 46595 }, { "epoch": 8.55202789502661, "grad_norm": 0.006024565082043409, "learning_rate": 7.072001504176255e-06, "loss": 0.1476, "num_input_tokens_seen": 100511104, "step": 46600 }, { "epoch": 8.552945494586163, "grad_norm": 0.015515766106545925, "learning_rate": 7.071272713188142e-06, "loss": 0.1289, "num_input_tokens_seen": 100521568, "step": 46605 }, { "epoch": 8.553863094145715, "grad_norm": 0.4293403625488281, "learning_rate": 7.070543869075095e-06, "loss": 0.0012, "num_input_tokens_seen": 100532480, "step": 46610 }, { "epoch": 8.554780693705267, "grad_norm": 0.11855636537075043, "learning_rate": 7.069814971855806e-06, "loss": 0.002, "num_input_tokens_seen": 100542208, "step": 46615 }, { "epoch": 8.55569829326482, "grad_norm": 0.024671632796525955, "learning_rate": 7.069086021548971e-06, "loss": 0.0886, "num_input_tokens_seen": 100553472, "step": 46620 }, { "epoch": 8.556615892824372, "grad_norm": 0.00969619769603014, "learning_rate": 7.0683570181732865e-06, "loss": 0.0211, "num_input_tokens_seen": 100564608, "step": 46625 }, { "epoch": 8.557533492383923, "grad_norm": 1.048222303390503, "learning_rate": 7.06762796174745e-06, "loss": 0.0014, "num_input_tokens_seen": 100574208, "step": 46630 }, { "epoch": 8.558451091943477, "grad_norm": 0.024158289656043053, "learning_rate": 7.06689885229016e-06, "loss": 0.0003, "num_input_tokens_seen": 100585536, "step": 46635 }, { "epoch": 8.559368691503028, "grad_norm": 0.024971310049295425, "learning_rate": 7.06616968982012e-06, "loss": 0.0013, "num_input_tokens_seen": 100596448, "step": 46640 }, { "epoch": 8.56028629106258, "grad_norm": 0.03241605684161186, "learning_rate": 7.065440474356028e-06, "loss": 0.0733, "num_input_tokens_seen": 100607200, "step": 46645 }, { "epoch": 8.561203890622133, "grad_norm": 48.39979934692383, "learning_rate": 7.06471120591659e-06, "loss": 0.1134, "num_input_tokens_seen": 100617568, "step": 46650 }, { "epoch": 8.562121490181685, "grad_norm": 0.002845610026270151, "learning_rate": 7.063981884520509e-06, "loss": 0.0019, "num_input_tokens_seen": 100628576, "step": 46655 }, { "epoch": 8.563039089741237, "grad_norm": 0.010020441375672817, "learning_rate": 7.063252510186493e-06, "loss": 0.0007, "num_input_tokens_seen": 100638304, "step": 46660 }, { "epoch": 8.56395668930079, "grad_norm": 0.0596279613673687, "learning_rate": 7.062523082933245e-06, "loss": 0.0017, "num_input_tokens_seen": 100649568, "step": 46665 }, { "epoch": 8.564874288860342, "grad_norm": 0.006568621378391981, "learning_rate": 7.061793602779479e-06, "loss": 0.2168, "num_input_tokens_seen": 100661376, "step": 46670 }, { "epoch": 8.565791888419893, "grad_norm": 0.06723353266716003, "learning_rate": 7.061064069743902e-06, "loss": 0.0003, "num_input_tokens_seen": 100672352, "step": 46675 }, { "epoch": 8.566709487979447, "grad_norm": 0.014640457928180695, "learning_rate": 7.060334483845225e-06, "loss": 0.0007, "num_input_tokens_seen": 100683200, "step": 46680 }, { "epoch": 8.567627087538998, "grad_norm": 0.04281233623623848, "learning_rate": 7.059604845102161e-06, "loss": 0.0005, "num_input_tokens_seen": 100692000, "step": 46685 }, { "epoch": 8.56854468709855, "grad_norm": 0.06404367834329605, "learning_rate": 7.058875153533428e-06, "loss": 0.0122, "num_input_tokens_seen": 100702464, "step": 46690 }, { "epoch": 8.569462286658103, "grad_norm": 0.005466856062412262, "learning_rate": 7.0581454091577354e-06, "loss": 0.0005, "num_input_tokens_seen": 100714080, "step": 46695 }, { "epoch": 8.570379886217655, "grad_norm": 0.01045975275337696, "learning_rate": 7.057415611993803e-06, "loss": 0.0007, "num_input_tokens_seen": 100723968, "step": 46700 }, { "epoch": 8.571297485777206, "grad_norm": 0.21292413771152496, "learning_rate": 7.0566857620603515e-06, "loss": 0.0185, "num_input_tokens_seen": 100734368, "step": 46705 }, { "epoch": 8.57221508533676, "grad_norm": 0.03894725814461708, "learning_rate": 7.0559558593760944e-06, "loss": 0.0006, "num_input_tokens_seen": 100745184, "step": 46710 }, { "epoch": 8.573132684896311, "grad_norm": 0.021747378632426262, "learning_rate": 7.055225903959759e-06, "loss": 0.0002, "num_input_tokens_seen": 100757088, "step": 46715 }, { "epoch": 8.574050284455863, "grad_norm": 0.06352830678224564, "learning_rate": 7.054495895830063e-06, "loss": 0.0002, "num_input_tokens_seen": 100766752, "step": 46720 }, { "epoch": 8.574967884015416, "grad_norm": 0.014580558985471725, "learning_rate": 7.053765835005732e-06, "loss": 0.1513, "num_input_tokens_seen": 100778688, "step": 46725 }, { "epoch": 8.575885483574968, "grad_norm": 0.1856522262096405, "learning_rate": 7.053035721505489e-06, "loss": 0.0006, "num_input_tokens_seen": 100789184, "step": 46730 }, { "epoch": 8.57680308313452, "grad_norm": 0.006085286848247051, "learning_rate": 7.052305555348062e-06, "loss": 0.0001, "num_input_tokens_seen": 100800128, "step": 46735 }, { "epoch": 8.577720682694073, "grad_norm": 0.00475460197776556, "learning_rate": 7.051575336552179e-06, "loss": 0.0072, "num_input_tokens_seen": 100811904, "step": 46740 }, { "epoch": 8.578638282253625, "grad_norm": 0.020370788872241974, "learning_rate": 7.050845065136568e-06, "loss": 0.0001, "num_input_tokens_seen": 100823456, "step": 46745 }, { "epoch": 8.579555881813176, "grad_norm": 0.0026832122821360826, "learning_rate": 7.05011474111996e-06, "loss": 0.0, "num_input_tokens_seen": 100833792, "step": 46750 }, { "epoch": 8.58047348137273, "grad_norm": 0.39077064394950867, "learning_rate": 7.049384364521086e-06, "loss": 0.0739, "num_input_tokens_seen": 100844320, "step": 46755 }, { "epoch": 8.581391080932281, "grad_norm": 0.0006017679115757346, "learning_rate": 7.048653935358681e-06, "loss": 0.0002, "num_input_tokens_seen": 100854176, "step": 46760 }, { "epoch": 8.582308680491833, "grad_norm": 0.009868058376014233, "learning_rate": 7.047923453651474e-06, "loss": 0.0109, "num_input_tokens_seen": 100865088, "step": 46765 }, { "epoch": 8.583226280051386, "grad_norm": 0.021340616047382355, "learning_rate": 7.047192919418207e-06, "loss": 0.1254, "num_input_tokens_seen": 100875424, "step": 46770 }, { "epoch": 8.584143879610938, "grad_norm": 0.010781152173876762, "learning_rate": 7.046462332677614e-06, "loss": 0.0069, "num_input_tokens_seen": 100885600, "step": 46775 }, { "epoch": 8.58506147917049, "grad_norm": 0.0008596701081842184, "learning_rate": 7.045731693448434e-06, "loss": 0.0813, "num_input_tokens_seen": 100896000, "step": 46780 }, { "epoch": 8.585979078730043, "grad_norm": 0.012054625898599625, "learning_rate": 7.045001001749406e-06, "loss": 0.033, "num_input_tokens_seen": 100906432, "step": 46785 }, { "epoch": 8.586896678289595, "grad_norm": 0.16474559903144836, "learning_rate": 7.044270257599273e-06, "loss": 0.0982, "num_input_tokens_seen": 100917856, "step": 46790 }, { "epoch": 8.587814277849146, "grad_norm": 0.008686038665473461, "learning_rate": 7.043539461016775e-06, "loss": 0.0109, "num_input_tokens_seen": 100929856, "step": 46795 }, { "epoch": 8.5887318774087, "grad_norm": 0.004937679972499609, "learning_rate": 7.0428086120206575e-06, "loss": 0.0002, "num_input_tokens_seen": 100941696, "step": 46800 }, { "epoch": 8.589649476968251, "grad_norm": 0.007394089829176664, "learning_rate": 7.0420777106296645e-06, "loss": 0.1693, "num_input_tokens_seen": 100952800, "step": 46805 }, { "epoch": 8.590567076527803, "grad_norm": 0.011809470131993294, "learning_rate": 7.041346756862543e-06, "loss": 0.0002, "num_input_tokens_seen": 100962880, "step": 46810 }, { "epoch": 8.591484676087356, "grad_norm": 0.0019264263100922108, "learning_rate": 7.040615750738042e-06, "loss": 0.2202, "num_input_tokens_seen": 100972832, "step": 46815 }, { "epoch": 8.592402275646908, "grad_norm": 0.07098668068647385, "learning_rate": 7.03988469227491e-06, "loss": 0.0008, "num_input_tokens_seen": 100983648, "step": 46820 }, { "epoch": 8.59331987520646, "grad_norm": 0.003705106908455491, "learning_rate": 7.039153581491898e-06, "loss": 0.0004, "num_input_tokens_seen": 100994432, "step": 46825 }, { "epoch": 8.594237474766013, "grad_norm": 0.216299906373024, "learning_rate": 7.038422418407754e-06, "loss": 0.0004, "num_input_tokens_seen": 101005600, "step": 46830 }, { "epoch": 8.595155074325564, "grad_norm": 0.004948475863784552, "learning_rate": 7.037691203041236e-06, "loss": 0.1534, "num_input_tokens_seen": 101017600, "step": 46835 }, { "epoch": 8.596072673885116, "grad_norm": 74.9688720703125, "learning_rate": 7.036959935411096e-06, "loss": 0.239, "num_input_tokens_seen": 101028864, "step": 46840 }, { "epoch": 8.59699027344467, "grad_norm": 0.022361014038324356, "learning_rate": 7.036228615536091e-06, "loss": 0.0017, "num_input_tokens_seen": 101039872, "step": 46845 }, { "epoch": 8.597907873004221, "grad_norm": 0.003794971853494644, "learning_rate": 7.035497243434979e-06, "loss": 0.0002, "num_input_tokens_seen": 101049120, "step": 46850 }, { "epoch": 8.598825472563773, "grad_norm": 0.10537462681531906, "learning_rate": 7.034765819126517e-06, "loss": 0.0007, "num_input_tokens_seen": 101059488, "step": 46855 }, { "epoch": 8.599743072123326, "grad_norm": 0.04025498405098915, "learning_rate": 7.034034342629464e-06, "loss": 0.0003, "num_input_tokens_seen": 101070848, "step": 46860 }, { "epoch": 8.600660671682878, "grad_norm": 0.1007477268576622, "learning_rate": 7.0333028139625835e-06, "loss": 0.0005, "num_input_tokens_seen": 101081920, "step": 46865 }, { "epoch": 8.60157827124243, "grad_norm": 0.026388516649603844, "learning_rate": 7.032571233144638e-06, "loss": 0.1785, "num_input_tokens_seen": 101091680, "step": 46870 }, { "epoch": 8.602495870801983, "grad_norm": 0.013198119588196278, "learning_rate": 7.03183960019439e-06, "loss": 0.0025, "num_input_tokens_seen": 101103200, "step": 46875 }, { "epoch": 8.603413470361534, "grad_norm": 0.09529517590999603, "learning_rate": 7.031107915130606e-06, "loss": 0.0797, "num_input_tokens_seen": 101114048, "step": 46880 }, { "epoch": 8.604331069921086, "grad_norm": 0.009834465570747852, "learning_rate": 7.0303761779720516e-06, "loss": 0.0012, "num_input_tokens_seen": 101124640, "step": 46885 }, { "epoch": 8.60524866948064, "grad_norm": 0.06644143909215927, "learning_rate": 7.029644388737493e-06, "loss": 0.0005, "num_input_tokens_seen": 101136512, "step": 46890 }, { "epoch": 8.606166269040191, "grad_norm": 56.658626556396484, "learning_rate": 7.028912547445703e-06, "loss": 0.0099, "num_input_tokens_seen": 101146112, "step": 46895 }, { "epoch": 8.607083868599743, "grad_norm": 14.321735382080078, "learning_rate": 7.028180654115451e-06, "loss": 0.1628, "num_input_tokens_seen": 101156832, "step": 46900 }, { "epoch": 8.608001468159296, "grad_norm": 0.00475383410230279, "learning_rate": 7.027448708765508e-06, "loss": 0.1255, "num_input_tokens_seen": 101167840, "step": 46905 }, { "epoch": 8.608919067718848, "grad_norm": 0.31203213334083557, "learning_rate": 7.026716711414648e-06, "loss": 0.0007, "num_input_tokens_seen": 101178240, "step": 46910 }, { "epoch": 8.6098366672784, "grad_norm": 0.0020885851699858904, "learning_rate": 7.025984662081645e-06, "loss": 0.0003, "num_input_tokens_seen": 101187552, "step": 46915 }, { "epoch": 8.610754266837953, "grad_norm": 25.964672088623047, "learning_rate": 7.025252560785276e-06, "loss": 0.3285, "num_input_tokens_seen": 101197024, "step": 46920 }, { "epoch": 8.611671866397504, "grad_norm": 30.683500289916992, "learning_rate": 7.024520407544319e-06, "loss": 0.0979, "num_input_tokens_seen": 101207168, "step": 46925 }, { "epoch": 8.612589465957056, "grad_norm": 0.02964874543249607, "learning_rate": 7.023788202377549e-06, "loss": 0.0769, "num_input_tokens_seen": 101217472, "step": 46930 }, { "epoch": 8.61350706551661, "grad_norm": 5.140234470367432, "learning_rate": 7.02305594530375e-06, "loss": 0.1019, "num_input_tokens_seen": 101228576, "step": 46935 }, { "epoch": 8.614424665076161, "grad_norm": 0.24833549559116364, "learning_rate": 7.022323636341699e-06, "loss": 0.1107, "num_input_tokens_seen": 101239104, "step": 46940 }, { "epoch": 8.615342264635713, "grad_norm": 0.018956119194626808, "learning_rate": 7.021591275510182e-06, "loss": 0.008, "num_input_tokens_seen": 101249408, "step": 46945 }, { "epoch": 8.616259864195266, "grad_norm": 0.004261434078216553, "learning_rate": 7.020858862827984e-06, "loss": 0.0003, "num_input_tokens_seen": 101260512, "step": 46950 }, { "epoch": 8.617177463754818, "grad_norm": 0.13982811570167542, "learning_rate": 7.020126398313887e-06, "loss": 0.0013, "num_input_tokens_seen": 101270016, "step": 46955 }, { "epoch": 8.61809506331437, "grad_norm": 0.07731804996728897, "learning_rate": 7.019393881986678e-06, "loss": 0.0065, "num_input_tokens_seen": 101281152, "step": 46960 }, { "epoch": 8.619012662873923, "grad_norm": 0.013392513617873192, "learning_rate": 7.018661313865147e-06, "loss": 0.065, "num_input_tokens_seen": 101291232, "step": 46965 }, { "epoch": 8.619930262433474, "grad_norm": 0.03136714547872543, "learning_rate": 7.01792869396808e-06, "loss": 0.0001, "num_input_tokens_seen": 101303232, "step": 46970 }, { "epoch": 8.620847861993026, "grad_norm": 121.33782196044922, "learning_rate": 7.017196022314272e-06, "loss": 0.072, "num_input_tokens_seen": 101313728, "step": 46975 }, { "epoch": 8.62176546155258, "grad_norm": 0.045334022492170334, "learning_rate": 7.016463298922511e-06, "loss": 0.2066, "num_input_tokens_seen": 101325088, "step": 46980 }, { "epoch": 8.62268306111213, "grad_norm": 0.00944594107568264, "learning_rate": 7.015730523811592e-06, "loss": 0.0003, "num_input_tokens_seen": 101336096, "step": 46985 }, { "epoch": 8.623600660671682, "grad_norm": 22.3746337890625, "learning_rate": 7.014997697000309e-06, "loss": 0.0854, "num_input_tokens_seen": 101346976, "step": 46990 }, { "epoch": 8.624518260231236, "grad_norm": 0.08853799849748611, "learning_rate": 7.014264818507458e-06, "loss": 0.0766, "num_input_tokens_seen": 101357728, "step": 46995 }, { "epoch": 8.625435859790787, "grad_norm": 0.04481983557343483, "learning_rate": 7.013531888351837e-06, "loss": 0.0005, "num_input_tokens_seen": 101368160, "step": 47000 }, { "epoch": 8.626353459350339, "grad_norm": 0.004987737629562616, "learning_rate": 7.012798906552242e-06, "loss": 0.0009, "num_input_tokens_seen": 101378752, "step": 47005 }, { "epoch": 8.627271058909892, "grad_norm": 0.11601117253303528, "learning_rate": 7.012065873127476e-06, "loss": 0.0826, "num_input_tokens_seen": 101389952, "step": 47010 }, { "epoch": 8.628188658469444, "grad_norm": 0.004698862787336111, "learning_rate": 7.011332788096338e-06, "loss": 0.0006, "num_input_tokens_seen": 101400448, "step": 47015 }, { "epoch": 8.629106258028996, "grad_norm": 0.006944045890122652, "learning_rate": 7.010599651477632e-06, "loss": 0.0002, "num_input_tokens_seen": 101411712, "step": 47020 }, { "epoch": 8.630023857588549, "grad_norm": 0.004028872586786747, "learning_rate": 7.00986646329016e-06, "loss": 0.1862, "num_input_tokens_seen": 101422592, "step": 47025 }, { "epoch": 8.6309414571481, "grad_norm": 0.041908588260412216, "learning_rate": 7.009133223552729e-06, "loss": 0.2537, "num_input_tokens_seen": 101433408, "step": 47030 }, { "epoch": 8.631859056707652, "grad_norm": 0.027392040938138962, "learning_rate": 7.008399932284145e-06, "loss": 0.0007, "num_input_tokens_seen": 101443552, "step": 47035 }, { "epoch": 8.632776656267206, "grad_norm": 0.028960635885596275, "learning_rate": 7.007666589503215e-06, "loss": 0.0004, "num_input_tokens_seen": 101455552, "step": 47040 }, { "epoch": 8.633694255826757, "grad_norm": 0.059312909841537476, "learning_rate": 7.006933195228749e-06, "loss": 0.1388, "num_input_tokens_seen": 101466688, "step": 47045 }, { "epoch": 8.634611855386309, "grad_norm": 0.36680811643600464, "learning_rate": 7.006199749479557e-06, "loss": 0.0005, "num_input_tokens_seen": 101477728, "step": 47050 }, { "epoch": 8.635529454945862, "grad_norm": 0.16266511380672455, "learning_rate": 7.005466252274449e-06, "loss": 0.2868, "num_input_tokens_seen": 101488736, "step": 47055 }, { "epoch": 8.636447054505414, "grad_norm": 0.06289210170507431, "learning_rate": 7.004732703632242e-06, "loss": 0.0987, "num_input_tokens_seen": 101500448, "step": 47060 }, { "epoch": 8.637364654064966, "grad_norm": 0.1410416215658188, "learning_rate": 7.003999103571747e-06, "loss": 0.0893, "num_input_tokens_seen": 101511392, "step": 47065 }, { "epoch": 8.638282253624519, "grad_norm": 0.01841496117413044, "learning_rate": 7.003265452111781e-06, "loss": 0.0833, "num_input_tokens_seen": 101521728, "step": 47070 }, { "epoch": 8.63919985318407, "grad_norm": 0.01456835400313139, "learning_rate": 7.002531749271162e-06, "loss": 0.0863, "num_input_tokens_seen": 101533984, "step": 47075 }, { "epoch": 8.640117452743622, "grad_norm": 0.08738166093826294, "learning_rate": 7.001797995068706e-06, "loss": 0.0013, "num_input_tokens_seen": 101545632, "step": 47080 }, { "epoch": 8.641035052303176, "grad_norm": 0.7625078558921814, "learning_rate": 7.0010641895232345e-06, "loss": 0.1251, "num_input_tokens_seen": 101555776, "step": 47085 }, { "epoch": 8.641952651862727, "grad_norm": 0.02644042856991291, "learning_rate": 7.000330332653569e-06, "loss": 0.005, "num_input_tokens_seen": 101565792, "step": 47090 }, { "epoch": 8.642870251422279, "grad_norm": 2.191408634185791, "learning_rate": 6.99959642447853e-06, "loss": 0.0037, "num_input_tokens_seen": 101576288, "step": 47095 }, { "epoch": 8.643787850981832, "grad_norm": 32.358158111572266, "learning_rate": 6.998862465016941e-06, "loss": 0.2664, "num_input_tokens_seen": 101586752, "step": 47100 }, { "epoch": 8.644705450541384, "grad_norm": 0.028133444488048553, "learning_rate": 6.998128454287627e-06, "loss": 0.1692, "num_input_tokens_seen": 101597472, "step": 47105 }, { "epoch": 8.645623050100935, "grad_norm": 0.0012068889336660504, "learning_rate": 6.997394392309418e-06, "loss": 0.0002, "num_input_tokens_seen": 101608736, "step": 47110 }, { "epoch": 8.646540649660489, "grad_norm": 0.027901239693164825, "learning_rate": 6.996660279101135e-06, "loss": 0.0017, "num_input_tokens_seen": 101619968, "step": 47115 }, { "epoch": 8.64745824922004, "grad_norm": 20.196552276611328, "learning_rate": 6.995926114681612e-06, "loss": 0.3013, "num_input_tokens_seen": 101629952, "step": 47120 }, { "epoch": 8.648375848779592, "grad_norm": 0.005107021424919367, "learning_rate": 6.995191899069678e-06, "loss": 0.0007, "num_input_tokens_seen": 101640320, "step": 47125 }, { "epoch": 8.649293448339145, "grad_norm": 0.05135262385010719, "learning_rate": 6.994457632284164e-06, "loss": 0.1163, "num_input_tokens_seen": 101650976, "step": 47130 }, { "epoch": 8.650211047898697, "grad_norm": 0.5470623970031738, "learning_rate": 6.993723314343903e-06, "loss": 0.0016, "num_input_tokens_seen": 101662144, "step": 47135 }, { "epoch": 8.651128647458249, "grad_norm": 0.04613441228866577, "learning_rate": 6.992988945267728e-06, "loss": 0.0003, "num_input_tokens_seen": 101673056, "step": 47140 }, { "epoch": 8.652046247017802, "grad_norm": 0.7629560828208923, "learning_rate": 6.9922545250744754e-06, "loss": 0.0005, "num_input_tokens_seen": 101683200, "step": 47145 }, { "epoch": 8.652963846577354, "grad_norm": 0.384370356798172, "learning_rate": 6.991520053782983e-06, "loss": 0.0015, "num_input_tokens_seen": 101695648, "step": 47150 }, { "epoch": 8.653881446136905, "grad_norm": 0.030380498617887497, "learning_rate": 6.990785531412087e-06, "loss": 0.1219, "num_input_tokens_seen": 101706560, "step": 47155 }, { "epoch": 8.654799045696459, "grad_norm": 0.020955704152584076, "learning_rate": 6.990050957980628e-06, "loss": 0.1917, "num_input_tokens_seen": 101716992, "step": 47160 }, { "epoch": 8.65571664525601, "grad_norm": 0.009427618235349655, "learning_rate": 6.989316333507446e-06, "loss": 0.0009, "num_input_tokens_seen": 101727456, "step": 47165 }, { "epoch": 8.656634244815562, "grad_norm": 15.550031661987305, "learning_rate": 6.988581658011383e-06, "loss": 0.2448, "num_input_tokens_seen": 101738176, "step": 47170 }, { "epoch": 8.657551844375115, "grad_norm": 0.22549356520175934, "learning_rate": 6.987846931511282e-06, "loss": 0.1079, "num_input_tokens_seen": 101747552, "step": 47175 }, { "epoch": 8.658469443934667, "grad_norm": 0.30970245599746704, "learning_rate": 6.98711215402599e-06, "loss": 0.3257, "num_input_tokens_seen": 101759072, "step": 47180 }, { "epoch": 8.659387043494219, "grad_norm": 32.12735366821289, "learning_rate": 6.98637732557435e-06, "loss": 0.0772, "num_input_tokens_seen": 101770208, "step": 47185 }, { "epoch": 8.660304643053772, "grad_norm": 0.03252184018492699, "learning_rate": 6.985642446175209e-06, "loss": 0.1227, "num_input_tokens_seen": 101779872, "step": 47190 }, { "epoch": 8.661222242613324, "grad_norm": 28.582883834838867, "learning_rate": 6.984907515847418e-06, "loss": 0.0859, "num_input_tokens_seen": 101791072, "step": 47195 }, { "epoch": 8.662139842172875, "grad_norm": 0.02221558429300785, "learning_rate": 6.984172534609825e-06, "loss": 0.0012, "num_input_tokens_seen": 101802496, "step": 47200 }, { "epoch": 8.663057441732429, "grad_norm": 0.05208635330200195, "learning_rate": 6.983437502481283e-06, "loss": 0.002, "num_input_tokens_seen": 101813152, "step": 47205 }, { "epoch": 8.66397504129198, "grad_norm": 0.1190207451581955, "learning_rate": 6.982702419480642e-06, "loss": 0.1506, "num_input_tokens_seen": 101823904, "step": 47210 }, { "epoch": 8.664892640851532, "grad_norm": 10.86235523223877, "learning_rate": 6.981967285626756e-06, "loss": 0.1091, "num_input_tokens_seen": 101834080, "step": 47215 }, { "epoch": 8.665810240411085, "grad_norm": 0.024435514584183693, "learning_rate": 6.981232100938482e-06, "loss": 0.0887, "num_input_tokens_seen": 101844352, "step": 47220 }, { "epoch": 8.666727839970637, "grad_norm": 0.12018942087888718, "learning_rate": 6.980496865434675e-06, "loss": 0.1562, "num_input_tokens_seen": 101854880, "step": 47225 }, { "epoch": 8.667645439530189, "grad_norm": 0.23107509315013885, "learning_rate": 6.979761579134193e-06, "loss": 0.0009, "num_input_tokens_seen": 101865024, "step": 47230 }, { "epoch": 8.668563039089742, "grad_norm": 4.7322516441345215, "learning_rate": 6.979026242055895e-06, "loss": 0.0013, "num_input_tokens_seen": 101875744, "step": 47235 }, { "epoch": 8.669480638649294, "grad_norm": 0.1398368626832962, "learning_rate": 6.97829085421864e-06, "loss": 0.0333, "num_input_tokens_seen": 101886976, "step": 47240 }, { "epoch": 8.670398238208845, "grad_norm": 0.06445912271738052, "learning_rate": 6.9775554156412925e-06, "loss": 0.243, "num_input_tokens_seen": 101898016, "step": 47245 }, { "epoch": 8.671315837768399, "grad_norm": 0.15080970525741577, "learning_rate": 6.976819926342712e-06, "loss": 0.0033, "num_input_tokens_seen": 101909184, "step": 47250 }, { "epoch": 8.67223343732795, "grad_norm": 9.227090835571289, "learning_rate": 6.976084386341766e-06, "loss": 0.3414, "num_input_tokens_seen": 101920128, "step": 47255 }, { "epoch": 8.673151036887502, "grad_norm": 0.1275314837694168, "learning_rate": 6.975348795657316e-06, "loss": 0.0008, "num_input_tokens_seen": 101930496, "step": 47260 }, { "epoch": 8.674068636447055, "grad_norm": 13.578716278076172, "learning_rate": 6.974613154308232e-06, "loss": 0.1321, "num_input_tokens_seen": 101940800, "step": 47265 }, { "epoch": 8.674986236006607, "grad_norm": 0.20086492598056793, "learning_rate": 6.973877462313381e-06, "loss": 0.0009, "num_input_tokens_seen": 101951392, "step": 47270 }, { "epoch": 8.675903835566158, "grad_norm": 2.720156669616699, "learning_rate": 6.973141719691632e-06, "loss": 0.1436, "num_input_tokens_seen": 101961632, "step": 47275 }, { "epoch": 8.676821435125712, "grad_norm": 0.014859550632536411, "learning_rate": 6.972405926461856e-06, "loss": 0.0022, "num_input_tokens_seen": 101973280, "step": 47280 }, { "epoch": 8.677739034685263, "grad_norm": 0.3020528554916382, "learning_rate": 6.971670082642925e-06, "loss": 0.0017, "num_input_tokens_seen": 101983328, "step": 47285 }, { "epoch": 8.678656634244815, "grad_norm": 0.016205554828047752, "learning_rate": 6.9709341882537105e-06, "loss": 0.0048, "num_input_tokens_seen": 101994144, "step": 47290 }, { "epoch": 8.679574233804368, "grad_norm": 0.054541055113077164, "learning_rate": 6.970198243313091e-06, "loss": 0.0039, "num_input_tokens_seen": 102004992, "step": 47295 }, { "epoch": 8.68049183336392, "grad_norm": 0.218361496925354, "learning_rate": 6.969462247839941e-06, "loss": 0.1804, "num_input_tokens_seen": 102015264, "step": 47300 }, { "epoch": 8.681409432923472, "grad_norm": 2.8022875785827637, "learning_rate": 6.9687262018531345e-06, "loss": 0.0097, "num_input_tokens_seen": 102027104, "step": 47305 }, { "epoch": 8.682327032483025, "grad_norm": 0.06411366164684296, "learning_rate": 6.9679901053715536e-06, "loss": 0.0006, "num_input_tokens_seen": 102037312, "step": 47310 }, { "epoch": 8.683244632042577, "grad_norm": 3.232438087463379, "learning_rate": 6.967253958414075e-06, "loss": 0.0023, "num_input_tokens_seen": 102047744, "step": 47315 }, { "epoch": 8.684162231602128, "grad_norm": 0.005774975288659334, "learning_rate": 6.966517760999584e-06, "loss": 0.0706, "num_input_tokens_seen": 102059232, "step": 47320 }, { "epoch": 8.685079831161682, "grad_norm": 0.018192684277892113, "learning_rate": 6.965781513146957e-06, "loss": 0.0003, "num_input_tokens_seen": 102069024, "step": 47325 }, { "epoch": 8.685997430721233, "grad_norm": 0.025008823722600937, "learning_rate": 6.965045214875083e-06, "loss": 0.0002, "num_input_tokens_seen": 102079104, "step": 47330 }, { "epoch": 8.686915030280785, "grad_norm": 0.1066809892654419, "learning_rate": 6.964308866202844e-06, "loss": 0.2035, "num_input_tokens_seen": 102089824, "step": 47335 }, { "epoch": 8.687832629840338, "grad_norm": 0.036416687071323395, "learning_rate": 6.963572467149128e-06, "loss": 0.0004, "num_input_tokens_seen": 102102496, "step": 47340 }, { "epoch": 8.68875022939989, "grad_norm": 0.022818991914391518, "learning_rate": 6.96283601773282e-06, "loss": 0.016, "num_input_tokens_seen": 102113984, "step": 47345 }, { "epoch": 8.689667828959442, "grad_norm": 0.016329806298017502, "learning_rate": 6.962099517972811e-06, "loss": 0.1143, "num_input_tokens_seen": 102125088, "step": 47350 }, { "epoch": 8.690585428518995, "grad_norm": 0.040994007140398026, "learning_rate": 6.96136296788799e-06, "loss": 0.0003, "num_input_tokens_seen": 102135552, "step": 47355 }, { "epoch": 8.691503028078547, "grad_norm": 1.1010960340499878, "learning_rate": 6.9606263674972485e-06, "loss": 0.0009, "num_input_tokens_seen": 102146336, "step": 47360 }, { "epoch": 8.692420627638098, "grad_norm": 32.64341735839844, "learning_rate": 6.959889716819481e-06, "loss": 0.1665, "num_input_tokens_seen": 102157344, "step": 47365 }, { "epoch": 8.693338227197652, "grad_norm": 0.2992250919342041, "learning_rate": 6.959153015873578e-06, "loss": 0.2235, "num_input_tokens_seen": 102166496, "step": 47370 }, { "epoch": 8.694255826757203, "grad_norm": 0.002288958290591836, "learning_rate": 6.958416264678437e-06, "loss": 0.0002, "num_input_tokens_seen": 102176320, "step": 47375 }, { "epoch": 8.695173426316755, "grad_norm": 0.014503780752420425, "learning_rate": 6.9576794632529546e-06, "loss": 0.0002, "num_input_tokens_seen": 102186720, "step": 47380 }, { "epoch": 8.696091025876308, "grad_norm": 0.0183922927826643, "learning_rate": 6.9569426116160275e-06, "loss": 0.0054, "num_input_tokens_seen": 102196288, "step": 47385 }, { "epoch": 8.69700862543586, "grad_norm": 0.01769370399415493, "learning_rate": 6.956205709786556e-06, "loss": 0.0003, "num_input_tokens_seen": 102206368, "step": 47390 }, { "epoch": 8.697926224995411, "grad_norm": 0.10489416867494583, "learning_rate": 6.955468757783439e-06, "loss": 0.0008, "num_input_tokens_seen": 102217312, "step": 47395 }, { "epoch": 8.698843824554965, "grad_norm": 0.3190891742706299, "learning_rate": 6.9547317556255785e-06, "loss": 0.0008, "num_input_tokens_seen": 102228864, "step": 47400 }, { "epoch": 8.699761424114516, "grad_norm": 0.04426955059170723, "learning_rate": 6.953994703331881e-06, "loss": 0.0004, "num_input_tokens_seen": 102240800, "step": 47405 }, { "epoch": 8.700679023674068, "grad_norm": 0.08750040829181671, "learning_rate": 6.953257600921246e-06, "loss": 0.0949, "num_input_tokens_seen": 102251552, "step": 47410 }, { "epoch": 8.701596623233621, "grad_norm": 0.01158913504332304, "learning_rate": 6.952520448412581e-06, "loss": 0.0002, "num_input_tokens_seen": 102261952, "step": 47415 }, { "epoch": 8.702514222793173, "grad_norm": 0.00554822850972414, "learning_rate": 6.951783245824793e-06, "loss": 0.1442, "num_input_tokens_seen": 102272096, "step": 47420 }, { "epoch": 8.703431822352725, "grad_norm": 0.011653375811874866, "learning_rate": 6.951045993176788e-06, "loss": 0.0765, "num_input_tokens_seen": 102283776, "step": 47425 }, { "epoch": 8.704349421912278, "grad_norm": 0.006724616512656212, "learning_rate": 6.9503086904874795e-06, "loss": 0.0311, "num_input_tokens_seen": 102295648, "step": 47430 }, { "epoch": 8.70526702147183, "grad_norm": 0.01453370414674282, "learning_rate": 6.949571337775774e-06, "loss": 0.1817, "num_input_tokens_seen": 102306880, "step": 47435 }, { "epoch": 8.706184621031381, "grad_norm": 0.043225906789302826, "learning_rate": 6.948833935060586e-06, "loss": 0.0003, "num_input_tokens_seen": 102318080, "step": 47440 }, { "epoch": 8.707102220590935, "grad_norm": 0.03861300274729729, "learning_rate": 6.948096482360827e-06, "loss": 0.0855, "num_input_tokens_seen": 102327968, "step": 47445 }, { "epoch": 8.708019820150486, "grad_norm": 0.010674179531633854, "learning_rate": 6.947358979695413e-06, "loss": 0.0004, "num_input_tokens_seen": 102337184, "step": 47450 }, { "epoch": 8.708937419710038, "grad_norm": 0.005360737442970276, "learning_rate": 6.9466214270832596e-06, "loss": 0.0002, "num_input_tokens_seen": 102348928, "step": 47455 }, { "epoch": 8.709855019269591, "grad_norm": 3.653757333755493, "learning_rate": 6.945883824543283e-06, "loss": 0.1052, "num_input_tokens_seen": 102358944, "step": 47460 }, { "epoch": 8.710772618829143, "grad_norm": 0.006199615076184273, "learning_rate": 6.9451461720944035e-06, "loss": 0.3368, "num_input_tokens_seen": 102369696, "step": 47465 }, { "epoch": 8.711690218388695, "grad_norm": 0.03992745280265808, "learning_rate": 6.944408469755539e-06, "loss": 0.0005, "num_input_tokens_seen": 102380288, "step": 47470 }, { "epoch": 8.712607817948248, "grad_norm": 0.0030923765152692795, "learning_rate": 6.943670717545611e-06, "loss": 0.3496, "num_input_tokens_seen": 102392672, "step": 47475 }, { "epoch": 8.7135254175078, "grad_norm": 0.03381182998418808, "learning_rate": 6.94293291548354e-06, "loss": 0.2792, "num_input_tokens_seen": 102403488, "step": 47480 }, { "epoch": 8.714443017067351, "grad_norm": 0.02153068035840988, "learning_rate": 6.9421950635882514e-06, "loss": 0.054, "num_input_tokens_seen": 102414304, "step": 47485 }, { "epoch": 8.715360616626905, "grad_norm": 0.07058952748775482, "learning_rate": 6.941457161878671e-06, "loss": 0.0012, "num_input_tokens_seen": 102424704, "step": 47490 }, { "epoch": 8.716278216186456, "grad_norm": 0.17507733404636383, "learning_rate": 6.940719210373722e-06, "loss": 0.0029, "num_input_tokens_seen": 102435360, "step": 47495 }, { "epoch": 8.717195815746008, "grad_norm": 0.07695765048265457, "learning_rate": 6.939981209092334e-06, "loss": 0.0011, "num_input_tokens_seen": 102445408, "step": 47500 }, { "epoch": 8.718113415305561, "grad_norm": 0.24127750098705292, "learning_rate": 6.939243158053434e-06, "loss": 0.0015, "num_input_tokens_seen": 102456384, "step": 47505 }, { "epoch": 8.719031014865113, "grad_norm": 0.37684136629104614, "learning_rate": 6.938505057275951e-06, "loss": 0.1203, "num_input_tokens_seen": 102466752, "step": 47510 }, { "epoch": 8.719948614424665, "grad_norm": 0.09543969482183456, "learning_rate": 6.937766906778821e-06, "loss": 0.0016, "num_input_tokens_seen": 102478944, "step": 47515 }, { "epoch": 8.720866213984218, "grad_norm": 0.2377178966999054, "learning_rate": 6.937028706580973e-06, "loss": 0.0859, "num_input_tokens_seen": 102489568, "step": 47520 }, { "epoch": 8.72178381354377, "grad_norm": 0.00938767846673727, "learning_rate": 6.9362904567013374e-06, "loss": 0.0015, "num_input_tokens_seen": 102500544, "step": 47525 }, { "epoch": 8.722701413103321, "grad_norm": 0.008467023260891438, "learning_rate": 6.935552157158856e-06, "loss": 0.0011, "num_input_tokens_seen": 102511520, "step": 47530 }, { "epoch": 8.723619012662875, "grad_norm": 0.03502257540822029, "learning_rate": 6.934813807972459e-06, "loss": 0.0016, "num_input_tokens_seen": 102524032, "step": 47535 }, { "epoch": 8.724536612222426, "grad_norm": 0.02789871022105217, "learning_rate": 6.934075409161088e-06, "loss": 0.0026, "num_input_tokens_seen": 102533984, "step": 47540 }, { "epoch": 8.725454211781978, "grad_norm": 1.476523518562317, "learning_rate": 6.933336960743679e-06, "loss": 0.2657, "num_input_tokens_seen": 102544288, "step": 47545 }, { "epoch": 8.726371811341531, "grad_norm": 0.017969954758882523, "learning_rate": 6.932598462739176e-06, "loss": 0.0067, "num_input_tokens_seen": 102555872, "step": 47550 }, { "epoch": 8.727289410901083, "grad_norm": 32.17251205444336, "learning_rate": 6.931859915166516e-06, "loss": 0.319, "num_input_tokens_seen": 102567264, "step": 47555 }, { "epoch": 8.728207010460634, "grad_norm": 9.358717918395996, "learning_rate": 6.931121318044642e-06, "loss": 0.0201, "num_input_tokens_seen": 102577216, "step": 47560 }, { "epoch": 8.729124610020188, "grad_norm": 0.05062338709831238, "learning_rate": 6.930382671392502e-06, "loss": 0.2216, "num_input_tokens_seen": 102588960, "step": 47565 }, { "epoch": 8.73004220957974, "grad_norm": 0.17538946866989136, "learning_rate": 6.929643975229036e-06, "loss": 0.0009, "num_input_tokens_seen": 102599904, "step": 47570 }, { "epoch": 8.730959809139291, "grad_norm": 0.03652692958712578, "learning_rate": 6.928905229573194e-06, "loss": 0.0006, "num_input_tokens_seen": 102611168, "step": 47575 }, { "epoch": 8.731877408698844, "grad_norm": 3.0236456394195557, "learning_rate": 6.9281664344439215e-06, "loss": 0.0012, "num_input_tokens_seen": 102620800, "step": 47580 }, { "epoch": 8.732795008258396, "grad_norm": 0.023021746426820755, "learning_rate": 6.927427589860167e-06, "loss": 0.0007, "num_input_tokens_seen": 102630848, "step": 47585 }, { "epoch": 8.733712607817948, "grad_norm": 0.017147431150078773, "learning_rate": 6.9266886958408855e-06, "loss": 0.0941, "num_input_tokens_seen": 102641248, "step": 47590 }, { "epoch": 8.734630207377501, "grad_norm": 0.021343642845749855, "learning_rate": 6.9259497524050225e-06, "loss": 0.1662, "num_input_tokens_seen": 102651744, "step": 47595 }, { "epoch": 8.735547806937053, "grad_norm": 0.10889746248722076, "learning_rate": 6.925210759571535e-06, "loss": 0.0003, "num_input_tokens_seen": 102662144, "step": 47600 }, { "epoch": 8.736465406496604, "grad_norm": 0.027858886867761612, "learning_rate": 6.924471717359373e-06, "loss": 0.001, "num_input_tokens_seen": 102674368, "step": 47605 }, { "epoch": 8.737383006056158, "grad_norm": 0.17343273758888245, "learning_rate": 6.923732625787496e-06, "loss": 0.0008, "num_input_tokens_seen": 102686048, "step": 47610 }, { "epoch": 8.73830060561571, "grad_norm": 0.012076948769390583, "learning_rate": 6.922993484874858e-06, "loss": 0.0023, "num_input_tokens_seen": 102696576, "step": 47615 }, { "epoch": 8.739218205175261, "grad_norm": 0.012766205705702305, "learning_rate": 6.922254294640419e-06, "loss": 0.0002, "num_input_tokens_seen": 102708256, "step": 47620 }, { "epoch": 8.740135804734814, "grad_norm": 0.07134580612182617, "learning_rate": 6.921515055103134e-06, "loss": 0.0015, "num_input_tokens_seen": 102719648, "step": 47625 }, { "epoch": 8.741053404294366, "grad_norm": 0.002319756429642439, "learning_rate": 6.920775766281968e-06, "loss": 0.1934, "num_input_tokens_seen": 102730528, "step": 47630 }, { "epoch": 8.741971003853918, "grad_norm": 0.00718664051964879, "learning_rate": 6.92003642819588e-06, "loss": 0.0915, "num_input_tokens_seen": 102741312, "step": 47635 }, { "epoch": 8.742888603413471, "grad_norm": 0.10025589913129807, "learning_rate": 6.919297040863832e-06, "loss": 0.0004, "num_input_tokens_seen": 102750976, "step": 47640 }, { "epoch": 8.743806202973023, "grad_norm": 0.0724586769938469, "learning_rate": 6.918557604304792e-06, "loss": 0.0008, "num_input_tokens_seen": 102761248, "step": 47645 }, { "epoch": 8.744723802532574, "grad_norm": 0.0028195970226079226, "learning_rate": 6.917818118537721e-06, "loss": 0.1096, "num_input_tokens_seen": 102772320, "step": 47650 }, { "epoch": 8.745641402092128, "grad_norm": 0.024284105747938156, "learning_rate": 6.917078583581589e-06, "loss": 0.0002, "num_input_tokens_seen": 102782656, "step": 47655 }, { "epoch": 8.74655900165168, "grad_norm": 0.010091137140989304, "learning_rate": 6.916338999455362e-06, "loss": 0.0691, "num_input_tokens_seen": 102793856, "step": 47660 }, { "epoch": 8.74747660121123, "grad_norm": 0.0059880102053284645, "learning_rate": 6.915599366178008e-06, "loss": 0.0004, "num_input_tokens_seen": 102804256, "step": 47665 }, { "epoch": 8.748394200770784, "grad_norm": 0.007198041304945946, "learning_rate": 6.914859683768501e-06, "loss": 0.1533, "num_input_tokens_seen": 102814496, "step": 47670 }, { "epoch": 8.749311800330336, "grad_norm": 0.026060020551085472, "learning_rate": 6.9141199522458115e-06, "loss": 0.0006, "num_input_tokens_seen": 102824800, "step": 47675 }, { "epoch": 8.750229399889887, "grad_norm": 26.63554573059082, "learning_rate": 6.913380171628909e-06, "loss": 0.2034, "num_input_tokens_seen": 102835392, "step": 47680 }, { "epoch": 8.75114699944944, "grad_norm": 0.010488510131835938, "learning_rate": 6.912640341936774e-06, "loss": 0.0003, "num_input_tokens_seen": 102846016, "step": 47685 }, { "epoch": 8.752064599008992, "grad_norm": 0.021816037595272064, "learning_rate": 6.911900463188377e-06, "loss": 0.0002, "num_input_tokens_seen": 102857120, "step": 47690 }, { "epoch": 8.752982198568544, "grad_norm": 0.02377636358141899, "learning_rate": 6.911160535402694e-06, "loss": 0.2034, "num_input_tokens_seen": 102866688, "step": 47695 }, { "epoch": 8.753899798128097, "grad_norm": 0.027959072962403297, "learning_rate": 6.910420558598709e-06, "loss": 0.0002, "num_input_tokens_seen": 102877856, "step": 47700 }, { "epoch": 8.754817397687649, "grad_norm": 0.022401925176382065, "learning_rate": 6.9096805327953955e-06, "loss": 0.1472, "num_input_tokens_seen": 102888864, "step": 47705 }, { "epoch": 8.7557349972472, "grad_norm": 0.010286309756338596, "learning_rate": 6.9089404580117365e-06, "loss": 0.1877, "num_input_tokens_seen": 102898976, "step": 47710 }, { "epoch": 8.756652596806754, "grad_norm": 0.03977872058749199, "learning_rate": 6.9082003342667145e-06, "loss": 0.1389, "num_input_tokens_seen": 102909408, "step": 47715 }, { "epoch": 8.757570196366306, "grad_norm": 0.009456132538616657, "learning_rate": 6.907460161579309e-06, "loss": 0.0034, "num_input_tokens_seen": 102919648, "step": 47720 }, { "epoch": 8.758487795925857, "grad_norm": 0.038281094282865524, "learning_rate": 6.906719939968509e-06, "loss": 0.1008, "num_input_tokens_seen": 102931904, "step": 47725 }, { "epoch": 8.75940539548541, "grad_norm": 0.01825621724128723, "learning_rate": 6.905979669453298e-06, "loss": 0.0854, "num_input_tokens_seen": 102942656, "step": 47730 }, { "epoch": 8.760322995044962, "grad_norm": 0.0391765795648098, "learning_rate": 6.90523935005266e-06, "loss": 0.0928, "num_input_tokens_seen": 102953472, "step": 47735 }, { "epoch": 8.761240594604514, "grad_norm": 0.030063143000006676, "learning_rate": 6.904498981785588e-06, "loss": 0.0007, "num_input_tokens_seen": 102964064, "step": 47740 }, { "epoch": 8.762158194164067, "grad_norm": 0.0190117210149765, "learning_rate": 6.903758564671067e-06, "loss": 0.0007, "num_input_tokens_seen": 102974016, "step": 47745 }, { "epoch": 8.763075793723619, "grad_norm": 0.07466720789670944, "learning_rate": 6.90301809872809e-06, "loss": 0.0007, "num_input_tokens_seen": 102985120, "step": 47750 }, { "epoch": 8.76399339328317, "grad_norm": 1.9110709428787231, "learning_rate": 6.902277583975647e-06, "loss": 0.1704, "num_input_tokens_seen": 102996288, "step": 47755 }, { "epoch": 8.764910992842724, "grad_norm": 0.01068019773811102, "learning_rate": 6.901537020432735e-06, "loss": 0.0007, "num_input_tokens_seen": 103007808, "step": 47760 }, { "epoch": 8.765828592402276, "grad_norm": 0.04735059291124344, "learning_rate": 6.900796408118343e-06, "loss": 0.0009, "num_input_tokens_seen": 103019264, "step": 47765 }, { "epoch": 8.766746191961827, "grad_norm": 0.01128439698368311, "learning_rate": 6.90005574705147e-06, "loss": 0.001, "num_input_tokens_seen": 103029408, "step": 47770 }, { "epoch": 8.76766379152138, "grad_norm": 0.01968299224972725, "learning_rate": 6.899315037251112e-06, "loss": 0.1008, "num_input_tokens_seen": 103039584, "step": 47775 }, { "epoch": 8.768581391080932, "grad_norm": 0.07355894893407822, "learning_rate": 6.898574278736266e-06, "loss": 0.0008, "num_input_tokens_seen": 103050816, "step": 47780 }, { "epoch": 8.769498990640484, "grad_norm": 14.02786636352539, "learning_rate": 6.897833471525934e-06, "loss": 0.2577, "num_input_tokens_seen": 103062208, "step": 47785 }, { "epoch": 8.770416590200037, "grad_norm": 0.02830834873020649, "learning_rate": 6.897092615639114e-06, "loss": 0.0019, "num_input_tokens_seen": 103072768, "step": 47790 }, { "epoch": 8.771334189759589, "grad_norm": 0.09353446960449219, "learning_rate": 6.896351711094809e-06, "loss": 0.1638, "num_input_tokens_seen": 103082592, "step": 47795 }, { "epoch": 8.77225178931914, "grad_norm": 12.71731185913086, "learning_rate": 6.895610757912021e-06, "loss": 0.3763, "num_input_tokens_seen": 103094016, "step": 47800 }, { "epoch": 8.773169388878694, "grad_norm": 0.6846746802330017, "learning_rate": 6.894869756109754e-06, "loss": 0.0016, "num_input_tokens_seen": 103104352, "step": 47805 }, { "epoch": 8.774086988438246, "grad_norm": 0.03909428045153618, "learning_rate": 6.894128705707018e-06, "loss": 0.0028, "num_input_tokens_seen": 103115712, "step": 47810 }, { "epoch": 8.775004587997797, "grad_norm": 0.024518514052033424, "learning_rate": 6.893387606722812e-06, "loss": 0.1144, "num_input_tokens_seen": 103126624, "step": 47815 }, { "epoch": 8.77592218755735, "grad_norm": 0.32560452818870544, "learning_rate": 6.892646459176152e-06, "loss": 0.0009, "num_input_tokens_seen": 103137728, "step": 47820 }, { "epoch": 8.776839787116902, "grad_norm": 0.018862735480070114, "learning_rate": 6.891905263086042e-06, "loss": 0.0008, "num_input_tokens_seen": 103147904, "step": 47825 }, { "epoch": 8.777757386676454, "grad_norm": 0.056078728288412094, "learning_rate": 6.891164018471493e-06, "loss": 0.0008, "num_input_tokens_seen": 103157184, "step": 47830 }, { "epoch": 8.778674986236007, "grad_norm": 0.021193580701947212, "learning_rate": 6.89042272535152e-06, "loss": 0.0005, "num_input_tokens_seen": 103167456, "step": 47835 }, { "epoch": 8.779592585795559, "grad_norm": 0.5776898264884949, "learning_rate": 6.889681383745135e-06, "loss": 0.1201, "num_input_tokens_seen": 103178176, "step": 47840 }, { "epoch": 8.78051018535511, "grad_norm": 0.011484545655548573, "learning_rate": 6.888939993671349e-06, "loss": 0.0716, "num_input_tokens_seen": 103189504, "step": 47845 }, { "epoch": 8.781427784914664, "grad_norm": 74.99846649169922, "learning_rate": 6.888198555149181e-06, "loss": 0.2874, "num_input_tokens_seen": 103199808, "step": 47850 }, { "epoch": 8.782345384474215, "grad_norm": 0.07704826444387436, "learning_rate": 6.887457068197645e-06, "loss": 0.0011, "num_input_tokens_seen": 103211456, "step": 47855 }, { "epoch": 8.783262984033767, "grad_norm": 0.00508051086217165, "learning_rate": 6.886715532835761e-06, "loss": 0.3691, "num_input_tokens_seen": 103222016, "step": 47860 }, { "epoch": 8.78418058359332, "grad_norm": 0.38686832785606384, "learning_rate": 6.8859739490825485e-06, "loss": 0.2426, "num_input_tokens_seen": 103232064, "step": 47865 }, { "epoch": 8.785098183152872, "grad_norm": 123.71098327636719, "learning_rate": 6.885232316957027e-06, "loss": 0.0907, "num_input_tokens_seen": 103243616, "step": 47870 }, { "epoch": 8.786015782712424, "grad_norm": 0.044379111379384995, "learning_rate": 6.884490636478217e-06, "loss": 0.0108, "num_input_tokens_seen": 103253600, "step": 47875 }, { "epoch": 8.786933382271977, "grad_norm": 0.028758516535162926, "learning_rate": 6.883748907665144e-06, "loss": 0.0013, "num_input_tokens_seen": 103265440, "step": 47880 }, { "epoch": 8.787850981831529, "grad_norm": 0.20008768141269684, "learning_rate": 6.883007130536832e-06, "loss": 0.1018, "num_input_tokens_seen": 103276928, "step": 47885 }, { "epoch": 8.78876858139108, "grad_norm": 10.582908630371094, "learning_rate": 6.882265305112303e-06, "loss": 0.0812, "num_input_tokens_seen": 103287552, "step": 47890 }, { "epoch": 8.789686180950634, "grad_norm": 0.28733691573143005, "learning_rate": 6.881523431410589e-06, "loss": 0.0008, "num_input_tokens_seen": 103297728, "step": 47895 }, { "epoch": 8.790603780510185, "grad_norm": 0.19218187034130096, "learning_rate": 6.880781509450714e-06, "loss": 0.001, "num_input_tokens_seen": 103308992, "step": 47900 }, { "epoch": 8.791521380069737, "grad_norm": 11.203485488891602, "learning_rate": 6.8800395392517074e-06, "loss": 0.1424, "num_input_tokens_seen": 103320032, "step": 47905 }, { "epoch": 8.79243897962929, "grad_norm": 0.03326598182320595, "learning_rate": 6.879297520832602e-06, "loss": 0.0005, "num_input_tokens_seen": 103330944, "step": 47910 }, { "epoch": 8.793356579188842, "grad_norm": 0.010256406851112843, "learning_rate": 6.878555454212426e-06, "loss": 0.0006, "num_input_tokens_seen": 103340800, "step": 47915 }, { "epoch": 8.794274178748394, "grad_norm": 0.006636835169047117, "learning_rate": 6.877813339410215e-06, "loss": 0.0026, "num_input_tokens_seen": 103352032, "step": 47920 }, { "epoch": 8.795191778307947, "grad_norm": 0.075353242456913, "learning_rate": 6.8770711764450026e-06, "loss": 0.0013, "num_input_tokens_seen": 103361920, "step": 47925 }, { "epoch": 8.796109377867499, "grad_norm": 0.02766459435224533, "learning_rate": 6.876328965335822e-06, "loss": 0.1144, "num_input_tokens_seen": 103373472, "step": 47930 }, { "epoch": 8.79702697742705, "grad_norm": 0.015403361059725285, "learning_rate": 6.875586706101713e-06, "loss": 0.1102, "num_input_tokens_seen": 103384800, "step": 47935 }, { "epoch": 8.797944576986604, "grad_norm": 0.011345275677740574, "learning_rate": 6.874844398761712e-06, "loss": 0.0834, "num_input_tokens_seen": 103395776, "step": 47940 }, { "epoch": 8.798862176546155, "grad_norm": 0.03918404132127762, "learning_rate": 6.874102043334858e-06, "loss": 0.0023, "num_input_tokens_seen": 103407680, "step": 47945 }, { "epoch": 8.799779776105707, "grad_norm": 0.0614481158554554, "learning_rate": 6.873359639840191e-06, "loss": 0.2959, "num_input_tokens_seen": 103417856, "step": 47950 }, { "epoch": 8.80069737566526, "grad_norm": 0.17172421514987946, "learning_rate": 6.872617188296753e-06, "loss": 0.1672, "num_input_tokens_seen": 103428544, "step": 47955 }, { "epoch": 8.801614975224812, "grad_norm": 0.1258048117160797, "learning_rate": 6.871874688723586e-06, "loss": 0.0014, "num_input_tokens_seen": 103440384, "step": 47960 }, { "epoch": 8.802532574784363, "grad_norm": 18.757274627685547, "learning_rate": 6.871132141139734e-06, "loss": 0.077, "num_input_tokens_seen": 103449888, "step": 47965 }, { "epoch": 8.803450174343917, "grad_norm": 0.031029557809233665, "learning_rate": 6.870389545564243e-06, "loss": 0.0631, "num_input_tokens_seen": 103460736, "step": 47970 }, { "epoch": 8.804367773903468, "grad_norm": 18.20490074157715, "learning_rate": 6.869646902016158e-06, "loss": 0.1142, "num_input_tokens_seen": 103470656, "step": 47975 }, { "epoch": 8.80528537346302, "grad_norm": 41.64778137207031, "learning_rate": 6.8689042105145296e-06, "loss": 0.1935, "num_input_tokens_seen": 103481600, "step": 47980 }, { "epoch": 8.806202973022573, "grad_norm": 0.08239804208278656, "learning_rate": 6.8681614710784025e-06, "loss": 0.1046, "num_input_tokens_seen": 103492032, "step": 47985 }, { "epoch": 8.807120572582125, "grad_norm": 0.0725257620215416, "learning_rate": 6.867418683726831e-06, "loss": 0.0006, "num_input_tokens_seen": 103501792, "step": 47990 }, { "epoch": 8.808038172141677, "grad_norm": 0.6061571836471558, "learning_rate": 6.8666758484788645e-06, "loss": 0.0062, "num_input_tokens_seen": 103512704, "step": 47995 }, { "epoch": 8.80895577170123, "grad_norm": 0.0201652143150568, "learning_rate": 6.865932965353555e-06, "loss": 0.002, "num_input_tokens_seen": 103523424, "step": 48000 }, { "epoch": 8.809873371260782, "grad_norm": 0.4346523880958557, "learning_rate": 6.865190034369956e-06, "loss": 0.1014, "num_input_tokens_seen": 103535680, "step": 48005 }, { "epoch": 8.810790970820333, "grad_norm": 0.03251512721180916, "learning_rate": 6.864447055547123e-06, "loss": 0.0004, "num_input_tokens_seen": 103546048, "step": 48010 }, { "epoch": 8.811708570379887, "grad_norm": 0.04132305458188057, "learning_rate": 6.8637040289041135e-06, "loss": 0.0002, "num_input_tokens_seen": 103557184, "step": 48015 }, { "epoch": 8.812626169939438, "grad_norm": 0.032609548419713974, "learning_rate": 6.862960954459985e-06, "loss": 0.0099, "num_input_tokens_seen": 103568032, "step": 48020 }, { "epoch": 8.81354376949899, "grad_norm": 260.9488830566406, "learning_rate": 6.862217832233795e-06, "loss": 0.1411, "num_input_tokens_seen": 103578880, "step": 48025 }, { "epoch": 8.814461369058543, "grad_norm": 3.817563772201538, "learning_rate": 6.861474662244604e-06, "loss": 0.0028, "num_input_tokens_seen": 103589920, "step": 48030 }, { "epoch": 8.815378968618095, "grad_norm": 142.9229736328125, "learning_rate": 6.860731444511471e-06, "loss": 0.0947, "num_input_tokens_seen": 103601280, "step": 48035 }, { "epoch": 8.816296568177647, "grad_norm": 0.022462399676442146, "learning_rate": 6.859988179053461e-06, "loss": 0.0057, "num_input_tokens_seen": 103612448, "step": 48040 }, { "epoch": 8.8172141677372, "grad_norm": 0.05402765050530434, "learning_rate": 6.859244865889639e-06, "loss": 0.0004, "num_input_tokens_seen": 103623488, "step": 48045 }, { "epoch": 8.818131767296752, "grad_norm": 0.01781795360147953, "learning_rate": 6.8585015050390655e-06, "loss": 0.1471, "num_input_tokens_seen": 103634112, "step": 48050 }, { "epoch": 8.819049366856303, "grad_norm": 0.1913606971502304, "learning_rate": 6.85775809652081e-06, "loss": 0.0122, "num_input_tokens_seen": 103645152, "step": 48055 }, { "epoch": 8.819966966415857, "grad_norm": 0.009372564032673836, "learning_rate": 6.85701464035394e-06, "loss": 0.1213, "num_input_tokens_seen": 103655904, "step": 48060 }, { "epoch": 8.820884565975408, "grad_norm": 100.9338150024414, "learning_rate": 6.85627113655752e-06, "loss": 0.129, "num_input_tokens_seen": 103666624, "step": 48065 }, { "epoch": 8.82180216553496, "grad_norm": 0.04366376996040344, "learning_rate": 6.855527585150623e-06, "loss": 0.0002, "num_input_tokens_seen": 103677344, "step": 48070 }, { "epoch": 8.822719765094513, "grad_norm": 0.005762679968029261, "learning_rate": 6.85478398615232e-06, "loss": 0.0001, "num_input_tokens_seen": 103688320, "step": 48075 }, { "epoch": 8.823637364654065, "grad_norm": 35.21656036376953, "learning_rate": 6.854040339581683e-06, "loss": 0.0952, "num_input_tokens_seen": 103699936, "step": 48080 }, { "epoch": 8.824554964213617, "grad_norm": 2.4283909797668457, "learning_rate": 6.853296645457782e-06, "loss": 0.0012, "num_input_tokens_seen": 103710624, "step": 48085 }, { "epoch": 8.82547256377317, "grad_norm": 0.010429253801703453, "learning_rate": 6.852552903799697e-06, "loss": 0.0003, "num_input_tokens_seen": 103721728, "step": 48090 }, { "epoch": 8.826390163332722, "grad_norm": 0.011459806933999062, "learning_rate": 6.8518091146264995e-06, "loss": 0.0012, "num_input_tokens_seen": 103730880, "step": 48095 }, { "epoch": 8.827307762892273, "grad_norm": 0.04509571194648743, "learning_rate": 6.851065277957268e-06, "loss": 0.0008, "num_input_tokens_seen": 103741120, "step": 48100 }, { "epoch": 8.828225362451827, "grad_norm": 0.001403836882673204, "learning_rate": 6.8503213938110835e-06, "loss": 0.0001, "num_input_tokens_seen": 103752736, "step": 48105 }, { "epoch": 8.829142962011378, "grad_norm": 0.01618296280503273, "learning_rate": 6.849577462207021e-06, "loss": 0.0002, "num_input_tokens_seen": 103763936, "step": 48110 }, { "epoch": 8.83006056157093, "grad_norm": 0.030502784997224808, "learning_rate": 6.848833483164164e-06, "loss": 0.0001, "num_input_tokens_seen": 103774208, "step": 48115 }, { "epoch": 8.830978161130483, "grad_norm": 35.510986328125, "learning_rate": 6.8480894567015936e-06, "loss": 0.0028, "num_input_tokens_seen": 103784768, "step": 48120 }, { "epoch": 8.831895760690035, "grad_norm": 0.0076196701265871525, "learning_rate": 6.847345382838392e-06, "loss": 0.1562, "num_input_tokens_seen": 103795136, "step": 48125 }, { "epoch": 8.832813360249586, "grad_norm": 0.007613764144480228, "learning_rate": 6.846601261593645e-06, "loss": 0.11, "num_input_tokens_seen": 103807104, "step": 48130 }, { "epoch": 8.83373095980914, "grad_norm": 0.008902130648493767, "learning_rate": 6.845857092986437e-06, "loss": 0.0012, "num_input_tokens_seen": 103818624, "step": 48135 }, { "epoch": 8.834648559368691, "grad_norm": 0.06213189661502838, "learning_rate": 6.8451128770358565e-06, "loss": 0.0002, "num_input_tokens_seen": 103829216, "step": 48140 }, { "epoch": 8.835566158928243, "grad_norm": 56.40318298339844, "learning_rate": 6.84436861376099e-06, "loss": 0.0288, "num_input_tokens_seen": 103839104, "step": 48145 }, { "epoch": 8.836483758487796, "grad_norm": 14.95421028137207, "learning_rate": 6.843624303180927e-06, "loss": 0.1574, "num_input_tokens_seen": 103850272, "step": 48150 }, { "epoch": 8.837401358047348, "grad_norm": 134.89071655273438, "learning_rate": 6.842879945314758e-06, "loss": 0.1127, "num_input_tokens_seen": 103861888, "step": 48155 }, { "epoch": 8.8383189576069, "grad_norm": 0.02371511235833168, "learning_rate": 6.842135540181575e-06, "loss": 0.0824, "num_input_tokens_seen": 103872992, "step": 48160 }, { "epoch": 8.839236557166453, "grad_norm": 0.014762400649487972, "learning_rate": 6.84139108780047e-06, "loss": 0.0019, "num_input_tokens_seen": 103884256, "step": 48165 }, { "epoch": 8.840154156726005, "grad_norm": 0.002704078797250986, "learning_rate": 6.840646588190539e-06, "loss": 0.1191, "num_input_tokens_seen": 103896032, "step": 48170 }, { "epoch": 8.841071756285556, "grad_norm": 803.8523559570312, "learning_rate": 6.839902041370873e-06, "loss": 0.0307, "num_input_tokens_seen": 103907936, "step": 48175 }, { "epoch": 8.84198935584511, "grad_norm": 139.9212188720703, "learning_rate": 6.839157447360573e-06, "loss": 0.2266, "num_input_tokens_seen": 103917600, "step": 48180 }, { "epoch": 8.842906955404661, "grad_norm": 86.5663833618164, "learning_rate": 6.838412806178734e-06, "loss": 0.1418, "num_input_tokens_seen": 103927264, "step": 48185 }, { "epoch": 8.843824554964213, "grad_norm": 0.008406028151512146, "learning_rate": 6.837668117844456e-06, "loss": 0.0004, "num_input_tokens_seen": 103937632, "step": 48190 }, { "epoch": 8.844742154523766, "grad_norm": 3.6860246658325195, "learning_rate": 6.836923382376839e-06, "loss": 0.2036, "num_input_tokens_seen": 103948000, "step": 48195 }, { "epoch": 8.845659754083318, "grad_norm": 0.019301846623420715, "learning_rate": 6.8361785997949835e-06, "loss": 0.0003, "num_input_tokens_seen": 103958624, "step": 48200 }, { "epoch": 8.84657735364287, "grad_norm": 0.052693262696266174, "learning_rate": 6.835433770117994e-06, "loss": 0.0005, "num_input_tokens_seen": 103969088, "step": 48205 }, { "epoch": 8.847494953202423, "grad_norm": 0.007313234731554985, "learning_rate": 6.834688893364972e-06, "loss": 0.0001, "num_input_tokens_seen": 103980768, "step": 48210 }, { "epoch": 8.848412552761975, "grad_norm": 0.1037658154964447, "learning_rate": 6.833943969555025e-06, "loss": 0.0046, "num_input_tokens_seen": 103991616, "step": 48215 }, { "epoch": 8.849330152321526, "grad_norm": 0.016656741499900818, "learning_rate": 6.833198998707256e-06, "loss": 0.1103, "num_input_tokens_seen": 104002528, "step": 48220 }, { "epoch": 8.85024775188108, "grad_norm": 0.2751772403717041, "learning_rate": 6.832453980840774e-06, "loss": 0.0005, "num_input_tokens_seen": 104013280, "step": 48225 }, { "epoch": 8.851165351440631, "grad_norm": 0.015589234419167042, "learning_rate": 6.831708915974689e-06, "loss": 0.144, "num_input_tokens_seen": 104024256, "step": 48230 }, { "epoch": 8.852082951000183, "grad_norm": 0.17985406517982483, "learning_rate": 6.830963804128107e-06, "loss": 0.0629, "num_input_tokens_seen": 104034688, "step": 48235 }, { "epoch": 8.853000550559736, "grad_norm": 26.020973205566406, "learning_rate": 6.830218645320142e-06, "loss": 0.2264, "num_input_tokens_seen": 104045920, "step": 48240 }, { "epoch": 8.853918150119288, "grad_norm": 0.35647377371788025, "learning_rate": 6.829473439569906e-06, "loss": 0.0242, "num_input_tokens_seen": 104056480, "step": 48245 }, { "epoch": 8.85483574967884, "grad_norm": 0.02991000935435295, "learning_rate": 6.8287281868965114e-06, "loss": 0.1838, "num_input_tokens_seen": 104066080, "step": 48250 }, { "epoch": 8.855753349238393, "grad_norm": 0.020365672186017036, "learning_rate": 6.827982887319072e-06, "loss": 0.1233, "num_input_tokens_seen": 104076640, "step": 48255 }, { "epoch": 8.856670948797944, "grad_norm": 0.13170616328716278, "learning_rate": 6.827237540856705e-06, "loss": 0.0006, "num_input_tokens_seen": 104088480, "step": 48260 }, { "epoch": 8.857588548357496, "grad_norm": 0.032631684094667435, "learning_rate": 6.8264921475285284e-06, "loss": 0.2322, "num_input_tokens_seen": 104097696, "step": 48265 }, { "epoch": 8.85850614791705, "grad_norm": 0.007257823832333088, "learning_rate": 6.825746707353659e-06, "loss": 0.0023, "num_input_tokens_seen": 104108672, "step": 48270 }, { "epoch": 8.859423747476601, "grad_norm": 3.0329675674438477, "learning_rate": 6.825001220351215e-06, "loss": 0.1791, "num_input_tokens_seen": 104119104, "step": 48275 }, { "epoch": 8.860341347036153, "grad_norm": 0.03251073881983757, "learning_rate": 6.8242556865403185e-06, "loss": 0.0078, "num_input_tokens_seen": 104129248, "step": 48280 }, { "epoch": 8.861258946595706, "grad_norm": 0.06387138366699219, "learning_rate": 6.823510105940091e-06, "loss": 0.2881, "num_input_tokens_seen": 104140032, "step": 48285 }, { "epoch": 8.862176546155258, "grad_norm": 0.07806389033794403, "learning_rate": 6.822764478569655e-06, "loss": 0.0021, "num_input_tokens_seen": 104150656, "step": 48290 }, { "epoch": 8.86309414571481, "grad_norm": 0.7803459763526917, "learning_rate": 6.8220188044481364e-06, "loss": 0.1207, "num_input_tokens_seen": 104161120, "step": 48295 }, { "epoch": 8.864011745274363, "grad_norm": 30.183399200439453, "learning_rate": 6.82127308359466e-06, "loss": 0.3255, "num_input_tokens_seen": 104172704, "step": 48300 }, { "epoch": 8.864929344833914, "grad_norm": 0.055688995867967606, "learning_rate": 6.820527316028349e-06, "loss": 0.1131, "num_input_tokens_seen": 104183392, "step": 48305 }, { "epoch": 8.865846944393466, "grad_norm": 0.3880750238895416, "learning_rate": 6.819781501768334e-06, "loss": 0.1508, "num_input_tokens_seen": 104194528, "step": 48310 }, { "epoch": 8.86676454395302, "grad_norm": 0.036646462976932526, "learning_rate": 6.819035640833746e-06, "loss": 0.0652, "num_input_tokens_seen": 104205376, "step": 48315 }, { "epoch": 8.867682143512571, "grad_norm": 0.004709762986749411, "learning_rate": 6.818289733243713e-06, "loss": 0.0985, "num_input_tokens_seen": 104216224, "step": 48320 }, { "epoch": 8.868599743072123, "grad_norm": 0.8604668974876404, "learning_rate": 6.817543779017365e-06, "loss": 0.0019, "num_input_tokens_seen": 104227808, "step": 48325 }, { "epoch": 8.869517342631676, "grad_norm": 2.2431342601776123, "learning_rate": 6.816797778173836e-06, "loss": 0.1125, "num_input_tokens_seen": 104238656, "step": 48330 }, { "epoch": 8.870434942191228, "grad_norm": 2.608731985092163, "learning_rate": 6.816051730732261e-06, "loss": 0.0015, "num_input_tokens_seen": 104249344, "step": 48335 }, { "epoch": 8.87135254175078, "grad_norm": 0.20264370739459991, "learning_rate": 6.815305636711773e-06, "loss": 0.0014, "num_input_tokens_seen": 104260672, "step": 48340 }, { "epoch": 8.872270141310333, "grad_norm": 0.09114397317171097, "learning_rate": 6.814559496131509e-06, "loss": 0.0029, "num_input_tokens_seen": 104271552, "step": 48345 }, { "epoch": 8.873187740869884, "grad_norm": 0.02628297731280327, "learning_rate": 6.813813309010606e-06, "loss": 0.1538, "num_input_tokens_seen": 104281408, "step": 48350 }, { "epoch": 8.874105340429436, "grad_norm": 56.317623138427734, "learning_rate": 6.813067075368203e-06, "loss": 0.011, "num_input_tokens_seen": 104292416, "step": 48355 }, { "epoch": 8.87502293998899, "grad_norm": 0.032101113349199295, "learning_rate": 6.812320795223439e-06, "loss": 0.0005, "num_input_tokens_seen": 104302592, "step": 48360 }, { "epoch": 8.875940539548541, "grad_norm": 0.007946210913360119, "learning_rate": 6.811574468595457e-06, "loss": 0.0001, "num_input_tokens_seen": 104313856, "step": 48365 }, { "epoch": 8.876858139108093, "grad_norm": 0.01845483109354973, "learning_rate": 6.810828095503396e-06, "loss": 0.0009, "num_input_tokens_seen": 104323968, "step": 48370 }, { "epoch": 8.877775738667646, "grad_norm": 0.10540373623371124, "learning_rate": 6.810081675966403e-06, "loss": 0.001, "num_input_tokens_seen": 104335232, "step": 48375 }, { "epoch": 8.878693338227198, "grad_norm": 0.010378547944128513, "learning_rate": 6.809335210003618e-06, "loss": 0.0981, "num_input_tokens_seen": 104347360, "step": 48380 }, { "epoch": 8.87961093778675, "grad_norm": 0.0054688663221895695, "learning_rate": 6.80858869763419e-06, "loss": 0.0005, "num_input_tokens_seen": 104357632, "step": 48385 }, { "epoch": 8.880528537346303, "grad_norm": 0.16394422948360443, "learning_rate": 6.807842138877266e-06, "loss": 0.0006, "num_input_tokens_seen": 104368832, "step": 48390 }, { "epoch": 8.881446136905854, "grad_norm": 0.00908702053129673, "learning_rate": 6.807095533751991e-06, "loss": 0.1409, "num_input_tokens_seen": 104380512, "step": 48395 }, { "epoch": 8.882363736465406, "grad_norm": 0.057089101523160934, "learning_rate": 6.806348882277517e-06, "loss": 0.0151, "num_input_tokens_seen": 104391968, "step": 48400 }, { "epoch": 8.88328133602496, "grad_norm": 29.178443908691406, "learning_rate": 6.805602184472993e-06, "loss": 0.1566, "num_input_tokens_seen": 104403328, "step": 48405 }, { "epoch": 8.88419893558451, "grad_norm": 1.8673475980758667, "learning_rate": 6.804855440357574e-06, "loss": 0.0012, "num_input_tokens_seen": 104414016, "step": 48410 }, { "epoch": 8.885116535144062, "grad_norm": 2.392307996749878, "learning_rate": 6.8041086499504075e-06, "loss": 0.0011, "num_input_tokens_seen": 104424256, "step": 48415 }, { "epoch": 8.886034134703616, "grad_norm": 0.005736049264669418, "learning_rate": 6.803361813270651e-06, "loss": 0.0001, "num_input_tokens_seen": 104433952, "step": 48420 }, { "epoch": 8.886951734263167, "grad_norm": 0.011790896765887737, "learning_rate": 6.80261493033746e-06, "loss": 0.0002, "num_input_tokens_seen": 104445120, "step": 48425 }, { "epoch": 8.887869333822719, "grad_norm": 0.09769908338785172, "learning_rate": 6.801868001169989e-06, "loss": 0.2203, "num_input_tokens_seen": 104455552, "step": 48430 }, { "epoch": 8.888786933382272, "grad_norm": 0.015644676983356476, "learning_rate": 6.801121025787396e-06, "loss": 0.1195, "num_input_tokens_seen": 104466784, "step": 48435 }, { "epoch": 8.889704532941824, "grad_norm": 0.0038897639606148005, "learning_rate": 6.8003740042088405e-06, "loss": 0.0377, "num_input_tokens_seen": 104478304, "step": 48440 }, { "epoch": 8.890622132501376, "grad_norm": 0.15261238813400269, "learning_rate": 6.799626936453483e-06, "loss": 0.0003, "num_input_tokens_seen": 104488608, "step": 48445 }, { "epoch": 8.891539732060929, "grad_norm": 0.0084541579708457, "learning_rate": 6.7988798225404825e-06, "loss": 0.0019, "num_input_tokens_seen": 104498752, "step": 48450 }, { "epoch": 8.89245733162048, "grad_norm": 13.735033988952637, "learning_rate": 6.7981326624890034e-06, "loss": 0.1378, "num_input_tokens_seen": 104510016, "step": 48455 }, { "epoch": 8.893374931180032, "grad_norm": 27.890766143798828, "learning_rate": 6.797385456318208e-06, "loss": 0.2452, "num_input_tokens_seen": 104520640, "step": 48460 }, { "epoch": 8.894292530739586, "grad_norm": 0.04283033683896065, "learning_rate": 6.7966382040472614e-06, "loss": 0.0706, "num_input_tokens_seen": 104531040, "step": 48465 }, { "epoch": 8.895210130299137, "grad_norm": 0.23662099242210388, "learning_rate": 6.795890905695329e-06, "loss": 0.1837, "num_input_tokens_seen": 104540384, "step": 48470 }, { "epoch": 8.896127729858689, "grad_norm": 11.576924324035645, "learning_rate": 6.795143561281579e-06, "loss": 0.0061, "num_input_tokens_seen": 104551968, "step": 48475 }, { "epoch": 8.897045329418242, "grad_norm": 0.4945444166660309, "learning_rate": 6.794396170825179e-06, "loss": 0.0014, "num_input_tokens_seen": 104563424, "step": 48480 }, { "epoch": 8.897962928977794, "grad_norm": 0.022587748244404793, "learning_rate": 6.793648734345298e-06, "loss": 0.0024, "num_input_tokens_seen": 104575104, "step": 48485 }, { "epoch": 8.898880528537346, "grad_norm": 44.7316780090332, "learning_rate": 6.7929012518611074e-06, "loss": 0.0607, "num_input_tokens_seen": 104585568, "step": 48490 }, { "epoch": 8.899798128096899, "grad_norm": 35.659454345703125, "learning_rate": 6.792153723391776e-06, "loss": 0.135, "num_input_tokens_seen": 104597184, "step": 48495 }, { "epoch": 8.90071572765645, "grad_norm": 0.02040257677435875, "learning_rate": 6.791406148956482e-06, "loss": 0.0916, "num_input_tokens_seen": 104606912, "step": 48500 }, { "epoch": 8.901633327216002, "grad_norm": 0.06067200005054474, "learning_rate": 6.7906585285743954e-06, "loss": 0.0438, "num_input_tokens_seen": 104617600, "step": 48505 }, { "epoch": 8.902550926775556, "grad_norm": 0.13968020677566528, "learning_rate": 6.789910862264693e-06, "loss": 0.0006, "num_input_tokens_seen": 104628416, "step": 48510 }, { "epoch": 8.903468526335107, "grad_norm": 0.14743760228157043, "learning_rate": 6.789163150046552e-06, "loss": 0.0003, "num_input_tokens_seen": 104639232, "step": 48515 }, { "epoch": 8.904386125894659, "grad_norm": 1.049169659614563, "learning_rate": 6.7884153919391475e-06, "loss": 0.0034, "num_input_tokens_seen": 104650464, "step": 48520 }, { "epoch": 8.905303725454212, "grad_norm": 0.004992072004824877, "learning_rate": 6.787667587961662e-06, "loss": 0.0002, "num_input_tokens_seen": 104662304, "step": 48525 }, { "epoch": 8.906221325013764, "grad_norm": 0.012218089774250984, "learning_rate": 6.786919738133271e-06, "loss": 0.0004, "num_input_tokens_seen": 104673344, "step": 48530 }, { "epoch": 8.907138924573315, "grad_norm": 0.06861236691474915, "learning_rate": 6.786171842473159e-06, "loss": 0.0003, "num_input_tokens_seen": 104682912, "step": 48535 }, { "epoch": 8.908056524132869, "grad_norm": 0.013695264235138893, "learning_rate": 6.785423901000509e-06, "loss": 0.0002, "num_input_tokens_seen": 104692576, "step": 48540 }, { "epoch": 8.90897412369242, "grad_norm": 0.6425543427467346, "learning_rate": 6.784675913734502e-06, "loss": 0.0005, "num_input_tokens_seen": 104704096, "step": 48545 }, { "epoch": 8.909891723251972, "grad_norm": 0.045287422835826874, "learning_rate": 6.783927880694323e-06, "loss": 0.1942, "num_input_tokens_seen": 104714816, "step": 48550 }, { "epoch": 8.910809322811525, "grad_norm": 0.0032216310501098633, "learning_rate": 6.78317980189916e-06, "loss": 0.1268, "num_input_tokens_seen": 104725728, "step": 48555 }, { "epoch": 8.911726922371077, "grad_norm": 0.19690950214862823, "learning_rate": 6.7824316773681985e-06, "loss": 0.0018, "num_input_tokens_seen": 104736224, "step": 48560 }, { "epoch": 8.912644521930629, "grad_norm": 13.721955299377441, "learning_rate": 6.781683507120627e-06, "loss": 0.2297, "num_input_tokens_seen": 104746880, "step": 48565 }, { "epoch": 8.913562121490182, "grad_norm": 0.0209162849932909, "learning_rate": 6.780935291175636e-06, "loss": 0.0005, "num_input_tokens_seen": 104759040, "step": 48570 }, { "epoch": 8.914479721049734, "grad_norm": 0.3040282428264618, "learning_rate": 6.780187029552412e-06, "loss": 0.0006, "num_input_tokens_seen": 104770336, "step": 48575 }, { "epoch": 8.915397320609285, "grad_norm": 0.11355426907539368, "learning_rate": 6.7794387222701505e-06, "loss": 0.0005, "num_input_tokens_seen": 104780288, "step": 48580 }, { "epoch": 8.916314920168839, "grad_norm": 0.015233051963150501, "learning_rate": 6.778690369348047e-06, "loss": 0.3129, "num_input_tokens_seen": 104792384, "step": 48585 }, { "epoch": 8.91723251972839, "grad_norm": 0.06169014424085617, "learning_rate": 6.777941970805288e-06, "loss": 0.1196, "num_input_tokens_seen": 104801504, "step": 48590 }, { "epoch": 8.918150119287942, "grad_norm": 0.06261957436800003, "learning_rate": 6.777193526661077e-06, "loss": 0.0004, "num_input_tokens_seen": 104811968, "step": 48595 }, { "epoch": 8.919067718847495, "grad_norm": 0.013056081719696522, "learning_rate": 6.776445036934605e-06, "loss": 0.0005, "num_input_tokens_seen": 104823136, "step": 48600 }, { "epoch": 8.919985318407047, "grad_norm": 0.003086210461333394, "learning_rate": 6.775696501645069e-06, "loss": 0.1047, "num_input_tokens_seen": 104833152, "step": 48605 }, { "epoch": 8.920902917966599, "grad_norm": 0.19212155044078827, "learning_rate": 6.774947920811672e-06, "loss": 0.0013, "num_input_tokens_seen": 104843712, "step": 48610 }, { "epoch": 8.921820517526152, "grad_norm": 0.08198586106300354, "learning_rate": 6.77419929445361e-06, "loss": 0.0007, "num_input_tokens_seen": 104855072, "step": 48615 }, { "epoch": 8.922738117085704, "grad_norm": 0.021384865045547485, "learning_rate": 6.7734506225900875e-06, "loss": 0.0007, "num_input_tokens_seen": 104865280, "step": 48620 }, { "epoch": 8.923655716645255, "grad_norm": 0.007923031225800514, "learning_rate": 6.772701905240304e-06, "loss": 0.0005, "num_input_tokens_seen": 104876256, "step": 48625 }, { "epoch": 8.924573316204809, "grad_norm": 0.004244987387210131, "learning_rate": 6.771953142423464e-06, "loss": 0.1288, "num_input_tokens_seen": 104886592, "step": 48630 }, { "epoch": 8.92549091576436, "grad_norm": 0.3950214684009552, "learning_rate": 6.771204334158773e-06, "loss": 0.1147, "num_input_tokens_seen": 104896384, "step": 48635 }, { "epoch": 8.926408515323912, "grad_norm": 0.033155180513858795, "learning_rate": 6.770455480465435e-06, "loss": 0.0076, "num_input_tokens_seen": 104907968, "step": 48640 }, { "epoch": 8.927326114883465, "grad_norm": 0.09689107537269592, "learning_rate": 6.7697065813626595e-06, "loss": 0.0006, "num_input_tokens_seen": 104918912, "step": 48645 }, { "epoch": 8.928243714443017, "grad_norm": 14.147918701171875, "learning_rate": 6.768957636869652e-06, "loss": 0.1534, "num_input_tokens_seen": 104929312, "step": 48650 }, { "epoch": 8.929161314002569, "grad_norm": 0.020296087488532066, "learning_rate": 6.768208647005622e-06, "loss": 0.0001, "num_input_tokens_seen": 104940832, "step": 48655 }, { "epoch": 8.930078913562122, "grad_norm": 184.11463928222656, "learning_rate": 6.767459611789782e-06, "loss": 0.0749, "num_input_tokens_seen": 104951360, "step": 48660 }, { "epoch": 8.930996513121674, "grad_norm": 0.17263035476207733, "learning_rate": 6.766710531241341e-06, "loss": 0.0013, "num_input_tokens_seen": 104961568, "step": 48665 }, { "epoch": 8.931914112681225, "grad_norm": 0.005467534065246582, "learning_rate": 6.7659614053795146e-06, "loss": 0.0946, "num_input_tokens_seen": 104973888, "step": 48670 }, { "epoch": 8.932831712240779, "grad_norm": 0.1761419028043747, "learning_rate": 6.7652122342235135e-06, "loss": 0.0083, "num_input_tokens_seen": 104984128, "step": 48675 }, { "epoch": 8.93374931180033, "grad_norm": 0.07770970463752747, "learning_rate": 6.764463017792555e-06, "loss": 0.0004, "num_input_tokens_seen": 104994432, "step": 48680 }, { "epoch": 8.934666911359882, "grad_norm": 0.27276089787483215, "learning_rate": 6.763713756105855e-06, "loss": 0.0011, "num_input_tokens_seen": 105005888, "step": 48685 }, { "epoch": 8.935584510919435, "grad_norm": 78.68180084228516, "learning_rate": 6.762964449182631e-06, "loss": 0.0986, "num_input_tokens_seen": 105018048, "step": 48690 }, { "epoch": 8.936502110478987, "grad_norm": 0.002882555592805147, "learning_rate": 6.762215097042101e-06, "loss": 0.0033, "num_input_tokens_seen": 105028608, "step": 48695 }, { "epoch": 8.937419710038538, "grad_norm": 17.07032012939453, "learning_rate": 6.761465699703485e-06, "loss": 0.251, "num_input_tokens_seen": 105038016, "step": 48700 }, { "epoch": 8.938337309598092, "grad_norm": 0.009355878457427025, "learning_rate": 6.760716257186004e-06, "loss": 0.0004, "num_input_tokens_seen": 105049344, "step": 48705 }, { "epoch": 8.939254909157643, "grad_norm": 0.002559797605499625, "learning_rate": 6.75996676950888e-06, "loss": 0.0002, "num_input_tokens_seen": 105060512, "step": 48710 }, { "epoch": 8.940172508717195, "grad_norm": 0.0017899730009958148, "learning_rate": 6.759217236691335e-06, "loss": 0.0001, "num_input_tokens_seen": 105069984, "step": 48715 }, { "epoch": 8.941090108276748, "grad_norm": 0.013724087737500668, "learning_rate": 6.7584676587525955e-06, "loss": 0.0537, "num_input_tokens_seen": 105081984, "step": 48720 }, { "epoch": 8.9420077078363, "grad_norm": 0.056241195648908615, "learning_rate": 6.757718035711885e-06, "loss": 0.0003, "num_input_tokens_seen": 105093184, "step": 48725 }, { "epoch": 8.942925307395852, "grad_norm": 0.007872337475419044, "learning_rate": 6.756968367588432e-06, "loss": 0.0917, "num_input_tokens_seen": 105104576, "step": 48730 }, { "epoch": 8.943842906955405, "grad_norm": 0.031102783977985382, "learning_rate": 6.756218654401463e-06, "loss": 0.0627, "num_input_tokens_seen": 105114464, "step": 48735 }, { "epoch": 8.944760506514957, "grad_norm": 0.05132296308875084, "learning_rate": 6.755468896170207e-06, "loss": 0.0014, "num_input_tokens_seen": 105124768, "step": 48740 }, { "epoch": 8.945678106074508, "grad_norm": 0.0784434974193573, "learning_rate": 6.754719092913895e-06, "loss": 0.0624, "num_input_tokens_seen": 105136160, "step": 48745 }, { "epoch": 8.946595705634062, "grad_norm": 3.378864288330078, "learning_rate": 6.753969244651757e-06, "loss": 0.0032, "num_input_tokens_seen": 105147456, "step": 48750 }, { "epoch": 8.947513305193613, "grad_norm": 0.3633969724178314, "learning_rate": 6.753219351403027e-06, "loss": 0.001, "num_input_tokens_seen": 105159072, "step": 48755 }, { "epoch": 8.948430904753165, "grad_norm": 0.12486347556114197, "learning_rate": 6.752469413186938e-06, "loss": 0.0001, "num_input_tokens_seen": 105170720, "step": 48760 }, { "epoch": 8.949348504312718, "grad_norm": 0.0809134840965271, "learning_rate": 6.751719430022724e-06, "loss": 0.1783, "num_input_tokens_seen": 105181056, "step": 48765 }, { "epoch": 8.95026610387227, "grad_norm": 18.624378204345703, "learning_rate": 6.7509694019296226e-06, "loss": 0.276, "num_input_tokens_seen": 105193056, "step": 48770 }, { "epoch": 8.951183703431822, "grad_norm": 24.428356170654297, "learning_rate": 6.750219328926868e-06, "loss": 0.2449, "num_input_tokens_seen": 105203232, "step": 48775 }, { "epoch": 8.952101302991375, "grad_norm": 0.17923520505428314, "learning_rate": 6.749469211033702e-06, "loss": 0.0005, "num_input_tokens_seen": 105214400, "step": 48780 }, { "epoch": 8.953018902550927, "grad_norm": 0.22713258862495422, "learning_rate": 6.748719048269362e-06, "loss": 0.0004, "num_input_tokens_seen": 105225664, "step": 48785 }, { "epoch": 8.953936502110478, "grad_norm": 0.011950930580496788, "learning_rate": 6.747968840653087e-06, "loss": 0.0005, "num_input_tokens_seen": 105236992, "step": 48790 }, { "epoch": 8.954854101670032, "grad_norm": 0.023963235318660736, "learning_rate": 6.747218588204123e-06, "loss": 0.0002, "num_input_tokens_seen": 105248704, "step": 48795 }, { "epoch": 8.955771701229583, "grad_norm": 0.001030914718285203, "learning_rate": 6.746468290941708e-06, "loss": 0.0002, "num_input_tokens_seen": 105259552, "step": 48800 }, { "epoch": 8.956689300789135, "grad_norm": 0.01033838465809822, "learning_rate": 6.74571794888509e-06, "loss": 0.0887, "num_input_tokens_seen": 105269376, "step": 48805 }, { "epoch": 8.957606900348688, "grad_norm": 0.004445170983672142, "learning_rate": 6.744967562053512e-06, "loss": 0.1132, "num_input_tokens_seen": 105280480, "step": 48810 }, { "epoch": 8.95852449990824, "grad_norm": 0.3047710359096527, "learning_rate": 6.744217130466219e-06, "loss": 0.0652, "num_input_tokens_seen": 105291008, "step": 48815 }, { "epoch": 8.959442099467791, "grad_norm": 0.00783238559961319, "learning_rate": 6.743466654142461e-06, "loss": 0.0003, "num_input_tokens_seen": 105299968, "step": 48820 }, { "epoch": 8.960359699027345, "grad_norm": 0.037843283265829086, "learning_rate": 6.7427161331014845e-06, "loss": 0.0151, "num_input_tokens_seen": 105309056, "step": 48825 }, { "epoch": 8.961277298586896, "grad_norm": 0.01636424846947193, "learning_rate": 6.74196556736254e-06, "loss": 0.147, "num_input_tokens_seen": 105319712, "step": 48830 }, { "epoch": 8.962194898146448, "grad_norm": 0.8382046818733215, "learning_rate": 6.741214956944879e-06, "loss": 0.0008, "num_input_tokens_seen": 105330976, "step": 48835 }, { "epoch": 8.963112497706001, "grad_norm": 0.02655450440943241, "learning_rate": 6.740464301867753e-06, "loss": 0.008, "num_input_tokens_seen": 105341344, "step": 48840 }, { "epoch": 8.964030097265553, "grad_norm": 1.980405330657959, "learning_rate": 6.739713602150416e-06, "loss": 0.0032, "num_input_tokens_seen": 105352896, "step": 48845 }, { "epoch": 8.964947696825105, "grad_norm": 0.10602791607379913, "learning_rate": 6.73896285781212e-06, "loss": 0.0003, "num_input_tokens_seen": 105364352, "step": 48850 }, { "epoch": 8.965865296384658, "grad_norm": 0.015616461634635925, "learning_rate": 6.738212068872123e-06, "loss": 0.0004, "num_input_tokens_seen": 105375296, "step": 48855 }, { "epoch": 8.96678289594421, "grad_norm": 0.022003885358572006, "learning_rate": 6.73746123534968e-06, "loss": 0.1695, "num_input_tokens_seen": 105386656, "step": 48860 }, { "epoch": 8.967700495503761, "grad_norm": 0.16889402270317078, "learning_rate": 6.736710357264049e-06, "loss": 0.0891, "num_input_tokens_seen": 105398688, "step": 48865 }, { "epoch": 8.968618095063315, "grad_norm": 0.01289238128811121, "learning_rate": 6.735959434634489e-06, "loss": 0.3284, "num_input_tokens_seen": 105409440, "step": 48870 }, { "epoch": 8.969535694622866, "grad_norm": 0.023055488243699074, "learning_rate": 6.73520846748026e-06, "loss": 0.0013, "num_input_tokens_seen": 105421088, "step": 48875 }, { "epoch": 8.970453294182418, "grad_norm": 0.16497814655303955, "learning_rate": 6.734457455820623e-06, "loss": 0.0006, "num_input_tokens_seen": 105430912, "step": 48880 }, { "epoch": 8.971370893741971, "grad_norm": 0.01943732053041458, "learning_rate": 6.733706399674841e-06, "loss": 0.1585, "num_input_tokens_seen": 105440896, "step": 48885 }, { "epoch": 8.972288493301523, "grad_norm": 0.0008194898255169392, "learning_rate": 6.732955299062176e-06, "loss": 0.0012, "num_input_tokens_seen": 105450688, "step": 48890 }, { "epoch": 8.973206092861075, "grad_norm": 0.004060806706547737, "learning_rate": 6.732204154001895e-06, "loss": 0.0022, "num_input_tokens_seen": 105461696, "step": 48895 }, { "epoch": 8.974123692420628, "grad_norm": 0.009031455963850021, "learning_rate": 6.7314529645132595e-06, "loss": 0.0038, "num_input_tokens_seen": 105473536, "step": 48900 }, { "epoch": 8.97504129198018, "grad_norm": 0.06294060498476028, "learning_rate": 6.730701730615541e-06, "loss": 0.1286, "num_input_tokens_seen": 105484640, "step": 48905 }, { "epoch": 8.975958891539731, "grad_norm": 0.002581185894086957, "learning_rate": 6.729950452328004e-06, "loss": 0.001, "num_input_tokens_seen": 105496448, "step": 48910 }, { "epoch": 8.976876491099285, "grad_norm": 0.05602819100022316, "learning_rate": 6.729199129669921e-06, "loss": 0.0003, "num_input_tokens_seen": 105507072, "step": 48915 }, { "epoch": 8.977794090658836, "grad_norm": 0.4834364354610443, "learning_rate": 6.72844776266056e-06, "loss": 0.0229, "num_input_tokens_seen": 105518720, "step": 48920 }, { "epoch": 8.978711690218388, "grad_norm": 0.0039586881175637245, "learning_rate": 6.727696351319192e-06, "loss": 0.0002, "num_input_tokens_seen": 105528960, "step": 48925 }, { "epoch": 8.979629289777941, "grad_norm": 0.004749980289489031, "learning_rate": 6.726944895665091e-06, "loss": 0.1909, "num_input_tokens_seen": 105539072, "step": 48930 }, { "epoch": 8.980546889337493, "grad_norm": 0.021686339750885963, "learning_rate": 6.726193395717528e-06, "loss": 0.1932, "num_input_tokens_seen": 105549344, "step": 48935 }, { "epoch": 8.981464488897045, "grad_norm": 0.028038764372467995, "learning_rate": 6.725441851495782e-06, "loss": 0.1285, "num_input_tokens_seen": 105559392, "step": 48940 }, { "epoch": 8.982382088456598, "grad_norm": 0.009661920368671417, "learning_rate": 6.724690263019126e-06, "loss": 0.1379, "num_input_tokens_seen": 105569056, "step": 48945 }, { "epoch": 8.98329968801615, "grad_norm": 0.014246253296732903, "learning_rate": 6.723938630306837e-06, "loss": 0.1393, "num_input_tokens_seen": 105580480, "step": 48950 }, { "epoch": 8.984217287575701, "grad_norm": 0.002625095658004284, "learning_rate": 6.723186953378195e-06, "loss": 0.1497, "num_input_tokens_seen": 105590880, "step": 48955 }, { "epoch": 8.985134887135255, "grad_norm": 0.058667078614234924, "learning_rate": 6.722435232252478e-06, "loss": 0.0028, "num_input_tokens_seen": 105601408, "step": 48960 }, { "epoch": 8.986052486694806, "grad_norm": 0.0054330178536474705, "learning_rate": 6.721683466948966e-06, "loss": 0.0008, "num_input_tokens_seen": 105611872, "step": 48965 }, { "epoch": 8.986970086254358, "grad_norm": 0.05221811681985855, "learning_rate": 6.720931657486942e-06, "loss": 0.0003, "num_input_tokens_seen": 105623360, "step": 48970 }, { "epoch": 8.987887685813911, "grad_norm": 0.40004318952560425, "learning_rate": 6.720179803885688e-06, "loss": 0.001, "num_input_tokens_seen": 105634528, "step": 48975 }, { "epoch": 8.988805285373463, "grad_norm": 0.005673873238265514, "learning_rate": 6.7194279061644885e-06, "loss": 0.1259, "num_input_tokens_seen": 105644576, "step": 48980 }, { "epoch": 8.989722884933014, "grad_norm": 0.0975261703133583, "learning_rate": 6.718675964342628e-06, "loss": 0.0009, "num_input_tokens_seen": 105655680, "step": 48985 }, { "epoch": 8.990640484492568, "grad_norm": 35.06182861328125, "learning_rate": 6.717923978439393e-06, "loss": 0.2281, "num_input_tokens_seen": 105665760, "step": 48990 }, { "epoch": 8.99155808405212, "grad_norm": 0.00804731622338295, "learning_rate": 6.717171948474071e-06, "loss": 0.1507, "num_input_tokens_seen": 105676800, "step": 48995 }, { "epoch": 8.992475683611671, "grad_norm": 0.007564689964056015, "learning_rate": 6.716419874465948e-06, "loss": 0.0007, "num_input_tokens_seen": 105688544, "step": 49000 }, { "epoch": 8.993393283171224, "grad_norm": 0.0024669880513101816, "learning_rate": 6.715667756434316e-06, "loss": 0.0003, "num_input_tokens_seen": 105698720, "step": 49005 }, { "epoch": 8.994310882730776, "grad_norm": 0.024898288771510124, "learning_rate": 6.714915594398466e-06, "loss": 0.1608, "num_input_tokens_seen": 105711072, "step": 49010 }, { "epoch": 8.995228482290328, "grad_norm": 0.05840347334742546, "learning_rate": 6.714163388377689e-06, "loss": 0.0003, "num_input_tokens_seen": 105722304, "step": 49015 }, { "epoch": 8.996146081849881, "grad_norm": 0.008922593668103218, "learning_rate": 6.713411138391277e-06, "loss": 0.1253, "num_input_tokens_seen": 105733280, "step": 49020 }, { "epoch": 8.997063681409433, "grad_norm": 0.01362109836190939, "learning_rate": 6.712658844458526e-06, "loss": 0.0013, "num_input_tokens_seen": 105743744, "step": 49025 }, { "epoch": 8.997981280968984, "grad_norm": 0.005062194541096687, "learning_rate": 6.711906506598728e-06, "loss": 0.1386, "num_input_tokens_seen": 105753376, "step": 49030 }, { "epoch": 8.998898880528538, "grad_norm": 0.003275347640737891, "learning_rate": 6.711154124831183e-06, "loss": 0.0083, "num_input_tokens_seen": 105763616, "step": 49035 }, { "epoch": 8.99981648008809, "grad_norm": 17.391555786132812, "learning_rate": 6.710401699175187e-06, "loss": 0.2695, "num_input_tokens_seen": 105773760, "step": 49040 }, { "epoch": 9.000734079647641, "grad_norm": 23.092269897460938, "learning_rate": 6.709649229650037e-06, "loss": 0.2451, "num_input_tokens_seen": 105783952, "step": 49045 }, { "epoch": 9.001651679207194, "grad_norm": 18.72838020324707, "learning_rate": 6.708896716275035e-06, "loss": 0.0153, "num_input_tokens_seen": 105795088, "step": 49050 }, { "epoch": 9.002569278766746, "grad_norm": 0.05785875767469406, "learning_rate": 6.70814415906948e-06, "loss": 0.0593, "num_input_tokens_seen": 105804528, "step": 49055 }, { "epoch": 9.003486878326298, "grad_norm": 0.9795994162559509, "learning_rate": 6.707391558052675e-06, "loss": 0.0017, "num_input_tokens_seen": 105816016, "step": 49060 }, { "epoch": 9.004404477885851, "grad_norm": 0.0457780547440052, "learning_rate": 6.706638913243924e-06, "loss": 0.0003, "num_input_tokens_seen": 105826608, "step": 49065 }, { "epoch": 9.005322077445403, "grad_norm": 0.016790160909295082, "learning_rate": 6.705886224662528e-06, "loss": 0.0566, "num_input_tokens_seen": 105837072, "step": 49070 }, { "epoch": 9.006239677004954, "grad_norm": 0.011656463146209717, "learning_rate": 6.7051334923277945e-06, "loss": 0.0009, "num_input_tokens_seen": 105847120, "step": 49075 }, { "epoch": 9.007157276564508, "grad_norm": 0.06852011382579803, "learning_rate": 6.70438071625903e-06, "loss": 0.0005, "num_input_tokens_seen": 105858256, "step": 49080 }, { "epoch": 9.00807487612406, "grad_norm": 58.418853759765625, "learning_rate": 6.7036278964755404e-06, "loss": 0.043, "num_input_tokens_seen": 105868144, "step": 49085 }, { "epoch": 9.00899247568361, "grad_norm": 0.20762576162815094, "learning_rate": 6.702875032996638e-06, "loss": 0.0331, "num_input_tokens_seen": 105878512, "step": 49090 }, { "epoch": 9.009910075243164, "grad_norm": 0.03111623227596283, "learning_rate": 6.702122125841629e-06, "loss": 0.0004, "num_input_tokens_seen": 105889424, "step": 49095 }, { "epoch": 9.010827674802716, "grad_norm": 0.010316829197108746, "learning_rate": 6.701369175029826e-06, "loss": 0.0046, "num_input_tokens_seen": 105901168, "step": 49100 }, { "epoch": 9.011745274362267, "grad_norm": 0.03674289584159851, "learning_rate": 6.70061618058054e-06, "loss": 0.0003, "num_input_tokens_seen": 105913232, "step": 49105 }, { "epoch": 9.01266287392182, "grad_norm": 0.0337093286216259, "learning_rate": 6.699863142513085e-06, "loss": 0.0002, "num_input_tokens_seen": 105923696, "step": 49110 }, { "epoch": 9.013580473481372, "grad_norm": 0.0036772219464182854, "learning_rate": 6.699110060846775e-06, "loss": 0.0009, "num_input_tokens_seen": 105933840, "step": 49115 }, { "epoch": 9.014498073040924, "grad_norm": 0.0019253934733569622, "learning_rate": 6.698356935600924e-06, "loss": 0.0001, "num_input_tokens_seen": 105944688, "step": 49120 }, { "epoch": 9.015415672600477, "grad_norm": 0.17011623084545135, "learning_rate": 6.697603766794853e-06, "loss": 0.2248, "num_input_tokens_seen": 105955888, "step": 49125 }, { "epoch": 9.016333272160029, "grad_norm": 0.0034464735072106123, "learning_rate": 6.696850554447873e-06, "loss": 0.0001, "num_input_tokens_seen": 105966160, "step": 49130 }, { "epoch": 9.01725087171958, "grad_norm": 0.03318300470709801, "learning_rate": 6.696097298579308e-06, "loss": 0.1101, "num_input_tokens_seen": 105977264, "step": 49135 }, { "epoch": 9.018168471279134, "grad_norm": 0.00601999880746007, "learning_rate": 6.695343999208477e-06, "loss": 0.0003, "num_input_tokens_seen": 105987824, "step": 49140 }, { "epoch": 9.019086070838686, "grad_norm": 0.16003306210041046, "learning_rate": 6.694590656354698e-06, "loss": 0.0004, "num_input_tokens_seen": 105997232, "step": 49145 }, { "epoch": 9.020003670398237, "grad_norm": 0.01637543924152851, "learning_rate": 6.693837270037296e-06, "loss": 0.003, "num_input_tokens_seen": 106008176, "step": 49150 }, { "epoch": 9.02092126995779, "grad_norm": 0.002504524774849415, "learning_rate": 6.693083840275592e-06, "loss": 0.0003, "num_input_tokens_seen": 106018768, "step": 49155 }, { "epoch": 9.021838869517342, "grad_norm": 0.005131356418132782, "learning_rate": 6.692330367088913e-06, "loss": 0.0003, "num_input_tokens_seen": 106029264, "step": 49160 }, { "epoch": 9.022756469076894, "grad_norm": 0.01725705713033676, "learning_rate": 6.691576850496582e-06, "loss": 0.0005, "num_input_tokens_seen": 106039856, "step": 49165 }, { "epoch": 9.023674068636447, "grad_norm": 0.01506791915744543, "learning_rate": 6.690823290517926e-06, "loss": 0.0002, "num_input_tokens_seen": 106048976, "step": 49170 }, { "epoch": 9.024591668195999, "grad_norm": 0.24642297625541687, "learning_rate": 6.690069687172275e-06, "loss": 0.0002, "num_input_tokens_seen": 106059376, "step": 49175 }, { "epoch": 9.02550926775555, "grad_norm": 0.10395310819149017, "learning_rate": 6.689316040478955e-06, "loss": 0.0007, "num_input_tokens_seen": 106069680, "step": 49180 }, { "epoch": 9.026426867315104, "grad_norm": 0.00650032190605998, "learning_rate": 6.688562350457297e-06, "loss": 0.0002, "num_input_tokens_seen": 106080976, "step": 49185 }, { "epoch": 9.027344466874656, "grad_norm": 0.008421151898801327, "learning_rate": 6.68780861712663e-06, "loss": 0.1435, "num_input_tokens_seen": 106091632, "step": 49190 }, { "epoch": 9.028262066434207, "grad_norm": 6.024713039398193, "learning_rate": 6.687054840506288e-06, "loss": 0.0012, "num_input_tokens_seen": 106101904, "step": 49195 }, { "epoch": 9.02917966599376, "grad_norm": 0.0009106952347792685, "learning_rate": 6.686301020615606e-06, "loss": 0.0017, "num_input_tokens_seen": 106112784, "step": 49200 }, { "epoch": 9.030097265553312, "grad_norm": 0.10535460710525513, "learning_rate": 6.685547157473916e-06, "loss": 0.0004, "num_input_tokens_seen": 106123856, "step": 49205 }, { "epoch": 9.031014865112864, "grad_norm": 0.022692864760756493, "learning_rate": 6.684793251100554e-06, "loss": 0.376, "num_input_tokens_seen": 106134320, "step": 49210 }, { "epoch": 9.031932464672417, "grad_norm": 0.04995112866163254, "learning_rate": 6.6840393015148555e-06, "loss": 0.0004, "num_input_tokens_seen": 106145328, "step": 49215 }, { "epoch": 9.032850064231969, "grad_norm": 0.13442201912403107, "learning_rate": 6.6832853087361586e-06, "loss": 0.0004, "num_input_tokens_seen": 106155664, "step": 49220 }, { "epoch": 9.03376766379152, "grad_norm": 0.05268441513180733, "learning_rate": 6.6825312727838035e-06, "loss": 0.0001, "num_input_tokens_seen": 106166384, "step": 49225 }, { "epoch": 9.034685263351074, "grad_norm": 15.80199146270752, "learning_rate": 6.681777193677128e-06, "loss": 0.1472, "num_input_tokens_seen": 106177904, "step": 49230 }, { "epoch": 9.035602862910626, "grad_norm": 0.02460547722876072, "learning_rate": 6.681023071435475e-06, "loss": 0.0951, "num_input_tokens_seen": 106187728, "step": 49235 }, { "epoch": 9.036520462470177, "grad_norm": 0.01684526726603508, "learning_rate": 6.680268906078184e-06, "loss": 0.0708, "num_input_tokens_seen": 106200336, "step": 49240 }, { "epoch": 9.03743806202973, "grad_norm": 0.04279186204075813, "learning_rate": 6.679514697624601e-06, "loss": 0.0003, "num_input_tokens_seen": 106210288, "step": 49245 }, { "epoch": 9.038355661589282, "grad_norm": 0.03985091671347618, "learning_rate": 6.678760446094069e-06, "loss": 0.1733, "num_input_tokens_seen": 106221776, "step": 49250 }, { "epoch": 9.039273261148834, "grad_norm": 0.02686076983809471, "learning_rate": 6.678006151505934e-06, "loss": 0.0009, "num_input_tokens_seen": 106233072, "step": 49255 }, { "epoch": 9.040190860708387, "grad_norm": 0.005382710602134466, "learning_rate": 6.67725181387954e-06, "loss": 0.0286, "num_input_tokens_seen": 106245168, "step": 49260 }, { "epoch": 9.041108460267939, "grad_norm": 0.02988055907189846, "learning_rate": 6.676497433234237e-06, "loss": 0.0004, "num_input_tokens_seen": 106256944, "step": 49265 }, { "epoch": 9.04202605982749, "grad_norm": 0.012482753023505211, "learning_rate": 6.675743009589374e-06, "loss": 0.162, "num_input_tokens_seen": 106267280, "step": 49270 }, { "epoch": 9.042943659387044, "grad_norm": 15.708694458007812, "learning_rate": 6.6749885429643e-06, "loss": 0.0922, "num_input_tokens_seen": 106278800, "step": 49275 }, { "epoch": 9.043861258946595, "grad_norm": 0.042362384498119354, "learning_rate": 6.674234033378365e-06, "loss": 0.0008, "num_input_tokens_seen": 106289008, "step": 49280 }, { "epoch": 9.044778858506147, "grad_norm": 0.039095327258110046, "learning_rate": 6.673479480850923e-06, "loss": 0.0004, "num_input_tokens_seen": 106300368, "step": 49285 }, { "epoch": 9.0456964580657, "grad_norm": 0.02771291881799698, "learning_rate": 6.672724885401325e-06, "loss": 0.0006, "num_input_tokens_seen": 106312208, "step": 49290 }, { "epoch": 9.046614057625252, "grad_norm": 0.028920438140630722, "learning_rate": 6.6719702470489255e-06, "loss": 0.0862, "num_input_tokens_seen": 106323408, "step": 49295 }, { "epoch": 9.047531657184804, "grad_norm": 0.005293954163789749, "learning_rate": 6.6712155658130815e-06, "loss": 0.1502, "num_input_tokens_seen": 106333680, "step": 49300 }, { "epoch": 9.048449256744357, "grad_norm": 0.021759996190667152, "learning_rate": 6.670460841713149e-06, "loss": 0.0265, "num_input_tokens_seen": 106345360, "step": 49305 }, { "epoch": 9.049366856303909, "grad_norm": 0.03443824127316475, "learning_rate": 6.669706074768484e-06, "loss": 0.0006, "num_input_tokens_seen": 106354736, "step": 49310 }, { "epoch": 9.05028445586346, "grad_norm": 0.03698399290442467, "learning_rate": 6.6689512649984454e-06, "loss": 0.0003, "num_input_tokens_seen": 106364720, "step": 49315 }, { "epoch": 9.051202055423014, "grad_norm": 0.010642718523740768, "learning_rate": 6.668196412422395e-06, "loss": 0.0004, "num_input_tokens_seen": 106376208, "step": 49320 }, { "epoch": 9.052119654982565, "grad_norm": 12.147686004638672, "learning_rate": 6.667441517059692e-06, "loss": 0.0049, "num_input_tokens_seen": 106384880, "step": 49325 }, { "epoch": 9.053037254542117, "grad_norm": 0.007651074789464474, "learning_rate": 6.666686578929696e-06, "loss": 0.0003, "num_input_tokens_seen": 106396976, "step": 49330 }, { "epoch": 9.05395485410167, "grad_norm": 0.04026385769248009, "learning_rate": 6.665931598051776e-06, "loss": 0.0247, "num_input_tokens_seen": 106406704, "step": 49335 }, { "epoch": 9.054872453661222, "grad_norm": 0.034479640424251556, "learning_rate": 6.665176574445289e-06, "loss": 0.0004, "num_input_tokens_seen": 106417744, "step": 49340 }, { "epoch": 9.055790053220774, "grad_norm": 0.04325002804398537, "learning_rate": 6.664421508129606e-06, "loss": 0.0002, "num_input_tokens_seen": 106428656, "step": 49345 }, { "epoch": 9.056707652780327, "grad_norm": 0.07849512249231339, "learning_rate": 6.663666399124091e-06, "loss": 0.0004, "num_input_tokens_seen": 106439184, "step": 49350 }, { "epoch": 9.057625252339879, "grad_norm": 0.024155959486961365, "learning_rate": 6.662911247448111e-06, "loss": 0.0005, "num_input_tokens_seen": 106450640, "step": 49355 }, { "epoch": 9.05854285189943, "grad_norm": 0.015129471197724342, "learning_rate": 6.6621560531210335e-06, "loss": 0.0001, "num_input_tokens_seen": 106459920, "step": 49360 }, { "epoch": 9.059460451458984, "grad_norm": 0.04174579679965973, "learning_rate": 6.66140081616223e-06, "loss": 0.0001, "num_input_tokens_seen": 106470960, "step": 49365 }, { "epoch": 9.060378051018535, "grad_norm": 4.35988187789917, "learning_rate": 6.660645536591072e-06, "loss": 0.0008, "num_input_tokens_seen": 106480944, "step": 49370 }, { "epoch": 9.061295650578089, "grad_norm": 0.013241956941783428, "learning_rate": 6.659890214426927e-06, "loss": 0.0003, "num_input_tokens_seen": 106490736, "step": 49375 }, { "epoch": 9.06221325013764, "grad_norm": 0.03752981498837471, "learning_rate": 6.659134849689173e-06, "loss": 0.0003, "num_input_tokens_seen": 106500528, "step": 49380 }, { "epoch": 9.063130849697192, "grad_norm": 0.007529784459620714, "learning_rate": 6.658379442397181e-06, "loss": 0.0025, "num_input_tokens_seen": 106511408, "step": 49385 }, { "epoch": 9.064048449256745, "grad_norm": 0.015605272725224495, "learning_rate": 6.657623992570325e-06, "loss": 0.0015, "num_input_tokens_seen": 106522000, "step": 49390 }, { "epoch": 9.064966048816297, "grad_norm": 0.010662668384611607, "learning_rate": 6.656868500227984e-06, "loss": 0.0191, "num_input_tokens_seen": 106531152, "step": 49395 }, { "epoch": 9.065883648375848, "grad_norm": 0.004027200397104025, "learning_rate": 6.656112965389534e-06, "loss": 0.0793, "num_input_tokens_seen": 106541360, "step": 49400 }, { "epoch": 9.066801247935402, "grad_norm": 0.007245480548590422, "learning_rate": 6.6553573880743516e-06, "loss": 0.0002, "num_input_tokens_seen": 106552528, "step": 49405 }, { "epoch": 9.067718847494953, "grad_norm": 0.023150330409407616, "learning_rate": 6.65460176830182e-06, "loss": 0.0003, "num_input_tokens_seen": 106561328, "step": 49410 }, { "epoch": 9.068636447054505, "grad_norm": 0.010010900907218456, "learning_rate": 6.653846106091316e-06, "loss": 0.0001, "num_input_tokens_seen": 106573104, "step": 49415 }, { "epoch": 9.069554046614059, "grad_norm": 0.004961423110216856, "learning_rate": 6.6530904014622234e-06, "loss": 0.0001, "num_input_tokens_seen": 106583504, "step": 49420 }, { "epoch": 9.07047164617361, "grad_norm": 0.010122978128492832, "learning_rate": 6.652334654433923e-06, "loss": 0.0002, "num_input_tokens_seen": 106592848, "step": 49425 }, { "epoch": 9.071389245733162, "grad_norm": 0.01807229407131672, "learning_rate": 6.6515788650258005e-06, "loss": 0.0004, "num_input_tokens_seen": 106603728, "step": 49430 }, { "epoch": 9.072306845292715, "grad_norm": 0.0007183414418250322, "learning_rate": 6.65082303325724e-06, "loss": 0.2314, "num_input_tokens_seen": 106614448, "step": 49435 }, { "epoch": 9.073224444852267, "grad_norm": 0.005402385722845793, "learning_rate": 6.650067159147626e-06, "loss": 0.0004, "num_input_tokens_seen": 106624976, "step": 49440 }, { "epoch": 9.074142044411818, "grad_norm": 0.007587258238345385, "learning_rate": 6.649311242716348e-06, "loss": 0.0001, "num_input_tokens_seen": 106634640, "step": 49445 }, { "epoch": 9.075059643971372, "grad_norm": 0.04417487978935242, "learning_rate": 6.648555283982793e-06, "loss": 0.1346, "num_input_tokens_seen": 106645552, "step": 49450 }, { "epoch": 9.075977243530923, "grad_norm": 0.006527505815029144, "learning_rate": 6.647799282966349e-06, "loss": 0.0793, "num_input_tokens_seen": 106656432, "step": 49455 }, { "epoch": 9.076894843090475, "grad_norm": 0.0005284040817059577, "learning_rate": 6.647043239686409e-06, "loss": 0.0001, "num_input_tokens_seen": 106667376, "step": 49460 }, { "epoch": 9.077812442650028, "grad_norm": 0.6423947215080261, "learning_rate": 6.646287154162361e-06, "loss": 0.0537, "num_input_tokens_seen": 106678288, "step": 49465 }, { "epoch": 9.07873004220958, "grad_norm": 0.0028663016855716705, "learning_rate": 6.6455310264136e-06, "loss": 0.0005, "num_input_tokens_seen": 106688816, "step": 49470 }, { "epoch": 9.079647641769132, "grad_norm": 0.0312790647149086, "learning_rate": 6.644774856459517e-06, "loss": 0.0002, "num_input_tokens_seen": 106700240, "step": 49475 }, { "epoch": 9.080565241328685, "grad_norm": 0.0013104260433465242, "learning_rate": 6.64401864431951e-06, "loss": 0.0002, "num_input_tokens_seen": 106711536, "step": 49480 }, { "epoch": 9.081482840888237, "grad_norm": 10.975706100463867, "learning_rate": 6.643262390012971e-06, "loss": 0.2151, "num_input_tokens_seen": 106723184, "step": 49485 }, { "epoch": 9.082400440447788, "grad_norm": 0.01714552566409111, "learning_rate": 6.642506093559299e-06, "loss": 0.0004, "num_input_tokens_seen": 106734064, "step": 49490 }, { "epoch": 9.083318040007342, "grad_norm": 0.030620397999882698, "learning_rate": 6.641749754977892e-06, "loss": 0.0619, "num_input_tokens_seen": 106743920, "step": 49495 }, { "epoch": 9.084235639566893, "grad_norm": 0.04249534383416176, "learning_rate": 6.640993374288147e-06, "loss": 0.0009, "num_input_tokens_seen": 106754544, "step": 49500 }, { "epoch": 9.085153239126445, "grad_norm": 0.4017851948738098, "learning_rate": 6.640236951509467e-06, "loss": 0.0013, "num_input_tokens_seen": 106765904, "step": 49505 }, { "epoch": 9.086070838685998, "grad_norm": 0.011092420667409897, "learning_rate": 6.639480486661249e-06, "loss": 0.0005, "num_input_tokens_seen": 106777936, "step": 49510 }, { "epoch": 9.08698843824555, "grad_norm": 0.021473510190844536, "learning_rate": 6.638723979762899e-06, "loss": 0.2235, "num_input_tokens_seen": 106788336, "step": 49515 }, { "epoch": 9.087906037805102, "grad_norm": 0.020862018689513206, "learning_rate": 6.63796743083382e-06, "loss": 0.0148, "num_input_tokens_seen": 106800048, "step": 49520 }, { "epoch": 9.088823637364655, "grad_norm": 0.2517780065536499, "learning_rate": 6.637210839893412e-06, "loss": 0.0109, "num_input_tokens_seen": 106810864, "step": 49525 }, { "epoch": 9.089741236924207, "grad_norm": 0.08348701149225235, "learning_rate": 6.636454206961086e-06, "loss": 0.0031, "num_input_tokens_seen": 106821104, "step": 49530 }, { "epoch": 9.090658836483758, "grad_norm": 0.10809573531150818, "learning_rate": 6.6356975320562445e-06, "loss": 0.1071, "num_input_tokens_seen": 106831376, "step": 49535 }, { "epoch": 9.091576436043312, "grad_norm": 0.024410145357251167, "learning_rate": 6.6349408151982965e-06, "loss": 0.0004, "num_input_tokens_seen": 106841264, "step": 49540 }, { "epoch": 9.092494035602863, "grad_norm": 0.00993074756115675, "learning_rate": 6.634184056406652e-06, "loss": 0.0002, "num_input_tokens_seen": 106852016, "step": 49545 }, { "epoch": 9.093411635162415, "grad_norm": 0.07790946215391159, "learning_rate": 6.6334272557007175e-06, "loss": 0.0005, "num_input_tokens_seen": 106862960, "step": 49550 }, { "epoch": 9.094329234721968, "grad_norm": 0.004958014003932476, "learning_rate": 6.632670413099906e-06, "loss": 0.1723, "num_input_tokens_seen": 106874736, "step": 49555 }, { "epoch": 9.09524683428152, "grad_norm": 136.85885620117188, "learning_rate": 6.631913528623628e-06, "loss": 0.1347, "num_input_tokens_seen": 106884560, "step": 49560 }, { "epoch": 9.096164433841071, "grad_norm": 0.014978875406086445, "learning_rate": 6.631156602291299e-06, "loss": 0.0001, "num_input_tokens_seen": 106896048, "step": 49565 }, { "epoch": 9.097082033400625, "grad_norm": 0.0007310697110369802, "learning_rate": 6.630399634122331e-06, "loss": 0.0072, "num_input_tokens_seen": 106906256, "step": 49570 }, { "epoch": 9.097999632960176, "grad_norm": 0.014021585695445538, "learning_rate": 6.629642624136138e-06, "loss": 0.0042, "num_input_tokens_seen": 106916656, "step": 49575 }, { "epoch": 9.098917232519728, "grad_norm": 0.07192707061767578, "learning_rate": 6.628885572352139e-06, "loss": 0.0005, "num_input_tokens_seen": 106927952, "step": 49580 }, { "epoch": 9.099834832079281, "grad_norm": 0.044846247881650925, "learning_rate": 6.628128478789747e-06, "loss": 0.0175, "num_input_tokens_seen": 106937872, "step": 49585 }, { "epoch": 9.100752431638833, "grad_norm": 0.005009031388908625, "learning_rate": 6.627371343468385e-06, "loss": 0.0244, "num_input_tokens_seen": 106948816, "step": 49590 }, { "epoch": 9.101670031198385, "grad_norm": 0.02372610755264759, "learning_rate": 6.6266141664074704e-06, "loss": 0.0402, "num_input_tokens_seen": 106959216, "step": 49595 }, { "epoch": 9.102587630757938, "grad_norm": 0.04507575184106827, "learning_rate": 6.625856947626421e-06, "loss": 0.0051, "num_input_tokens_seen": 106969488, "step": 49600 }, { "epoch": 9.10350523031749, "grad_norm": 0.44892945885658264, "learning_rate": 6.625099687144664e-06, "loss": 0.0889, "num_input_tokens_seen": 106980784, "step": 49605 }, { "epoch": 9.104422829877041, "grad_norm": 0.11803314834833145, "learning_rate": 6.624342384981617e-06, "loss": 0.0007, "num_input_tokens_seen": 106992464, "step": 49610 }, { "epoch": 9.105340429436595, "grad_norm": 0.2761477530002594, "learning_rate": 6.623585041156706e-06, "loss": 0.0005, "num_input_tokens_seen": 107002608, "step": 49615 }, { "epoch": 9.106258028996146, "grad_norm": 0.38369184732437134, "learning_rate": 6.622827655689353e-06, "loss": 0.0006, "num_input_tokens_seen": 107012720, "step": 49620 }, { "epoch": 9.107175628555698, "grad_norm": 0.0006105945212766528, "learning_rate": 6.622070228598987e-06, "loss": 0.0003, "num_input_tokens_seen": 107024656, "step": 49625 }, { "epoch": 9.108093228115251, "grad_norm": 0.17231646180152893, "learning_rate": 6.621312759905035e-06, "loss": 0.0885, "num_input_tokens_seen": 107035632, "step": 49630 }, { "epoch": 9.109010827674803, "grad_norm": 0.0038330010138452053, "learning_rate": 6.6205552496269225e-06, "loss": 0.0008, "num_input_tokens_seen": 107047568, "step": 49635 }, { "epoch": 9.109928427234355, "grad_norm": 0.028119584545493126, "learning_rate": 6.619797697784079e-06, "loss": 0.0005, "num_input_tokens_seen": 107058448, "step": 49640 }, { "epoch": 9.110846026793908, "grad_norm": 0.0036609063390642405, "learning_rate": 6.619040104395935e-06, "loss": 0.0266, "num_input_tokens_seen": 107068944, "step": 49645 }, { "epoch": 9.11176362635346, "grad_norm": 0.0038546803407371044, "learning_rate": 6.618282469481922e-06, "loss": 0.0001, "num_input_tokens_seen": 107079600, "step": 49650 }, { "epoch": 9.112681225913011, "grad_norm": 0.012152373790740967, "learning_rate": 6.617524793061473e-06, "loss": 0.0003, "num_input_tokens_seen": 107090512, "step": 49655 }, { "epoch": 9.113598825472565, "grad_norm": 0.0005841944948770106, "learning_rate": 6.616767075154018e-06, "loss": 0.0001, "num_input_tokens_seen": 107101936, "step": 49660 }, { "epoch": 9.114516425032116, "grad_norm": 28.408899307250977, "learning_rate": 6.616009315778995e-06, "loss": 0.0857, "num_input_tokens_seen": 107111632, "step": 49665 }, { "epoch": 9.115434024591668, "grad_norm": 0.2123691737651825, "learning_rate": 6.615251514955837e-06, "loss": 0.0001, "num_input_tokens_seen": 107122064, "step": 49670 }, { "epoch": 9.116351624151221, "grad_norm": 0.00785442441701889, "learning_rate": 6.61449367270398e-06, "loss": 0.0857, "num_input_tokens_seen": 107133392, "step": 49675 }, { "epoch": 9.117269223710773, "grad_norm": 0.0018779279198497534, "learning_rate": 6.613735789042864e-06, "loss": 0.0001, "num_input_tokens_seen": 107144240, "step": 49680 }, { "epoch": 9.118186823270324, "grad_norm": 0.046856872737407684, "learning_rate": 6.6129778639919254e-06, "loss": 0.0004, "num_input_tokens_seen": 107155856, "step": 49685 }, { "epoch": 9.119104422829878, "grad_norm": 0.011491633020341396, "learning_rate": 6.612219897570604e-06, "loss": 0.0881, "num_input_tokens_seen": 107166288, "step": 49690 }, { "epoch": 9.12002202238943, "grad_norm": 0.0034037220757454634, "learning_rate": 6.6114618897983415e-06, "loss": 0.056, "num_input_tokens_seen": 107177296, "step": 49695 }, { "epoch": 9.120939621948981, "grad_norm": 0.12396707385778427, "learning_rate": 6.610703840694579e-06, "loss": 0.0978, "num_input_tokens_seen": 107188112, "step": 49700 }, { "epoch": 9.121857221508535, "grad_norm": 0.01649235375225544, "learning_rate": 6.609945750278759e-06, "loss": 0.0707, "num_input_tokens_seen": 107198864, "step": 49705 }, { "epoch": 9.122774821068086, "grad_norm": 0.0749819278717041, "learning_rate": 6.609187618570327e-06, "loss": 0.0175, "num_input_tokens_seen": 107209840, "step": 49710 }, { "epoch": 9.123692420627638, "grad_norm": 15.04915714263916, "learning_rate": 6.608429445588725e-06, "loss": 0.2221, "num_input_tokens_seen": 107220496, "step": 49715 }, { "epoch": 9.124610020187191, "grad_norm": 0.14881528913974762, "learning_rate": 6.6076712313534006e-06, "loss": 0.0004, "num_input_tokens_seen": 107231216, "step": 49720 }, { "epoch": 9.125527619746743, "grad_norm": 0.006571399513632059, "learning_rate": 6.606912975883801e-06, "loss": 0.0007, "num_input_tokens_seen": 107242384, "step": 49725 }, { "epoch": 9.126445219306294, "grad_norm": 49.44550704956055, "learning_rate": 6.606154679199375e-06, "loss": 0.1425, "num_input_tokens_seen": 107252688, "step": 49730 }, { "epoch": 9.127362818865848, "grad_norm": 0.01866193115711212, "learning_rate": 6.60539634131957e-06, "loss": 0.0008, "num_input_tokens_seen": 107264368, "step": 49735 }, { "epoch": 9.1282804184254, "grad_norm": 0.41002267599105835, "learning_rate": 6.604637962263838e-06, "loss": 0.08, "num_input_tokens_seen": 107275056, "step": 49740 }, { "epoch": 9.129198017984951, "grad_norm": 125.96387481689453, "learning_rate": 6.603879542051628e-06, "loss": 0.0207, "num_input_tokens_seen": 107286544, "step": 49745 }, { "epoch": 9.130115617544504, "grad_norm": 0.011977214366197586, "learning_rate": 6.6031210807023925e-06, "loss": 0.0003, "num_input_tokens_seen": 107297712, "step": 49750 }, { "epoch": 9.131033217104056, "grad_norm": 0.08752770721912384, "learning_rate": 6.602362578235588e-06, "loss": 0.0009, "num_input_tokens_seen": 107306672, "step": 49755 }, { "epoch": 9.131950816663608, "grad_norm": 0.003302752273157239, "learning_rate": 6.601604034670667e-06, "loss": 0.0002, "num_input_tokens_seen": 107317456, "step": 49760 }, { "epoch": 9.132868416223161, "grad_norm": 0.07673485577106476, "learning_rate": 6.600845450027085e-06, "loss": 0.0023, "num_input_tokens_seen": 107327792, "step": 49765 }, { "epoch": 9.133786015782713, "grad_norm": 0.016775019466876984, "learning_rate": 6.600086824324295e-06, "loss": 0.0009, "num_input_tokens_seen": 107338800, "step": 49770 }, { "epoch": 9.134703615342264, "grad_norm": 0.005377482157200575, "learning_rate": 6.599328157581762e-06, "loss": 0.1871, "num_input_tokens_seen": 107348208, "step": 49775 }, { "epoch": 9.135621214901818, "grad_norm": 0.09277765452861786, "learning_rate": 6.598569449818939e-06, "loss": 0.0003, "num_input_tokens_seen": 107358128, "step": 49780 }, { "epoch": 9.13653881446137, "grad_norm": 0.0032177313696593046, "learning_rate": 6.597810701055286e-06, "loss": 0.0002, "num_input_tokens_seen": 107368336, "step": 49785 }, { "epoch": 9.137456414020921, "grad_norm": 0.007996557280421257, "learning_rate": 6.597051911310266e-06, "loss": 0.0003, "num_input_tokens_seen": 107379248, "step": 49790 }, { "epoch": 9.138374013580474, "grad_norm": 0.0412365198135376, "learning_rate": 6.596293080603338e-06, "loss": 0.0001, "num_input_tokens_seen": 107389872, "step": 49795 }, { "epoch": 9.139291613140026, "grad_norm": 0.003799653612077236, "learning_rate": 6.595534208953969e-06, "loss": 0.0003, "num_input_tokens_seen": 107400688, "step": 49800 }, { "epoch": 9.140209212699578, "grad_norm": 0.019691456109285355, "learning_rate": 6.594775296381619e-06, "loss": 0.0003, "num_input_tokens_seen": 107410032, "step": 49805 }, { "epoch": 9.141126812259131, "grad_norm": 0.004905532114207745, "learning_rate": 6.5940163429057544e-06, "loss": 0.0005, "num_input_tokens_seen": 107421040, "step": 49810 }, { "epoch": 9.142044411818683, "grad_norm": 0.056012704968452454, "learning_rate": 6.5932573485458415e-06, "loss": 0.0007, "num_input_tokens_seen": 107431664, "step": 49815 }, { "epoch": 9.142962011378234, "grad_norm": 0.009689240716397762, "learning_rate": 6.5924983133213475e-06, "loss": 0.0003, "num_input_tokens_seen": 107442640, "step": 49820 }, { "epoch": 9.143879610937788, "grad_norm": 0.07363336533308029, "learning_rate": 6.591739237251739e-06, "loss": 0.1191, "num_input_tokens_seen": 107454576, "step": 49825 }, { "epoch": 9.14479721049734, "grad_norm": 0.008839213289320469, "learning_rate": 6.590980120356485e-06, "loss": 0.1159, "num_input_tokens_seen": 107465040, "step": 49830 }, { "epoch": 9.14571481005689, "grad_norm": 0.04740641638636589, "learning_rate": 6.5902209626550585e-06, "loss": 0.0002, "num_input_tokens_seen": 107476368, "step": 49835 }, { "epoch": 9.146632409616444, "grad_norm": 0.020736977458000183, "learning_rate": 6.589461764166929e-06, "loss": 0.1266, "num_input_tokens_seen": 107487248, "step": 49840 }, { "epoch": 9.147550009175996, "grad_norm": 0.002305952599272132, "learning_rate": 6.588702524911566e-06, "loss": 0.0001, "num_input_tokens_seen": 107498128, "step": 49845 }, { "epoch": 9.148467608735547, "grad_norm": 0.006501247175037861, "learning_rate": 6.587943244908449e-06, "loss": 0.0534, "num_input_tokens_seen": 107509712, "step": 49850 }, { "epoch": 9.1493852082951, "grad_norm": 0.020743943750858307, "learning_rate": 6.587183924177048e-06, "loss": 0.0002, "num_input_tokens_seen": 107520528, "step": 49855 }, { "epoch": 9.150302807854652, "grad_norm": 0.001740495441481471, "learning_rate": 6.5864245627368375e-06, "loss": 0.1208, "num_input_tokens_seen": 107530352, "step": 49860 }, { "epoch": 9.151220407414204, "grad_norm": 0.002806131262332201, "learning_rate": 6.585665160607297e-06, "loss": 0.0002, "num_input_tokens_seen": 107541488, "step": 49865 }, { "epoch": 9.152138006973757, "grad_norm": 44.26447296142578, "learning_rate": 6.5849057178079014e-06, "loss": 0.0647, "num_input_tokens_seen": 107551696, "step": 49870 }, { "epoch": 9.153055606533309, "grad_norm": 0.004513128660619259, "learning_rate": 6.584146234358133e-06, "loss": 0.0003, "num_input_tokens_seen": 107562992, "step": 49875 }, { "epoch": 9.15397320609286, "grad_norm": 0.014105062931776047, "learning_rate": 6.583386710277467e-06, "loss": 0.0008, "num_input_tokens_seen": 107574448, "step": 49880 }, { "epoch": 9.154890805652414, "grad_norm": 0.018124043941497803, "learning_rate": 6.582627145585387e-06, "loss": 0.0016, "num_input_tokens_seen": 107584816, "step": 49885 }, { "epoch": 9.155808405211966, "grad_norm": 0.16748131811618805, "learning_rate": 6.5818675403013735e-06, "loss": 0.0006, "num_input_tokens_seen": 107594640, "step": 49890 }, { "epoch": 9.156726004771517, "grad_norm": 0.009160928428173065, "learning_rate": 6.581107894444908e-06, "loss": 0.1195, "num_input_tokens_seen": 107606064, "step": 49895 }, { "epoch": 9.15764360433107, "grad_norm": 0.04962657764554024, "learning_rate": 6.580348208035476e-06, "loss": 0.0002, "num_input_tokens_seen": 107617072, "step": 49900 }, { "epoch": 9.158561203890622, "grad_norm": 0.0030138047877699137, "learning_rate": 6.579588481092563e-06, "loss": 0.0002, "num_input_tokens_seen": 107626672, "step": 49905 }, { "epoch": 9.159478803450174, "grad_norm": 0.050113700330257416, "learning_rate": 6.578828713635652e-06, "loss": 0.069, "num_input_tokens_seen": 107637648, "step": 49910 }, { "epoch": 9.160396403009727, "grad_norm": 0.01985006406903267, "learning_rate": 6.5780689056842314e-06, "loss": 0.0004, "num_input_tokens_seen": 107649392, "step": 49915 }, { "epoch": 9.161314002569279, "grad_norm": 0.03057805262506008, "learning_rate": 6.57730905725779e-06, "loss": 0.0002, "num_input_tokens_seen": 107661616, "step": 49920 }, { "epoch": 9.16223160212883, "grad_norm": 0.008132725022733212, "learning_rate": 6.576549168375817e-06, "loss": 0.0005, "num_input_tokens_seen": 107672528, "step": 49925 }, { "epoch": 9.163149201688384, "grad_norm": 0.1391081064939499, "learning_rate": 6.575789239057799e-06, "loss": 0.0535, "num_input_tokens_seen": 107684496, "step": 49930 }, { "epoch": 9.164066801247936, "grad_norm": 0.058292411267757416, "learning_rate": 6.575029269323231e-06, "loss": 0.0003, "num_input_tokens_seen": 107694352, "step": 49935 }, { "epoch": 9.164984400807487, "grad_norm": 55.40638732910156, "learning_rate": 6.5742692591916025e-06, "loss": 0.1347, "num_input_tokens_seen": 107703600, "step": 49940 }, { "epoch": 9.16590200036704, "grad_norm": 0.08561825752258301, "learning_rate": 6.573509208682407e-06, "loss": 0.0003, "num_input_tokens_seen": 107713936, "step": 49945 }, { "epoch": 9.166819599926592, "grad_norm": 0.000613392679952085, "learning_rate": 6.57274911781514e-06, "loss": 0.0562, "num_input_tokens_seen": 107724784, "step": 49950 }, { "epoch": 9.167737199486144, "grad_norm": 0.007450507953763008, "learning_rate": 6.571988986609295e-06, "loss": 0.1464, "num_input_tokens_seen": 107735856, "step": 49955 }, { "epoch": 9.168654799045697, "grad_norm": 35.519874572753906, "learning_rate": 6.5712288150843695e-06, "loss": 0.1435, "num_input_tokens_seen": 107745744, "step": 49960 }, { "epoch": 9.169572398605249, "grad_norm": 2.9012763500213623, "learning_rate": 6.5704686032598584e-06, "loss": 0.0005, "num_input_tokens_seen": 107756368, "step": 49965 }, { "epoch": 9.1704899981648, "grad_norm": 0.0381263792514801, "learning_rate": 6.569708351155263e-06, "loss": 0.0002, "num_input_tokens_seen": 107765872, "step": 49970 }, { "epoch": 9.171407597724354, "grad_norm": 0.019167521968483925, "learning_rate": 6.568948058790081e-06, "loss": 0.0011, "num_input_tokens_seen": 107776432, "step": 49975 }, { "epoch": 9.172325197283905, "grad_norm": 0.018566757440567017, "learning_rate": 6.568187726183812e-06, "loss": 0.0001, "num_input_tokens_seen": 107787184, "step": 49980 }, { "epoch": 9.173242796843457, "grad_norm": 0.3341315984725952, "learning_rate": 6.567427353355961e-06, "loss": 0.0004, "num_input_tokens_seen": 107797616, "step": 49985 }, { "epoch": 9.17416039640301, "grad_norm": 23.08180809020996, "learning_rate": 6.566666940326026e-06, "loss": 0.1444, "num_input_tokens_seen": 107808848, "step": 49990 }, { "epoch": 9.175077995962562, "grad_norm": 0.4862748384475708, "learning_rate": 6.565906487113511e-06, "loss": 0.0007, "num_input_tokens_seen": 107820528, "step": 49995 }, { "epoch": 9.175995595522114, "grad_norm": 26.67033576965332, "learning_rate": 6.565145993737924e-06, "loss": 0.1377, "num_input_tokens_seen": 107832176, "step": 50000 }, { "epoch": 9.176913195081667, "grad_norm": 0.01109381951391697, "learning_rate": 6.564385460218766e-06, "loss": 0.0002, "num_input_tokens_seen": 107842128, "step": 50005 }, { "epoch": 9.177830794641219, "grad_norm": 0.1474681943655014, "learning_rate": 6.563624886575547e-06, "loss": 0.0002, "num_input_tokens_seen": 107852848, "step": 50010 }, { "epoch": 9.17874839420077, "grad_norm": 0.017692239955067635, "learning_rate": 6.562864272827772e-06, "loss": 0.0012, "num_input_tokens_seen": 107864464, "step": 50015 }, { "epoch": 9.179665993760324, "grad_norm": 0.009848939254879951, "learning_rate": 6.562103618994951e-06, "loss": 0.0006, "num_input_tokens_seen": 107876016, "step": 50020 }, { "epoch": 9.180583593319875, "grad_norm": 0.017805971205234528, "learning_rate": 6.561342925096592e-06, "loss": 0.1063, "num_input_tokens_seen": 107886160, "step": 50025 }, { "epoch": 9.181501192879427, "grad_norm": 0.009912930428981781, "learning_rate": 6.560582191152207e-06, "loss": 0.0004, "num_input_tokens_seen": 107897136, "step": 50030 }, { "epoch": 9.18241879243898, "grad_norm": 0.0307683777064085, "learning_rate": 6.55982141718131e-06, "loss": 0.0002, "num_input_tokens_seen": 107908976, "step": 50035 }, { "epoch": 9.183336391998532, "grad_norm": 0.01436181366443634, "learning_rate": 6.559060603203409e-06, "loss": 0.0094, "num_input_tokens_seen": 107919440, "step": 50040 }, { "epoch": 9.184253991558084, "grad_norm": 0.0029509318992495537, "learning_rate": 6.55829974923802e-06, "loss": 0.0001, "num_input_tokens_seen": 107929840, "step": 50045 }, { "epoch": 9.185171591117637, "grad_norm": 4.294773578643799, "learning_rate": 6.557538855304658e-06, "loss": 0.0006, "num_input_tokens_seen": 107941328, "step": 50050 }, { "epoch": 9.186089190677189, "grad_norm": 0.0003951791441068053, "learning_rate": 6.5567779214228375e-06, "loss": 0.0001, "num_input_tokens_seen": 107951984, "step": 50055 }, { "epoch": 9.18700679023674, "grad_norm": 0.002674039686098695, "learning_rate": 6.556016947612078e-06, "loss": 0.0003, "num_input_tokens_seen": 107960656, "step": 50060 }, { "epoch": 9.187924389796294, "grad_norm": 18.60294532775879, "learning_rate": 6.555255933891893e-06, "loss": 0.0946, "num_input_tokens_seen": 107970928, "step": 50065 }, { "epoch": 9.188841989355845, "grad_norm": 0.004809965379536152, "learning_rate": 6.554494880281805e-06, "loss": 0.0002, "num_input_tokens_seen": 107983024, "step": 50070 }, { "epoch": 9.189759588915397, "grad_norm": 0.0007936061010695994, "learning_rate": 6.553733786801333e-06, "loss": 0.0001, "num_input_tokens_seen": 107993616, "step": 50075 }, { "epoch": 9.19067718847495, "grad_norm": 0.017343683168292046, "learning_rate": 6.552972653469997e-06, "loss": 0.0005, "num_input_tokens_seen": 108005104, "step": 50080 }, { "epoch": 9.191594788034502, "grad_norm": 0.05377159267663956, "learning_rate": 6.55221148030732e-06, "loss": 0.1596, "num_input_tokens_seen": 108016208, "step": 50085 }, { "epoch": 9.192512387594054, "grad_norm": 0.028500162065029144, "learning_rate": 6.551450267332823e-06, "loss": 0.0005, "num_input_tokens_seen": 108026768, "step": 50090 }, { "epoch": 9.193429987153607, "grad_norm": 0.02197524905204773, "learning_rate": 6.5506890145660314e-06, "loss": 0.0003, "num_input_tokens_seen": 108037232, "step": 50095 }, { "epoch": 9.194347586713159, "grad_norm": 0.011061502620577812, "learning_rate": 6.54992772202647e-06, "loss": 0.0478, "num_input_tokens_seen": 108048208, "step": 50100 }, { "epoch": 9.19526518627271, "grad_norm": 0.14249317348003387, "learning_rate": 6.549166389733665e-06, "loss": 0.0918, "num_input_tokens_seen": 108058416, "step": 50105 }, { "epoch": 9.196182785832264, "grad_norm": 0.013872108422219753, "learning_rate": 6.548405017707144e-06, "loss": 0.0027, "num_input_tokens_seen": 108068336, "step": 50110 }, { "epoch": 9.197100385391815, "grad_norm": 0.006311982870101929, "learning_rate": 6.5476436059664336e-06, "loss": 0.2992, "num_input_tokens_seen": 108078640, "step": 50115 }, { "epoch": 9.198017984951367, "grad_norm": 0.0033918414264917374, "learning_rate": 6.546882154531064e-06, "loss": 0.0002, "num_input_tokens_seen": 108089488, "step": 50120 }, { "epoch": 9.19893558451092, "grad_norm": 0.0058261738158762455, "learning_rate": 6.546120663420562e-06, "loss": 0.0002, "num_input_tokens_seen": 108098704, "step": 50125 }, { "epoch": 9.199853184070472, "grad_norm": 0.028628574684262276, "learning_rate": 6.545359132654463e-06, "loss": 0.0001, "num_input_tokens_seen": 108109360, "step": 50130 }, { "epoch": 9.200770783630023, "grad_norm": 0.00677227508276701, "learning_rate": 6.5445975622522975e-06, "loss": 0.0001, "num_input_tokens_seen": 108119696, "step": 50135 }, { "epoch": 9.201688383189577, "grad_norm": 14.601561546325684, "learning_rate": 6.543835952233597e-06, "loss": 0.0796, "num_input_tokens_seen": 108130704, "step": 50140 }, { "epoch": 9.202605982749128, "grad_norm": 0.10696760565042496, "learning_rate": 6.543074302617899e-06, "loss": 0.0004, "num_input_tokens_seen": 108142352, "step": 50145 }, { "epoch": 9.20352358230868, "grad_norm": 0.007486249320209026, "learning_rate": 6.542312613424735e-06, "loss": 0.0003, "num_input_tokens_seen": 108153200, "step": 50150 }, { "epoch": 9.204441181868233, "grad_norm": 0.0074979523196816444, "learning_rate": 6.541550884673643e-06, "loss": 0.0001, "num_input_tokens_seen": 108164208, "step": 50155 }, { "epoch": 9.205358781427785, "grad_norm": 0.009864418767392635, "learning_rate": 6.540789116384162e-06, "loss": 0.0705, "num_input_tokens_seen": 108175280, "step": 50160 }, { "epoch": 9.206276380987337, "grad_norm": 0.29413893818855286, "learning_rate": 6.540027308575826e-06, "loss": 0.0022, "num_input_tokens_seen": 108188080, "step": 50165 }, { "epoch": 9.20719398054689, "grad_norm": 0.5726720690727234, "learning_rate": 6.539265461268178e-06, "loss": 0.0013, "num_input_tokens_seen": 108199152, "step": 50170 }, { "epoch": 9.208111580106442, "grad_norm": 0.0019733523949980736, "learning_rate": 6.5385035744807545e-06, "loss": 0.0002, "num_input_tokens_seen": 108209744, "step": 50175 }, { "epoch": 9.209029179665993, "grad_norm": 0.003463391913101077, "learning_rate": 6.5377416482331005e-06, "loss": 0.0001, "num_input_tokens_seen": 108220208, "step": 50180 }, { "epoch": 9.209946779225547, "grad_norm": 19.013296127319336, "learning_rate": 6.536979682544755e-06, "loss": 0.0264, "num_input_tokens_seen": 108229872, "step": 50185 }, { "epoch": 9.210864378785098, "grad_norm": 11.713010787963867, "learning_rate": 6.536217677435264e-06, "loss": 0.0166, "num_input_tokens_seen": 108240400, "step": 50190 }, { "epoch": 9.21178197834465, "grad_norm": 0.006734840106219053, "learning_rate": 6.53545563292417e-06, "loss": 0.0002, "num_input_tokens_seen": 108250224, "step": 50195 }, { "epoch": 9.212699577904203, "grad_norm": 0.06119752675294876, "learning_rate": 6.534693549031019e-06, "loss": 0.0046, "num_input_tokens_seen": 108261456, "step": 50200 }, { "epoch": 9.213617177463755, "grad_norm": 0.011783858761191368, "learning_rate": 6.533931425775357e-06, "loss": 0.0002, "num_input_tokens_seen": 108270576, "step": 50205 }, { "epoch": 9.214534777023307, "grad_norm": 1.193442940711975, "learning_rate": 6.533169263176733e-06, "loss": 0.0258, "num_input_tokens_seen": 108281328, "step": 50210 }, { "epoch": 9.21545237658286, "grad_norm": 0.10286558419466019, "learning_rate": 6.5324070612546905e-06, "loss": 0.2378, "num_input_tokens_seen": 108291824, "step": 50215 }, { "epoch": 9.216369976142412, "grad_norm": 31.79950714111328, "learning_rate": 6.531644820028784e-06, "loss": 0.1285, "num_input_tokens_seen": 108303184, "step": 50220 }, { "epoch": 9.217287575701963, "grad_norm": 0.02272632345557213, "learning_rate": 6.530882539518562e-06, "loss": 0.0001, "num_input_tokens_seen": 108313552, "step": 50225 }, { "epoch": 9.218205175261517, "grad_norm": 0.007531885523349047, "learning_rate": 6.530120219743574e-06, "loss": 0.0023, "num_input_tokens_seen": 108323280, "step": 50230 }, { "epoch": 9.219122774821068, "grad_norm": 0.023302050307393074, "learning_rate": 6.529357860723374e-06, "loss": 0.1126, "num_input_tokens_seen": 108334864, "step": 50235 }, { "epoch": 9.22004037438062, "grad_norm": 0.05713364854454994, "learning_rate": 6.528595462477515e-06, "loss": 0.0946, "num_input_tokens_seen": 108344592, "step": 50240 }, { "epoch": 9.220957973940173, "grad_norm": 0.04434637352824211, "learning_rate": 6.527833025025553e-06, "loss": 0.0774, "num_input_tokens_seen": 108355472, "step": 50245 }, { "epoch": 9.221875573499725, "grad_norm": 0.023076940327882767, "learning_rate": 6.52707054838704e-06, "loss": 0.0178, "num_input_tokens_seen": 108365264, "step": 50250 }, { "epoch": 9.222793173059276, "grad_norm": 0.059257738292217255, "learning_rate": 6.5263080325815356e-06, "loss": 0.0005, "num_input_tokens_seen": 108375408, "step": 50255 }, { "epoch": 9.22371077261883, "grad_norm": 0.10030650347471237, "learning_rate": 6.525545477628594e-06, "loss": 0.0234, "num_input_tokens_seen": 108387216, "step": 50260 }, { "epoch": 9.224628372178381, "grad_norm": 0.454287052154541, "learning_rate": 6.524782883547777e-06, "loss": 0.0018, "num_input_tokens_seen": 108397712, "step": 50265 }, { "epoch": 9.225545971737933, "grad_norm": 33.67338180541992, "learning_rate": 6.5240202503586415e-06, "loss": 0.12, "num_input_tokens_seen": 108407856, "step": 50270 }, { "epoch": 9.226463571297487, "grad_norm": 0.02291012555360794, "learning_rate": 6.5232575780807484e-06, "loss": 0.2033, "num_input_tokens_seen": 108418448, "step": 50275 }, { "epoch": 9.227381170857038, "grad_norm": 0.40173426270484924, "learning_rate": 6.522494866733661e-06, "loss": 0.1257, "num_input_tokens_seen": 108429488, "step": 50280 }, { "epoch": 9.22829877041659, "grad_norm": 0.1552499234676361, "learning_rate": 6.521732116336938e-06, "loss": 0.0078, "num_input_tokens_seen": 108439728, "step": 50285 }, { "epoch": 9.229216369976143, "grad_norm": 0.015944818034768105, "learning_rate": 6.5209693269101435e-06, "loss": 0.0005, "num_input_tokens_seen": 108450384, "step": 50290 }, { "epoch": 9.230133969535695, "grad_norm": 0.041794877499341965, "learning_rate": 6.520206498472846e-06, "loss": 0.0004, "num_input_tokens_seen": 108461840, "step": 50295 }, { "epoch": 9.231051569095246, "grad_norm": 0.3275938034057617, "learning_rate": 6.519443631044607e-06, "loss": 0.0006, "num_input_tokens_seen": 108473904, "step": 50300 }, { "epoch": 9.2319691686548, "grad_norm": 0.6326586008071899, "learning_rate": 6.5186807246449935e-06, "loss": 0.0365, "num_input_tokens_seen": 108484720, "step": 50305 }, { "epoch": 9.232886768214351, "grad_norm": 0.05315300449728966, "learning_rate": 6.517917779293572e-06, "loss": 0.117, "num_input_tokens_seen": 108495760, "step": 50310 }, { "epoch": 9.233804367773903, "grad_norm": 0.008813222870230675, "learning_rate": 6.517154795009914e-06, "loss": 0.0001, "num_input_tokens_seen": 108507472, "step": 50315 }, { "epoch": 9.234721967333456, "grad_norm": 0.028278175741434097, "learning_rate": 6.516391771813587e-06, "loss": 0.0008, "num_input_tokens_seen": 108519408, "step": 50320 }, { "epoch": 9.235639566893008, "grad_norm": 0.005687200464308262, "learning_rate": 6.51562870972416e-06, "loss": 0.0004, "num_input_tokens_seen": 108529904, "step": 50325 }, { "epoch": 9.23655716645256, "grad_norm": 16.674165725708008, "learning_rate": 6.514865608761206e-06, "loss": 0.1363, "num_input_tokens_seen": 108540208, "step": 50330 }, { "epoch": 9.237474766012113, "grad_norm": 0.020720986649394035, "learning_rate": 6.514102468944297e-06, "loss": 0.0002, "num_input_tokens_seen": 108550416, "step": 50335 }, { "epoch": 9.238392365571665, "grad_norm": 0.03662740811705589, "learning_rate": 6.513339290293005e-06, "loss": 0.2382, "num_input_tokens_seen": 108562064, "step": 50340 }, { "epoch": 9.239309965131216, "grad_norm": 0.02971148118376732, "learning_rate": 6.512576072826907e-06, "loss": 0.2127, "num_input_tokens_seen": 108572272, "step": 50345 }, { "epoch": 9.24022756469077, "grad_norm": 0.1846422553062439, "learning_rate": 6.5118128165655766e-06, "loss": 0.0004, "num_input_tokens_seen": 108582192, "step": 50350 }, { "epoch": 9.241145164250321, "grad_norm": 0.004870754666626453, "learning_rate": 6.511049521528592e-06, "loss": 0.0557, "num_input_tokens_seen": 108592976, "step": 50355 }, { "epoch": 9.242062763809873, "grad_norm": 0.022845158353447914, "learning_rate": 6.510286187735527e-06, "loss": 0.1257, "num_input_tokens_seen": 108604080, "step": 50360 }, { "epoch": 9.242980363369426, "grad_norm": 0.05367795005440712, "learning_rate": 6.509522815205962e-06, "loss": 0.034, "num_input_tokens_seen": 108615312, "step": 50365 }, { "epoch": 9.243897962928978, "grad_norm": 0.11957718431949615, "learning_rate": 6.508759403959478e-06, "loss": 0.0006, "num_input_tokens_seen": 108626256, "step": 50370 }, { "epoch": 9.24481556248853, "grad_norm": 0.018856840208172798, "learning_rate": 6.507995954015654e-06, "loss": 0.001, "num_input_tokens_seen": 108637680, "step": 50375 }, { "epoch": 9.245733162048083, "grad_norm": 0.01687108725309372, "learning_rate": 6.507232465394069e-06, "loss": 0.0634, "num_input_tokens_seen": 108647120, "step": 50380 }, { "epoch": 9.246650761607635, "grad_norm": 0.000985958380624652, "learning_rate": 6.506468938114307e-06, "loss": 0.0003, "num_input_tokens_seen": 108658576, "step": 50385 }, { "epoch": 9.247568361167186, "grad_norm": 0.00088722218060866, "learning_rate": 6.505705372195954e-06, "loss": 0.0, "num_input_tokens_seen": 108668656, "step": 50390 }, { "epoch": 9.24848596072674, "grad_norm": 0.004190256353467703, "learning_rate": 6.50494176765859e-06, "loss": 0.0001, "num_input_tokens_seen": 108678960, "step": 50395 }, { "epoch": 9.249403560286291, "grad_norm": 0.006761828903108835, "learning_rate": 6.504178124521803e-06, "loss": 0.0001, "num_input_tokens_seen": 108688112, "step": 50400 }, { "epoch": 9.250321159845843, "grad_norm": 0.8159058690071106, "learning_rate": 6.5034144428051784e-06, "loss": 0.001, "num_input_tokens_seen": 108699312, "step": 50405 }, { "epoch": 9.251238759405396, "grad_norm": 0.013988303020596504, "learning_rate": 6.502650722528302e-06, "loss": 0.0006, "num_input_tokens_seen": 108711824, "step": 50410 }, { "epoch": 9.252156358964948, "grad_norm": 0.003901217831298709, "learning_rate": 6.5018869637107655e-06, "loss": 0.1565, "num_input_tokens_seen": 108721168, "step": 50415 }, { "epoch": 9.2530739585245, "grad_norm": 0.001490650582127273, "learning_rate": 6.501123166372154e-06, "loss": 0.0, "num_input_tokens_seen": 108732432, "step": 50420 }, { "epoch": 9.253991558084053, "grad_norm": 0.07955892384052277, "learning_rate": 6.500359330532062e-06, "loss": 0.0002, "num_input_tokens_seen": 108742864, "step": 50425 }, { "epoch": 9.254909157643604, "grad_norm": 0.0027901395224034786, "learning_rate": 6.499595456210077e-06, "loss": 0.001, "num_input_tokens_seen": 108753776, "step": 50430 }, { "epoch": 9.255826757203156, "grad_norm": 0.015366427600383759, "learning_rate": 6.498831543425793e-06, "loss": 0.0133, "num_input_tokens_seen": 108765136, "step": 50435 }, { "epoch": 9.25674435676271, "grad_norm": 0.12468339502811432, "learning_rate": 6.498067592198804e-06, "loss": 0.0002, "num_input_tokens_seen": 108775792, "step": 50440 }, { "epoch": 9.257661956322261, "grad_norm": 0.007338332012295723, "learning_rate": 6.497303602548701e-06, "loss": 0.0001, "num_input_tokens_seen": 108786320, "step": 50445 }, { "epoch": 9.258579555881813, "grad_norm": 0.001347395358607173, "learning_rate": 6.4965395744950825e-06, "loss": 0.0, "num_input_tokens_seen": 108797584, "step": 50450 }, { "epoch": 9.259497155441366, "grad_norm": 17.29195785522461, "learning_rate": 6.495775508057543e-06, "loss": 0.0043, "num_input_tokens_seen": 108808944, "step": 50455 }, { "epoch": 9.260414755000918, "grad_norm": 0.012220355682075024, "learning_rate": 6.49501140325568e-06, "loss": 0.3204, "num_input_tokens_seen": 108818992, "step": 50460 }, { "epoch": 9.26133235456047, "grad_norm": 0.010196713730692863, "learning_rate": 6.494247260109092e-06, "loss": 0.0004, "num_input_tokens_seen": 108829808, "step": 50465 }, { "epoch": 9.262249954120023, "grad_norm": 0.008968100883066654, "learning_rate": 6.4934830786373775e-06, "loss": 0.0564, "num_input_tokens_seen": 108840720, "step": 50470 }, { "epoch": 9.263167553679574, "grad_norm": 0.01262813899666071, "learning_rate": 6.492718858860135e-06, "loss": 0.0619, "num_input_tokens_seen": 108850672, "step": 50475 }, { "epoch": 9.264085153239126, "grad_norm": 0.06781743466854095, "learning_rate": 6.49195460079697e-06, "loss": 0.0002, "num_input_tokens_seen": 108860688, "step": 50480 }, { "epoch": 9.26500275279868, "grad_norm": 0.3774780035018921, "learning_rate": 6.491190304467481e-06, "loss": 0.0741, "num_input_tokens_seen": 108872080, "step": 50485 }, { "epoch": 9.265920352358231, "grad_norm": 0.020316172391176224, "learning_rate": 6.490425969891271e-06, "loss": 0.0001, "num_input_tokens_seen": 108883600, "step": 50490 }, { "epoch": 9.266837951917783, "grad_norm": 1.0175731182098389, "learning_rate": 6.489661597087945e-06, "loss": 0.001, "num_input_tokens_seen": 108893712, "step": 50495 }, { "epoch": 9.267755551477336, "grad_norm": 0.0027026801835745573, "learning_rate": 6.4888971860771075e-06, "loss": 0.0002, "num_input_tokens_seen": 108904080, "step": 50500 }, { "epoch": 9.268673151036888, "grad_norm": 0.09618313610553741, "learning_rate": 6.4881327368783655e-06, "loss": 0.1386, "num_input_tokens_seen": 108914256, "step": 50505 }, { "epoch": 9.26959075059644, "grad_norm": 0.0025101087521761656, "learning_rate": 6.487368249511324e-06, "loss": 0.008, "num_input_tokens_seen": 108925872, "step": 50510 }, { "epoch": 9.270508350155993, "grad_norm": 0.12813887000083923, "learning_rate": 6.486603723995595e-06, "loss": 0.0001, "num_input_tokens_seen": 108937808, "step": 50515 }, { "epoch": 9.271425949715544, "grad_norm": 0.018562914803624153, "learning_rate": 6.485839160350782e-06, "loss": 0.11, "num_input_tokens_seen": 108948976, "step": 50520 }, { "epoch": 9.272343549275096, "grad_norm": 0.00871296040713787, "learning_rate": 6.485074558596498e-06, "loss": 0.0005, "num_input_tokens_seen": 108959696, "step": 50525 }, { "epoch": 9.27326114883465, "grad_norm": 0.061568278819322586, "learning_rate": 6.484309918752353e-06, "loss": 0.0675, "num_input_tokens_seen": 108970864, "step": 50530 }, { "epoch": 9.2741787483942, "grad_norm": 0.0008110888884402812, "learning_rate": 6.483545240837959e-06, "loss": 0.248, "num_input_tokens_seen": 108979824, "step": 50535 }, { "epoch": 9.275096347953752, "grad_norm": 11.913747787475586, "learning_rate": 6.48278052487293e-06, "loss": 0.0821, "num_input_tokens_seen": 108991888, "step": 50540 }, { "epoch": 9.276013947513306, "grad_norm": 0.03442000970244408, "learning_rate": 6.4820157708768775e-06, "loss": 0.0948, "num_input_tokens_seen": 109002768, "step": 50545 }, { "epoch": 9.276931547072857, "grad_norm": 0.24172818660736084, "learning_rate": 6.4812509788694174e-06, "loss": 0.0055, "num_input_tokens_seen": 109013328, "step": 50550 }, { "epoch": 9.27784914663241, "grad_norm": 0.004464615602046251, "learning_rate": 6.4804861488701665e-06, "loss": 0.0001, "num_input_tokens_seen": 109024304, "step": 50555 }, { "epoch": 9.278766746191963, "grad_norm": 0.020850403234362602, "learning_rate": 6.4797212808987385e-06, "loss": 0.0247, "num_input_tokens_seen": 109036112, "step": 50560 }, { "epoch": 9.279684345751514, "grad_norm": 0.04187804460525513, "learning_rate": 6.478956374974755e-06, "loss": 0.0059, "num_input_tokens_seen": 109046384, "step": 50565 }, { "epoch": 9.280601945311066, "grad_norm": 0.0014249020023271441, "learning_rate": 6.478191431117832e-06, "loss": 0.0006, "num_input_tokens_seen": 109057264, "step": 50570 }, { "epoch": 9.28151954487062, "grad_norm": 0.14438143372535706, "learning_rate": 6.477426449347589e-06, "loss": 0.0004, "num_input_tokens_seen": 109068368, "step": 50575 }, { "epoch": 9.28243714443017, "grad_norm": 0.008090836927294731, "learning_rate": 6.476661429683649e-06, "loss": 0.0625, "num_input_tokens_seen": 109079728, "step": 50580 }, { "epoch": 9.283354743989722, "grad_norm": 0.3171388804912567, "learning_rate": 6.475896372145629e-06, "loss": 0.0007, "num_input_tokens_seen": 109090288, "step": 50585 }, { "epoch": 9.284272343549276, "grad_norm": 0.06597118079662323, "learning_rate": 6.475131276753157e-06, "loss": 0.0003, "num_input_tokens_seen": 109101328, "step": 50590 }, { "epoch": 9.285189943108827, "grad_norm": 0.004466333892196417, "learning_rate": 6.474366143525853e-06, "loss": 0.0007, "num_input_tokens_seen": 109113776, "step": 50595 }, { "epoch": 9.286107542668379, "grad_norm": 0.05428476259112358, "learning_rate": 6.473600972483344e-06, "loss": 0.1098, "num_input_tokens_seen": 109125808, "step": 50600 }, { "epoch": 9.287025142227932, "grad_norm": 0.00843102764338255, "learning_rate": 6.472835763645252e-06, "loss": 0.1441, "num_input_tokens_seen": 109136688, "step": 50605 }, { "epoch": 9.287942741787484, "grad_norm": 0.04663638025522232, "learning_rate": 6.472070517031206e-06, "loss": 0.0001, "num_input_tokens_seen": 109148240, "step": 50610 }, { "epoch": 9.288860341347036, "grad_norm": 0.011514549143612385, "learning_rate": 6.471305232660833e-06, "loss": 0.0004, "num_input_tokens_seen": 109159824, "step": 50615 }, { "epoch": 9.289777940906589, "grad_norm": 0.01005371380597353, "learning_rate": 6.47053991055376e-06, "loss": 0.0004, "num_input_tokens_seen": 109169904, "step": 50620 }, { "epoch": 9.29069554046614, "grad_norm": 0.08896877616643906, "learning_rate": 6.4697745507296194e-06, "loss": 0.0003, "num_input_tokens_seen": 109180048, "step": 50625 }, { "epoch": 9.291613140025692, "grad_norm": 0.0011738869361579418, "learning_rate": 6.469009153208038e-06, "loss": 0.0001, "num_input_tokens_seen": 109190128, "step": 50630 }, { "epoch": 9.292530739585246, "grad_norm": 0.001825876533985138, "learning_rate": 6.46824371800865e-06, "loss": 0.0007, "num_input_tokens_seen": 109201520, "step": 50635 }, { "epoch": 9.293448339144797, "grad_norm": 19.516653060913086, "learning_rate": 6.4674782451510845e-06, "loss": 0.2416, "num_input_tokens_seen": 109212720, "step": 50640 }, { "epoch": 9.294365938704349, "grad_norm": 0.0008976130047813058, "learning_rate": 6.4667127346549785e-06, "loss": 0.0008, "num_input_tokens_seen": 109222608, "step": 50645 }, { "epoch": 9.295283538263902, "grad_norm": 0.07763324677944183, "learning_rate": 6.465947186539962e-06, "loss": 0.0001, "num_input_tokens_seen": 109233680, "step": 50650 }, { "epoch": 9.296201137823454, "grad_norm": 0.005673196632415056, "learning_rate": 6.465181600825673e-06, "loss": 0.0001, "num_input_tokens_seen": 109243984, "step": 50655 }, { "epoch": 9.297118737383006, "grad_norm": 79.50189208984375, "learning_rate": 6.464415977531746e-06, "loss": 0.2861, "num_input_tokens_seen": 109254736, "step": 50660 }, { "epoch": 9.298036336942559, "grad_norm": 0.002731233835220337, "learning_rate": 6.4636503166778206e-06, "loss": 0.0005, "num_input_tokens_seen": 109265552, "step": 50665 }, { "epoch": 9.29895393650211, "grad_norm": 0.08974220603704453, "learning_rate": 6.462884618283531e-06, "loss": 0.1007, "num_input_tokens_seen": 109277104, "step": 50670 }, { "epoch": 9.299871536061662, "grad_norm": 0.0013623374979943037, "learning_rate": 6.4621188823685195e-06, "loss": 0.0003, "num_input_tokens_seen": 109288048, "step": 50675 }, { "epoch": 9.300789135621216, "grad_norm": 0.05725152790546417, "learning_rate": 6.4613531089524236e-06, "loss": 0.0007, "num_input_tokens_seen": 109298064, "step": 50680 }, { "epoch": 9.301706735180767, "grad_norm": 0.017566680908203125, "learning_rate": 6.460587298054887e-06, "loss": 0.0001, "num_input_tokens_seen": 109307984, "step": 50685 }, { "epoch": 9.302624334740319, "grad_norm": 0.005119428504258394, "learning_rate": 6.459821449695549e-06, "loss": 0.031, "num_input_tokens_seen": 109318544, "step": 50690 }, { "epoch": 9.303541934299872, "grad_norm": 0.051147833466529846, "learning_rate": 6.459055563894053e-06, "loss": 0.1069, "num_input_tokens_seen": 109329584, "step": 50695 }, { "epoch": 9.304459533859424, "grad_norm": 0.003893848741427064, "learning_rate": 6.458289640670044e-06, "loss": 0.4861, "num_input_tokens_seen": 109340336, "step": 50700 }, { "epoch": 9.305377133418975, "grad_norm": 0.014009376987814903, "learning_rate": 6.457523680043165e-06, "loss": 0.0004, "num_input_tokens_seen": 109351440, "step": 50705 }, { "epoch": 9.306294732978529, "grad_norm": 0.0009732721373438835, "learning_rate": 6.456757682033063e-06, "loss": 0.0, "num_input_tokens_seen": 109361456, "step": 50710 }, { "epoch": 9.30721233253808, "grad_norm": 0.1642855852842331, "learning_rate": 6.455991646659384e-06, "loss": 0.0003, "num_input_tokens_seen": 109371568, "step": 50715 }, { "epoch": 9.308129932097632, "grad_norm": 0.005537243559956551, "learning_rate": 6.455225573941776e-06, "loss": 0.2191, "num_input_tokens_seen": 109382448, "step": 50720 }, { "epoch": 9.309047531657185, "grad_norm": 0.01119029987603426, "learning_rate": 6.454459463899887e-06, "loss": 0.0003, "num_input_tokens_seen": 109392688, "step": 50725 }, { "epoch": 9.309965131216737, "grad_norm": 0.08756005018949509, "learning_rate": 6.453693316553368e-06, "loss": 0.0001, "num_input_tokens_seen": 109404080, "step": 50730 }, { "epoch": 9.310882730776289, "grad_norm": 19.177459716796875, "learning_rate": 6.452927131921868e-06, "loss": 0.3952, "num_input_tokens_seen": 109414768, "step": 50735 }, { "epoch": 9.311800330335842, "grad_norm": 0.002427661558613181, "learning_rate": 6.452160910025038e-06, "loss": 0.0012, "num_input_tokens_seen": 109426352, "step": 50740 }, { "epoch": 9.312717929895394, "grad_norm": 23.340408325195312, "learning_rate": 6.451394650882532e-06, "loss": 0.0215, "num_input_tokens_seen": 109437232, "step": 50745 }, { "epoch": 9.313635529454945, "grad_norm": 0.004203807562589645, "learning_rate": 6.450628354514004e-06, "loss": 0.0003, "num_input_tokens_seen": 109448272, "step": 50750 }, { "epoch": 9.314553129014499, "grad_norm": 0.033519357442855835, "learning_rate": 6.449862020939105e-06, "loss": 0.0002, "num_input_tokens_seen": 109459440, "step": 50755 }, { "epoch": 9.31547072857405, "grad_norm": 0.8254947066307068, "learning_rate": 6.449095650177494e-06, "loss": 0.0036, "num_input_tokens_seen": 109469680, "step": 50760 }, { "epoch": 9.316388328133602, "grad_norm": 0.4164324402809143, "learning_rate": 6.448329242248823e-06, "loss": 0.0017, "num_input_tokens_seen": 109480432, "step": 50765 }, { "epoch": 9.317305927693155, "grad_norm": 0.0015586444642394781, "learning_rate": 6.447562797172753e-06, "loss": 0.1794, "num_input_tokens_seen": 109490768, "step": 50770 }, { "epoch": 9.318223527252707, "grad_norm": 0.19300870597362518, "learning_rate": 6.446796314968942e-06, "loss": 0.0177, "num_input_tokens_seen": 109500944, "step": 50775 }, { "epoch": 9.319141126812259, "grad_norm": 0.003977919928729534, "learning_rate": 6.446029795657045e-06, "loss": 0.0001, "num_input_tokens_seen": 109511216, "step": 50780 }, { "epoch": 9.320058726371812, "grad_norm": 0.07619259506464005, "learning_rate": 6.445263239256727e-06, "loss": 0.0011, "num_input_tokens_seen": 109523312, "step": 50785 }, { "epoch": 9.320976325931364, "grad_norm": 12.428388595581055, "learning_rate": 6.444496645787647e-06, "loss": 0.0982, "num_input_tokens_seen": 109533680, "step": 50790 }, { "epoch": 9.321893925490915, "grad_norm": 0.004587163217365742, "learning_rate": 6.443730015269465e-06, "loss": 0.0002, "num_input_tokens_seen": 109544048, "step": 50795 }, { "epoch": 9.322811525050469, "grad_norm": 0.019911225885152817, "learning_rate": 6.4429633477218475e-06, "loss": 0.2108, "num_input_tokens_seen": 109555344, "step": 50800 }, { "epoch": 9.32372912461002, "grad_norm": 38.09901428222656, "learning_rate": 6.442196643164455e-06, "loss": 0.0462, "num_input_tokens_seen": 109566224, "step": 50805 }, { "epoch": 9.324646724169572, "grad_norm": 0.004762943368405104, "learning_rate": 6.441429901616956e-06, "loss": 0.0007, "num_input_tokens_seen": 109578160, "step": 50810 }, { "epoch": 9.325564323729125, "grad_norm": 0.0011657021241262555, "learning_rate": 6.440663123099012e-06, "loss": 0.0001, "num_input_tokens_seen": 109589296, "step": 50815 }, { "epoch": 9.326481923288677, "grad_norm": 17.88300323486328, "learning_rate": 6.439896307630293e-06, "loss": 0.0873, "num_input_tokens_seen": 109599536, "step": 50820 }, { "epoch": 9.327399522848228, "grad_norm": 45.17243957519531, "learning_rate": 6.439129455230465e-06, "loss": 0.0245, "num_input_tokens_seen": 109610480, "step": 50825 }, { "epoch": 9.328317122407782, "grad_norm": 0.06280584633350372, "learning_rate": 6.4383625659191964e-06, "loss": 0.0002, "num_input_tokens_seen": 109620208, "step": 50830 }, { "epoch": 9.329234721967333, "grad_norm": 0.0005980361602269113, "learning_rate": 6.437595639716158e-06, "loss": 0.0028, "num_input_tokens_seen": 109631248, "step": 50835 }, { "epoch": 9.330152321526885, "grad_norm": 0.009977659210562706, "learning_rate": 6.436828676641018e-06, "loss": 0.0645, "num_input_tokens_seen": 109642608, "step": 50840 }, { "epoch": 9.331069921086439, "grad_norm": 0.0030449614860117435, "learning_rate": 6.436061676713451e-06, "loss": 0.0001, "num_input_tokens_seen": 109653200, "step": 50845 }, { "epoch": 9.33198752064599, "grad_norm": 29.745132446289062, "learning_rate": 6.435294639953127e-06, "loss": 0.1658, "num_input_tokens_seen": 109663120, "step": 50850 }, { "epoch": 9.332905120205542, "grad_norm": 0.24628007411956787, "learning_rate": 6.43452756637972e-06, "loss": 0.0005, "num_input_tokens_seen": 109673392, "step": 50855 }, { "epoch": 9.333822719765095, "grad_norm": 32.42573547363281, "learning_rate": 6.433760456012905e-06, "loss": 0.0023, "num_input_tokens_seen": 109684880, "step": 50860 }, { "epoch": 9.334740319324647, "grad_norm": 0.032721538096666336, "learning_rate": 6.432993308872356e-06, "loss": 0.1252, "num_input_tokens_seen": 109696240, "step": 50865 }, { "epoch": 9.335657918884198, "grad_norm": 0.007454592268913984, "learning_rate": 6.43222612497775e-06, "loss": 0.0001, "num_input_tokens_seen": 109707376, "step": 50870 }, { "epoch": 9.336575518443752, "grad_norm": 0.006743564270436764, "learning_rate": 6.431458904348762e-06, "loss": 0.0001, "num_input_tokens_seen": 109718800, "step": 50875 }, { "epoch": 9.337493118003303, "grad_norm": 0.0016931730788201094, "learning_rate": 6.430691647005072e-06, "loss": 0.0313, "num_input_tokens_seen": 109730576, "step": 50880 }, { "epoch": 9.338410717562855, "grad_norm": 0.013173690997064114, "learning_rate": 6.42992435296636e-06, "loss": 0.0001, "num_input_tokens_seen": 109741776, "step": 50885 }, { "epoch": 9.339328317122408, "grad_norm": 0.00173751765396446, "learning_rate": 6.4291570222523035e-06, "loss": 0.0001, "num_input_tokens_seen": 109752592, "step": 50890 }, { "epoch": 9.34024591668196, "grad_norm": 0.08590757101774216, "learning_rate": 6.4283896548825856e-06, "loss": 0.0001, "num_input_tokens_seen": 109764528, "step": 50895 }, { "epoch": 9.341163516241512, "grad_norm": 0.0012121633626520634, "learning_rate": 6.427622250876885e-06, "loss": 0.0, "num_input_tokens_seen": 109776080, "step": 50900 }, { "epoch": 9.342081115801065, "grad_norm": 0.005934036336839199, "learning_rate": 6.426854810254887e-06, "loss": 0.0001, "num_input_tokens_seen": 109785296, "step": 50905 }, { "epoch": 9.342998715360617, "grad_norm": 0.030145179480314255, "learning_rate": 6.426087333036275e-06, "loss": 0.0058, "num_input_tokens_seen": 109796464, "step": 50910 }, { "epoch": 9.343916314920168, "grad_norm": 0.07589811086654663, "learning_rate": 6.425319819240733e-06, "loss": 0.033, "num_input_tokens_seen": 109806096, "step": 50915 }, { "epoch": 9.344833914479722, "grad_norm": 0.43936845660209656, "learning_rate": 6.424552268887947e-06, "loss": 0.0006, "num_input_tokens_seen": 109816528, "step": 50920 }, { "epoch": 9.345751514039273, "grad_norm": 0.002063305350020528, "learning_rate": 6.423784681997602e-06, "loss": 0.1379, "num_input_tokens_seen": 109827216, "step": 50925 }, { "epoch": 9.346669113598825, "grad_norm": 0.0019147306447848678, "learning_rate": 6.423017058589387e-06, "loss": 0.0, "num_input_tokens_seen": 109837872, "step": 50930 }, { "epoch": 9.347586713158378, "grad_norm": 0.007877747528254986, "learning_rate": 6.4222493986829906e-06, "loss": 0.0479, "num_input_tokens_seen": 109849872, "step": 50935 }, { "epoch": 9.34850431271793, "grad_norm": 0.030081579461693764, "learning_rate": 6.4214817022981e-06, "loss": 0.0823, "num_input_tokens_seen": 109861072, "step": 50940 }, { "epoch": 9.349421912277482, "grad_norm": 0.004975346848368645, "learning_rate": 6.420713969454408e-06, "loss": 0.0006, "num_input_tokens_seen": 109871792, "step": 50945 }, { "epoch": 9.350339511837035, "grad_norm": 0.0034032114781439304, "learning_rate": 6.419946200171605e-06, "loss": 0.0007, "num_input_tokens_seen": 109882288, "step": 50950 }, { "epoch": 9.351257111396587, "grad_norm": 0.04627969488501549, "learning_rate": 6.419178394469383e-06, "loss": 0.2962, "num_input_tokens_seen": 109893328, "step": 50955 }, { "epoch": 9.352174710956138, "grad_norm": 0.06786484271287918, "learning_rate": 6.418410552367433e-06, "loss": 0.0025, "num_input_tokens_seen": 109904656, "step": 50960 }, { "epoch": 9.353092310515692, "grad_norm": 0.012943233363330364, "learning_rate": 6.417642673885452e-06, "loss": 0.0017, "num_input_tokens_seen": 109915824, "step": 50965 }, { "epoch": 9.354009910075243, "grad_norm": 0.08738356828689575, "learning_rate": 6.416874759043133e-06, "loss": 0.0004, "num_input_tokens_seen": 109926032, "step": 50970 }, { "epoch": 9.354927509634795, "grad_norm": 0.0049121566116809845, "learning_rate": 6.416106807860173e-06, "loss": 0.0001, "num_input_tokens_seen": 109937168, "step": 50975 }, { "epoch": 9.355845109194348, "grad_norm": 36.4359130859375, "learning_rate": 6.415338820356267e-06, "loss": 0.1098, "num_input_tokens_seen": 109946320, "step": 50980 }, { "epoch": 9.3567627087539, "grad_norm": 0.036741744726896286, "learning_rate": 6.414570796551115e-06, "loss": 0.003, "num_input_tokens_seen": 109956304, "step": 50985 }, { "epoch": 9.357680308313451, "grad_norm": 0.00429151114076376, "learning_rate": 6.413802736464414e-06, "loss": 0.1498, "num_input_tokens_seen": 109967376, "step": 50990 }, { "epoch": 9.358597907873005, "grad_norm": 0.0017214082181453705, "learning_rate": 6.413034640115864e-06, "loss": 0.0886, "num_input_tokens_seen": 109978672, "step": 50995 }, { "epoch": 9.359515507432556, "grad_norm": 73.52446746826172, "learning_rate": 6.412266507525165e-06, "loss": 0.2063, "num_input_tokens_seen": 109989072, "step": 51000 }, { "epoch": 9.360433106992108, "grad_norm": 0.017497768625617027, "learning_rate": 6.4114983387120185e-06, "loss": 0.0001, "num_input_tokens_seen": 110000176, "step": 51005 }, { "epoch": 9.361350706551661, "grad_norm": 0.004018696956336498, "learning_rate": 6.410730133696128e-06, "loss": 0.0004, "num_input_tokens_seen": 110012528, "step": 51010 }, { "epoch": 9.362268306111213, "grad_norm": 0.0010079165222123265, "learning_rate": 6.409961892497196e-06, "loss": 0.056, "num_input_tokens_seen": 110023216, "step": 51015 }, { "epoch": 9.363185905670765, "grad_norm": 0.0033521889708936214, "learning_rate": 6.409193615134928e-06, "loss": 0.0008, "num_input_tokens_seen": 110033360, "step": 51020 }, { "epoch": 9.364103505230318, "grad_norm": 0.0029822569340467453, "learning_rate": 6.408425301629026e-06, "loss": 0.0002, "num_input_tokens_seen": 110043504, "step": 51025 }, { "epoch": 9.36502110478987, "grad_norm": 22.220458984375, "learning_rate": 6.407656951999198e-06, "loss": 0.2661, "num_input_tokens_seen": 110054768, "step": 51030 }, { "epoch": 9.365938704349421, "grad_norm": 0.007606103550642729, "learning_rate": 6.406888566265152e-06, "loss": 0.0617, "num_input_tokens_seen": 110065616, "step": 51035 }, { "epoch": 9.366856303908975, "grad_norm": 0.01932719722390175, "learning_rate": 6.406120144446593e-06, "loss": 0.0007, "num_input_tokens_seen": 110077104, "step": 51040 }, { "epoch": 9.367773903468526, "grad_norm": 0.003701648209244013, "learning_rate": 6.405351686563233e-06, "loss": 0.0008, "num_input_tokens_seen": 110086512, "step": 51045 }, { "epoch": 9.368691503028078, "grad_norm": 0.011425146833062172, "learning_rate": 6.404583192634779e-06, "loss": 0.0013, "num_input_tokens_seen": 110097392, "step": 51050 }, { "epoch": 9.369609102587631, "grad_norm": 0.013730993494391441, "learning_rate": 6.403814662680945e-06, "loss": 0.1097, "num_input_tokens_seen": 110107504, "step": 51055 }, { "epoch": 9.370526702147183, "grad_norm": 0.03148520365357399, "learning_rate": 6.403046096721439e-06, "loss": 0.0001, "num_input_tokens_seen": 110118224, "step": 51060 }, { "epoch": 9.371444301706735, "grad_norm": 4.548114776611328, "learning_rate": 6.402277494775977e-06, "loss": 0.1578, "num_input_tokens_seen": 110128720, "step": 51065 }, { "epoch": 9.372361901266288, "grad_norm": 0.0020019840449094772, "learning_rate": 6.401508856864268e-06, "loss": 0.0008, "num_input_tokens_seen": 110138704, "step": 51070 }, { "epoch": 9.37327950082584, "grad_norm": 0.020664280280470848, "learning_rate": 6.400740183006031e-06, "loss": 0.0002, "num_input_tokens_seen": 110149456, "step": 51075 }, { "epoch": 9.374197100385391, "grad_norm": 0.02971569076180458, "learning_rate": 6.39997147322098e-06, "loss": 0.0001, "num_input_tokens_seen": 110159344, "step": 51080 }, { "epoch": 9.375114699944945, "grad_norm": 0.01733584888279438, "learning_rate": 6.399202727528828e-06, "loss": 0.0001, "num_input_tokens_seen": 110170320, "step": 51085 }, { "epoch": 9.376032299504496, "grad_norm": 383.40386962890625, "learning_rate": 6.398433945949295e-06, "loss": 0.2442, "num_input_tokens_seen": 110180400, "step": 51090 }, { "epoch": 9.376949899064048, "grad_norm": 0.03154735639691353, "learning_rate": 6.397665128502099e-06, "loss": 0.0003, "num_input_tokens_seen": 110191504, "step": 51095 }, { "epoch": 9.377867498623601, "grad_norm": 0.03190446272492409, "learning_rate": 6.39689627520696e-06, "loss": 0.0004, "num_input_tokens_seen": 110202832, "step": 51100 }, { "epoch": 9.378785098183153, "grad_norm": 0.4837281405925751, "learning_rate": 6.396127386083595e-06, "loss": 0.0004, "num_input_tokens_seen": 110212240, "step": 51105 }, { "epoch": 9.379702697742704, "grad_norm": 0.0005615550908260047, "learning_rate": 6.395358461151726e-06, "loss": 0.001, "num_input_tokens_seen": 110223600, "step": 51110 }, { "epoch": 9.380620297302258, "grad_norm": 0.29706719517707825, "learning_rate": 6.394589500431076e-06, "loss": 0.3505, "num_input_tokens_seen": 110234640, "step": 51115 }, { "epoch": 9.38153789686181, "grad_norm": 0.006130731664597988, "learning_rate": 6.393820503941367e-06, "loss": 0.0003, "num_input_tokens_seen": 110245008, "step": 51120 }, { "epoch": 9.382455496421361, "grad_norm": 0.02502710558474064, "learning_rate": 6.393051471702322e-06, "loss": 0.0006, "num_input_tokens_seen": 110256176, "step": 51125 }, { "epoch": 9.383373095980915, "grad_norm": 0.019146181643009186, "learning_rate": 6.3922824037336665e-06, "loss": 0.1471, "num_input_tokens_seen": 110265552, "step": 51130 }, { "epoch": 9.384290695540466, "grad_norm": 0.02428695745766163, "learning_rate": 6.391513300055123e-06, "loss": 0.0001, "num_input_tokens_seen": 110276016, "step": 51135 }, { "epoch": 9.385208295100018, "grad_norm": 0.03626156598329544, "learning_rate": 6.390744160686422e-06, "loss": 0.0004, "num_input_tokens_seen": 110287088, "step": 51140 }, { "epoch": 9.386125894659571, "grad_norm": 0.0008468659943901002, "learning_rate": 6.389974985647288e-06, "loss": 0.0246, "num_input_tokens_seen": 110297200, "step": 51145 }, { "epoch": 9.387043494219123, "grad_norm": 0.020098989829421043, "learning_rate": 6.38920577495745e-06, "loss": 0.0007, "num_input_tokens_seen": 110307888, "step": 51150 }, { "epoch": 9.387961093778674, "grad_norm": 0.005968411453068256, "learning_rate": 6.388436528636637e-06, "loss": 0.0002, "num_input_tokens_seen": 110318288, "step": 51155 }, { "epoch": 9.388878693338228, "grad_norm": 0.04877866804599762, "learning_rate": 6.387667246704579e-06, "loss": 0.1876, "num_input_tokens_seen": 110329552, "step": 51160 }, { "epoch": 9.38979629289778, "grad_norm": 0.003975698724389076, "learning_rate": 6.386897929181006e-06, "loss": 0.0012, "num_input_tokens_seen": 110340720, "step": 51165 }, { "epoch": 9.390713892457331, "grad_norm": 71.29981994628906, "learning_rate": 6.386128576085652e-06, "loss": 0.1571, "num_input_tokens_seen": 110350384, "step": 51170 }, { "epoch": 9.391631492016884, "grad_norm": 0.0801423043012619, "learning_rate": 6.385359187438248e-06, "loss": 0.0005, "num_input_tokens_seen": 110361648, "step": 51175 }, { "epoch": 9.392549091576436, "grad_norm": 0.021596670150756836, "learning_rate": 6.384589763258526e-06, "loss": 0.1844, "num_input_tokens_seen": 110373712, "step": 51180 }, { "epoch": 9.393466691135988, "grad_norm": 0.09424211084842682, "learning_rate": 6.383820303566226e-06, "loss": 0.0004, "num_input_tokens_seen": 110384688, "step": 51185 }, { "epoch": 9.394384290695541, "grad_norm": 0.021922336891293526, "learning_rate": 6.383050808381079e-06, "loss": 0.0231, "num_input_tokens_seen": 110394800, "step": 51190 }, { "epoch": 9.395301890255093, "grad_norm": 65.26525115966797, "learning_rate": 6.382281277722819e-06, "loss": 0.0393, "num_input_tokens_seen": 110404432, "step": 51195 }, { "epoch": 9.396219489814644, "grad_norm": 2.09218692779541, "learning_rate": 6.381511711611189e-06, "loss": 0.0006, "num_input_tokens_seen": 110415728, "step": 51200 }, { "epoch": 9.397137089374198, "grad_norm": 0.004387761000543833, "learning_rate": 6.380742110065925e-06, "loss": 0.0004, "num_input_tokens_seen": 110427120, "step": 51205 }, { "epoch": 9.39805468893375, "grad_norm": 0.0015436812536790967, "learning_rate": 6.3799724731067654e-06, "loss": 0.0005, "num_input_tokens_seen": 110438000, "step": 51210 }, { "epoch": 9.398972288493301, "grad_norm": 0.012312573380768299, "learning_rate": 6.379202800753451e-06, "loss": 0.0086, "num_input_tokens_seen": 110447440, "step": 51215 }, { "epoch": 9.399889888052854, "grad_norm": 0.006240768823772669, "learning_rate": 6.378433093025722e-06, "loss": 0.3286, "num_input_tokens_seen": 110457968, "step": 51220 }, { "epoch": 9.400807487612406, "grad_norm": 0.1852264702320099, "learning_rate": 6.377663349943319e-06, "loss": 0.0002, "num_input_tokens_seen": 110468560, "step": 51225 }, { "epoch": 9.401725087171958, "grad_norm": 0.09053260833024979, "learning_rate": 6.376893571525989e-06, "loss": 0.0003, "num_input_tokens_seen": 110479408, "step": 51230 }, { "epoch": 9.402642686731511, "grad_norm": 0.0030260407365858555, "learning_rate": 6.376123757793472e-06, "loss": 0.0005, "num_input_tokens_seen": 110490000, "step": 51235 }, { "epoch": 9.403560286291063, "grad_norm": 0.011380285024642944, "learning_rate": 6.375353908765514e-06, "loss": 0.0004, "num_input_tokens_seen": 110500880, "step": 51240 }, { "epoch": 9.404477885850614, "grad_norm": 0.01698719896376133, "learning_rate": 6.374584024461859e-06, "loss": 0.0706, "num_input_tokens_seen": 110512016, "step": 51245 }, { "epoch": 9.405395485410168, "grad_norm": 0.0010095100151374936, "learning_rate": 6.373814104902253e-06, "loss": 0.0041, "num_input_tokens_seen": 110522672, "step": 51250 }, { "epoch": 9.40631308496972, "grad_norm": 0.008016029372811317, "learning_rate": 6.373044150106446e-06, "loss": 0.0006, "num_input_tokens_seen": 110532784, "step": 51255 }, { "epoch": 9.40723068452927, "grad_norm": 0.15914735198020935, "learning_rate": 6.372274160094183e-06, "loss": 0.0764, "num_input_tokens_seen": 110543344, "step": 51260 }, { "epoch": 9.408148284088824, "grad_norm": 0.024641647934913635, "learning_rate": 6.371504134885217e-06, "loss": 0.1346, "num_input_tokens_seen": 110554704, "step": 51265 }, { "epoch": 9.409065883648376, "grad_norm": 1.065953016281128, "learning_rate": 6.370734074499294e-06, "loss": 0.0768, "num_input_tokens_seen": 110565936, "step": 51270 }, { "epoch": 9.409983483207927, "grad_norm": 0.037884362041950226, "learning_rate": 6.369963978956168e-06, "loss": 0.0003, "num_input_tokens_seen": 110576816, "step": 51275 }, { "epoch": 9.41090108276748, "grad_norm": 0.00971068162471056, "learning_rate": 6.369193848275587e-06, "loss": 0.0002, "num_input_tokens_seen": 110588880, "step": 51280 }, { "epoch": 9.411818682327032, "grad_norm": 0.04416518285870552, "learning_rate": 6.368423682477307e-06, "loss": 0.0001, "num_input_tokens_seen": 110598960, "step": 51285 }, { "epoch": 9.412736281886584, "grad_norm": 22.876766204833984, "learning_rate": 6.367653481581081e-06, "loss": 0.0947, "num_input_tokens_seen": 110610576, "step": 51290 }, { "epoch": 9.413653881446137, "grad_norm": 0.0010212231427431107, "learning_rate": 6.366883245606661e-06, "loss": 0.2095, "num_input_tokens_seen": 110621040, "step": 51295 }, { "epoch": 9.414571481005689, "grad_norm": 0.005725717172026634, "learning_rate": 6.366112974573806e-06, "loss": 0.0705, "num_input_tokens_seen": 110631664, "step": 51300 }, { "epoch": 9.41548908056524, "grad_norm": 0.004966412670910358, "learning_rate": 6.36534266850227e-06, "loss": 0.0019, "num_input_tokens_seen": 110642128, "step": 51305 }, { "epoch": 9.416406680124794, "grad_norm": 13.226118087768555, "learning_rate": 6.36457232741181e-06, "loss": 0.1118, "num_input_tokens_seen": 110654000, "step": 51310 }, { "epoch": 9.417324279684346, "grad_norm": 0.00936198141425848, "learning_rate": 6.363801951322186e-06, "loss": 0.0002, "num_input_tokens_seen": 110664720, "step": 51315 }, { "epoch": 9.418241879243897, "grad_norm": 0.009938906878232956, "learning_rate": 6.363031540253154e-06, "loss": 0.0072, "num_input_tokens_seen": 110675824, "step": 51320 }, { "epoch": 9.41915947880345, "grad_norm": 0.06429179012775421, "learning_rate": 6.362261094224477e-06, "loss": 0.1038, "num_input_tokens_seen": 110685904, "step": 51325 }, { "epoch": 9.420077078363002, "grad_norm": 0.012085911817848682, "learning_rate": 6.361490613255913e-06, "loss": 0.2697, "num_input_tokens_seen": 110697168, "step": 51330 }, { "epoch": 9.420994677922554, "grad_norm": 0.23674596846103668, "learning_rate": 6.360720097367225e-06, "loss": 0.194, "num_input_tokens_seen": 110709200, "step": 51335 }, { "epoch": 9.421912277482107, "grad_norm": 0.015848109498620033, "learning_rate": 6.359949546578176e-06, "loss": 0.0005, "num_input_tokens_seen": 110720432, "step": 51340 }, { "epoch": 9.422829877041659, "grad_norm": 0.7363243103027344, "learning_rate": 6.359178960908528e-06, "loss": 0.0012, "num_input_tokens_seen": 110731376, "step": 51345 }, { "epoch": 9.42374747660121, "grad_norm": 0.010061759501695633, "learning_rate": 6.358408340378049e-06, "loss": 0.3099, "num_input_tokens_seen": 110743056, "step": 51350 }, { "epoch": 9.424665076160764, "grad_norm": 0.0012888340279459953, "learning_rate": 6.357637685006498e-06, "loss": 0.0001, "num_input_tokens_seen": 110754416, "step": 51355 }, { "epoch": 9.425582675720316, "grad_norm": 0.07259954512119293, "learning_rate": 6.356866994813645e-06, "loss": 0.0005, "num_input_tokens_seen": 110764880, "step": 51360 }, { "epoch": 9.426500275279867, "grad_norm": 0.005800305865705013, "learning_rate": 6.356096269819259e-06, "loss": 0.0377, "num_input_tokens_seen": 110775536, "step": 51365 }, { "epoch": 9.42741787483942, "grad_norm": 0.014408726245164871, "learning_rate": 6.3553255100431025e-06, "loss": 0.0004, "num_input_tokens_seen": 110787088, "step": 51370 }, { "epoch": 9.428335474398972, "grad_norm": 0.03471427038311958, "learning_rate": 6.354554715504949e-06, "loss": 0.0975, "num_input_tokens_seen": 110797520, "step": 51375 }, { "epoch": 9.429253073958524, "grad_norm": 0.005447549745440483, "learning_rate": 6.353783886224565e-06, "loss": 0.0098, "num_input_tokens_seen": 110808176, "step": 51380 }, { "epoch": 9.430170673518077, "grad_norm": 0.04685390368103981, "learning_rate": 6.3530130222217244e-06, "loss": 0.0003, "num_input_tokens_seen": 110820336, "step": 51385 }, { "epoch": 9.431088273077629, "grad_norm": 0.036693163216114044, "learning_rate": 6.352242123516196e-06, "loss": 0.0001, "num_input_tokens_seen": 110831792, "step": 51390 }, { "epoch": 9.43200587263718, "grad_norm": 0.010480073280632496, "learning_rate": 6.351471190127753e-06, "loss": 0.0004, "num_input_tokens_seen": 110843248, "step": 51395 }, { "epoch": 9.432923472196734, "grad_norm": 0.36059051752090454, "learning_rate": 6.35070022207617e-06, "loss": 0.1222, "num_input_tokens_seen": 110854896, "step": 51400 }, { "epoch": 9.433841071756286, "grad_norm": 0.0062422617338597775, "learning_rate": 6.349929219381217e-06, "loss": 0.0002, "num_input_tokens_seen": 110865552, "step": 51405 }, { "epoch": 9.434758671315837, "grad_norm": 0.002367046196013689, "learning_rate": 6.349158182062671e-06, "loss": 0.0001, "num_input_tokens_seen": 110877456, "step": 51410 }, { "epoch": 9.43567627087539, "grad_norm": 0.012513415887951851, "learning_rate": 6.348387110140312e-06, "loss": 0.0002, "num_input_tokens_seen": 110889008, "step": 51415 }, { "epoch": 9.436593870434942, "grad_norm": 0.051605939865112305, "learning_rate": 6.347616003633911e-06, "loss": 0.0003, "num_input_tokens_seen": 110900240, "step": 51420 }, { "epoch": 9.437511469994494, "grad_norm": 0.0018907822668552399, "learning_rate": 6.346844862563249e-06, "loss": 0.0003, "num_input_tokens_seen": 110911504, "step": 51425 }, { "epoch": 9.438429069554047, "grad_norm": 0.039816275238990784, "learning_rate": 6.346073686948103e-06, "loss": 0.1627, "num_input_tokens_seen": 110922736, "step": 51430 }, { "epoch": 9.439346669113599, "grad_norm": 0.0086684450507164, "learning_rate": 6.345302476808254e-06, "loss": 0.1223, "num_input_tokens_seen": 110934288, "step": 51435 }, { "epoch": 9.44026426867315, "grad_norm": 0.034035950899124146, "learning_rate": 6.344531232163482e-06, "loss": 0.1409, "num_input_tokens_seen": 110945200, "step": 51440 }, { "epoch": 9.441181868232704, "grad_norm": 0.030402138829231262, "learning_rate": 6.343759953033566e-06, "loss": 0.0002, "num_input_tokens_seen": 110956784, "step": 51445 }, { "epoch": 9.442099467792255, "grad_norm": 0.01653037779033184, "learning_rate": 6.342988639438292e-06, "loss": 0.0247, "num_input_tokens_seen": 110967440, "step": 51450 }, { "epoch": 9.443017067351807, "grad_norm": 0.02175593189895153, "learning_rate": 6.342217291397439e-06, "loss": 0.0006, "num_input_tokens_seen": 110978640, "step": 51455 }, { "epoch": 9.44393466691136, "grad_norm": 0.014465969055891037, "learning_rate": 6.341445908930794e-06, "loss": 0.0002, "num_input_tokens_seen": 110987888, "step": 51460 }, { "epoch": 9.444852266470912, "grad_norm": 0.006810668855905533, "learning_rate": 6.34067449205814e-06, "loss": 0.0002, "num_input_tokens_seen": 110998224, "step": 51465 }, { "epoch": 9.445769866030464, "grad_norm": 0.0035384895745664835, "learning_rate": 6.339903040799262e-06, "loss": 0.0002, "num_input_tokens_seen": 111008688, "step": 51470 }, { "epoch": 9.446687465590017, "grad_norm": 0.14183378219604492, "learning_rate": 6.3391315551739495e-06, "loss": 0.1384, "num_input_tokens_seen": 111018352, "step": 51475 }, { "epoch": 9.447605065149569, "grad_norm": 0.011495468206703663, "learning_rate": 6.338360035201987e-06, "loss": 0.0001, "num_input_tokens_seen": 111029456, "step": 51480 }, { "epoch": 9.44852266470912, "grad_norm": 0.024728519842028618, "learning_rate": 6.337588480903164e-06, "loss": 0.0001, "num_input_tokens_seen": 111040560, "step": 51485 }, { "epoch": 9.449440264268674, "grad_norm": 0.012188798747956753, "learning_rate": 6.3368168922972695e-06, "loss": 0.0707, "num_input_tokens_seen": 111050544, "step": 51490 }, { "epoch": 9.450357863828225, "grad_norm": 0.0507010743021965, "learning_rate": 6.336045269404094e-06, "loss": 0.012, "num_input_tokens_seen": 111061648, "step": 51495 }, { "epoch": 9.451275463387777, "grad_norm": 0.018402712419629097, "learning_rate": 6.335273612243428e-06, "loss": 0.0018, "num_input_tokens_seen": 111072560, "step": 51500 }, { "epoch": 9.45219306294733, "grad_norm": 6.024792194366455, "learning_rate": 6.334501920835063e-06, "loss": 0.0019, "num_input_tokens_seen": 111083248, "step": 51505 }, { "epoch": 9.453110662506882, "grad_norm": 13.196600914001465, "learning_rate": 6.333730195198793e-06, "loss": 0.0033, "num_input_tokens_seen": 111093456, "step": 51510 }, { "epoch": 9.454028262066434, "grad_norm": 0.009454413317143917, "learning_rate": 6.332958435354409e-06, "loss": 0.0311, "num_input_tokens_seen": 111104560, "step": 51515 }, { "epoch": 9.454945861625987, "grad_norm": 0.02580084279179573, "learning_rate": 6.3321866413217085e-06, "loss": 0.001, "num_input_tokens_seen": 111114608, "step": 51520 }, { "epoch": 9.455863461185539, "grad_norm": 1.3340338468551636, "learning_rate": 6.331414813120485e-06, "loss": 0.0837, "num_input_tokens_seen": 111125200, "step": 51525 }, { "epoch": 9.45678106074509, "grad_norm": 0.020178893581032753, "learning_rate": 6.330642950770533e-06, "loss": 0.0002, "num_input_tokens_seen": 111134384, "step": 51530 }, { "epoch": 9.457698660304644, "grad_norm": 0.6582136750221252, "learning_rate": 6.329871054291654e-06, "loss": 0.1166, "num_input_tokens_seen": 111145200, "step": 51535 }, { "epoch": 9.458616259864195, "grad_norm": 0.03219667449593544, "learning_rate": 6.329099123703643e-06, "loss": 0.0001, "num_input_tokens_seen": 111155344, "step": 51540 }, { "epoch": 9.459533859423747, "grad_norm": 251.94129943847656, "learning_rate": 6.328327159026299e-06, "loss": 0.0406, "num_input_tokens_seen": 111165648, "step": 51545 }, { "epoch": 9.4604514589833, "grad_norm": 0.026288852095603943, "learning_rate": 6.327555160279423e-06, "loss": 0.0008, "num_input_tokens_seen": 111176912, "step": 51550 }, { "epoch": 9.461369058542852, "grad_norm": 0.05052807554602623, "learning_rate": 6.326783127482814e-06, "loss": 0.0042, "num_input_tokens_seen": 111186960, "step": 51555 }, { "epoch": 9.462286658102403, "grad_norm": 3.331793785095215, "learning_rate": 6.326011060656274e-06, "loss": 0.001, "num_input_tokens_seen": 111198512, "step": 51560 }, { "epoch": 9.463204257661957, "grad_norm": 0.00804943311959505, "learning_rate": 6.325238959819605e-06, "loss": 0.0001, "num_input_tokens_seen": 111208784, "step": 51565 }, { "epoch": 9.464121857221508, "grad_norm": 17.835411071777344, "learning_rate": 6.324466824992611e-06, "loss": 0.1152, "num_input_tokens_seen": 111220752, "step": 51570 }, { "epoch": 9.46503945678106, "grad_norm": 8.298501968383789, "learning_rate": 6.3236946561950965e-06, "loss": 0.0047, "num_input_tokens_seen": 111232592, "step": 51575 }, { "epoch": 9.465957056340613, "grad_norm": 0.2513769268989563, "learning_rate": 6.322922453446865e-06, "loss": 0.0622, "num_input_tokens_seen": 111242448, "step": 51580 }, { "epoch": 9.466874655900165, "grad_norm": 0.6084979772567749, "learning_rate": 6.322150216767723e-06, "loss": 0.0017, "num_input_tokens_seen": 111253744, "step": 51585 }, { "epoch": 9.467792255459717, "grad_norm": 0.42802512645721436, "learning_rate": 6.321377946177476e-06, "loss": 0.0011, "num_input_tokens_seen": 111264624, "step": 51590 }, { "epoch": 9.46870985501927, "grad_norm": 0.009009224362671375, "learning_rate": 6.320605641695934e-06, "loss": 0.0002, "num_input_tokens_seen": 111275472, "step": 51595 }, { "epoch": 9.469627454578822, "grad_norm": 0.01440854836255312, "learning_rate": 6.319833303342904e-06, "loss": 0.123, "num_input_tokens_seen": 111286960, "step": 51600 }, { "epoch": 9.470545054138373, "grad_norm": 0.0037566055543720722, "learning_rate": 6.319060931138194e-06, "loss": 0.0004, "num_input_tokens_seen": 111298320, "step": 51605 }, { "epoch": 9.471462653697927, "grad_norm": 0.033087000250816345, "learning_rate": 6.318288525101617e-06, "loss": 0.0004, "num_input_tokens_seen": 111310864, "step": 51610 }, { "epoch": 9.472380253257478, "grad_norm": 0.017218584194779396, "learning_rate": 6.317516085252982e-06, "loss": 0.0383, "num_input_tokens_seen": 111321808, "step": 51615 }, { "epoch": 9.47329785281703, "grad_norm": 0.002065817592665553, "learning_rate": 6.3167436116121015e-06, "loss": 0.0004, "num_input_tokens_seen": 111332688, "step": 51620 }, { "epoch": 9.474215452376583, "grad_norm": 0.0014071018667891622, "learning_rate": 6.315971104198788e-06, "loss": 0.0401, "num_input_tokens_seen": 111343600, "step": 51625 }, { "epoch": 9.475133051936135, "grad_norm": 0.03402823209762573, "learning_rate": 6.315198563032855e-06, "loss": 0.0008, "num_input_tokens_seen": 111354128, "step": 51630 }, { "epoch": 9.476050651495687, "grad_norm": 18.565746307373047, "learning_rate": 6.314425988134118e-06, "loss": 0.0266, "num_input_tokens_seen": 111366256, "step": 51635 }, { "epoch": 9.47696825105524, "grad_norm": 11.976725578308105, "learning_rate": 6.313653379522391e-06, "loss": 0.1317, "num_input_tokens_seen": 111377296, "step": 51640 }, { "epoch": 9.477885850614792, "grad_norm": 0.024526039138436317, "learning_rate": 6.31288073721749e-06, "loss": 0.0005, "num_input_tokens_seen": 111388560, "step": 51645 }, { "epoch": 9.478803450174343, "grad_norm": 0.03941627964377403, "learning_rate": 6.312108061239234e-06, "loss": 0.0001, "num_input_tokens_seen": 111399280, "step": 51650 }, { "epoch": 9.479721049733897, "grad_norm": 0.0015286525012925267, "learning_rate": 6.3113353516074396e-06, "loss": 0.0001, "num_input_tokens_seen": 111410224, "step": 51655 }, { "epoch": 9.480638649293448, "grad_norm": 0.019948842003941536, "learning_rate": 6.310562608341926e-06, "loss": 0.218, "num_input_tokens_seen": 111421488, "step": 51660 }, { "epoch": 9.481556248853, "grad_norm": 0.3754604160785675, "learning_rate": 6.3097898314625115e-06, "loss": 0.0007, "num_input_tokens_seen": 111434608, "step": 51665 }, { "epoch": 9.482473848412553, "grad_norm": 0.27669909596443176, "learning_rate": 6.309017020989019e-06, "loss": 0.0004, "num_input_tokens_seen": 111446000, "step": 51670 }, { "epoch": 9.483391447972105, "grad_norm": 0.09770500659942627, "learning_rate": 6.308244176941268e-06, "loss": 0.0082, "num_input_tokens_seen": 111458352, "step": 51675 }, { "epoch": 9.484309047531656, "grad_norm": 0.01343530137091875, "learning_rate": 6.307471299339082e-06, "loss": 0.0002, "num_input_tokens_seen": 111469712, "step": 51680 }, { "epoch": 9.48522664709121, "grad_norm": 0.006313167978078127, "learning_rate": 6.306698388202284e-06, "loss": 0.1815, "num_input_tokens_seen": 111480752, "step": 51685 }, { "epoch": 9.486144246650762, "grad_norm": 3.3751485347747803, "learning_rate": 6.305925443550695e-06, "loss": 0.0022, "num_input_tokens_seen": 111491024, "step": 51690 }, { "epoch": 9.487061846210313, "grad_norm": 0.14500156044960022, "learning_rate": 6.3051524654041466e-06, "loss": 0.0004, "num_input_tokens_seen": 111503152, "step": 51695 }, { "epoch": 9.487979445769867, "grad_norm": 101.26337432861328, "learning_rate": 6.304379453782457e-06, "loss": 0.0058, "num_input_tokens_seen": 111513232, "step": 51700 }, { "epoch": 9.488897045329418, "grad_norm": 0.004264975897967815, "learning_rate": 6.303606408705459e-06, "loss": 0.001, "num_input_tokens_seen": 111524016, "step": 51705 }, { "epoch": 9.48981464488897, "grad_norm": 0.13154704868793488, "learning_rate": 6.302833330192973e-06, "loss": 0.0004, "num_input_tokens_seen": 111535216, "step": 51710 }, { "epoch": 9.490732244448523, "grad_norm": 0.019519686698913574, "learning_rate": 6.302060218264834e-06, "loss": 0.0003, "num_input_tokens_seen": 111545456, "step": 51715 }, { "epoch": 9.491649844008075, "grad_norm": 0.02981189265847206, "learning_rate": 6.301287072940867e-06, "loss": 0.0002, "num_input_tokens_seen": 111556400, "step": 51720 }, { "epoch": 9.492567443567626, "grad_norm": 0.004320394713431597, "learning_rate": 6.300513894240905e-06, "loss": 0.0003, "num_input_tokens_seen": 111566160, "step": 51725 }, { "epoch": 9.49348504312718, "grad_norm": 0.034942626953125, "learning_rate": 6.299740682184776e-06, "loss": 0.0004, "num_input_tokens_seen": 111576912, "step": 51730 }, { "epoch": 9.494402642686731, "grad_norm": 0.008190993219614029, "learning_rate": 6.298967436792314e-06, "loss": 0.0009, "num_input_tokens_seen": 111587856, "step": 51735 }, { "epoch": 9.495320242246283, "grad_norm": 0.015843447297811508, "learning_rate": 6.298194158083349e-06, "loss": 0.0002, "num_input_tokens_seen": 111599440, "step": 51740 }, { "epoch": 9.496237841805836, "grad_norm": 0.04589582979679108, "learning_rate": 6.2974208460777175e-06, "loss": 0.0308, "num_input_tokens_seen": 111610256, "step": 51745 }, { "epoch": 9.497155441365388, "grad_norm": 0.10228979587554932, "learning_rate": 6.2966475007952495e-06, "loss": 0.0021, "num_input_tokens_seen": 111620432, "step": 51750 }, { "epoch": 9.49807304092494, "grad_norm": 0.007922562770545483, "learning_rate": 6.295874122255785e-06, "loss": 0.0002, "num_input_tokens_seen": 111630576, "step": 51755 }, { "epoch": 9.498990640484493, "grad_norm": 3.285802125930786, "learning_rate": 6.2951007104791576e-06, "loss": 0.2909, "num_input_tokens_seen": 111641424, "step": 51760 }, { "epoch": 9.499908240044045, "grad_norm": 0.004366826266050339, "learning_rate": 6.294327265485203e-06, "loss": 0.0002, "num_input_tokens_seen": 111653776, "step": 51765 }, { "epoch": 9.500825839603596, "grad_norm": 0.014219982549548149, "learning_rate": 6.2935537872937616e-06, "loss": 0.0704, "num_input_tokens_seen": 111662960, "step": 51770 }, { "epoch": 9.50174343916315, "grad_norm": 0.0026096724905073643, "learning_rate": 6.292780275924669e-06, "loss": 0.0001, "num_input_tokens_seen": 111674192, "step": 51775 }, { "epoch": 9.502661038722701, "grad_norm": 0.7969178557395935, "learning_rate": 6.2920067313977675e-06, "loss": 0.0004, "num_input_tokens_seen": 111685680, "step": 51780 }, { "epoch": 9.503578638282253, "grad_norm": 5.904099941253662, "learning_rate": 6.291233153732894e-06, "loss": 0.0318, "num_input_tokens_seen": 111695760, "step": 51785 }, { "epoch": 9.504496237841806, "grad_norm": 0.0043418193235993385, "learning_rate": 6.290459542949892e-06, "loss": 0.2512, "num_input_tokens_seen": 111708048, "step": 51790 }, { "epoch": 9.505413837401358, "grad_norm": 27.105661392211914, "learning_rate": 6.289685899068603e-06, "loss": 0.1006, "num_input_tokens_seen": 111719600, "step": 51795 }, { "epoch": 9.50633143696091, "grad_norm": 0.01590568572282791, "learning_rate": 6.28891222210887e-06, "loss": 0.0002, "num_input_tokens_seen": 111730192, "step": 51800 }, { "epoch": 9.507249036520463, "grad_norm": 0.00937732495367527, "learning_rate": 6.288138512090536e-06, "loss": 0.0006, "num_input_tokens_seen": 111741200, "step": 51805 }, { "epoch": 9.508166636080015, "grad_norm": 0.001116935396566987, "learning_rate": 6.287364769033444e-06, "loss": 0.0003, "num_input_tokens_seen": 111752048, "step": 51810 }, { "epoch": 9.509084235639566, "grad_norm": 0.009902124293148518, "learning_rate": 6.2865909929574424e-06, "loss": 0.0002, "num_input_tokens_seen": 111762992, "step": 51815 }, { "epoch": 9.51000183519912, "grad_norm": 0.01024691853672266, "learning_rate": 6.285817183882376e-06, "loss": 0.0002, "num_input_tokens_seen": 111773936, "step": 51820 }, { "epoch": 9.510919434758671, "grad_norm": 0.007637928240001202, "learning_rate": 6.285043341828091e-06, "loss": 0.0005, "num_input_tokens_seen": 111784336, "step": 51825 }, { "epoch": 9.511837034318223, "grad_norm": 92.0594482421875, "learning_rate": 6.284269466814437e-06, "loss": 0.0227, "num_input_tokens_seen": 111794832, "step": 51830 }, { "epoch": 9.512754633877776, "grad_norm": 0.37564706802368164, "learning_rate": 6.28349555886126e-06, "loss": 0.0003, "num_input_tokens_seen": 111806064, "step": 51835 }, { "epoch": 9.513672233437328, "grad_norm": 0.01584070920944214, "learning_rate": 6.282721617988411e-06, "loss": 0.0001, "num_input_tokens_seen": 111816976, "step": 51840 }, { "epoch": 9.51458983299688, "grad_norm": 0.15993288159370422, "learning_rate": 6.281947644215742e-06, "loss": 0.0003, "num_input_tokens_seen": 111828528, "step": 51845 }, { "epoch": 9.515507432556433, "grad_norm": 0.0015417926479130983, "learning_rate": 6.281173637563102e-06, "loss": 0.0, "num_input_tokens_seen": 111839280, "step": 51850 }, { "epoch": 9.516425032115984, "grad_norm": 0.002449002116918564, "learning_rate": 6.280399598050344e-06, "loss": 0.0002, "num_input_tokens_seen": 111850352, "step": 51855 }, { "epoch": 9.517342631675536, "grad_norm": 0.04846488684415817, "learning_rate": 6.279625525697322e-06, "loss": 0.0002, "num_input_tokens_seen": 111861584, "step": 51860 }, { "epoch": 9.51826023123509, "grad_norm": 0.01115739718079567, "learning_rate": 6.278851420523886e-06, "loss": 0.0001, "num_input_tokens_seen": 111872304, "step": 51865 }, { "epoch": 9.519177830794641, "grad_norm": 0.003919880371540785, "learning_rate": 6.278077282549895e-06, "loss": 0.0001, "num_input_tokens_seen": 111883280, "step": 51870 }, { "epoch": 9.520095430354193, "grad_norm": 0.08516747504472733, "learning_rate": 6.277303111795201e-06, "loss": 0.0001, "num_input_tokens_seen": 111892944, "step": 51875 }, { "epoch": 9.521013029913746, "grad_norm": 0.0009634897578507662, "learning_rate": 6.276528908279663e-06, "loss": 0.0, "num_input_tokens_seen": 111904304, "step": 51880 }, { "epoch": 9.521930629473298, "grad_norm": 0.012578270398080349, "learning_rate": 6.275754672023137e-06, "loss": 0.0004, "num_input_tokens_seen": 111916432, "step": 51885 }, { "epoch": 9.52284822903285, "grad_norm": 0.04348938167095184, "learning_rate": 6.27498040304548e-06, "loss": 0.0001, "num_input_tokens_seen": 111926768, "step": 51890 }, { "epoch": 9.523765828592403, "grad_norm": 45.08028793334961, "learning_rate": 6.274206101366553e-06, "loss": 0.0333, "num_input_tokens_seen": 111937552, "step": 51895 }, { "epoch": 9.524683428151954, "grad_norm": 0.0015530555974692106, "learning_rate": 6.273431767006213e-06, "loss": 0.0001, "num_input_tokens_seen": 111949136, "step": 51900 }, { "epoch": 9.525601027711506, "grad_norm": 0.0034106376115232706, "learning_rate": 6.272657399984323e-06, "loss": 0.0001, "num_input_tokens_seen": 111959504, "step": 51905 }, { "epoch": 9.52651862727106, "grad_norm": 0.0018825132865458727, "learning_rate": 6.271883000320742e-06, "loss": 0.0003, "num_input_tokens_seen": 111968272, "step": 51910 }, { "epoch": 9.527436226830611, "grad_norm": 0.003086647018790245, "learning_rate": 6.2711085680353345e-06, "loss": 0.0001, "num_input_tokens_seen": 111978896, "step": 51915 }, { "epoch": 9.528353826390163, "grad_norm": 0.006006609182804823, "learning_rate": 6.270334103147961e-06, "loss": 0.0006, "num_input_tokens_seen": 111989392, "step": 51920 }, { "epoch": 9.529271425949716, "grad_norm": 25.38992691040039, "learning_rate": 6.269559605678488e-06, "loss": 0.1785, "num_input_tokens_seen": 112000336, "step": 51925 }, { "epoch": 9.530189025509268, "grad_norm": 0.023070696741342545, "learning_rate": 6.268785075646777e-06, "loss": 0.0001, "num_input_tokens_seen": 112011984, "step": 51930 }, { "epoch": 9.53110662506882, "grad_norm": 0.008121965453028679, "learning_rate": 6.268010513072698e-06, "loss": 0.0001, "num_input_tokens_seen": 112023248, "step": 51935 }, { "epoch": 9.532024224628373, "grad_norm": 0.029565058648586273, "learning_rate": 6.267235917976112e-06, "loss": 0.0002, "num_input_tokens_seen": 112033840, "step": 51940 }, { "epoch": 9.532941824187924, "grad_norm": 0.007150184828788042, "learning_rate": 6.26646129037689e-06, "loss": 0.0001, "num_input_tokens_seen": 112043920, "step": 51945 }, { "epoch": 9.533859423747476, "grad_norm": 0.00302849430590868, "learning_rate": 6.265686630294897e-06, "loss": 0.0, "num_input_tokens_seen": 112055024, "step": 51950 }, { "epoch": 9.53477702330703, "grad_norm": 0.0016153539763763547, "learning_rate": 6.264911937750006e-06, "loss": 0.0, "num_input_tokens_seen": 112064880, "step": 51955 }, { "epoch": 9.53569462286658, "grad_norm": 0.08745278418064117, "learning_rate": 6.264137212762083e-06, "loss": 0.0001, "num_input_tokens_seen": 112075344, "step": 51960 }, { "epoch": 9.536612222426132, "grad_norm": 0.012580936774611473, "learning_rate": 6.263362455350999e-06, "loss": 0.0001, "num_input_tokens_seen": 112087312, "step": 51965 }, { "epoch": 9.537529821985686, "grad_norm": 0.02937203459441662, "learning_rate": 6.262587665536627e-06, "loss": 0.0001, "num_input_tokens_seen": 112097520, "step": 51970 }, { "epoch": 9.538447421545238, "grad_norm": 0.018240148201584816, "learning_rate": 6.261812843338837e-06, "loss": 0.0, "num_input_tokens_seen": 112108272, "step": 51975 }, { "epoch": 9.53936502110479, "grad_norm": 0.045663803815841675, "learning_rate": 6.261037988777505e-06, "loss": 0.0001, "num_input_tokens_seen": 112119248, "step": 51980 }, { "epoch": 9.540282620664343, "grad_norm": 0.007763139437884092, "learning_rate": 6.260263101872502e-06, "loss": 0.0, "num_input_tokens_seen": 112128528, "step": 51985 }, { "epoch": 9.541200220223894, "grad_norm": 0.06252316385507584, "learning_rate": 6.259488182643705e-06, "loss": 0.0003, "num_input_tokens_seen": 112139472, "step": 51990 }, { "epoch": 9.542117819783446, "grad_norm": 1.7461414337158203, "learning_rate": 6.258713231110987e-06, "loss": 0.0015, "num_input_tokens_seen": 112150096, "step": 51995 }, { "epoch": 9.543035419343, "grad_norm": 0.0069819483906030655, "learning_rate": 6.257938247294224e-06, "loss": 0.0036, "num_input_tokens_seen": 112160656, "step": 52000 }, { "epoch": 9.54395301890255, "grad_norm": 0.04617265239357948, "learning_rate": 6.2571632312132966e-06, "loss": 0.0001, "num_input_tokens_seen": 112170288, "step": 52005 }, { "epoch": 9.544870618462102, "grad_norm": 0.05350753664970398, "learning_rate": 6.25638818288808e-06, "loss": 0.0001, "num_input_tokens_seen": 112182096, "step": 52010 }, { "epoch": 9.545788218021656, "grad_norm": 0.018200170248746872, "learning_rate": 6.255613102338455e-06, "loss": 0.0617, "num_input_tokens_seen": 112192496, "step": 52015 }, { "epoch": 9.546705817581207, "grad_norm": 0.004899454768747091, "learning_rate": 6.254837989584299e-06, "loss": 0.0001, "num_input_tokens_seen": 112203728, "step": 52020 }, { "epoch": 9.547623417140759, "grad_norm": 0.22473283112049103, "learning_rate": 6.254062844645493e-06, "loss": 0.0003, "num_input_tokens_seen": 112213904, "step": 52025 }, { "epoch": 9.548541016700312, "grad_norm": 0.0003267037682235241, "learning_rate": 6.253287667541918e-06, "loss": 0.0, "num_input_tokens_seen": 112224720, "step": 52030 }, { "epoch": 9.549458616259864, "grad_norm": 0.01825307309627533, "learning_rate": 6.25251245829346e-06, "loss": 0.0002, "num_input_tokens_seen": 112235632, "step": 52035 }, { "epoch": 9.550376215819416, "grad_norm": 0.0013145997654646635, "learning_rate": 6.251737216919996e-06, "loss": 0.0001, "num_input_tokens_seen": 112247152, "step": 52040 }, { "epoch": 9.551293815378969, "grad_norm": 87.5225601196289, "learning_rate": 6.250961943441412e-06, "loss": 0.2141, "num_input_tokens_seen": 112258640, "step": 52045 }, { "epoch": 9.55221141493852, "grad_norm": 0.0016658956883475184, "learning_rate": 6.250186637877594e-06, "loss": 0.0001, "num_input_tokens_seen": 112269136, "step": 52050 }, { "epoch": 9.553129014498072, "grad_norm": 0.012416713871061802, "learning_rate": 6.249411300248427e-06, "loss": 0.0, "num_input_tokens_seen": 112280496, "step": 52055 }, { "epoch": 9.554046614057626, "grad_norm": 0.0006296444917097688, "learning_rate": 6.248635930573796e-06, "loss": 0.0, "num_input_tokens_seen": 112290640, "step": 52060 }, { "epoch": 9.554964213617177, "grad_norm": 0.0010882714996114373, "learning_rate": 6.247860528873588e-06, "loss": 0.0001, "num_input_tokens_seen": 112301360, "step": 52065 }, { "epoch": 9.555881813176729, "grad_norm": 0.0008298883331008255, "learning_rate": 6.247085095167691e-06, "loss": 0.0001, "num_input_tokens_seen": 112311984, "step": 52070 }, { "epoch": 9.556799412736282, "grad_norm": 0.0007985607953742146, "learning_rate": 6.246309629475995e-06, "loss": 0.0, "num_input_tokens_seen": 112323024, "step": 52075 }, { "epoch": 9.557717012295834, "grad_norm": 0.0030672564171254635, "learning_rate": 6.245534131818388e-06, "loss": 0.0006, "num_input_tokens_seen": 112333520, "step": 52080 }, { "epoch": 9.558634611855386, "grad_norm": 354.7872009277344, "learning_rate": 6.244758602214761e-06, "loss": 0.0883, "num_input_tokens_seen": 112343984, "step": 52085 }, { "epoch": 9.559552211414939, "grad_norm": 0.0018970040837302804, "learning_rate": 6.243983040685007e-06, "loss": 0.1532, "num_input_tokens_seen": 112353328, "step": 52090 }, { "epoch": 9.56046981097449, "grad_norm": 0.0017372174188494682, "learning_rate": 6.2432074472490135e-06, "loss": 0.0004, "num_input_tokens_seen": 112364336, "step": 52095 }, { "epoch": 9.561387410534042, "grad_norm": 16.8452205657959, "learning_rate": 6.242431821926678e-06, "loss": 0.1314, "num_input_tokens_seen": 112374768, "step": 52100 }, { "epoch": 9.562305010093596, "grad_norm": 0.010530396364629269, "learning_rate": 6.241656164737889e-06, "loss": 0.0, "num_input_tokens_seen": 112384784, "step": 52105 }, { "epoch": 9.563222609653147, "grad_norm": 46.17218780517578, "learning_rate": 6.2408804757025455e-06, "loss": 0.2473, "num_input_tokens_seen": 112396464, "step": 52110 }, { "epoch": 9.564140209212699, "grad_norm": 0.026800235733389854, "learning_rate": 6.240104754840541e-06, "loss": 0.0021, "num_input_tokens_seen": 112407024, "step": 52115 }, { "epoch": 9.565057808772252, "grad_norm": 78.38690185546875, "learning_rate": 6.239329002171771e-06, "loss": 0.0618, "num_input_tokens_seen": 112418352, "step": 52120 }, { "epoch": 9.565975408331804, "grad_norm": 0.03991814702749252, "learning_rate": 6.238553217716135e-06, "loss": 0.0002, "num_input_tokens_seen": 112430064, "step": 52125 }, { "epoch": 9.566893007891355, "grad_norm": 0.06594852358102798, "learning_rate": 6.237777401493526e-06, "loss": 0.033, "num_input_tokens_seen": 112440912, "step": 52130 }, { "epoch": 9.567810607450909, "grad_norm": 31.48949432373047, "learning_rate": 6.237001553523846e-06, "loss": 0.0485, "num_input_tokens_seen": 112451504, "step": 52135 }, { "epoch": 9.56872820701046, "grad_norm": 0.02861565165221691, "learning_rate": 6.236225673826992e-06, "loss": 0.1418, "num_input_tokens_seen": 112462832, "step": 52140 }, { "epoch": 9.569645806570012, "grad_norm": 0.003448499832302332, "learning_rate": 6.235449762422867e-06, "loss": 0.1596, "num_input_tokens_seen": 112472208, "step": 52145 }, { "epoch": 9.570563406129565, "grad_norm": 0.1031130701303482, "learning_rate": 6.23467381933137e-06, "loss": 0.169, "num_input_tokens_seen": 112482640, "step": 52150 }, { "epoch": 9.571481005689117, "grad_norm": 0.003008253173902631, "learning_rate": 6.2338978445724045e-06, "loss": 0.0041, "num_input_tokens_seen": 112494352, "step": 52155 }, { "epoch": 9.572398605248669, "grad_norm": 0.013554389588534832, "learning_rate": 6.233121838165869e-06, "loss": 0.0006, "num_input_tokens_seen": 112504336, "step": 52160 }, { "epoch": 9.573316204808222, "grad_norm": 0.01814490742981434, "learning_rate": 6.232345800131672e-06, "loss": 0.001, "num_input_tokens_seen": 112515632, "step": 52165 }, { "epoch": 9.574233804367774, "grad_norm": 11.979055404663086, "learning_rate": 6.231569730489713e-06, "loss": 0.0037, "num_input_tokens_seen": 112525648, "step": 52170 }, { "epoch": 9.575151403927325, "grad_norm": 0.17140159010887146, "learning_rate": 6.2307936292599e-06, "loss": 0.0002, "num_input_tokens_seen": 112536208, "step": 52175 }, { "epoch": 9.576069003486879, "grad_norm": 1.8924494981765747, "learning_rate": 6.230017496462138e-06, "loss": 0.0677, "num_input_tokens_seen": 112547216, "step": 52180 }, { "epoch": 9.57698660304643, "grad_norm": 0.04222550988197327, "learning_rate": 6.229241332116334e-06, "loss": 0.0015, "num_input_tokens_seen": 112557488, "step": 52185 }, { "epoch": 9.577904202605982, "grad_norm": 0.02131904475390911, "learning_rate": 6.228465136242394e-06, "loss": 0.0001, "num_input_tokens_seen": 112568016, "step": 52190 }, { "epoch": 9.578821802165535, "grad_norm": 0.005509661510586739, "learning_rate": 6.227688908860228e-06, "loss": 0.0, "num_input_tokens_seen": 112578896, "step": 52195 }, { "epoch": 9.579739401725087, "grad_norm": 0.017059266567230225, "learning_rate": 6.2269126499897445e-06, "loss": 0.0001, "num_input_tokens_seen": 112589104, "step": 52200 }, { "epoch": 9.580657001284639, "grad_norm": 0.0027835422661155462, "learning_rate": 6.2261363596508515e-06, "loss": 0.1503, "num_input_tokens_seen": 112600400, "step": 52205 }, { "epoch": 9.581574600844192, "grad_norm": 0.008406377397477627, "learning_rate": 6.225360037863462e-06, "loss": 0.1752, "num_input_tokens_seen": 112611184, "step": 52210 }, { "epoch": 9.582492200403744, "grad_norm": 0.0029048293363302946, "learning_rate": 6.224583684647488e-06, "loss": 0.0001, "num_input_tokens_seen": 112622480, "step": 52215 }, { "epoch": 9.583409799963295, "grad_norm": 0.001702567096799612, "learning_rate": 6.22380730002284e-06, "loss": 0.1008, "num_input_tokens_seen": 112633392, "step": 52220 }, { "epoch": 9.584327399522849, "grad_norm": 0.03042839653789997, "learning_rate": 6.223030884009431e-06, "loss": 0.0542, "num_input_tokens_seen": 112644304, "step": 52225 }, { "epoch": 9.5852449990824, "grad_norm": 0.0011804146924987435, "learning_rate": 6.2222544366271745e-06, "loss": 0.0065, "num_input_tokens_seen": 112654128, "step": 52230 }, { "epoch": 9.586162598641952, "grad_norm": 10.520502090454102, "learning_rate": 6.221477957895987e-06, "loss": 0.0167, "num_input_tokens_seen": 112665392, "step": 52235 }, { "epoch": 9.587080198201505, "grad_norm": 0.00340643129311502, "learning_rate": 6.220701447835782e-06, "loss": 0.0946, "num_input_tokens_seen": 112675696, "step": 52240 }, { "epoch": 9.587997797761057, "grad_norm": 0.035868700593709946, "learning_rate": 6.219924906466479e-06, "loss": 0.0002, "num_input_tokens_seen": 112685104, "step": 52245 }, { "epoch": 9.588915397320608, "grad_norm": 0.013976054266095161, "learning_rate": 6.219148333807991e-06, "loss": 0.0006, "num_input_tokens_seen": 112696912, "step": 52250 }, { "epoch": 9.589832996880162, "grad_norm": 0.005927103105932474, "learning_rate": 6.218371729880238e-06, "loss": 0.0009, "num_input_tokens_seen": 112708336, "step": 52255 }, { "epoch": 9.590750596439714, "grad_norm": 0.010048874653875828, "learning_rate": 6.217595094703138e-06, "loss": 0.0002, "num_input_tokens_seen": 112719664, "step": 52260 }, { "epoch": 9.591668195999265, "grad_norm": 0.4416409134864807, "learning_rate": 6.216818428296613e-06, "loss": 0.0456, "num_input_tokens_seen": 112730864, "step": 52265 }, { "epoch": 9.592585795558819, "grad_norm": 0.5300211310386658, "learning_rate": 6.216041730680579e-06, "loss": 0.0008, "num_input_tokens_seen": 112740752, "step": 52270 }, { "epoch": 9.59350339511837, "grad_norm": 71.61732482910156, "learning_rate": 6.2152650018749605e-06, "loss": 0.2316, "num_input_tokens_seen": 112754096, "step": 52275 }, { "epoch": 9.594420994677922, "grad_norm": 0.005235715303570032, "learning_rate": 6.214488241899677e-06, "loss": 0.001, "num_input_tokens_seen": 112764880, "step": 52280 }, { "epoch": 9.595338594237475, "grad_norm": 0.0016588037833571434, "learning_rate": 6.213711450774654e-06, "loss": 0.0001, "num_input_tokens_seen": 112775952, "step": 52285 }, { "epoch": 9.596256193797027, "grad_norm": 0.004589695483446121, "learning_rate": 6.212934628519812e-06, "loss": 0.0001, "num_input_tokens_seen": 112786992, "step": 52290 }, { "epoch": 9.597173793356578, "grad_norm": 0.012388046830892563, "learning_rate": 6.212157775155077e-06, "loss": 0.2071, "num_input_tokens_seen": 112797936, "step": 52295 }, { "epoch": 9.598091392916132, "grad_norm": 0.013006096705794334, "learning_rate": 6.211380890700374e-06, "loss": 0.0001, "num_input_tokens_seen": 112808560, "step": 52300 }, { "epoch": 9.599008992475683, "grad_norm": 127.27396392822266, "learning_rate": 6.210603975175629e-06, "loss": 0.2317, "num_input_tokens_seen": 112818160, "step": 52305 }, { "epoch": 9.599926592035235, "grad_norm": 0.06599641591310501, "learning_rate": 6.209827028600768e-06, "loss": 0.0002, "num_input_tokens_seen": 112828272, "step": 52310 }, { "epoch": 9.600844191594788, "grad_norm": 0.005356066394597292, "learning_rate": 6.209050050995717e-06, "loss": 0.0001, "num_input_tokens_seen": 112839472, "step": 52315 }, { "epoch": 9.60176179115434, "grad_norm": 0.07373949885368347, "learning_rate": 6.208273042380408e-06, "loss": 0.098, "num_input_tokens_seen": 112850032, "step": 52320 }, { "epoch": 9.602679390713892, "grad_norm": 0.004560729954391718, "learning_rate": 6.207496002774769e-06, "loss": 0.0003, "num_input_tokens_seen": 112860048, "step": 52325 }, { "epoch": 9.603596990273445, "grad_norm": 105.57225799560547, "learning_rate": 6.2067189321987265e-06, "loss": 0.1846, "num_input_tokens_seen": 112870320, "step": 52330 }, { "epoch": 9.604514589832997, "grad_norm": 0.0022793025709688663, "learning_rate": 6.205941830672215e-06, "loss": 0.0018, "num_input_tokens_seen": 112880464, "step": 52335 }, { "epoch": 9.605432189392548, "grad_norm": 0.002698758617043495, "learning_rate": 6.205164698215165e-06, "loss": 0.0001, "num_input_tokens_seen": 112891280, "step": 52340 }, { "epoch": 9.606349788952102, "grad_norm": 0.0015986185753718019, "learning_rate": 6.204387534847507e-06, "loss": 0.2162, "num_input_tokens_seen": 112901232, "step": 52345 }, { "epoch": 9.607267388511653, "grad_norm": 0.0006692828610539436, "learning_rate": 6.203610340589177e-06, "loss": 0.0001, "num_input_tokens_seen": 112912528, "step": 52350 }, { "epoch": 9.608184988071205, "grad_norm": 150.77381896972656, "learning_rate": 6.202833115460106e-06, "loss": 0.3289, "num_input_tokens_seen": 112922736, "step": 52355 }, { "epoch": 9.609102587630758, "grad_norm": 42.873409271240234, "learning_rate": 6.202055859480231e-06, "loss": 0.1096, "num_input_tokens_seen": 112933040, "step": 52360 }, { "epoch": 9.61002018719031, "grad_norm": 0.13176381587982178, "learning_rate": 6.201278572669485e-06, "loss": 0.0003, "num_input_tokens_seen": 112943632, "step": 52365 }, { "epoch": 9.610937786749862, "grad_norm": 0.04568175598978996, "learning_rate": 6.200501255047806e-06, "loss": 0.0002, "num_input_tokens_seen": 112954608, "step": 52370 }, { "epoch": 9.611855386309415, "grad_norm": 0.015798702836036682, "learning_rate": 6.19972390663513e-06, "loss": 0.0002, "num_input_tokens_seen": 112966864, "step": 52375 }, { "epoch": 9.612772985868967, "grad_norm": 0.0025200331583619118, "learning_rate": 6.198946527451395e-06, "loss": 0.0003, "num_input_tokens_seen": 112976272, "step": 52380 }, { "epoch": 9.613690585428518, "grad_norm": 392.977783203125, "learning_rate": 6.198169117516542e-06, "loss": 0.0806, "num_input_tokens_seen": 112986960, "step": 52385 }, { "epoch": 9.614608184988072, "grad_norm": 0.5244652032852173, "learning_rate": 6.197391676850505e-06, "loss": 0.0011, "num_input_tokens_seen": 112997104, "step": 52390 }, { "epoch": 9.615525784547623, "grad_norm": 1.2536612749099731, "learning_rate": 6.1966142054732295e-06, "loss": 0.0003, "num_input_tokens_seen": 113008112, "step": 52395 }, { "epoch": 9.616443384107175, "grad_norm": 0.0015444181626662612, "learning_rate": 6.195836703404652e-06, "loss": 0.0452, "num_input_tokens_seen": 113018288, "step": 52400 }, { "epoch": 9.617360983666728, "grad_norm": 0.002771437168121338, "learning_rate": 6.195059170664718e-06, "loss": 0.0002, "num_input_tokens_seen": 113030352, "step": 52405 }, { "epoch": 9.61827858322628, "grad_norm": 0.004941885359585285, "learning_rate": 6.194281607273368e-06, "loss": 0.1241, "num_input_tokens_seen": 113041040, "step": 52410 }, { "epoch": 9.619196182785831, "grad_norm": 0.13178540766239166, "learning_rate": 6.193504013250546e-06, "loss": 0.0444, "num_input_tokens_seen": 113050480, "step": 52415 }, { "epoch": 9.620113782345385, "grad_norm": 0.20515263080596924, "learning_rate": 6.192726388616196e-06, "loss": 0.0002, "num_input_tokens_seen": 113060944, "step": 52420 }, { "epoch": 9.621031381904936, "grad_norm": 8.157698631286621, "learning_rate": 6.1919487333902616e-06, "loss": 0.1156, "num_input_tokens_seen": 113071600, "step": 52425 }, { "epoch": 9.621948981464488, "grad_norm": 0.002942344406619668, "learning_rate": 6.19117104759269e-06, "loss": 0.0004, "num_input_tokens_seen": 113082800, "step": 52430 }, { "epoch": 9.622866581024041, "grad_norm": 0.001609479309991002, "learning_rate": 6.1903933312434285e-06, "loss": 0.0003, "num_input_tokens_seen": 113093872, "step": 52435 }, { "epoch": 9.623784180583593, "grad_norm": 0.030074678361415863, "learning_rate": 6.1896155843624215e-06, "loss": 0.2376, "num_input_tokens_seen": 113104048, "step": 52440 }, { "epoch": 9.624701780143145, "grad_norm": 0.011071105487644672, "learning_rate": 6.18883780696962e-06, "loss": 0.0539, "num_input_tokens_seen": 113115120, "step": 52445 }, { "epoch": 9.625619379702698, "grad_norm": 0.6740518808364868, "learning_rate": 6.18805999908497e-06, "loss": 0.0289, "num_input_tokens_seen": 113125584, "step": 52450 }, { "epoch": 9.62653697926225, "grad_norm": 0.036527469754219055, "learning_rate": 6.187282160728422e-06, "loss": 0.1006, "num_input_tokens_seen": 113135952, "step": 52455 }, { "epoch": 9.627454578821801, "grad_norm": 0.009251046925783157, "learning_rate": 6.186504291919928e-06, "loss": 0.0002, "num_input_tokens_seen": 113146224, "step": 52460 }, { "epoch": 9.628372178381355, "grad_norm": 0.09393197298049927, "learning_rate": 6.185726392679437e-06, "loss": 0.2034, "num_input_tokens_seen": 113156208, "step": 52465 }, { "epoch": 9.629289777940906, "grad_norm": 20.71170425415039, "learning_rate": 6.184948463026902e-06, "loss": 0.1055, "num_input_tokens_seen": 113166832, "step": 52470 }, { "epoch": 9.630207377500458, "grad_norm": 0.04796693101525307, "learning_rate": 6.184170502982276e-06, "loss": 0.0001, "num_input_tokens_seen": 113177232, "step": 52475 }, { "epoch": 9.631124977060011, "grad_norm": 0.42382746934890747, "learning_rate": 6.1833925125655104e-06, "loss": 0.0981, "num_input_tokens_seen": 113189200, "step": 52480 }, { "epoch": 9.632042576619563, "grad_norm": 0.0014386898837983608, "learning_rate": 6.1826144917965625e-06, "loss": 0.0015, "num_input_tokens_seen": 113200272, "step": 52485 }, { "epoch": 9.632960176179115, "grad_norm": 92.0975570678711, "learning_rate": 6.181836440695384e-06, "loss": 0.0161, "num_input_tokens_seen": 113211600, "step": 52490 }, { "epoch": 9.633877775738668, "grad_norm": 0.019088469445705414, "learning_rate": 6.181058359281935e-06, "loss": 0.0432, "num_input_tokens_seen": 113223056, "step": 52495 }, { "epoch": 9.63479537529822, "grad_norm": 0.006949140690267086, "learning_rate": 6.180280247576168e-06, "loss": 0.0002, "num_input_tokens_seen": 113233968, "step": 52500 }, { "epoch": 9.635712974857771, "grad_norm": 0.0569625198841095, "learning_rate": 6.179502105598041e-06, "loss": 0.098, "num_input_tokens_seen": 113243792, "step": 52505 }, { "epoch": 9.636630574417325, "grad_norm": 0.0034403358586132526, "learning_rate": 6.178723933367515e-06, "loss": 0.0002, "num_input_tokens_seen": 113255920, "step": 52510 }, { "epoch": 9.637548173976876, "grad_norm": 0.004688729532063007, "learning_rate": 6.177945730904545e-06, "loss": 0.0001, "num_input_tokens_seen": 113267664, "step": 52515 }, { "epoch": 9.638465773536428, "grad_norm": 0.0012601965572685003, "learning_rate": 6.177167498229095e-06, "loss": 0.0002, "num_input_tokens_seen": 113279376, "step": 52520 }, { "epoch": 9.639383373095981, "grad_norm": 0.0012902584858238697, "learning_rate": 6.176389235361121e-06, "loss": 0.0005, "num_input_tokens_seen": 113289168, "step": 52525 }, { "epoch": 9.640300972655533, "grad_norm": 10.004644393920898, "learning_rate": 6.175610942320588e-06, "loss": 0.1727, "num_input_tokens_seen": 113299536, "step": 52530 }, { "epoch": 9.641218572215084, "grad_norm": 0.0009258146164938807, "learning_rate": 6.174832619127455e-06, "loss": 0.0042, "num_input_tokens_seen": 113308592, "step": 52535 }, { "epoch": 9.642136171774638, "grad_norm": 0.029903488233685493, "learning_rate": 6.174054265801686e-06, "loss": 0.0004, "num_input_tokens_seen": 113319632, "step": 52540 }, { "epoch": 9.64305377133419, "grad_norm": 0.008449921384453773, "learning_rate": 6.173275882363245e-06, "loss": 0.0033, "num_input_tokens_seen": 113331184, "step": 52545 }, { "epoch": 9.643971370893741, "grad_norm": 0.02179485373198986, "learning_rate": 6.172497468832097e-06, "loss": 0.059, "num_input_tokens_seen": 113342384, "step": 52550 }, { "epoch": 9.644888970453295, "grad_norm": 0.24982640147209167, "learning_rate": 6.171719025228206e-06, "loss": 0.002, "num_input_tokens_seen": 113352784, "step": 52555 }, { "epoch": 9.645806570012846, "grad_norm": 0.01507816556841135, "learning_rate": 6.170940551571537e-06, "loss": 0.0002, "num_input_tokens_seen": 113362832, "step": 52560 }, { "epoch": 9.646724169572398, "grad_norm": 0.10873164981603622, "learning_rate": 6.170162047882059e-06, "loss": 0.0004, "num_input_tokens_seen": 113373488, "step": 52565 }, { "epoch": 9.647641769131951, "grad_norm": 0.027735818177461624, "learning_rate": 6.169383514179737e-06, "loss": 0.0003, "num_input_tokens_seen": 113384976, "step": 52570 }, { "epoch": 9.648559368691503, "grad_norm": 0.11802423745393753, "learning_rate": 6.168604950484541e-06, "loss": 0.0002, "num_input_tokens_seen": 113396368, "step": 52575 }, { "epoch": 9.649476968251054, "grad_norm": 2.1054022312164307, "learning_rate": 6.167826356816437e-06, "loss": 0.0019, "num_input_tokens_seen": 113406672, "step": 52580 }, { "epoch": 9.650394567810608, "grad_norm": 0.0016798705328255892, "learning_rate": 6.1670477331954e-06, "loss": 0.2407, "num_input_tokens_seen": 113418384, "step": 52585 }, { "epoch": 9.65131216737016, "grad_norm": 0.0022943164221942425, "learning_rate": 6.166269079641396e-06, "loss": 0.0453, "num_input_tokens_seen": 113430288, "step": 52590 }, { "epoch": 9.652229766929711, "grad_norm": 0.02867358922958374, "learning_rate": 6.165490396174398e-06, "loss": 0.0006, "num_input_tokens_seen": 113441232, "step": 52595 }, { "epoch": 9.653147366489264, "grad_norm": 0.004866320174187422, "learning_rate": 6.1647116828143776e-06, "loss": 0.0001, "num_input_tokens_seen": 113452048, "step": 52600 }, { "epoch": 9.654064966048816, "grad_norm": 0.06784047186374664, "learning_rate": 6.163932939581307e-06, "loss": 0.0001, "num_input_tokens_seen": 113462224, "step": 52605 }, { "epoch": 9.654982565608368, "grad_norm": 0.002423429861664772, "learning_rate": 6.163154166495161e-06, "loss": 0.0001, "num_input_tokens_seen": 113473456, "step": 52610 }, { "epoch": 9.655900165167921, "grad_norm": 0.000782691640779376, "learning_rate": 6.162375363575912e-06, "loss": 0.0001, "num_input_tokens_seen": 113483152, "step": 52615 }, { "epoch": 9.656817764727473, "grad_norm": 0.0005490249604918063, "learning_rate": 6.1615965308435384e-06, "loss": 0.0, "num_input_tokens_seen": 113494032, "step": 52620 }, { "epoch": 9.657735364287024, "grad_norm": 0.013977345079183578, "learning_rate": 6.160817668318013e-06, "loss": 0.0002, "num_input_tokens_seen": 113504656, "step": 52625 }, { "epoch": 9.658652963846578, "grad_norm": 0.000782210030592978, "learning_rate": 6.160038776019314e-06, "loss": 0.0001, "num_input_tokens_seen": 113515056, "step": 52630 }, { "epoch": 9.65957056340613, "grad_norm": 0.000526773335877806, "learning_rate": 6.159259853967419e-06, "loss": 0.0, "num_input_tokens_seen": 113524944, "step": 52635 }, { "epoch": 9.660488162965681, "grad_norm": 0.0021832003258168697, "learning_rate": 6.1584809021823035e-06, "loss": 0.1533, "num_input_tokens_seen": 113536400, "step": 52640 }, { "epoch": 9.661405762525234, "grad_norm": 0.026561375707387924, "learning_rate": 6.157701920683949e-06, "loss": 0.0001, "num_input_tokens_seen": 113547984, "step": 52645 }, { "epoch": 9.662323362084786, "grad_norm": 0.009915817528963089, "learning_rate": 6.156922909492336e-06, "loss": 0.0, "num_input_tokens_seen": 113558384, "step": 52650 }, { "epoch": 9.663240961644338, "grad_norm": 0.005120561923831701, "learning_rate": 6.156143868627442e-06, "loss": 0.1502, "num_input_tokens_seen": 113570000, "step": 52655 }, { "epoch": 9.664158561203891, "grad_norm": 0.00583481602370739, "learning_rate": 6.15536479810925e-06, "loss": 0.0007, "num_input_tokens_seen": 113580784, "step": 52660 }, { "epoch": 9.665076160763443, "grad_norm": 0.0018017101101577282, "learning_rate": 6.1545856979577414e-06, "loss": 0.2969, "num_input_tokens_seen": 113591664, "step": 52665 }, { "epoch": 9.665993760322994, "grad_norm": 0.004877348896116018, "learning_rate": 6.1538065681928994e-06, "loss": 0.0013, "num_input_tokens_seen": 113601744, "step": 52670 }, { "epoch": 9.666911359882548, "grad_norm": 0.0016453771386295557, "learning_rate": 6.1530274088347075e-06, "loss": 0.0001, "num_input_tokens_seen": 113612176, "step": 52675 }, { "epoch": 9.6678289594421, "grad_norm": 0.0019184616394340992, "learning_rate": 6.152248219903149e-06, "loss": 0.056, "num_input_tokens_seen": 113624112, "step": 52680 }, { "epoch": 9.66874655900165, "grad_norm": 0.005587961059063673, "learning_rate": 6.15146900141821e-06, "loss": 0.0002, "num_input_tokens_seen": 113634704, "step": 52685 }, { "epoch": 9.669664158561204, "grad_norm": 25.1326847076416, "learning_rate": 6.150689753399875e-06, "loss": 0.0824, "num_input_tokens_seen": 113645296, "step": 52690 }, { "epoch": 9.670581758120756, "grad_norm": 0.0018396767554804683, "learning_rate": 6.149910475868133e-06, "loss": 0.1314, "num_input_tokens_seen": 113655760, "step": 52695 }, { "epoch": 9.671499357680307, "grad_norm": 0.0032388917170464993, "learning_rate": 6.149131168842967e-06, "loss": 0.0001, "num_input_tokens_seen": 113666096, "step": 52700 }, { "epoch": 9.67241695723986, "grad_norm": 0.02371879294514656, "learning_rate": 6.148351832344369e-06, "loss": 0.0002, "num_input_tokens_seen": 113676496, "step": 52705 }, { "epoch": 9.673334556799412, "grad_norm": 0.0619831383228302, "learning_rate": 6.147572466392325e-06, "loss": 0.0002, "num_input_tokens_seen": 113686992, "step": 52710 }, { "epoch": 9.674252156358964, "grad_norm": 0.005188420880585909, "learning_rate": 6.146793071006828e-06, "loss": 0.0, "num_input_tokens_seen": 113697200, "step": 52715 }, { "epoch": 9.675169755918517, "grad_norm": 0.01404427271336317, "learning_rate": 6.146013646207864e-06, "loss": 0.1751, "num_input_tokens_seen": 113708240, "step": 52720 }, { "epoch": 9.676087355478069, "grad_norm": 0.005974697880446911, "learning_rate": 6.145234192015427e-06, "loss": 0.019, "num_input_tokens_seen": 113719504, "step": 52725 }, { "epoch": 9.67700495503762, "grad_norm": 0.0022766641341149807, "learning_rate": 6.1444547084495075e-06, "loss": 0.001, "num_input_tokens_seen": 113730096, "step": 52730 }, { "epoch": 9.677922554597174, "grad_norm": 0.0005280401674099267, "learning_rate": 6.143675195530099e-06, "loss": 0.0003, "num_input_tokens_seen": 113742416, "step": 52735 }, { "epoch": 9.678840154156726, "grad_norm": 0.0013306769542396069, "learning_rate": 6.142895653277194e-06, "loss": 0.0, "num_input_tokens_seen": 113752624, "step": 52740 }, { "epoch": 9.679757753716277, "grad_norm": 0.006236081011593342, "learning_rate": 6.142116081710787e-06, "loss": 0.0001, "num_input_tokens_seen": 113762896, "step": 52745 }, { "epoch": 9.68067535327583, "grad_norm": 0.044608473777770996, "learning_rate": 6.141336480850873e-06, "loss": 0.0001, "num_input_tokens_seen": 113773296, "step": 52750 }, { "epoch": 9.681592952835382, "grad_norm": 0.004523060284554958, "learning_rate": 6.140556850717446e-06, "loss": 0.0284, "num_input_tokens_seen": 113783696, "step": 52755 }, { "epoch": 9.682510552394934, "grad_norm": 0.0006295387865975499, "learning_rate": 6.139777191330504e-06, "loss": 0.0004, "num_input_tokens_seen": 113794768, "step": 52760 }, { "epoch": 9.683428151954487, "grad_norm": 0.01158551499247551, "learning_rate": 6.1389975027100445e-06, "loss": 0.0001, "num_input_tokens_seen": 113805776, "step": 52765 }, { "epoch": 9.684345751514039, "grad_norm": 0.007943015545606613, "learning_rate": 6.1382177848760625e-06, "loss": 0.0001, "num_input_tokens_seen": 113817456, "step": 52770 }, { "epoch": 9.68526335107359, "grad_norm": 0.006175614893436432, "learning_rate": 6.13743803784856e-06, "loss": 0.0001, "num_input_tokens_seen": 113828496, "step": 52775 }, { "epoch": 9.686180950633144, "grad_norm": 0.0007872063433751464, "learning_rate": 6.136658261647535e-06, "loss": 0.0, "num_input_tokens_seen": 113837936, "step": 52780 }, { "epoch": 9.687098550192696, "grad_norm": 0.0014211606467142701, "learning_rate": 6.135878456292986e-06, "loss": 0.0001, "num_input_tokens_seen": 113848624, "step": 52785 }, { "epoch": 9.688016149752247, "grad_norm": 0.0007352776592597365, "learning_rate": 6.135098621804915e-06, "loss": 0.0002, "num_input_tokens_seen": 113858608, "step": 52790 }, { "epoch": 9.6889337493118, "grad_norm": 0.6275522112846375, "learning_rate": 6.134318758203325e-06, "loss": 0.0001, "num_input_tokens_seen": 113869072, "step": 52795 }, { "epoch": 9.689851348871352, "grad_norm": 26.256364822387695, "learning_rate": 6.133538865508215e-06, "loss": 0.1377, "num_input_tokens_seen": 113879056, "step": 52800 }, { "epoch": 9.690768948430904, "grad_norm": 0.0018272942397743464, "learning_rate": 6.132758943739592e-06, "loss": 0.0001, "num_input_tokens_seen": 113890096, "step": 52805 }, { "epoch": 9.691686547990457, "grad_norm": 0.015764646232128143, "learning_rate": 6.131978992917455e-06, "loss": 0.0016, "num_input_tokens_seen": 113899984, "step": 52810 }, { "epoch": 9.692604147550009, "grad_norm": 0.07064446061849594, "learning_rate": 6.1311990130618135e-06, "loss": 0.0001, "num_input_tokens_seen": 113910576, "step": 52815 }, { "epoch": 9.69352174710956, "grad_norm": 0.3588034212589264, "learning_rate": 6.13041900419267e-06, "loss": 0.0002, "num_input_tokens_seen": 113921264, "step": 52820 }, { "epoch": 9.694439346669114, "grad_norm": 0.0006117763114161789, "learning_rate": 6.12963896633003e-06, "loss": 0.0005, "num_input_tokens_seen": 113931792, "step": 52825 }, { "epoch": 9.695356946228666, "grad_norm": 0.0010689517948776484, "learning_rate": 6.128858899493903e-06, "loss": 0.0, "num_input_tokens_seen": 113943888, "step": 52830 }, { "epoch": 9.696274545788217, "grad_norm": 0.0025997639168053865, "learning_rate": 6.128078803704294e-06, "loss": 0.0, "num_input_tokens_seen": 113955920, "step": 52835 }, { "epoch": 9.69719214534777, "grad_norm": 0.0009547580266371369, "learning_rate": 6.1272986789812125e-06, "loss": 0.0001, "num_input_tokens_seen": 113967312, "step": 52840 }, { "epoch": 9.698109744907322, "grad_norm": 0.0015312862815335393, "learning_rate": 6.126518525344666e-06, "loss": 0.1565, "num_input_tokens_seen": 113978000, "step": 52845 }, { "epoch": 9.699027344466874, "grad_norm": 35.492530822753906, "learning_rate": 6.125738342814667e-06, "loss": 0.1346, "num_input_tokens_seen": 113988784, "step": 52850 }, { "epoch": 9.699944944026427, "grad_norm": 0.009278100915253162, "learning_rate": 6.1249581314112214e-06, "loss": 0.272, "num_input_tokens_seen": 113999344, "step": 52855 }, { "epoch": 9.700862543585979, "grad_norm": 18.65165138244629, "learning_rate": 6.124177891154345e-06, "loss": 0.0041, "num_input_tokens_seen": 114010288, "step": 52860 }, { "epoch": 9.70178014314553, "grad_norm": 0.1778511106967926, "learning_rate": 6.123397622064049e-06, "loss": 0.0003, "num_input_tokens_seen": 114020304, "step": 52865 }, { "epoch": 9.702697742705084, "grad_norm": 0.01522406842559576, "learning_rate": 6.122617324160343e-06, "loss": 0.0001, "num_input_tokens_seen": 114030960, "step": 52870 }, { "epoch": 9.703615342264635, "grad_norm": 0.013106124475598335, "learning_rate": 6.121836997463244e-06, "loss": 0.0006, "num_input_tokens_seen": 114041872, "step": 52875 }, { "epoch": 9.704532941824187, "grad_norm": 0.007475442253053188, "learning_rate": 6.121056641992764e-06, "loss": 0.1067, "num_input_tokens_seen": 114052496, "step": 52880 }, { "epoch": 9.70545054138374, "grad_norm": 91.81060028076172, "learning_rate": 6.120276257768918e-06, "loss": 0.2139, "num_input_tokens_seen": 114063152, "step": 52885 }, { "epoch": 9.706368140943292, "grad_norm": 0.0015287165297195315, "learning_rate": 6.1194958448117246e-06, "loss": 0.001, "num_input_tokens_seen": 114074320, "step": 52890 }, { "epoch": 9.707285740502844, "grad_norm": 0.0020216170232743025, "learning_rate": 6.118715403141197e-06, "loss": 0.0001, "num_input_tokens_seen": 114084848, "step": 52895 }, { "epoch": 9.708203340062397, "grad_norm": 0.06979931145906448, "learning_rate": 6.117934932777352e-06, "loss": 0.0002, "num_input_tokens_seen": 114096944, "step": 52900 }, { "epoch": 9.709120939621949, "grad_norm": 0.07539893686771393, "learning_rate": 6.117154433740209e-06, "loss": 0.0004, "num_input_tokens_seen": 114108496, "step": 52905 }, { "epoch": 9.7100385391815, "grad_norm": 0.014880476519465446, "learning_rate": 6.1163739060497865e-06, "loss": 0.0007, "num_input_tokens_seen": 114118800, "step": 52910 }, { "epoch": 9.710956138741054, "grad_norm": 31.4554500579834, "learning_rate": 6.115593349726104e-06, "loss": 0.2063, "num_input_tokens_seen": 114129424, "step": 52915 }, { "epoch": 9.711873738300605, "grad_norm": 0.007103178650140762, "learning_rate": 6.114812764789179e-06, "loss": 0.0, "num_input_tokens_seen": 114140944, "step": 52920 }, { "epoch": 9.712791337860157, "grad_norm": 0.002438608091324568, "learning_rate": 6.114032151259038e-06, "loss": 0.0285, "num_input_tokens_seen": 114152144, "step": 52925 }, { "epoch": 9.71370893741971, "grad_norm": 31.33157730102539, "learning_rate": 6.113251509155695e-06, "loss": 0.2125, "num_input_tokens_seen": 114162256, "step": 52930 }, { "epoch": 9.714626536979262, "grad_norm": 0.020604075863957405, "learning_rate": 6.112470838499178e-06, "loss": 0.0003, "num_input_tokens_seen": 114174128, "step": 52935 }, { "epoch": 9.715544136538814, "grad_norm": 0.4092804193496704, "learning_rate": 6.1116901393095086e-06, "loss": 0.0629, "num_input_tokens_seen": 114185776, "step": 52940 }, { "epoch": 9.716461736098367, "grad_norm": 0.0013153721811249852, "learning_rate": 6.110909411606709e-06, "loss": 0.0002, "num_input_tokens_seen": 114194384, "step": 52945 }, { "epoch": 9.717379335657919, "grad_norm": 0.009376056492328644, "learning_rate": 6.110128655410806e-06, "loss": 0.0001, "num_input_tokens_seen": 114204720, "step": 52950 }, { "epoch": 9.71829693521747, "grad_norm": 0.0014751437120139599, "learning_rate": 6.109347870741823e-06, "loss": 0.0, "num_input_tokens_seen": 114215280, "step": 52955 }, { "epoch": 9.719214534777024, "grad_norm": 0.01674477383494377, "learning_rate": 6.1085670576197855e-06, "loss": 0.0029, "num_input_tokens_seen": 114225264, "step": 52960 }, { "epoch": 9.720132134336575, "grad_norm": 0.005357400514185429, "learning_rate": 6.107786216064723e-06, "loss": 0.0285, "num_input_tokens_seen": 114236336, "step": 52965 }, { "epoch": 9.721049733896127, "grad_norm": 0.0005321276839822531, "learning_rate": 6.107005346096659e-06, "loss": 0.0001, "num_input_tokens_seen": 114245648, "step": 52970 }, { "epoch": 9.72196733345568, "grad_norm": 0.0008309823460876942, "learning_rate": 6.106224447735625e-06, "loss": 0.0001, "num_input_tokens_seen": 114257200, "step": 52975 }, { "epoch": 9.722884933015232, "grad_norm": 0.00271590449847281, "learning_rate": 6.105443521001648e-06, "loss": 0.0001, "num_input_tokens_seen": 114268048, "step": 52980 }, { "epoch": 9.723802532574783, "grad_norm": 0.018659960478544235, "learning_rate": 6.104662565914758e-06, "loss": 0.3315, "num_input_tokens_seen": 114280016, "step": 52985 }, { "epoch": 9.724720132134337, "grad_norm": 0.15659929811954498, "learning_rate": 6.1038815824949845e-06, "loss": 0.0002, "num_input_tokens_seen": 114291280, "step": 52990 }, { "epoch": 9.725637731693888, "grad_norm": 0.0059741646982729435, "learning_rate": 6.1031005707623595e-06, "loss": 0.2001, "num_input_tokens_seen": 114300560, "step": 52995 }, { "epoch": 9.72655533125344, "grad_norm": 0.007280679419636726, "learning_rate": 6.102319530736916e-06, "loss": 0.0001, "num_input_tokens_seen": 114312432, "step": 53000 }, { "epoch": 9.727472930812993, "grad_norm": 0.0038972862530499697, "learning_rate": 6.101538462438684e-06, "loss": 0.1036, "num_input_tokens_seen": 114323376, "step": 53005 }, { "epoch": 9.728390530372545, "grad_norm": 0.0880383551120758, "learning_rate": 6.100757365887698e-06, "loss": 0.0003, "num_input_tokens_seen": 114334416, "step": 53010 }, { "epoch": 9.729308129932097, "grad_norm": 0.44133684039115906, "learning_rate": 6.099976241103991e-06, "loss": 0.0004, "num_input_tokens_seen": 114343920, "step": 53015 }, { "epoch": 9.73022572949165, "grad_norm": 0.03903937712311745, "learning_rate": 6.099195088107598e-06, "loss": 0.0001, "num_input_tokens_seen": 114354480, "step": 53020 }, { "epoch": 9.731143329051202, "grad_norm": 0.0006784311844967306, "learning_rate": 6.098413906918556e-06, "loss": 0.1166, "num_input_tokens_seen": 114365200, "step": 53025 }, { "epoch": 9.732060928610753, "grad_norm": 0.0485432893037796, "learning_rate": 6.097632697556898e-06, "loss": 0.1006, "num_input_tokens_seen": 114376848, "step": 53030 }, { "epoch": 9.732978528170307, "grad_norm": 0.014476806856691837, "learning_rate": 6.096851460042663e-06, "loss": 0.0978, "num_input_tokens_seen": 114386096, "step": 53035 }, { "epoch": 9.733896127729858, "grad_norm": 0.0005866423598490655, "learning_rate": 6.096070194395888e-06, "loss": 0.2145, "num_input_tokens_seen": 114397264, "step": 53040 }, { "epoch": 9.73481372728941, "grad_norm": 0.033456217497587204, "learning_rate": 6.095288900636611e-06, "loss": 0.2505, "num_input_tokens_seen": 114408656, "step": 53045 }, { "epoch": 9.735731326848963, "grad_norm": 0.042636897414922714, "learning_rate": 6.094507578784872e-06, "loss": 0.0001, "num_input_tokens_seen": 114420016, "step": 53050 }, { "epoch": 9.736648926408515, "grad_norm": 0.04480762034654617, "learning_rate": 6.093726228860709e-06, "loss": 0.1784, "num_input_tokens_seen": 114431216, "step": 53055 }, { "epoch": 9.737566525968067, "grad_norm": 0.0023965921718627214, "learning_rate": 6.092944850884165e-06, "loss": 0.0003, "num_input_tokens_seen": 114442096, "step": 53060 }, { "epoch": 9.73848412552762, "grad_norm": 0.013863388448953629, "learning_rate": 6.092163444875278e-06, "loss": 0.1584, "num_input_tokens_seen": 114453392, "step": 53065 }, { "epoch": 9.739401725087172, "grad_norm": 0.003795298980548978, "learning_rate": 6.091382010854091e-06, "loss": 0.1222, "num_input_tokens_seen": 114463952, "step": 53070 }, { "epoch": 9.740319324646723, "grad_norm": 25.636455535888672, "learning_rate": 6.0906005488406485e-06, "loss": 0.0765, "num_input_tokens_seen": 114474000, "step": 53075 }, { "epoch": 9.741236924206277, "grad_norm": 0.3214033544063568, "learning_rate": 6.089819058854991e-06, "loss": 0.0004, "num_input_tokens_seen": 114486128, "step": 53080 }, { "epoch": 9.742154523765828, "grad_norm": 0.021956318989396095, "learning_rate": 6.089037540917165e-06, "loss": 0.0735, "num_input_tokens_seen": 114497648, "step": 53085 }, { "epoch": 9.74307212332538, "grad_norm": 0.019262770190835, "learning_rate": 6.088255995047212e-06, "loss": 0.1631, "num_input_tokens_seen": 114509072, "step": 53090 }, { "epoch": 9.743989722884933, "grad_norm": 0.021705174818634987, "learning_rate": 6.087474421265179e-06, "loss": 0.0002, "num_input_tokens_seen": 114518608, "step": 53095 }, { "epoch": 9.744907322444485, "grad_norm": 0.013967396691441536, "learning_rate": 6.0866928195911155e-06, "loss": 0.1964, "num_input_tokens_seen": 114529776, "step": 53100 }, { "epoch": 9.745824922004036, "grad_norm": 0.04830647632479668, "learning_rate": 6.085911190045064e-06, "loss": 0.0014, "num_input_tokens_seen": 114539792, "step": 53105 }, { "epoch": 9.74674252156359, "grad_norm": 22.367507934570312, "learning_rate": 6.0851295326470726e-06, "loss": 0.1351, "num_input_tokens_seen": 114550256, "step": 53110 }, { "epoch": 9.747660121123142, "grad_norm": 0.09369724243879318, "learning_rate": 6.0843478474171925e-06, "loss": 0.0004, "num_input_tokens_seen": 114561648, "step": 53115 }, { "epoch": 9.748577720682695, "grad_norm": 0.03911084681749344, "learning_rate": 6.083566134375468e-06, "loss": 0.0003, "num_input_tokens_seen": 114571952, "step": 53120 }, { "epoch": 9.749495320242247, "grad_norm": 0.005795163568109274, "learning_rate": 6.082784393541954e-06, "loss": 0.0006, "num_input_tokens_seen": 114582480, "step": 53125 }, { "epoch": 9.750412919801798, "grad_norm": 0.029508866369724274, "learning_rate": 6.082002624936697e-06, "loss": 0.0002, "num_input_tokens_seen": 114593200, "step": 53130 }, { "epoch": 9.751330519361352, "grad_norm": 0.006520286202430725, "learning_rate": 6.08122082857975e-06, "loss": 0.0002, "num_input_tokens_seen": 114602960, "step": 53135 }, { "epoch": 9.752248118920903, "grad_norm": 0.007965579628944397, "learning_rate": 6.080439004491165e-06, "loss": 0.0764, "num_input_tokens_seen": 114612880, "step": 53140 }, { "epoch": 9.753165718480455, "grad_norm": 19.931625366210938, "learning_rate": 6.079657152690993e-06, "loss": 0.0539, "num_input_tokens_seen": 114625168, "step": 53145 }, { "epoch": 9.754083318040008, "grad_norm": 0.20587851107120514, "learning_rate": 6.07887527319929e-06, "loss": 0.1628, "num_input_tokens_seen": 114636496, "step": 53150 }, { "epoch": 9.75500091759956, "grad_norm": 0.14175790548324585, "learning_rate": 6.078093366036105e-06, "loss": 0.0004, "num_input_tokens_seen": 114647216, "step": 53155 }, { "epoch": 9.755918517159111, "grad_norm": 0.012073502875864506, "learning_rate": 6.0773114312215e-06, "loss": 0.0944, "num_input_tokens_seen": 114657264, "step": 53160 }, { "epoch": 9.756836116718665, "grad_norm": 45.33522415161133, "learning_rate": 6.076529468775524e-06, "loss": 0.0961, "num_input_tokens_seen": 114668240, "step": 53165 }, { "epoch": 9.757753716278216, "grad_norm": 0.034060195088386536, "learning_rate": 6.075747478718237e-06, "loss": 0.0005, "num_input_tokens_seen": 114679728, "step": 53170 }, { "epoch": 9.758671315837768, "grad_norm": 0.01576729491353035, "learning_rate": 6.074965461069693e-06, "loss": 0.0315, "num_input_tokens_seen": 114689840, "step": 53175 }, { "epoch": 9.759588915397321, "grad_norm": 41.56085205078125, "learning_rate": 6.074183415849952e-06, "loss": 0.2384, "num_input_tokens_seen": 114701232, "step": 53180 }, { "epoch": 9.760506514956873, "grad_norm": 118.03861999511719, "learning_rate": 6.073401343079071e-06, "loss": 0.0387, "num_input_tokens_seen": 114712080, "step": 53185 }, { "epoch": 9.761424114516425, "grad_norm": 0.023973790928721428, "learning_rate": 6.072619242777109e-06, "loss": 0.0003, "num_input_tokens_seen": 114722160, "step": 53190 }, { "epoch": 9.762341714075978, "grad_norm": 0.06343439966440201, "learning_rate": 6.0718371149641266e-06, "loss": 0.0109, "num_input_tokens_seen": 114733488, "step": 53195 }, { "epoch": 9.76325931363553, "grad_norm": 28.14345932006836, "learning_rate": 6.071054959660182e-06, "loss": 0.3737, "num_input_tokens_seen": 114744848, "step": 53200 }, { "epoch": 9.764176913195081, "grad_norm": 0.02574021741747856, "learning_rate": 6.070272776885338e-06, "loss": 0.0005, "num_input_tokens_seen": 114756048, "step": 53205 }, { "epoch": 9.765094512754635, "grad_norm": 0.0029882933013141155, "learning_rate": 6.069490566659657e-06, "loss": 0.1056, "num_input_tokens_seen": 114765840, "step": 53210 }, { "epoch": 9.766012112314186, "grad_norm": 0.34988829493522644, "learning_rate": 6.068708329003201e-06, "loss": 0.0206, "num_input_tokens_seen": 114776656, "step": 53215 }, { "epoch": 9.766929711873738, "grad_norm": 0.03797489404678345, "learning_rate": 6.067926063936031e-06, "loss": 0.1318, "num_input_tokens_seen": 114788272, "step": 53220 }, { "epoch": 9.767847311433291, "grad_norm": 2.352721929550171, "learning_rate": 6.067143771478213e-06, "loss": 0.0011, "num_input_tokens_seen": 114798832, "step": 53225 }, { "epoch": 9.768764910992843, "grad_norm": 0.009983114898204803, "learning_rate": 6.066361451649812e-06, "loss": 0.0006, "num_input_tokens_seen": 114809360, "step": 53230 }, { "epoch": 9.769682510552395, "grad_norm": 0.12456212192773819, "learning_rate": 6.065579104470892e-06, "loss": 0.001, "num_input_tokens_seen": 114819408, "step": 53235 }, { "epoch": 9.770600110111948, "grad_norm": 0.012606460601091385, "learning_rate": 6.06479672996152e-06, "loss": 0.1741, "num_input_tokens_seen": 114830224, "step": 53240 }, { "epoch": 9.7715177096715, "grad_norm": 0.020322920754551888, "learning_rate": 6.064014328141762e-06, "loss": 0.0001, "num_input_tokens_seen": 114840432, "step": 53245 }, { "epoch": 9.772435309231051, "grad_norm": 0.024764487519860268, "learning_rate": 6.063231899031684e-06, "loss": 0.0053, "num_input_tokens_seen": 114852016, "step": 53250 }, { "epoch": 9.773352908790605, "grad_norm": 0.04112667590379715, "learning_rate": 6.062449442651357e-06, "loss": 0.0006, "num_input_tokens_seen": 114863920, "step": 53255 }, { "epoch": 9.774270508350156, "grad_norm": 0.024285422638058662, "learning_rate": 6.061666959020849e-06, "loss": 0.0637, "num_input_tokens_seen": 114874992, "step": 53260 }, { "epoch": 9.775188107909708, "grad_norm": 0.02512039616703987, "learning_rate": 6.060884448160227e-06, "loss": 0.0007, "num_input_tokens_seen": 114885616, "step": 53265 }, { "epoch": 9.776105707469261, "grad_norm": 8.90228271484375, "learning_rate": 6.0601019100895654e-06, "loss": 0.0104, "num_input_tokens_seen": 114895536, "step": 53270 }, { "epoch": 9.777023307028813, "grad_norm": 0.028646918013691902, "learning_rate": 6.0593193448289315e-06, "loss": 0.0001, "num_input_tokens_seen": 114904752, "step": 53275 }, { "epoch": 9.777940906588364, "grad_norm": 0.0054755364544689655, "learning_rate": 6.058536752398398e-06, "loss": 0.0005, "num_input_tokens_seen": 114916144, "step": 53280 }, { "epoch": 9.778858506147918, "grad_norm": 0.09966360777616501, "learning_rate": 6.057754132818038e-06, "loss": 0.0012, "num_input_tokens_seen": 114927056, "step": 53285 }, { "epoch": 9.77977610570747, "grad_norm": 0.19678995013237, "learning_rate": 6.056971486107923e-06, "loss": 0.0005, "num_input_tokens_seen": 114937680, "step": 53290 }, { "epoch": 9.780693705267021, "grad_norm": 0.04450207203626633, "learning_rate": 6.056188812288129e-06, "loss": 0.0002, "num_input_tokens_seen": 114948912, "step": 53295 }, { "epoch": 9.781611304826574, "grad_norm": 0.0009399944683536887, "learning_rate": 6.055406111378727e-06, "loss": 0.0041, "num_input_tokens_seen": 114960368, "step": 53300 }, { "epoch": 9.782528904386126, "grad_norm": 508.5893249511719, "learning_rate": 6.0546233833997956e-06, "loss": 0.0405, "num_input_tokens_seen": 114972208, "step": 53305 }, { "epoch": 9.783446503945678, "grad_norm": 0.03508399426937103, "learning_rate": 6.053840628371408e-06, "loss": 0.0002, "num_input_tokens_seen": 114983184, "step": 53310 }, { "epoch": 9.784364103505231, "grad_norm": 0.014762058854103088, "learning_rate": 6.053057846313642e-06, "loss": 0.0004, "num_input_tokens_seen": 114994288, "step": 53315 }, { "epoch": 9.785281703064783, "grad_norm": 21.34368133544922, "learning_rate": 6.052275037246575e-06, "loss": 0.3137, "num_input_tokens_seen": 115004720, "step": 53320 }, { "epoch": 9.786199302624334, "grad_norm": 0.000897285935934633, "learning_rate": 6.051492201190285e-06, "loss": 0.0001, "num_input_tokens_seen": 115014992, "step": 53325 }, { "epoch": 9.787116902183888, "grad_norm": 0.005046375095844269, "learning_rate": 6.050709338164846e-06, "loss": 0.0004, "num_input_tokens_seen": 115025296, "step": 53330 }, { "epoch": 9.78803450174344, "grad_norm": 25.974233627319336, "learning_rate": 6.049926448190344e-06, "loss": 0.1722, "num_input_tokens_seen": 115035312, "step": 53335 }, { "epoch": 9.788952101302991, "grad_norm": 0.01758204773068428, "learning_rate": 6.049143531286855e-06, "loss": 0.0001, "num_input_tokens_seen": 115046096, "step": 53340 }, { "epoch": 9.789869700862544, "grad_norm": 0.03645270690321922, "learning_rate": 6.048360587474461e-06, "loss": 0.0014, "num_input_tokens_seen": 115056560, "step": 53345 }, { "epoch": 9.790787300422096, "grad_norm": 0.02784423902630806, "learning_rate": 6.047577616773242e-06, "loss": 0.0003, "num_input_tokens_seen": 115067344, "step": 53350 }, { "epoch": 9.791704899981648, "grad_norm": 23.150650024414062, "learning_rate": 6.0467946192032815e-06, "loss": 0.006, "num_input_tokens_seen": 115075888, "step": 53355 }, { "epoch": 9.792622499541201, "grad_norm": 0.0059583852998912334, "learning_rate": 6.04601159478466e-06, "loss": 0.0, "num_input_tokens_seen": 115084688, "step": 53360 }, { "epoch": 9.793540099100753, "grad_norm": 0.003261428326368332, "learning_rate": 6.045228543537463e-06, "loss": 0.0009, "num_input_tokens_seen": 115095216, "step": 53365 }, { "epoch": 9.794457698660304, "grad_norm": 0.006896838080137968, "learning_rate": 6.044445465481774e-06, "loss": 0.0001, "num_input_tokens_seen": 115107088, "step": 53370 }, { "epoch": 9.795375298219858, "grad_norm": 0.016466671600937843, "learning_rate": 6.043662360637678e-06, "loss": 0.0007, "num_input_tokens_seen": 115118128, "step": 53375 }, { "epoch": 9.79629289777941, "grad_norm": 9.69207763671875, "learning_rate": 6.0428792290252595e-06, "loss": 0.0162, "num_input_tokens_seen": 115129584, "step": 53380 }, { "epoch": 9.79721049733896, "grad_norm": 0.008175849914550781, "learning_rate": 6.0420960706646046e-06, "loss": 0.0001, "num_input_tokens_seen": 115140816, "step": 53385 }, { "epoch": 9.798128096898514, "grad_norm": 34.95915603637695, "learning_rate": 6.0413128855758e-06, "loss": 0.0532, "num_input_tokens_seen": 115151120, "step": 53390 }, { "epoch": 9.799045696458066, "grad_norm": 0.007500749547034502, "learning_rate": 6.040529673778936e-06, "loss": 0.0001, "num_input_tokens_seen": 115160912, "step": 53395 }, { "epoch": 9.799963296017618, "grad_norm": 0.10616080462932587, "learning_rate": 6.039746435294097e-06, "loss": 0.0004, "num_input_tokens_seen": 115171216, "step": 53400 }, { "epoch": 9.800880895577171, "grad_norm": 0.3133735954761505, "learning_rate": 6.038963170141374e-06, "loss": 0.1162, "num_input_tokens_seen": 115181968, "step": 53405 }, { "epoch": 9.801798495136723, "grad_norm": 0.22803282737731934, "learning_rate": 6.0381798783408555e-06, "loss": 0.0003, "num_input_tokens_seen": 115192048, "step": 53410 }, { "epoch": 9.802716094696274, "grad_norm": 0.047732941806316376, "learning_rate": 6.037396559912631e-06, "loss": 0.0001, "num_input_tokens_seen": 115204720, "step": 53415 }, { "epoch": 9.803633694255828, "grad_norm": 0.031373102217912674, "learning_rate": 6.036613214876795e-06, "loss": 0.0002, "num_input_tokens_seen": 115215664, "step": 53420 }, { "epoch": 9.80455129381538, "grad_norm": 0.16963587701320648, "learning_rate": 6.0358298432534355e-06, "loss": 0.0001, "num_input_tokens_seen": 115227024, "step": 53425 }, { "epoch": 9.80546889337493, "grad_norm": 0.002683380153030157, "learning_rate": 6.035046445062647e-06, "loss": 0.0001, "num_input_tokens_seen": 115235824, "step": 53430 }, { "epoch": 9.806386492934484, "grad_norm": 0.0007365756900981069, "learning_rate": 6.0342630203245204e-06, "loss": 0.1334, "num_input_tokens_seen": 115246992, "step": 53435 }, { "epoch": 9.807304092494036, "grad_norm": 0.0011190788354724646, "learning_rate": 6.03347956905915e-06, "loss": 0.0, "num_input_tokens_seen": 115258128, "step": 53440 }, { "epoch": 9.808221692053587, "grad_norm": 0.006441858597099781, "learning_rate": 6.03269609128663e-06, "loss": 0.0007, "num_input_tokens_seen": 115268720, "step": 53445 }, { "epoch": 9.80913929161314, "grad_norm": 0.0056203799322247505, "learning_rate": 6.031912587027057e-06, "loss": 0.0001, "num_input_tokens_seen": 115280592, "step": 53450 }, { "epoch": 9.810056891172692, "grad_norm": 0.19009758532047272, "learning_rate": 6.031129056300526e-06, "loss": 0.0182, "num_input_tokens_seen": 115291824, "step": 53455 }, { "epoch": 9.810974490732244, "grad_norm": 0.042900439351797104, "learning_rate": 6.030345499127131e-06, "loss": 0.0052, "num_input_tokens_seen": 115303152, "step": 53460 }, { "epoch": 9.811892090291797, "grad_norm": 136.44468688964844, "learning_rate": 6.029561915526971e-06, "loss": 0.1658, "num_input_tokens_seen": 115314480, "step": 53465 }, { "epoch": 9.812809689851349, "grad_norm": 0.0004843178321607411, "learning_rate": 6.0287783055201445e-06, "loss": 0.0645, "num_input_tokens_seen": 115324720, "step": 53470 }, { "epoch": 9.8137272894109, "grad_norm": 0.039336610585451126, "learning_rate": 6.027994669126748e-06, "loss": 0.2321, "num_input_tokens_seen": 115335920, "step": 53475 }, { "epoch": 9.814644888970454, "grad_norm": 0.0005149120115675032, "learning_rate": 6.027211006366882e-06, "loss": 0.0001, "num_input_tokens_seen": 115346512, "step": 53480 }, { "epoch": 9.815562488530006, "grad_norm": 0.004877419676631689, "learning_rate": 6.026427317260645e-06, "loss": 0.0003, "num_input_tokens_seen": 115357328, "step": 53485 }, { "epoch": 9.816480088089557, "grad_norm": 0.012358362786471844, "learning_rate": 6.0256436018281395e-06, "loss": 0.1439, "num_input_tokens_seen": 115368368, "step": 53490 }, { "epoch": 9.81739768764911, "grad_norm": 0.0028340546414256096, "learning_rate": 6.024859860089464e-06, "loss": 0.0, "num_input_tokens_seen": 115377520, "step": 53495 }, { "epoch": 9.818315287208662, "grad_norm": 0.09249962866306305, "learning_rate": 6.0240760920647215e-06, "loss": 0.0002, "num_input_tokens_seen": 115388656, "step": 53500 }, { "epoch": 9.819232886768214, "grad_norm": 0.003115264233201742, "learning_rate": 6.023292297774015e-06, "loss": 0.0002, "num_input_tokens_seen": 115400400, "step": 53505 }, { "epoch": 9.820150486327767, "grad_norm": 0.013228806667029858, "learning_rate": 6.022508477237447e-06, "loss": 0.0001, "num_input_tokens_seen": 115412784, "step": 53510 }, { "epoch": 9.821068085887319, "grad_norm": 0.00042307135299779475, "learning_rate": 6.021724630475122e-06, "loss": 0.1224, "num_input_tokens_seen": 115424624, "step": 53515 }, { "epoch": 9.82198568544687, "grad_norm": 0.02682112343609333, "learning_rate": 6.020940757507142e-06, "loss": 0.0001, "num_input_tokens_seen": 115436240, "step": 53520 }, { "epoch": 9.822903285006424, "grad_norm": 0.010701077058911324, "learning_rate": 6.020156858353614e-06, "loss": 0.0001, "num_input_tokens_seen": 115447472, "step": 53525 }, { "epoch": 9.823820884565976, "grad_norm": 0.010700498707592487, "learning_rate": 6.0193729330346465e-06, "loss": 0.0006, "num_input_tokens_seen": 115456880, "step": 53530 }, { "epoch": 9.824738484125527, "grad_norm": 51.52181625366211, "learning_rate": 6.01858898157034e-06, "loss": 0.0229, "num_input_tokens_seen": 115467920, "step": 53535 }, { "epoch": 9.82565608368508, "grad_norm": 0.32848086953163147, "learning_rate": 6.017805003980806e-06, "loss": 0.0003, "num_input_tokens_seen": 115479408, "step": 53540 }, { "epoch": 9.826573683244632, "grad_norm": 8.069189071655273, "learning_rate": 6.0170210002861515e-06, "loss": 0.1515, "num_input_tokens_seen": 115490512, "step": 53545 }, { "epoch": 9.827491282804184, "grad_norm": 0.002450199332088232, "learning_rate": 6.016236970506485e-06, "loss": 0.1378, "num_input_tokens_seen": 115501584, "step": 53550 }, { "epoch": 9.828408882363737, "grad_norm": 0.009787247516214848, "learning_rate": 6.015452914661914e-06, "loss": 0.001, "num_input_tokens_seen": 115512240, "step": 53555 }, { "epoch": 9.829326481923289, "grad_norm": 0.018031032755970955, "learning_rate": 6.01466883277255e-06, "loss": 0.0004, "num_input_tokens_seen": 115523024, "step": 53560 }, { "epoch": 9.83024408148284, "grad_norm": 0.001366983400657773, "learning_rate": 6.013884724858503e-06, "loss": 0.1196, "num_input_tokens_seen": 115533616, "step": 53565 }, { "epoch": 9.831161681042394, "grad_norm": 0.016039688140153885, "learning_rate": 6.013100590939885e-06, "loss": 0.0002, "num_input_tokens_seen": 115543888, "step": 53570 }, { "epoch": 9.832079280601945, "grad_norm": 0.11398051679134369, "learning_rate": 6.012316431036805e-06, "loss": 0.0004, "num_input_tokens_seen": 115555312, "step": 53575 }, { "epoch": 9.832996880161497, "grad_norm": 1.0397443771362305, "learning_rate": 6.011532245169379e-06, "loss": 0.0008, "num_input_tokens_seen": 115566928, "step": 53580 }, { "epoch": 9.83391447972105, "grad_norm": 0.013350630179047585, "learning_rate": 6.010748033357718e-06, "loss": 0.1223, "num_input_tokens_seen": 115578864, "step": 53585 }, { "epoch": 9.834832079280602, "grad_norm": 0.09414554387331009, "learning_rate": 6.009963795621938e-06, "loss": 0.0032, "num_input_tokens_seen": 115589072, "step": 53590 }, { "epoch": 9.835749678840154, "grad_norm": 0.004047599621117115, "learning_rate": 6.00917953198215e-06, "loss": 0.0011, "num_input_tokens_seen": 115598256, "step": 53595 }, { "epoch": 9.836667278399707, "grad_norm": 0.1436987668275833, "learning_rate": 6.008395242458472e-06, "loss": 0.1472, "num_input_tokens_seen": 115609488, "step": 53600 }, { "epoch": 9.837584877959259, "grad_norm": 0.03863922879099846, "learning_rate": 6.007610927071018e-06, "loss": 0.0001, "num_input_tokens_seen": 115621040, "step": 53605 }, { "epoch": 9.83850247751881, "grad_norm": 0.004516406450420618, "learning_rate": 6.006826585839907e-06, "loss": 0.2127, "num_input_tokens_seen": 115632304, "step": 53610 }, { "epoch": 9.839420077078364, "grad_norm": 0.06866142153739929, "learning_rate": 6.006042218785253e-06, "loss": 0.0002, "num_input_tokens_seen": 115642416, "step": 53615 }, { "epoch": 9.840337676637915, "grad_norm": 0.004880616441369057, "learning_rate": 6.0052578259271755e-06, "loss": 0.1161, "num_input_tokens_seen": 115652912, "step": 53620 }, { "epoch": 9.841255276197467, "grad_norm": 0.0035232342779636383, "learning_rate": 6.004473407285794e-06, "loss": 0.0001, "num_input_tokens_seen": 115663376, "step": 53625 }, { "epoch": 9.84217287575702, "grad_norm": 0.0021135425195097923, "learning_rate": 6.0036889628812245e-06, "loss": 0.0001, "num_input_tokens_seen": 115673968, "step": 53630 }, { "epoch": 9.843090475316572, "grad_norm": 0.013651174493134022, "learning_rate": 6.00290449273359e-06, "loss": 0.0003, "num_input_tokens_seen": 115683696, "step": 53635 }, { "epoch": 9.844008074876124, "grad_norm": 0.0013828995870426297, "learning_rate": 6.0021199968630095e-06, "loss": 0.0001, "num_input_tokens_seen": 115693968, "step": 53640 }, { "epoch": 9.844925674435677, "grad_norm": 0.0019614961929619312, "learning_rate": 6.0013354752896045e-06, "loss": 0.0001, "num_input_tokens_seen": 115704432, "step": 53645 }, { "epoch": 9.845843273995229, "grad_norm": 0.023280713707208633, "learning_rate": 6.000550928033496e-06, "loss": 0.0002, "num_input_tokens_seen": 115715120, "step": 53650 }, { "epoch": 9.84676087355478, "grad_norm": 0.024827634915709496, "learning_rate": 5.9997663551148074e-06, "loss": 0.0001, "num_input_tokens_seen": 115725520, "step": 53655 }, { "epoch": 9.847678473114334, "grad_norm": 0.002537929452955723, "learning_rate": 5.998981756553661e-06, "loss": 0.0944, "num_input_tokens_seen": 115735952, "step": 53660 }, { "epoch": 9.848596072673885, "grad_norm": 0.0071067954413592815, "learning_rate": 5.998197132370181e-06, "loss": 0.0041, "num_input_tokens_seen": 115745424, "step": 53665 }, { "epoch": 9.849513672233437, "grad_norm": 0.004052290227264166, "learning_rate": 5.997412482584491e-06, "loss": 0.0, "num_input_tokens_seen": 115756912, "step": 53670 }, { "epoch": 9.85043127179299, "grad_norm": 0.00945139117538929, "learning_rate": 5.996627807216717e-06, "loss": 0.0004, "num_input_tokens_seen": 115768336, "step": 53675 }, { "epoch": 9.851348871352542, "grad_norm": 0.023240933194756508, "learning_rate": 5.995843106286985e-06, "loss": 0.0005, "num_input_tokens_seen": 115780368, "step": 53680 }, { "epoch": 9.852266470912094, "grad_norm": 0.020421132445335388, "learning_rate": 5.9950583798154195e-06, "loss": 0.0886, "num_input_tokens_seen": 115792592, "step": 53685 }, { "epoch": 9.853184070471647, "grad_norm": 0.00650536036118865, "learning_rate": 5.99427362782215e-06, "loss": 0.0006, "num_input_tokens_seen": 115803792, "step": 53690 }, { "epoch": 9.854101670031199, "grad_norm": 0.008045731112360954, "learning_rate": 5.9934888503273015e-06, "loss": 0.0878, "num_input_tokens_seen": 115814288, "step": 53695 }, { "epoch": 9.85501926959075, "grad_norm": 0.01306084543466568, "learning_rate": 5.9927040473510056e-06, "loss": 0.0001, "num_input_tokens_seen": 115823344, "step": 53700 }, { "epoch": 9.855936869150304, "grad_norm": 0.003276645904406905, "learning_rate": 5.991919218913388e-06, "loss": 0.2262, "num_input_tokens_seen": 115833904, "step": 53705 }, { "epoch": 9.856854468709855, "grad_norm": 0.07102370262145996, "learning_rate": 5.991134365034579e-06, "loss": 0.2251, "num_input_tokens_seen": 115845584, "step": 53710 }, { "epoch": 9.857772068269407, "grad_norm": 0.06666332483291626, "learning_rate": 5.990349485734712e-06, "loss": 0.0002, "num_input_tokens_seen": 115856784, "step": 53715 }, { "epoch": 9.85868966782896, "grad_norm": 22.687082290649414, "learning_rate": 5.989564581033914e-06, "loss": 0.2159, "num_input_tokens_seen": 115867728, "step": 53720 }, { "epoch": 9.859607267388512, "grad_norm": 0.003629149869084358, "learning_rate": 5.988779650952319e-06, "loss": 0.0828, "num_input_tokens_seen": 115878224, "step": 53725 }, { "epoch": 9.860524866948063, "grad_norm": 0.0011722815688699484, "learning_rate": 5.987994695510058e-06, "loss": 0.0001, "num_input_tokens_seen": 115890096, "step": 53730 }, { "epoch": 9.861442466507617, "grad_norm": 0.02673676423728466, "learning_rate": 5.987209714727264e-06, "loss": 0.0948, "num_input_tokens_seen": 115902448, "step": 53735 }, { "epoch": 9.862360066067168, "grad_norm": 87.93351745605469, "learning_rate": 5.986424708624071e-06, "loss": 0.0599, "num_input_tokens_seen": 115913488, "step": 53740 }, { "epoch": 9.86327766562672, "grad_norm": 0.05258498713374138, "learning_rate": 5.985639677220613e-06, "loss": 0.0145, "num_input_tokens_seen": 115924848, "step": 53745 }, { "epoch": 9.864195265186273, "grad_norm": 0.003925578203052282, "learning_rate": 5.984854620537026e-06, "loss": 0.0, "num_input_tokens_seen": 115936016, "step": 53750 }, { "epoch": 9.865112864745825, "grad_norm": 0.01978369429707527, "learning_rate": 5.984069538593444e-06, "loss": 0.0002, "num_input_tokens_seen": 115947696, "step": 53755 }, { "epoch": 9.866030464305377, "grad_norm": 0.010167136788368225, "learning_rate": 5.983284431410003e-06, "loss": 0.0944, "num_input_tokens_seen": 115958960, "step": 53760 }, { "epoch": 9.86694806386493, "grad_norm": 20.653533935546875, "learning_rate": 5.982499299006841e-06, "loss": 0.0028, "num_input_tokens_seen": 115969904, "step": 53765 }, { "epoch": 9.867865663424482, "grad_norm": 0.004828402306884527, "learning_rate": 5.981714141404093e-06, "loss": 0.0001, "num_input_tokens_seen": 115980016, "step": 53770 }, { "epoch": 9.868783262984033, "grad_norm": 0.0071423775516450405, "learning_rate": 5.9809289586219e-06, "loss": 0.1268, "num_input_tokens_seen": 115991728, "step": 53775 }, { "epoch": 9.869700862543587, "grad_norm": 0.13673733174800873, "learning_rate": 5.9801437506804e-06, "loss": 0.0003, "num_input_tokens_seen": 116003504, "step": 53780 }, { "epoch": 9.870618462103138, "grad_norm": 0.00046121724881231785, "learning_rate": 5.9793585175997316e-06, "loss": 0.019, "num_input_tokens_seen": 116013584, "step": 53785 }, { "epoch": 9.87153606166269, "grad_norm": 15.258478164672852, "learning_rate": 5.978573259400034e-06, "loss": 0.0763, "num_input_tokens_seen": 116024784, "step": 53790 }, { "epoch": 9.872453661222243, "grad_norm": 0.001057233544997871, "learning_rate": 5.977787976101449e-06, "loss": 0.0331, "num_input_tokens_seen": 116035312, "step": 53795 }, { "epoch": 9.873371260781795, "grad_norm": 0.0260712131857872, "learning_rate": 5.9770026677241185e-06, "loss": 0.0003, "num_input_tokens_seen": 116045328, "step": 53800 }, { "epoch": 9.874288860341347, "grad_norm": 0.08782072365283966, "learning_rate": 5.9762173342881835e-06, "loss": 0.0002, "num_input_tokens_seen": 116056464, "step": 53805 }, { "epoch": 9.8752064599009, "grad_norm": 0.011365734040737152, "learning_rate": 5.975431975813788e-06, "loss": 0.0001, "num_input_tokens_seen": 116066512, "step": 53810 }, { "epoch": 9.876124059460452, "grad_norm": 0.03324459493160248, "learning_rate": 5.974646592321073e-06, "loss": 0.0208, "num_input_tokens_seen": 116077712, "step": 53815 }, { "epoch": 9.877041659020003, "grad_norm": 0.01420076284557581, "learning_rate": 5.973861183830183e-06, "loss": 0.0645, "num_input_tokens_seen": 116088944, "step": 53820 }, { "epoch": 9.877959258579557, "grad_norm": 0.12400806695222855, "learning_rate": 5.973075750361265e-06, "loss": 0.0014, "num_input_tokens_seen": 116098928, "step": 53825 }, { "epoch": 9.878876858139108, "grad_norm": 0.00425015389919281, "learning_rate": 5.9722902919344595e-06, "loss": 0.0174, "num_input_tokens_seen": 116110160, "step": 53830 }, { "epoch": 9.87979445769866, "grad_norm": 0.08240222185850143, "learning_rate": 5.971504808569917e-06, "loss": 0.0797, "num_input_tokens_seen": 116121456, "step": 53835 }, { "epoch": 9.880712057258213, "grad_norm": 0.272087961435318, "learning_rate": 5.970719300287781e-06, "loss": 0.1791, "num_input_tokens_seen": 116132240, "step": 53840 }, { "epoch": 9.881629656817765, "grad_norm": 0.21702520549297333, "learning_rate": 5.9699337671081996e-06, "loss": 0.0008, "num_input_tokens_seen": 116144560, "step": 53845 }, { "epoch": 9.882547256377316, "grad_norm": 1.2571617364883423, "learning_rate": 5.96914820905132e-06, "loss": 0.001, "num_input_tokens_seen": 116155408, "step": 53850 }, { "epoch": 9.88346485593687, "grad_norm": 0.0030929360073059797, "learning_rate": 5.9683626261372905e-06, "loss": 0.0481, "num_input_tokens_seen": 116166864, "step": 53855 }, { "epoch": 9.884382455496421, "grad_norm": 0.18411561846733093, "learning_rate": 5.967577018386263e-06, "loss": 0.0007, "num_input_tokens_seen": 116175600, "step": 53860 }, { "epoch": 9.885300055055973, "grad_norm": 0.010890446603298187, "learning_rate": 5.966791385818383e-06, "loss": 0.113, "num_input_tokens_seen": 116187888, "step": 53865 }, { "epoch": 9.886217654615526, "grad_norm": 177.34573364257812, "learning_rate": 5.966005728453801e-06, "loss": 0.0427, "num_input_tokens_seen": 116198032, "step": 53870 }, { "epoch": 9.887135254175078, "grad_norm": 0.01431917306035757, "learning_rate": 5.9652200463126705e-06, "loss": 0.0004, "num_input_tokens_seen": 116209104, "step": 53875 }, { "epoch": 9.88805285373463, "grad_norm": 0.009763737209141254, "learning_rate": 5.964434339415141e-06, "loss": 0.0418, "num_input_tokens_seen": 116220240, "step": 53880 }, { "epoch": 9.888970453294183, "grad_norm": 0.008651265874505043, "learning_rate": 5.963648607781367e-06, "loss": 0.0227, "num_input_tokens_seen": 116231600, "step": 53885 }, { "epoch": 9.889888052853735, "grad_norm": 0.05083605647087097, "learning_rate": 5.962862851431498e-06, "loss": 0.0007, "num_input_tokens_seen": 116241488, "step": 53890 }, { "epoch": 9.890805652413286, "grad_norm": 144.6105499267578, "learning_rate": 5.96207707038569e-06, "loss": 0.0131, "num_input_tokens_seen": 116252624, "step": 53895 }, { "epoch": 9.89172325197284, "grad_norm": 832.3015747070312, "learning_rate": 5.961291264664096e-06, "loss": 0.0711, "num_input_tokens_seen": 116262640, "step": 53900 }, { "epoch": 9.892640851532391, "grad_norm": 55.958045959472656, "learning_rate": 5.9605054342868705e-06, "loss": 0.2662, "num_input_tokens_seen": 116273200, "step": 53905 }, { "epoch": 9.893558451091943, "grad_norm": 0.16099560260772705, "learning_rate": 5.959719579274172e-06, "loss": 0.0009, "num_input_tokens_seen": 116282576, "step": 53910 }, { "epoch": 9.894476050651496, "grad_norm": 0.0013010138645768166, "learning_rate": 5.95893369964615e-06, "loss": 0.0001, "num_input_tokens_seen": 116293584, "step": 53915 }, { "epoch": 9.895393650211048, "grad_norm": 0.03333665058016777, "learning_rate": 5.958147795422967e-06, "loss": 0.0013, "num_input_tokens_seen": 116304848, "step": 53920 }, { "epoch": 9.8963112497706, "grad_norm": 55.869319915771484, "learning_rate": 5.957361866624777e-06, "loss": 0.1752, "num_input_tokens_seen": 116315536, "step": 53925 }, { "epoch": 9.897228849330153, "grad_norm": 0.40547043085098267, "learning_rate": 5.956575913271738e-06, "loss": 0.0013, "num_input_tokens_seen": 116325264, "step": 53930 }, { "epoch": 9.898146448889705, "grad_norm": 0.0007777807768434286, "learning_rate": 5.955789935384012e-06, "loss": 0.1159, "num_input_tokens_seen": 116336400, "step": 53935 }, { "epoch": 9.899064048449256, "grad_norm": 71.63201904296875, "learning_rate": 5.9550039329817536e-06, "loss": 0.1283, "num_input_tokens_seen": 116347824, "step": 53940 }, { "epoch": 9.89998164800881, "grad_norm": 1.1481863260269165, "learning_rate": 5.954217906085126e-06, "loss": 0.0016, "num_input_tokens_seen": 116358768, "step": 53945 }, { "epoch": 9.900899247568361, "grad_norm": 0.025410208851099014, "learning_rate": 5.953431854714287e-06, "loss": 0.0975, "num_input_tokens_seen": 116369424, "step": 53950 }, { "epoch": 9.901816847127913, "grad_norm": 0.003165405709296465, "learning_rate": 5.9526457788893975e-06, "loss": 0.1347, "num_input_tokens_seen": 116380944, "step": 53955 }, { "epoch": 9.902734446687466, "grad_norm": 0.0692422166466713, "learning_rate": 5.951859678630621e-06, "loss": 0.0002, "num_input_tokens_seen": 116393296, "step": 53960 }, { "epoch": 9.903652046247018, "grad_norm": 0.0007057074690237641, "learning_rate": 5.95107355395812e-06, "loss": 0.0002, "num_input_tokens_seen": 116404624, "step": 53965 }, { "epoch": 9.90456964580657, "grad_norm": 0.001369265140965581, "learning_rate": 5.950287404892057e-06, "loss": 0.0, "num_input_tokens_seen": 116416208, "step": 53970 }, { "epoch": 9.905487245366123, "grad_norm": 29.843246459960938, "learning_rate": 5.949501231452594e-06, "loss": 0.0534, "num_input_tokens_seen": 116426896, "step": 53975 }, { "epoch": 9.906404844925675, "grad_norm": 0.962095320224762, "learning_rate": 5.948715033659894e-06, "loss": 0.1178, "num_input_tokens_seen": 116436816, "step": 53980 }, { "epoch": 9.907322444485226, "grad_norm": 0.11870665848255157, "learning_rate": 5.947928811534127e-06, "loss": 0.001, "num_input_tokens_seen": 116448304, "step": 53985 }, { "epoch": 9.90824004404478, "grad_norm": 0.0038305348716676235, "learning_rate": 5.9471425650954525e-06, "loss": 0.0003, "num_input_tokens_seen": 116459600, "step": 53990 }, { "epoch": 9.909157643604331, "grad_norm": 33.14289855957031, "learning_rate": 5.946356294364041e-06, "loss": 0.2083, "num_input_tokens_seen": 116469840, "step": 53995 }, { "epoch": 9.910075243163883, "grad_norm": 0.0059183272533118725, "learning_rate": 5.9455699993600555e-06, "loss": 0.1722, "num_input_tokens_seen": 116480304, "step": 54000 }, { "epoch": 9.910992842723436, "grad_norm": 0.21546654403209686, "learning_rate": 5.944783680103666e-06, "loss": 0.0013, "num_input_tokens_seen": 116490992, "step": 54005 }, { "epoch": 9.911910442282988, "grad_norm": 0.10239585489034653, "learning_rate": 5.94399733661504e-06, "loss": 0.0353, "num_input_tokens_seen": 116502960, "step": 54010 }, { "epoch": 9.91282804184254, "grad_norm": 0.000475465931231156, "learning_rate": 5.943210968914343e-06, "loss": 0.0005, "num_input_tokens_seen": 116513360, "step": 54015 }, { "epoch": 9.913745641402093, "grad_norm": 0.014018765650689602, "learning_rate": 5.942424577021751e-06, "loss": 0.213, "num_input_tokens_seen": 116525520, "step": 54020 }, { "epoch": 9.914663240961644, "grad_norm": 0.13595260679721832, "learning_rate": 5.9416381609574246e-06, "loss": 0.0003, "num_input_tokens_seen": 116537072, "step": 54025 }, { "epoch": 9.915580840521196, "grad_norm": 0.15858866274356842, "learning_rate": 5.9408517207415404e-06, "loss": 0.0003, "num_input_tokens_seen": 116547568, "step": 54030 }, { "epoch": 9.91649844008075, "grad_norm": 8.918269157409668, "learning_rate": 5.940065256394269e-06, "loss": 0.0053, "num_input_tokens_seen": 116556656, "step": 54035 }, { "epoch": 9.917416039640301, "grad_norm": 0.0029781791381537914, "learning_rate": 5.939278767935779e-06, "loss": 0.1595, "num_input_tokens_seen": 116567440, "step": 54040 }, { "epoch": 9.918333639199853, "grad_norm": 0.021420544013381004, "learning_rate": 5.938492255386246e-06, "loss": 0.0005, "num_input_tokens_seen": 116578512, "step": 54045 }, { "epoch": 9.919251238759406, "grad_norm": 0.001423510955646634, "learning_rate": 5.9377057187658395e-06, "loss": 0.0002, "num_input_tokens_seen": 116588912, "step": 54050 }, { "epoch": 9.920168838318958, "grad_norm": 0.01280379667878151, "learning_rate": 5.936919158094736e-06, "loss": 0.0006, "num_input_tokens_seen": 116599376, "step": 54055 }, { "epoch": 9.92108643787851, "grad_norm": 0.0020044581033289433, "learning_rate": 5.936132573393106e-06, "loss": 0.0003, "num_input_tokens_seen": 116610224, "step": 54060 }, { "epoch": 9.922004037438063, "grad_norm": 0.032702598720788956, "learning_rate": 5.935345964681129e-06, "loss": 0.0145, "num_input_tokens_seen": 116621680, "step": 54065 }, { "epoch": 9.922921636997614, "grad_norm": 0.0034289169125258923, "learning_rate": 5.934559331978976e-06, "loss": 0.0004, "num_input_tokens_seen": 116632272, "step": 54070 }, { "epoch": 9.923839236557166, "grad_norm": 0.0031689805909991264, "learning_rate": 5.933772675306825e-06, "loss": 0.0001, "num_input_tokens_seen": 116644304, "step": 54075 }, { "epoch": 9.92475683611672, "grad_norm": 0.0272917952388525, "learning_rate": 5.932985994684854e-06, "loss": 0.0004, "num_input_tokens_seen": 116655472, "step": 54080 }, { "epoch": 9.925674435676271, "grad_norm": 0.0044670142233371735, "learning_rate": 5.932199290133236e-06, "loss": 0.0003, "num_input_tokens_seen": 116666064, "step": 54085 }, { "epoch": 9.926592035235823, "grad_norm": 0.13022834062576294, "learning_rate": 5.931412561672151e-06, "loss": 0.0003, "num_input_tokens_seen": 116676784, "step": 54090 }, { "epoch": 9.927509634795376, "grad_norm": 0.07710959017276764, "learning_rate": 5.930625809321778e-06, "loss": 0.0001, "num_input_tokens_seen": 116688016, "step": 54095 }, { "epoch": 9.928427234354928, "grad_norm": 4.9697136878967285, "learning_rate": 5.9298390331022945e-06, "loss": 0.0036, "num_input_tokens_seen": 116698256, "step": 54100 }, { "epoch": 9.92934483391448, "grad_norm": 0.001751829870045185, "learning_rate": 5.9290522330338825e-06, "loss": 0.0001, "num_input_tokens_seen": 116709008, "step": 54105 }, { "epoch": 9.930262433474033, "grad_norm": 0.0017967104213312268, "learning_rate": 5.9282654091367194e-06, "loss": 0.3129, "num_input_tokens_seen": 116720208, "step": 54110 }, { "epoch": 9.931180033033584, "grad_norm": 54.0426025390625, "learning_rate": 5.927478561430987e-06, "loss": 0.1224, "num_input_tokens_seen": 116730736, "step": 54115 }, { "epoch": 9.932097632593136, "grad_norm": 0.005430799443274736, "learning_rate": 5.926691689936869e-06, "loss": 0.0001, "num_input_tokens_seen": 116740880, "step": 54120 }, { "epoch": 9.93301523215269, "grad_norm": 0.03087591752409935, "learning_rate": 5.925904794674543e-06, "loss": 0.3036, "num_input_tokens_seen": 116751888, "step": 54125 }, { "epoch": 9.93393283171224, "grad_norm": 0.004012973513454199, "learning_rate": 5.925117875664195e-06, "loss": 0.0002, "num_input_tokens_seen": 116763024, "step": 54130 }, { "epoch": 9.934850431271792, "grad_norm": 0.9320414066314697, "learning_rate": 5.924330932926007e-06, "loss": 0.0043, "num_input_tokens_seen": 116773616, "step": 54135 }, { "epoch": 9.935768030831346, "grad_norm": 0.0026722147595137358, "learning_rate": 5.923543966480163e-06, "loss": 0.0002, "num_input_tokens_seen": 116783536, "step": 54140 }, { "epoch": 9.936685630390897, "grad_norm": 0.006057519931346178, "learning_rate": 5.922756976346848e-06, "loss": 0.002, "num_input_tokens_seen": 116794096, "step": 54145 }, { "epoch": 9.937603229950449, "grad_norm": 0.002780819544568658, "learning_rate": 5.921969962546247e-06, "loss": 0.1657, "num_input_tokens_seen": 116805680, "step": 54150 }, { "epoch": 9.938520829510002, "grad_norm": 0.0016622452531009912, "learning_rate": 5.9211829250985455e-06, "loss": 0.0008, "num_input_tokens_seen": 116816912, "step": 54155 }, { "epoch": 9.939438429069554, "grad_norm": 0.0006147412350401282, "learning_rate": 5.920395864023929e-06, "loss": 0.0001, "num_input_tokens_seen": 116826864, "step": 54160 }, { "epoch": 9.940356028629106, "grad_norm": 0.0014605610631406307, "learning_rate": 5.919608779342585e-06, "loss": 0.0013, "num_input_tokens_seen": 116837520, "step": 54165 }, { "epoch": 9.94127362818866, "grad_norm": 0.0049517517909407616, "learning_rate": 5.918821671074702e-06, "loss": 0.1647, "num_input_tokens_seen": 116848688, "step": 54170 }, { "epoch": 9.94219122774821, "grad_norm": 0.009175284765660763, "learning_rate": 5.918034539240466e-06, "loss": 0.1595, "num_input_tokens_seen": 116860688, "step": 54175 }, { "epoch": 9.943108827307762, "grad_norm": 0.10048091411590576, "learning_rate": 5.917247383860067e-06, "loss": 0.0001, "num_input_tokens_seen": 116871664, "step": 54180 }, { "epoch": 9.944026426867316, "grad_norm": 0.017621155828237534, "learning_rate": 5.916460204953696e-06, "loss": 0.0001, "num_input_tokens_seen": 116882384, "step": 54185 }, { "epoch": 9.944944026426867, "grad_norm": 0.010432771407067776, "learning_rate": 5.9156730025415396e-06, "loss": 0.0001, "num_input_tokens_seen": 116894224, "step": 54190 }, { "epoch": 9.945861625986419, "grad_norm": 0.06704340130090714, "learning_rate": 5.914885776643791e-06, "loss": 0.0001, "num_input_tokens_seen": 116904176, "step": 54195 }, { "epoch": 9.946779225545972, "grad_norm": 0.029069187119603157, "learning_rate": 5.914098527280638e-06, "loss": 0.0002, "num_input_tokens_seen": 116916368, "step": 54200 }, { "epoch": 9.947696825105524, "grad_norm": 0.14559942483901978, "learning_rate": 5.913311254472276e-06, "loss": 0.0004, "num_input_tokens_seen": 116926448, "step": 54205 }, { "epoch": 9.948614424665076, "grad_norm": 0.016804104670882225, "learning_rate": 5.912523958238896e-06, "loss": 0.0001, "num_input_tokens_seen": 116938096, "step": 54210 }, { "epoch": 9.949532024224629, "grad_norm": 0.001928546465933323, "learning_rate": 5.91173663860069e-06, "loss": 0.0017, "num_input_tokens_seen": 116948112, "step": 54215 }, { "epoch": 9.95044962378418, "grad_norm": 35.24949645996094, "learning_rate": 5.910949295577853e-06, "loss": 0.2064, "num_input_tokens_seen": 116959312, "step": 54220 }, { "epoch": 9.951367223343732, "grad_norm": 39.28691482543945, "learning_rate": 5.910161929190577e-06, "loss": 0.1005, "num_input_tokens_seen": 116970544, "step": 54225 }, { "epoch": 9.952284822903286, "grad_norm": 0.000826186325866729, "learning_rate": 5.9093745394590594e-06, "loss": 0.1036, "num_input_tokens_seen": 116982128, "step": 54230 }, { "epoch": 9.953202422462837, "grad_norm": 0.01855781488120556, "learning_rate": 5.908587126403494e-06, "loss": 0.0001, "num_input_tokens_seen": 116993360, "step": 54235 }, { "epoch": 9.954120022022389, "grad_norm": 0.0051332321017980576, "learning_rate": 5.907799690044078e-06, "loss": 0.0004, "num_input_tokens_seen": 117003248, "step": 54240 }, { "epoch": 9.955037621581942, "grad_norm": 0.028837310150265694, "learning_rate": 5.907012230401005e-06, "loss": 0.0002, "num_input_tokens_seen": 117013968, "step": 54245 }, { "epoch": 9.955955221141494, "grad_norm": 256.9009094238281, "learning_rate": 5.9062247474944745e-06, "loss": 0.1036, "num_input_tokens_seen": 117023856, "step": 54250 }, { "epoch": 9.956872820701046, "grad_norm": 94.32503509521484, "learning_rate": 5.905437241344685e-06, "loss": 0.333, "num_input_tokens_seen": 117034480, "step": 54255 }, { "epoch": 9.957790420260599, "grad_norm": 0.0045072101056575775, "learning_rate": 5.904649711971833e-06, "loss": 0.0017, "num_input_tokens_seen": 117045200, "step": 54260 }, { "epoch": 9.95870801982015, "grad_norm": 0.0033303643576800823, "learning_rate": 5.9038621593961175e-06, "loss": 0.0764, "num_input_tokens_seen": 117054096, "step": 54265 }, { "epoch": 9.959625619379702, "grad_norm": 0.0012315495405346155, "learning_rate": 5.903074583637738e-06, "loss": 0.0914, "num_input_tokens_seen": 117065808, "step": 54270 }, { "epoch": 9.960543218939256, "grad_norm": 0.015148314647376537, "learning_rate": 5.902286984716895e-06, "loss": 0.0002, "num_input_tokens_seen": 117075312, "step": 54275 }, { "epoch": 9.961460818498807, "grad_norm": 0.034494441002607346, "learning_rate": 5.901499362653791e-06, "loss": 0.0011, "num_input_tokens_seen": 117084976, "step": 54280 }, { "epoch": 9.962378418058359, "grad_norm": 0.006285141222178936, "learning_rate": 5.9007117174686245e-06, "loss": 0.0015, "num_input_tokens_seen": 117097648, "step": 54285 }, { "epoch": 9.963296017617912, "grad_norm": 0.012987805530428886, "learning_rate": 5.899924049181599e-06, "loss": 0.0002, "num_input_tokens_seen": 117108848, "step": 54290 }, { "epoch": 9.964213617177464, "grad_norm": 0.007535442244261503, "learning_rate": 5.899136357812917e-06, "loss": 0.0033, "num_input_tokens_seen": 117118448, "step": 54295 }, { "epoch": 9.965131216737015, "grad_norm": 0.01250600814819336, "learning_rate": 5.898348643382779e-06, "loss": 0.0005, "num_input_tokens_seen": 117129840, "step": 54300 }, { "epoch": 9.966048816296569, "grad_norm": 0.00956063810735941, "learning_rate": 5.897560905911391e-06, "loss": 0.0001, "num_input_tokens_seen": 117141456, "step": 54305 }, { "epoch": 9.96696641585612, "grad_norm": 0.0020363377407193184, "learning_rate": 5.896773145418958e-06, "loss": 0.0001, "num_input_tokens_seen": 117151920, "step": 54310 }, { "epoch": 9.967884015415672, "grad_norm": 0.06248706579208374, "learning_rate": 5.895985361925684e-06, "loss": 0.0001, "num_input_tokens_seen": 117163440, "step": 54315 }, { "epoch": 9.968801614975225, "grad_norm": 0.016553962603211403, "learning_rate": 5.895197555451771e-06, "loss": 0.0914, "num_input_tokens_seen": 117174640, "step": 54320 }, { "epoch": 9.969719214534777, "grad_norm": 0.01776307262480259, "learning_rate": 5.894409726017431e-06, "loss": 0.0003, "num_input_tokens_seen": 117183216, "step": 54325 }, { "epoch": 9.970636814094329, "grad_norm": 0.21485213935375214, "learning_rate": 5.893621873642867e-06, "loss": 0.0004, "num_input_tokens_seen": 117194064, "step": 54330 }, { "epoch": 9.971554413653882, "grad_norm": 0.006335836369544268, "learning_rate": 5.892833998348286e-06, "loss": 0.1596, "num_input_tokens_seen": 117202832, "step": 54335 }, { "epoch": 9.972472013213434, "grad_norm": 0.007279525510966778, "learning_rate": 5.892046100153899e-06, "loss": 0.0005, "num_input_tokens_seen": 117213552, "step": 54340 }, { "epoch": 9.973389612772985, "grad_norm": 0.01554765086621046, "learning_rate": 5.891258179079911e-06, "loss": 0.0002, "num_input_tokens_seen": 117223728, "step": 54345 }, { "epoch": 9.974307212332539, "grad_norm": 0.015239483676850796, "learning_rate": 5.89047023514653e-06, "loss": 0.0004, "num_input_tokens_seen": 117234320, "step": 54350 }, { "epoch": 9.97522481189209, "grad_norm": 0.007094224914908409, "learning_rate": 5.88968226837397e-06, "loss": 0.0449, "num_input_tokens_seen": 117245424, "step": 54355 }, { "epoch": 9.976142411451642, "grad_norm": 0.007638871204108, "learning_rate": 5.888894278782438e-06, "loss": 0.001, "num_input_tokens_seen": 117255568, "step": 54360 }, { "epoch": 9.977060011011195, "grad_norm": 0.0036900988779962063, "learning_rate": 5.888106266392146e-06, "loss": 0.0001, "num_input_tokens_seen": 117266672, "step": 54365 }, { "epoch": 9.977977610570747, "grad_norm": 0.020315976813435555, "learning_rate": 5.887318231223303e-06, "loss": 0.0005, "num_input_tokens_seen": 117277296, "step": 54370 }, { "epoch": 9.978895210130299, "grad_norm": 0.1141231507062912, "learning_rate": 5.886530173296126e-06, "loss": 0.0001, "num_input_tokens_seen": 117289616, "step": 54375 }, { "epoch": 9.979812809689852, "grad_norm": 0.0006720886449329555, "learning_rate": 5.885742092630821e-06, "loss": 0.147, "num_input_tokens_seen": 117300432, "step": 54380 }, { "epoch": 9.980730409249404, "grad_norm": 0.0054401494562625885, "learning_rate": 5.884953989247604e-06, "loss": 0.0002, "num_input_tokens_seen": 117311472, "step": 54385 }, { "epoch": 9.981648008808955, "grad_norm": 0.0012693112948909402, "learning_rate": 5.88416586316669e-06, "loss": 0.0003, "num_input_tokens_seen": 117321840, "step": 54390 }, { "epoch": 9.982565608368509, "grad_norm": 0.0017710216343402863, "learning_rate": 5.883377714408292e-06, "loss": 0.0214, "num_input_tokens_seen": 117332368, "step": 54395 }, { "epoch": 9.98348320792806, "grad_norm": 0.006543489173054695, "learning_rate": 5.882589542992624e-06, "loss": 0.0147, "num_input_tokens_seen": 117342704, "step": 54400 }, { "epoch": 9.984400807487612, "grad_norm": 0.09635882079601288, "learning_rate": 5.881801348939903e-06, "loss": 0.0002, "num_input_tokens_seen": 117353904, "step": 54405 }, { "epoch": 9.985318407047165, "grad_norm": 0.018854571506381035, "learning_rate": 5.881013132270343e-06, "loss": 0.0001, "num_input_tokens_seen": 117363888, "step": 54410 }, { "epoch": 9.986236006606717, "grad_norm": 20.074039459228516, "learning_rate": 5.880224893004163e-06, "loss": 0.0015, "num_input_tokens_seen": 117374672, "step": 54415 }, { "epoch": 9.987153606166268, "grad_norm": 0.0013568534050136805, "learning_rate": 5.879436631161577e-06, "loss": 0.0097, "num_input_tokens_seen": 117384592, "step": 54420 }, { "epoch": 9.988071205725822, "grad_norm": 0.0014498316450044513, "learning_rate": 5.8786483467628054e-06, "loss": 0.0005, "num_input_tokens_seen": 117395536, "step": 54425 }, { "epoch": 9.988988805285373, "grad_norm": 0.07495929300785065, "learning_rate": 5.877860039828065e-06, "loss": 0.0949, "num_input_tokens_seen": 117405744, "step": 54430 }, { "epoch": 9.989906404844925, "grad_norm": 0.002341208280995488, "learning_rate": 5.877071710377575e-06, "loss": 0.0004, "num_input_tokens_seen": 117416560, "step": 54435 }, { "epoch": 9.990824004404478, "grad_norm": 0.018135033547878265, "learning_rate": 5.876283358431556e-06, "loss": 0.2421, "num_input_tokens_seen": 117426224, "step": 54440 }, { "epoch": 9.99174160396403, "grad_norm": 0.009639568626880646, "learning_rate": 5.875494984010226e-06, "loss": 0.0001, "num_input_tokens_seen": 117436560, "step": 54445 }, { "epoch": 9.992659203523582, "grad_norm": 0.0011616323608905077, "learning_rate": 5.874706587133807e-06, "loss": 0.0015, "num_input_tokens_seen": 117446800, "step": 54450 }, { "epoch": 9.993576803083135, "grad_norm": 0.006727827247232199, "learning_rate": 5.87391816782252e-06, "loss": 0.0014, "num_input_tokens_seen": 117458288, "step": 54455 }, { "epoch": 9.994494402642687, "grad_norm": 15.86075210571289, "learning_rate": 5.873129726096585e-06, "loss": 0.4326, "num_input_tokens_seen": 117469264, "step": 54460 }, { "epoch": 9.995412002202238, "grad_norm": 0.02158522978425026, "learning_rate": 5.872341261976228e-06, "loss": 0.0002, "num_input_tokens_seen": 117480208, "step": 54465 }, { "epoch": 9.996329601761792, "grad_norm": 0.030199643224477768, "learning_rate": 5.871552775481668e-06, "loss": 0.0002, "num_input_tokens_seen": 117490768, "step": 54470 }, { "epoch": 9.997247201321343, "grad_norm": 0.00132934155408293, "learning_rate": 5.870764266633131e-06, "loss": 0.0027, "num_input_tokens_seen": 117501040, "step": 54475 }, { "epoch": 9.998164800880895, "grad_norm": 0.006783721968531609, "learning_rate": 5.8699757354508395e-06, "loss": 0.0001, "num_input_tokens_seen": 117512816, "step": 54480 }, { "epoch": 9.999082400440448, "grad_norm": 0.006557955406606197, "learning_rate": 5.869187181955018e-06, "loss": 0.1285, "num_input_tokens_seen": 117523056, "step": 54485 }, { "epoch": 10.0, "grad_norm": 0.04680979624390602, "learning_rate": 5.868398606165894e-06, "loss": 0.0001, "num_input_tokens_seen": 117532800, "step": 54490 }, { "epoch": 10.0, "eval_loss": 0.34945470094680786, "eval_runtime": 179.1534, "eval_samples_per_second": 30.415, "eval_steps_per_second": 7.608, "num_input_tokens_seen": 117532800, "step": 54490 }, { "epoch": 10.000917599559552, "grad_norm": 0.00491032749414444, "learning_rate": 5.86761000810369e-06, "loss": 0.0001, "num_input_tokens_seen": 117543552, "step": 54495 }, { "epoch": 10.001835199119105, "grad_norm": 0.007692485116422176, "learning_rate": 5.866821387788636e-06, "loss": 0.0003, "num_input_tokens_seen": 117554080, "step": 54500 }, { "epoch": 10.002752798678657, "grad_norm": 0.004225658252835274, "learning_rate": 5.8660327452409545e-06, "loss": 0.1752, "num_input_tokens_seen": 117564992, "step": 54505 }, { "epoch": 10.003670398238208, "grad_norm": 0.012210490182042122, "learning_rate": 5.8652440804808775e-06, "loss": 0.0947, "num_input_tokens_seen": 117576672, "step": 54510 }, { "epoch": 10.004587997797762, "grad_norm": 0.005284294486045837, "learning_rate": 5.8644553935286305e-06, "loss": 0.0001, "num_input_tokens_seen": 117587808, "step": 54515 }, { "epoch": 10.005505597357313, "grad_norm": 0.032484229654073715, "learning_rate": 5.863666684404442e-06, "loss": 0.0002, "num_input_tokens_seen": 117597920, "step": 54520 }, { "epoch": 10.006423196916865, "grad_norm": 0.033284734934568405, "learning_rate": 5.862877953128542e-06, "loss": 0.0001, "num_input_tokens_seen": 117608384, "step": 54525 }, { "epoch": 10.007340796476418, "grad_norm": 0.0007022321224212646, "learning_rate": 5.862089199721159e-06, "loss": 0.0001, "num_input_tokens_seen": 117620064, "step": 54530 }, { "epoch": 10.00825839603597, "grad_norm": 0.009325806982815266, "learning_rate": 5.861300424202525e-06, "loss": 0.1597, "num_input_tokens_seen": 117631520, "step": 54535 }, { "epoch": 10.009175995595522, "grad_norm": 0.01609598658978939, "learning_rate": 5.8605116265928685e-06, "loss": 0.0001, "num_input_tokens_seen": 117642784, "step": 54540 }, { "epoch": 10.010093595155075, "grad_norm": 0.12733854353427887, "learning_rate": 5.859722806912424e-06, "loss": 0.0004, "num_input_tokens_seen": 117653760, "step": 54545 }, { "epoch": 10.011011194714627, "grad_norm": 0.00439105462282896, "learning_rate": 5.858933965181421e-06, "loss": 0.0001, "num_input_tokens_seen": 117664832, "step": 54550 }, { "epoch": 10.011928794274178, "grad_norm": 0.05730082094669342, "learning_rate": 5.858145101420093e-06, "loss": 0.0003, "num_input_tokens_seen": 117677184, "step": 54555 }, { "epoch": 10.012846393833732, "grad_norm": 123.45387268066406, "learning_rate": 5.857356215648674e-06, "loss": 0.2079, "num_input_tokens_seen": 117687680, "step": 54560 }, { "epoch": 10.013763993393283, "grad_norm": 0.010537032969295979, "learning_rate": 5.856567307887397e-06, "loss": 0.0, "num_input_tokens_seen": 117698528, "step": 54565 }, { "epoch": 10.014681592952835, "grad_norm": 0.010245291516184807, "learning_rate": 5.8557783781564945e-06, "loss": 0.0001, "num_input_tokens_seen": 117710752, "step": 54570 }, { "epoch": 10.015599192512388, "grad_norm": 0.012825654819607735, "learning_rate": 5.854989426476204e-06, "loss": 0.0002, "num_input_tokens_seen": 117721184, "step": 54575 }, { "epoch": 10.01651679207194, "grad_norm": 0.19500145316123962, "learning_rate": 5.85420045286676e-06, "loss": 0.0004, "num_input_tokens_seen": 117730432, "step": 54580 }, { "epoch": 10.017434391631491, "grad_norm": 0.0015267377020791173, "learning_rate": 5.853411457348398e-06, "loss": 0.0003, "num_input_tokens_seen": 117740608, "step": 54585 }, { "epoch": 10.018351991191045, "grad_norm": 0.0038102190010249615, "learning_rate": 5.852622439941355e-06, "loss": 0.0824, "num_input_tokens_seen": 117751328, "step": 54590 }, { "epoch": 10.019269590750596, "grad_norm": 0.7131141424179077, "learning_rate": 5.8518334006658675e-06, "loss": 0.0623, "num_input_tokens_seen": 117763712, "step": 54595 }, { "epoch": 10.020187190310148, "grad_norm": 0.0011220014421269298, "learning_rate": 5.8510443395421735e-06, "loss": 0.0001, "num_input_tokens_seen": 117774592, "step": 54600 }, { "epoch": 10.021104789869701, "grad_norm": 0.007329802494496107, "learning_rate": 5.850255256590512e-06, "loss": 0.0004, "num_input_tokens_seen": 117784928, "step": 54605 }, { "epoch": 10.022022389429253, "grad_norm": 0.027374736964702606, "learning_rate": 5.84946615183112e-06, "loss": 0.0001, "num_input_tokens_seen": 117795840, "step": 54610 }, { "epoch": 10.022939988988805, "grad_norm": 0.029994715005159378, "learning_rate": 5.8486770252842376e-06, "loss": 0.0002, "num_input_tokens_seen": 117806816, "step": 54615 }, { "epoch": 10.023857588548358, "grad_norm": 0.0009530683164484799, "learning_rate": 5.847887876970106e-06, "loss": 0.0008, "num_input_tokens_seen": 117817120, "step": 54620 }, { "epoch": 10.02477518810791, "grad_norm": 0.027541102841496468, "learning_rate": 5.847098706908964e-06, "loss": 0.1068, "num_input_tokens_seen": 117828000, "step": 54625 }, { "epoch": 10.025692787667461, "grad_norm": 0.005130257457494736, "learning_rate": 5.846309515121052e-06, "loss": 0.0005, "num_input_tokens_seen": 117839264, "step": 54630 }, { "epoch": 10.026610387227015, "grad_norm": 0.001527432119473815, "learning_rate": 5.845520301626615e-06, "loss": 0.0001, "num_input_tokens_seen": 117850016, "step": 54635 }, { "epoch": 10.027527986786566, "grad_norm": 0.0016404468333348632, "learning_rate": 5.84473106644589e-06, "loss": 0.0002, "num_input_tokens_seen": 117860096, "step": 54640 }, { "epoch": 10.028445586346118, "grad_norm": 0.005737438332289457, "learning_rate": 5.843941809599123e-06, "loss": 0.0001, "num_input_tokens_seen": 117870624, "step": 54645 }, { "epoch": 10.029363185905671, "grad_norm": 0.023348502814769745, "learning_rate": 5.8431525311065585e-06, "loss": 0.1098, "num_input_tokens_seen": 117881664, "step": 54650 }, { "epoch": 10.030280785465223, "grad_norm": 0.0007217458914965391, "learning_rate": 5.842363230988436e-06, "loss": 0.0001, "num_input_tokens_seen": 117892480, "step": 54655 }, { "epoch": 10.031198385024775, "grad_norm": 0.00420977221801877, "learning_rate": 5.841573909265004e-06, "loss": 0.0001, "num_input_tokens_seen": 117902336, "step": 54660 }, { "epoch": 10.032115984584328, "grad_norm": 0.00675622746348381, "learning_rate": 5.840784565956504e-06, "loss": 0.0588, "num_input_tokens_seen": 117913888, "step": 54665 }, { "epoch": 10.03303358414388, "grad_norm": 0.0007262000581249595, "learning_rate": 5.8399952010831836e-06, "loss": 0.0, "num_input_tokens_seen": 117924032, "step": 54670 }, { "epoch": 10.033951183703431, "grad_norm": 0.027585051953792572, "learning_rate": 5.8392058146652885e-06, "loss": 0.0001, "num_input_tokens_seen": 117935424, "step": 54675 }, { "epoch": 10.034868783262985, "grad_norm": 0.006406676024198532, "learning_rate": 5.838416406723064e-06, "loss": 0.0001, "num_input_tokens_seen": 117947168, "step": 54680 }, { "epoch": 10.035786382822536, "grad_norm": 6.398768424987793, "learning_rate": 5.837626977276759e-06, "loss": 0.0055, "num_input_tokens_seen": 117957920, "step": 54685 }, { "epoch": 10.036703982382088, "grad_norm": 0.0023541911505162716, "learning_rate": 5.836837526346619e-06, "loss": 0.0, "num_input_tokens_seen": 117968032, "step": 54690 }, { "epoch": 10.037621581941641, "grad_norm": 0.0017919806996360421, "learning_rate": 5.836048053952895e-06, "loss": 0.0002, "num_input_tokens_seen": 117979744, "step": 54695 }, { "epoch": 10.038539181501193, "grad_norm": 0.0009742988040670753, "learning_rate": 5.835258560115834e-06, "loss": 0.0793, "num_input_tokens_seen": 117990048, "step": 54700 }, { "epoch": 10.039456781060744, "grad_norm": 0.005686070770025253, "learning_rate": 5.834469044855684e-06, "loss": 0.0001, "num_input_tokens_seen": 117999968, "step": 54705 }, { "epoch": 10.040374380620298, "grad_norm": 0.0035612289793789387, "learning_rate": 5.833679508192698e-06, "loss": 0.0002, "num_input_tokens_seen": 118011104, "step": 54710 }, { "epoch": 10.04129198017985, "grad_norm": 0.021847564727067947, "learning_rate": 5.8328899501471235e-06, "loss": 0.0101, "num_input_tokens_seen": 118022144, "step": 54715 }, { "epoch": 10.042209579739401, "grad_norm": 0.008140292018651962, "learning_rate": 5.832100370739214e-06, "loss": 0.0002, "num_input_tokens_seen": 118033216, "step": 54720 }, { "epoch": 10.043127179298954, "grad_norm": 23.49448013305664, "learning_rate": 5.831310769989219e-06, "loss": 0.0663, "num_input_tokens_seen": 118042912, "step": 54725 }, { "epoch": 10.044044778858506, "grad_norm": 0.0040011634118855, "learning_rate": 5.830521147917391e-06, "loss": 0.0015, "num_input_tokens_seen": 118053824, "step": 54730 }, { "epoch": 10.044962378418058, "grad_norm": 0.03653145954012871, "learning_rate": 5.8297315045439826e-06, "loss": 0.0003, "num_input_tokens_seen": 118063520, "step": 54735 }, { "epoch": 10.045879977977611, "grad_norm": 0.004884708672761917, "learning_rate": 5.8289418398892474e-06, "loss": 0.0001, "num_input_tokens_seen": 118074784, "step": 54740 }, { "epoch": 10.046797577537163, "grad_norm": 0.11191938817501068, "learning_rate": 5.828152153973439e-06, "loss": 0.0004, "num_input_tokens_seen": 118084416, "step": 54745 }, { "epoch": 10.047715177096714, "grad_norm": 0.10446074604988098, "learning_rate": 5.82736244681681e-06, "loss": 0.0005, "num_input_tokens_seen": 118094112, "step": 54750 }, { "epoch": 10.048632776656268, "grad_norm": 38.95897674560547, "learning_rate": 5.826572718439617e-06, "loss": 0.2433, "num_input_tokens_seen": 118104768, "step": 54755 }, { "epoch": 10.04955037621582, "grad_norm": 0.0033028048928827047, "learning_rate": 5.825782968862116e-06, "loss": 0.0001, "num_input_tokens_seen": 118116512, "step": 54760 }, { "epoch": 10.050467975775371, "grad_norm": 0.026057302951812744, "learning_rate": 5.8249931981045605e-06, "loss": 0.0015, "num_input_tokens_seen": 118128096, "step": 54765 }, { "epoch": 10.051385575334924, "grad_norm": 0.0005461288965307176, "learning_rate": 5.824203406187209e-06, "loss": 0.0001, "num_input_tokens_seen": 118139744, "step": 54770 }, { "epoch": 10.052303174894476, "grad_norm": 0.044620197266340256, "learning_rate": 5.823413593130317e-06, "loss": 0.0001, "num_input_tokens_seen": 118150720, "step": 54775 }, { "epoch": 10.053220774454028, "grad_norm": 0.004477956332266331, "learning_rate": 5.822623758954143e-06, "loss": 0.0001, "num_input_tokens_seen": 118162176, "step": 54780 }, { "epoch": 10.054138374013581, "grad_norm": 140.26751708984375, "learning_rate": 5.821833903678944e-06, "loss": 0.0855, "num_input_tokens_seen": 118173120, "step": 54785 }, { "epoch": 10.055055973573133, "grad_norm": 0.3658330738544464, "learning_rate": 5.821044027324978e-06, "loss": 0.0005, "num_input_tokens_seen": 118183552, "step": 54790 }, { "epoch": 10.055973573132684, "grad_norm": 0.01008511520922184, "learning_rate": 5.820254129912507e-06, "loss": 0.0004, "num_input_tokens_seen": 118193408, "step": 54795 }, { "epoch": 10.056891172692238, "grad_norm": 0.004210298415273428, "learning_rate": 5.819464211461789e-06, "loss": 0.0005, "num_input_tokens_seen": 118203872, "step": 54800 }, { "epoch": 10.05780877225179, "grad_norm": 0.001590335858054459, "learning_rate": 5.818674271993082e-06, "loss": 0.0002, "num_input_tokens_seen": 118214080, "step": 54805 }, { "epoch": 10.05872637181134, "grad_norm": 0.005625723861157894, "learning_rate": 5.8178843115266505e-06, "loss": 0.0244, "num_input_tokens_seen": 118225536, "step": 54810 }, { "epoch": 10.059643971370894, "grad_norm": 0.00449683191254735, "learning_rate": 5.8170943300827536e-06, "loss": 0.0002, "num_input_tokens_seen": 118235552, "step": 54815 }, { "epoch": 10.060561570930446, "grad_norm": 0.6427986025810242, "learning_rate": 5.816304327681653e-06, "loss": 0.0008, "num_input_tokens_seen": 118245376, "step": 54820 }, { "epoch": 10.061479170489998, "grad_norm": 0.009349852800369263, "learning_rate": 5.815514304343612e-06, "loss": 0.0013, "num_input_tokens_seen": 118256736, "step": 54825 }, { "epoch": 10.062396770049551, "grad_norm": 0.17661768198013306, "learning_rate": 5.814724260088894e-06, "loss": 0.0002, "num_input_tokens_seen": 118267008, "step": 54830 }, { "epoch": 10.063314369609103, "grad_norm": 6.267184734344482, "learning_rate": 5.813934194937762e-06, "loss": 0.0066, "num_input_tokens_seen": 118278272, "step": 54835 }, { "epoch": 10.064231969168654, "grad_norm": 0.00033850004547275603, "learning_rate": 5.813144108910476e-06, "loss": 0.1755, "num_input_tokens_seen": 118290400, "step": 54840 }, { "epoch": 10.065149568728208, "grad_norm": 0.0014496803050860763, "learning_rate": 5.812354002027307e-06, "loss": 0.0001, "num_input_tokens_seen": 118302080, "step": 54845 }, { "epoch": 10.06606716828776, "grad_norm": 0.0007247411995194852, "learning_rate": 5.811563874308514e-06, "loss": 0.0001, "num_input_tokens_seen": 118312320, "step": 54850 }, { "epoch": 10.06698476784731, "grad_norm": 23.222225189208984, "learning_rate": 5.810773725774369e-06, "loss": 0.0532, "num_input_tokens_seen": 118323104, "step": 54855 }, { "epoch": 10.067902367406864, "grad_norm": 0.006146248895674944, "learning_rate": 5.809983556445131e-06, "loss": 0.0047, "num_input_tokens_seen": 118334240, "step": 54860 }, { "epoch": 10.068819966966416, "grad_norm": 428.98553466796875, "learning_rate": 5.8091933663410714e-06, "loss": 0.1005, "num_input_tokens_seen": 118345216, "step": 54865 }, { "epoch": 10.069737566525967, "grad_norm": 0.002060233848169446, "learning_rate": 5.808403155482457e-06, "loss": 0.0883, "num_input_tokens_seen": 118355584, "step": 54870 }, { "epoch": 10.07065516608552, "grad_norm": 0.0026766767259687185, "learning_rate": 5.807612923889554e-06, "loss": 0.0, "num_input_tokens_seen": 118366112, "step": 54875 }, { "epoch": 10.071572765645072, "grad_norm": 0.0027662082575261593, "learning_rate": 5.806822671582631e-06, "loss": 0.0001, "num_input_tokens_seen": 118376064, "step": 54880 }, { "epoch": 10.072490365204624, "grad_norm": 0.03451436385512352, "learning_rate": 5.806032398581958e-06, "loss": 0.0081, "num_input_tokens_seen": 118386560, "step": 54885 }, { "epoch": 10.073407964764177, "grad_norm": 57.6323127746582, "learning_rate": 5.805242104907801e-06, "loss": 0.1502, "num_input_tokens_seen": 118396960, "step": 54890 }, { "epoch": 10.074325564323729, "grad_norm": 0.010323352180421352, "learning_rate": 5.804451790580434e-06, "loss": 0.0002, "num_input_tokens_seen": 118408160, "step": 54895 }, { "epoch": 10.07524316388328, "grad_norm": 0.0012287190183997154, "learning_rate": 5.803661455620124e-06, "loss": 0.0, "num_input_tokens_seen": 118419136, "step": 54900 }, { "epoch": 10.076160763442834, "grad_norm": 0.027056805789470673, "learning_rate": 5.802871100047145e-06, "loss": 0.0001, "num_input_tokens_seen": 118429280, "step": 54905 }, { "epoch": 10.077078363002386, "grad_norm": 0.017688939347863197, "learning_rate": 5.802080723881766e-06, "loss": 0.1439, "num_input_tokens_seen": 118439520, "step": 54910 }, { "epoch": 10.077995962561937, "grad_norm": 0.0016783748287707567, "learning_rate": 5.801290327144258e-06, "loss": 0.0354, "num_input_tokens_seen": 118451872, "step": 54915 }, { "epoch": 10.07891356212149, "grad_norm": 0.00934868585318327, "learning_rate": 5.800499909854896e-06, "loss": 0.0001, "num_input_tokens_seen": 118461920, "step": 54920 }, { "epoch": 10.079831161681042, "grad_norm": 0.5426450967788696, "learning_rate": 5.799709472033952e-06, "loss": 0.0005, "num_input_tokens_seen": 118473280, "step": 54925 }, { "epoch": 10.080748761240594, "grad_norm": 1.227174997329712, "learning_rate": 5.798919013701701e-06, "loss": 0.0013, "num_input_tokens_seen": 118484832, "step": 54930 }, { "epoch": 10.081666360800147, "grad_norm": 0.00761700002476573, "learning_rate": 5.798128534878413e-06, "loss": 0.0001, "num_input_tokens_seen": 118494656, "step": 54935 }, { "epoch": 10.082583960359699, "grad_norm": 0.05829228460788727, "learning_rate": 5.797338035584367e-06, "loss": 0.1223, "num_input_tokens_seen": 118504416, "step": 54940 }, { "epoch": 10.08350155991925, "grad_norm": 0.008223704062402248, "learning_rate": 5.796547515839836e-06, "loss": 0.0001, "num_input_tokens_seen": 118515840, "step": 54945 }, { "epoch": 10.084419159478804, "grad_norm": 0.03026813082396984, "learning_rate": 5.7957569756650944e-06, "loss": 0.0001, "num_input_tokens_seen": 118526688, "step": 54950 }, { "epoch": 10.085336759038356, "grad_norm": 0.02677970938384533, "learning_rate": 5.794966415080421e-06, "loss": 0.0001, "num_input_tokens_seen": 118537152, "step": 54955 }, { "epoch": 10.086254358597907, "grad_norm": 1.0397905111312866, "learning_rate": 5.79417583410609e-06, "loss": 0.0011, "num_input_tokens_seen": 118546400, "step": 54960 }, { "epoch": 10.08717195815746, "grad_norm": 0.003205582732334733, "learning_rate": 5.793385232762379e-06, "loss": 0.0021, "num_input_tokens_seen": 118556384, "step": 54965 }, { "epoch": 10.088089557717012, "grad_norm": 0.005613596178591251, "learning_rate": 5.792594611069569e-06, "loss": 0.0001, "num_input_tokens_seen": 118566432, "step": 54970 }, { "epoch": 10.089007157276564, "grad_norm": 0.0660267174243927, "learning_rate": 5.791803969047933e-06, "loss": 0.0841, "num_input_tokens_seen": 118577248, "step": 54975 }, { "epoch": 10.089924756836117, "grad_norm": 0.0006816009990870953, "learning_rate": 5.7910133067177544e-06, "loss": 0.0, "num_input_tokens_seen": 118587680, "step": 54980 }, { "epoch": 10.090842356395669, "grad_norm": 0.001526131760329008, "learning_rate": 5.790222624099309e-06, "loss": 0.0001, "num_input_tokens_seen": 118597888, "step": 54985 }, { "epoch": 10.09175995595522, "grad_norm": 0.037389595061540604, "learning_rate": 5.789431921212879e-06, "loss": 0.0001, "num_input_tokens_seen": 118609280, "step": 54990 }, { "epoch": 10.092677555514774, "grad_norm": 30.310354232788086, "learning_rate": 5.788641198078744e-06, "loss": 0.1005, "num_input_tokens_seen": 118620096, "step": 54995 }, { "epoch": 10.093595155074325, "grad_norm": 0.0026760867331176996, "learning_rate": 5.787850454717183e-06, "loss": 0.1255, "num_input_tokens_seen": 118630208, "step": 55000 }, { "epoch": 10.094512754633877, "grad_norm": 0.005138854030519724, "learning_rate": 5.78705969114848e-06, "loss": 0.0, "num_input_tokens_seen": 118640384, "step": 55005 }, { "epoch": 10.09543035419343, "grad_norm": 0.013196317479014397, "learning_rate": 5.786268907392916e-06, "loss": 0.0003, "num_input_tokens_seen": 118651200, "step": 55010 }, { "epoch": 10.096347953752982, "grad_norm": 0.0007774399709887803, "learning_rate": 5.785478103470773e-06, "loss": 0.0131, "num_input_tokens_seen": 118663200, "step": 55015 }, { "epoch": 10.097265553312534, "grad_norm": 0.001761813648045063, "learning_rate": 5.784687279402332e-06, "loss": 0.0001, "num_input_tokens_seen": 118673664, "step": 55020 }, { "epoch": 10.098183152872087, "grad_norm": 0.03404524549841881, "learning_rate": 5.783896435207881e-06, "loss": 0.0001, "num_input_tokens_seen": 118684896, "step": 55025 }, { "epoch": 10.099100752431639, "grad_norm": 0.03619713708758354, "learning_rate": 5.783105570907701e-06, "loss": 0.012, "num_input_tokens_seen": 118694688, "step": 55030 }, { "epoch": 10.10001835199119, "grad_norm": 0.001559821655973792, "learning_rate": 5.782314686522076e-06, "loss": 0.0001, "num_input_tokens_seen": 118705568, "step": 55035 }, { "epoch": 10.100935951550744, "grad_norm": 0.09956524521112442, "learning_rate": 5.781523782071292e-06, "loss": 0.0003, "num_input_tokens_seen": 118716416, "step": 55040 }, { "epoch": 10.101853551110295, "grad_norm": 0.008721433579921722, "learning_rate": 5.780732857575634e-06, "loss": 0.0001, "num_input_tokens_seen": 118727904, "step": 55045 }, { "epoch": 10.102771150669847, "grad_norm": 0.04436599835753441, "learning_rate": 5.779941913055389e-06, "loss": 0.0001, "num_input_tokens_seen": 118738848, "step": 55050 }, { "epoch": 10.1036887502294, "grad_norm": 0.0006087179062888026, "learning_rate": 5.779150948530844e-06, "loss": 0.0, "num_input_tokens_seen": 118749760, "step": 55055 }, { "epoch": 10.104606349788952, "grad_norm": 0.002784474054351449, "learning_rate": 5.778359964022282e-06, "loss": 0.0, "num_input_tokens_seen": 118760704, "step": 55060 }, { "epoch": 10.105523949348504, "grad_norm": 0.013605122454464436, "learning_rate": 5.777568959549995e-06, "loss": 0.0001, "num_input_tokens_seen": 118771584, "step": 55065 }, { "epoch": 10.106441548908057, "grad_norm": 0.0014897511573508382, "learning_rate": 5.776777935134268e-06, "loss": 0.0, "num_input_tokens_seen": 118783424, "step": 55070 }, { "epoch": 10.107359148467609, "grad_norm": 0.004253599792718887, "learning_rate": 5.775986890795391e-06, "loss": 0.0043, "num_input_tokens_seen": 118793184, "step": 55075 }, { "epoch": 10.10827674802716, "grad_norm": 17.020109176635742, "learning_rate": 5.775195826553655e-06, "loss": 0.0051, "num_input_tokens_seen": 118804128, "step": 55080 }, { "epoch": 10.109194347586714, "grad_norm": 0.011655870825052261, "learning_rate": 5.774404742429345e-06, "loss": 0.0001, "num_input_tokens_seen": 118815424, "step": 55085 }, { "epoch": 10.110111947146265, "grad_norm": 0.005496433470398188, "learning_rate": 5.773613638442754e-06, "loss": 0.0001, "num_input_tokens_seen": 118824256, "step": 55090 }, { "epoch": 10.111029546705817, "grad_norm": 392.163818359375, "learning_rate": 5.772822514614174e-06, "loss": 0.0296, "num_input_tokens_seen": 118835648, "step": 55095 }, { "epoch": 10.11194714626537, "grad_norm": 0.0037427006755024195, "learning_rate": 5.772031370963892e-06, "loss": 0.1472, "num_input_tokens_seen": 118846720, "step": 55100 }, { "epoch": 10.112864745824922, "grad_norm": 0.013052166439592838, "learning_rate": 5.771240207512203e-06, "loss": 0.0001, "num_input_tokens_seen": 118857344, "step": 55105 }, { "epoch": 10.113782345384474, "grad_norm": 0.005185478832572699, "learning_rate": 5.770449024279398e-06, "loss": 0.0001, "num_input_tokens_seen": 118866912, "step": 55110 }, { "epoch": 10.114699944944027, "grad_norm": 29.78156852722168, "learning_rate": 5.769657821285769e-06, "loss": 0.0285, "num_input_tokens_seen": 118877952, "step": 55115 }, { "epoch": 10.115617544503579, "grad_norm": 0.0008898896048776805, "learning_rate": 5.76886659855161e-06, "loss": 0.0001, "num_input_tokens_seen": 118887680, "step": 55120 }, { "epoch": 10.11653514406313, "grad_norm": 0.006664710585027933, "learning_rate": 5.7680753560972155e-06, "loss": 0.0646, "num_input_tokens_seen": 118898752, "step": 55125 }, { "epoch": 10.117452743622684, "grad_norm": 0.002885465743020177, "learning_rate": 5.767284093942877e-06, "loss": 0.0, "num_input_tokens_seen": 118908928, "step": 55130 }, { "epoch": 10.118370343182235, "grad_norm": 0.6823320984840393, "learning_rate": 5.766492812108891e-06, "loss": 0.0002, "num_input_tokens_seen": 118919904, "step": 55135 }, { "epoch": 10.119287942741787, "grad_norm": 0.01911436766386032, "learning_rate": 5.7657015106155536e-06, "loss": 0.0001, "num_input_tokens_seen": 118931680, "step": 55140 }, { "epoch": 10.12020554230134, "grad_norm": 0.002917924430221319, "learning_rate": 5.764910189483157e-06, "loss": 0.0401, "num_input_tokens_seen": 118942080, "step": 55145 }, { "epoch": 10.121123141860892, "grad_norm": 0.046211373060941696, "learning_rate": 5.764118848732001e-06, "loss": 0.0004, "num_input_tokens_seen": 118954048, "step": 55150 }, { "epoch": 10.122040741420443, "grad_norm": 0.0025333452504128218, "learning_rate": 5.76332748838238e-06, "loss": 0.0001, "num_input_tokens_seen": 118965536, "step": 55155 }, { "epoch": 10.122958340979997, "grad_norm": 0.011677336879074574, "learning_rate": 5.762536108454593e-06, "loss": 0.0004, "num_input_tokens_seen": 118977088, "step": 55160 }, { "epoch": 10.123875940539548, "grad_norm": 0.1679711937904358, "learning_rate": 5.761744708968937e-06, "loss": 0.1659, "num_input_tokens_seen": 118988480, "step": 55165 }, { "epoch": 10.1247935400991, "grad_norm": 0.012411262840032578, "learning_rate": 5.760953289945709e-06, "loss": 0.0001, "num_input_tokens_seen": 118999712, "step": 55170 }, { "epoch": 10.125711139658653, "grad_norm": 0.0005512642674148083, "learning_rate": 5.760161851405208e-06, "loss": 0.0, "num_input_tokens_seen": 119011328, "step": 55175 }, { "epoch": 10.126628739218205, "grad_norm": 0.02944822795689106, "learning_rate": 5.759370393367733e-06, "loss": 0.0001, "num_input_tokens_seen": 119021696, "step": 55180 }, { "epoch": 10.127546338777757, "grad_norm": 0.023145969957113266, "learning_rate": 5.7585789158535865e-06, "loss": 0.0001, "num_input_tokens_seen": 119031936, "step": 55185 }, { "epoch": 10.12846393833731, "grad_norm": 46.04281997680664, "learning_rate": 5.757787418883065e-06, "loss": 0.1163, "num_input_tokens_seen": 119042016, "step": 55190 }, { "epoch": 10.129381537896862, "grad_norm": 0.0008926166337914765, "learning_rate": 5.756995902476471e-06, "loss": 0.0008, "num_input_tokens_seen": 119054048, "step": 55195 }, { "epoch": 10.130299137456413, "grad_norm": 0.005245854612439871, "learning_rate": 5.756204366654107e-06, "loss": 0.0916, "num_input_tokens_seen": 119064992, "step": 55200 }, { "epoch": 10.131216737015967, "grad_norm": 0.0071172043681144714, "learning_rate": 5.75541281143627e-06, "loss": 0.0007, "num_input_tokens_seen": 119074880, "step": 55205 }, { "epoch": 10.132134336575518, "grad_norm": 34.32820510864258, "learning_rate": 5.7546212368432665e-06, "loss": 0.3159, "num_input_tokens_seen": 119085920, "step": 55210 }, { "epoch": 10.13305193613507, "grad_norm": 0.0007702974253334105, "learning_rate": 5.753829642895399e-06, "loss": 0.0003, "num_input_tokens_seen": 119096032, "step": 55215 }, { "epoch": 10.133969535694623, "grad_norm": 0.003888250095769763, "learning_rate": 5.753038029612968e-06, "loss": 0.0001, "num_input_tokens_seen": 119107040, "step": 55220 }, { "epoch": 10.134887135254175, "grad_norm": 0.0006718590739183128, "learning_rate": 5.752246397016279e-06, "loss": 0.0004, "num_input_tokens_seen": 119117824, "step": 55225 }, { "epoch": 10.135804734813727, "grad_norm": 0.0133976424112916, "learning_rate": 5.751454745125636e-06, "loss": 0.0005, "num_input_tokens_seen": 119128928, "step": 55230 }, { "epoch": 10.13672233437328, "grad_norm": 0.0020885788835585117, "learning_rate": 5.750663073961343e-06, "loss": 0.0016, "num_input_tokens_seen": 119138944, "step": 55235 }, { "epoch": 10.137639933932832, "grad_norm": 0.04123548045754433, "learning_rate": 5.749871383543706e-06, "loss": 0.0002, "num_input_tokens_seen": 119150016, "step": 55240 }, { "epoch": 10.138557533492383, "grad_norm": 0.0027018433902412653, "learning_rate": 5.7490796738930285e-06, "loss": 0.101, "num_input_tokens_seen": 119160128, "step": 55245 }, { "epoch": 10.139475133051937, "grad_norm": 0.015192831866443157, "learning_rate": 5.748287945029621e-06, "loss": 0.0004, "num_input_tokens_seen": 119171872, "step": 55250 }, { "epoch": 10.140392732611488, "grad_norm": 0.001311308122240007, "learning_rate": 5.747496196973786e-06, "loss": 0.0001, "num_input_tokens_seen": 119183552, "step": 55255 }, { "epoch": 10.14131033217104, "grad_norm": 0.0008468133164569736, "learning_rate": 5.746704429745833e-06, "loss": 0.1168, "num_input_tokens_seen": 119195168, "step": 55260 }, { "epoch": 10.142227931730593, "grad_norm": 0.003805415006354451, "learning_rate": 5.7459126433660696e-06, "loss": 0.0002, "num_input_tokens_seen": 119207008, "step": 55265 }, { "epoch": 10.143145531290145, "grad_norm": 0.4692831039428711, "learning_rate": 5.745120837854801e-06, "loss": 0.0002, "num_input_tokens_seen": 119217184, "step": 55270 }, { "epoch": 10.144063130849696, "grad_norm": 0.01962263137102127, "learning_rate": 5.744329013232338e-06, "loss": 0.0001, "num_input_tokens_seen": 119227648, "step": 55275 }, { "epoch": 10.14498073040925, "grad_norm": 0.015805264934897423, "learning_rate": 5.743537169518989e-06, "loss": 0.0, "num_input_tokens_seen": 119238176, "step": 55280 }, { "epoch": 10.145898329968801, "grad_norm": 0.00399597454816103, "learning_rate": 5.742745306735066e-06, "loss": 0.0001, "num_input_tokens_seen": 119249728, "step": 55285 }, { "epoch": 10.146815929528353, "grad_norm": 0.002564486814662814, "learning_rate": 5.7419534249008745e-06, "loss": 0.1381, "num_input_tokens_seen": 119262176, "step": 55290 }, { "epoch": 10.147733529087906, "grad_norm": 0.19919773936271667, "learning_rate": 5.741161524036728e-06, "loss": 0.003, "num_input_tokens_seen": 119272768, "step": 55295 }, { "epoch": 10.148651128647458, "grad_norm": 0.9937046766281128, "learning_rate": 5.740369604162939e-06, "loss": 0.0003, "num_input_tokens_seen": 119283680, "step": 55300 }, { "epoch": 10.14956872820701, "grad_norm": 0.0018193976720795035, "learning_rate": 5.739577665299815e-06, "loss": 0.0001, "num_input_tokens_seen": 119293248, "step": 55305 }, { "epoch": 10.150486327766563, "grad_norm": 0.0009516950813122094, "learning_rate": 5.738785707467671e-06, "loss": 0.0226, "num_input_tokens_seen": 119304544, "step": 55310 }, { "epoch": 10.151403927326115, "grad_norm": 0.000520292145665735, "learning_rate": 5.737993730686819e-06, "loss": 0.0001, "num_input_tokens_seen": 119314432, "step": 55315 }, { "epoch": 10.152321526885666, "grad_norm": 0.0007848709356039762, "learning_rate": 5.737201734977571e-06, "loss": 0.0001, "num_input_tokens_seen": 119325408, "step": 55320 }, { "epoch": 10.15323912644522, "grad_norm": 19.153522491455078, "learning_rate": 5.736409720360241e-06, "loss": 0.0588, "num_input_tokens_seen": 119334944, "step": 55325 }, { "epoch": 10.154156726004771, "grad_norm": 0.0004039020568598062, "learning_rate": 5.735617686855144e-06, "loss": 0.0001, "num_input_tokens_seen": 119345888, "step": 55330 }, { "epoch": 10.155074325564323, "grad_norm": 0.2752770185470581, "learning_rate": 5.734825634482593e-06, "loss": 0.0002, "num_input_tokens_seen": 119358176, "step": 55335 }, { "epoch": 10.155991925123876, "grad_norm": 0.08044159412384033, "learning_rate": 5.734033563262902e-06, "loss": 0.001, "num_input_tokens_seen": 119368800, "step": 55340 }, { "epoch": 10.156909524683428, "grad_norm": 0.0459228940308094, "learning_rate": 5.7332414732163885e-06, "loss": 0.0001, "num_input_tokens_seen": 119380608, "step": 55345 }, { "epoch": 10.15782712424298, "grad_norm": 0.00202024239115417, "learning_rate": 5.732449364363368e-06, "loss": 0.0002, "num_input_tokens_seen": 119391616, "step": 55350 }, { "epoch": 10.158744723802533, "grad_norm": 0.0005320595228113234, "learning_rate": 5.731657236724156e-06, "loss": 0.0003, "num_input_tokens_seen": 119401696, "step": 55355 }, { "epoch": 10.159662323362085, "grad_norm": 0.0010957984486594796, "learning_rate": 5.730865090319072e-06, "loss": 0.0001, "num_input_tokens_seen": 119413280, "step": 55360 }, { "epoch": 10.160579922921636, "grad_norm": 0.0006614572484977543, "learning_rate": 5.730072925168429e-06, "loss": 0.0014, "num_input_tokens_seen": 119423488, "step": 55365 }, { "epoch": 10.16149752248119, "grad_norm": 0.006006159354001284, "learning_rate": 5.729280741292548e-06, "loss": 0.0023, "num_input_tokens_seen": 119434016, "step": 55370 }, { "epoch": 10.162415122040741, "grad_norm": 0.004549672361463308, "learning_rate": 5.7284885387117465e-06, "loss": 0.0, "num_input_tokens_seen": 119444832, "step": 55375 }, { "epoch": 10.163332721600293, "grad_norm": 0.000752390653360635, "learning_rate": 5.727696317446342e-06, "loss": 0.0001, "num_input_tokens_seen": 119456128, "step": 55380 }, { "epoch": 10.164250321159846, "grad_norm": 0.0011324238730594516, "learning_rate": 5.726904077516655e-06, "loss": 0.0007, "num_input_tokens_seen": 119467232, "step": 55385 }, { "epoch": 10.165167920719398, "grad_norm": 0.0008322937064804137, "learning_rate": 5.726111818943004e-06, "loss": 0.0, "num_input_tokens_seen": 119477760, "step": 55390 }, { "epoch": 10.16608552027895, "grad_norm": 0.0030348871368914843, "learning_rate": 5.725319541745712e-06, "loss": 0.0853, "num_input_tokens_seen": 119489280, "step": 55395 }, { "epoch": 10.167003119838503, "grad_norm": 0.04147720709443092, "learning_rate": 5.724527245945097e-06, "loss": 0.0533, "num_input_tokens_seen": 119500320, "step": 55400 }, { "epoch": 10.167920719398055, "grad_norm": 0.05189107730984688, "learning_rate": 5.723734931561481e-06, "loss": 0.0401, "num_input_tokens_seen": 119511360, "step": 55405 }, { "epoch": 10.168838318957606, "grad_norm": 0.0026548421010375023, "learning_rate": 5.722942598615187e-06, "loss": 0.1223, "num_input_tokens_seen": 119522368, "step": 55410 }, { "epoch": 10.16975591851716, "grad_norm": 0.006204515695571899, "learning_rate": 5.7221502471265345e-06, "loss": 0.0, "num_input_tokens_seen": 119531232, "step": 55415 }, { "epoch": 10.170673518076711, "grad_norm": 0.010679055005311966, "learning_rate": 5.7213578771158465e-06, "loss": 0.017, "num_input_tokens_seen": 119540576, "step": 55420 }, { "epoch": 10.171591117636263, "grad_norm": 0.0020034906920045614, "learning_rate": 5.720565488603449e-06, "loss": 0.0008, "num_input_tokens_seen": 119551040, "step": 55425 }, { "epoch": 10.172508717195816, "grad_norm": 0.014947127550840378, "learning_rate": 5.719773081609662e-06, "loss": 0.0001, "num_input_tokens_seen": 119561568, "step": 55430 }, { "epoch": 10.173426316755368, "grad_norm": 0.0026236914563924074, "learning_rate": 5.718980656154812e-06, "loss": 0.0007, "num_input_tokens_seen": 119573280, "step": 55435 }, { "epoch": 10.17434391631492, "grad_norm": 0.002093277405947447, "learning_rate": 5.7181882122592215e-06, "loss": 0.0, "num_input_tokens_seen": 119585184, "step": 55440 }, { "epoch": 10.175261515874473, "grad_norm": 0.006066887639462948, "learning_rate": 5.717395749943217e-06, "loss": 0.0002, "num_input_tokens_seen": 119596224, "step": 55445 }, { "epoch": 10.176179115434024, "grad_norm": 0.0714922696352005, "learning_rate": 5.716603269227124e-06, "loss": 0.0001, "num_input_tokens_seen": 119605280, "step": 55450 }, { "epoch": 10.177096714993576, "grad_norm": 0.0035118882078677416, "learning_rate": 5.715810770131267e-06, "loss": 0.0207, "num_input_tokens_seen": 119615648, "step": 55455 }, { "epoch": 10.17801431455313, "grad_norm": 0.0005913578206673265, "learning_rate": 5.715018252675974e-06, "loss": 0.0002, "num_input_tokens_seen": 119625824, "step": 55460 }, { "epoch": 10.178931914112681, "grad_norm": 0.0033706119284033775, "learning_rate": 5.71422571688157e-06, "loss": 0.0001, "num_input_tokens_seen": 119636896, "step": 55465 }, { "epoch": 10.179849513672233, "grad_norm": 0.0023515108041465282, "learning_rate": 5.713433162768383e-06, "loss": 0.0, "num_input_tokens_seen": 119647904, "step": 55470 }, { "epoch": 10.180767113231786, "grad_norm": 0.09356480091810226, "learning_rate": 5.712640590356742e-06, "loss": 0.017, "num_input_tokens_seen": 119657536, "step": 55475 }, { "epoch": 10.181684712791338, "grad_norm": 0.014856645837426186, "learning_rate": 5.711847999666974e-06, "loss": 0.0003, "num_input_tokens_seen": 119669216, "step": 55480 }, { "epoch": 10.18260231235089, "grad_norm": 1.7710201740264893, "learning_rate": 5.711055390719409e-06, "loss": 0.0007, "num_input_tokens_seen": 119678304, "step": 55485 }, { "epoch": 10.183519911910443, "grad_norm": 0.10505612194538116, "learning_rate": 5.710262763534374e-06, "loss": 0.0001, "num_input_tokens_seen": 119689792, "step": 55490 }, { "epoch": 10.184437511469994, "grad_norm": 132.45068359375, "learning_rate": 5.709470118132201e-06, "loss": 0.2283, "num_input_tokens_seen": 119700480, "step": 55495 }, { "epoch": 10.185355111029546, "grad_norm": 0.06790117919445038, "learning_rate": 5.708677454533218e-06, "loss": 0.0001, "num_input_tokens_seen": 119710464, "step": 55500 }, { "epoch": 10.1862727105891, "grad_norm": 0.004906943533569574, "learning_rate": 5.707884772757757e-06, "loss": 0.0, "num_input_tokens_seen": 119721376, "step": 55505 }, { "epoch": 10.187190310148651, "grad_norm": 0.0005739383632317185, "learning_rate": 5.707092072826149e-06, "loss": 0.0002, "num_input_tokens_seen": 119733536, "step": 55510 }, { "epoch": 10.188107909708203, "grad_norm": 0.0014511103508993983, "learning_rate": 5.7062993547587246e-06, "loss": 0.0001, "num_input_tokens_seen": 119744384, "step": 55515 }, { "epoch": 10.189025509267756, "grad_norm": 0.001319359173066914, "learning_rate": 5.705506618575818e-06, "loss": 0.0, "num_input_tokens_seen": 119755200, "step": 55520 }, { "epoch": 10.189943108827308, "grad_norm": 0.02567763812839985, "learning_rate": 5.704713864297758e-06, "loss": 0.0, "num_input_tokens_seen": 119766176, "step": 55525 }, { "epoch": 10.19086070838686, "grad_norm": 0.001620980678126216, "learning_rate": 5.70392109194488e-06, "loss": 0.0001, "num_input_tokens_seen": 119775424, "step": 55530 }, { "epoch": 10.191778307946413, "grad_norm": 0.004941350314766169, "learning_rate": 5.7031283015375175e-06, "loss": 0.0001, "num_input_tokens_seen": 119785952, "step": 55535 }, { "epoch": 10.192695907505964, "grad_norm": 0.004857862368226051, "learning_rate": 5.702335493096003e-06, "loss": 0.0, "num_input_tokens_seen": 119796928, "step": 55540 }, { "epoch": 10.193613507065516, "grad_norm": 0.0003331528860144317, "learning_rate": 5.701542666640674e-06, "loss": 0.0, "num_input_tokens_seen": 119807744, "step": 55545 }, { "epoch": 10.19453110662507, "grad_norm": 0.0002899077662732452, "learning_rate": 5.70074982219186e-06, "loss": 0.0225, "num_input_tokens_seen": 119818112, "step": 55550 }, { "epoch": 10.19544870618462, "grad_norm": 0.000553544785361737, "learning_rate": 5.6999569597699e-06, "loss": 0.0036, "num_input_tokens_seen": 119828448, "step": 55555 }, { "epoch": 10.196366305744172, "grad_norm": 0.00038818916073068976, "learning_rate": 5.69916407939513e-06, "loss": 0.0001, "num_input_tokens_seen": 119839296, "step": 55560 }, { "epoch": 10.197283905303726, "grad_norm": 0.0020040457602590322, "learning_rate": 5.698371181087884e-06, "loss": 0.0003, "num_input_tokens_seen": 119849312, "step": 55565 }, { "epoch": 10.198201504863277, "grad_norm": 0.0014557683607563376, "learning_rate": 5.6975782648684995e-06, "loss": 0.0977, "num_input_tokens_seen": 119860480, "step": 55570 }, { "epoch": 10.199119104422829, "grad_norm": 0.0042503634467720985, "learning_rate": 5.696785330757314e-06, "loss": 0.0001, "num_input_tokens_seen": 119871520, "step": 55575 }, { "epoch": 10.200036703982382, "grad_norm": 0.26116424798965454, "learning_rate": 5.695992378774665e-06, "loss": 0.0003, "num_input_tokens_seen": 119883136, "step": 55580 }, { "epoch": 10.200954303541934, "grad_norm": 0.024462370201945305, "learning_rate": 5.695199408940889e-06, "loss": 0.0001, "num_input_tokens_seen": 119893216, "step": 55585 }, { "epoch": 10.201871903101486, "grad_norm": 0.0006056277779862285, "learning_rate": 5.694406421276327e-06, "loss": 0.0003, "num_input_tokens_seen": 119903360, "step": 55590 }, { "epoch": 10.20278950266104, "grad_norm": 0.0009610666893422604, "learning_rate": 5.693613415801317e-06, "loss": 0.0, "num_input_tokens_seen": 119914592, "step": 55595 }, { "epoch": 10.20370710222059, "grad_norm": 0.0014400756917893887, "learning_rate": 5.692820392536196e-06, "loss": 0.1751, "num_input_tokens_seen": 119925920, "step": 55600 }, { "epoch": 10.204624701780142, "grad_norm": 0.0013093632878735662, "learning_rate": 5.692027351501307e-06, "loss": 0.0, "num_input_tokens_seen": 119934944, "step": 55605 }, { "epoch": 10.205542301339696, "grad_norm": 0.08090569823980331, "learning_rate": 5.691234292716988e-06, "loss": 0.0001, "num_input_tokens_seen": 119946592, "step": 55610 }, { "epoch": 10.206459900899247, "grad_norm": 0.000714791240170598, "learning_rate": 5.69044121620358e-06, "loss": 0.0001, "num_input_tokens_seen": 119957152, "step": 55615 }, { "epoch": 10.207377500458799, "grad_norm": 0.0006023606401868165, "learning_rate": 5.689648121981427e-06, "loss": 0.0247, "num_input_tokens_seen": 119966912, "step": 55620 }, { "epoch": 10.208295100018352, "grad_norm": 0.16302913427352905, "learning_rate": 5.688855010070867e-06, "loss": 0.0001, "num_input_tokens_seen": 119977536, "step": 55625 }, { "epoch": 10.209212699577904, "grad_norm": 0.0032311896793544292, "learning_rate": 5.688061880492245e-06, "loss": 0.1635, "num_input_tokens_seen": 119989408, "step": 55630 }, { "epoch": 10.210130299137456, "grad_norm": 0.007173234596848488, "learning_rate": 5.687268733265901e-06, "loss": 0.0, "num_input_tokens_seen": 119998848, "step": 55635 }, { "epoch": 10.211047898697009, "grad_norm": 0.004261108115315437, "learning_rate": 5.68647556841218e-06, "loss": 0.0, "num_input_tokens_seen": 120010816, "step": 55640 }, { "epoch": 10.21196549825656, "grad_norm": 186.572265625, "learning_rate": 5.685682385951424e-06, "loss": 0.0763, "num_input_tokens_seen": 120022144, "step": 55645 }, { "epoch": 10.212883097816112, "grad_norm": 0.0007851528353057802, "learning_rate": 5.684889185903977e-06, "loss": 0.0, "num_input_tokens_seen": 120031360, "step": 55650 }, { "epoch": 10.213800697375666, "grad_norm": 0.0024956613779067993, "learning_rate": 5.684095968290185e-06, "loss": 0.0002, "num_input_tokens_seen": 120041888, "step": 55655 }, { "epoch": 10.214718296935217, "grad_norm": 0.06405338644981384, "learning_rate": 5.683302733130391e-06, "loss": 0.0001, "num_input_tokens_seen": 120052384, "step": 55660 }, { "epoch": 10.215635896494769, "grad_norm": 0.0006344469729810953, "learning_rate": 5.682509480444941e-06, "loss": 0.0, "num_input_tokens_seen": 120063360, "step": 55665 }, { "epoch": 10.216553496054322, "grad_norm": 0.0061146533116698265, "learning_rate": 5.681716210254181e-06, "loss": 0.0001, "num_input_tokens_seen": 120074432, "step": 55670 }, { "epoch": 10.217471095613874, "grad_norm": 0.0018818352837115526, "learning_rate": 5.680922922578456e-06, "loss": 0.0001, "num_input_tokens_seen": 120085216, "step": 55675 }, { "epoch": 10.218388695173426, "grad_norm": 1.3385674953460693, "learning_rate": 5.6801296174381145e-06, "loss": 0.0101, "num_input_tokens_seen": 120096640, "step": 55680 }, { "epoch": 10.219306294732979, "grad_norm": 0.00762843107804656, "learning_rate": 5.6793362948535015e-06, "loss": 0.0001, "num_input_tokens_seen": 120107712, "step": 55685 }, { "epoch": 10.22022389429253, "grad_norm": 0.001710312906652689, "learning_rate": 5.678542954844967e-06, "loss": 0.0, "num_input_tokens_seen": 120117856, "step": 55690 }, { "epoch": 10.221141493852082, "grad_norm": 138.34579467773438, "learning_rate": 5.677749597432854e-06, "loss": 0.1501, "num_input_tokens_seen": 120129728, "step": 55695 }, { "epoch": 10.222059093411636, "grad_norm": 0.0008880328969098628, "learning_rate": 5.6769562226375175e-06, "loss": 0.0, "num_input_tokens_seen": 120140928, "step": 55700 }, { "epoch": 10.222976692971187, "grad_norm": 0.0013502479996532202, "learning_rate": 5.676162830479303e-06, "loss": 0.1534, "num_input_tokens_seen": 120151872, "step": 55705 }, { "epoch": 10.223894292530739, "grad_norm": 0.1525772213935852, "learning_rate": 5.675369420978558e-06, "loss": 0.0018, "num_input_tokens_seen": 120162496, "step": 55710 }, { "epoch": 10.224811892090292, "grad_norm": 3.035410165786743, "learning_rate": 5.6745759941556345e-06, "loss": 0.0008, "num_input_tokens_seen": 120173728, "step": 55715 }, { "epoch": 10.225729491649844, "grad_norm": 0.00549102621152997, "learning_rate": 5.673782550030883e-06, "loss": 0.0, "num_input_tokens_seen": 120185248, "step": 55720 }, { "epoch": 10.226647091209395, "grad_norm": 0.032993342727422714, "learning_rate": 5.672989088624652e-06, "loss": 0.0051, "num_input_tokens_seen": 120196896, "step": 55725 }, { "epoch": 10.227564690768949, "grad_norm": 21.88676643371582, "learning_rate": 5.6721956099572965e-06, "loss": 0.3313, "num_input_tokens_seen": 120207552, "step": 55730 }, { "epoch": 10.2284822903285, "grad_norm": 0.0003385801683180034, "learning_rate": 5.671402114049163e-06, "loss": 0.0001, "num_input_tokens_seen": 120218624, "step": 55735 }, { "epoch": 10.229399889888052, "grad_norm": 19.78912925720215, "learning_rate": 5.670608600920607e-06, "loss": 0.1721, "num_input_tokens_seen": 120228672, "step": 55740 }, { "epoch": 10.230317489447605, "grad_norm": 0.00032895736512728035, "learning_rate": 5.669815070591979e-06, "loss": 0.1719, "num_input_tokens_seen": 120239136, "step": 55745 }, { "epoch": 10.231235089007157, "grad_norm": 0.0003258306242059916, "learning_rate": 5.669021523083632e-06, "loss": 0.147, "num_input_tokens_seen": 120250080, "step": 55750 }, { "epoch": 10.232152688566709, "grad_norm": 0.04080856218934059, "learning_rate": 5.668227958415921e-06, "loss": 0.0001, "num_input_tokens_seen": 120261312, "step": 55755 }, { "epoch": 10.233070288126262, "grad_norm": 0.011119474656879902, "learning_rate": 5.6674343766091974e-06, "loss": 0.0001, "num_input_tokens_seen": 120270304, "step": 55760 }, { "epoch": 10.233987887685814, "grad_norm": 0.0004759003350045532, "learning_rate": 5.666640777683818e-06, "loss": 0.0, "num_input_tokens_seen": 120281312, "step": 55765 }, { "epoch": 10.234905487245365, "grad_norm": 0.002379004145041108, "learning_rate": 5.6658471616601355e-06, "loss": 0.1254, "num_input_tokens_seen": 120291552, "step": 55770 }, { "epoch": 10.235823086804919, "grad_norm": 0.018823115155100822, "learning_rate": 5.665053528558504e-06, "loss": 0.0001, "num_input_tokens_seen": 120303200, "step": 55775 }, { "epoch": 10.23674068636447, "grad_norm": 0.0038762157782912254, "learning_rate": 5.664259878399282e-06, "loss": 0.0104, "num_input_tokens_seen": 120314016, "step": 55780 }, { "epoch": 10.237658285924022, "grad_norm": 0.041569359600543976, "learning_rate": 5.6634662112028225e-06, "loss": 0.0003, "num_input_tokens_seen": 120324992, "step": 55785 }, { "epoch": 10.238575885483575, "grad_norm": 0.021995823830366135, "learning_rate": 5.662672526989484e-06, "loss": 0.0025, "num_input_tokens_seen": 120335232, "step": 55790 }, { "epoch": 10.239493485043127, "grad_norm": 13.183876037597656, "learning_rate": 5.661878825779621e-06, "loss": 0.0072, "num_input_tokens_seen": 120346496, "step": 55795 }, { "epoch": 10.240411084602679, "grad_norm": 0.0015909175854176283, "learning_rate": 5.661085107593593e-06, "loss": 0.0505, "num_input_tokens_seen": 120356096, "step": 55800 }, { "epoch": 10.241328684162232, "grad_norm": 0.013649961911141872, "learning_rate": 5.660291372451756e-06, "loss": 0.067, "num_input_tokens_seen": 120365760, "step": 55805 }, { "epoch": 10.242246283721784, "grad_norm": 0.05909212306141853, "learning_rate": 5.659497620374469e-06, "loss": 0.0002, "num_input_tokens_seen": 120376480, "step": 55810 }, { "epoch": 10.243163883281335, "grad_norm": 0.20441265404224396, "learning_rate": 5.65870385138209e-06, "loss": 0.0008, "num_input_tokens_seen": 120387808, "step": 55815 }, { "epoch": 10.244081482840889, "grad_norm": 0.01082156877964735, "learning_rate": 5.657910065494978e-06, "loss": 0.1595, "num_input_tokens_seen": 120399712, "step": 55820 }, { "epoch": 10.24499908240044, "grad_norm": 0.674524188041687, "learning_rate": 5.657116262733492e-06, "loss": 0.0119, "num_input_tokens_seen": 120411296, "step": 55825 }, { "epoch": 10.245916681959992, "grad_norm": 0.0005639206501655281, "learning_rate": 5.656322443117993e-06, "loss": 0.0003, "num_input_tokens_seen": 120421312, "step": 55830 }, { "epoch": 10.246834281519545, "grad_norm": 0.016163120046257973, "learning_rate": 5.655528606668839e-06, "loss": 0.0843, "num_input_tokens_seen": 120431968, "step": 55835 }, { "epoch": 10.247751881079097, "grad_norm": 0.02180814929306507, "learning_rate": 5.654734753406394e-06, "loss": 0.0001, "num_input_tokens_seen": 120442112, "step": 55840 }, { "epoch": 10.248669480638648, "grad_norm": 0.000641266698949039, "learning_rate": 5.653940883351017e-06, "loss": 0.0001, "num_input_tokens_seen": 120451680, "step": 55845 }, { "epoch": 10.249587080198202, "grad_norm": 0.09991790354251862, "learning_rate": 5.653146996523069e-06, "loss": 0.0005, "num_input_tokens_seen": 120462272, "step": 55850 }, { "epoch": 10.250504679757753, "grad_norm": 0.018467888236045837, "learning_rate": 5.6523530929429145e-06, "loss": 0.0001, "num_input_tokens_seen": 120472800, "step": 55855 }, { "epoch": 10.251422279317305, "grad_norm": 0.0017336696619167924, "learning_rate": 5.6515591726309124e-06, "loss": 0.0002, "num_input_tokens_seen": 120483712, "step": 55860 }, { "epoch": 10.252339878876858, "grad_norm": 14.782788276672363, "learning_rate": 5.65076523560743e-06, "loss": 0.0914, "num_input_tokens_seen": 120494400, "step": 55865 }, { "epoch": 10.25325747843641, "grad_norm": 0.0013252212665975094, "learning_rate": 5.649971281892826e-06, "loss": 0.0001, "num_input_tokens_seen": 120504544, "step": 55870 }, { "epoch": 10.254175077995962, "grad_norm": 0.0013899968471378088, "learning_rate": 5.649177311507465e-06, "loss": 0.0003, "num_input_tokens_seen": 120515136, "step": 55875 }, { "epoch": 10.255092677555515, "grad_norm": 0.013560017570853233, "learning_rate": 5.6483833244717136e-06, "loss": 0.0002, "num_input_tokens_seen": 120526048, "step": 55880 }, { "epoch": 10.256010277115067, "grad_norm": 0.008184793405234814, "learning_rate": 5.6475893208059325e-06, "loss": 0.0001, "num_input_tokens_seen": 120537216, "step": 55885 }, { "epoch": 10.256927876674618, "grad_norm": 0.020436573773622513, "learning_rate": 5.646795300530492e-06, "loss": 0.0002, "num_input_tokens_seen": 120548832, "step": 55890 }, { "epoch": 10.257845476234172, "grad_norm": 0.00368555192835629, "learning_rate": 5.646001263665753e-06, "loss": 0.0001, "num_input_tokens_seen": 120560544, "step": 55895 }, { "epoch": 10.258763075793723, "grad_norm": 6.1103925704956055, "learning_rate": 5.645207210232084e-06, "loss": 0.01, "num_input_tokens_seen": 120571072, "step": 55900 }, { "epoch": 10.259680675353275, "grad_norm": 0.0036517740227282047, "learning_rate": 5.644413140249849e-06, "loss": 0.0003, "num_input_tokens_seen": 120581728, "step": 55905 }, { "epoch": 10.260598274912828, "grad_norm": 0.040345728397369385, "learning_rate": 5.643619053739417e-06, "loss": 0.0004, "num_input_tokens_seen": 120592832, "step": 55910 }, { "epoch": 10.26151587447238, "grad_norm": 0.0028793488163501024, "learning_rate": 5.642824950721153e-06, "loss": 0.0001, "num_input_tokens_seen": 120604000, "step": 55915 }, { "epoch": 10.262433474031932, "grad_norm": 0.03236788883805275, "learning_rate": 5.642030831215423e-06, "loss": 0.1067, "num_input_tokens_seen": 120615296, "step": 55920 }, { "epoch": 10.263351073591485, "grad_norm": 0.05600463226437569, "learning_rate": 5.641236695242601e-06, "loss": 0.0708, "num_input_tokens_seen": 120626432, "step": 55925 }, { "epoch": 10.264268673151037, "grad_norm": 0.001072340295650065, "learning_rate": 5.640442542823049e-06, "loss": 0.0098, "num_input_tokens_seen": 120637344, "step": 55930 }, { "epoch": 10.265186272710588, "grad_norm": 0.08235884457826614, "learning_rate": 5.639648373977139e-06, "loss": 0.338, "num_input_tokens_seen": 120646912, "step": 55935 }, { "epoch": 10.266103872270142, "grad_norm": 0.007124359719455242, "learning_rate": 5.63885418872524e-06, "loss": 0.0008, "num_input_tokens_seen": 120658048, "step": 55940 }, { "epoch": 10.267021471829693, "grad_norm": 0.012867304496467113, "learning_rate": 5.6380599870877205e-06, "loss": 0.0001, "num_input_tokens_seen": 120667424, "step": 55945 }, { "epoch": 10.267939071389245, "grad_norm": 0.06748015433549881, "learning_rate": 5.637265769084953e-06, "loss": 0.0002, "num_input_tokens_seen": 120676960, "step": 55950 }, { "epoch": 10.268856670948798, "grad_norm": 0.02038044109940529, "learning_rate": 5.6364715347373045e-06, "loss": 0.0021, "num_input_tokens_seen": 120688224, "step": 55955 }, { "epoch": 10.26977427050835, "grad_norm": 0.0009023173479363322, "learning_rate": 5.635677284065147e-06, "loss": 0.0, "num_input_tokens_seen": 120699040, "step": 55960 }, { "epoch": 10.270691870067902, "grad_norm": 0.01383745763450861, "learning_rate": 5.6348830170888535e-06, "loss": 0.0, "num_input_tokens_seen": 120710752, "step": 55965 }, { "epoch": 10.271609469627455, "grad_norm": 0.0008907535229809582, "learning_rate": 5.634088733828794e-06, "loss": 0.0, "num_input_tokens_seen": 120721408, "step": 55970 }, { "epoch": 10.272527069187007, "grad_norm": 0.0019738448318094015, "learning_rate": 5.633294434305343e-06, "loss": 0.0, "num_input_tokens_seen": 120731808, "step": 55975 }, { "epoch": 10.273444668746558, "grad_norm": 0.0009149187826551497, "learning_rate": 5.632500118538869e-06, "loss": 0.0001, "num_input_tokens_seen": 120741600, "step": 55980 }, { "epoch": 10.274362268306112, "grad_norm": 0.0008526491583324969, "learning_rate": 5.631705786549748e-06, "loss": 0.2782, "num_input_tokens_seen": 120752096, "step": 55985 }, { "epoch": 10.275279867865663, "grad_norm": 0.029151692986488342, "learning_rate": 5.630911438358353e-06, "loss": 0.0001, "num_input_tokens_seen": 120763200, "step": 55990 }, { "epoch": 10.276197467425215, "grad_norm": 0.008777608163654804, "learning_rate": 5.630117073985057e-06, "loss": 0.0001, "num_input_tokens_seen": 120773856, "step": 55995 }, { "epoch": 10.277115066984768, "grad_norm": 0.02332461066544056, "learning_rate": 5.629322693450236e-06, "loss": 0.0001, "num_input_tokens_seen": 120784480, "step": 56000 }, { "epoch": 10.27803266654432, "grad_norm": 0.0006189962150529027, "learning_rate": 5.6285282967742615e-06, "loss": 0.0013, "num_input_tokens_seen": 120794912, "step": 56005 }, { "epoch": 10.278950266103871, "grad_norm": 0.028301600366830826, "learning_rate": 5.6277338839775104e-06, "loss": 0.2097, "num_input_tokens_seen": 120804768, "step": 56010 }, { "epoch": 10.279867865663425, "grad_norm": 0.009348980151116848, "learning_rate": 5.626939455080359e-06, "loss": 0.0, "num_input_tokens_seen": 120815104, "step": 56015 }, { "epoch": 10.280785465222976, "grad_norm": 0.007944270968437195, "learning_rate": 5.626145010103182e-06, "loss": 0.0001, "num_input_tokens_seen": 120824832, "step": 56020 }, { "epoch": 10.281703064782528, "grad_norm": 0.008957210928201675, "learning_rate": 5.625350549066357e-06, "loss": 0.0001, "num_input_tokens_seen": 120836864, "step": 56025 }, { "epoch": 10.282620664342081, "grad_norm": 0.12777648866176605, "learning_rate": 5.6245560719902585e-06, "loss": 0.0001, "num_input_tokens_seen": 120847392, "step": 56030 }, { "epoch": 10.283538263901633, "grad_norm": 0.006104486994445324, "learning_rate": 5.6237615788952634e-06, "loss": 0.0001, "num_input_tokens_seen": 120857248, "step": 56035 }, { "epoch": 10.284455863461186, "grad_norm": 0.006411520764231682, "learning_rate": 5.622967069801752e-06, "loss": 0.151, "num_input_tokens_seen": 120868736, "step": 56040 }, { "epoch": 10.285373463020738, "grad_norm": 60.9473762512207, "learning_rate": 5.622172544730101e-06, "loss": 0.1645, "num_input_tokens_seen": 120878336, "step": 56045 }, { "epoch": 10.28629106258029, "grad_norm": 0.09766046702861786, "learning_rate": 5.6213780037006885e-06, "loss": 0.0002, "num_input_tokens_seen": 120887648, "step": 56050 }, { "epoch": 10.287208662139843, "grad_norm": 0.013865496963262558, "learning_rate": 5.6205834467338925e-06, "loss": 0.0006, "num_input_tokens_seen": 120898432, "step": 56055 }, { "epoch": 10.288126261699395, "grad_norm": 0.01182626187801361, "learning_rate": 5.619788873850094e-06, "loss": 0.0001, "num_input_tokens_seen": 120907680, "step": 56060 }, { "epoch": 10.289043861258946, "grad_norm": 0.0005027306033298373, "learning_rate": 5.6189942850696695e-06, "loss": 0.0001, "num_input_tokens_seen": 120918624, "step": 56065 }, { "epoch": 10.2899614608185, "grad_norm": 0.00051597022684291, "learning_rate": 5.618199680413003e-06, "loss": 0.0001, "num_input_tokens_seen": 120929024, "step": 56070 }, { "epoch": 10.290879060378051, "grad_norm": 0.04016672074794769, "learning_rate": 5.617405059900472e-06, "loss": 0.0001, "num_input_tokens_seen": 120938240, "step": 56075 }, { "epoch": 10.291796659937603, "grad_norm": 0.0014154136879369617, "learning_rate": 5.616610423552458e-06, "loss": 0.0, "num_input_tokens_seen": 120948672, "step": 56080 }, { "epoch": 10.292714259497156, "grad_norm": 0.00019829129450954497, "learning_rate": 5.615815771389342e-06, "loss": 0.0001, "num_input_tokens_seen": 120959360, "step": 56085 }, { "epoch": 10.293631859056708, "grad_norm": 0.005513826385140419, "learning_rate": 5.615021103431506e-06, "loss": 0.1159, "num_input_tokens_seen": 120969600, "step": 56090 }, { "epoch": 10.29454945861626, "grad_norm": 0.01769443415105343, "learning_rate": 5.614226419699332e-06, "loss": 0.0001, "num_input_tokens_seen": 120979328, "step": 56095 }, { "epoch": 10.295467058175813, "grad_norm": 0.008813466876745224, "learning_rate": 5.613431720213203e-06, "loss": 0.0, "num_input_tokens_seen": 120990240, "step": 56100 }, { "epoch": 10.296384657735365, "grad_norm": 0.00044338495354168117, "learning_rate": 5.6126370049935e-06, "loss": 0.0002, "num_input_tokens_seen": 121001952, "step": 56105 }, { "epoch": 10.297302257294916, "grad_norm": 0.009869066998362541, "learning_rate": 5.611842274060609e-06, "loss": 0.0012, "num_input_tokens_seen": 121012608, "step": 56110 }, { "epoch": 10.29821985685447, "grad_norm": 0.11047617346048355, "learning_rate": 5.611047527434909e-06, "loss": 0.0003, "num_input_tokens_seen": 121023776, "step": 56115 }, { "epoch": 10.299137456414021, "grad_norm": 0.0013637860538437963, "learning_rate": 5.61025276513679e-06, "loss": 0.0, "num_input_tokens_seen": 121035008, "step": 56120 }, { "epoch": 10.300055055973573, "grad_norm": 0.013946096412837505, "learning_rate": 5.609457987186631e-06, "loss": 0.0, "num_input_tokens_seen": 121045600, "step": 56125 }, { "epoch": 10.300972655533126, "grad_norm": 0.10287268459796906, "learning_rate": 5.608663193604822e-06, "loss": 0.1564, "num_input_tokens_seen": 121056000, "step": 56130 }, { "epoch": 10.301890255092678, "grad_norm": 0.12357194721698761, "learning_rate": 5.607868384411744e-06, "loss": 0.0002, "num_input_tokens_seen": 121067552, "step": 56135 }, { "epoch": 10.30280785465223, "grad_norm": 0.0014391130534932017, "learning_rate": 5.607073559627784e-06, "loss": 0.0001, "num_input_tokens_seen": 121078496, "step": 56140 }, { "epoch": 10.303725454211783, "grad_norm": 0.09111490100622177, "learning_rate": 5.606278719273327e-06, "loss": 0.0001, "num_input_tokens_seen": 121088032, "step": 56145 }, { "epoch": 10.304643053771334, "grad_norm": 0.05107105150818825, "learning_rate": 5.605483863368762e-06, "loss": 0.0003, "num_input_tokens_seen": 121099008, "step": 56150 }, { "epoch": 10.305560653330886, "grad_norm": 0.03143337741494179, "learning_rate": 5.604688991934474e-06, "loss": 0.0001, "num_input_tokens_seen": 121110688, "step": 56155 }, { "epoch": 10.30647825289044, "grad_norm": 0.017225226387381554, "learning_rate": 5.60389410499085e-06, "loss": 0.0001, "num_input_tokens_seen": 121121856, "step": 56160 }, { "epoch": 10.307395852449991, "grad_norm": 0.001954671461135149, "learning_rate": 5.603099202558279e-06, "loss": 0.3001, "num_input_tokens_seen": 121133728, "step": 56165 }, { "epoch": 10.308313452009543, "grad_norm": 0.004224242176860571, "learning_rate": 5.602304284657146e-06, "loss": 0.0001, "num_input_tokens_seen": 121145280, "step": 56170 }, { "epoch": 10.309231051569096, "grad_norm": 0.0034032335970550776, "learning_rate": 5.601509351307844e-06, "loss": 0.0, "num_input_tokens_seen": 121156288, "step": 56175 }, { "epoch": 10.310148651128648, "grad_norm": 0.017790572717785835, "learning_rate": 5.600714402530759e-06, "loss": 0.0032, "num_input_tokens_seen": 121167936, "step": 56180 }, { "epoch": 10.3110662506882, "grad_norm": 0.003823396749794483, "learning_rate": 5.5999194383462806e-06, "loss": 0.0006, "num_input_tokens_seen": 121179168, "step": 56185 }, { "epoch": 10.311983850247753, "grad_norm": 0.01209963671863079, "learning_rate": 5.599124458774797e-06, "loss": 0.0, "num_input_tokens_seen": 121190560, "step": 56190 }, { "epoch": 10.312901449807304, "grad_norm": 0.000332107039866969, "learning_rate": 5.598329463836702e-06, "loss": 0.0002, "num_input_tokens_seen": 121201664, "step": 56195 }, { "epoch": 10.313819049366856, "grad_norm": 0.0012458623386919498, "learning_rate": 5.597534453552381e-06, "loss": 0.0012, "num_input_tokens_seen": 121212416, "step": 56200 }, { "epoch": 10.31473664892641, "grad_norm": 0.0027748355641961098, "learning_rate": 5.5967394279422286e-06, "loss": 0.0, "num_input_tokens_seen": 121222688, "step": 56205 }, { "epoch": 10.315654248485961, "grad_norm": 0.0011046133004128933, "learning_rate": 5.595944387026635e-06, "loss": 0.0, "num_input_tokens_seen": 121233888, "step": 56210 }, { "epoch": 10.316571848045513, "grad_norm": 0.002091738162562251, "learning_rate": 5.595149330825991e-06, "loss": 0.0, "num_input_tokens_seen": 121245152, "step": 56215 }, { "epoch": 10.317489447605066, "grad_norm": 0.0006577069289050996, "learning_rate": 5.594354259360689e-06, "loss": 0.0004, "num_input_tokens_seen": 121253952, "step": 56220 }, { "epoch": 10.318407047164618, "grad_norm": 0.053489308804273605, "learning_rate": 5.593559172651122e-06, "loss": 0.0001, "num_input_tokens_seen": 121264352, "step": 56225 }, { "epoch": 10.31932464672417, "grad_norm": 0.0009305006242357194, "learning_rate": 5.592764070717682e-06, "loss": 0.0001, "num_input_tokens_seen": 121275488, "step": 56230 }, { "epoch": 10.320242246283723, "grad_norm": 0.0034784660674631596, "learning_rate": 5.591968953580762e-06, "loss": 0.0006, "num_input_tokens_seen": 121284800, "step": 56235 }, { "epoch": 10.321159845843274, "grad_norm": 0.03147273510694504, "learning_rate": 5.5911738212607554e-06, "loss": 0.0001, "num_input_tokens_seen": 121294528, "step": 56240 }, { "epoch": 10.322077445402826, "grad_norm": 0.0015423726290464401, "learning_rate": 5.590378673778059e-06, "loss": 0.0001, "num_input_tokens_seen": 121306304, "step": 56245 }, { "epoch": 10.32299504496238, "grad_norm": 0.0005278585595078766, "learning_rate": 5.589583511153061e-06, "loss": 0.0, "num_input_tokens_seen": 121317600, "step": 56250 }, { "epoch": 10.323912644521931, "grad_norm": 0.0010214828653261065, "learning_rate": 5.588788333406162e-06, "loss": 0.1719, "num_input_tokens_seen": 121327264, "step": 56255 }, { "epoch": 10.324830244081483, "grad_norm": 0.0014176580589264631, "learning_rate": 5.587993140557755e-06, "loss": 0.0378, "num_input_tokens_seen": 121338752, "step": 56260 }, { "epoch": 10.325747843641036, "grad_norm": 0.002620187122374773, "learning_rate": 5.5871979326282335e-06, "loss": 0.0, "num_input_tokens_seen": 121350720, "step": 56265 }, { "epoch": 10.326665443200588, "grad_norm": 28.946027755737305, "learning_rate": 5.586402709637997e-06, "loss": 0.1162, "num_input_tokens_seen": 121360832, "step": 56270 }, { "epoch": 10.32758304276014, "grad_norm": 0.009347430430352688, "learning_rate": 5.585607471607438e-06, "loss": 0.0004, "num_input_tokens_seen": 121372288, "step": 56275 }, { "epoch": 10.328500642319693, "grad_norm": 0.0012175999581813812, "learning_rate": 5.584812218556955e-06, "loss": 0.0001, "num_input_tokens_seen": 121382624, "step": 56280 }, { "epoch": 10.329418241879244, "grad_norm": 0.017173243686556816, "learning_rate": 5.584016950506947e-06, "loss": 0.0001, "num_input_tokens_seen": 121393248, "step": 56285 }, { "epoch": 10.330335841438796, "grad_norm": 11.526881217956543, "learning_rate": 5.583221667477807e-06, "loss": 0.1099, "num_input_tokens_seen": 121403296, "step": 56290 }, { "epoch": 10.33125344099835, "grad_norm": 0.0026086780708283186, "learning_rate": 5.582426369489937e-06, "loss": 0.0001, "num_input_tokens_seen": 121415264, "step": 56295 }, { "epoch": 10.3321710405579, "grad_norm": 0.0307435542345047, "learning_rate": 5.581631056563732e-06, "loss": 0.1098, "num_input_tokens_seen": 121426944, "step": 56300 }, { "epoch": 10.333088640117452, "grad_norm": 0.009333626367151737, "learning_rate": 5.580835728719593e-06, "loss": 0.0004, "num_input_tokens_seen": 121438112, "step": 56305 }, { "epoch": 10.334006239677006, "grad_norm": 0.07884891331195831, "learning_rate": 5.5800403859779175e-06, "loss": 0.0016, "num_input_tokens_seen": 121449504, "step": 56310 }, { "epoch": 10.334923839236557, "grad_norm": 3.2162528038024902, "learning_rate": 5.579245028359104e-06, "loss": 0.0014, "num_input_tokens_seen": 121460512, "step": 56315 }, { "epoch": 10.335841438796109, "grad_norm": 0.0107313496991992, "learning_rate": 5.5784496558835545e-06, "loss": 0.0005, "num_input_tokens_seen": 121472512, "step": 56320 }, { "epoch": 10.336759038355662, "grad_norm": 0.004597978200763464, "learning_rate": 5.5776542685716665e-06, "loss": 0.0, "num_input_tokens_seen": 121482848, "step": 56325 }, { "epoch": 10.337676637915214, "grad_norm": 0.002340425504371524, "learning_rate": 5.576858866443844e-06, "loss": 0.0015, "num_input_tokens_seen": 121493056, "step": 56330 }, { "epoch": 10.338594237474766, "grad_norm": 0.04187462851405144, "learning_rate": 5.576063449520484e-06, "loss": 0.0017, "num_input_tokens_seen": 121503520, "step": 56335 }, { "epoch": 10.339511837034319, "grad_norm": 0.00653241528198123, "learning_rate": 5.57526801782199e-06, "loss": 0.0008, "num_input_tokens_seen": 121513728, "step": 56340 }, { "epoch": 10.34042943659387, "grad_norm": 0.16742107272148132, "learning_rate": 5.574472571368763e-06, "loss": 0.2692, "num_input_tokens_seen": 121524032, "step": 56345 }, { "epoch": 10.341347036153422, "grad_norm": 0.023443235084414482, "learning_rate": 5.573677110181204e-06, "loss": 0.1322, "num_input_tokens_seen": 121535392, "step": 56350 }, { "epoch": 10.342264635712976, "grad_norm": 0.02830885536968708, "learning_rate": 5.572881634279716e-06, "loss": 0.0001, "num_input_tokens_seen": 121547168, "step": 56355 }, { "epoch": 10.343182235272527, "grad_norm": 0.023430662229657173, "learning_rate": 5.572086143684703e-06, "loss": 0.1732, "num_input_tokens_seen": 121557536, "step": 56360 }, { "epoch": 10.344099834832079, "grad_norm": 0.05889112874865532, "learning_rate": 5.571290638416566e-06, "loss": 0.0001, "num_input_tokens_seen": 121568448, "step": 56365 }, { "epoch": 10.345017434391632, "grad_norm": 0.0023026992566883564, "learning_rate": 5.570495118495711e-06, "loss": 0.1722, "num_input_tokens_seen": 121578272, "step": 56370 }, { "epoch": 10.345935033951184, "grad_norm": 0.12219604104757309, "learning_rate": 5.569699583942539e-06, "loss": 0.0003, "num_input_tokens_seen": 121589216, "step": 56375 }, { "epoch": 10.346852633510736, "grad_norm": 0.0003407877229619771, "learning_rate": 5.568904034777458e-06, "loss": 0.0002, "num_input_tokens_seen": 121600192, "step": 56380 }, { "epoch": 10.347770233070289, "grad_norm": 0.22750209271907806, "learning_rate": 5.5681084710208675e-06, "loss": 0.0003, "num_input_tokens_seen": 121610464, "step": 56385 }, { "epoch": 10.34868783262984, "grad_norm": 0.0048440443351864815, "learning_rate": 5.567312892693176e-06, "loss": 0.0001, "num_input_tokens_seen": 121622080, "step": 56390 }, { "epoch": 10.349605432189392, "grad_norm": 20.6540470123291, "learning_rate": 5.566517299814789e-06, "loss": 0.0602, "num_input_tokens_seen": 121632832, "step": 56395 }, { "epoch": 10.350523031748946, "grad_norm": 0.003151089418679476, "learning_rate": 5.565721692406109e-06, "loss": 0.0015, "num_input_tokens_seen": 121642432, "step": 56400 }, { "epoch": 10.351440631308497, "grad_norm": 0.0824432224035263, "learning_rate": 5.564926070487548e-06, "loss": 0.0003, "num_input_tokens_seen": 121653024, "step": 56405 }, { "epoch": 10.352358230868049, "grad_norm": 0.012883638963103294, "learning_rate": 5.564130434079506e-06, "loss": 0.001, "num_input_tokens_seen": 121663840, "step": 56410 }, { "epoch": 10.353275830427602, "grad_norm": 0.0529048852622509, "learning_rate": 5.563334783202393e-06, "loss": 0.0017, "num_input_tokens_seen": 121674048, "step": 56415 }, { "epoch": 10.354193429987154, "grad_norm": 0.0043907384388148785, "learning_rate": 5.5625391178766164e-06, "loss": 0.0001, "num_input_tokens_seen": 121685184, "step": 56420 }, { "epoch": 10.355111029546705, "grad_norm": 0.8435046672821045, "learning_rate": 5.561743438122583e-06, "loss": 0.0005, "num_input_tokens_seen": 121696000, "step": 56425 }, { "epoch": 10.356028629106259, "grad_norm": 0.006411864422261715, "learning_rate": 5.560947743960702e-06, "loss": 0.0001, "num_input_tokens_seen": 121706080, "step": 56430 }, { "epoch": 10.35694622866581, "grad_norm": 0.0009696328197605908, "learning_rate": 5.5601520354113805e-06, "loss": 0.3439, "num_input_tokens_seen": 121717472, "step": 56435 }, { "epoch": 10.357863828225362, "grad_norm": 0.027399800717830658, "learning_rate": 5.559356312495027e-06, "loss": 0.1595, "num_input_tokens_seen": 121728096, "step": 56440 }, { "epoch": 10.358781427784916, "grad_norm": 0.011704008094966412, "learning_rate": 5.55856057523205e-06, "loss": 0.0005, "num_input_tokens_seen": 121738880, "step": 56445 }, { "epoch": 10.359699027344467, "grad_norm": 101.70100402832031, "learning_rate": 5.557764823642862e-06, "loss": 0.1378, "num_input_tokens_seen": 121750432, "step": 56450 }, { "epoch": 10.360616626904019, "grad_norm": 19.52516746520996, "learning_rate": 5.556969057747871e-06, "loss": 0.0038, "num_input_tokens_seen": 121760768, "step": 56455 }, { "epoch": 10.361534226463572, "grad_norm": 0.3615284264087677, "learning_rate": 5.556173277567485e-06, "loss": 0.0006, "num_input_tokens_seen": 121772832, "step": 56460 }, { "epoch": 10.362451826023124, "grad_norm": 0.06214423105120659, "learning_rate": 5.555377483122117e-06, "loss": 0.0001, "num_input_tokens_seen": 121782144, "step": 56465 }, { "epoch": 10.363369425582675, "grad_norm": 0.04657341539859772, "learning_rate": 5.554581674432177e-06, "loss": 0.0001, "num_input_tokens_seen": 121793024, "step": 56470 }, { "epoch": 10.364287025142229, "grad_norm": 0.0043127224780619144, "learning_rate": 5.553785851518076e-06, "loss": 0.0646, "num_input_tokens_seen": 121803712, "step": 56475 }, { "epoch": 10.36520462470178, "grad_norm": 0.0011210102820768952, "learning_rate": 5.552990014400228e-06, "loss": 0.1566, "num_input_tokens_seen": 121814400, "step": 56480 }, { "epoch": 10.366122224261332, "grad_norm": 0.021062003448605537, "learning_rate": 5.552194163099042e-06, "loss": 0.0, "num_input_tokens_seen": 121824672, "step": 56485 }, { "epoch": 10.367039823820885, "grad_norm": 20.075925827026367, "learning_rate": 5.551398297634931e-06, "loss": 0.1653, "num_input_tokens_seen": 121835360, "step": 56490 }, { "epoch": 10.367957423380437, "grad_norm": 0.023444049060344696, "learning_rate": 5.55060241802831e-06, "loss": 0.1289, "num_input_tokens_seen": 121845760, "step": 56495 }, { "epoch": 10.368875022939989, "grad_norm": 0.016220292076468468, "learning_rate": 5.549806524299589e-06, "loss": 0.0032, "num_input_tokens_seen": 121857920, "step": 56500 }, { "epoch": 10.369792622499542, "grad_norm": 0.007682783994823694, "learning_rate": 5.5490106164691835e-06, "loss": 0.0001, "num_input_tokens_seen": 121868480, "step": 56505 }, { "epoch": 10.370710222059094, "grad_norm": 0.010397005826234818, "learning_rate": 5.548214694557506e-06, "loss": 0.0002, "num_input_tokens_seen": 121879104, "step": 56510 }, { "epoch": 10.371627821618645, "grad_norm": 0.008798751048743725, "learning_rate": 5.547418758584973e-06, "loss": 0.0379, "num_input_tokens_seen": 121890784, "step": 56515 }, { "epoch": 10.372545421178199, "grad_norm": 0.0019853231497108936, "learning_rate": 5.546622808571994e-06, "loss": 0.0012, "num_input_tokens_seen": 121900672, "step": 56520 }, { "epoch": 10.37346302073775, "grad_norm": 55.32249069213867, "learning_rate": 5.545826844538988e-06, "loss": 0.0079, "num_input_tokens_seen": 121910560, "step": 56525 }, { "epoch": 10.374380620297302, "grad_norm": 0.03211592882871628, "learning_rate": 5.545030866506373e-06, "loss": 0.0004, "num_input_tokens_seen": 121921536, "step": 56530 }, { "epoch": 10.375298219856855, "grad_norm": 0.003593806177377701, "learning_rate": 5.544234874494557e-06, "loss": 0.0001, "num_input_tokens_seen": 121931264, "step": 56535 }, { "epoch": 10.376215819416407, "grad_norm": 0.16918613016605377, "learning_rate": 5.543438868523961e-06, "loss": 0.0001, "num_input_tokens_seen": 121943008, "step": 56540 }, { "epoch": 10.377133418975959, "grad_norm": 70.05376434326172, "learning_rate": 5.542642848615001e-06, "loss": 0.2189, "num_input_tokens_seen": 121953728, "step": 56545 }, { "epoch": 10.378051018535512, "grad_norm": 1.8477201461791992, "learning_rate": 5.541846814788094e-06, "loss": 0.2224, "num_input_tokens_seen": 121964256, "step": 56550 }, { "epoch": 10.378968618095064, "grad_norm": 0.007691607344895601, "learning_rate": 5.541050767063653e-06, "loss": 0.0001, "num_input_tokens_seen": 121975488, "step": 56555 }, { "epoch": 10.379886217654615, "grad_norm": 0.005278459750115871, "learning_rate": 5.5402547054621e-06, "loss": 0.0001, "num_input_tokens_seen": 121986336, "step": 56560 }, { "epoch": 10.380803817214169, "grad_norm": 0.020135944709181786, "learning_rate": 5.5394586300038524e-06, "loss": 0.0001, "num_input_tokens_seen": 121997728, "step": 56565 }, { "epoch": 10.38172141677372, "grad_norm": 0.0017148818587884307, "learning_rate": 5.538662540709324e-06, "loss": 0.0001, "num_input_tokens_seen": 122010304, "step": 56570 }, { "epoch": 10.382639016333272, "grad_norm": 0.0011705863289535046, "learning_rate": 5.537866437598938e-06, "loss": 0.0001, "num_input_tokens_seen": 122022496, "step": 56575 }, { "epoch": 10.383556615892825, "grad_norm": 0.018065065145492554, "learning_rate": 5.537070320693112e-06, "loss": 0.0001, "num_input_tokens_seen": 122032992, "step": 56580 }, { "epoch": 10.384474215452377, "grad_norm": 0.018514422699809074, "learning_rate": 5.536274190012264e-06, "loss": 0.0001, "num_input_tokens_seen": 122043040, "step": 56585 }, { "epoch": 10.385391815011928, "grad_norm": 0.0008398293284699321, "learning_rate": 5.535478045576814e-06, "loss": 0.0001, "num_input_tokens_seen": 122054304, "step": 56590 }, { "epoch": 10.386309414571482, "grad_norm": 0.0028810889925807714, "learning_rate": 5.534681887407183e-06, "loss": 0.0002, "num_input_tokens_seen": 122064992, "step": 56595 }, { "epoch": 10.387227014131033, "grad_norm": 0.011667818762362003, "learning_rate": 5.533885715523788e-06, "loss": 0.0001, "num_input_tokens_seen": 122074880, "step": 56600 }, { "epoch": 10.388144613690585, "grad_norm": 0.03302031010389328, "learning_rate": 5.533089529947054e-06, "loss": 0.0382, "num_input_tokens_seen": 122086016, "step": 56605 }, { "epoch": 10.389062213250138, "grad_norm": 0.0011168795172125101, "learning_rate": 5.532293330697399e-06, "loss": 0.0001, "num_input_tokens_seen": 122096480, "step": 56610 }, { "epoch": 10.38997981280969, "grad_norm": 0.002337545156478882, "learning_rate": 5.531497117795246e-06, "loss": 0.0, "num_input_tokens_seen": 122105856, "step": 56615 }, { "epoch": 10.390897412369242, "grad_norm": 0.0005244463100098073, "learning_rate": 5.530700891261014e-06, "loss": 0.0002, "num_input_tokens_seen": 122117024, "step": 56620 }, { "epoch": 10.391815011928795, "grad_norm": 0.10174962133169174, "learning_rate": 5.529904651115128e-06, "loss": 0.0287, "num_input_tokens_seen": 122127456, "step": 56625 }, { "epoch": 10.392732611488347, "grad_norm": 0.0990779846906662, "learning_rate": 5.529108397378008e-06, "loss": 0.0002, "num_input_tokens_seen": 122138336, "step": 56630 }, { "epoch": 10.393650211047898, "grad_norm": 0.005774707067757845, "learning_rate": 5.528312130070078e-06, "loss": 0.0005, "num_input_tokens_seen": 122149056, "step": 56635 }, { "epoch": 10.394567810607452, "grad_norm": 0.0009883391903713346, "learning_rate": 5.527515849211762e-06, "loss": 0.0001, "num_input_tokens_seen": 122159744, "step": 56640 }, { "epoch": 10.395485410167003, "grad_norm": 0.0014123909641057253, "learning_rate": 5.52671955482348e-06, "loss": 0.0004, "num_input_tokens_seen": 122170176, "step": 56645 }, { "epoch": 10.396403009726555, "grad_norm": 0.05596425384283066, "learning_rate": 5.525923246925659e-06, "loss": 0.0002, "num_input_tokens_seen": 122180288, "step": 56650 }, { "epoch": 10.397320609286108, "grad_norm": 0.000732541608158499, "learning_rate": 5.5251269255387206e-06, "loss": 0.0, "num_input_tokens_seen": 122190720, "step": 56655 }, { "epoch": 10.39823820884566, "grad_norm": 0.17034794390201569, "learning_rate": 5.524330590683092e-06, "loss": 0.0002, "num_input_tokens_seen": 122202048, "step": 56660 }, { "epoch": 10.399155808405212, "grad_norm": 0.0009811506606638432, "learning_rate": 5.523534242379196e-06, "loss": 0.0012, "num_input_tokens_seen": 122212864, "step": 56665 }, { "epoch": 10.400073407964765, "grad_norm": 152.0744171142578, "learning_rate": 5.5227378806474574e-06, "loss": 0.0733, "num_input_tokens_seen": 122222912, "step": 56670 }, { "epoch": 10.400991007524317, "grad_norm": 0.004942314233630896, "learning_rate": 5.521941505508303e-06, "loss": 0.0, "num_input_tokens_seen": 122233536, "step": 56675 }, { "epoch": 10.401908607083868, "grad_norm": 0.04985817149281502, "learning_rate": 5.5211451169821586e-06, "loss": 0.0329, "num_input_tokens_seen": 122243904, "step": 56680 }, { "epoch": 10.402826206643422, "grad_norm": 32.946937561035156, "learning_rate": 5.520348715089448e-06, "loss": 0.1439, "num_input_tokens_seen": 122253504, "step": 56685 }, { "epoch": 10.403743806202973, "grad_norm": 0.0026066957507282495, "learning_rate": 5.5195522998506e-06, "loss": 0.0001, "num_input_tokens_seen": 122264288, "step": 56690 }, { "epoch": 10.404661405762525, "grad_norm": 0.014357979409396648, "learning_rate": 5.51875587128604e-06, "loss": 0.0106, "num_input_tokens_seen": 122275008, "step": 56695 }, { "epoch": 10.405579005322078, "grad_norm": 0.07666002959012985, "learning_rate": 5.517959429416198e-06, "loss": 0.0001, "num_input_tokens_seen": 122285728, "step": 56700 }, { "epoch": 10.40649660488163, "grad_norm": 0.0029571535997092724, "learning_rate": 5.517162974261498e-06, "loss": 0.0003, "num_input_tokens_seen": 122297344, "step": 56705 }, { "epoch": 10.407414204441181, "grad_norm": 0.002686037216335535, "learning_rate": 5.516366505842368e-06, "loss": 0.033, "num_input_tokens_seen": 122306528, "step": 56710 }, { "epoch": 10.408331804000735, "grad_norm": 0.00385847850702703, "learning_rate": 5.51557002417924e-06, "loss": 0.1348, "num_input_tokens_seen": 122316928, "step": 56715 }, { "epoch": 10.409249403560286, "grad_norm": 0.014397162944078445, "learning_rate": 5.514773529292537e-06, "loss": 0.2112, "num_input_tokens_seen": 122327968, "step": 56720 }, { "epoch": 10.410167003119838, "grad_norm": 0.02467329427599907, "learning_rate": 5.513977021202693e-06, "loss": 0.0002, "num_input_tokens_seen": 122340064, "step": 56725 }, { "epoch": 10.411084602679392, "grad_norm": 0.0004933277959935367, "learning_rate": 5.513180499930134e-06, "loss": 0.0, "num_input_tokens_seen": 122350784, "step": 56730 }, { "epoch": 10.412002202238943, "grad_norm": 0.002282818779349327, "learning_rate": 5.512383965495292e-06, "loss": 0.001, "num_input_tokens_seen": 122359936, "step": 56735 }, { "epoch": 10.412919801798495, "grad_norm": 0.0019244805444031954, "learning_rate": 5.511587417918593e-06, "loss": 0.0001, "num_input_tokens_seen": 122370208, "step": 56740 }, { "epoch": 10.413837401358048, "grad_norm": 0.0018486034823581576, "learning_rate": 5.510790857220472e-06, "loss": 0.0478, "num_input_tokens_seen": 122381824, "step": 56745 }, { "epoch": 10.4147550009176, "grad_norm": 0.021429920569062233, "learning_rate": 5.509994283421356e-06, "loss": 0.0013, "num_input_tokens_seen": 122392512, "step": 56750 }, { "epoch": 10.415672600477151, "grad_norm": 27.001428604125977, "learning_rate": 5.509197696541677e-06, "loss": 0.1346, "num_input_tokens_seen": 122403456, "step": 56755 }, { "epoch": 10.416590200036705, "grad_norm": 0.07909750193357468, "learning_rate": 5.508401096601867e-06, "loss": 0.001, "num_input_tokens_seen": 122414848, "step": 56760 }, { "epoch": 10.417507799596256, "grad_norm": 0.00964504573494196, "learning_rate": 5.5076044836223565e-06, "loss": 0.0001, "num_input_tokens_seen": 122425280, "step": 56765 }, { "epoch": 10.418425399155808, "grad_norm": 38.30149841308594, "learning_rate": 5.5068078576235776e-06, "loss": 0.0021, "num_input_tokens_seen": 122434368, "step": 56770 }, { "epoch": 10.419342998715361, "grad_norm": 0.013426396064460278, "learning_rate": 5.506011218625962e-06, "loss": 0.0001, "num_input_tokens_seen": 122445760, "step": 56775 }, { "epoch": 10.420260598274913, "grad_norm": 0.02715824358165264, "learning_rate": 5.505214566649944e-06, "loss": 0.0001, "num_input_tokens_seen": 122456320, "step": 56780 }, { "epoch": 10.421178197834465, "grad_norm": 0.024465899914503098, "learning_rate": 5.504417901715955e-06, "loss": 0.144, "num_input_tokens_seen": 122466272, "step": 56785 }, { "epoch": 10.422095797394018, "grad_norm": 0.00041248041088692844, "learning_rate": 5.503621223844429e-06, "loss": 0.0, "num_input_tokens_seen": 122475712, "step": 56790 }, { "epoch": 10.42301339695357, "grad_norm": 0.1364547163248062, "learning_rate": 5.5028245330557975e-06, "loss": 0.1966, "num_input_tokens_seen": 122484896, "step": 56795 }, { "epoch": 10.423930996513121, "grad_norm": 2.0154800415039062, "learning_rate": 5.502027829370497e-06, "loss": 0.0003, "num_input_tokens_seen": 122495808, "step": 56800 }, { "epoch": 10.424848596072675, "grad_norm": 28.251222610473633, "learning_rate": 5.5012311128089615e-06, "loss": 0.0426, "num_input_tokens_seen": 122506720, "step": 56805 }, { "epoch": 10.425766195632226, "grad_norm": 0.019889788702130318, "learning_rate": 5.500434383391624e-06, "loss": 0.0001, "num_input_tokens_seen": 122518976, "step": 56810 }, { "epoch": 10.426683795191778, "grad_norm": 0.003078159876167774, "learning_rate": 5.4996376411389205e-06, "loss": 0.0383, "num_input_tokens_seen": 122531264, "step": 56815 }, { "epoch": 10.427601394751331, "grad_norm": 0.01871572807431221, "learning_rate": 5.498840886071285e-06, "loss": 0.0003, "num_input_tokens_seen": 122542528, "step": 56820 }, { "epoch": 10.428518994310883, "grad_norm": 0.005179295781999826, "learning_rate": 5.498044118209155e-06, "loss": 0.0002, "num_input_tokens_seen": 122553536, "step": 56825 }, { "epoch": 10.429436593870435, "grad_norm": 0.06662248075008392, "learning_rate": 5.497247337572964e-06, "loss": 0.0001, "num_input_tokens_seen": 122564096, "step": 56830 }, { "epoch": 10.430354193429988, "grad_norm": 0.031151695176959038, "learning_rate": 5.496450544183151e-06, "loss": 0.0119, "num_input_tokens_seen": 122574400, "step": 56835 }, { "epoch": 10.43127179298954, "grad_norm": 0.699925422668457, "learning_rate": 5.495653738060151e-06, "loss": 0.0006, "num_input_tokens_seen": 122584352, "step": 56840 }, { "epoch": 10.432189392549091, "grad_norm": 0.07488805800676346, "learning_rate": 5.494856919224398e-06, "loss": 0.0128, "num_input_tokens_seen": 122595232, "step": 56845 }, { "epoch": 10.433106992108645, "grad_norm": 33.50191116333008, "learning_rate": 5.494060087696336e-06, "loss": 0.0649, "num_input_tokens_seen": 122607040, "step": 56850 }, { "epoch": 10.434024591668196, "grad_norm": 7.148589134216309, "learning_rate": 5.493263243496396e-06, "loss": 0.0105, "num_input_tokens_seen": 122618528, "step": 56855 }, { "epoch": 10.434942191227748, "grad_norm": 0.05028637498617172, "learning_rate": 5.492466386645019e-06, "loss": 0.0001, "num_input_tokens_seen": 122629344, "step": 56860 }, { "epoch": 10.435859790787301, "grad_norm": 0.002914692973718047, "learning_rate": 5.491669517162642e-06, "loss": 0.0, "num_input_tokens_seen": 122639008, "step": 56865 }, { "epoch": 10.436777390346853, "grad_norm": 0.003453716402873397, "learning_rate": 5.490872635069705e-06, "loss": 0.0002, "num_input_tokens_seen": 122651392, "step": 56870 }, { "epoch": 10.437694989906404, "grad_norm": 0.001676869811490178, "learning_rate": 5.490075740386644e-06, "loss": 0.1391, "num_input_tokens_seen": 122661760, "step": 56875 }, { "epoch": 10.438612589465958, "grad_norm": 0.03208339214324951, "learning_rate": 5.4892788331339005e-06, "loss": 0.0, "num_input_tokens_seen": 122673248, "step": 56880 }, { "epoch": 10.43953018902551, "grad_norm": 0.08338850736618042, "learning_rate": 5.488481913331914e-06, "loss": 0.0003, "num_input_tokens_seen": 122683648, "step": 56885 }, { "epoch": 10.440447788585061, "grad_norm": 0.1584022045135498, "learning_rate": 5.487684981001124e-06, "loss": 0.116, "num_input_tokens_seen": 122695456, "step": 56890 }, { "epoch": 10.441365388144614, "grad_norm": 0.00858781673014164, "learning_rate": 5.486888036161968e-06, "loss": 0.0001, "num_input_tokens_seen": 122706048, "step": 56895 }, { "epoch": 10.442282987704166, "grad_norm": 0.54423987865448, "learning_rate": 5.48609107883489e-06, "loss": 0.0011, "num_input_tokens_seen": 122717184, "step": 56900 }, { "epoch": 10.443200587263718, "grad_norm": 0.0017513786442577839, "learning_rate": 5.485294109040328e-06, "loss": 0.001, "num_input_tokens_seen": 122728640, "step": 56905 }, { "epoch": 10.444118186823271, "grad_norm": 0.004561136942356825, "learning_rate": 5.4844971267987255e-06, "loss": 0.0001, "num_input_tokens_seen": 122739072, "step": 56910 }, { "epoch": 10.445035786382823, "grad_norm": 0.10222097486257553, "learning_rate": 5.483700132130522e-06, "loss": 0.0001, "num_input_tokens_seen": 122749312, "step": 56915 }, { "epoch": 10.445953385942374, "grad_norm": 0.3806973993778229, "learning_rate": 5.482903125056159e-06, "loss": 0.2295, "num_input_tokens_seen": 122760352, "step": 56920 }, { "epoch": 10.446870985501928, "grad_norm": 0.0013352902606129646, "learning_rate": 5.482106105596081e-06, "loss": 0.001, "num_input_tokens_seen": 122770880, "step": 56925 }, { "epoch": 10.44778858506148, "grad_norm": 0.4205450415611267, "learning_rate": 5.481309073770728e-06, "loss": 0.0005, "num_input_tokens_seen": 122781600, "step": 56930 }, { "epoch": 10.448706184621031, "grad_norm": 0.001112528843805194, "learning_rate": 5.480512029600542e-06, "loss": 0.0, "num_input_tokens_seen": 122791264, "step": 56935 }, { "epoch": 10.449623784180584, "grad_norm": 0.03723020851612091, "learning_rate": 5.479714973105968e-06, "loss": 0.2688, "num_input_tokens_seen": 122801504, "step": 56940 }, { "epoch": 10.450541383740136, "grad_norm": 0.04493463411927223, "learning_rate": 5.478917904307448e-06, "loss": 0.1159, "num_input_tokens_seen": 122811936, "step": 56945 }, { "epoch": 10.451458983299688, "grad_norm": 0.002496343804523349, "learning_rate": 5.478120823225427e-06, "loss": 0.0001, "num_input_tokens_seen": 122822336, "step": 56950 }, { "epoch": 10.452376582859241, "grad_norm": 35.76667785644531, "learning_rate": 5.477323729880347e-06, "loss": 0.3376, "num_input_tokens_seen": 122832512, "step": 56955 }, { "epoch": 10.453294182418793, "grad_norm": 0.00043734381324611604, "learning_rate": 5.476526624292654e-06, "loss": 0.1377, "num_input_tokens_seen": 122842784, "step": 56960 }, { "epoch": 10.454211781978344, "grad_norm": 0.018915727734565735, "learning_rate": 5.475729506482791e-06, "loss": 0.0046, "num_input_tokens_seen": 122854656, "step": 56965 }, { "epoch": 10.455129381537898, "grad_norm": 0.0009786123409867287, "learning_rate": 5.474932376471204e-06, "loss": 0.0325, "num_input_tokens_seen": 122865504, "step": 56970 }, { "epoch": 10.45604698109745, "grad_norm": 0.028730614110827446, "learning_rate": 5.474135234278337e-06, "loss": 0.0001, "num_input_tokens_seen": 122876256, "step": 56975 }, { "epoch": 10.456964580657, "grad_norm": 0.001993197947740555, "learning_rate": 5.473338079924637e-06, "loss": 0.0003, "num_input_tokens_seen": 122887488, "step": 56980 }, { "epoch": 10.457882180216554, "grad_norm": 1.346579670906067, "learning_rate": 5.472540913430547e-06, "loss": 0.1075, "num_input_tokens_seen": 122898848, "step": 56985 }, { "epoch": 10.458799779776106, "grad_norm": 0.0022292733192443848, "learning_rate": 5.471743734816517e-06, "loss": 0.0145, "num_input_tokens_seen": 122909984, "step": 56990 }, { "epoch": 10.459717379335657, "grad_norm": 0.04137880727648735, "learning_rate": 5.470946544102992e-06, "loss": 0.0013, "num_input_tokens_seen": 122920384, "step": 56995 }, { "epoch": 10.46063497889521, "grad_norm": 0.0011123629519715905, "learning_rate": 5.470149341310415e-06, "loss": 0.1697, "num_input_tokens_seen": 122933440, "step": 57000 }, { "epoch": 10.461552578454762, "grad_norm": 5.859216213226318, "learning_rate": 5.469352126459237e-06, "loss": 0.009, "num_input_tokens_seen": 122944224, "step": 57005 }, { "epoch": 10.462470178014314, "grad_norm": 22.838010787963867, "learning_rate": 5.468554899569905e-06, "loss": 0.1145, "num_input_tokens_seen": 122956192, "step": 57010 }, { "epoch": 10.463387777573868, "grad_norm": 0.014387933537364006, "learning_rate": 5.4677576606628665e-06, "loss": 0.0006, "num_input_tokens_seen": 122967424, "step": 57015 }, { "epoch": 10.46430537713342, "grad_norm": 0.017497500404715538, "learning_rate": 5.466960409758569e-06, "loss": 0.0023, "num_input_tokens_seen": 122977504, "step": 57020 }, { "epoch": 10.46522297669297, "grad_norm": 0.029210714623332024, "learning_rate": 5.46616314687746e-06, "loss": 0.0037, "num_input_tokens_seen": 122987584, "step": 57025 }, { "epoch": 10.466140576252524, "grad_norm": 0.0017727898666635156, "learning_rate": 5.4653658720399885e-06, "loss": 0.0001, "num_input_tokens_seen": 122998784, "step": 57030 }, { "epoch": 10.467058175812076, "grad_norm": 0.017345504835247993, "learning_rate": 5.4645685852666045e-06, "loss": 0.0011, "num_input_tokens_seen": 123009312, "step": 57035 }, { "epoch": 10.467975775371627, "grad_norm": 0.001021328498609364, "learning_rate": 5.463771286577755e-06, "loss": 0.2287, "num_input_tokens_seen": 123020192, "step": 57040 }, { "epoch": 10.46889337493118, "grad_norm": 0.27079519629478455, "learning_rate": 5.4629739759938926e-06, "loss": 0.0004, "num_input_tokens_seen": 123031264, "step": 57045 }, { "epoch": 10.469810974490732, "grad_norm": 0.038911767303943634, "learning_rate": 5.462176653535464e-06, "loss": 0.0002, "num_input_tokens_seen": 123042656, "step": 57050 }, { "epoch": 10.470728574050284, "grad_norm": 0.04849906638264656, "learning_rate": 5.46137931922292e-06, "loss": 0.0001, "num_input_tokens_seen": 123054016, "step": 57055 }, { "epoch": 10.471646173609837, "grad_norm": 0.008702500723302364, "learning_rate": 5.460581973076713e-06, "loss": 0.0, "num_input_tokens_seen": 123063840, "step": 57060 }, { "epoch": 10.472563773169389, "grad_norm": 0.16106215119361877, "learning_rate": 5.459784615117292e-06, "loss": 0.0002, "num_input_tokens_seen": 123074336, "step": 57065 }, { "epoch": 10.47348137272894, "grad_norm": 0.006636140868067741, "learning_rate": 5.458987245365108e-06, "loss": 0.0, "num_input_tokens_seen": 123084672, "step": 57070 }, { "epoch": 10.474398972288494, "grad_norm": 0.8195241093635559, "learning_rate": 5.4581898638406115e-06, "loss": 0.0008, "num_input_tokens_seen": 123095776, "step": 57075 }, { "epoch": 10.475316571848046, "grad_norm": 30.94024658203125, "learning_rate": 5.4573924705642565e-06, "loss": 0.1945, "num_input_tokens_seen": 123107104, "step": 57080 }, { "epoch": 10.476234171407597, "grad_norm": 0.0044010416604578495, "learning_rate": 5.456595065556493e-06, "loss": 0.0, "num_input_tokens_seen": 123118464, "step": 57085 }, { "epoch": 10.47715177096715, "grad_norm": 0.002432862762361765, "learning_rate": 5.455797648837774e-06, "loss": 0.2157, "num_input_tokens_seen": 123129792, "step": 57090 }, { "epoch": 10.478069370526702, "grad_norm": 0.015607628971338272, "learning_rate": 5.455000220428551e-06, "loss": 0.0011, "num_input_tokens_seen": 123141312, "step": 57095 }, { "epoch": 10.478986970086254, "grad_norm": 93.75394439697266, "learning_rate": 5.454202780349279e-06, "loss": 0.0976, "num_input_tokens_seen": 123151616, "step": 57100 }, { "epoch": 10.479904569645807, "grad_norm": 0.0009867546614259481, "learning_rate": 5.4534053286204084e-06, "loss": 0.0001, "num_input_tokens_seen": 123162368, "step": 57105 }, { "epoch": 10.480822169205359, "grad_norm": 0.008533432148396969, "learning_rate": 5.452607865262394e-06, "loss": 0.0001, "num_input_tokens_seen": 123173568, "step": 57110 }, { "epoch": 10.48173976876491, "grad_norm": 0.3179078996181488, "learning_rate": 5.451810390295689e-06, "loss": 0.0002, "num_input_tokens_seen": 123183648, "step": 57115 }, { "epoch": 10.482657368324464, "grad_norm": 67.4979248046875, "learning_rate": 5.451012903740749e-06, "loss": 0.4598, "num_input_tokens_seen": 123194528, "step": 57120 }, { "epoch": 10.483574967884016, "grad_norm": 0.008086246438324451, "learning_rate": 5.4502154056180255e-06, "loss": 0.0001, "num_input_tokens_seen": 123206720, "step": 57125 }, { "epoch": 10.484492567443567, "grad_norm": 60.26477813720703, "learning_rate": 5.449417895947976e-06, "loss": 0.1837, "num_input_tokens_seen": 123216224, "step": 57130 }, { "epoch": 10.48541016700312, "grad_norm": 0.0006026193732395768, "learning_rate": 5.4486203747510525e-06, "loss": 0.0002, "num_input_tokens_seen": 123227872, "step": 57135 }, { "epoch": 10.486327766562672, "grad_norm": 56.66249084472656, "learning_rate": 5.447822842047712e-06, "loss": 0.2219, "num_input_tokens_seen": 123238144, "step": 57140 }, { "epoch": 10.487245366122224, "grad_norm": 0.0018077089916914701, "learning_rate": 5.447025297858411e-06, "loss": 0.0, "num_input_tokens_seen": 123248448, "step": 57145 }, { "epoch": 10.488162965681777, "grad_norm": 0.009206937626004219, "learning_rate": 5.446227742203603e-06, "loss": 0.0002, "num_input_tokens_seen": 123259616, "step": 57150 }, { "epoch": 10.489080565241329, "grad_norm": 0.018916266039013863, "learning_rate": 5.445430175103745e-06, "loss": 0.0, "num_input_tokens_seen": 123269472, "step": 57155 }, { "epoch": 10.48999816480088, "grad_norm": 0.052049420773983, "learning_rate": 5.444632596579293e-06, "loss": 0.0003, "num_input_tokens_seen": 123281056, "step": 57160 }, { "epoch": 10.490915764360434, "grad_norm": 8.679265022277832, "learning_rate": 5.443835006650704e-06, "loss": 0.025, "num_input_tokens_seen": 123290720, "step": 57165 }, { "epoch": 10.491833363919985, "grad_norm": 0.008941330946981907, "learning_rate": 5.443037405338436e-06, "loss": 0.092, "num_input_tokens_seen": 123301856, "step": 57170 }, { "epoch": 10.492750963479537, "grad_norm": 0.022818133234977722, "learning_rate": 5.442239792662944e-06, "loss": 0.0001, "num_input_tokens_seen": 123312640, "step": 57175 }, { "epoch": 10.49366856303909, "grad_norm": 0.006968457251787186, "learning_rate": 5.441442168644688e-06, "loss": 0.0004, "num_input_tokens_seen": 123323104, "step": 57180 }, { "epoch": 10.494586162598642, "grad_norm": 0.001037035370245576, "learning_rate": 5.4406445333041235e-06, "loss": 0.0004, "num_input_tokens_seen": 123334240, "step": 57185 }, { "epoch": 10.495503762158194, "grad_norm": 0.1604793220758438, "learning_rate": 5.439846886661711e-06, "loss": 0.0007, "num_input_tokens_seen": 123345728, "step": 57190 }, { "epoch": 10.496421361717747, "grad_norm": 0.0010527390986680984, "learning_rate": 5.439049228737906e-06, "loss": 0.0003, "num_input_tokens_seen": 123356704, "step": 57195 }, { "epoch": 10.497338961277299, "grad_norm": 0.0013464309740811586, "learning_rate": 5.4382515595531695e-06, "loss": 0.0014, "num_input_tokens_seen": 123367712, "step": 57200 }, { "epoch": 10.49825656083685, "grad_norm": 0.00027968219364993274, "learning_rate": 5.43745387912796e-06, "loss": 0.0963, "num_input_tokens_seen": 123378048, "step": 57205 }, { "epoch": 10.499174160396404, "grad_norm": 0.03342220187187195, "learning_rate": 5.436656187482736e-06, "loss": 0.0005, "num_input_tokens_seen": 123389312, "step": 57210 }, { "epoch": 10.500091759955955, "grad_norm": 20.210948944091797, "learning_rate": 5.435858484637957e-06, "loss": 0.061, "num_input_tokens_seen": 123400576, "step": 57215 }, { "epoch": 10.501009359515507, "grad_norm": 0.051285505294799805, "learning_rate": 5.435060770614085e-06, "loss": 0.0483, "num_input_tokens_seen": 123412064, "step": 57220 }, { "epoch": 10.50192695907506, "grad_norm": 0.003039357950910926, "learning_rate": 5.434263045431577e-06, "loss": 0.0, "num_input_tokens_seen": 123422752, "step": 57225 }, { "epoch": 10.502844558634612, "grad_norm": 0.011843577958643436, "learning_rate": 5.433465309110896e-06, "loss": 0.0002, "num_input_tokens_seen": 123432512, "step": 57230 }, { "epoch": 10.503762158194164, "grad_norm": 0.0005532989744096994, "learning_rate": 5.4326675616725e-06, "loss": 0.0131, "num_input_tokens_seen": 123443552, "step": 57235 }, { "epoch": 10.504679757753717, "grad_norm": 0.40468135476112366, "learning_rate": 5.431869803136852e-06, "loss": 0.001, "num_input_tokens_seen": 123452768, "step": 57240 }, { "epoch": 10.505597357313269, "grad_norm": 0.0018778465455397964, "learning_rate": 5.431072033524415e-06, "loss": 0.0823, "num_input_tokens_seen": 123463296, "step": 57245 }, { "epoch": 10.50651495687282, "grad_norm": 8.303051948547363, "learning_rate": 5.430274252855646e-06, "loss": 0.0016, "num_input_tokens_seen": 123474752, "step": 57250 }, { "epoch": 10.507432556432374, "grad_norm": 0.36732396483421326, "learning_rate": 5.429476461151011e-06, "loss": 0.0007, "num_input_tokens_seen": 123484960, "step": 57255 }, { "epoch": 10.508350155991925, "grad_norm": 0.002243409864604473, "learning_rate": 5.428678658430969e-06, "loss": 0.0001, "num_input_tokens_seen": 123496512, "step": 57260 }, { "epoch": 10.509267755551477, "grad_norm": 0.0010065059177577496, "learning_rate": 5.427880844715986e-06, "loss": 0.0007, "num_input_tokens_seen": 123507392, "step": 57265 }, { "epoch": 10.51018535511103, "grad_norm": 0.0004325264017097652, "learning_rate": 5.4270830200265205e-06, "loss": 0.0703, "num_input_tokens_seen": 123516832, "step": 57270 }, { "epoch": 10.511102954670582, "grad_norm": 0.2072402834892273, "learning_rate": 5.4262851843830364e-06, "loss": 0.001, "num_input_tokens_seen": 123528832, "step": 57275 }, { "epoch": 10.512020554230133, "grad_norm": 0.0005289014661684632, "learning_rate": 5.425487337806001e-06, "loss": 0.0331, "num_input_tokens_seen": 123539232, "step": 57280 }, { "epoch": 10.512938153789687, "grad_norm": 0.0064232987351715565, "learning_rate": 5.424689480315872e-06, "loss": 0.0, "num_input_tokens_seen": 123550880, "step": 57285 }, { "epoch": 10.513855753349238, "grad_norm": 0.002748162019997835, "learning_rate": 5.423891611933119e-06, "loss": 0.0023, "num_input_tokens_seen": 123561472, "step": 57290 }, { "epoch": 10.51477335290879, "grad_norm": 0.011556875891983509, "learning_rate": 5.423093732678201e-06, "loss": 0.0427, "num_input_tokens_seen": 123571840, "step": 57295 }, { "epoch": 10.515690952468344, "grad_norm": 0.0034236914943903685, "learning_rate": 5.422295842571585e-06, "loss": 0.008, "num_input_tokens_seen": 123583424, "step": 57300 }, { "epoch": 10.516608552027895, "grad_norm": 0.06062209978699684, "learning_rate": 5.421497941633735e-06, "loss": 0.0001, "num_input_tokens_seen": 123594752, "step": 57305 }, { "epoch": 10.517526151587447, "grad_norm": 0.005004222504794598, "learning_rate": 5.420700029885118e-06, "loss": 0.0008, "num_input_tokens_seen": 123605504, "step": 57310 }, { "epoch": 10.518443751147, "grad_norm": 0.48244228959083557, "learning_rate": 5.419902107346195e-06, "loss": 0.0004, "num_input_tokens_seen": 123617248, "step": 57315 }, { "epoch": 10.519361350706552, "grad_norm": 0.007962711155414581, "learning_rate": 5.419104174037434e-06, "loss": 0.0, "num_input_tokens_seen": 123628864, "step": 57320 }, { "epoch": 10.520278950266103, "grad_norm": 0.0017226871568709612, "learning_rate": 5.4183062299793e-06, "loss": 0.0, "num_input_tokens_seen": 123639552, "step": 57325 }, { "epoch": 10.521196549825657, "grad_norm": 0.0035815201699733734, "learning_rate": 5.417508275192261e-06, "loss": 0.0001, "num_input_tokens_seen": 123649440, "step": 57330 }, { "epoch": 10.522114149385208, "grad_norm": 0.03192564845085144, "learning_rate": 5.416710309696781e-06, "loss": 0.0, "num_input_tokens_seen": 123657632, "step": 57335 }, { "epoch": 10.52303174894476, "grad_norm": 0.00033485417952761054, "learning_rate": 5.415912333513328e-06, "loss": 0.0, "num_input_tokens_seen": 123667840, "step": 57340 }, { "epoch": 10.523949348504313, "grad_norm": 0.005972911138087511, "learning_rate": 5.415114346662366e-06, "loss": 0.0, "num_input_tokens_seen": 123679424, "step": 57345 }, { "epoch": 10.524866948063865, "grad_norm": 0.000906962261069566, "learning_rate": 5.414316349164367e-06, "loss": 0.1632, "num_input_tokens_seen": 123690016, "step": 57350 }, { "epoch": 10.525784547623417, "grad_norm": 0.002178569557145238, "learning_rate": 5.413518341039795e-06, "loss": 0.0, "num_input_tokens_seen": 123701472, "step": 57355 }, { "epoch": 10.52670214718297, "grad_norm": 0.0011853951727971435, "learning_rate": 5.4127203223091176e-06, "loss": 0.0, "num_input_tokens_seen": 123712288, "step": 57360 }, { "epoch": 10.527619746742522, "grad_norm": 0.01831727661192417, "learning_rate": 5.411922292992805e-06, "loss": 0.0003, "num_input_tokens_seen": 123721952, "step": 57365 }, { "epoch": 10.528537346302073, "grad_norm": 0.0003144191869068891, "learning_rate": 5.4111242531113225e-06, "loss": 0.0001, "num_input_tokens_seen": 123732928, "step": 57370 }, { "epoch": 10.529454945861627, "grad_norm": 0.002994734328240156, "learning_rate": 5.410326202685141e-06, "loss": 0.0001, "num_input_tokens_seen": 123742656, "step": 57375 }, { "epoch": 10.530372545421178, "grad_norm": 0.013184136711061, "learning_rate": 5.409528141734729e-06, "loss": 0.0002, "num_input_tokens_seen": 123753312, "step": 57380 }, { "epoch": 10.53129014498073, "grad_norm": 0.0009248206624761224, "learning_rate": 5.408730070280553e-06, "loss": 0.0004, "num_input_tokens_seen": 123763456, "step": 57385 }, { "epoch": 10.532207744540283, "grad_norm": 0.1356370896100998, "learning_rate": 5.4079319883430855e-06, "loss": 0.0003, "num_input_tokens_seen": 123775584, "step": 57390 }, { "epoch": 10.533125344099835, "grad_norm": 1.6321595907211304, "learning_rate": 5.407133895942794e-06, "loss": 0.0003, "num_input_tokens_seen": 123786592, "step": 57395 }, { "epoch": 10.534042943659387, "grad_norm": 0.007472306024283171, "learning_rate": 5.406335793100151e-06, "loss": 0.0001, "num_input_tokens_seen": 123796576, "step": 57400 }, { "epoch": 10.53496054321894, "grad_norm": 0.0042243716306984425, "learning_rate": 5.405537679835623e-06, "loss": 0.0, "num_input_tokens_seen": 123807808, "step": 57405 }, { "epoch": 10.535878142778492, "grad_norm": 0.003983639180660248, "learning_rate": 5.404739556169683e-06, "loss": 0.0, "num_input_tokens_seen": 123817536, "step": 57410 }, { "epoch": 10.536795742338043, "grad_norm": 0.0014868768630549312, "learning_rate": 5.403941422122799e-06, "loss": 0.0001, "num_input_tokens_seen": 123829056, "step": 57415 }, { "epoch": 10.537713341897597, "grad_norm": 0.017014604061841965, "learning_rate": 5.403143277715446e-06, "loss": 0.0109, "num_input_tokens_seen": 123838656, "step": 57420 }, { "epoch": 10.538630941457148, "grad_norm": 0.0880575180053711, "learning_rate": 5.402345122968091e-06, "loss": 0.0001, "num_input_tokens_seen": 123849536, "step": 57425 }, { "epoch": 10.5395485410167, "grad_norm": 0.00556669756770134, "learning_rate": 5.401546957901207e-06, "loss": 0.0001, "num_input_tokens_seen": 123859488, "step": 57430 }, { "epoch": 10.540466140576253, "grad_norm": 0.0030678045004606247, "learning_rate": 5.4007487825352655e-06, "loss": 0.0, "num_input_tokens_seen": 123869792, "step": 57435 }, { "epoch": 10.541383740135805, "grad_norm": 42.1087532043457, "learning_rate": 5.399950596890741e-06, "loss": 0.0588, "num_input_tokens_seen": 123881216, "step": 57440 }, { "epoch": 10.542301339695356, "grad_norm": 0.0037699600216001272, "learning_rate": 5.399152400988101e-06, "loss": 0.1719, "num_input_tokens_seen": 123891872, "step": 57445 }, { "epoch": 10.54321893925491, "grad_norm": 0.12267360836267471, "learning_rate": 5.3983541948478236e-06, "loss": 0.1657, "num_input_tokens_seen": 123903136, "step": 57450 }, { "epoch": 10.544136538814461, "grad_norm": 0.0003171357966493815, "learning_rate": 5.397555978490375e-06, "loss": 0.0175, "num_input_tokens_seen": 123914048, "step": 57455 }, { "epoch": 10.545054138374013, "grad_norm": 0.007995261810719967, "learning_rate": 5.3967577519362335e-06, "loss": 0.0002, "num_input_tokens_seen": 123925376, "step": 57460 }, { "epoch": 10.545971737933566, "grad_norm": 0.006380242761224508, "learning_rate": 5.395959515205871e-06, "loss": 0.0032, "num_input_tokens_seen": 123935808, "step": 57465 }, { "epoch": 10.546889337493118, "grad_norm": 0.0036075206007808447, "learning_rate": 5.395161268319759e-06, "loss": 0.0, "num_input_tokens_seen": 123947584, "step": 57470 }, { "epoch": 10.54780693705267, "grad_norm": 0.007128443568944931, "learning_rate": 5.394363011298376e-06, "loss": 0.2001, "num_input_tokens_seen": 123958720, "step": 57475 }, { "epoch": 10.548724536612223, "grad_norm": 0.0011427808785811067, "learning_rate": 5.393564744162189e-06, "loss": 0.0001, "num_input_tokens_seen": 123968832, "step": 57480 }, { "epoch": 10.549642136171775, "grad_norm": 0.015013803727924824, "learning_rate": 5.392766466931678e-06, "loss": 0.0624, "num_input_tokens_seen": 123981024, "step": 57485 }, { "epoch": 10.550559735731326, "grad_norm": 0.1424451470375061, "learning_rate": 5.391968179627317e-06, "loss": 0.0003, "num_input_tokens_seen": 123991616, "step": 57490 }, { "epoch": 10.55147733529088, "grad_norm": 0.0005306656239554286, "learning_rate": 5.391169882269579e-06, "loss": 0.0001, "num_input_tokens_seen": 124002656, "step": 57495 }, { "epoch": 10.552394934850431, "grad_norm": 0.9719078540802002, "learning_rate": 5.3903715748789395e-06, "loss": 0.001, "num_input_tokens_seen": 124013728, "step": 57500 }, { "epoch": 10.553312534409983, "grad_norm": 0.000587312038987875, "learning_rate": 5.389573257475874e-06, "loss": 0.0001, "num_input_tokens_seen": 124024224, "step": 57505 }, { "epoch": 10.554230133969536, "grad_norm": 0.07831466197967529, "learning_rate": 5.388774930080858e-06, "loss": 0.0001, "num_input_tokens_seen": 124034304, "step": 57510 }, { "epoch": 10.555147733529088, "grad_norm": 0.0013965568505227566, "learning_rate": 5.387976592714369e-06, "loss": 0.0001, "num_input_tokens_seen": 124045376, "step": 57515 }, { "epoch": 10.55606533308864, "grad_norm": 0.00023912226606626064, "learning_rate": 5.387178245396881e-06, "loss": 0.2482, "num_input_tokens_seen": 124056928, "step": 57520 }, { "epoch": 10.556982932648193, "grad_norm": 0.002311968244612217, "learning_rate": 5.386379888148871e-06, "loss": 0.0002, "num_input_tokens_seen": 124066688, "step": 57525 }, { "epoch": 10.557900532207745, "grad_norm": 0.0008910387987270951, "learning_rate": 5.3855815209908156e-06, "loss": 0.0009, "num_input_tokens_seen": 124077344, "step": 57530 }, { "epoch": 10.558818131767296, "grad_norm": 0.008400170132517815, "learning_rate": 5.384783143943191e-06, "loss": 0.0121, "num_input_tokens_seen": 124089120, "step": 57535 }, { "epoch": 10.55973573132685, "grad_norm": 0.0458393357694149, "learning_rate": 5.383984757026476e-06, "loss": 0.0, "num_input_tokens_seen": 124099808, "step": 57540 }, { "epoch": 10.560653330886401, "grad_norm": 0.005663696210831404, "learning_rate": 5.383186360261147e-06, "loss": 0.0452, "num_input_tokens_seen": 124110528, "step": 57545 }, { "epoch": 10.561570930445953, "grad_norm": 0.004029734060168266, "learning_rate": 5.3823879536676815e-06, "loss": 0.0, "num_input_tokens_seen": 124121120, "step": 57550 }, { "epoch": 10.562488530005506, "grad_norm": 0.004980696365237236, "learning_rate": 5.381589537266559e-06, "loss": 0.0002, "num_input_tokens_seen": 124133216, "step": 57555 }, { "epoch": 10.563406129565058, "grad_norm": 0.001848052372224629, "learning_rate": 5.380791111078253e-06, "loss": 0.0001, "num_input_tokens_seen": 124144224, "step": 57560 }, { "epoch": 10.56432372912461, "grad_norm": 0.0004260481509845704, "learning_rate": 5.3799926751232475e-06, "loss": 0.0001, "num_input_tokens_seen": 124154784, "step": 57565 }, { "epoch": 10.565241328684163, "grad_norm": 0.0027664380613714457, "learning_rate": 5.379194229422019e-06, "loss": 0.0646, "num_input_tokens_seen": 124165760, "step": 57570 }, { "epoch": 10.566158928243714, "grad_norm": 0.7426347136497498, "learning_rate": 5.378395773995047e-06, "loss": 0.0004, "num_input_tokens_seen": 124176704, "step": 57575 }, { "epoch": 10.567076527803266, "grad_norm": 0.00028647613362409174, "learning_rate": 5.377597308862807e-06, "loss": 0.0001, "num_input_tokens_seen": 124187232, "step": 57580 }, { "epoch": 10.56799412736282, "grad_norm": 0.01962665282189846, "learning_rate": 5.3767988340457835e-06, "loss": 0.0, "num_input_tokens_seen": 124199520, "step": 57585 }, { "epoch": 10.568911726922371, "grad_norm": 0.0020568373147398233, "learning_rate": 5.376000349564453e-06, "loss": 0.0, "num_input_tokens_seen": 124210976, "step": 57590 }, { "epoch": 10.569829326481923, "grad_norm": 0.02694302424788475, "learning_rate": 5.375201855439296e-06, "loss": 0.0001, "num_input_tokens_seen": 124221440, "step": 57595 }, { "epoch": 10.570746926041476, "grad_norm": 0.12313015013933182, "learning_rate": 5.374403351690795e-06, "loss": 0.5034, "num_input_tokens_seen": 124231648, "step": 57600 }, { "epoch": 10.571664525601028, "grad_norm": 0.00032335700234398246, "learning_rate": 5.373604838339426e-06, "loss": 0.0001, "num_input_tokens_seen": 124242560, "step": 57605 }, { "epoch": 10.57258212516058, "grad_norm": 0.002022137399762869, "learning_rate": 5.3728063154056735e-06, "loss": 0.0072, "num_input_tokens_seen": 124252992, "step": 57610 }, { "epoch": 10.573499724720133, "grad_norm": 27.063405990600586, "learning_rate": 5.372007782910015e-06, "loss": 0.2782, "num_input_tokens_seen": 124263936, "step": 57615 }, { "epoch": 10.574417324279684, "grad_norm": 0.0004718810087069869, "learning_rate": 5.371209240872934e-06, "loss": 0.0, "num_input_tokens_seen": 124273472, "step": 57620 }, { "epoch": 10.575334923839236, "grad_norm": 0.0013262153370305896, "learning_rate": 5.370410689314912e-06, "loss": 0.0001, "num_input_tokens_seen": 124283136, "step": 57625 }, { "epoch": 10.57625252339879, "grad_norm": 0.0034790022764354944, "learning_rate": 5.369612128256431e-06, "loss": 0.2256, "num_input_tokens_seen": 124294560, "step": 57630 }, { "epoch": 10.577170122958341, "grad_norm": 0.010167854838073254, "learning_rate": 5.368813557717969e-06, "loss": 0.0001, "num_input_tokens_seen": 124305024, "step": 57635 }, { "epoch": 10.578087722517893, "grad_norm": 0.0005197995342314243, "learning_rate": 5.368014977720013e-06, "loss": 0.0377, "num_input_tokens_seen": 124315808, "step": 57640 }, { "epoch": 10.579005322077446, "grad_norm": 0.0081626670435071, "learning_rate": 5.367216388283042e-06, "loss": 0.0001, "num_input_tokens_seen": 124328288, "step": 57645 }, { "epoch": 10.579922921636998, "grad_norm": 0.0063428268767893314, "learning_rate": 5.366417789427541e-06, "loss": 0.0006, "num_input_tokens_seen": 124340320, "step": 57650 }, { "epoch": 10.58084052119655, "grad_norm": 0.03758354112505913, "learning_rate": 5.36561918117399e-06, "loss": 0.1814, "num_input_tokens_seen": 124352128, "step": 57655 }, { "epoch": 10.581758120756103, "grad_norm": 0.025536201894283295, "learning_rate": 5.364820563542875e-06, "loss": 0.0001, "num_input_tokens_seen": 124363648, "step": 57660 }, { "epoch": 10.582675720315654, "grad_norm": 0.00115706876385957, "learning_rate": 5.364021936554678e-06, "loss": 0.0, "num_input_tokens_seen": 124375040, "step": 57665 }, { "epoch": 10.583593319875206, "grad_norm": 0.06211269274353981, "learning_rate": 5.3632233002298805e-06, "loss": 0.0003, "num_input_tokens_seen": 124384288, "step": 57670 }, { "epoch": 10.58451091943476, "grad_norm": 27.840444564819336, "learning_rate": 5.36242465458897e-06, "loss": 0.0335, "num_input_tokens_seen": 124395168, "step": 57675 }, { "epoch": 10.585428518994311, "grad_norm": 0.0005796350887976587, "learning_rate": 5.361625999652429e-06, "loss": 0.0004, "num_input_tokens_seen": 124405216, "step": 57680 }, { "epoch": 10.586346118553863, "grad_norm": 1.0957629680633545, "learning_rate": 5.360827335440741e-06, "loss": 0.0011, "num_input_tokens_seen": 124416832, "step": 57685 }, { "epoch": 10.587263718113416, "grad_norm": 0.01401449553668499, "learning_rate": 5.360028661974391e-06, "loss": 0.0, "num_input_tokens_seen": 124426528, "step": 57690 }, { "epoch": 10.588181317672968, "grad_norm": 0.02795625850558281, "learning_rate": 5.359229979273863e-06, "loss": 0.0001, "num_input_tokens_seen": 124437120, "step": 57695 }, { "epoch": 10.58909891723252, "grad_norm": 0.0029601682908833027, "learning_rate": 5.358431287359646e-06, "loss": 0.0, "num_input_tokens_seen": 124448960, "step": 57700 }, { "epoch": 10.590016516792073, "grad_norm": 0.002115743001922965, "learning_rate": 5.357632586252218e-06, "loss": 0.0003, "num_input_tokens_seen": 124459520, "step": 57705 }, { "epoch": 10.590934116351624, "grad_norm": 22.078845977783203, "learning_rate": 5.356833875972071e-06, "loss": 0.119, "num_input_tokens_seen": 124470272, "step": 57710 }, { "epoch": 10.591851715911176, "grad_norm": 0.002982678357511759, "learning_rate": 5.356035156539687e-06, "loss": 0.0071, "num_input_tokens_seen": 124480480, "step": 57715 }, { "epoch": 10.59276931547073, "grad_norm": 0.0012105229543522, "learning_rate": 5.355236427975553e-06, "loss": 0.033, "num_input_tokens_seen": 124491040, "step": 57720 }, { "epoch": 10.59368691503028, "grad_norm": 0.0038246922194957733, "learning_rate": 5.354437690300156e-06, "loss": 0.0001, "num_input_tokens_seen": 124502208, "step": 57725 }, { "epoch": 10.594604514589832, "grad_norm": 0.12305323034524918, "learning_rate": 5.353638943533982e-06, "loss": 0.0002, "num_input_tokens_seen": 124513376, "step": 57730 }, { "epoch": 10.595522114149386, "grad_norm": 0.025116153061389923, "learning_rate": 5.3528401876975155e-06, "loss": 0.0079, "num_input_tokens_seen": 124524928, "step": 57735 }, { "epoch": 10.596439713708937, "grad_norm": 0.00794184859842062, "learning_rate": 5.352041422811247e-06, "loss": 0.0096, "num_input_tokens_seen": 124535872, "step": 57740 }, { "epoch": 10.597357313268489, "grad_norm": 0.004647656809538603, "learning_rate": 5.3512426488956605e-06, "loss": 0.0015, "num_input_tokens_seen": 124545376, "step": 57745 }, { "epoch": 10.598274912828042, "grad_norm": 0.003362169023603201, "learning_rate": 5.350443865971244e-06, "loss": 0.0041, "num_input_tokens_seen": 124554912, "step": 57750 }, { "epoch": 10.599192512387594, "grad_norm": 0.0004049093113280833, "learning_rate": 5.349645074058486e-06, "loss": 0.0001, "num_input_tokens_seen": 124564160, "step": 57755 }, { "epoch": 10.600110111947146, "grad_norm": 0.010112898424267769, "learning_rate": 5.348846273177874e-06, "loss": 0.0002, "num_input_tokens_seen": 124575168, "step": 57760 }, { "epoch": 10.601027711506699, "grad_norm": 0.02518460527062416, "learning_rate": 5.348047463349896e-06, "loss": 0.0001, "num_input_tokens_seen": 124585216, "step": 57765 }, { "epoch": 10.60194531106625, "grad_norm": 13.305617332458496, "learning_rate": 5.34724864459504e-06, "loss": 0.0667, "num_input_tokens_seen": 124596448, "step": 57770 }, { "epoch": 10.602862910625802, "grad_norm": 33.847900390625, "learning_rate": 5.346449816933793e-06, "loss": 0.0934, "num_input_tokens_seen": 124607648, "step": 57775 }, { "epoch": 10.603780510185356, "grad_norm": 0.3356568515300751, "learning_rate": 5.345650980386645e-06, "loss": 0.0045, "num_input_tokens_seen": 124618624, "step": 57780 }, { "epoch": 10.604698109744907, "grad_norm": 0.0019935821183025837, "learning_rate": 5.344852134974087e-06, "loss": 0.0003, "num_input_tokens_seen": 124629856, "step": 57785 }, { "epoch": 10.605615709304459, "grad_norm": 0.5592734813690186, "learning_rate": 5.344053280716604e-06, "loss": 0.198, "num_input_tokens_seen": 124641216, "step": 57790 }, { "epoch": 10.606533308864012, "grad_norm": 85.52880859375, "learning_rate": 5.34325441763469e-06, "loss": 0.0914, "num_input_tokens_seen": 124652288, "step": 57795 }, { "epoch": 10.607450908423564, "grad_norm": 0.35328909754753113, "learning_rate": 5.3424555457488314e-06, "loss": 0.1595, "num_input_tokens_seen": 124663264, "step": 57800 }, { "epoch": 10.608368507983116, "grad_norm": 0.002292350400239229, "learning_rate": 5.341656665079518e-06, "loss": 0.0001, "num_input_tokens_seen": 124674240, "step": 57805 }, { "epoch": 10.609286107542669, "grad_norm": 0.0003393018851056695, "learning_rate": 5.340857775647241e-06, "loss": 0.0, "num_input_tokens_seen": 124686400, "step": 57810 }, { "epoch": 10.61020370710222, "grad_norm": 0.004175659269094467, "learning_rate": 5.340058877472491e-06, "loss": 0.1346, "num_input_tokens_seen": 124696864, "step": 57815 }, { "epoch": 10.611121306661772, "grad_norm": 0.009338682517409325, "learning_rate": 5.339259970575757e-06, "loss": 0.0, "num_input_tokens_seen": 124707136, "step": 57820 }, { "epoch": 10.612038906221326, "grad_norm": 0.0016085569513961673, "learning_rate": 5.33846105497753e-06, "loss": 0.0001, "num_input_tokens_seen": 124717600, "step": 57825 }, { "epoch": 10.612956505780877, "grad_norm": 0.0012095385463908315, "learning_rate": 5.337662130698303e-06, "loss": 0.0001, "num_input_tokens_seen": 124727968, "step": 57830 }, { "epoch": 10.613874105340429, "grad_norm": 0.011947118677198887, "learning_rate": 5.336863197758565e-06, "loss": 0.0008, "num_input_tokens_seen": 124738368, "step": 57835 }, { "epoch": 10.614791704899982, "grad_norm": 0.013074925169348717, "learning_rate": 5.336064256178809e-06, "loss": 0.0008, "num_input_tokens_seen": 124748960, "step": 57840 }, { "epoch": 10.615709304459534, "grad_norm": 0.0070417411625385284, "learning_rate": 5.335265305979524e-06, "loss": 0.0, "num_input_tokens_seen": 124760256, "step": 57845 }, { "epoch": 10.616626904019085, "grad_norm": 0.022805705666542053, "learning_rate": 5.334466347181205e-06, "loss": 0.0001, "num_input_tokens_seen": 124770688, "step": 57850 }, { "epoch": 10.617544503578639, "grad_norm": 0.003493680153042078, "learning_rate": 5.3336673798043414e-06, "loss": 0.0045, "num_input_tokens_seen": 124779232, "step": 57855 }, { "epoch": 10.61846210313819, "grad_norm": 0.0008116339449770749, "learning_rate": 5.332868403869428e-06, "loss": 0.0001, "num_input_tokens_seen": 124790752, "step": 57860 }, { "epoch": 10.619379702697742, "grad_norm": 0.4592723548412323, "learning_rate": 5.332069419396955e-06, "loss": 0.0002, "num_input_tokens_seen": 124802592, "step": 57865 }, { "epoch": 10.620297302257296, "grad_norm": 9.396529197692871, "learning_rate": 5.331270426407416e-06, "loss": 0.0023, "num_input_tokens_seen": 124813952, "step": 57870 }, { "epoch": 10.621214901816847, "grad_norm": 0.0020626552868634462, "learning_rate": 5.330471424921304e-06, "loss": 0.0, "num_input_tokens_seen": 124826240, "step": 57875 }, { "epoch": 10.622132501376399, "grad_norm": 0.01108589582145214, "learning_rate": 5.329672414959112e-06, "loss": 0.1284, "num_input_tokens_seen": 124838848, "step": 57880 }, { "epoch": 10.623050100935952, "grad_norm": 0.000485140917589888, "learning_rate": 5.328873396541334e-06, "loss": 0.0, "num_input_tokens_seen": 124849920, "step": 57885 }, { "epoch": 10.623967700495504, "grad_norm": 0.0034888696391135454, "learning_rate": 5.328074369688463e-06, "loss": 0.0, "num_input_tokens_seen": 124858752, "step": 57890 }, { "epoch": 10.624885300055055, "grad_norm": 0.004047724884003401, "learning_rate": 5.327275334420993e-06, "loss": 0.0, "num_input_tokens_seen": 124868992, "step": 57895 }, { "epoch": 10.625802899614609, "grad_norm": 8.047324180603027, "learning_rate": 5.326476290759417e-06, "loss": 0.0941, "num_input_tokens_seen": 124879168, "step": 57900 }, { "epoch": 10.62672049917416, "grad_norm": 0.00524053443223238, "learning_rate": 5.325677238724231e-06, "loss": 0.0174, "num_input_tokens_seen": 124889568, "step": 57905 }, { "epoch": 10.627638098733712, "grad_norm": 104.52581787109375, "learning_rate": 5.324878178335928e-06, "loss": 0.0739, "num_input_tokens_seen": 124902240, "step": 57910 }, { "epoch": 10.628555698293265, "grad_norm": 0.24999050796031952, "learning_rate": 5.324079109615003e-06, "loss": 0.0007, "num_input_tokens_seen": 124912640, "step": 57915 }, { "epoch": 10.629473297852817, "grad_norm": 0.006462705321609974, "learning_rate": 5.323280032581952e-06, "loss": 0.0003, "num_input_tokens_seen": 124923520, "step": 57920 }, { "epoch": 10.630390897412369, "grad_norm": 0.00963092315942049, "learning_rate": 5.322480947257269e-06, "loss": 0.0002, "num_input_tokens_seen": 124933504, "step": 57925 }, { "epoch": 10.631308496971922, "grad_norm": 19.364376068115234, "learning_rate": 5.32168185366145e-06, "loss": 0.1543, "num_input_tokens_seen": 124943968, "step": 57930 }, { "epoch": 10.632226096531474, "grad_norm": 14.975984573364258, "learning_rate": 5.320882751814989e-06, "loss": 0.0309, "num_input_tokens_seen": 124954976, "step": 57935 }, { "epoch": 10.633143696091025, "grad_norm": 0.022330963984131813, "learning_rate": 5.320083641738383e-06, "loss": 0.1036, "num_input_tokens_seen": 124966176, "step": 57940 }, { "epoch": 10.634061295650579, "grad_norm": 168.33692932128906, "learning_rate": 5.319284523452128e-06, "loss": 0.0229, "num_input_tokens_seen": 124976960, "step": 57945 }, { "epoch": 10.63497889521013, "grad_norm": 0.024979786947369576, "learning_rate": 5.31848539697672e-06, "loss": 0.0855, "num_input_tokens_seen": 124988160, "step": 57950 }, { "epoch": 10.635896494769682, "grad_norm": 0.09147960692644119, "learning_rate": 5.3176862623326555e-06, "loss": 0.0002, "num_input_tokens_seen": 124997984, "step": 57955 }, { "epoch": 10.636814094329235, "grad_norm": 0.18015819787979126, "learning_rate": 5.31688711954043e-06, "loss": 0.0686, "num_input_tokens_seen": 125009024, "step": 57960 }, { "epoch": 10.637731693888787, "grad_norm": 0.0063370899297297, "learning_rate": 5.31608796862054e-06, "loss": 0.0005, "num_input_tokens_seen": 125020384, "step": 57965 }, { "epoch": 10.638649293448339, "grad_norm": 29.28031349182129, "learning_rate": 5.3152888095934865e-06, "loss": 0.2124, "num_input_tokens_seen": 125030592, "step": 57970 }, { "epoch": 10.639566893007892, "grad_norm": 1.1747323274612427, "learning_rate": 5.314489642479761e-06, "loss": 0.0015, "num_input_tokens_seen": 125043072, "step": 57975 }, { "epoch": 10.640484492567444, "grad_norm": 1.534368634223938, "learning_rate": 5.313690467299865e-06, "loss": 0.0014, "num_input_tokens_seen": 125055104, "step": 57980 }, { "epoch": 10.641402092126995, "grad_norm": 0.006251728627830744, "learning_rate": 5.312891284074293e-06, "loss": 0.0002, "num_input_tokens_seen": 125066208, "step": 57985 }, { "epoch": 10.642319691686549, "grad_norm": 0.005798271391540766, "learning_rate": 5.312092092823546e-06, "loss": 0.0001, "num_input_tokens_seen": 125077024, "step": 57990 }, { "epoch": 10.6432372912461, "grad_norm": 0.0003901594609487802, "learning_rate": 5.311292893568119e-06, "loss": 0.0, "num_input_tokens_seen": 125087648, "step": 57995 }, { "epoch": 10.644154890805652, "grad_norm": 0.038544561713933945, "learning_rate": 5.310493686328513e-06, "loss": 0.0013, "num_input_tokens_seen": 125097504, "step": 58000 }, { "epoch": 10.645072490365205, "grad_norm": 0.10640155524015427, "learning_rate": 5.3096944711252255e-06, "loss": 0.0003, "num_input_tokens_seen": 125108032, "step": 58005 }, { "epoch": 10.645990089924757, "grad_norm": 0.04410857334733009, "learning_rate": 5.308895247978754e-06, "loss": 0.0007, "num_input_tokens_seen": 125119264, "step": 58010 }, { "epoch": 10.646907689484308, "grad_norm": 0.0017462698742747307, "learning_rate": 5.308096016909597e-06, "loss": 0.0452, "num_input_tokens_seen": 125130720, "step": 58015 }, { "epoch": 10.647825289043862, "grad_norm": 0.008676531724631786, "learning_rate": 5.307296777938258e-06, "loss": 0.1099, "num_input_tokens_seen": 125142240, "step": 58020 }, { "epoch": 10.648742888603413, "grad_norm": 0.005506538320332766, "learning_rate": 5.30649753108523e-06, "loss": 0.1222, "num_input_tokens_seen": 125152352, "step": 58025 }, { "epoch": 10.649660488162965, "grad_norm": 0.025712160393595695, "learning_rate": 5.305698276371017e-06, "loss": 0.0002, "num_input_tokens_seen": 125163744, "step": 58030 }, { "epoch": 10.650578087722518, "grad_norm": 0.0049699037335813046, "learning_rate": 5.304899013816116e-06, "loss": 0.0226, "num_input_tokens_seen": 125174048, "step": 58035 }, { "epoch": 10.65149568728207, "grad_norm": 0.0012739519588649273, "learning_rate": 5.304099743441029e-06, "loss": 0.172, "num_input_tokens_seen": 125184736, "step": 58040 }, { "epoch": 10.652413286841622, "grad_norm": 19.89533805847168, "learning_rate": 5.303300465266254e-06, "loss": 0.1752, "num_input_tokens_seen": 125194464, "step": 58045 }, { "epoch": 10.653330886401175, "grad_norm": 0.025440655648708344, "learning_rate": 5.3025011793122915e-06, "loss": 0.0001, "num_input_tokens_seen": 125206112, "step": 58050 }, { "epoch": 10.654248485960727, "grad_norm": 0.0033899396657943726, "learning_rate": 5.301701885599644e-06, "loss": 0.0004, "num_input_tokens_seen": 125217568, "step": 58055 }, { "epoch": 10.655166085520278, "grad_norm": 0.0026589774060994387, "learning_rate": 5.3009025841488105e-06, "loss": 0.0008, "num_input_tokens_seen": 125229216, "step": 58060 }, { "epoch": 10.656083685079832, "grad_norm": 40.0589599609375, "learning_rate": 5.300103274980291e-06, "loss": 0.1133, "num_input_tokens_seen": 125240352, "step": 58065 }, { "epoch": 10.657001284639383, "grad_norm": 0.0010236718226224184, "learning_rate": 5.299303958114589e-06, "loss": 0.0001, "num_input_tokens_seen": 125250752, "step": 58070 }, { "epoch": 10.657918884198935, "grad_norm": 0.10554621368646622, "learning_rate": 5.2985046335722025e-06, "loss": 0.0006, "num_input_tokens_seen": 125261856, "step": 58075 }, { "epoch": 10.658836483758488, "grad_norm": 0.11777748912572861, "learning_rate": 5.297705301373637e-06, "loss": 0.0002, "num_input_tokens_seen": 125272672, "step": 58080 }, { "epoch": 10.65975408331804, "grad_norm": 0.004116882104426622, "learning_rate": 5.2969059615393906e-06, "loss": 0.0006, "num_input_tokens_seen": 125282656, "step": 58085 }, { "epoch": 10.660671682877592, "grad_norm": 0.015386545099318027, "learning_rate": 5.296106614089966e-06, "loss": 0.0006, "num_input_tokens_seen": 125293632, "step": 58090 }, { "epoch": 10.661589282437145, "grad_norm": 0.0014910651370882988, "learning_rate": 5.295307259045866e-06, "loss": 0.0001, "num_input_tokens_seen": 125304864, "step": 58095 }, { "epoch": 10.662506881996697, "grad_norm": 0.0063690743409097195, "learning_rate": 5.294507896427593e-06, "loss": 0.0589, "num_input_tokens_seen": 125316000, "step": 58100 }, { "epoch": 10.663424481556248, "grad_norm": 1.1732085943222046, "learning_rate": 5.2937085262556486e-06, "loss": 0.0009, "num_input_tokens_seen": 125326176, "step": 58105 }, { "epoch": 10.664342081115802, "grad_norm": 0.07097826153039932, "learning_rate": 5.292909148550535e-06, "loss": 0.0001, "num_input_tokens_seen": 125338208, "step": 58110 }, { "epoch": 10.665259680675353, "grad_norm": 0.025176528841257095, "learning_rate": 5.292109763332758e-06, "loss": 0.0001, "num_input_tokens_seen": 125348480, "step": 58115 }, { "epoch": 10.666177280234905, "grad_norm": 0.41046610474586487, "learning_rate": 5.291310370622816e-06, "loss": 0.0113, "num_input_tokens_seen": 125359456, "step": 58120 }, { "epoch": 10.667094879794458, "grad_norm": 5.0005645751953125, "learning_rate": 5.2905109704412146e-06, "loss": 0.0051, "num_input_tokens_seen": 125369408, "step": 58125 }, { "epoch": 10.66801247935401, "grad_norm": 224.11488342285156, "learning_rate": 5.28971156280846e-06, "loss": 0.2227, "num_input_tokens_seen": 125380032, "step": 58130 }, { "epoch": 10.668930078913561, "grad_norm": 0.0003730154421646148, "learning_rate": 5.288912147745049e-06, "loss": 0.0001, "num_input_tokens_seen": 125391360, "step": 58135 }, { "epoch": 10.669847678473115, "grad_norm": 9.346899032592773, "learning_rate": 5.2881127252714916e-06, "loss": 0.0549, "num_input_tokens_seen": 125402816, "step": 58140 }, { "epoch": 10.670765278032667, "grad_norm": 9.976438522338867, "learning_rate": 5.2873132954082875e-06, "loss": 0.0064, "num_input_tokens_seen": 125413600, "step": 58145 }, { "epoch": 10.671682877592218, "grad_norm": 0.006409577559679747, "learning_rate": 5.286513858175943e-06, "loss": 0.0005, "num_input_tokens_seen": 125423296, "step": 58150 }, { "epoch": 10.672600477151772, "grad_norm": 0.0018664764938876033, "learning_rate": 5.285714413594963e-06, "loss": 0.0001, "num_input_tokens_seen": 125434496, "step": 58155 }, { "epoch": 10.673518076711323, "grad_norm": 0.003631236730143428, "learning_rate": 5.284914961685852e-06, "loss": 0.1036, "num_input_tokens_seen": 125445664, "step": 58160 }, { "epoch": 10.674435676270875, "grad_norm": 0.000430965592386201, "learning_rate": 5.284115502469113e-06, "loss": 0.0001, "num_input_tokens_seen": 125457728, "step": 58165 }, { "epoch": 10.675353275830428, "grad_norm": 0.0031876841094344854, "learning_rate": 5.283316035965251e-06, "loss": 0.1626, "num_input_tokens_seen": 125468704, "step": 58170 }, { "epoch": 10.67627087538998, "grad_norm": 0.002353198593482375, "learning_rate": 5.282516562194773e-06, "loss": 0.0001, "num_input_tokens_seen": 125480224, "step": 58175 }, { "epoch": 10.677188474949531, "grad_norm": 0.0018566020298749208, "learning_rate": 5.281717081178183e-06, "loss": 0.0008, "num_input_tokens_seen": 125490720, "step": 58180 }, { "epoch": 10.678106074509085, "grad_norm": 0.0005623122560791671, "learning_rate": 5.280917592935985e-06, "loss": 0.0001, "num_input_tokens_seen": 125500576, "step": 58185 }, { "epoch": 10.679023674068636, "grad_norm": 0.03501443937420845, "learning_rate": 5.280118097488687e-06, "loss": 0.0001, "num_input_tokens_seen": 125511552, "step": 58190 }, { "epoch": 10.679941273628188, "grad_norm": 0.0005697833257727325, "learning_rate": 5.279318594856792e-06, "loss": 0.0001, "num_input_tokens_seen": 125521920, "step": 58195 }, { "epoch": 10.680858873187741, "grad_norm": 0.0027721920050680637, "learning_rate": 5.278519085060811e-06, "loss": 0.0451, "num_input_tokens_seen": 125532832, "step": 58200 }, { "epoch": 10.681776472747293, "grad_norm": 0.0005732340505346656, "learning_rate": 5.277719568121245e-06, "loss": 0.0001, "num_input_tokens_seen": 125544000, "step": 58205 }, { "epoch": 10.682694072306845, "grad_norm": 0.025604059919714928, "learning_rate": 5.276920044058603e-06, "loss": 0.0001, "num_input_tokens_seen": 125555584, "step": 58210 }, { "epoch": 10.683611671866398, "grad_norm": 0.005470083560794592, "learning_rate": 5.276120512893392e-06, "loss": 0.0001, "num_input_tokens_seen": 125565696, "step": 58215 }, { "epoch": 10.68452927142595, "grad_norm": 0.004279912915080786, "learning_rate": 5.275320974646118e-06, "loss": 0.0059, "num_input_tokens_seen": 125576480, "step": 58220 }, { "epoch": 10.685446870985501, "grad_norm": 0.0017691125394776464, "learning_rate": 5.2745214293372874e-06, "loss": 0.0002, "num_input_tokens_seen": 125587264, "step": 58225 }, { "epoch": 10.686364470545055, "grad_norm": 12.981062889099121, "learning_rate": 5.273721876987405e-06, "loss": 0.0207, "num_input_tokens_seen": 125597344, "step": 58230 }, { "epoch": 10.687282070104606, "grad_norm": 0.0006161633646115661, "learning_rate": 5.272922317616983e-06, "loss": 0.0, "num_input_tokens_seen": 125608128, "step": 58235 }, { "epoch": 10.688199669664158, "grad_norm": 0.0007111284648999572, "learning_rate": 5.272122751246526e-06, "loss": 0.0001, "num_input_tokens_seen": 125619616, "step": 58240 }, { "epoch": 10.689117269223711, "grad_norm": 0.011719376780092716, "learning_rate": 5.271323177896543e-06, "loss": 0.0, "num_input_tokens_seen": 125629984, "step": 58245 }, { "epoch": 10.690034868783263, "grad_norm": 0.06410378962755203, "learning_rate": 5.2705235975875416e-06, "loss": 0.0041, "num_input_tokens_seen": 125641056, "step": 58250 }, { "epoch": 10.690952468342815, "grad_norm": 0.002806782955303788, "learning_rate": 5.269724010340027e-06, "loss": 0.0002, "num_input_tokens_seen": 125652096, "step": 58255 }, { "epoch": 10.691870067902368, "grad_norm": 3.7227699756622314, "learning_rate": 5.26892441617451e-06, "loss": 0.0011, "num_input_tokens_seen": 125663232, "step": 58260 }, { "epoch": 10.69278766746192, "grad_norm": 0.0022643443662673235, "learning_rate": 5.2681248151115e-06, "loss": 0.0003, "num_input_tokens_seen": 125674656, "step": 58265 }, { "epoch": 10.693705267021471, "grad_norm": 0.019430609419941902, "learning_rate": 5.267325207171504e-06, "loss": 0.0001, "num_input_tokens_seen": 125685952, "step": 58270 }, { "epoch": 10.694622866581025, "grad_norm": 0.03015247732400894, "learning_rate": 5.266525592375031e-06, "loss": 0.0001, "num_input_tokens_seen": 125697088, "step": 58275 }, { "epoch": 10.695540466140576, "grad_norm": 0.01039864867925644, "learning_rate": 5.265725970742588e-06, "loss": 0.0, "num_input_tokens_seen": 125709536, "step": 58280 }, { "epoch": 10.696458065700128, "grad_norm": 0.165621817111969, "learning_rate": 5.264926342294686e-06, "loss": 0.0001, "num_input_tokens_seen": 125720352, "step": 58285 }, { "epoch": 10.697375665259681, "grad_norm": 0.0068964832462370396, "learning_rate": 5.264126707051836e-06, "loss": 0.0, "num_input_tokens_seen": 125731936, "step": 58290 }, { "epoch": 10.698293264819233, "grad_norm": 0.010663437657058239, "learning_rate": 5.263327065034542e-06, "loss": 0.0, "num_input_tokens_seen": 125744320, "step": 58295 }, { "epoch": 10.699210864378784, "grad_norm": 0.0013323636958375573, "learning_rate": 5.262527416263319e-06, "loss": 0.0, "num_input_tokens_seen": 125754304, "step": 58300 }, { "epoch": 10.700128463938338, "grad_norm": 0.0006439979188144207, "learning_rate": 5.261727760758674e-06, "loss": 0.0, "num_input_tokens_seen": 125763712, "step": 58305 }, { "epoch": 10.70104606349789, "grad_norm": 0.5618327856063843, "learning_rate": 5.260928098541117e-06, "loss": 0.0007, "num_input_tokens_seen": 125774528, "step": 58310 }, { "epoch": 10.701963663057441, "grad_norm": 0.00023414738825522363, "learning_rate": 5.260128429631159e-06, "loss": 0.0002, "num_input_tokens_seen": 125785632, "step": 58315 }, { "epoch": 10.702881262616994, "grad_norm": 0.001300868229009211, "learning_rate": 5.259328754049311e-06, "loss": 0.0207, "num_input_tokens_seen": 125796448, "step": 58320 }, { "epoch": 10.703798862176546, "grad_norm": 0.09461738914251328, "learning_rate": 5.258529071816082e-06, "loss": 0.1908, "num_input_tokens_seen": 125808800, "step": 58325 }, { "epoch": 10.704716461736098, "grad_norm": 0.06237082555890083, "learning_rate": 5.257729382951983e-06, "loss": 0.0001, "num_input_tokens_seen": 125820352, "step": 58330 }, { "epoch": 10.705634061295651, "grad_norm": 0.0005582842277362943, "learning_rate": 5.256929687477524e-06, "loss": 0.0, "num_input_tokens_seen": 125831680, "step": 58335 }, { "epoch": 10.706551660855203, "grad_norm": 0.03531614691019058, "learning_rate": 5.256129985413218e-06, "loss": 0.0823, "num_input_tokens_seen": 125842272, "step": 58340 }, { "epoch": 10.707469260414754, "grad_norm": 0.007818707264959812, "learning_rate": 5.255330276779572e-06, "loss": 0.0004, "num_input_tokens_seen": 125853952, "step": 58345 }, { "epoch": 10.708386859974308, "grad_norm": 0.0009439402492716908, "learning_rate": 5.254530561597103e-06, "loss": 0.0, "num_input_tokens_seen": 125866528, "step": 58350 }, { "epoch": 10.70930445953386, "grad_norm": 0.003958362154662609, "learning_rate": 5.253730839886318e-06, "loss": 0.0001, "num_input_tokens_seen": 125877120, "step": 58355 }, { "epoch": 10.710222059093411, "grad_norm": 0.08171305805444717, "learning_rate": 5.252931111667731e-06, "loss": 0.0285, "num_input_tokens_seen": 125887392, "step": 58360 }, { "epoch": 10.711139658652964, "grad_norm": 0.020822178572416306, "learning_rate": 5.252131376961853e-06, "loss": 0.0, "num_input_tokens_seen": 125898848, "step": 58365 }, { "epoch": 10.712057258212516, "grad_norm": 0.0002798069326672703, "learning_rate": 5.251331635789196e-06, "loss": 0.0, "num_input_tokens_seen": 125909568, "step": 58370 }, { "epoch": 10.712974857772068, "grad_norm": 0.0881732776761055, "learning_rate": 5.250531888170273e-06, "loss": 0.1221, "num_input_tokens_seen": 125920800, "step": 58375 }, { "epoch": 10.713892457331621, "grad_norm": 0.07574660331010818, "learning_rate": 5.2497321341255944e-06, "loss": 0.0002, "num_input_tokens_seen": 125932320, "step": 58380 }, { "epoch": 10.714810056891173, "grad_norm": 0.00086684845155105, "learning_rate": 5.248932373675673e-06, "loss": 0.0001, "num_input_tokens_seen": 125941984, "step": 58385 }, { "epoch": 10.715727656450724, "grad_norm": 0.013392284512519836, "learning_rate": 5.248132606841023e-06, "loss": 0.2469, "num_input_tokens_seen": 125953632, "step": 58390 }, { "epoch": 10.716645256010278, "grad_norm": 0.0003126736846752465, "learning_rate": 5.247332833642156e-06, "loss": 0.0, "num_input_tokens_seen": 125964352, "step": 58395 }, { "epoch": 10.71756285556983, "grad_norm": 0.015698976814746857, "learning_rate": 5.246533054099585e-06, "loss": 0.1745, "num_input_tokens_seen": 125975456, "step": 58400 }, { "epoch": 10.71848045512938, "grad_norm": 45.60230255126953, "learning_rate": 5.245733268233822e-06, "loss": 0.147, "num_input_tokens_seen": 125987712, "step": 58405 }, { "epoch": 10.719398054688934, "grad_norm": 0.4909600615501404, "learning_rate": 5.244933476065384e-06, "loss": 0.0002, "num_input_tokens_seen": 125998080, "step": 58410 }, { "epoch": 10.720315654248486, "grad_norm": 0.033467549830675125, "learning_rate": 5.24413367761478e-06, "loss": 0.0003, "num_input_tokens_seen": 126010144, "step": 58415 }, { "epoch": 10.721233253808037, "grad_norm": 0.0005284320213831961, "learning_rate": 5.243333872902527e-06, "loss": 0.0003, "num_input_tokens_seen": 126020288, "step": 58420 }, { "epoch": 10.72215085336759, "grad_norm": 0.0031488100066781044, "learning_rate": 5.242534061949136e-06, "loss": 0.0051, "num_input_tokens_seen": 126030592, "step": 58425 }, { "epoch": 10.723068452927143, "grad_norm": 0.0007419079192914069, "learning_rate": 5.241734244775122e-06, "loss": 0.0, "num_input_tokens_seen": 126042048, "step": 58430 }, { "epoch": 10.723986052486694, "grad_norm": 0.001278952113352716, "learning_rate": 5.240934421401e-06, "loss": 0.1252, "num_input_tokens_seen": 126053056, "step": 58435 }, { "epoch": 10.724903652046248, "grad_norm": 0.02145613357424736, "learning_rate": 5.2401345918472835e-06, "loss": 0.0532, "num_input_tokens_seen": 126064064, "step": 58440 }, { "epoch": 10.7258212516058, "grad_norm": 0.12406923621892929, "learning_rate": 5.239334756134486e-06, "loss": 0.0003, "num_input_tokens_seen": 126074592, "step": 58445 }, { "epoch": 10.72673885116535, "grad_norm": 0.03644629195332527, "learning_rate": 5.238534914283125e-06, "loss": 0.0003, "num_input_tokens_seen": 126084864, "step": 58450 }, { "epoch": 10.727656450724904, "grad_norm": 0.4041573703289032, "learning_rate": 5.237735066313712e-06, "loss": 0.1819, "num_input_tokens_seen": 126094272, "step": 58455 }, { "epoch": 10.728574050284456, "grad_norm": 0.020478028804063797, "learning_rate": 5.236935212246763e-06, "loss": 0.0001, "num_input_tokens_seen": 126103296, "step": 58460 }, { "epoch": 10.729491649844007, "grad_norm": 0.018395137041807175, "learning_rate": 5.236135352102793e-06, "loss": 0.059, "num_input_tokens_seen": 126114080, "step": 58465 }, { "epoch": 10.73040924940356, "grad_norm": 0.0023679460864514112, "learning_rate": 5.235335485902317e-06, "loss": 0.0011, "num_input_tokens_seen": 126125152, "step": 58470 }, { "epoch": 10.731326848963112, "grad_norm": 0.010876458138227463, "learning_rate": 5.23453561366585e-06, "loss": 0.0, "num_input_tokens_seen": 126134368, "step": 58475 }, { "epoch": 10.732244448522664, "grad_norm": 0.01232299767434597, "learning_rate": 5.233735735413909e-06, "loss": 0.0006, "num_input_tokens_seen": 126144896, "step": 58480 }, { "epoch": 10.733162048082217, "grad_norm": 0.0018460429273545742, "learning_rate": 5.232935851167008e-06, "loss": 0.0945, "num_input_tokens_seen": 126154080, "step": 58485 }, { "epoch": 10.734079647641769, "grad_norm": 0.010063409805297852, "learning_rate": 5.232135960945664e-06, "loss": 0.0002, "num_input_tokens_seen": 126164832, "step": 58490 }, { "epoch": 10.73499724720132, "grad_norm": 0.42904025316238403, "learning_rate": 5.231336064770392e-06, "loss": 0.0005, "num_input_tokens_seen": 126176896, "step": 58495 }, { "epoch": 10.735914846760874, "grad_norm": 0.006374920718371868, "learning_rate": 5.2305361626617104e-06, "loss": 0.1228, "num_input_tokens_seen": 126188320, "step": 58500 }, { "epoch": 10.736832446320426, "grad_norm": 0.018802214413881302, "learning_rate": 5.229736254640131e-06, "loss": 0.0002, "num_input_tokens_seen": 126198208, "step": 58505 }, { "epoch": 10.737750045879977, "grad_norm": 0.0034532323479652405, "learning_rate": 5.228936340726174e-06, "loss": 0.0, "num_input_tokens_seen": 126208992, "step": 58510 }, { "epoch": 10.73866764543953, "grad_norm": 0.012645397335290909, "learning_rate": 5.228136420940353e-06, "loss": 0.002, "num_input_tokens_seen": 126219936, "step": 58515 }, { "epoch": 10.739585244999082, "grad_norm": 0.017052356153726578, "learning_rate": 5.227336495303188e-06, "loss": 0.0001, "num_input_tokens_seen": 126230144, "step": 58520 }, { "epoch": 10.740502844558634, "grad_norm": 0.22618937492370605, "learning_rate": 5.2265365638351936e-06, "loss": 0.0002, "num_input_tokens_seen": 126241312, "step": 58525 }, { "epoch": 10.741420444118187, "grad_norm": 0.0010970128932967782, "learning_rate": 5.225736626556888e-06, "loss": 0.0001, "num_input_tokens_seen": 126250944, "step": 58530 }, { "epoch": 10.742338043677739, "grad_norm": 0.002232123166322708, "learning_rate": 5.224936683488787e-06, "loss": 0.1185, "num_input_tokens_seen": 126260640, "step": 58535 }, { "epoch": 10.74325564323729, "grad_norm": 0.4353817403316498, "learning_rate": 5.224136734651409e-06, "loss": 0.0379, "num_input_tokens_seen": 126271104, "step": 58540 }, { "epoch": 10.744173242796844, "grad_norm": 0.006848420947790146, "learning_rate": 5.223336780065271e-06, "loss": 0.0001, "num_input_tokens_seen": 126282048, "step": 58545 }, { "epoch": 10.745090842356396, "grad_norm": 0.002006609458476305, "learning_rate": 5.222536819750891e-06, "loss": 0.0001, "num_input_tokens_seen": 126292864, "step": 58550 }, { "epoch": 10.746008441915947, "grad_norm": 16.2632999420166, "learning_rate": 5.221736853728786e-06, "loss": 0.2473, "num_input_tokens_seen": 126303040, "step": 58555 }, { "epoch": 10.7469260414755, "grad_norm": 0.2869914472103119, "learning_rate": 5.220936882019475e-06, "loss": 0.0005, "num_input_tokens_seen": 126311040, "step": 58560 }, { "epoch": 10.747843641035052, "grad_norm": 0.004391522612422705, "learning_rate": 5.220136904643475e-06, "loss": 0.073, "num_input_tokens_seen": 126321856, "step": 58565 }, { "epoch": 10.748761240594604, "grad_norm": 0.004565164912492037, "learning_rate": 5.219336921621305e-06, "loss": 0.0, "num_input_tokens_seen": 126332800, "step": 58570 }, { "epoch": 10.749678840154157, "grad_norm": 0.03439069539308548, "learning_rate": 5.218536932973483e-06, "loss": 0.0004, "num_input_tokens_seen": 126343264, "step": 58575 }, { "epoch": 10.750596439713709, "grad_norm": 0.0016792579554021358, "learning_rate": 5.217736938720527e-06, "loss": 0.0052, "num_input_tokens_seen": 126354432, "step": 58580 }, { "epoch": 10.75151403927326, "grad_norm": 1.125547170639038, "learning_rate": 5.216936938882956e-06, "loss": 0.1892, "num_input_tokens_seen": 126365312, "step": 58585 }, { "epoch": 10.752431638832814, "grad_norm": 0.051265161484479904, "learning_rate": 5.216136933481288e-06, "loss": 0.0003, "num_input_tokens_seen": 126374880, "step": 58590 }, { "epoch": 10.753349238392365, "grad_norm": 0.0031982294749468565, "learning_rate": 5.215336922536044e-06, "loss": 0.0001, "num_input_tokens_seen": 126385920, "step": 58595 }, { "epoch": 10.754266837951917, "grad_norm": 0.005276503972709179, "learning_rate": 5.214536906067742e-06, "loss": 0.0001, "num_input_tokens_seen": 126396640, "step": 58600 }, { "epoch": 10.75518443751147, "grad_norm": 0.04224316030740738, "learning_rate": 5.213736884096899e-06, "loss": 0.0005, "num_input_tokens_seen": 126407328, "step": 58605 }, { "epoch": 10.756102037071022, "grad_norm": 0.011564599350094795, "learning_rate": 5.2129368566440385e-06, "loss": 0.0071, "num_input_tokens_seen": 126418048, "step": 58610 }, { "epoch": 10.757019636630574, "grad_norm": 0.081778384745121, "learning_rate": 5.2121368237296756e-06, "loss": 0.0003, "num_input_tokens_seen": 126429280, "step": 58615 }, { "epoch": 10.757937236190127, "grad_norm": 58.22468948364258, "learning_rate": 5.211336785374334e-06, "loss": 0.1261, "num_input_tokens_seen": 126440576, "step": 58620 }, { "epoch": 10.758854835749679, "grad_norm": 0.0015375561779364944, "learning_rate": 5.210536741598528e-06, "loss": 0.0616, "num_input_tokens_seen": 126451328, "step": 58625 }, { "epoch": 10.75977243530923, "grad_norm": 0.015570351853966713, "learning_rate": 5.209736692422783e-06, "loss": 0.1711, "num_input_tokens_seen": 126461856, "step": 58630 }, { "epoch": 10.760690034868784, "grad_norm": 3.0655596256256104, "learning_rate": 5.2089366378676176e-06, "loss": 0.0011, "num_input_tokens_seen": 126472448, "step": 58635 }, { "epoch": 10.761607634428335, "grad_norm": 0.0015722153475508094, "learning_rate": 5.20813657795355e-06, "loss": 0.0001, "num_input_tokens_seen": 126483552, "step": 58640 }, { "epoch": 10.762525233987887, "grad_norm": 0.007284586317837238, "learning_rate": 5.207336512701102e-06, "loss": 0.0, "num_input_tokens_seen": 126494656, "step": 58645 }, { "epoch": 10.76344283354744, "grad_norm": 0.04281230270862579, "learning_rate": 5.206536442130794e-06, "loss": 0.0793, "num_input_tokens_seen": 126506208, "step": 58650 }, { "epoch": 10.764360433106992, "grad_norm": 0.017551613971590996, "learning_rate": 5.205736366263148e-06, "loss": 0.0007, "num_input_tokens_seen": 126517088, "step": 58655 }, { "epoch": 10.765278032666544, "grad_norm": 0.2167363166809082, "learning_rate": 5.2049362851186805e-06, "loss": 0.0053, "num_input_tokens_seen": 126527936, "step": 58660 }, { "epoch": 10.766195632226097, "grad_norm": 0.010868235491216183, "learning_rate": 5.204136198717915e-06, "loss": 0.0058, "num_input_tokens_seen": 126539488, "step": 58665 }, { "epoch": 10.767113231785649, "grad_norm": 0.0011486667208373547, "learning_rate": 5.203336107081374e-06, "loss": 0.0002, "num_input_tokens_seen": 126551456, "step": 58670 }, { "epoch": 10.7680308313452, "grad_norm": 0.12029020488262177, "learning_rate": 5.202536010229575e-06, "loss": 0.0001, "num_input_tokens_seen": 126562464, "step": 58675 }, { "epoch": 10.768948430904754, "grad_norm": 0.0016696200473234057, "learning_rate": 5.201735908183043e-06, "loss": 0.2519, "num_input_tokens_seen": 126574080, "step": 58680 }, { "epoch": 10.769866030464305, "grad_norm": 0.5162138938903809, "learning_rate": 5.200935800962297e-06, "loss": 0.006, "num_input_tokens_seen": 126585216, "step": 58685 }, { "epoch": 10.770783630023857, "grad_norm": 0.000987756415270269, "learning_rate": 5.20013568858786e-06, "loss": 0.0352, "num_input_tokens_seen": 126597312, "step": 58690 }, { "epoch": 10.77170122958341, "grad_norm": 0.04791173338890076, "learning_rate": 5.199335571080252e-06, "loss": 0.0001, "num_input_tokens_seen": 126607360, "step": 58695 }, { "epoch": 10.772618829142962, "grad_norm": 0.06286295503377914, "learning_rate": 5.198535448459996e-06, "loss": 0.0149, "num_input_tokens_seen": 126617568, "step": 58700 }, { "epoch": 10.773536428702513, "grad_norm": 0.010946742258965969, "learning_rate": 5.197735320747612e-06, "loss": 0.0562, "num_input_tokens_seen": 126628096, "step": 58705 }, { "epoch": 10.774454028262067, "grad_norm": 0.0015374493086710572, "learning_rate": 5.196935187963625e-06, "loss": 0.0646, "num_input_tokens_seen": 126638816, "step": 58710 }, { "epoch": 10.775371627821619, "grad_norm": 0.0021110859233886003, "learning_rate": 5.196135050128554e-06, "loss": 0.0001, "num_input_tokens_seen": 126650880, "step": 58715 }, { "epoch": 10.77628922738117, "grad_norm": 0.37975573539733887, "learning_rate": 5.1953349072629255e-06, "loss": 0.0004, "num_input_tokens_seen": 126661792, "step": 58720 }, { "epoch": 10.777206826940724, "grad_norm": 0.15039218962192535, "learning_rate": 5.194534759387257e-06, "loss": 0.0001, "num_input_tokens_seen": 126673344, "step": 58725 }, { "epoch": 10.778124426500275, "grad_norm": 0.0021478452254086733, "learning_rate": 5.193734606522075e-06, "loss": 0.0307, "num_input_tokens_seen": 126683776, "step": 58730 }, { "epoch": 10.779042026059827, "grad_norm": 0.0007758037536405027, "learning_rate": 5.1929344486878995e-06, "loss": 0.0, "num_input_tokens_seen": 126695264, "step": 58735 }, { "epoch": 10.77995962561938, "grad_norm": 0.005111416801810265, "learning_rate": 5.192134285905255e-06, "loss": 0.0002, "num_input_tokens_seen": 126706944, "step": 58740 }, { "epoch": 10.780877225178932, "grad_norm": 0.04026462882757187, "learning_rate": 5.191334118194664e-06, "loss": 0.0002, "num_input_tokens_seen": 126717824, "step": 58745 }, { "epoch": 10.781794824738483, "grad_norm": 0.9533747434616089, "learning_rate": 5.190533945576649e-06, "loss": 0.0012, "num_input_tokens_seen": 126729024, "step": 58750 }, { "epoch": 10.782712424298037, "grad_norm": 0.0024505765177309513, "learning_rate": 5.1897337680717345e-06, "loss": 0.0002, "num_input_tokens_seen": 126739808, "step": 58755 }, { "epoch": 10.783630023857588, "grad_norm": 0.008615548722445965, "learning_rate": 5.188933585700442e-06, "loss": 0.0, "num_input_tokens_seen": 126750368, "step": 58760 }, { "epoch": 10.78454762341714, "grad_norm": 0.0005882798577658832, "learning_rate": 5.188133398483295e-06, "loss": 0.1221, "num_input_tokens_seen": 126761312, "step": 58765 }, { "epoch": 10.785465222976693, "grad_norm": 0.024889511987566948, "learning_rate": 5.18733320644082e-06, "loss": 0.0285, "num_input_tokens_seen": 126772608, "step": 58770 }, { "epoch": 10.786382822536245, "grad_norm": 0.002571306424215436, "learning_rate": 5.186533009593536e-06, "loss": 0.0, "num_input_tokens_seen": 126782592, "step": 58775 }, { "epoch": 10.787300422095797, "grad_norm": 0.001246394356712699, "learning_rate": 5.185732807961971e-06, "loss": 0.1345, "num_input_tokens_seen": 126793344, "step": 58780 }, { "epoch": 10.78821802165535, "grad_norm": 30.139863967895508, "learning_rate": 5.184932601566648e-06, "loss": 0.1284, "num_input_tokens_seen": 126802816, "step": 58785 }, { "epoch": 10.789135621214902, "grad_norm": 0.0006050831871107221, "learning_rate": 5.18413239042809e-06, "loss": 0.0001, "num_input_tokens_seen": 126813760, "step": 58790 }, { "epoch": 10.790053220774453, "grad_norm": 0.007067493628710508, "learning_rate": 5.183332174566821e-06, "loss": 0.0001, "num_input_tokens_seen": 126824064, "step": 58795 }, { "epoch": 10.790970820334007, "grad_norm": 0.00038694124668836594, "learning_rate": 5.182531954003365e-06, "loss": 0.0001, "num_input_tokens_seen": 126834272, "step": 58800 }, { "epoch": 10.791888419893558, "grad_norm": 63.409400939941406, "learning_rate": 5.181731728758249e-06, "loss": 0.2157, "num_input_tokens_seen": 126845888, "step": 58805 }, { "epoch": 10.79280601945311, "grad_norm": 0.002641846891492605, "learning_rate": 5.180931498851995e-06, "loss": 0.0, "num_input_tokens_seen": 126856608, "step": 58810 }, { "epoch": 10.793723619012663, "grad_norm": 0.0014701440231874585, "learning_rate": 5.18013126430513e-06, "loss": 0.0001, "num_input_tokens_seen": 126866880, "step": 58815 }, { "epoch": 10.794641218572215, "grad_norm": 0.002698614727705717, "learning_rate": 5.1793310251381755e-06, "loss": 0.0001, "num_input_tokens_seen": 126878240, "step": 58820 }, { "epoch": 10.795558818131767, "grad_norm": 17.699113845825195, "learning_rate": 5.178530781371658e-06, "loss": 0.1689, "num_input_tokens_seen": 126889824, "step": 58825 }, { "epoch": 10.79647641769132, "grad_norm": 0.005498918239027262, "learning_rate": 5.177730533026104e-06, "loss": 0.0004, "num_input_tokens_seen": 126900576, "step": 58830 }, { "epoch": 10.797394017250872, "grad_norm": 0.0005198039580136538, "learning_rate": 5.1769302801220355e-06, "loss": 0.0001, "num_input_tokens_seen": 126910304, "step": 58835 }, { "epoch": 10.798311616810423, "grad_norm": 0.0023405991960316896, "learning_rate": 5.176130022679981e-06, "loss": 0.0003, "num_input_tokens_seen": 126921280, "step": 58840 }, { "epoch": 10.799229216369977, "grad_norm": 0.02107616700232029, "learning_rate": 5.175329760720463e-06, "loss": 0.0377, "num_input_tokens_seen": 126932192, "step": 58845 }, { "epoch": 10.800146815929528, "grad_norm": 0.04927234351634979, "learning_rate": 5.174529494264009e-06, "loss": 0.0071, "num_input_tokens_seen": 126943072, "step": 58850 }, { "epoch": 10.80106441548908, "grad_norm": 0.0035536005161702633, "learning_rate": 5.173729223331146e-06, "loss": 0.1721, "num_input_tokens_seen": 126953696, "step": 58855 }, { "epoch": 10.801982015048633, "grad_norm": 0.014717777259647846, "learning_rate": 5.172928947942395e-06, "loss": 0.0002, "num_input_tokens_seen": 126964544, "step": 58860 }, { "epoch": 10.802899614608185, "grad_norm": 0.0331416092813015, "learning_rate": 5.172128668118286e-06, "loss": 0.0001, "num_input_tokens_seen": 126975392, "step": 58865 }, { "epoch": 10.803817214167736, "grad_norm": 0.006922164466232061, "learning_rate": 5.171328383879341e-06, "loss": 0.0001, "num_input_tokens_seen": 126986496, "step": 58870 }, { "epoch": 10.80473481372729, "grad_norm": 0.028288640081882477, "learning_rate": 5.170528095246091e-06, "loss": 0.2157, "num_input_tokens_seen": 126996864, "step": 58875 }, { "epoch": 10.805652413286841, "grad_norm": 0.003941140603274107, "learning_rate": 5.1697278022390595e-06, "loss": 0.0001, "num_input_tokens_seen": 127007744, "step": 58880 }, { "epoch": 10.806570012846393, "grad_norm": 0.027658721432089806, "learning_rate": 5.1689275048787725e-06, "loss": 0.0001, "num_input_tokens_seen": 127019200, "step": 58885 }, { "epoch": 10.807487612405946, "grad_norm": 0.014213315211236477, "learning_rate": 5.168127203185756e-06, "loss": 0.0005, "num_input_tokens_seen": 127029344, "step": 58890 }, { "epoch": 10.808405211965498, "grad_norm": 0.001052937819622457, "learning_rate": 5.1673268971805376e-06, "loss": 0.0007, "num_input_tokens_seen": 127039776, "step": 58895 }, { "epoch": 10.80932281152505, "grad_norm": 0.0051060146652162075, "learning_rate": 5.166526586883644e-06, "loss": 0.0001, "num_input_tokens_seen": 127050272, "step": 58900 }, { "epoch": 10.810240411084603, "grad_norm": 0.0012544469209387898, "learning_rate": 5.165726272315602e-06, "loss": 0.0003, "num_input_tokens_seen": 127061536, "step": 58905 }, { "epoch": 10.811158010644155, "grad_norm": 0.002911958610638976, "learning_rate": 5.164925953496937e-06, "loss": 0.0003, "num_input_tokens_seen": 127072320, "step": 58910 }, { "epoch": 10.812075610203706, "grad_norm": 0.0052299234084784985, "learning_rate": 5.164125630448178e-06, "loss": 0.0001, "num_input_tokens_seen": 127083392, "step": 58915 }, { "epoch": 10.81299320976326, "grad_norm": 0.013036634773015976, "learning_rate": 5.163325303189851e-06, "loss": 0.0621, "num_input_tokens_seen": 127093632, "step": 58920 }, { "epoch": 10.813910809322811, "grad_norm": 0.0021835286170244217, "learning_rate": 5.162524971742483e-06, "loss": 0.0004, "num_input_tokens_seen": 127105024, "step": 58925 }, { "epoch": 10.814828408882363, "grad_norm": 0.002073293086141348, "learning_rate": 5.161724636126602e-06, "loss": 0.0003, "num_input_tokens_seen": 127116800, "step": 58930 }, { "epoch": 10.815746008441916, "grad_norm": 0.03904421254992485, "learning_rate": 5.160924296362733e-06, "loss": 0.0001, "num_input_tokens_seen": 127127584, "step": 58935 }, { "epoch": 10.816663608001468, "grad_norm": 0.0009120256872847676, "learning_rate": 5.160123952471406e-06, "loss": 0.0003, "num_input_tokens_seen": 127138272, "step": 58940 }, { "epoch": 10.81758120756102, "grad_norm": 0.0027387049049139023, "learning_rate": 5.159323604473146e-06, "loss": 0.0001, "num_input_tokens_seen": 127149088, "step": 58945 }, { "epoch": 10.818498807120573, "grad_norm": 0.0013687436003237963, "learning_rate": 5.158523252388486e-06, "loss": 0.0452, "num_input_tokens_seen": 127161216, "step": 58950 }, { "epoch": 10.819416406680125, "grad_norm": 2.166182518005371, "learning_rate": 5.1577228962379475e-06, "loss": 0.0002, "num_input_tokens_seen": 127171744, "step": 58955 }, { "epoch": 10.820334006239676, "grad_norm": 4.624520301818848, "learning_rate": 5.156922536042061e-06, "loss": 0.0066, "num_input_tokens_seen": 127182848, "step": 58960 }, { "epoch": 10.82125160579923, "grad_norm": 0.08257916569709778, "learning_rate": 5.156122171821356e-06, "loss": 0.0071, "num_input_tokens_seen": 127192992, "step": 58965 }, { "epoch": 10.822169205358781, "grad_norm": 0.0011632307432591915, "learning_rate": 5.1553218035963595e-06, "loss": 0.0, "num_input_tokens_seen": 127203200, "step": 58970 }, { "epoch": 10.823086804918333, "grad_norm": 0.003895192639902234, "learning_rate": 5.154521431387599e-06, "loss": 0.0, "num_input_tokens_seen": 127213568, "step": 58975 }, { "epoch": 10.824004404477886, "grad_norm": 0.013547717593610287, "learning_rate": 5.153721055215602e-06, "loss": 0.0, "num_input_tokens_seen": 127224160, "step": 58980 }, { "epoch": 10.824922004037438, "grad_norm": 0.001029826351441443, "learning_rate": 5.152920675100899e-06, "loss": 0.0002, "num_input_tokens_seen": 127233664, "step": 58985 }, { "epoch": 10.82583960359699, "grad_norm": 0.0034731116611510515, "learning_rate": 5.152120291064019e-06, "loss": 0.0, "num_input_tokens_seen": 127244032, "step": 58990 }, { "epoch": 10.826757203156543, "grad_norm": 0.015835242345929146, "learning_rate": 5.151319903125488e-06, "loss": 0.1844, "num_input_tokens_seen": 127254560, "step": 58995 }, { "epoch": 10.827674802716095, "grad_norm": 0.0039845299907028675, "learning_rate": 5.150519511305837e-06, "loss": 0.1253, "num_input_tokens_seen": 127266816, "step": 59000 }, { "epoch": 10.828592402275646, "grad_norm": 29.440202713012695, "learning_rate": 5.149719115625592e-06, "loss": 0.1159, "num_input_tokens_seen": 127277088, "step": 59005 }, { "epoch": 10.8295100018352, "grad_norm": 0.0006647308473475277, "learning_rate": 5.148918716105284e-06, "loss": 0.0001, "num_input_tokens_seen": 127286080, "step": 59010 }, { "epoch": 10.830427601394751, "grad_norm": 0.0026531910989433527, "learning_rate": 5.1481183127654444e-06, "loss": 0.0119, "num_input_tokens_seen": 127296832, "step": 59015 }, { "epoch": 10.831345200954303, "grad_norm": 0.13892537355422974, "learning_rate": 5.147317905626598e-06, "loss": 0.0002, "num_input_tokens_seen": 127308576, "step": 59020 }, { "epoch": 10.832262800513856, "grad_norm": 0.05555720254778862, "learning_rate": 5.146517494709276e-06, "loss": 0.0002, "num_input_tokens_seen": 127320832, "step": 59025 }, { "epoch": 10.833180400073408, "grad_norm": 0.015662118792533875, "learning_rate": 5.145717080034007e-06, "loss": 0.0001, "num_input_tokens_seen": 127330752, "step": 59030 }, { "epoch": 10.83409799963296, "grad_norm": 0.06292253732681274, "learning_rate": 5.14491666162132e-06, "loss": 0.2443, "num_input_tokens_seen": 127340736, "step": 59035 }, { "epoch": 10.835015599192513, "grad_norm": 0.31683704257011414, "learning_rate": 5.144116239491746e-06, "loss": 0.0004, "num_input_tokens_seen": 127351072, "step": 59040 }, { "epoch": 10.835933198752064, "grad_norm": 0.035775840282440186, "learning_rate": 5.143315813665814e-06, "loss": 0.0001, "num_input_tokens_seen": 127360128, "step": 59045 }, { "epoch": 10.836850798311616, "grad_norm": 98.48078918457031, "learning_rate": 5.142515384164053e-06, "loss": 0.2441, "num_input_tokens_seen": 127370912, "step": 59050 }, { "epoch": 10.83776839787117, "grad_norm": 0.012294682674109936, "learning_rate": 5.141714951006993e-06, "loss": 0.0001, "num_input_tokens_seen": 127382944, "step": 59055 }, { "epoch": 10.838685997430721, "grad_norm": 0.02453218214213848, "learning_rate": 5.140914514215164e-06, "loss": 0.0208, "num_input_tokens_seen": 127394624, "step": 59060 }, { "epoch": 10.839603596990273, "grad_norm": 0.0012832627398893237, "learning_rate": 5.140114073809097e-06, "loss": 0.0001, "num_input_tokens_seen": 127404800, "step": 59065 }, { "epoch": 10.840521196549826, "grad_norm": 0.2671930193901062, "learning_rate": 5.139313629809321e-06, "loss": 0.0006, "num_input_tokens_seen": 127414880, "step": 59070 }, { "epoch": 10.841438796109378, "grad_norm": 0.03152001276612282, "learning_rate": 5.138513182236367e-06, "loss": 0.0133, "num_input_tokens_seen": 127426112, "step": 59075 }, { "epoch": 10.84235639566893, "grad_norm": 0.08365338295698166, "learning_rate": 5.137712731110764e-06, "loss": 0.0002, "num_input_tokens_seen": 127437248, "step": 59080 }, { "epoch": 10.843273995228483, "grad_norm": 0.02178940735757351, "learning_rate": 5.136912276453041e-06, "loss": 0.0015, "num_input_tokens_seen": 127448640, "step": 59085 }, { "epoch": 10.844191594788034, "grad_norm": 0.7197670936584473, "learning_rate": 5.1361118182837325e-06, "loss": 0.0005, "num_input_tokens_seen": 127459264, "step": 59090 }, { "epoch": 10.845109194347586, "grad_norm": 0.0016147125279530883, "learning_rate": 5.135311356623366e-06, "loss": 0.0002, "num_input_tokens_seen": 127470176, "step": 59095 }, { "epoch": 10.84602679390714, "grad_norm": 0.03003544919192791, "learning_rate": 5.134510891492474e-06, "loss": 0.1629, "num_input_tokens_seen": 127481088, "step": 59100 }, { "epoch": 10.846944393466691, "grad_norm": 0.0015868949703872204, "learning_rate": 5.133710422911584e-06, "loss": 0.0264, "num_input_tokens_seen": 127492224, "step": 59105 }, { "epoch": 10.847861993026243, "grad_norm": 0.004506114404648542, "learning_rate": 5.132909950901231e-06, "loss": 0.0003, "num_input_tokens_seen": 127503264, "step": 59110 }, { "epoch": 10.848779592585796, "grad_norm": 0.018003888428211212, "learning_rate": 5.132109475481942e-06, "loss": 0.1567, "num_input_tokens_seen": 127514624, "step": 59115 }, { "epoch": 10.849697192145348, "grad_norm": 0.014092689380049706, "learning_rate": 5.1313089966742504e-06, "loss": 0.0001, "num_input_tokens_seen": 127524608, "step": 59120 }, { "epoch": 10.8506147917049, "grad_norm": 0.03191922605037689, "learning_rate": 5.130508514498687e-06, "loss": 0.0003, "num_input_tokens_seen": 127534496, "step": 59125 }, { "epoch": 10.851532391264453, "grad_norm": 0.07988330721855164, "learning_rate": 5.129708028975782e-06, "loss": 0.0004, "num_input_tokens_seen": 127546624, "step": 59130 }, { "epoch": 10.852449990824004, "grad_norm": 0.002607319736853242, "learning_rate": 5.128907540126068e-06, "loss": 0.0001, "num_input_tokens_seen": 127556800, "step": 59135 }, { "epoch": 10.853367590383556, "grad_norm": 0.008999432437121868, "learning_rate": 5.1281070479700746e-06, "loss": 0.0004, "num_input_tokens_seen": 127567232, "step": 59140 }, { "epoch": 10.85428518994311, "grad_norm": 0.1145639717578888, "learning_rate": 5.1273065525283335e-06, "loss": 0.0003, "num_input_tokens_seen": 127579104, "step": 59145 }, { "epoch": 10.85520278950266, "grad_norm": 0.03116762824356556, "learning_rate": 5.126506053821379e-06, "loss": 0.0002, "num_input_tokens_seen": 127589280, "step": 59150 }, { "epoch": 10.856120389062212, "grad_norm": 0.0038830905687063932, "learning_rate": 5.125705551869737e-06, "loss": 0.0131, "num_input_tokens_seen": 127599808, "step": 59155 }, { "epoch": 10.857037988621766, "grad_norm": 0.00883952621370554, "learning_rate": 5.124905046693944e-06, "loss": 0.0, "num_input_tokens_seen": 127611968, "step": 59160 }, { "epoch": 10.857955588181317, "grad_norm": 0.09240606427192688, "learning_rate": 5.1241045383145295e-06, "loss": 0.0001, "num_input_tokens_seen": 127623520, "step": 59165 }, { "epoch": 10.858873187740869, "grad_norm": 0.0010701034916564822, "learning_rate": 5.123304026752026e-06, "loss": 0.0, "num_input_tokens_seen": 127634336, "step": 59170 }, { "epoch": 10.859790787300422, "grad_norm": 0.007112937979400158, "learning_rate": 5.122503512026966e-06, "loss": 0.0002, "num_input_tokens_seen": 127644704, "step": 59175 }, { "epoch": 10.860708386859974, "grad_norm": 0.015261142514646053, "learning_rate": 5.121702994159881e-06, "loss": 0.0001, "num_input_tokens_seen": 127655264, "step": 59180 }, { "epoch": 10.861625986419526, "grad_norm": 0.0007680614362470806, "learning_rate": 5.1209024731713035e-06, "loss": 0.0225, "num_input_tokens_seen": 127665600, "step": 59185 }, { "epoch": 10.862543585979079, "grad_norm": 0.0020843937527388334, "learning_rate": 5.120101949081763e-06, "loss": 0.0003, "num_input_tokens_seen": 127677696, "step": 59190 }, { "epoch": 10.86346118553863, "grad_norm": 0.001370774582028389, "learning_rate": 5.119301421911793e-06, "loss": 0.0, "num_input_tokens_seen": 127688320, "step": 59195 }, { "epoch": 10.864378785098182, "grad_norm": 0.001909672748297453, "learning_rate": 5.118500891681929e-06, "loss": 0.0001, "num_input_tokens_seen": 127699584, "step": 59200 }, { "epoch": 10.865296384657736, "grad_norm": 0.00781368650496006, "learning_rate": 5.1177003584127e-06, "loss": 0.0001, "num_input_tokens_seen": 127709152, "step": 59205 }, { "epoch": 10.866213984217287, "grad_norm": 0.0017402888042852283, "learning_rate": 5.116899822124639e-06, "loss": 0.0001, "num_input_tokens_seen": 127719040, "step": 59210 }, { "epoch": 10.867131583776839, "grad_norm": 0.26494818925857544, "learning_rate": 5.116099282838277e-06, "loss": 0.0004, "num_input_tokens_seen": 127729536, "step": 59215 }, { "epoch": 10.868049183336392, "grad_norm": 0.005314034875482321, "learning_rate": 5.11529874057415e-06, "loss": 0.0285, "num_input_tokens_seen": 127739072, "step": 59220 }, { "epoch": 10.868966782895944, "grad_norm": 0.6652498245239258, "learning_rate": 5.1144981953527895e-06, "loss": 0.0072, "num_input_tokens_seen": 127748800, "step": 59225 }, { "epoch": 10.869884382455496, "grad_norm": 0.004151432774960995, "learning_rate": 5.113697647194726e-06, "loss": 0.0532, "num_input_tokens_seen": 127759648, "step": 59230 }, { "epoch": 10.870801982015049, "grad_norm": 0.009902607649564743, "learning_rate": 5.1128970961204975e-06, "loss": 0.0021, "num_input_tokens_seen": 127770592, "step": 59235 }, { "epoch": 10.8717195815746, "grad_norm": 0.00588533328846097, "learning_rate": 5.1120965421506305e-06, "loss": 0.0001, "num_input_tokens_seen": 127781984, "step": 59240 }, { "epoch": 10.872637181134152, "grad_norm": 0.04709824547171593, "learning_rate": 5.111295985305662e-06, "loss": 0.0002, "num_input_tokens_seen": 127793824, "step": 59245 }, { "epoch": 10.873554780693706, "grad_norm": 0.0004335496632847935, "learning_rate": 5.110495425606124e-06, "loss": 0.0001, "num_input_tokens_seen": 127804992, "step": 59250 }, { "epoch": 10.874472380253257, "grad_norm": 0.0359976552426815, "learning_rate": 5.1096948630725484e-06, "loss": 0.1789, "num_input_tokens_seen": 127815552, "step": 59255 }, { "epoch": 10.875389979812809, "grad_norm": 0.022794444113969803, "learning_rate": 5.108894297725472e-06, "loss": 0.0617, "num_input_tokens_seen": 127826976, "step": 59260 }, { "epoch": 10.876307579372362, "grad_norm": 0.10056141018867493, "learning_rate": 5.1080937295854225e-06, "loss": 0.1068, "num_input_tokens_seen": 127838048, "step": 59265 }, { "epoch": 10.877225178931914, "grad_norm": 0.6752822995185852, "learning_rate": 5.107293158672939e-06, "loss": 0.0859, "num_input_tokens_seen": 127847904, "step": 59270 }, { "epoch": 10.878142778491465, "grad_norm": 0.009566103108227253, "learning_rate": 5.10649258500855e-06, "loss": 0.0001, "num_input_tokens_seen": 127859392, "step": 59275 }, { "epoch": 10.879060378051019, "grad_norm": 0.0009682371746748686, "learning_rate": 5.105692008612793e-06, "loss": 0.0005, "num_input_tokens_seen": 127870656, "step": 59280 }, { "epoch": 10.87997797761057, "grad_norm": 0.0006321964901871979, "learning_rate": 5.104891429506199e-06, "loss": 0.0001, "num_input_tokens_seen": 127882912, "step": 59285 }, { "epoch": 10.880895577170122, "grad_norm": 0.0008137139375321567, "learning_rate": 5.104090847709302e-06, "loss": 0.0978, "num_input_tokens_seen": 127893376, "step": 59290 }, { "epoch": 10.881813176729676, "grad_norm": 0.009940261952579021, "learning_rate": 5.1032902632426375e-06, "loss": 0.0002, "num_input_tokens_seen": 127904288, "step": 59295 }, { "epoch": 10.882730776289227, "grad_norm": 0.010366442613303661, "learning_rate": 5.1024896761267366e-06, "loss": 0.0144, "num_input_tokens_seen": 127915520, "step": 59300 }, { "epoch": 10.883648375848779, "grad_norm": 0.014199587516486645, "learning_rate": 5.101689086382134e-06, "loss": 0.0001, "num_input_tokens_seen": 127926688, "step": 59305 }, { "epoch": 10.884565975408332, "grad_norm": 0.007376387715339661, "learning_rate": 5.1008884940293655e-06, "loss": 0.0002, "num_input_tokens_seen": 127937472, "step": 59310 }, { "epoch": 10.885483574967884, "grad_norm": 0.005001387093216181, "learning_rate": 5.100087899088962e-06, "loss": 0.0002, "num_input_tokens_seen": 127947840, "step": 59315 }, { "epoch": 10.886401174527435, "grad_norm": 0.009511369280517101, "learning_rate": 5.09928730158146e-06, "loss": 0.0002, "num_input_tokens_seen": 127958912, "step": 59320 }, { "epoch": 10.887318774086989, "grad_norm": 0.004335753154009581, "learning_rate": 5.098486701527392e-06, "loss": 0.0001, "num_input_tokens_seen": 127967872, "step": 59325 }, { "epoch": 10.88823637364654, "grad_norm": 0.5069494247436523, "learning_rate": 5.097686098947293e-06, "loss": 0.0707, "num_input_tokens_seen": 127978752, "step": 59330 }, { "epoch": 10.889153973206092, "grad_norm": 0.004855102859437466, "learning_rate": 5.096885493861698e-06, "loss": 0.0001, "num_input_tokens_seen": 127989376, "step": 59335 }, { "epoch": 10.890071572765645, "grad_norm": 0.0007300705183297396, "learning_rate": 5.096084886291139e-06, "loss": 0.0, "num_input_tokens_seen": 127999712, "step": 59340 }, { "epoch": 10.890989172325197, "grad_norm": 0.020047416910529137, "learning_rate": 5.0952842762561515e-06, "loss": 0.1267, "num_input_tokens_seen": 128010912, "step": 59345 }, { "epoch": 10.891906771884749, "grad_norm": 0.0021755103953182697, "learning_rate": 5.094483663777271e-06, "loss": 0.0001, "num_input_tokens_seen": 128023360, "step": 59350 }, { "epoch": 10.892824371444302, "grad_norm": 0.002665165113285184, "learning_rate": 5.09368304887503e-06, "loss": 0.0001, "num_input_tokens_seen": 128034432, "step": 59355 }, { "epoch": 10.893741971003854, "grad_norm": 19.445362091064453, "learning_rate": 5.0928824315699645e-06, "loss": 0.0764, "num_input_tokens_seen": 128045280, "step": 59360 }, { "epoch": 10.894659570563405, "grad_norm": 0.000855301390402019, "learning_rate": 5.092081811882608e-06, "loss": 0.0001, "num_input_tokens_seen": 128055136, "step": 59365 }, { "epoch": 10.895577170122959, "grad_norm": 0.0010452904971316457, "learning_rate": 5.091281189833497e-06, "loss": 0.0796, "num_input_tokens_seen": 128066656, "step": 59370 }, { "epoch": 10.89649476968251, "grad_norm": 0.006939490791410208, "learning_rate": 5.090480565443163e-06, "loss": 0.0306, "num_input_tokens_seen": 128078752, "step": 59375 }, { "epoch": 10.897412369242062, "grad_norm": 0.0016861062031239271, "learning_rate": 5.0896799387321435e-06, "loss": 0.0002, "num_input_tokens_seen": 128088832, "step": 59380 }, { "epoch": 10.898329968801615, "grad_norm": 0.0038274305406957865, "learning_rate": 5.088879309720973e-06, "loss": 0.0, "num_input_tokens_seen": 128099264, "step": 59385 }, { "epoch": 10.899247568361167, "grad_norm": 31.757570266723633, "learning_rate": 5.088078678430186e-06, "loss": 0.1318, "num_input_tokens_seen": 128111296, "step": 59390 }, { "epoch": 10.900165167920719, "grad_norm": 0.030981870368123055, "learning_rate": 5.087278044880317e-06, "loss": 0.0003, "num_input_tokens_seen": 128122624, "step": 59395 }, { "epoch": 10.901082767480272, "grad_norm": 0.02271234430372715, "learning_rate": 5.086477409091902e-06, "loss": 0.0002, "num_input_tokens_seen": 128134048, "step": 59400 }, { "epoch": 10.902000367039824, "grad_norm": 0.012187848798930645, "learning_rate": 5.085676771085476e-06, "loss": 0.0008, "num_input_tokens_seen": 128144320, "step": 59405 }, { "epoch": 10.902917966599375, "grad_norm": 0.23952926695346832, "learning_rate": 5.084876130881572e-06, "loss": 0.0003, "num_input_tokens_seen": 128154592, "step": 59410 }, { "epoch": 10.903835566158929, "grad_norm": 0.0005827672430314124, "learning_rate": 5.084075488500727e-06, "loss": 0.1005, "num_input_tokens_seen": 128164576, "step": 59415 }, { "epoch": 10.90475316571848, "grad_norm": 0.0012054754188284278, "learning_rate": 5.0832748439634775e-06, "loss": 0.019, "num_input_tokens_seen": 128175072, "step": 59420 }, { "epoch": 10.905670765278032, "grad_norm": 0.00917738862335682, "learning_rate": 5.082474197290356e-06, "loss": 0.0676, "num_input_tokens_seen": 128186688, "step": 59425 }, { "epoch": 10.906588364837585, "grad_norm": 0.022283997386693954, "learning_rate": 5.081673548501899e-06, "loss": 0.1037, "num_input_tokens_seen": 128196032, "step": 59430 }, { "epoch": 10.907505964397137, "grad_norm": 0.008472660556435585, "learning_rate": 5.0808728976186426e-06, "loss": 0.0, "num_input_tokens_seen": 128205440, "step": 59435 }, { "epoch": 10.908423563956688, "grad_norm": 0.05098647251725197, "learning_rate": 5.080072244661121e-06, "loss": 0.1196, "num_input_tokens_seen": 128216224, "step": 59440 }, { "epoch": 10.909341163516242, "grad_norm": 0.0006100572645664215, "learning_rate": 5.079271589649872e-06, "loss": 0.0001, "num_input_tokens_seen": 128227360, "step": 59445 }, { "epoch": 10.910258763075793, "grad_norm": 0.0089571513235569, "learning_rate": 5.078470932605428e-06, "loss": 0.0001, "num_input_tokens_seen": 128237696, "step": 59450 }, { "epoch": 10.911176362635345, "grad_norm": 0.030834194272756577, "learning_rate": 5.077670273548327e-06, "loss": 0.0001, "num_input_tokens_seen": 128247584, "step": 59455 }, { "epoch": 10.912093962194898, "grad_norm": 0.04703182354569435, "learning_rate": 5.076869612499105e-06, "loss": 0.2415, "num_input_tokens_seen": 128259456, "step": 59460 }, { "epoch": 10.91301156175445, "grad_norm": 0.0007896199240349233, "learning_rate": 5.076068949478294e-06, "loss": 0.0352, "num_input_tokens_seen": 128270240, "step": 59465 }, { "epoch": 10.913929161314002, "grad_norm": 0.001227564294822514, "learning_rate": 5.075268284506435e-06, "loss": 0.0002, "num_input_tokens_seen": 128280800, "step": 59470 }, { "epoch": 10.914846760873555, "grad_norm": 0.013077166862785816, "learning_rate": 5.07446761760406e-06, "loss": 0.0079, "num_input_tokens_seen": 128291456, "step": 59475 }, { "epoch": 10.915764360433107, "grad_norm": 0.008258596993982792, "learning_rate": 5.073666948791706e-06, "loss": 0.0, "num_input_tokens_seen": 128302944, "step": 59480 }, { "epoch": 10.916681959992658, "grad_norm": 0.0006460216245613992, "learning_rate": 5.072866278089908e-06, "loss": 0.0, "num_input_tokens_seen": 128313952, "step": 59485 }, { "epoch": 10.917599559552212, "grad_norm": 0.014987152069807053, "learning_rate": 5.072065605519203e-06, "loss": 0.1439, "num_input_tokens_seen": 128324704, "step": 59490 }, { "epoch": 10.918517159111763, "grad_norm": 0.01856708526611328, "learning_rate": 5.071264931100129e-06, "loss": 0.0002, "num_input_tokens_seen": 128335616, "step": 59495 }, { "epoch": 10.919434758671315, "grad_norm": 0.001242985250428319, "learning_rate": 5.070464254853218e-06, "loss": 0.0794, "num_input_tokens_seen": 128346624, "step": 59500 }, { "epoch": 10.920352358230868, "grad_norm": 0.04470191150903702, "learning_rate": 5.069663576799009e-06, "loss": 0.0057, "num_input_tokens_seen": 128356000, "step": 59505 }, { "epoch": 10.92126995779042, "grad_norm": 0.0008605991606600583, "learning_rate": 5.068862896958036e-06, "loss": 0.1971, "num_input_tokens_seen": 128367424, "step": 59510 }, { "epoch": 10.922187557349972, "grad_norm": 0.0005672942497767508, "learning_rate": 5.0680622153508365e-06, "loss": 0.0002, "num_input_tokens_seen": 128379744, "step": 59515 }, { "epoch": 10.923105156909525, "grad_norm": 0.0012459291610866785, "learning_rate": 5.067261531997948e-06, "loss": 0.0352, "num_input_tokens_seen": 128392160, "step": 59520 }, { "epoch": 10.924022756469077, "grad_norm": 0.0007306523039005697, "learning_rate": 5.066460846919905e-06, "loss": 0.0008, "num_input_tokens_seen": 128402752, "step": 59525 }, { "epoch": 10.924940356028628, "grad_norm": 0.008973857387900352, "learning_rate": 5.065660160137245e-06, "loss": 0.0, "num_input_tokens_seen": 128413408, "step": 59530 }, { "epoch": 10.925857955588182, "grad_norm": 0.7202003002166748, "learning_rate": 5.0648594716705024e-06, "loss": 0.1629, "num_input_tokens_seen": 128424160, "step": 59535 }, { "epoch": 10.926775555147733, "grad_norm": 0.0007946556434035301, "learning_rate": 5.0640587815402145e-06, "loss": 0.0, "num_input_tokens_seen": 128435712, "step": 59540 }, { "epoch": 10.927693154707285, "grad_norm": 46.41793441772461, "learning_rate": 5.063258089766919e-06, "loss": 0.1286, "num_input_tokens_seen": 128445696, "step": 59545 }, { "epoch": 10.928610754266838, "grad_norm": 0.0027621889021247625, "learning_rate": 5.062457396371151e-06, "loss": 0.0003, "num_input_tokens_seen": 128456064, "step": 59550 }, { "epoch": 10.92952835382639, "grad_norm": 0.0023441328667104244, "learning_rate": 5.061656701373449e-06, "loss": 0.0008, "num_input_tokens_seen": 128467296, "step": 59555 }, { "epoch": 10.930445953385941, "grad_norm": 0.007790111470967531, "learning_rate": 5.060856004794347e-06, "loss": 0.002, "num_input_tokens_seen": 128479008, "step": 59560 }, { "epoch": 10.931363552945495, "grad_norm": 0.12104254215955734, "learning_rate": 5.060055306654383e-06, "loss": 0.0043, "num_input_tokens_seen": 128489280, "step": 59565 }, { "epoch": 10.932281152505047, "grad_norm": 0.0012099266750738025, "learning_rate": 5.0592546069740945e-06, "loss": 0.0003, "num_input_tokens_seen": 128499936, "step": 59570 }, { "epoch": 10.933198752064598, "grad_norm": 0.0014026617864146829, "learning_rate": 5.058453905774015e-06, "loss": 0.0001, "num_input_tokens_seen": 128512000, "step": 59575 }, { "epoch": 10.934116351624152, "grad_norm": 0.0029042698442935944, "learning_rate": 5.057653203074686e-06, "loss": 0.0007, "num_input_tokens_seen": 128522912, "step": 59580 }, { "epoch": 10.935033951183703, "grad_norm": 0.016190728172659874, "learning_rate": 5.0568524988966395e-06, "loss": 0.0004, "num_input_tokens_seen": 128533888, "step": 59585 }, { "epoch": 10.935951550743255, "grad_norm": 0.00021704242681153119, "learning_rate": 5.056051793260416e-06, "loss": 0.0001, "num_input_tokens_seen": 128544768, "step": 59590 }, { "epoch": 10.936869150302808, "grad_norm": 0.016708290204405785, "learning_rate": 5.05525108618655e-06, "loss": 0.0001, "num_input_tokens_seen": 128555488, "step": 59595 }, { "epoch": 10.93778674986236, "grad_norm": 0.019635945558547974, "learning_rate": 5.054450377695579e-06, "loss": 0.0674, "num_input_tokens_seen": 128567360, "step": 59600 }, { "epoch": 10.938704349421911, "grad_norm": 0.3276612460613251, "learning_rate": 5.053649667808041e-06, "loss": 0.0004, "num_input_tokens_seen": 128577760, "step": 59605 }, { "epoch": 10.939621948981465, "grad_norm": 0.051525816321372986, "learning_rate": 5.052848956544471e-06, "loss": 0.0, "num_input_tokens_seen": 128588864, "step": 59610 }, { "epoch": 10.940539548541016, "grad_norm": 0.017078086733818054, "learning_rate": 5.05204824392541e-06, "loss": 0.0001, "num_input_tokens_seen": 128599808, "step": 59615 }, { "epoch": 10.941457148100568, "grad_norm": 0.00526118278503418, "learning_rate": 5.051247529971388e-06, "loss": 0.0353, "num_input_tokens_seen": 128610272, "step": 59620 }, { "epoch": 10.942374747660121, "grad_norm": 0.018118971958756447, "learning_rate": 5.050446814702948e-06, "loss": 0.0003, "num_input_tokens_seen": 128622112, "step": 59625 }, { "epoch": 10.943292347219673, "grad_norm": 0.00856028962880373, "learning_rate": 5.049646098140627e-06, "loss": 0.0001, "num_input_tokens_seen": 128632800, "step": 59630 }, { "epoch": 10.944209946779225, "grad_norm": 0.02603488601744175, "learning_rate": 5.048845380304959e-06, "loss": 0.0001, "num_input_tokens_seen": 128644096, "step": 59635 }, { "epoch": 10.945127546338778, "grad_norm": 0.003033967223018408, "learning_rate": 5.048044661216484e-06, "loss": 0.001, "num_input_tokens_seen": 128655168, "step": 59640 }, { "epoch": 10.94604514589833, "grad_norm": 0.033928461372852325, "learning_rate": 5.047243940895736e-06, "loss": 0.0001, "num_input_tokens_seen": 128666688, "step": 59645 }, { "epoch": 10.946962745457881, "grad_norm": 0.008634302765130997, "learning_rate": 5.046443219363255e-06, "loss": 0.0, "num_input_tokens_seen": 128678080, "step": 59650 }, { "epoch": 10.947880345017435, "grad_norm": 49.03373336791992, "learning_rate": 5.045642496639578e-06, "loss": 0.1502, "num_input_tokens_seen": 128689312, "step": 59655 }, { "epoch": 10.948797944576986, "grad_norm": 0.2020811289548874, "learning_rate": 5.044841772745241e-06, "loss": 0.0116, "num_input_tokens_seen": 128700064, "step": 59660 }, { "epoch": 10.949715544136538, "grad_norm": 0.07362250238656998, "learning_rate": 5.044041047700783e-06, "loss": 0.0001, "num_input_tokens_seen": 128712000, "step": 59665 }, { "epoch": 10.950633143696091, "grad_norm": 0.00450403057038784, "learning_rate": 5.043240321526739e-06, "loss": 0.0, "num_input_tokens_seen": 128722880, "step": 59670 }, { "epoch": 10.951550743255643, "grad_norm": 0.13183128833770752, "learning_rate": 5.042439594243649e-06, "loss": 0.0001, "num_input_tokens_seen": 128735104, "step": 59675 }, { "epoch": 10.952468342815195, "grad_norm": 10.320375442504883, "learning_rate": 5.041638865872048e-06, "loss": 0.0168, "num_input_tokens_seen": 128745920, "step": 59680 }, { "epoch": 10.953385942374748, "grad_norm": 0.006054284982383251, "learning_rate": 5.040838136432475e-06, "loss": 0.0005, "num_input_tokens_seen": 128756992, "step": 59685 }, { "epoch": 10.9543035419343, "grad_norm": 0.0014815989416092634, "learning_rate": 5.040037405945468e-06, "loss": 0.0001, "num_input_tokens_seen": 128767392, "step": 59690 }, { "epoch": 10.955221141493851, "grad_norm": 0.0009671522420831025, "learning_rate": 5.039236674431562e-06, "loss": 0.0, "num_input_tokens_seen": 128778112, "step": 59695 }, { "epoch": 10.956138741053405, "grad_norm": 0.00239074626006186, "learning_rate": 5.038435941911297e-06, "loss": 0.0, "num_input_tokens_seen": 128788832, "step": 59700 }, { "epoch": 10.957056340612956, "grad_norm": 0.0002601858286652714, "learning_rate": 5.03763520840521e-06, "loss": 0.0001, "num_input_tokens_seen": 128800448, "step": 59705 }, { "epoch": 10.957973940172508, "grad_norm": 0.0009378832182846963, "learning_rate": 5.036834473933838e-06, "loss": 0.0001, "num_input_tokens_seen": 128812384, "step": 59710 }, { "epoch": 10.958891539732061, "grad_norm": 0.0018539093434810638, "learning_rate": 5.036033738517719e-06, "loss": 0.0001, "num_input_tokens_seen": 128822400, "step": 59715 }, { "epoch": 10.959809139291613, "grad_norm": 0.007741469424217939, "learning_rate": 5.03523300217739e-06, "loss": 0.2823, "num_input_tokens_seen": 128833728, "step": 59720 }, { "epoch": 10.960726738851164, "grad_norm": 2.2756543159484863, "learning_rate": 5.03443226493339e-06, "loss": 0.0012, "num_input_tokens_seen": 128842688, "step": 59725 }, { "epoch": 10.961644338410718, "grad_norm": 0.006812691688537598, "learning_rate": 5.033631526806254e-06, "loss": 0.0001, "num_input_tokens_seen": 128852352, "step": 59730 }, { "epoch": 10.96256193797027, "grad_norm": 0.0013788033975288272, "learning_rate": 5.032830787816523e-06, "loss": 0.0001, "num_input_tokens_seen": 128861600, "step": 59735 }, { "epoch": 10.963479537529821, "grad_norm": 0.002666122978553176, "learning_rate": 5.032030047984733e-06, "loss": 0.0108, "num_input_tokens_seen": 128872832, "step": 59740 }, { "epoch": 10.964397137089374, "grad_norm": 0.003671841463074088, "learning_rate": 5.031229307331421e-06, "loss": 0.0029, "num_input_tokens_seen": 128882816, "step": 59745 }, { "epoch": 10.965314736648926, "grad_norm": 0.000605982553679496, "learning_rate": 5.030428565877127e-06, "loss": 0.0, "num_input_tokens_seen": 128892960, "step": 59750 }, { "epoch": 10.966232336208478, "grad_norm": 0.0005268036620691419, "learning_rate": 5.0296278236423855e-06, "loss": 0.0001, "num_input_tokens_seen": 128904192, "step": 59755 }, { "epoch": 10.967149935768031, "grad_norm": 0.001685230410657823, "learning_rate": 5.028827080647738e-06, "loss": 0.0, "num_input_tokens_seen": 128915712, "step": 59760 }, { "epoch": 10.968067535327583, "grad_norm": 0.0019706746097654104, "learning_rate": 5.0280263369137205e-06, "loss": 0.0, "num_input_tokens_seen": 128926464, "step": 59765 }, { "epoch": 10.968985134887134, "grad_norm": 0.017494412139058113, "learning_rate": 5.027225592460869e-06, "loss": 0.0001, "num_input_tokens_seen": 128936640, "step": 59770 }, { "epoch": 10.969902734446688, "grad_norm": 0.0779685378074646, "learning_rate": 5.026424847309725e-06, "loss": 0.0001, "num_input_tokens_seen": 128948000, "step": 59775 }, { "epoch": 10.97082033400624, "grad_norm": 0.0012472935486584902, "learning_rate": 5.025624101480826e-06, "loss": 0.0001, "num_input_tokens_seen": 128958784, "step": 59780 }, { "epoch": 10.971737933565791, "grad_norm": 0.3391368091106415, "learning_rate": 5.024823354994707e-06, "loss": 0.0001, "num_input_tokens_seen": 128969184, "step": 59785 }, { "epoch": 10.972655533125344, "grad_norm": 0.0011874785413965583, "learning_rate": 5.024022607871907e-06, "loss": 0.0329, "num_input_tokens_seen": 128979552, "step": 59790 }, { "epoch": 10.973573132684896, "grad_norm": 0.001562408171594143, "learning_rate": 5.023221860132964e-06, "loss": 0.0004, "num_input_tokens_seen": 128990912, "step": 59795 }, { "epoch": 10.974490732244448, "grad_norm": 0.029333753511309624, "learning_rate": 5.022421111798418e-06, "loss": 0.0001, "num_input_tokens_seen": 129000896, "step": 59800 }, { "epoch": 10.975408331804001, "grad_norm": 0.00037331704515963793, "learning_rate": 5.021620362888803e-06, "loss": 0.0, "num_input_tokens_seen": 129011392, "step": 59805 }, { "epoch": 10.976325931363553, "grad_norm": 0.0003671394952107221, "learning_rate": 5.02081961342466e-06, "loss": 0.147, "num_input_tokens_seen": 129022016, "step": 59810 }, { "epoch": 10.977243530923104, "grad_norm": 0.0018750125309452415, "learning_rate": 5.0200188634265265e-06, "loss": 0.0, "num_input_tokens_seen": 129033120, "step": 59815 }, { "epoch": 10.978161130482658, "grad_norm": 0.3127705156803131, "learning_rate": 5.019218112914939e-06, "loss": 0.0018, "num_input_tokens_seen": 129043136, "step": 59820 }, { "epoch": 10.97907873004221, "grad_norm": 0.0003741312539204955, "learning_rate": 5.018417361910439e-06, "loss": 0.0, "num_input_tokens_seen": 129052000, "step": 59825 }, { "epoch": 10.97999632960176, "grad_norm": 0.0009841127321124077, "learning_rate": 5.017616610433561e-06, "loss": 0.0006, "num_input_tokens_seen": 129062688, "step": 59830 }, { "epoch": 10.980913929161314, "grad_norm": 0.0005895713111385703, "learning_rate": 5.016815858504844e-06, "loss": 0.0, "num_input_tokens_seen": 129072608, "step": 59835 }, { "epoch": 10.981831528720866, "grad_norm": 0.00015688914572820067, "learning_rate": 5.016015106144827e-06, "loss": 0.0, "num_input_tokens_seen": 129083968, "step": 59840 }, { "epoch": 10.982749128280417, "grad_norm": 5.943063259124756, "learning_rate": 5.015214353374046e-06, "loss": 0.0051, "num_input_tokens_seen": 129095392, "step": 59845 }, { "epoch": 10.983666727839971, "grad_norm": 0.18274159729480743, "learning_rate": 5.014413600213043e-06, "loss": 0.0002, "num_input_tokens_seen": 129106368, "step": 59850 }, { "epoch": 10.984584327399523, "grad_norm": 0.009934169240295887, "learning_rate": 5.013612846682351e-06, "loss": 0.0001, "num_input_tokens_seen": 129117376, "step": 59855 }, { "epoch": 10.985501926959074, "grad_norm": 40.97660827636719, "learning_rate": 5.01281209280251e-06, "loss": 0.2201, "num_input_tokens_seen": 129127552, "step": 59860 }, { "epoch": 10.986419526518628, "grad_norm": 0.007695898413658142, "learning_rate": 5.012011338594061e-06, "loss": 0.0, "num_input_tokens_seen": 129139296, "step": 59865 }, { "epoch": 10.98733712607818, "grad_norm": 0.09219498187303543, "learning_rate": 5.011210584077538e-06, "loss": 0.1032, "num_input_tokens_seen": 129150624, "step": 59870 }, { "epoch": 10.98825472563773, "grad_norm": 0.02446189522743225, "learning_rate": 5.010409829273483e-06, "loss": 0.0, "num_input_tokens_seen": 129161280, "step": 59875 }, { "epoch": 10.989172325197284, "grad_norm": 0.009233229793608189, "learning_rate": 5.009609074202431e-06, "loss": 0.0478, "num_input_tokens_seen": 129172128, "step": 59880 }, { "epoch": 10.990089924756836, "grad_norm": 0.030016813427209854, "learning_rate": 5.008808318884921e-06, "loss": 0.0001, "num_input_tokens_seen": 129183616, "step": 59885 }, { "epoch": 10.991007524316387, "grad_norm": 0.09255238622426987, "learning_rate": 5.008007563341491e-06, "loss": 0.0001, "num_input_tokens_seen": 129193408, "step": 59890 }, { "epoch": 10.99192512387594, "grad_norm": 0.003493077587336302, "learning_rate": 5.007206807592679e-06, "loss": 0.0011, "num_input_tokens_seen": 129203040, "step": 59895 }, { "epoch": 10.992842723435492, "grad_norm": 0.005625330377370119, "learning_rate": 5.006406051659025e-06, "loss": 0.0001, "num_input_tokens_seen": 129212864, "step": 59900 }, { "epoch": 10.993760322995044, "grad_norm": 0.022812752053141594, "learning_rate": 5.005605295561065e-06, "loss": 0.0002, "num_input_tokens_seen": 129223392, "step": 59905 }, { "epoch": 10.994677922554597, "grad_norm": 0.004637135658413172, "learning_rate": 5.004804539319338e-06, "loss": 0.1721, "num_input_tokens_seen": 129235008, "step": 59910 }, { "epoch": 10.995595522114149, "grad_norm": 0.0032127639278769493, "learning_rate": 5.004003782954382e-06, "loss": 0.0001, "num_input_tokens_seen": 129246272, "step": 59915 }, { "epoch": 10.996513121673702, "grad_norm": 0.007539772894233465, "learning_rate": 5.0032030264867335e-06, "loss": 0.0331, "num_input_tokens_seen": 129256864, "step": 59920 }, { "epoch": 10.997430721233254, "grad_norm": 0.0006336430669762194, "learning_rate": 5.002402269936935e-06, "loss": 0.0002, "num_input_tokens_seen": 129268704, "step": 59925 }, { "epoch": 10.998348320792806, "grad_norm": 0.0030934184323996305, "learning_rate": 5.00160151332552e-06, "loss": 0.0, "num_input_tokens_seen": 129278624, "step": 59930 }, { "epoch": 10.999265920352359, "grad_norm": 0.0006371735944412649, "learning_rate": 5.00080075667303e-06, "loss": 0.0001, "num_input_tokens_seen": 129287968, "step": 59935 }, { "epoch": 11.00018351991191, "grad_norm": 0.0025908511597663164, "learning_rate": 5e-06, "loss": 0.1627, "num_input_tokens_seen": 129298080, "step": 59940 }, { "epoch": 11.001101119471462, "grad_norm": 0.0069129993207752705, "learning_rate": 4.999199243326973e-06, "loss": 0.0001, "num_input_tokens_seen": 129309504, "step": 59945 }, { "epoch": 11.002018719031016, "grad_norm": 0.0024595821741968393, "learning_rate": 4.9983984866744806e-06, "loss": 0.1658, "num_input_tokens_seen": 129319552, "step": 59950 }, { "epoch": 11.002936318590567, "grad_norm": 0.13733839988708496, "learning_rate": 4.997597730063067e-06, "loss": 0.0001, "num_input_tokens_seen": 129329536, "step": 59955 }, { "epoch": 11.003853918150119, "grad_norm": 2.369515895843506, "learning_rate": 4.996796973513267e-06, "loss": 0.0568, "num_input_tokens_seen": 129339584, "step": 59960 }, { "epoch": 11.004771517709672, "grad_norm": 0.0007352089160121977, "learning_rate": 4.9959962170456215e-06, "loss": 0.1502, "num_input_tokens_seen": 129349792, "step": 59965 }, { "epoch": 11.005689117269224, "grad_norm": 42.14850997924805, "learning_rate": 4.995195460680663e-06, "loss": 0.1068, "num_input_tokens_seen": 129359648, "step": 59970 }, { "epoch": 11.006606716828776, "grad_norm": 0.0011780346976593137, "learning_rate": 4.994394704438936e-06, "loss": 0.0001, "num_input_tokens_seen": 129370432, "step": 59975 }, { "epoch": 11.007524316388329, "grad_norm": 0.0076777334325015545, "learning_rate": 4.993593948340977e-06, "loss": 0.1037, "num_input_tokens_seen": 129381376, "step": 59980 }, { "epoch": 11.00844191594788, "grad_norm": 0.004530861508101225, "learning_rate": 4.992793192407322e-06, "loss": 0.0001, "num_input_tokens_seen": 129392416, "step": 59985 }, { "epoch": 11.009359515507432, "grad_norm": 0.1054520308971405, "learning_rate": 4.9919924366585096e-06, "loss": 0.0001, "num_input_tokens_seen": 129402432, "step": 59990 }, { "epoch": 11.010277115066986, "grad_norm": 0.0033907522447407246, "learning_rate": 4.991191681115081e-06, "loss": 0.002, "num_input_tokens_seen": 129412608, "step": 59995 }, { "epoch": 11.011194714626537, "grad_norm": 0.0005714751314371824, "learning_rate": 4.990390925797569e-06, "loss": 0.1604, "num_input_tokens_seen": 129423104, "step": 60000 }, { "epoch": 11.012112314186089, "grad_norm": 0.18818587064743042, "learning_rate": 4.989590170726518e-06, "loss": 0.0004, "num_input_tokens_seen": 129435200, "step": 60005 }, { "epoch": 11.013029913745642, "grad_norm": 0.02166035585105419, "learning_rate": 4.988789415922463e-06, "loss": 0.0001, "num_input_tokens_seen": 129445056, "step": 60010 }, { "epoch": 11.013947513305194, "grad_norm": 0.0014202659949660301, "learning_rate": 4.987988661405941e-06, "loss": 0.0001, "num_input_tokens_seen": 129456672, "step": 60015 }, { "epoch": 11.014865112864745, "grad_norm": 0.20550432801246643, "learning_rate": 4.98718790719749e-06, "loss": 0.0002, "num_input_tokens_seen": 129467040, "step": 60020 }, { "epoch": 11.015782712424299, "grad_norm": 0.020198946818709373, "learning_rate": 4.986387153317651e-06, "loss": 0.0, "num_input_tokens_seen": 129478368, "step": 60025 }, { "epoch": 11.01670031198385, "grad_norm": 0.02219465747475624, "learning_rate": 4.98558639978696e-06, "loss": 0.0001, "num_input_tokens_seen": 129488800, "step": 60030 }, { "epoch": 11.017617911543402, "grad_norm": 0.0012169241672381759, "learning_rate": 4.984785646625954e-06, "loss": 0.0005, "num_input_tokens_seen": 129499072, "step": 60035 }, { "epoch": 11.018535511102955, "grad_norm": 0.000543975445907563, "learning_rate": 4.983984893855174e-06, "loss": 0.0007, "num_input_tokens_seen": 129510080, "step": 60040 }, { "epoch": 11.019453110662507, "grad_norm": 0.022277146577835083, "learning_rate": 4.983184141495158e-06, "loss": 0.0, "num_input_tokens_seen": 129521632, "step": 60045 }, { "epoch": 11.020370710222059, "grad_norm": 0.33623459935188293, "learning_rate": 4.9823833895664394e-06, "loss": 0.0004, "num_input_tokens_seen": 129533024, "step": 60050 }, { "epoch": 11.021288309781612, "grad_norm": 0.0026653974782675505, "learning_rate": 4.9815826380895625e-06, "loss": 0.0, "num_input_tokens_seen": 129544288, "step": 60055 }, { "epoch": 11.022205909341164, "grad_norm": 0.017770741134881973, "learning_rate": 4.9807818870850614e-06, "loss": 0.0001, "num_input_tokens_seen": 129554112, "step": 60060 }, { "epoch": 11.023123508900715, "grad_norm": 2.488264799118042, "learning_rate": 4.979981136573476e-06, "loss": 0.0003, "num_input_tokens_seen": 129564832, "step": 60065 }, { "epoch": 11.024041108460269, "grad_norm": 0.048486851155757904, "learning_rate": 4.97918038657534e-06, "loss": 0.0004, "num_input_tokens_seen": 129575776, "step": 60070 }, { "epoch": 11.02495870801982, "grad_norm": 0.006575628649443388, "learning_rate": 4.978379637111198e-06, "loss": 0.0, "num_input_tokens_seen": 129586048, "step": 60075 }, { "epoch": 11.025876307579372, "grad_norm": 0.0020444721449166536, "learning_rate": 4.9775788882015854e-06, "loss": 0.0119, "num_input_tokens_seen": 129597152, "step": 60080 }, { "epoch": 11.026793907138925, "grad_norm": 0.9914748072624207, "learning_rate": 4.976778139867037e-06, "loss": 0.0019, "num_input_tokens_seen": 129608608, "step": 60085 }, { "epoch": 11.027711506698477, "grad_norm": 0.015671320259571075, "learning_rate": 4.975977392128095e-06, "loss": 0.0, "num_input_tokens_seen": 129619360, "step": 60090 }, { "epoch": 11.028629106258029, "grad_norm": 0.0013913856819272041, "learning_rate": 4.975176645005295e-06, "loss": 0.0, "num_input_tokens_seen": 129630496, "step": 60095 }, { "epoch": 11.029546705817582, "grad_norm": 0.0006599451298825443, "learning_rate": 4.974375898519177e-06, "loss": 0.0, "num_input_tokens_seen": 129640992, "step": 60100 }, { "epoch": 11.030464305377134, "grad_norm": 0.0006212911102920771, "learning_rate": 4.973575152690276e-06, "loss": 0.0207, "num_input_tokens_seen": 129652256, "step": 60105 }, { "epoch": 11.031381904936685, "grad_norm": 0.001117921550758183, "learning_rate": 4.9727744075391315e-06, "loss": 0.0, "num_input_tokens_seen": 129662528, "step": 60110 }, { "epoch": 11.032299504496239, "grad_norm": 0.0003809613117482513, "learning_rate": 4.971973663086281e-06, "loss": 0.0478, "num_input_tokens_seen": 129674208, "step": 60115 }, { "epoch": 11.03321710405579, "grad_norm": 0.009066330268979073, "learning_rate": 4.971172919352263e-06, "loss": 0.0, "num_input_tokens_seen": 129686176, "step": 60120 }, { "epoch": 11.034134703615342, "grad_norm": 0.006206955295056105, "learning_rate": 4.970372176357615e-06, "loss": 0.0, "num_input_tokens_seen": 129696384, "step": 60125 }, { "epoch": 11.035052303174895, "grad_norm": 0.006512718740850687, "learning_rate": 4.969571434122876e-06, "loss": 0.0001, "num_input_tokens_seen": 129707200, "step": 60130 }, { "epoch": 11.035969902734447, "grad_norm": 3.807741403579712, "learning_rate": 4.968770692668579e-06, "loss": 0.004, "num_input_tokens_seen": 129716960, "step": 60135 }, { "epoch": 11.036887502293999, "grad_norm": 0.0003634718887042254, "learning_rate": 4.967969952015269e-06, "loss": 0.0144, "num_input_tokens_seen": 129727904, "step": 60140 }, { "epoch": 11.037805101853552, "grad_norm": 0.0016630120808258653, "learning_rate": 4.9671692121834785e-06, "loss": 0.0001, "num_input_tokens_seen": 129738144, "step": 60145 }, { "epoch": 11.038722701413104, "grad_norm": 0.0023815466556698084, "learning_rate": 4.966368473193748e-06, "loss": 0.0001, "num_input_tokens_seen": 129749280, "step": 60150 }, { "epoch": 11.039640300972655, "grad_norm": 0.00034541721106506884, "learning_rate": 4.965567735066611e-06, "loss": 0.0, "num_input_tokens_seen": 129758400, "step": 60155 }, { "epoch": 11.040557900532209, "grad_norm": 0.0022706647869199514, "learning_rate": 4.964766997822611e-06, "loss": 0.0, "num_input_tokens_seen": 129769376, "step": 60160 }, { "epoch": 11.04147550009176, "grad_norm": 746.1079711914062, "learning_rate": 4.963966261482283e-06, "loss": 0.1407, "num_input_tokens_seen": 129780384, "step": 60165 }, { "epoch": 11.042393099651312, "grad_norm": 0.0007483811350539327, "learning_rate": 4.9631655260661624e-06, "loss": 0.0, "num_input_tokens_seen": 129790688, "step": 60170 }, { "epoch": 11.043310699210865, "grad_norm": 0.0011463126866146922, "learning_rate": 4.962364791594791e-06, "loss": 0.0, "num_input_tokens_seen": 129800352, "step": 60175 }, { "epoch": 11.044228298770417, "grad_norm": 0.00086007866775617, "learning_rate": 4.961564058088705e-06, "loss": 0.0, "num_input_tokens_seen": 129810752, "step": 60180 }, { "epoch": 11.045145898329968, "grad_norm": 0.0005755749880336225, "learning_rate": 4.960763325568438e-06, "loss": 0.0174, "num_input_tokens_seen": 129820544, "step": 60185 }, { "epoch": 11.046063497889522, "grad_norm": 0.005934425629675388, "learning_rate": 4.959962594054533e-06, "loss": 0.0, "num_input_tokens_seen": 129830880, "step": 60190 }, { "epoch": 11.046981097449073, "grad_norm": 0.004785897675901651, "learning_rate": 4.959161863567526e-06, "loss": 0.0, "num_input_tokens_seen": 129841760, "step": 60195 }, { "epoch": 11.047898697008625, "grad_norm": 0.0014059684472158551, "learning_rate": 4.958361134127953e-06, "loss": 0.0, "num_input_tokens_seen": 129852992, "step": 60200 }, { "epoch": 11.048816296568178, "grad_norm": 0.00046855639084242284, "learning_rate": 4.957560405756352e-06, "loss": 0.0001, "num_input_tokens_seen": 129863584, "step": 60205 }, { "epoch": 11.04973389612773, "grad_norm": 392.75592041015625, "learning_rate": 4.956759678473263e-06, "loss": 0.175, "num_input_tokens_seen": 129873696, "step": 60210 }, { "epoch": 11.050651495687282, "grad_norm": 0.001235809875652194, "learning_rate": 4.955958952299219e-06, "loss": 0.1782, "num_input_tokens_seen": 129884576, "step": 60215 }, { "epoch": 11.051569095246835, "grad_norm": 5.915364742279053, "learning_rate": 4.95515822725476e-06, "loss": 0.0078, "num_input_tokens_seen": 129895680, "step": 60220 }, { "epoch": 11.052486694806387, "grad_norm": 0.0018010975327342749, "learning_rate": 4.954357503360424e-06, "loss": 0.0, "num_input_tokens_seen": 129907616, "step": 60225 }, { "epoch": 11.053404294365938, "grad_norm": 0.0009623371297493577, "learning_rate": 4.953556780636747e-06, "loss": 0.1815, "num_input_tokens_seen": 129917216, "step": 60230 }, { "epoch": 11.054321893925492, "grad_norm": 0.3696284294128418, "learning_rate": 4.952756059104265e-06, "loss": 0.0005, "num_input_tokens_seen": 129927840, "step": 60235 }, { "epoch": 11.055239493485043, "grad_norm": 0.0009038970456458628, "learning_rate": 4.951955338783518e-06, "loss": 0.0, "num_input_tokens_seen": 129939456, "step": 60240 }, { "epoch": 11.056157093044595, "grad_norm": 0.0004870906413998455, "learning_rate": 4.951154619695043e-06, "loss": 0.0465, "num_input_tokens_seen": 129950912, "step": 60245 }, { "epoch": 11.057074692604148, "grad_norm": 0.04590930417180061, "learning_rate": 4.9503539018593755e-06, "loss": 0.0226, "num_input_tokens_seen": 129960064, "step": 60250 }, { "epoch": 11.0579922921637, "grad_norm": 0.0005425361450761557, "learning_rate": 4.949553185297052e-06, "loss": 0.0, "num_input_tokens_seen": 129970112, "step": 60255 }, { "epoch": 11.058909891723252, "grad_norm": 0.012599224224686623, "learning_rate": 4.9487524700286125e-06, "loss": 0.0, "num_input_tokens_seen": 129981344, "step": 60260 }, { "epoch": 11.059827491282805, "grad_norm": 0.10795808583498001, "learning_rate": 4.947951756074594e-06, "loss": 0.0002, "num_input_tokens_seen": 129992256, "step": 60265 }, { "epoch": 11.060745090842357, "grad_norm": 0.002417322713881731, "learning_rate": 4.9471510434555295e-06, "loss": 0.0002, "num_input_tokens_seen": 130002432, "step": 60270 }, { "epoch": 11.061662690401908, "grad_norm": 0.0013353300746530294, "learning_rate": 4.94635033219196e-06, "loss": 0.1751, "num_input_tokens_seen": 130012480, "step": 60275 }, { "epoch": 11.062580289961462, "grad_norm": 0.0028944197110831738, "learning_rate": 4.945549622304422e-06, "loss": 0.0001, "num_input_tokens_seen": 130022464, "step": 60280 }, { "epoch": 11.063497889521013, "grad_norm": 0.0009469636715948582, "learning_rate": 4.944748913813453e-06, "loss": 0.0002, "num_input_tokens_seen": 130033408, "step": 60285 }, { "epoch": 11.064415489080565, "grad_norm": 0.00420844741165638, "learning_rate": 4.943948206739586e-06, "loss": 0.0001, "num_input_tokens_seen": 130044640, "step": 60290 }, { "epoch": 11.065333088640118, "grad_norm": 0.0007015935843810439, "learning_rate": 4.943147501103362e-06, "loss": 0.001, "num_input_tokens_seen": 130054336, "step": 60295 }, { "epoch": 11.06625068819967, "grad_norm": 0.0030416005756706, "learning_rate": 4.942346796925317e-06, "loss": 0.0, "num_input_tokens_seen": 130065280, "step": 60300 }, { "epoch": 11.067168287759221, "grad_norm": 0.0005008489242754877, "learning_rate": 4.941546094225986e-06, "loss": 0.0733, "num_input_tokens_seen": 130076704, "step": 60305 }, { "epoch": 11.068085887318775, "grad_norm": 0.0008162258309312165, "learning_rate": 4.940745393025907e-06, "loss": 0.0, "num_input_tokens_seen": 130086720, "step": 60310 }, { "epoch": 11.069003486878326, "grad_norm": 0.0006475562113337219, "learning_rate": 4.939944693345618e-06, "loss": 0.0001, "num_input_tokens_seen": 130097888, "step": 60315 }, { "epoch": 11.069921086437878, "grad_norm": 0.004374608863145113, "learning_rate": 4.939143995205654e-06, "loss": 0.0, "num_input_tokens_seen": 130108608, "step": 60320 }, { "epoch": 11.070838685997431, "grad_norm": 0.0005306195234879851, "learning_rate": 4.938343298626552e-06, "loss": 0.0001, "num_input_tokens_seen": 130118944, "step": 60325 }, { "epoch": 11.071756285556983, "grad_norm": 0.0006306406576186419, "learning_rate": 4.93754260362885e-06, "loss": 0.0001, "num_input_tokens_seen": 130129312, "step": 60330 }, { "epoch": 11.072673885116535, "grad_norm": 0.0059667606838047504, "learning_rate": 4.936741910233082e-06, "loss": 0.0, "num_input_tokens_seen": 130140320, "step": 60335 }, { "epoch": 11.073591484676088, "grad_norm": 0.005169091280549765, "learning_rate": 4.935941218459786e-06, "loss": 0.0, "num_input_tokens_seen": 130150176, "step": 60340 }, { "epoch": 11.07450908423564, "grad_norm": 0.0017337186727672815, "learning_rate": 4.935140528329499e-06, "loss": 0.1499, "num_input_tokens_seen": 130160864, "step": 60345 }, { "epoch": 11.075426683795191, "grad_norm": 0.08223188668489456, "learning_rate": 4.934339839862758e-06, "loss": 0.0001, "num_input_tokens_seen": 130172544, "step": 60350 }, { "epoch": 11.076344283354745, "grad_norm": 0.00756125757470727, "learning_rate": 4.933539153080095e-06, "loss": 0.0002, "num_input_tokens_seen": 130184032, "step": 60355 }, { "epoch": 11.077261882914296, "grad_norm": 0.00048578265705145895, "learning_rate": 4.932738468002053e-06, "loss": 0.0, "num_input_tokens_seen": 130195840, "step": 60360 }, { "epoch": 11.078179482473848, "grad_norm": 0.002131971064954996, "learning_rate": 4.931937784649164e-06, "loss": 0.0079, "num_input_tokens_seen": 130205856, "step": 60365 }, { "epoch": 11.079097082033401, "grad_norm": 0.0010174710769206285, "learning_rate": 4.931137103041964e-06, "loss": 0.004, "num_input_tokens_seen": 130217120, "step": 60370 }, { "epoch": 11.080014681592953, "grad_norm": 0.002106721280142665, "learning_rate": 4.930336423200993e-06, "loss": 0.0, "num_input_tokens_seen": 130227904, "step": 60375 }, { "epoch": 11.080932281152505, "grad_norm": 0.9967845678329468, "learning_rate": 4.929535745146784e-06, "loss": 0.0011, "num_input_tokens_seen": 130238048, "step": 60380 }, { "epoch": 11.081849880712058, "grad_norm": 0.0012708900030702353, "learning_rate": 4.928735068899874e-06, "loss": 0.0, "num_input_tokens_seen": 130248640, "step": 60385 }, { "epoch": 11.08276748027161, "grad_norm": 0.0035405377857387066, "learning_rate": 4.927934394480797e-06, "loss": 0.0001, "num_input_tokens_seen": 130259360, "step": 60390 }, { "epoch": 11.083685079831161, "grad_norm": 0.0012502827448770404, "learning_rate": 4.927133721910093e-06, "loss": 0.0001, "num_input_tokens_seen": 130270688, "step": 60395 }, { "epoch": 11.084602679390715, "grad_norm": 0.0446353405714035, "learning_rate": 4.926333051208297e-06, "loss": 0.0002, "num_input_tokens_seen": 130281888, "step": 60400 }, { "epoch": 11.085520278950266, "grad_norm": 72.62013244628906, "learning_rate": 4.925532382395941e-06, "loss": 0.0996, "num_input_tokens_seen": 130292096, "step": 60405 }, { "epoch": 11.086437878509818, "grad_norm": 0.0004718317068181932, "learning_rate": 4.9247317154935665e-06, "loss": 0.0, "num_input_tokens_seen": 130303584, "step": 60410 }, { "epoch": 11.087355478069371, "grad_norm": 19.254924774169922, "learning_rate": 4.923931050521707e-06, "loss": 0.0245, "num_input_tokens_seen": 130315072, "step": 60415 }, { "epoch": 11.088273077628923, "grad_norm": 0.0003971613186877221, "learning_rate": 4.923130387500898e-06, "loss": 0.0, "num_input_tokens_seen": 130324896, "step": 60420 }, { "epoch": 11.089190677188475, "grad_norm": 0.011174308136105537, "learning_rate": 4.922329726451674e-06, "loss": 0.0001, "num_input_tokens_seen": 130335040, "step": 60425 }, { "epoch": 11.090108276748028, "grad_norm": 0.007536116987466812, "learning_rate": 4.921529067394574e-06, "loss": 0.0, "num_input_tokens_seen": 130345824, "step": 60430 }, { "epoch": 11.09102587630758, "grad_norm": 0.0006208583363331854, "learning_rate": 4.920728410350129e-06, "loss": 0.0001, "num_input_tokens_seen": 130356704, "step": 60435 }, { "epoch": 11.091943475867131, "grad_norm": 0.0012293695472180843, "learning_rate": 4.919927755338879e-06, "loss": 0.0, "num_input_tokens_seen": 130367328, "step": 60440 }, { "epoch": 11.092861075426685, "grad_norm": 16.98436737060547, "learning_rate": 4.919127102381359e-06, "loss": 0.0796, "num_input_tokens_seen": 130376960, "step": 60445 }, { "epoch": 11.093778674986236, "grad_norm": 0.004883036948740482, "learning_rate": 4.918326451498103e-06, "loss": 0.0001, "num_input_tokens_seen": 130388192, "step": 60450 }, { "epoch": 11.094696274545788, "grad_norm": 0.0004989632871001959, "learning_rate": 4.917525802709645e-06, "loss": 0.0, "num_input_tokens_seen": 130399104, "step": 60455 }, { "epoch": 11.095613874105341, "grad_norm": 0.001236779149621725, "learning_rate": 4.916725156036525e-06, "loss": 0.0159, "num_input_tokens_seen": 130409952, "step": 60460 }, { "epoch": 11.096531473664893, "grad_norm": 0.012351933866739273, "learning_rate": 4.9159245114992746e-06, "loss": 0.0001, "num_input_tokens_seen": 130421088, "step": 60465 }, { "epoch": 11.097449073224444, "grad_norm": 0.0018879385897889733, "learning_rate": 4.915123869118431e-06, "loss": 0.0, "num_input_tokens_seen": 130431968, "step": 60470 }, { "epoch": 11.098366672783998, "grad_norm": 0.000771049119066447, "learning_rate": 4.914323228914526e-06, "loss": 0.0072, "num_input_tokens_seen": 130441952, "step": 60475 }, { "epoch": 11.09928427234355, "grad_norm": 26.083703994750977, "learning_rate": 4.913522590908099e-06, "loss": 0.031, "num_input_tokens_seen": 130452288, "step": 60480 }, { "epoch": 11.100201871903101, "grad_norm": 0.02645668387413025, "learning_rate": 4.912721955119685e-06, "loss": 0.0, "num_input_tokens_seen": 130462880, "step": 60485 }, { "epoch": 11.101119471462654, "grad_norm": 0.3286276161670685, "learning_rate": 4.911921321569814e-06, "loss": 0.0006, "num_input_tokens_seen": 130474528, "step": 60490 }, { "epoch": 11.102037071022206, "grad_norm": 0.0006780885742045939, "learning_rate": 4.911120690279028e-06, "loss": 0.0001, "num_input_tokens_seen": 130485504, "step": 60495 }, { "epoch": 11.102954670581758, "grad_norm": 0.002413054695352912, "learning_rate": 4.910320061267857e-06, "loss": 0.0002, "num_input_tokens_seen": 130496576, "step": 60500 }, { "epoch": 11.103872270141311, "grad_norm": 0.0022147735580801964, "learning_rate": 4.909519434556837e-06, "loss": 0.0006, "num_input_tokens_seen": 130508768, "step": 60505 }, { "epoch": 11.104789869700863, "grad_norm": 0.3826749622821808, "learning_rate": 4.908718810166504e-06, "loss": 0.0002, "num_input_tokens_seen": 130520832, "step": 60510 }, { "epoch": 11.105707469260414, "grad_norm": 32.93852996826172, "learning_rate": 4.907918188117393e-06, "loss": 0.1128, "num_input_tokens_seen": 130530528, "step": 60515 }, { "epoch": 11.106625068819968, "grad_norm": 0.00653001107275486, "learning_rate": 4.907117568430038e-06, "loss": 0.033, "num_input_tokens_seen": 130540192, "step": 60520 }, { "epoch": 11.10754266837952, "grad_norm": 0.07721801102161407, "learning_rate": 4.906316951124971e-06, "loss": 0.0002, "num_input_tokens_seen": 130551264, "step": 60525 }, { "epoch": 11.108460267939071, "grad_norm": 0.20669184625148773, "learning_rate": 4.9055163362227305e-06, "loss": 0.0002, "num_input_tokens_seen": 130562528, "step": 60530 }, { "epoch": 11.109377867498624, "grad_norm": 0.005480916239321232, "learning_rate": 4.90471572374385e-06, "loss": 0.0006, "num_input_tokens_seen": 130572512, "step": 60535 }, { "epoch": 11.110295467058176, "grad_norm": 0.01756439357995987, "learning_rate": 4.903915113708862e-06, "loss": 0.0038, "num_input_tokens_seen": 130583680, "step": 60540 }, { "epoch": 11.111213066617728, "grad_norm": 0.03148039057850838, "learning_rate": 4.903114506138304e-06, "loss": 0.0002, "num_input_tokens_seen": 130594112, "step": 60545 }, { "epoch": 11.112130666177281, "grad_norm": 209.6343994140625, "learning_rate": 4.902313901052709e-06, "loss": 0.0675, "num_input_tokens_seen": 130605152, "step": 60550 }, { "epoch": 11.113048265736833, "grad_norm": 0.022620607167482376, "learning_rate": 4.90151329847261e-06, "loss": 0.0131, "num_input_tokens_seen": 130615872, "step": 60555 }, { "epoch": 11.113965865296384, "grad_norm": 3.2910025119781494, "learning_rate": 4.900712698418541e-06, "loss": 0.0016, "num_input_tokens_seen": 130626304, "step": 60560 }, { "epoch": 11.114883464855938, "grad_norm": 0.0009856083197519183, "learning_rate": 4.899912100911039e-06, "loss": 0.0, "num_input_tokens_seen": 130638400, "step": 60565 }, { "epoch": 11.11580106441549, "grad_norm": 0.011256183497607708, "learning_rate": 4.899111505970637e-06, "loss": 0.0001, "num_input_tokens_seen": 130648352, "step": 60570 }, { "epoch": 11.11671866397504, "grad_norm": 97.3064956665039, "learning_rate": 4.898310913617866e-06, "loss": 0.0192, "num_input_tokens_seen": 130658240, "step": 60575 }, { "epoch": 11.117636263534594, "grad_norm": 0.002728769788518548, "learning_rate": 4.897510323873264e-06, "loss": 0.0003, "num_input_tokens_seen": 130668800, "step": 60580 }, { "epoch": 11.118553863094146, "grad_norm": 0.035456519573926926, "learning_rate": 4.896709736757365e-06, "loss": 0.0001, "num_input_tokens_seen": 130679328, "step": 60585 }, { "epoch": 11.119471462653697, "grad_norm": 0.023774169385433197, "learning_rate": 4.8959091522906985e-06, "loss": 0.0001, "num_input_tokens_seen": 130689152, "step": 60590 }, { "epoch": 11.12038906221325, "grad_norm": 0.0025088482070714235, "learning_rate": 4.895108570493802e-06, "loss": 0.0002, "num_input_tokens_seen": 130700000, "step": 60595 }, { "epoch": 11.121306661772802, "grad_norm": 0.014357144013047218, "learning_rate": 4.894307991387209e-06, "loss": 0.0002, "num_input_tokens_seen": 130710976, "step": 60600 }, { "epoch": 11.122224261332354, "grad_norm": 0.01453031413257122, "learning_rate": 4.893507414991452e-06, "loss": 0.0001, "num_input_tokens_seen": 130721312, "step": 60605 }, { "epoch": 11.123141860891907, "grad_norm": 0.012362232431769371, "learning_rate": 4.892706841327063e-06, "loss": 0.0001, "num_input_tokens_seen": 130731520, "step": 60610 }, { "epoch": 11.124059460451459, "grad_norm": 0.0014302738709375262, "learning_rate": 4.891906270414578e-06, "loss": 0.0, "num_input_tokens_seen": 130741184, "step": 60615 }, { "epoch": 11.12497706001101, "grad_norm": 0.019780293107032776, "learning_rate": 4.891105702274531e-06, "loss": 0.0, "num_input_tokens_seen": 130751456, "step": 60620 }, { "epoch": 11.125894659570564, "grad_norm": 0.01580498181283474, "learning_rate": 4.890305136927453e-06, "loss": 0.0001, "num_input_tokens_seen": 130763168, "step": 60625 }, { "epoch": 11.126812259130116, "grad_norm": 0.0014623907627537847, "learning_rate": 4.889504574393877e-06, "loss": 0.0011, "num_input_tokens_seen": 130773984, "step": 60630 }, { "epoch": 11.127729858689667, "grad_norm": 0.002569687319919467, "learning_rate": 4.88870401469434e-06, "loss": 0.0001, "num_input_tokens_seen": 130784352, "step": 60635 }, { "epoch": 11.12864745824922, "grad_norm": 0.0007382654584944248, "learning_rate": 4.88790345784937e-06, "loss": 0.0004, "num_input_tokens_seen": 130796320, "step": 60640 }, { "epoch": 11.129565057808772, "grad_norm": 0.0006627857219427824, "learning_rate": 4.887102903879505e-06, "loss": 0.0003, "num_input_tokens_seen": 130808224, "step": 60645 }, { "epoch": 11.130482657368324, "grad_norm": 0.0024450107011944056, "learning_rate": 4.886302352805274e-06, "loss": 0.0, "num_input_tokens_seen": 130817344, "step": 60650 }, { "epoch": 11.131400256927877, "grad_norm": 0.0041405437514185905, "learning_rate": 4.885501804647212e-06, "loss": 0.0008, "num_input_tokens_seen": 130826592, "step": 60655 }, { "epoch": 11.132317856487429, "grad_norm": 0.000559068750590086, "learning_rate": 4.8847012594258505e-06, "loss": 0.0, "num_input_tokens_seen": 130836672, "step": 60660 }, { "epoch": 11.13323545604698, "grad_norm": 0.41134336590766907, "learning_rate": 4.883900717161724e-06, "loss": 0.0003, "num_input_tokens_seen": 130847744, "step": 60665 }, { "epoch": 11.134153055606534, "grad_norm": 0.0008838878129608929, "learning_rate": 4.883100177875364e-06, "loss": 0.0, "num_input_tokens_seen": 130859808, "step": 60670 }, { "epoch": 11.135070655166086, "grad_norm": 0.0030572579707950354, "learning_rate": 4.882299641587301e-06, "loss": 0.0001, "num_input_tokens_seen": 130870464, "step": 60675 }, { "epoch": 11.135988254725637, "grad_norm": 0.0017596217803657055, "learning_rate": 4.881499108318072e-06, "loss": 0.0, "num_input_tokens_seen": 130880672, "step": 60680 }, { "epoch": 11.13690585428519, "grad_norm": 0.007840766571462154, "learning_rate": 4.880698578088209e-06, "loss": 0.0, "num_input_tokens_seen": 130891136, "step": 60685 }, { "epoch": 11.137823453844742, "grad_norm": 0.003610461950302124, "learning_rate": 4.879898050918238e-06, "loss": 0.0001, "num_input_tokens_seen": 130902208, "step": 60690 }, { "epoch": 11.138741053404294, "grad_norm": 0.0004518509085755795, "learning_rate": 4.879097526828699e-06, "loss": 0.0001, "num_input_tokens_seen": 130913152, "step": 60695 }, { "epoch": 11.139658652963847, "grad_norm": 0.10796353965997696, "learning_rate": 4.878297005840121e-06, "loss": 0.0, "num_input_tokens_seen": 130924288, "step": 60700 }, { "epoch": 11.140576252523399, "grad_norm": 0.2920324504375458, "learning_rate": 4.877496487973036e-06, "loss": 0.1194, "num_input_tokens_seen": 130934304, "step": 60705 }, { "epoch": 11.14149385208295, "grad_norm": 0.00629260390996933, "learning_rate": 4.876695973247974e-06, "loss": 0.2063, "num_input_tokens_seen": 130944352, "step": 60710 }, { "epoch": 11.142411451642504, "grad_norm": 0.0024991200771182775, "learning_rate": 4.875895461685471e-06, "loss": 0.0001, "num_input_tokens_seen": 130955872, "step": 60715 }, { "epoch": 11.143329051202056, "grad_norm": 0.0005975772510282695, "learning_rate": 4.875094953306058e-06, "loss": 0.0, "num_input_tokens_seen": 130966144, "step": 60720 }, { "epoch": 11.144246650761607, "grad_norm": 0.005611027590930462, "learning_rate": 4.874294448130264e-06, "loss": 0.0, "num_input_tokens_seen": 130977600, "step": 60725 }, { "epoch": 11.14516425032116, "grad_norm": 0.06465566903352737, "learning_rate": 4.873493946178624e-06, "loss": 0.0001, "num_input_tokens_seen": 130987168, "step": 60730 }, { "epoch": 11.146081849880712, "grad_norm": 0.0005173741956241429, "learning_rate": 4.872693447471667e-06, "loss": 0.0, "num_input_tokens_seen": 130999456, "step": 60735 }, { "epoch": 11.146999449440264, "grad_norm": 0.006083456799387932, "learning_rate": 4.871892952029928e-06, "loss": 0.0674, "num_input_tokens_seen": 131010336, "step": 60740 }, { "epoch": 11.147917048999817, "grad_norm": 0.007127329707145691, "learning_rate": 4.8710924598739336e-06, "loss": 0.0001, "num_input_tokens_seen": 131021472, "step": 60745 }, { "epoch": 11.148834648559369, "grad_norm": 0.0017928506713360548, "learning_rate": 4.87029197102422e-06, "loss": 0.0001, "num_input_tokens_seen": 131031936, "step": 60750 }, { "epoch": 11.14975224811892, "grad_norm": 0.0013821233296766877, "learning_rate": 4.869491485501314e-06, "loss": 0.1971, "num_input_tokens_seen": 131043808, "step": 60755 }, { "epoch": 11.150669847678474, "grad_norm": 0.0024674127344042063, "learning_rate": 4.86869100332575e-06, "loss": 0.0, "num_input_tokens_seen": 131055424, "step": 60760 }, { "epoch": 11.151587447238025, "grad_norm": 0.0007417192682623863, "learning_rate": 4.867890524518059e-06, "loss": 0.0158, "num_input_tokens_seen": 131066368, "step": 60765 }, { "epoch": 11.152505046797577, "grad_norm": 0.00609730277210474, "learning_rate": 4.867090049098772e-06, "loss": 0.0, "num_input_tokens_seen": 131077760, "step": 60770 }, { "epoch": 11.15342264635713, "grad_norm": 0.00032649890636093915, "learning_rate": 4.866289577088416e-06, "loss": 0.0001, "num_input_tokens_seen": 131088096, "step": 60775 }, { "epoch": 11.154340245916682, "grad_norm": 0.007732986938208342, "learning_rate": 4.865489108507529e-06, "loss": 0.0, "num_input_tokens_seen": 131098080, "step": 60780 }, { "epoch": 11.155257845476234, "grad_norm": 0.0014790318673476577, "learning_rate": 4.864688643376636e-06, "loss": 0.0018, "num_input_tokens_seen": 131108192, "step": 60785 }, { "epoch": 11.156175445035787, "grad_norm": 0.0012937497813254595, "learning_rate": 4.86388818171627e-06, "loss": 0.0001, "num_input_tokens_seen": 131119424, "step": 60790 }, { "epoch": 11.157093044595339, "grad_norm": 0.001553399139083922, "learning_rate": 4.863087723546959e-06, "loss": 0.0, "num_input_tokens_seen": 131130080, "step": 60795 }, { "epoch": 11.15801064415489, "grad_norm": 0.0004014412115793675, "learning_rate": 4.862287268889239e-06, "loss": 0.0, "num_input_tokens_seen": 131141152, "step": 60800 }, { "epoch": 11.158928243714444, "grad_norm": 0.0013669438194483519, "learning_rate": 4.861486817763636e-06, "loss": 0.0001, "num_input_tokens_seen": 131151872, "step": 60805 }, { "epoch": 11.159845843273995, "grad_norm": 0.07310879230499268, "learning_rate": 4.860686370190679e-06, "loss": 0.0, "num_input_tokens_seen": 131162720, "step": 60810 }, { "epoch": 11.160763442833547, "grad_norm": 0.0009607644169591367, "learning_rate": 4.859885926190904e-06, "loss": 0.0004, "num_input_tokens_seen": 131173376, "step": 60815 }, { "epoch": 11.1616810423931, "grad_norm": 0.0034792530350387096, "learning_rate": 4.859085485784837e-06, "loss": 0.0001, "num_input_tokens_seen": 131185056, "step": 60820 }, { "epoch": 11.162598641952652, "grad_norm": 0.0023271567188203335, "learning_rate": 4.858285048993007e-06, "loss": 0.0, "num_input_tokens_seen": 131194624, "step": 60825 }, { "epoch": 11.163516241512204, "grad_norm": 0.0013642035191878676, "learning_rate": 4.857484615835948e-06, "loss": 0.0, "num_input_tokens_seen": 131204160, "step": 60830 }, { "epoch": 11.164433841071757, "grad_norm": 0.0021921296138316393, "learning_rate": 4.856684186334188e-06, "loss": 0.0, "num_input_tokens_seen": 131214912, "step": 60835 }, { "epoch": 11.165351440631309, "grad_norm": 0.0011141400318592787, "learning_rate": 4.855883760508256e-06, "loss": 0.0, "num_input_tokens_seen": 131225216, "step": 60840 }, { "epoch": 11.16626904019086, "grad_norm": 0.0005174042889848351, "learning_rate": 4.855083338378682e-06, "loss": 0.0, "num_input_tokens_seen": 131234656, "step": 60845 }, { "epoch": 11.167186639750414, "grad_norm": 0.0018391561461612582, "learning_rate": 4.854282919965994e-06, "loss": 0.0001, "num_input_tokens_seen": 131245344, "step": 60850 }, { "epoch": 11.168104239309965, "grad_norm": 0.000986538827419281, "learning_rate": 4.853482505290726e-06, "loss": 0.0, "num_input_tokens_seen": 131256192, "step": 60855 }, { "epoch": 11.169021838869517, "grad_norm": 0.0013266880996525288, "learning_rate": 4.852682094373403e-06, "loss": 0.0005, "num_input_tokens_seen": 131267552, "step": 60860 }, { "epoch": 11.16993943842907, "grad_norm": 0.04831346496939659, "learning_rate": 4.851881687234557e-06, "loss": 0.0, "num_input_tokens_seen": 131278912, "step": 60865 }, { "epoch": 11.170857037988622, "grad_norm": 0.0004980250378139317, "learning_rate": 4.851081283894717e-06, "loss": 0.0, "num_input_tokens_seen": 131289984, "step": 60870 }, { "epoch": 11.171774637548173, "grad_norm": 0.0006275210762396455, "learning_rate": 4.8502808843744085e-06, "loss": 0.033, "num_input_tokens_seen": 131301024, "step": 60875 }, { "epoch": 11.172692237107727, "grad_norm": 0.0029615077655762434, "learning_rate": 4.849480488694164e-06, "loss": 0.0, "num_input_tokens_seen": 131311584, "step": 60880 }, { "epoch": 11.173609836667278, "grad_norm": 0.006841977126896381, "learning_rate": 4.848680096874514e-06, "loss": 0.0019, "num_input_tokens_seen": 131322048, "step": 60885 }, { "epoch": 11.17452743622683, "grad_norm": 0.0010099188657477498, "learning_rate": 4.8478797089359836e-06, "loss": 0.0002, "num_input_tokens_seen": 131332672, "step": 60890 }, { "epoch": 11.175445035786383, "grad_norm": 0.0006425969186238945, "learning_rate": 4.8470793248991014e-06, "loss": 0.0703, "num_input_tokens_seen": 131344992, "step": 60895 }, { "epoch": 11.176362635345935, "grad_norm": 0.0026997122913599014, "learning_rate": 4.846278944784399e-06, "loss": 0.0001, "num_input_tokens_seen": 131354592, "step": 60900 }, { "epoch": 11.177280234905487, "grad_norm": 0.01707535795867443, "learning_rate": 4.845478568612404e-06, "loss": 0.0001, "num_input_tokens_seen": 131365440, "step": 60905 }, { "epoch": 11.17819783446504, "grad_norm": 0.006642656400799751, "learning_rate": 4.844678196403641e-06, "loss": 0.0004, "num_input_tokens_seen": 131375680, "step": 60910 }, { "epoch": 11.179115434024592, "grad_norm": 0.014944893307983875, "learning_rate": 4.843877828178645e-06, "loss": 0.0, "num_input_tokens_seen": 131385376, "step": 60915 }, { "epoch": 11.180033033584143, "grad_norm": 112.74595642089844, "learning_rate": 4.84307746395794e-06, "loss": 0.1503, "num_input_tokens_seen": 131395488, "step": 60920 }, { "epoch": 11.180950633143697, "grad_norm": 0.02310340292751789, "learning_rate": 4.842277103762055e-06, "loss": 0.0, "num_input_tokens_seen": 131406016, "step": 60925 }, { "epoch": 11.181868232703248, "grad_norm": 0.07851988077163696, "learning_rate": 4.841476747611516e-06, "loss": 0.0001, "num_input_tokens_seen": 131416640, "step": 60930 }, { "epoch": 11.1827858322628, "grad_norm": 0.0004739048017654568, "learning_rate": 4.840676395526855e-06, "loss": 0.0244, "num_input_tokens_seen": 131426752, "step": 60935 }, { "epoch": 11.183703431822353, "grad_norm": 0.0008727897657081485, "learning_rate": 4.839876047528597e-06, "loss": 0.065, "num_input_tokens_seen": 131435712, "step": 60940 }, { "epoch": 11.184621031381905, "grad_norm": 0.002996902447193861, "learning_rate": 4.839075703637268e-06, "loss": 0.0338, "num_input_tokens_seen": 131446144, "step": 60945 }, { "epoch": 11.185538630941457, "grad_norm": 0.0017359640914946795, "learning_rate": 4.838275363873401e-06, "loss": 0.0003, "num_input_tokens_seen": 131456160, "step": 60950 }, { "epoch": 11.18645623050101, "grad_norm": 0.03254431113600731, "learning_rate": 4.837475028257519e-06, "loss": 0.0001, "num_input_tokens_seen": 131465728, "step": 60955 }, { "epoch": 11.187373830060562, "grad_norm": 0.005036819726228714, "learning_rate": 4.83667469681015e-06, "loss": 0.0001, "num_input_tokens_seen": 131476448, "step": 60960 }, { "epoch": 11.188291429620113, "grad_norm": 0.03481313958764076, "learning_rate": 4.835874369551823e-06, "loss": 0.0009, "num_input_tokens_seen": 131487392, "step": 60965 }, { "epoch": 11.189209029179667, "grad_norm": 0.7583267092704773, "learning_rate": 4.835074046503064e-06, "loss": 0.0006, "num_input_tokens_seen": 131498496, "step": 60970 }, { "epoch": 11.190126628739218, "grad_norm": 0.0010557726491242647, "learning_rate": 4.834273727684399e-06, "loss": 0.0524, "num_input_tokens_seen": 131508512, "step": 60975 }, { "epoch": 11.19104422829877, "grad_norm": 0.029312986880540848, "learning_rate": 4.8334734131163565e-06, "loss": 0.0023, "num_input_tokens_seen": 131519968, "step": 60980 }, { "epoch": 11.191961827858323, "grad_norm": 246.37466430664062, "learning_rate": 4.832673102819463e-06, "loss": 0.0225, "num_input_tokens_seen": 131530816, "step": 60985 }, { "epoch": 11.192879427417875, "grad_norm": 0.0005929625476710498, "learning_rate": 4.831872796814246e-06, "loss": 0.0, "num_input_tokens_seen": 131540672, "step": 60990 }, { "epoch": 11.193797026977427, "grad_norm": 0.0011337405303493142, "learning_rate": 4.831072495121228e-06, "loss": 0.0, "num_input_tokens_seen": 131550400, "step": 60995 }, { "epoch": 11.19471462653698, "grad_norm": 0.00047419575275853276, "learning_rate": 4.830272197760942e-06, "loss": 0.0, "num_input_tokens_seen": 131561856, "step": 61000 }, { "epoch": 11.195632226096532, "grad_norm": 0.003717175219208002, "learning_rate": 4.829471904753911e-06, "loss": 0.0329, "num_input_tokens_seen": 131571040, "step": 61005 }, { "epoch": 11.196549825656083, "grad_norm": 0.0008546800818294287, "learning_rate": 4.8286716161206586e-06, "loss": 0.0, "num_input_tokens_seen": 131582304, "step": 61010 }, { "epoch": 11.197467425215637, "grad_norm": 0.000677785777952522, "learning_rate": 4.827871331881716e-06, "loss": 0.1546, "num_input_tokens_seen": 131592128, "step": 61015 }, { "epoch": 11.198385024775188, "grad_norm": 0.0011188272619619966, "learning_rate": 4.827071052057607e-06, "loss": 0.0, "num_input_tokens_seen": 131602336, "step": 61020 }, { "epoch": 11.19930262433474, "grad_norm": 0.0008331191493198276, "learning_rate": 4.826270776668857e-06, "loss": 0.0017, "num_input_tokens_seen": 131612608, "step": 61025 }, { "epoch": 11.200220223894293, "grad_norm": 0.0014169912319630384, "learning_rate": 4.825470505735991e-06, "loss": 0.0, "num_input_tokens_seen": 131622720, "step": 61030 }, { "epoch": 11.201137823453845, "grad_norm": 0.0016395014245063066, "learning_rate": 4.824670239279538e-06, "loss": 0.1067, "num_input_tokens_seen": 131634688, "step": 61035 }, { "epoch": 11.202055423013396, "grad_norm": 0.00035978847881779075, "learning_rate": 4.823869977320021e-06, "loss": 0.0, "num_input_tokens_seen": 131645120, "step": 61040 }, { "epoch": 11.20297302257295, "grad_norm": 0.003938662353903055, "learning_rate": 4.8230697198779645e-06, "loss": 0.0001, "num_input_tokens_seen": 131656896, "step": 61045 }, { "epoch": 11.203890622132501, "grad_norm": 0.002551518613472581, "learning_rate": 4.822269466973898e-06, "loss": 0.0009, "num_input_tokens_seen": 131669344, "step": 61050 }, { "epoch": 11.204808221692053, "grad_norm": 0.0007572330650873482, "learning_rate": 4.821469218628344e-06, "loss": 0.0, "num_input_tokens_seen": 131680608, "step": 61055 }, { "epoch": 11.205725821251606, "grad_norm": 0.0010122249368578196, "learning_rate": 4.820668974861827e-06, "loss": 0.0, "num_input_tokens_seen": 131691968, "step": 61060 }, { "epoch": 11.206643420811158, "grad_norm": 0.029553404077887535, "learning_rate": 4.819868735694873e-06, "loss": 0.0, "num_input_tokens_seen": 131702336, "step": 61065 }, { "epoch": 11.20756102037071, "grad_norm": 0.004425814375281334, "learning_rate": 4.819068501148006e-06, "loss": 0.0014, "num_input_tokens_seen": 131712992, "step": 61070 }, { "epoch": 11.208478619930263, "grad_norm": 0.0009235508623532951, "learning_rate": 4.818268271241752e-06, "loss": 0.0, "num_input_tokens_seen": 131723360, "step": 61075 }, { "epoch": 11.209396219489815, "grad_norm": 0.0006689112051390111, "learning_rate": 4.817468045996635e-06, "loss": 0.0, "num_input_tokens_seen": 131735360, "step": 61080 }, { "epoch": 11.210313819049366, "grad_norm": 0.010680056177079678, "learning_rate": 4.816667825433181e-06, "loss": 0.0, "num_input_tokens_seen": 131746048, "step": 61085 }, { "epoch": 11.21123141860892, "grad_norm": 0.0353899821639061, "learning_rate": 4.815867609571913e-06, "loss": 0.0, "num_input_tokens_seen": 131757120, "step": 61090 }, { "epoch": 11.212149018168471, "grad_norm": 0.016943106427788734, "learning_rate": 4.815067398433353e-06, "loss": 0.0004, "num_input_tokens_seen": 131767744, "step": 61095 }, { "epoch": 11.213066617728023, "grad_norm": 0.028973953798413277, "learning_rate": 4.8142671920380295e-06, "loss": 0.0001, "num_input_tokens_seen": 131777856, "step": 61100 }, { "epoch": 11.213984217287576, "grad_norm": 0.005239717196673155, "learning_rate": 4.813466990406465e-06, "loss": 0.0, "num_input_tokens_seen": 131788832, "step": 61105 }, { "epoch": 11.214901816847128, "grad_norm": 0.5731867551803589, "learning_rate": 4.812666793559183e-06, "loss": 0.0004, "num_input_tokens_seen": 131798496, "step": 61110 }, { "epoch": 11.21581941640668, "grad_norm": 0.0008598686545155942, "learning_rate": 4.811866601516705e-06, "loss": 0.0001, "num_input_tokens_seen": 131809792, "step": 61115 }, { "epoch": 11.216737015966233, "grad_norm": 0.0005574347451329231, "learning_rate": 4.81106641429956e-06, "loss": 0.0, "num_input_tokens_seen": 131819904, "step": 61120 }, { "epoch": 11.217654615525785, "grad_norm": 0.01661718264222145, "learning_rate": 4.810266231928268e-06, "loss": 0.0803, "num_input_tokens_seen": 131831232, "step": 61125 }, { "epoch": 11.218572215085336, "grad_norm": 0.0007126473938114941, "learning_rate": 4.8094660544233515e-06, "loss": 0.0, "num_input_tokens_seen": 131841824, "step": 61130 }, { "epoch": 11.21948981464489, "grad_norm": 0.002094283467158675, "learning_rate": 4.808665881805337e-06, "loss": 0.0, "num_input_tokens_seen": 131852768, "step": 61135 }, { "epoch": 11.220407414204441, "grad_norm": 0.0004976668860763311, "learning_rate": 4.807865714094747e-06, "loss": 0.0, "num_input_tokens_seen": 131862592, "step": 61140 }, { "epoch": 11.221325013763993, "grad_norm": 0.0015636355383321643, "learning_rate": 4.8070655513121005e-06, "loss": 0.0004, "num_input_tokens_seen": 131873152, "step": 61145 }, { "epoch": 11.222242613323546, "grad_norm": 0.0005909997271373868, "learning_rate": 4.806265393477926e-06, "loss": 0.0, "num_input_tokens_seen": 131883872, "step": 61150 }, { "epoch": 11.223160212883098, "grad_norm": 0.0012700565857812762, "learning_rate": 4.805465240612744e-06, "loss": 0.0, "num_input_tokens_seen": 131895136, "step": 61155 }, { "epoch": 11.22407781244265, "grad_norm": 0.0015547380317002535, "learning_rate": 4.804665092737077e-06, "loss": 0.0001, "num_input_tokens_seen": 131906912, "step": 61160 }, { "epoch": 11.224995412002203, "grad_norm": 465.7382507324219, "learning_rate": 4.803864949871447e-06, "loss": 0.0207, "num_input_tokens_seen": 131918176, "step": 61165 }, { "epoch": 11.225913011561754, "grad_norm": 0.013627025298774242, "learning_rate": 4.803064812036376e-06, "loss": 0.0, "num_input_tokens_seen": 131928640, "step": 61170 }, { "epoch": 11.226830611121306, "grad_norm": 0.0007130228914320469, "learning_rate": 4.802264679252389e-06, "loss": 0.0001, "num_input_tokens_seen": 131940320, "step": 61175 }, { "epoch": 11.22774821068086, "grad_norm": 0.0009122883784584701, "learning_rate": 4.801464551540005e-06, "loss": 0.0012, "num_input_tokens_seen": 131951264, "step": 61180 }, { "epoch": 11.228665810240411, "grad_norm": 0.0009503355249762535, "learning_rate": 4.80066442891975e-06, "loss": 0.0, "num_input_tokens_seen": 131961696, "step": 61185 }, { "epoch": 11.229583409799963, "grad_norm": 0.00035204889718443155, "learning_rate": 4.799864311412143e-06, "loss": 0.0, "num_input_tokens_seen": 131972992, "step": 61190 }, { "epoch": 11.230501009359516, "grad_norm": 0.0006530338432639837, "learning_rate": 4.799064199037704e-06, "loss": 0.0001, "num_input_tokens_seen": 131984256, "step": 61195 }, { "epoch": 11.231418608919068, "grad_norm": 0.0005122612928971648, "learning_rate": 4.798264091816958e-06, "loss": 0.0051, "num_input_tokens_seen": 131996000, "step": 61200 }, { "epoch": 11.23233620847862, "grad_norm": 0.2599175274372101, "learning_rate": 4.7974639897704255e-06, "loss": 0.0001, "num_input_tokens_seen": 132006688, "step": 61205 }, { "epoch": 11.233253808038173, "grad_norm": 0.003658723086118698, "learning_rate": 4.7966638929186285e-06, "loss": 0.0, "num_input_tokens_seen": 132016640, "step": 61210 }, { "epoch": 11.234171407597724, "grad_norm": 0.0017928684828802943, "learning_rate": 4.795863801282085e-06, "loss": 0.0, "num_input_tokens_seen": 132028608, "step": 61215 }, { "epoch": 11.235089007157276, "grad_norm": 0.0009043078171089292, "learning_rate": 4.795063714881321e-06, "loss": 0.0003, "num_input_tokens_seen": 132038976, "step": 61220 }, { "epoch": 11.23600660671683, "grad_norm": 0.0008173405076377094, "learning_rate": 4.794263633736856e-06, "loss": 0.0, "num_input_tokens_seen": 132049856, "step": 61225 }, { "epoch": 11.236924206276381, "grad_norm": 0.0005164733738638461, "learning_rate": 4.793463557869206e-06, "loss": 0.1938, "num_input_tokens_seen": 132060480, "step": 61230 }, { "epoch": 11.237841805835933, "grad_norm": 0.01436389610171318, "learning_rate": 4.7926634872988985e-06, "loss": 0.0, "num_input_tokens_seen": 132071392, "step": 61235 }, { "epoch": 11.238759405395486, "grad_norm": 0.0004786861827597022, "learning_rate": 4.791863422046452e-06, "loss": 0.0097, "num_input_tokens_seen": 132081376, "step": 61240 }, { "epoch": 11.239677004955038, "grad_norm": 0.0007880827179178596, "learning_rate": 4.791063362132386e-06, "loss": 0.0, "num_input_tokens_seen": 132091776, "step": 61245 }, { "epoch": 11.24059460451459, "grad_norm": 0.0009596726158633828, "learning_rate": 4.790263307577218e-06, "loss": 0.0032, "num_input_tokens_seen": 132101856, "step": 61250 }, { "epoch": 11.241512204074143, "grad_norm": 0.0007480800850316882, "learning_rate": 4.789463258401472e-06, "loss": 0.0014, "num_input_tokens_seen": 132113792, "step": 61255 }, { "epoch": 11.242429803633694, "grad_norm": 0.04936860874295235, "learning_rate": 4.7886632146256695e-06, "loss": 0.0001, "num_input_tokens_seen": 132125120, "step": 61260 }, { "epoch": 11.243347403193246, "grad_norm": 0.0015202248468995094, "learning_rate": 4.787863176270324e-06, "loss": 0.0001, "num_input_tokens_seen": 132136096, "step": 61265 }, { "epoch": 11.2442650027528, "grad_norm": 0.0007093247841112316, "learning_rate": 4.787063143355963e-06, "loss": 0.0002, "num_input_tokens_seen": 132146208, "step": 61270 }, { "epoch": 11.245182602312351, "grad_norm": 0.0013544816756621003, "learning_rate": 4.786263115903102e-06, "loss": 0.0, "num_input_tokens_seen": 132158080, "step": 61275 }, { "epoch": 11.246100201871903, "grad_norm": 0.04902786761522293, "learning_rate": 4.78546309393226e-06, "loss": 0.0, "num_input_tokens_seen": 132169952, "step": 61280 }, { "epoch": 11.247017801431456, "grad_norm": 0.0019046743400394917, "learning_rate": 4.784663077463957e-06, "loss": 0.0, "num_input_tokens_seen": 132180192, "step": 61285 }, { "epoch": 11.247935400991008, "grad_norm": 0.0006900840089656413, "learning_rate": 4.783863066518713e-06, "loss": 0.2439, "num_input_tokens_seen": 132193024, "step": 61290 }, { "epoch": 11.24885300055056, "grad_norm": 2.7217350006103516, "learning_rate": 4.783063061117045e-06, "loss": 0.0916, "num_input_tokens_seen": 132204320, "step": 61295 }, { "epoch": 11.249770600110113, "grad_norm": 0.001280025695450604, "learning_rate": 4.782263061279474e-06, "loss": 0.0, "num_input_tokens_seen": 132214816, "step": 61300 }, { "epoch": 11.250688199669664, "grad_norm": 0.000259736756561324, "learning_rate": 4.781463067026519e-06, "loss": 0.0045, "num_input_tokens_seen": 132225696, "step": 61305 }, { "epoch": 11.251605799229216, "grad_norm": 0.0030714794993400574, "learning_rate": 4.7806630783786965e-06, "loss": 0.0079, "num_input_tokens_seen": 132236768, "step": 61310 }, { "epoch": 11.25252339878877, "grad_norm": 0.0036186252254992723, "learning_rate": 4.779863095356525e-06, "loss": 0.0, "num_input_tokens_seen": 132247328, "step": 61315 }, { "epoch": 11.25344099834832, "grad_norm": 0.0006355619989335537, "learning_rate": 4.779063117980526e-06, "loss": 0.0001, "num_input_tokens_seen": 132257536, "step": 61320 }, { "epoch": 11.254358597907872, "grad_norm": 0.00022441583860199898, "learning_rate": 4.778263146271215e-06, "loss": 0.0, "num_input_tokens_seen": 132267808, "step": 61325 }, { "epoch": 11.255276197467426, "grad_norm": 0.0003832067013718188, "learning_rate": 4.777463180249111e-06, "loss": 0.0, "num_input_tokens_seen": 132278816, "step": 61330 }, { "epoch": 11.256193797026977, "grad_norm": 0.20565776526927948, "learning_rate": 4.77666321993473e-06, "loss": 0.0, "num_input_tokens_seen": 132289664, "step": 61335 }, { "epoch": 11.257111396586529, "grad_norm": 0.0009569903486408293, "learning_rate": 4.7758632653485925e-06, "loss": 0.0, "num_input_tokens_seen": 132301632, "step": 61340 }, { "epoch": 11.258028996146082, "grad_norm": 0.008031582459807396, "learning_rate": 4.7750633165112155e-06, "loss": 0.0001, "num_input_tokens_seen": 132312768, "step": 61345 }, { "epoch": 11.258946595705634, "grad_norm": 0.0003191465511918068, "learning_rate": 4.774263373443113e-06, "loss": 0.0, "num_input_tokens_seen": 132324640, "step": 61350 }, { "epoch": 11.259864195265186, "grad_norm": 0.0009367413586005569, "learning_rate": 4.773463436164807e-06, "loss": 0.0, "num_input_tokens_seen": 132334752, "step": 61355 }, { "epoch": 11.260781794824739, "grad_norm": 0.00038048753049224615, "learning_rate": 4.772663504696814e-06, "loss": 0.0, "num_input_tokens_seen": 132345408, "step": 61360 }, { "epoch": 11.26169939438429, "grad_norm": 1.4884541034698486, "learning_rate": 4.7718635790596465e-06, "loss": 0.0007, "num_input_tokens_seen": 132356224, "step": 61365 }, { "epoch": 11.262616993943842, "grad_norm": 0.005092072766274214, "learning_rate": 4.771063659273828e-06, "loss": 0.001, "num_input_tokens_seen": 132366784, "step": 61370 }, { "epoch": 11.263534593503396, "grad_norm": 0.001398906810209155, "learning_rate": 4.770263745359871e-06, "loss": 0.1036, "num_input_tokens_seen": 132378144, "step": 61375 }, { "epoch": 11.264452193062947, "grad_norm": 0.00225042924284935, "learning_rate": 4.769463837338293e-06, "loss": 0.0002, "num_input_tokens_seen": 132388608, "step": 61380 }, { "epoch": 11.265369792622499, "grad_norm": 0.005981180351227522, "learning_rate": 4.7686639352296085e-06, "loss": 0.0003, "num_input_tokens_seen": 132399840, "step": 61385 }, { "epoch": 11.266287392182052, "grad_norm": 0.00039856135845184326, "learning_rate": 4.7678640390543365e-06, "loss": 0.0, "num_input_tokens_seen": 132408832, "step": 61390 }, { "epoch": 11.267204991741604, "grad_norm": 0.00039069197373464704, "learning_rate": 4.767064148832993e-06, "loss": 0.0, "num_input_tokens_seen": 132418432, "step": 61395 }, { "epoch": 11.268122591301156, "grad_norm": 0.0005322813522070646, "learning_rate": 4.766264264586092e-06, "loss": 0.0, "num_input_tokens_seen": 132428928, "step": 61400 }, { "epoch": 11.269040190860709, "grad_norm": 0.0004623155400622636, "learning_rate": 4.765464386334151e-06, "loss": 0.0001, "num_input_tokens_seen": 132439584, "step": 61405 }, { "epoch": 11.26995779042026, "grad_norm": 0.00998840294778347, "learning_rate": 4.764664514097686e-06, "loss": 0.0, "num_input_tokens_seen": 132450272, "step": 61410 }, { "epoch": 11.270875389979812, "grad_norm": 9.99950122833252, "learning_rate": 4.763864647897208e-06, "loss": 0.0045, "num_input_tokens_seen": 132461408, "step": 61415 }, { "epoch": 11.271792989539366, "grad_norm": 0.05939026549458504, "learning_rate": 4.763064787753239e-06, "loss": 0.0005, "num_input_tokens_seen": 132470880, "step": 61420 }, { "epoch": 11.272710589098917, "grad_norm": 0.002666439628228545, "learning_rate": 4.7622649336862905e-06, "loss": 0.0, "num_input_tokens_seen": 132482496, "step": 61425 }, { "epoch": 11.273628188658469, "grad_norm": 0.0004955556942149997, "learning_rate": 4.761465085716877e-06, "loss": 0.0002, "num_input_tokens_seen": 132494400, "step": 61430 }, { "epoch": 11.274545788218022, "grad_norm": 0.0005920479306951165, "learning_rate": 4.760665243865514e-06, "loss": 0.0329, "num_input_tokens_seen": 132504832, "step": 61435 }, { "epoch": 11.275463387777574, "grad_norm": 0.0032091934699565172, "learning_rate": 4.759865408152718e-06, "loss": 0.0, "num_input_tokens_seen": 132514336, "step": 61440 }, { "epoch": 11.276380987337125, "grad_norm": 0.0043823509477078915, "learning_rate": 4.759065578599002e-06, "loss": 0.0, "num_input_tokens_seen": 132525088, "step": 61445 }, { "epoch": 11.277298586896679, "grad_norm": 0.0005783343804068863, "learning_rate": 4.758265755224878e-06, "loss": 0.1221, "num_input_tokens_seen": 132535680, "step": 61450 }, { "epoch": 11.27821618645623, "grad_norm": 0.0009002125007100403, "learning_rate": 4.757465938050866e-06, "loss": 0.0, "num_input_tokens_seen": 132547200, "step": 61455 }, { "epoch": 11.279133786015782, "grad_norm": 0.0003533049894031137, "learning_rate": 4.756666127097476e-06, "loss": 0.0001, "num_input_tokens_seen": 132556864, "step": 61460 }, { "epoch": 11.280051385575335, "grad_norm": 0.002146465238183737, "learning_rate": 4.7558663223852205e-06, "loss": 0.0001, "num_input_tokens_seen": 132567904, "step": 61465 }, { "epoch": 11.280968985134887, "grad_norm": 0.0006817277171649039, "learning_rate": 4.755066523934617e-06, "loss": 0.1501, "num_input_tokens_seen": 132577920, "step": 61470 }, { "epoch": 11.281886584694439, "grad_norm": 0.0015133782289922237, "learning_rate": 4.754266731766179e-06, "loss": 0.0, "num_input_tokens_seen": 132588896, "step": 61475 }, { "epoch": 11.282804184253992, "grad_norm": 0.0008845381089486182, "learning_rate": 4.753466945900417e-06, "loss": 0.0, "num_input_tokens_seen": 132599072, "step": 61480 }, { "epoch": 11.283721783813544, "grad_norm": 0.0004144182021263987, "learning_rate": 4.752667166357845e-06, "loss": 0.0, "num_input_tokens_seen": 132609408, "step": 61485 }, { "epoch": 11.284639383373095, "grad_norm": 0.001756371813826263, "learning_rate": 4.751867393158978e-06, "loss": 0.0, "num_input_tokens_seen": 132619584, "step": 61490 }, { "epoch": 11.285556982932649, "grad_norm": 0.0004887689719907939, "learning_rate": 4.751067626324328e-06, "loss": 0.0, "num_input_tokens_seen": 132630592, "step": 61495 }, { "epoch": 11.2864745824922, "grad_norm": 0.00235057738609612, "learning_rate": 4.750267865874406e-06, "loss": 0.0001, "num_input_tokens_seen": 132639936, "step": 61500 }, { "epoch": 11.287392182051752, "grad_norm": 0.5547659397125244, "learning_rate": 4.749468111829729e-06, "loss": 0.2066, "num_input_tokens_seen": 132651840, "step": 61505 }, { "epoch": 11.288309781611305, "grad_norm": 0.003267325460910797, "learning_rate": 4.748668364210805e-06, "loss": 0.0, "num_input_tokens_seen": 132662624, "step": 61510 }, { "epoch": 11.289227381170857, "grad_norm": 0.002074534771963954, "learning_rate": 4.747868623038148e-06, "loss": 0.0, "num_input_tokens_seen": 132673600, "step": 61515 }, { "epoch": 11.290144980730409, "grad_norm": 0.00032677844865247607, "learning_rate": 4.7470688883322695e-06, "loss": 0.0, "num_input_tokens_seen": 132685216, "step": 61520 }, { "epoch": 11.291062580289962, "grad_norm": 0.014202924445271492, "learning_rate": 4.7462691601136825e-06, "loss": 0.0, "num_input_tokens_seen": 132695616, "step": 61525 }, { "epoch": 11.291980179849514, "grad_norm": 0.2881779372692108, "learning_rate": 4.7454694384029e-06, "loss": 0.0003, "num_input_tokens_seen": 132705920, "step": 61530 }, { "epoch": 11.292897779409065, "grad_norm": 0.0041326722130179405, "learning_rate": 4.744669723220428e-06, "loss": 0.0001, "num_input_tokens_seen": 132716096, "step": 61535 }, { "epoch": 11.293815378968619, "grad_norm": 0.0024681752547621727, "learning_rate": 4.743870014586784e-06, "loss": 0.0001, "num_input_tokens_seen": 132726592, "step": 61540 }, { "epoch": 11.29473297852817, "grad_norm": 0.0006141049670986831, "learning_rate": 4.743070312522478e-06, "loss": 0.0, "num_input_tokens_seen": 132738016, "step": 61545 }, { "epoch": 11.295650578087722, "grad_norm": 0.0005880113458260894, "learning_rate": 4.742270617048018e-06, "loss": 0.0, "num_input_tokens_seen": 132748704, "step": 61550 }, { "epoch": 11.296568177647275, "grad_norm": 0.00048038255772553384, "learning_rate": 4.7414709281839195e-06, "loss": 0.0, "num_input_tokens_seen": 132758272, "step": 61555 }, { "epoch": 11.297485777206827, "grad_norm": 0.0003297058283351362, "learning_rate": 4.740671245950691e-06, "loss": 0.0, "num_input_tokens_seen": 132768544, "step": 61560 }, { "epoch": 11.298403376766379, "grad_norm": 0.00035756349097937346, "learning_rate": 4.739871570368842e-06, "loss": 0.007, "num_input_tokens_seen": 132779904, "step": 61565 }, { "epoch": 11.299320976325932, "grad_norm": 0.0003633054730016738, "learning_rate": 4.739071901458883e-06, "loss": 0.0001, "num_input_tokens_seen": 132789632, "step": 61570 }, { "epoch": 11.300238575885484, "grad_norm": 0.0010095887118950486, "learning_rate": 4.738272239241328e-06, "loss": 0.0, "num_input_tokens_seen": 132799520, "step": 61575 }, { "epoch": 11.301156175445035, "grad_norm": 0.00028328580083325505, "learning_rate": 4.737472583736683e-06, "loss": 0.0, "num_input_tokens_seen": 132809888, "step": 61580 }, { "epoch": 11.302073775004589, "grad_norm": 0.0010714699747040868, "learning_rate": 4.7366729349654585e-06, "loss": 0.0001, "num_input_tokens_seen": 132820672, "step": 61585 }, { "epoch": 11.30299137456414, "grad_norm": 0.0005558941047638655, "learning_rate": 4.735873292948167e-06, "loss": 0.0, "num_input_tokens_seen": 132832416, "step": 61590 }, { "epoch": 11.303908974123692, "grad_norm": 87.61970520019531, "learning_rate": 4.735073657705315e-06, "loss": 0.2224, "num_input_tokens_seen": 132843200, "step": 61595 }, { "epoch": 11.304826573683245, "grad_norm": 0.0003932981926482171, "learning_rate": 4.734274029257414e-06, "loss": 0.0, "num_input_tokens_seen": 132854272, "step": 61600 }, { "epoch": 11.305744173242797, "grad_norm": 0.0002772352017927915, "learning_rate": 4.733474407624972e-06, "loss": 0.1191, "num_input_tokens_seen": 132864416, "step": 61605 }, { "epoch": 11.306661772802348, "grad_norm": 0.033019114285707474, "learning_rate": 4.732674792828497e-06, "loss": 0.0, "num_input_tokens_seen": 132875328, "step": 61610 }, { "epoch": 11.307579372361902, "grad_norm": 0.0008117258548736572, "learning_rate": 4.7318751848885004e-06, "loss": 0.0002, "num_input_tokens_seen": 132886784, "step": 61615 }, { "epoch": 11.308496971921453, "grad_norm": 70.61688995361328, "learning_rate": 4.73107558382549e-06, "loss": 0.6083, "num_input_tokens_seen": 132897760, "step": 61620 }, { "epoch": 11.309414571481005, "grad_norm": 0.003004105994477868, "learning_rate": 4.730275989659974e-06, "loss": 0.0001, "num_input_tokens_seen": 132909664, "step": 61625 }, { "epoch": 11.310332171040558, "grad_norm": 0.007969210855662823, "learning_rate": 4.729476402412461e-06, "loss": 0.0, "num_input_tokens_seen": 132920384, "step": 61630 }, { "epoch": 11.31124977060011, "grad_norm": 0.0007728686323389411, "learning_rate": 4.728676822103457e-06, "loss": 0.0001, "num_input_tokens_seen": 132931168, "step": 61635 }, { "epoch": 11.312167370159662, "grad_norm": 0.2714694142341614, "learning_rate": 4.7278772487534745e-06, "loss": 0.0011, "num_input_tokens_seen": 132940992, "step": 61640 }, { "epoch": 11.313084969719215, "grad_norm": 0.0015623659128323197, "learning_rate": 4.727077682383018e-06, "loss": 0.0, "num_input_tokens_seen": 132952352, "step": 61645 }, { "epoch": 11.314002569278767, "grad_norm": 0.03034684993326664, "learning_rate": 4.7262781230125966e-06, "loss": 0.0, "num_input_tokens_seen": 132963968, "step": 61650 }, { "epoch": 11.314920168838318, "grad_norm": 0.06782715767621994, "learning_rate": 4.725478570662715e-06, "loss": 0.0241, "num_input_tokens_seen": 132975360, "step": 61655 }, { "epoch": 11.315837768397872, "grad_norm": 0.005461395252496004, "learning_rate": 4.724679025353885e-06, "loss": 0.0, "num_input_tokens_seen": 132987232, "step": 61660 }, { "epoch": 11.316755367957423, "grad_norm": 0.030950255692005157, "learning_rate": 4.7238794871066105e-06, "loss": 0.0, "num_input_tokens_seen": 132999136, "step": 61665 }, { "epoch": 11.317672967516975, "grad_norm": 123.62004089355469, "learning_rate": 4.723079955941397e-06, "loss": 0.0361, "num_input_tokens_seen": 133009664, "step": 61670 }, { "epoch": 11.318590567076528, "grad_norm": 0.003840753808617592, "learning_rate": 4.7222804318787555e-06, "loss": 0.0001, "num_input_tokens_seen": 133020320, "step": 61675 }, { "epoch": 11.31950816663608, "grad_norm": 0.001662117661908269, "learning_rate": 4.7214809149391914e-06, "loss": 0.0003, "num_input_tokens_seen": 133030528, "step": 61680 }, { "epoch": 11.320425766195632, "grad_norm": 0.0016981029184535146, "learning_rate": 4.720681405143207e-06, "loss": 0.0, "num_input_tokens_seen": 133041664, "step": 61685 }, { "epoch": 11.321343365755185, "grad_norm": 0.0008374886238016188, "learning_rate": 4.719881902511315e-06, "loss": 0.0002, "num_input_tokens_seen": 133052736, "step": 61690 }, { "epoch": 11.322260965314737, "grad_norm": 0.0037972538266330957, "learning_rate": 4.7190824070640176e-06, "loss": 0.001, "num_input_tokens_seen": 133064288, "step": 61695 }, { "epoch": 11.323178564874288, "grad_norm": 0.48481011390686035, "learning_rate": 4.7182829188218205e-06, "loss": 0.0003, "num_input_tokens_seen": 133075328, "step": 61700 }, { "epoch": 11.324096164433842, "grad_norm": 0.0058795432560145855, "learning_rate": 4.7174834378052294e-06, "loss": 0.0645, "num_input_tokens_seen": 133085696, "step": 61705 }, { "epoch": 11.325013763993393, "grad_norm": 0.0002132588706444949, "learning_rate": 4.71668396403475e-06, "loss": 0.0, "num_input_tokens_seen": 133096224, "step": 61710 }, { "epoch": 11.325931363552945, "grad_norm": 0.0008359084604308009, "learning_rate": 4.715884497530889e-06, "loss": 0.0883, "num_input_tokens_seen": 133107360, "step": 61715 }, { "epoch": 11.326848963112498, "grad_norm": 0.006930328439921141, "learning_rate": 4.7150850383141485e-06, "loss": 0.0, "num_input_tokens_seen": 133117568, "step": 61720 }, { "epoch": 11.32776656267205, "grad_norm": 0.005370890721678734, "learning_rate": 4.7142855864050375e-06, "loss": 0.0, "num_input_tokens_seen": 133127968, "step": 61725 }, { "epoch": 11.328684162231601, "grad_norm": 0.003930842503905296, "learning_rate": 4.713486141824058e-06, "loss": 0.0, "num_input_tokens_seen": 133138912, "step": 61730 }, { "epoch": 11.329601761791155, "grad_norm": 0.0020060138776898384, "learning_rate": 4.7126867045917125e-06, "loss": 0.0, "num_input_tokens_seen": 133149152, "step": 61735 }, { "epoch": 11.330519361350706, "grad_norm": 0.00029911569436080754, "learning_rate": 4.71188727472851e-06, "loss": 0.2251, "num_input_tokens_seen": 133159488, "step": 61740 }, { "epoch": 11.331436960910258, "grad_norm": 0.017746713012456894, "learning_rate": 4.711087852254953e-06, "loss": 0.0001, "num_input_tokens_seen": 133171040, "step": 61745 }, { "epoch": 11.332354560469811, "grad_norm": 0.0024941922165453434, "learning_rate": 4.710288437191544e-06, "loss": 0.2844, "num_input_tokens_seen": 133180448, "step": 61750 }, { "epoch": 11.333272160029363, "grad_norm": 0.00167823676019907, "learning_rate": 4.709489029558785e-06, "loss": 0.0, "num_input_tokens_seen": 133189856, "step": 61755 }, { "epoch": 11.334189759588915, "grad_norm": 0.0024241784121841192, "learning_rate": 4.708689629377185e-06, "loss": 0.0, "num_input_tokens_seen": 133200192, "step": 61760 }, { "epoch": 11.335107359148468, "grad_norm": 0.008231514133512974, "learning_rate": 4.707890236667244e-06, "loss": 0.0, "num_input_tokens_seen": 133210304, "step": 61765 }, { "epoch": 11.33602495870802, "grad_norm": 0.0006252177408896387, "learning_rate": 4.707090851449465e-06, "loss": 0.0703, "num_input_tokens_seen": 133220160, "step": 61770 }, { "epoch": 11.336942558267571, "grad_norm": 0.0016671766061335802, "learning_rate": 4.706291473744352e-06, "loss": 0.0, "num_input_tokens_seen": 133230656, "step": 61775 }, { "epoch": 11.337860157827125, "grad_norm": 0.0400807186961174, "learning_rate": 4.705492103572409e-06, "loss": 0.0001, "num_input_tokens_seen": 133241184, "step": 61780 }, { "epoch": 11.338777757386676, "grad_norm": 0.00561274541541934, "learning_rate": 4.7046927409541356e-06, "loss": 0.1252, "num_input_tokens_seen": 133252000, "step": 61785 }, { "epoch": 11.339695356946228, "grad_norm": 0.0028272983618080616, "learning_rate": 4.703893385910035e-06, "loss": 0.0, "num_input_tokens_seen": 133264800, "step": 61790 }, { "epoch": 11.340612956505781, "grad_norm": 0.011967266909778118, "learning_rate": 4.703094038460612e-06, "loss": 0.1439, "num_input_tokens_seen": 133276384, "step": 61795 }, { "epoch": 11.341530556065333, "grad_norm": 0.0007387444493360817, "learning_rate": 4.7022946986263655e-06, "loss": 0.0, "num_input_tokens_seen": 133287008, "step": 61800 }, { "epoch": 11.342448155624885, "grad_norm": 0.001520478050224483, "learning_rate": 4.7014953664277975e-06, "loss": 0.0, "num_input_tokens_seen": 133297824, "step": 61805 }, { "epoch": 11.343365755184438, "grad_norm": 0.0009141078335233033, "learning_rate": 4.700696041885413e-06, "loss": 0.0, "num_input_tokens_seen": 133309536, "step": 61810 }, { "epoch": 11.34428335474399, "grad_norm": 0.00047315453412011266, "learning_rate": 4.699896725019711e-06, "loss": 0.0, "num_input_tokens_seen": 133320032, "step": 61815 }, { "epoch": 11.345200954303541, "grad_norm": 0.0006205395329743624, "learning_rate": 4.699097415851191e-06, "loss": 0.0001, "num_input_tokens_seen": 133331200, "step": 61820 }, { "epoch": 11.346118553863095, "grad_norm": 0.003904449986293912, "learning_rate": 4.698298114400358e-06, "loss": 0.0, "num_input_tokens_seen": 133341856, "step": 61825 }, { "epoch": 11.347036153422646, "grad_norm": 0.0008894276106730103, "learning_rate": 4.69749882068771e-06, "loss": 0.0, "num_input_tokens_seen": 133351296, "step": 61830 }, { "epoch": 11.347953752982198, "grad_norm": 0.0013786250492557883, "learning_rate": 4.696699534733748e-06, "loss": 0.0, "num_input_tokens_seen": 133363424, "step": 61835 }, { "epoch": 11.348871352541751, "grad_norm": 55.08167266845703, "learning_rate": 4.695900256558973e-06, "loss": 0.13, "num_input_tokens_seen": 133373600, "step": 61840 }, { "epoch": 11.349788952101303, "grad_norm": 0.00028951719286851585, "learning_rate": 4.695100986183885e-06, "loss": 0.0, "num_input_tokens_seen": 133385056, "step": 61845 }, { "epoch": 11.350706551660855, "grad_norm": 0.0001781023311195895, "learning_rate": 4.694301723628986e-06, "loss": 0.0, "num_input_tokens_seen": 133396384, "step": 61850 }, { "epoch": 11.351624151220408, "grad_norm": 0.0003126836090814322, "learning_rate": 4.69350246891477e-06, "loss": 0.0, "num_input_tokens_seen": 133407008, "step": 61855 }, { "epoch": 11.35254175077996, "grad_norm": 0.000682327663525939, "learning_rate": 4.6927032220617445e-06, "loss": 0.0002, "num_input_tokens_seen": 133418144, "step": 61860 }, { "epoch": 11.353459350339511, "grad_norm": 0.002714769449084997, "learning_rate": 4.6919039830904035e-06, "loss": 0.0, "num_input_tokens_seen": 133430176, "step": 61865 }, { "epoch": 11.354376949899065, "grad_norm": 0.0009712534374557436, "learning_rate": 4.691104752021246e-06, "loss": 0.0001, "num_input_tokens_seen": 133440960, "step": 61870 }, { "epoch": 11.355294549458616, "grad_norm": 0.0005000616074539721, "learning_rate": 4.690305528874775e-06, "loss": 0.1314, "num_input_tokens_seen": 133453120, "step": 61875 }, { "epoch": 11.356212149018168, "grad_norm": 0.0003167123068124056, "learning_rate": 4.689506313671489e-06, "loss": 0.0, "num_input_tokens_seen": 133464544, "step": 61880 }, { "epoch": 11.357129748577721, "grad_norm": 0.0010330440709367394, "learning_rate": 4.6887071064318825e-06, "loss": 0.0, "num_input_tokens_seen": 133476032, "step": 61885 }, { "epoch": 11.358047348137273, "grad_norm": 0.03789723664522171, "learning_rate": 4.687907907176455e-06, "loss": 0.0, "num_input_tokens_seen": 133486048, "step": 61890 }, { "epoch": 11.358964947696824, "grad_norm": 0.0006125977379269898, "learning_rate": 4.687108715925709e-06, "loss": 0.0001, "num_input_tokens_seen": 133496512, "step": 61895 }, { "epoch": 11.359882547256378, "grad_norm": 73.18684387207031, "learning_rate": 4.686309532700138e-06, "loss": 0.0376, "num_input_tokens_seen": 133508896, "step": 61900 }, { "epoch": 11.36080014681593, "grad_norm": 0.0018159645842388272, "learning_rate": 4.68551035752024e-06, "loss": 0.0, "num_input_tokens_seen": 133519328, "step": 61905 }, { "epoch": 11.361717746375481, "grad_norm": 31.282329559326172, "learning_rate": 4.684711190406516e-06, "loss": 0.1564, "num_input_tokens_seen": 133528576, "step": 61910 }, { "epoch": 11.362635345935034, "grad_norm": 0.009197087027132511, "learning_rate": 4.683912031379461e-06, "loss": 0.0004, "num_input_tokens_seen": 133539552, "step": 61915 }, { "epoch": 11.363552945494586, "grad_norm": 0.02408766560256481, "learning_rate": 4.6831128804595715e-06, "loss": 0.0, "num_input_tokens_seen": 133550112, "step": 61920 }, { "epoch": 11.364470545054138, "grad_norm": 1.0691169500350952, "learning_rate": 4.682313737667347e-06, "loss": 0.0002, "num_input_tokens_seen": 133560736, "step": 61925 }, { "epoch": 11.365388144613691, "grad_norm": 0.001777279656380415, "learning_rate": 4.681514603023281e-06, "loss": 0.0, "num_input_tokens_seen": 133572128, "step": 61930 }, { "epoch": 11.366305744173243, "grad_norm": 0.0009356022346764803, "learning_rate": 4.680715476547873e-06, "loss": 0.0, "num_input_tokens_seen": 133583360, "step": 61935 }, { "epoch": 11.367223343732794, "grad_norm": 0.01775546558201313, "learning_rate": 4.679916358261617e-06, "loss": 0.0, "num_input_tokens_seen": 133594336, "step": 61940 }, { "epoch": 11.368140943292348, "grad_norm": 0.0018865169258788228, "learning_rate": 4.679117248185012e-06, "loss": 0.0, "num_input_tokens_seen": 133604512, "step": 61945 }, { "epoch": 11.3690585428519, "grad_norm": 0.0007995424675755203, "learning_rate": 4.6783181463385524e-06, "loss": 0.0, "num_input_tokens_seen": 133613728, "step": 61950 }, { "epoch": 11.369976142411451, "grad_norm": 0.00033082033041864634, "learning_rate": 4.677519052742731e-06, "loss": 0.0002, "num_input_tokens_seen": 133624256, "step": 61955 }, { "epoch": 11.370893741971004, "grad_norm": 0.010011589154601097, "learning_rate": 4.676719967418049e-06, "loss": 0.0002, "num_input_tokens_seen": 133635296, "step": 61960 }, { "epoch": 11.371811341530556, "grad_norm": 0.005734041798859835, "learning_rate": 4.675920890384998e-06, "loss": 0.0001, "num_input_tokens_seen": 133646144, "step": 61965 }, { "epoch": 11.372728941090108, "grad_norm": 0.0012002954026684165, "learning_rate": 4.6751218216640745e-06, "loss": 0.0, "num_input_tokens_seen": 133657216, "step": 61970 }, { "epoch": 11.373646540649661, "grad_norm": 0.04480033367872238, "learning_rate": 4.67432276127577e-06, "loss": 0.0001, "num_input_tokens_seen": 133666784, "step": 61975 }, { "epoch": 11.374564140209213, "grad_norm": 0.00038585637230426073, "learning_rate": 4.673523709240584e-06, "loss": 0.0002, "num_input_tokens_seen": 133676352, "step": 61980 }, { "epoch": 11.375481739768764, "grad_norm": 0.0019413403933867812, "learning_rate": 4.67272466557901e-06, "loss": 0.0002, "num_input_tokens_seen": 133686944, "step": 61985 }, { "epoch": 11.376399339328318, "grad_norm": 0.00900765135884285, "learning_rate": 4.671925630311538e-06, "loss": 0.0001, "num_input_tokens_seen": 133698688, "step": 61990 }, { "epoch": 11.37731693888787, "grad_norm": 0.005363598931580782, "learning_rate": 4.671126603458668e-06, "loss": 0.0001, "num_input_tokens_seen": 133708256, "step": 61995 }, { "epoch": 11.37823453844742, "grad_norm": 0.0011510207550600171, "learning_rate": 4.67032758504089e-06, "loss": 0.0001, "num_input_tokens_seen": 133719456, "step": 62000 }, { "epoch": 11.379152138006974, "grad_norm": 0.0026722298935055733, "learning_rate": 4.669528575078696e-06, "loss": 0.0, "num_input_tokens_seen": 133730432, "step": 62005 }, { "epoch": 11.380069737566526, "grad_norm": 0.00026349391555413604, "learning_rate": 4.668729573592585e-06, "loss": 0.0, "num_input_tokens_seen": 133739840, "step": 62010 }, { "epoch": 11.380987337126077, "grad_norm": 0.06287412345409393, "learning_rate": 4.667930580603047e-06, "loss": 0.0, "num_input_tokens_seen": 133751104, "step": 62015 }, { "epoch": 11.38190493668563, "grad_norm": 0.024622751399874687, "learning_rate": 4.667131596130575e-06, "loss": 0.0, "num_input_tokens_seen": 133762880, "step": 62020 }, { "epoch": 11.382822536245182, "grad_norm": 0.0022251694463193417, "learning_rate": 4.6663326201956585e-06, "loss": 0.0, "num_input_tokens_seen": 133773952, "step": 62025 }, { "epoch": 11.383740135804734, "grad_norm": 0.0011897820513695478, "learning_rate": 4.665533652818796e-06, "loss": 0.0, "num_input_tokens_seen": 133784864, "step": 62030 }, { "epoch": 11.384657735364287, "grad_norm": 0.0008257341105490923, "learning_rate": 4.664734694020477e-06, "loss": 0.0001, "num_input_tokens_seen": 133797344, "step": 62035 }, { "epoch": 11.385575334923839, "grad_norm": 0.0014270840911194682, "learning_rate": 4.663935743821193e-06, "loss": 0.0, "num_input_tokens_seen": 133808448, "step": 62040 }, { "epoch": 11.38649293448339, "grad_norm": 0.0004117472271900624, "learning_rate": 4.663136802241436e-06, "loss": 0.1283, "num_input_tokens_seen": 133819008, "step": 62045 }, { "epoch": 11.387410534042944, "grad_norm": 0.0033279573544859886, "learning_rate": 4.662337869301699e-06, "loss": 0.0, "num_input_tokens_seen": 133830528, "step": 62050 }, { "epoch": 11.388328133602496, "grad_norm": 0.02202313020825386, "learning_rate": 4.6615389450224696e-06, "loss": 0.0001, "num_input_tokens_seen": 133841376, "step": 62055 }, { "epoch": 11.389245733162047, "grad_norm": 0.0011588565539568663, "learning_rate": 4.6607400294242444e-06, "loss": 0.0, "num_input_tokens_seen": 133851744, "step": 62060 }, { "epoch": 11.3901633327216, "grad_norm": 0.0020761408377438784, "learning_rate": 4.659941122527511e-06, "loss": 0.0008, "num_input_tokens_seen": 133862880, "step": 62065 }, { "epoch": 11.391080932281152, "grad_norm": 0.00020186352776363492, "learning_rate": 4.659142224352761e-06, "loss": 0.0001, "num_input_tokens_seen": 133873408, "step": 62070 }, { "epoch": 11.391998531840704, "grad_norm": 0.0004224531876388937, "learning_rate": 4.658343334920482e-06, "loss": 0.0002, "num_input_tokens_seen": 133882720, "step": 62075 }, { "epoch": 11.392916131400257, "grad_norm": 0.0013681990094482899, "learning_rate": 4.65754445425117e-06, "loss": 0.0, "num_input_tokens_seen": 133892672, "step": 62080 }, { "epoch": 11.393833730959809, "grad_norm": 0.0005722806672565639, "learning_rate": 4.656745582365312e-06, "loss": 0.0913, "num_input_tokens_seen": 133903552, "step": 62085 }, { "epoch": 11.39475133051936, "grad_norm": 0.0028871765825897455, "learning_rate": 4.6559467192833956e-06, "loss": 0.0, "num_input_tokens_seen": 133914432, "step": 62090 }, { "epoch": 11.395668930078914, "grad_norm": 0.0002746260433923453, "learning_rate": 4.655147865025914e-06, "loss": 0.0002, "num_input_tokens_seen": 133925056, "step": 62095 }, { "epoch": 11.396586529638466, "grad_norm": 0.00032299666781909764, "learning_rate": 4.6543490196133566e-06, "loss": 0.0002, "num_input_tokens_seen": 133934816, "step": 62100 }, { "epoch": 11.397504129198017, "grad_norm": 0.0009911064989864826, "learning_rate": 4.65355018306621e-06, "loss": 0.0, "num_input_tokens_seen": 133945824, "step": 62105 }, { "epoch": 11.39842172875757, "grad_norm": 0.010933875106275082, "learning_rate": 4.652751355404962e-06, "loss": 0.1876, "num_input_tokens_seen": 133957024, "step": 62110 }, { "epoch": 11.399339328317122, "grad_norm": 0.0012578361202031374, "learning_rate": 4.6519525366501066e-06, "loss": 0.0, "num_input_tokens_seen": 133967680, "step": 62115 }, { "epoch": 11.400256927876674, "grad_norm": 0.0003382067079655826, "learning_rate": 4.651153726822128e-06, "loss": 0.0001, "num_input_tokens_seen": 133979520, "step": 62120 }, { "epoch": 11.401174527436227, "grad_norm": 0.0025480296462774277, "learning_rate": 4.650354925941515e-06, "loss": 0.0041, "num_input_tokens_seen": 133991360, "step": 62125 }, { "epoch": 11.402092126995779, "grad_norm": 0.007511599455028772, "learning_rate": 4.649556134028757e-06, "loss": 0.0, "num_input_tokens_seen": 134000832, "step": 62130 }, { "epoch": 11.40300972655533, "grad_norm": 0.0005099129630252719, "learning_rate": 4.648757351104341e-06, "loss": 0.0001, "num_input_tokens_seen": 134011264, "step": 62135 }, { "epoch": 11.403927326114884, "grad_norm": 0.03382602706551552, "learning_rate": 4.647958577188754e-06, "loss": 0.0, "num_input_tokens_seen": 134022720, "step": 62140 }, { "epoch": 11.404844925674436, "grad_norm": 0.0022670228499919176, "learning_rate": 4.647159812302485e-06, "loss": 0.0, "num_input_tokens_seen": 134031712, "step": 62145 }, { "epoch": 11.405762525233987, "grad_norm": 0.0004455400921870023, "learning_rate": 4.646361056466019e-06, "loss": 0.0, "num_input_tokens_seen": 134041440, "step": 62150 }, { "epoch": 11.40668012479354, "grad_norm": 0.0032721352763473988, "learning_rate": 4.6455623096998445e-06, "loss": 0.0, "num_input_tokens_seen": 134051904, "step": 62155 }, { "epoch": 11.407597724353092, "grad_norm": 0.000265825423412025, "learning_rate": 4.6447635720244475e-06, "loss": 0.0016, "num_input_tokens_seen": 134061728, "step": 62160 }, { "epoch": 11.408515323912644, "grad_norm": 0.004680137149989605, "learning_rate": 4.643964843460314e-06, "loss": 0.0, "num_input_tokens_seen": 134072480, "step": 62165 }, { "epoch": 11.409432923472197, "grad_norm": 0.002031729556620121, "learning_rate": 4.643166124027931e-06, "loss": 0.0001, "num_input_tokens_seen": 134083360, "step": 62170 }, { "epoch": 11.410350523031749, "grad_norm": 13.349851608276367, "learning_rate": 4.642367413747782e-06, "loss": 0.0131, "num_input_tokens_seen": 134094048, "step": 62175 }, { "epoch": 11.4112681225913, "grad_norm": 24.029052734375, "learning_rate": 4.641568712640357e-06, "loss": 0.2251, "num_input_tokens_seen": 134104800, "step": 62180 }, { "epoch": 11.412185722150854, "grad_norm": 27.523658752441406, "learning_rate": 4.640770020726137e-06, "loss": 0.1098, "num_input_tokens_seen": 134115040, "step": 62185 }, { "epoch": 11.413103321710405, "grad_norm": 0.019396141171455383, "learning_rate": 4.639971338025609e-06, "loss": 0.0, "num_input_tokens_seen": 134126208, "step": 62190 }, { "epoch": 11.414020921269957, "grad_norm": 0.008432598784565926, "learning_rate": 4.63917266455926e-06, "loss": 0.0, "num_input_tokens_seen": 134137472, "step": 62195 }, { "epoch": 11.41493852082951, "grad_norm": 22.18247413635254, "learning_rate": 4.638374000347573e-06, "loss": 0.1567, "num_input_tokens_seen": 134146624, "step": 62200 }, { "epoch": 11.415856120389062, "grad_norm": 0.0008655314450152218, "learning_rate": 4.637575345411031e-06, "loss": 0.0011, "num_input_tokens_seen": 134157024, "step": 62205 }, { "epoch": 11.416773719948614, "grad_norm": 0.0017978493124246597, "learning_rate": 4.6367766997701195e-06, "loss": 0.0, "num_input_tokens_seen": 134168672, "step": 62210 }, { "epoch": 11.417691319508167, "grad_norm": 0.0022881405893713236, "learning_rate": 4.635978063445324e-06, "loss": 0.0002, "num_input_tokens_seen": 134179520, "step": 62215 }, { "epoch": 11.418608919067719, "grad_norm": 0.0008922296110540628, "learning_rate": 4.635179436457127e-06, "loss": 0.0001, "num_input_tokens_seen": 134189888, "step": 62220 }, { "epoch": 11.41952651862727, "grad_norm": 0.0011146609904244542, "learning_rate": 4.63438081882601e-06, "loss": 0.0, "num_input_tokens_seen": 134200960, "step": 62225 }, { "epoch": 11.420444118186824, "grad_norm": 0.0007181931287050247, "learning_rate": 4.63358221057246e-06, "loss": 0.0, "num_input_tokens_seen": 134212352, "step": 62230 }, { "epoch": 11.421361717746375, "grad_norm": 0.0006103338091634214, "learning_rate": 4.632783611716959e-06, "loss": 0.0, "num_input_tokens_seen": 134223072, "step": 62235 }, { "epoch": 11.422279317305927, "grad_norm": 0.010867071337997913, "learning_rate": 4.631985022279989e-06, "loss": 0.0, "num_input_tokens_seen": 134233632, "step": 62240 }, { "epoch": 11.42319691686548, "grad_norm": 0.015505962073802948, "learning_rate": 4.631186442282032e-06, "loss": 0.0107, "num_input_tokens_seen": 134243904, "step": 62245 }, { "epoch": 11.424114516425032, "grad_norm": 0.38315901160240173, "learning_rate": 4.630387871743572e-06, "loss": 0.0048, "num_input_tokens_seen": 134255232, "step": 62250 }, { "epoch": 11.425032115984584, "grad_norm": 19.60024642944336, "learning_rate": 4.629589310685089e-06, "loss": 0.0284, "num_input_tokens_seen": 134266432, "step": 62255 }, { "epoch": 11.425949715544137, "grad_norm": 0.0006440455326810479, "learning_rate": 4.6287907591270665e-06, "loss": 0.0, "num_input_tokens_seen": 134277664, "step": 62260 }, { "epoch": 11.426867315103689, "grad_norm": 0.0003563151112757623, "learning_rate": 4.627992217089987e-06, "loss": 0.0002, "num_input_tokens_seen": 134287552, "step": 62265 }, { "epoch": 11.42778491466324, "grad_norm": 0.5160678029060364, "learning_rate": 4.62719368459433e-06, "loss": 0.0003, "num_input_tokens_seen": 134298592, "step": 62270 }, { "epoch": 11.428702514222794, "grad_norm": 0.006441942881792784, "learning_rate": 4.626395161660575e-06, "loss": 0.0001, "num_input_tokens_seen": 134309120, "step": 62275 }, { "epoch": 11.429620113782345, "grad_norm": 0.10536034405231476, "learning_rate": 4.625596648309208e-06, "loss": 0.0226, "num_input_tokens_seen": 134319296, "step": 62280 }, { "epoch": 11.430537713341897, "grad_norm": 0.013407035730779171, "learning_rate": 4.6247981445607055e-06, "loss": 0.0001, "num_input_tokens_seen": 134331392, "step": 62285 }, { "epoch": 11.43145531290145, "grad_norm": 0.001843403559178114, "learning_rate": 4.62399965043555e-06, "loss": 0.0032, "num_input_tokens_seen": 134342784, "step": 62290 }, { "epoch": 11.432372912461002, "grad_norm": 0.0027748465072363615, "learning_rate": 4.623201165954217e-06, "loss": 0.0, "num_input_tokens_seen": 134354016, "step": 62295 }, { "epoch": 11.433290512020553, "grad_norm": 0.0013592650648206472, "learning_rate": 4.6224026911371945e-06, "loss": 0.0, "num_input_tokens_seen": 134365760, "step": 62300 }, { "epoch": 11.434208111580107, "grad_norm": 0.0602334700524807, "learning_rate": 4.621604226004957e-06, "loss": 0.0004, "num_input_tokens_seen": 134375168, "step": 62305 }, { "epoch": 11.435125711139658, "grad_norm": 0.0007976552005857229, "learning_rate": 4.620805770577982e-06, "loss": 0.0, "num_input_tokens_seen": 134386272, "step": 62310 }, { "epoch": 11.43604331069921, "grad_norm": 0.001477869343943894, "learning_rate": 4.620007324876753e-06, "loss": 0.0, "num_input_tokens_seen": 134397440, "step": 62315 }, { "epoch": 11.436960910258763, "grad_norm": 0.1615639626979828, "learning_rate": 4.619208888921748e-06, "loss": 0.0002, "num_input_tokens_seen": 134407616, "step": 62320 }, { "epoch": 11.437878509818315, "grad_norm": 0.00044584248098544776, "learning_rate": 4.618410462733442e-06, "loss": 0.0536, "num_input_tokens_seen": 134418624, "step": 62325 }, { "epoch": 11.438796109377867, "grad_norm": 0.0016772859962657094, "learning_rate": 4.617612046332319e-06, "loss": 0.0, "num_input_tokens_seen": 134429152, "step": 62330 }, { "epoch": 11.43971370893742, "grad_norm": 0.01905641332268715, "learning_rate": 4.616813639738855e-06, "loss": 0.0, "num_input_tokens_seen": 134439072, "step": 62335 }, { "epoch": 11.440631308496972, "grad_norm": 0.002915544668212533, "learning_rate": 4.616015242973526e-06, "loss": 0.0023, "num_input_tokens_seen": 134449984, "step": 62340 }, { "epoch": 11.441548908056523, "grad_norm": 0.001031698426231742, "learning_rate": 4.615216856056809e-06, "loss": 0.0002, "num_input_tokens_seen": 134460704, "step": 62345 }, { "epoch": 11.442466507616077, "grad_norm": 0.0010066565591841936, "learning_rate": 4.614418479009186e-06, "loss": 0.0001, "num_input_tokens_seen": 134471328, "step": 62350 }, { "epoch": 11.443384107175628, "grad_norm": 0.001608310267329216, "learning_rate": 4.613620111851132e-06, "loss": 0.0002, "num_input_tokens_seen": 134481216, "step": 62355 }, { "epoch": 11.44430170673518, "grad_norm": 0.0026127093005925417, "learning_rate": 4.6128217546031205e-06, "loss": 0.0, "num_input_tokens_seen": 134491488, "step": 62360 }, { "epoch": 11.445219306294733, "grad_norm": 0.020192530006170273, "learning_rate": 4.612023407285633e-06, "loss": 0.0011, "num_input_tokens_seen": 134503456, "step": 62365 }, { "epoch": 11.446136905854285, "grad_norm": 0.008955977857112885, "learning_rate": 4.611225069919144e-06, "loss": 0.0006, "num_input_tokens_seen": 134514208, "step": 62370 }, { "epoch": 11.447054505413837, "grad_norm": 0.016828065738081932, "learning_rate": 4.610426742524126e-06, "loss": 0.2219, "num_input_tokens_seen": 134524672, "step": 62375 }, { "epoch": 11.44797210497339, "grad_norm": 0.0021510375663638115, "learning_rate": 4.609628425121061e-06, "loss": 0.0, "num_input_tokens_seen": 134534656, "step": 62380 }, { "epoch": 11.448889704532942, "grad_norm": 0.00029230432119220495, "learning_rate": 4.6088301177304225e-06, "loss": 0.0, "num_input_tokens_seen": 134545152, "step": 62385 }, { "epoch": 11.449807304092493, "grad_norm": 0.0003706467687152326, "learning_rate": 4.608031820372686e-06, "loss": 0.0004, "num_input_tokens_seen": 134555424, "step": 62390 }, { "epoch": 11.450724903652047, "grad_norm": 0.020215807482600212, "learning_rate": 4.607233533068322e-06, "loss": 0.0, "num_input_tokens_seen": 134566048, "step": 62395 }, { "epoch": 11.451642503211598, "grad_norm": 0.0013366825878620148, "learning_rate": 4.6064352558378115e-06, "loss": 0.0, "num_input_tokens_seen": 134577248, "step": 62400 }, { "epoch": 11.45256010277115, "grad_norm": 0.08596567809581757, "learning_rate": 4.605636988701627e-06, "loss": 0.0001, "num_input_tokens_seen": 134588384, "step": 62405 }, { "epoch": 11.453477702330703, "grad_norm": 0.008081788197159767, "learning_rate": 4.604838731680241e-06, "loss": 0.0, "num_input_tokens_seen": 134599552, "step": 62410 }, { "epoch": 11.454395301890255, "grad_norm": 0.0003860401047859341, "learning_rate": 4.60404048479413e-06, "loss": 0.0, "num_input_tokens_seen": 134610528, "step": 62415 }, { "epoch": 11.455312901449807, "grad_norm": 0.00025312305660918355, "learning_rate": 4.603242248063768e-06, "loss": 0.0, "num_input_tokens_seen": 134621088, "step": 62420 }, { "epoch": 11.45623050100936, "grad_norm": 0.0025707257445901632, "learning_rate": 4.602444021509626e-06, "loss": 0.0, "num_input_tokens_seen": 134632256, "step": 62425 }, { "epoch": 11.457148100568912, "grad_norm": 0.0013769272482022643, "learning_rate": 4.601645805152178e-06, "loss": 0.0588, "num_input_tokens_seen": 134643328, "step": 62430 }, { "epoch": 11.458065700128463, "grad_norm": 0.0013004271313548088, "learning_rate": 4.6008475990119e-06, "loss": 0.0, "num_input_tokens_seen": 134654016, "step": 62435 }, { "epoch": 11.458983299688017, "grad_norm": 0.0006237187189981341, "learning_rate": 4.600049403109262e-06, "loss": 0.0, "num_input_tokens_seen": 134665824, "step": 62440 }, { "epoch": 11.459900899247568, "grad_norm": 0.0004306357295718044, "learning_rate": 4.5992512174647345e-06, "loss": 0.0, "num_input_tokens_seen": 134676352, "step": 62445 }, { "epoch": 11.46081849880712, "grad_norm": 0.0003092045371886343, "learning_rate": 4.598453042098794e-06, "loss": 0.0, "num_input_tokens_seen": 134687456, "step": 62450 }, { "epoch": 11.461736098366673, "grad_norm": 0.000744802295230329, "learning_rate": 4.597654877031911e-06, "loss": 0.0, "num_input_tokens_seen": 134698816, "step": 62455 }, { "epoch": 11.462653697926225, "grad_norm": 0.0003971160331275314, "learning_rate": 4.596856722284556e-06, "loss": 0.0, "num_input_tokens_seen": 134707968, "step": 62460 }, { "epoch": 11.463571297485776, "grad_norm": 0.0009914868278428912, "learning_rate": 4.5960585778772025e-06, "loss": 0.0, "num_input_tokens_seen": 134719072, "step": 62465 }, { "epoch": 11.46448889704533, "grad_norm": 0.004512416664510965, "learning_rate": 4.595260443830319e-06, "loss": 0.1594, "num_input_tokens_seen": 134731200, "step": 62470 }, { "epoch": 11.465406496604881, "grad_norm": 0.00019875323050655425, "learning_rate": 4.594462320164378e-06, "loss": 0.0, "num_input_tokens_seen": 134741792, "step": 62475 }, { "epoch": 11.466324096164433, "grad_norm": 0.00047194352373480797, "learning_rate": 4.59366420689985e-06, "loss": 0.0, "num_input_tokens_seen": 134752992, "step": 62480 }, { "epoch": 11.467241695723986, "grad_norm": 0.0007107460405677557, "learning_rate": 4.5928661040572065e-06, "loss": 0.0016, "num_input_tokens_seen": 134764416, "step": 62485 }, { "epoch": 11.468159295283538, "grad_norm": 0.002597826300188899, "learning_rate": 4.592068011656916e-06, "loss": 0.0, "num_input_tokens_seen": 134773792, "step": 62490 }, { "epoch": 11.46907689484309, "grad_norm": 0.000303342763800174, "learning_rate": 4.591269929719447e-06, "loss": 0.0059, "num_input_tokens_seen": 134784608, "step": 62495 }, { "epoch": 11.469994494402643, "grad_norm": 0.0006572339916601777, "learning_rate": 4.590471858265273e-06, "loss": 0.0001, "num_input_tokens_seen": 134795616, "step": 62500 }, { "epoch": 11.470912093962195, "grad_norm": 0.00044673646334558725, "learning_rate": 4.589673797314861e-06, "loss": 0.0, "num_input_tokens_seen": 134805728, "step": 62505 }, { "epoch": 11.471829693521746, "grad_norm": 0.00020226433116476983, "learning_rate": 4.5888757468886774e-06, "loss": 0.0, "num_input_tokens_seen": 134817696, "step": 62510 }, { "epoch": 11.4727472930813, "grad_norm": 0.0003007009217981249, "learning_rate": 4.588077707007196e-06, "loss": 0.0001, "num_input_tokens_seen": 134829568, "step": 62515 }, { "epoch": 11.473664892640851, "grad_norm": 0.3387223482131958, "learning_rate": 4.587279677690883e-06, "loss": 0.0002, "num_input_tokens_seen": 134839168, "step": 62520 }, { "epoch": 11.474582492200403, "grad_norm": 0.0007877051830291748, "learning_rate": 4.586481658960208e-06, "loss": 0.0, "num_input_tokens_seen": 134850112, "step": 62525 }, { "epoch": 11.475500091759956, "grad_norm": 0.0003286341088823974, "learning_rate": 4.585683650835634e-06, "loss": 0.0, "num_input_tokens_seen": 134860512, "step": 62530 }, { "epoch": 11.476417691319508, "grad_norm": 0.007170562632381916, "learning_rate": 4.584885653337634e-06, "loss": 0.0, "num_input_tokens_seen": 134870784, "step": 62535 }, { "epoch": 11.47733529087906, "grad_norm": 0.00046068907249718904, "learning_rate": 4.584087666486675e-06, "loss": 0.0001, "num_input_tokens_seen": 134880896, "step": 62540 }, { "epoch": 11.478252890438613, "grad_norm": 0.0022394831757992506, "learning_rate": 4.5832896903032195e-06, "loss": 0.0, "num_input_tokens_seen": 134892128, "step": 62545 }, { "epoch": 11.479170489998165, "grad_norm": 0.0010910601122304797, "learning_rate": 4.582491724807741e-06, "loss": 0.0, "num_input_tokens_seen": 134902784, "step": 62550 }, { "epoch": 11.480088089557716, "grad_norm": 0.0005498013342730701, "learning_rate": 4.581693770020701e-06, "loss": 0.0, "num_input_tokens_seen": 134914304, "step": 62555 }, { "epoch": 11.48100568911727, "grad_norm": 0.00037682894617319107, "learning_rate": 4.580895825962568e-06, "loss": 0.0001, "num_input_tokens_seen": 134925120, "step": 62560 }, { "epoch": 11.481923288676821, "grad_norm": 0.00218421989120543, "learning_rate": 4.580097892653808e-06, "loss": 0.0, "num_input_tokens_seen": 134935264, "step": 62565 }, { "epoch": 11.482840888236373, "grad_norm": 0.0008278042660094798, "learning_rate": 4.579299970114884e-06, "loss": 0.0001, "num_input_tokens_seen": 134945184, "step": 62570 }, { "epoch": 11.483758487795926, "grad_norm": 0.0027230512350797653, "learning_rate": 4.578502058366266e-06, "loss": 0.0, "num_input_tokens_seen": 134957408, "step": 62575 }, { "epoch": 11.484676087355478, "grad_norm": 0.00036240569897927344, "learning_rate": 4.577704157428416e-06, "loss": 0.0001, "num_input_tokens_seen": 134968000, "step": 62580 }, { "epoch": 11.48559368691503, "grad_norm": 0.0006362151470966637, "learning_rate": 4.576906267321801e-06, "loss": 0.0001, "num_input_tokens_seen": 134978528, "step": 62585 }, { "epoch": 11.486511286474583, "grad_norm": 0.025369659066200256, "learning_rate": 4.576108388066884e-06, "loss": 0.0, "num_input_tokens_seen": 134989344, "step": 62590 }, { "epoch": 11.487428886034134, "grad_norm": 0.009585928171873093, "learning_rate": 4.575310519684127e-06, "loss": 0.0, "num_input_tokens_seen": 135000864, "step": 62595 }, { "epoch": 11.488346485593686, "grad_norm": 0.0003335180226713419, "learning_rate": 4.574512662194001e-06, "loss": 0.0, "num_input_tokens_seen": 135010112, "step": 62600 }, { "epoch": 11.48926408515324, "grad_norm": 0.009496680460870266, "learning_rate": 4.573714815616964e-06, "loss": 0.0, "num_input_tokens_seen": 135021568, "step": 62605 }, { "epoch": 11.490181684712791, "grad_norm": 0.0016573149478062987, "learning_rate": 4.572916979973482e-06, "loss": 0.0, "num_input_tokens_seen": 135032704, "step": 62610 }, { "epoch": 11.491099284272343, "grad_norm": 0.001068380195647478, "learning_rate": 4.572119155284016e-06, "loss": 0.2094, "num_input_tokens_seen": 135042784, "step": 62615 }, { "epoch": 11.492016883831896, "grad_norm": 0.0016549879219383001, "learning_rate": 4.571321341569032e-06, "loss": 0.0079, "num_input_tokens_seen": 135052256, "step": 62620 }, { "epoch": 11.492934483391448, "grad_norm": 0.0004874355799984187, "learning_rate": 4.5705235388489914e-06, "loss": 0.0, "num_input_tokens_seen": 135063456, "step": 62625 }, { "epoch": 11.493852082951, "grad_norm": 0.00038487810525111854, "learning_rate": 4.569725747144354e-06, "loss": 0.175, "num_input_tokens_seen": 135073920, "step": 62630 }, { "epoch": 11.494769682510553, "grad_norm": 0.0009779169922694564, "learning_rate": 4.5689279664755866e-06, "loss": 0.0, "num_input_tokens_seen": 135084800, "step": 62635 }, { "epoch": 11.495687282070104, "grad_norm": 0.00019564363174140453, "learning_rate": 4.568130196863149e-06, "loss": 0.0008, "num_input_tokens_seen": 135094528, "step": 62640 }, { "epoch": 11.496604881629656, "grad_norm": 0.017366981133818626, "learning_rate": 4.5673324383275e-06, "loss": 0.0, "num_input_tokens_seen": 135104768, "step": 62645 }, { "epoch": 11.49752248118921, "grad_norm": 0.0043282476253807545, "learning_rate": 4.566534690889106e-06, "loss": 0.0, "num_input_tokens_seen": 135115744, "step": 62650 }, { "epoch": 11.498440080748761, "grad_norm": 0.0025116060860455036, "learning_rate": 4.5657369545684245e-06, "loss": 0.0, "num_input_tokens_seen": 135126560, "step": 62655 }, { "epoch": 11.499357680308313, "grad_norm": 0.008418126963078976, "learning_rate": 4.5649392293859175e-06, "loss": 0.0, "num_input_tokens_seen": 135137664, "step": 62660 }, { "epoch": 11.500275279867866, "grad_norm": 0.00044611922930926085, "learning_rate": 4.564141515362043e-06, "loss": 0.0, "num_input_tokens_seen": 135148192, "step": 62665 }, { "epoch": 11.501192879427418, "grad_norm": 0.0023050387389957905, "learning_rate": 4.563343812517265e-06, "loss": 0.0, "num_input_tokens_seen": 135159040, "step": 62670 }, { "epoch": 11.50211047898697, "grad_norm": 0.002614351222291589, "learning_rate": 4.562546120872042e-06, "loss": 0.019, "num_input_tokens_seen": 135169440, "step": 62675 }, { "epoch": 11.503028078546523, "grad_norm": 0.00048195323324762285, "learning_rate": 4.561748440446831e-06, "loss": 0.0079, "num_input_tokens_seen": 135178432, "step": 62680 }, { "epoch": 11.503945678106074, "grad_norm": 0.0009253103053197265, "learning_rate": 4.560950771262096e-06, "loss": 0.0, "num_input_tokens_seen": 135188608, "step": 62685 }, { "epoch": 11.504863277665628, "grad_norm": 46.972354888916016, "learning_rate": 4.560153113338291e-06, "loss": 0.2313, "num_input_tokens_seen": 135199040, "step": 62690 }, { "epoch": 11.50578087722518, "grad_norm": 0.0023818304762244225, "learning_rate": 4.559355466695878e-06, "loss": 0.0, "num_input_tokens_seen": 135208992, "step": 62695 }, { "epoch": 11.506698476784731, "grad_norm": 18.19043731689453, "learning_rate": 4.558557831355313e-06, "loss": 0.119, "num_input_tokens_seen": 135220288, "step": 62700 }, { "epoch": 11.507616076344284, "grad_norm": 103.0450439453125, "learning_rate": 4.557760207337057e-06, "loss": 0.1994, "num_input_tokens_seen": 135230752, "step": 62705 }, { "epoch": 11.508533675903836, "grad_norm": 0.0005542137078009546, "learning_rate": 4.556962594661567e-06, "loss": 0.0, "num_input_tokens_seen": 135239392, "step": 62710 }, { "epoch": 11.509451275463388, "grad_norm": 0.007312759291380644, "learning_rate": 4.556164993349296e-06, "loss": 0.0008, "num_input_tokens_seen": 135250016, "step": 62715 }, { "epoch": 11.510368875022941, "grad_norm": 0.005715562496334314, "learning_rate": 4.555367403420709e-06, "loss": 0.0352, "num_input_tokens_seen": 135260480, "step": 62720 }, { "epoch": 11.511286474582493, "grad_norm": 0.0005446767900139093, "learning_rate": 4.554569824896258e-06, "loss": 0.1286, "num_input_tokens_seen": 135271456, "step": 62725 }, { "epoch": 11.512204074142044, "grad_norm": 0.00039737424231134355, "learning_rate": 4.553772257796398e-06, "loss": 0.0, "num_input_tokens_seen": 135282592, "step": 62730 }, { "epoch": 11.513121673701598, "grad_norm": 0.0003525481151882559, "learning_rate": 4.55297470214159e-06, "loss": 0.0003, "num_input_tokens_seen": 135292384, "step": 62735 }, { "epoch": 11.51403927326115, "grad_norm": 0.001492133131250739, "learning_rate": 4.55217715795229e-06, "loss": 0.0, "num_input_tokens_seen": 135302784, "step": 62740 }, { "epoch": 11.5149568728207, "grad_norm": 0.0027618571184575558, "learning_rate": 4.55137962524895e-06, "loss": 0.0, "num_input_tokens_seen": 135312320, "step": 62745 }, { "epoch": 11.515874472380254, "grad_norm": 3.856065273284912, "learning_rate": 4.550582104052025e-06, "loss": 0.0023, "num_input_tokens_seen": 135324672, "step": 62750 }, { "epoch": 11.516792071939806, "grad_norm": 0.0003892668755725026, "learning_rate": 4.549784594381976e-06, "loss": 0.0, "num_input_tokens_seen": 135334240, "step": 62755 }, { "epoch": 11.517709671499357, "grad_norm": 0.0009622415527701378, "learning_rate": 4.548987096259254e-06, "loss": 0.0, "num_input_tokens_seen": 135345152, "step": 62760 }, { "epoch": 11.51862727105891, "grad_norm": 0.0032673419918864965, "learning_rate": 4.548189609704311e-06, "loss": 0.0, "num_input_tokens_seen": 135354368, "step": 62765 }, { "epoch": 11.519544870618462, "grad_norm": 0.2471049726009369, "learning_rate": 4.547392134737607e-06, "loss": 0.0617, "num_input_tokens_seen": 135364512, "step": 62770 }, { "epoch": 11.520462470178014, "grad_norm": 25.15281105041504, "learning_rate": 4.546594671379594e-06, "loss": 0.1066, "num_input_tokens_seen": 135375232, "step": 62775 }, { "epoch": 11.521380069737567, "grad_norm": 0.00858142226934433, "learning_rate": 4.545797219650723e-06, "loss": 0.0, "num_input_tokens_seen": 135386656, "step": 62780 }, { "epoch": 11.522297669297119, "grad_norm": 0.06497542560100555, "learning_rate": 4.54499977957145e-06, "loss": 0.033, "num_input_tokens_seen": 135397792, "step": 62785 }, { "epoch": 11.52321526885667, "grad_norm": 0.0012220509815961123, "learning_rate": 4.544202351162227e-06, "loss": 0.0001, "num_input_tokens_seen": 135408448, "step": 62790 }, { "epoch": 11.524132868416224, "grad_norm": 0.004996187519282103, "learning_rate": 4.543404934443509e-06, "loss": 0.0001, "num_input_tokens_seen": 135419200, "step": 62795 }, { "epoch": 11.525050467975776, "grad_norm": 0.002394527429714799, "learning_rate": 4.542607529435744e-06, "loss": 0.0001, "num_input_tokens_seen": 135429792, "step": 62800 }, { "epoch": 11.525968067535327, "grad_norm": 0.01276805903762579, "learning_rate": 4.541810136159389e-06, "loss": 0.0, "num_input_tokens_seen": 135441056, "step": 62805 }, { "epoch": 11.52688566709488, "grad_norm": 28.634737014770508, "learning_rate": 4.541012754634895e-06, "loss": 0.0801, "num_input_tokens_seen": 135452064, "step": 62810 }, { "epoch": 11.527803266654432, "grad_norm": 0.0005820864462293684, "learning_rate": 4.540215384882709e-06, "loss": 0.0, "num_input_tokens_seen": 135462656, "step": 62815 }, { "epoch": 11.528720866213984, "grad_norm": 0.0009701771778054535, "learning_rate": 4.539418026923289e-06, "loss": 0.0, "num_input_tokens_seen": 135474464, "step": 62820 }, { "epoch": 11.529638465773537, "grad_norm": 0.073038250207901, "learning_rate": 4.538620680777081e-06, "loss": 0.0004, "num_input_tokens_seen": 135485536, "step": 62825 }, { "epoch": 11.530556065333089, "grad_norm": 0.0032216371037065983, "learning_rate": 4.537823346464536e-06, "loss": 0.0001, "num_input_tokens_seen": 135497152, "step": 62830 }, { "epoch": 11.53147366489264, "grad_norm": 0.009109281934797764, "learning_rate": 4.537026024006109e-06, "loss": 0.0, "num_input_tokens_seen": 135508000, "step": 62835 }, { "epoch": 11.532391264452194, "grad_norm": 0.013632732443511486, "learning_rate": 4.536228713422246e-06, "loss": 0.1006, "num_input_tokens_seen": 135520416, "step": 62840 }, { "epoch": 11.533308864011746, "grad_norm": 0.08443773537874222, "learning_rate": 4.535431414733398e-06, "loss": 0.0015, "num_input_tokens_seen": 135530944, "step": 62845 }, { "epoch": 11.534226463571297, "grad_norm": 0.016792377457022667, "learning_rate": 4.534634127960012e-06, "loss": 0.001, "num_input_tokens_seen": 135540768, "step": 62850 }, { "epoch": 11.53514406313085, "grad_norm": 0.035528477281332016, "learning_rate": 4.533836853122542e-06, "loss": 0.0914, "num_input_tokens_seen": 135550816, "step": 62855 }, { "epoch": 11.536061662690402, "grad_norm": 0.005469894036650658, "learning_rate": 4.5330395902414335e-06, "loss": 0.0008, "num_input_tokens_seen": 135562048, "step": 62860 }, { "epoch": 11.536979262249954, "grad_norm": 0.013018239289522171, "learning_rate": 4.532242339337134e-06, "loss": 0.0, "num_input_tokens_seen": 135573760, "step": 62865 }, { "epoch": 11.537896861809507, "grad_norm": 0.00447458541020751, "learning_rate": 4.5314451004300954e-06, "loss": 0.0, "num_input_tokens_seen": 135583104, "step": 62870 }, { "epoch": 11.538814461369059, "grad_norm": 0.005632966291159391, "learning_rate": 4.530647873540764e-06, "loss": 0.0003, "num_input_tokens_seen": 135594080, "step": 62875 }, { "epoch": 11.53973206092861, "grad_norm": 0.0004392239497974515, "learning_rate": 4.529850658689587e-06, "loss": 0.0002, "num_input_tokens_seen": 135604416, "step": 62880 }, { "epoch": 11.540649660488164, "grad_norm": 0.0005344150704331696, "learning_rate": 4.52905345589701e-06, "loss": 0.0, "num_input_tokens_seen": 135614464, "step": 62885 }, { "epoch": 11.541567260047715, "grad_norm": 0.0018435512902215123, "learning_rate": 4.528256265183484e-06, "loss": 0.0, "num_input_tokens_seen": 135626144, "step": 62890 }, { "epoch": 11.542484859607267, "grad_norm": 0.18247731029987335, "learning_rate": 4.527459086569454e-06, "loss": 0.0002, "num_input_tokens_seen": 135637728, "step": 62895 }, { "epoch": 11.54340245916682, "grad_norm": 0.00469637056812644, "learning_rate": 4.526661920075365e-06, "loss": 0.0, "num_input_tokens_seen": 135648896, "step": 62900 }, { "epoch": 11.544320058726372, "grad_norm": 0.00241294177249074, "learning_rate": 4.525864765721665e-06, "loss": 0.0, "num_input_tokens_seen": 135659808, "step": 62905 }, { "epoch": 11.545237658285924, "grad_norm": 0.0018568341620266438, "learning_rate": 4.5250676235287985e-06, "loss": 0.0, "num_input_tokens_seen": 135670528, "step": 62910 }, { "epoch": 11.546155257845477, "grad_norm": 0.002584695117548108, "learning_rate": 4.52427049351721e-06, "loss": 0.0022, "num_input_tokens_seen": 135680928, "step": 62915 }, { "epoch": 11.547072857405029, "grad_norm": 0.127229243516922, "learning_rate": 4.5234733757073475e-06, "loss": 0.0004, "num_input_tokens_seen": 135690528, "step": 62920 }, { "epoch": 11.54799045696458, "grad_norm": 0.0017321197083219886, "learning_rate": 4.522676270119654e-06, "loss": 0.0001, "num_input_tokens_seen": 135701728, "step": 62925 }, { "epoch": 11.548908056524134, "grad_norm": 0.000742431846447289, "learning_rate": 4.521879176774575e-06, "loss": 0.0, "num_input_tokens_seen": 135713152, "step": 62930 }, { "epoch": 11.549825656083685, "grad_norm": 0.0015830197371542454, "learning_rate": 4.5210820956925525e-06, "loss": 0.0001, "num_input_tokens_seen": 135724160, "step": 62935 }, { "epoch": 11.550743255643237, "grad_norm": 0.00037785008316859603, "learning_rate": 4.520285026894033e-06, "loss": 0.0588, "num_input_tokens_seen": 135734624, "step": 62940 }, { "epoch": 11.55166085520279, "grad_norm": 0.040819548070430756, "learning_rate": 4.51948797039946e-06, "loss": 0.0001, "num_input_tokens_seen": 135745088, "step": 62945 }, { "epoch": 11.552578454762342, "grad_norm": 0.0012256295885890722, "learning_rate": 4.518690926229273e-06, "loss": 0.0, "num_input_tokens_seen": 135756256, "step": 62950 }, { "epoch": 11.553496054321894, "grad_norm": 56.17596435546875, "learning_rate": 4.517893894403921e-06, "loss": 0.1532, "num_input_tokens_seen": 135767104, "step": 62955 }, { "epoch": 11.554413653881447, "grad_norm": 0.0012253228342160583, "learning_rate": 4.517096874943842e-06, "loss": 0.0001, "num_input_tokens_seen": 135779808, "step": 62960 }, { "epoch": 11.555331253440999, "grad_norm": 0.004785222001373768, "learning_rate": 4.516299867869478e-06, "loss": 0.0853, "num_input_tokens_seen": 135790944, "step": 62965 }, { "epoch": 11.55624885300055, "grad_norm": 0.00046008240315131843, "learning_rate": 4.515502873201275e-06, "loss": 0.0014, "num_input_tokens_seen": 135802144, "step": 62970 }, { "epoch": 11.557166452560104, "grad_norm": 0.00925511121749878, "learning_rate": 4.514705890959673e-06, "loss": 0.0002, "num_input_tokens_seen": 135811296, "step": 62975 }, { "epoch": 11.558084052119655, "grad_norm": 0.002967440988868475, "learning_rate": 4.513908921165113e-06, "loss": 0.5377, "num_input_tokens_seen": 135822592, "step": 62980 }, { "epoch": 11.559001651679207, "grad_norm": 0.017144817858934402, "learning_rate": 4.513111963838033e-06, "loss": 0.0225, "num_input_tokens_seen": 135832512, "step": 62985 }, { "epoch": 11.55991925123876, "grad_norm": 0.0011267169611528516, "learning_rate": 4.512315018998878e-06, "loss": 0.0001, "num_input_tokens_seen": 135843648, "step": 62990 }, { "epoch": 11.560836850798312, "grad_norm": 0.0107853589579463, "learning_rate": 4.511518086668088e-06, "loss": 0.0003, "num_input_tokens_seen": 135854176, "step": 62995 }, { "epoch": 11.561754450357864, "grad_norm": 0.0019106855615973473, "learning_rate": 4.5107211668661e-06, "loss": 0.0762, "num_input_tokens_seen": 135865952, "step": 63000 }, { "epoch": 11.562672049917417, "grad_norm": 0.0015093449037522078, "learning_rate": 4.5099242596133575e-06, "loss": 0.0001, "num_input_tokens_seen": 135877440, "step": 63005 }, { "epoch": 11.563589649476969, "grad_norm": 0.0013176280772313476, "learning_rate": 4.509127364930297e-06, "loss": 0.0012, "num_input_tokens_seen": 135888096, "step": 63010 }, { "epoch": 11.56450724903652, "grad_norm": 1.2416495084762573, "learning_rate": 4.50833048283736e-06, "loss": 0.0005, "num_input_tokens_seen": 135899808, "step": 63015 }, { "epoch": 11.565424848596074, "grad_norm": 0.0014765927335247397, "learning_rate": 4.5075336133549825e-06, "loss": 0.0079, "num_input_tokens_seen": 135910848, "step": 63020 }, { "epoch": 11.566342448155625, "grad_norm": 0.8043553233146667, "learning_rate": 4.5067367565036055e-06, "loss": 0.0005, "num_input_tokens_seen": 135921984, "step": 63025 }, { "epoch": 11.567260047715177, "grad_norm": 0.002665200736373663, "learning_rate": 4.505939912303667e-06, "loss": 0.0004, "num_input_tokens_seen": 135933152, "step": 63030 }, { "epoch": 11.56817764727473, "grad_norm": 57.583003997802734, "learning_rate": 4.505143080775602e-06, "loss": 0.002, "num_input_tokens_seen": 135944000, "step": 63035 }, { "epoch": 11.569095246834282, "grad_norm": 0.006342682987451553, "learning_rate": 4.504346261939851e-06, "loss": 0.0944, "num_input_tokens_seen": 135954944, "step": 63040 }, { "epoch": 11.570012846393833, "grad_norm": 34.875518798828125, "learning_rate": 4.503549455816851e-06, "loss": 0.0091, "num_input_tokens_seen": 135965728, "step": 63045 }, { "epoch": 11.570930445953387, "grad_norm": 0.0006212962325662374, "learning_rate": 4.502752662427036e-06, "loss": 0.0001, "num_input_tokens_seen": 135977344, "step": 63050 }, { "epoch": 11.571848045512938, "grad_norm": 0.00033722573425620794, "learning_rate": 4.501955881790846e-06, "loss": 0.0002, "num_input_tokens_seen": 135987776, "step": 63055 }, { "epoch": 11.57276564507249, "grad_norm": 0.02784569188952446, "learning_rate": 4.501159113928716e-06, "loss": 0.0, "num_input_tokens_seen": 135998592, "step": 63060 }, { "epoch": 11.573683244632043, "grad_norm": 0.0011013965122401714, "learning_rate": 4.500362358861082e-06, "loss": 0.1285, "num_input_tokens_seen": 136009632, "step": 63065 }, { "epoch": 11.574600844191595, "grad_norm": 0.017502311617136, "learning_rate": 4.499565616608377e-06, "loss": 0.0003, "num_input_tokens_seen": 136019968, "step": 63070 }, { "epoch": 11.575518443751147, "grad_norm": 0.002192340325564146, "learning_rate": 4.49876888719104e-06, "loss": 0.0, "num_input_tokens_seen": 136031360, "step": 63075 }, { "epoch": 11.5764360433107, "grad_norm": 0.0004383510386105627, "learning_rate": 4.497972170629505e-06, "loss": 0.0029, "num_input_tokens_seen": 136044096, "step": 63080 }, { "epoch": 11.577353642870252, "grad_norm": 0.00278464681468904, "learning_rate": 4.4971754669442025e-06, "loss": 0.0, "num_input_tokens_seen": 136055840, "step": 63085 }, { "epoch": 11.578271242429803, "grad_norm": 0.002611497649922967, "learning_rate": 4.496378776155573e-06, "loss": 0.0264, "num_input_tokens_seen": 136067168, "step": 63090 }, { "epoch": 11.579188841989357, "grad_norm": 0.015270485542714596, "learning_rate": 4.495582098284047e-06, "loss": 0.0001, "num_input_tokens_seen": 136078464, "step": 63095 }, { "epoch": 11.580106441548908, "grad_norm": 0.0005018861847929657, "learning_rate": 4.494785433350057e-06, "loss": 0.0001, "num_input_tokens_seen": 136087392, "step": 63100 }, { "epoch": 11.58102404110846, "grad_norm": 0.002687784843146801, "learning_rate": 4.493988781374039e-06, "loss": 0.0, "num_input_tokens_seen": 136098208, "step": 63105 }, { "epoch": 11.581941640668013, "grad_norm": 0.005726659670472145, "learning_rate": 4.493192142376423e-06, "loss": 0.0, "num_input_tokens_seen": 136110304, "step": 63110 }, { "epoch": 11.582859240227565, "grad_norm": 0.0009912853129208088, "learning_rate": 4.492395516377645e-06, "loss": 0.0001, "num_input_tokens_seen": 136120544, "step": 63115 }, { "epoch": 11.583776839787117, "grad_norm": 343.6163635253906, "learning_rate": 4.4915989033981346e-06, "loss": 0.0264, "num_input_tokens_seen": 136128896, "step": 63120 }, { "epoch": 11.58469443934667, "grad_norm": 0.003031929489225149, "learning_rate": 4.490802303458324e-06, "loss": 0.0001, "num_input_tokens_seen": 136138912, "step": 63125 }, { "epoch": 11.585612038906222, "grad_norm": 0.001239517005160451, "learning_rate": 4.490005716578646e-06, "loss": 0.0, "num_input_tokens_seen": 136148832, "step": 63130 }, { "epoch": 11.586529638465773, "grad_norm": 0.0038382364436984062, "learning_rate": 4.489209142779528e-06, "loss": 0.0, "num_input_tokens_seen": 136159680, "step": 63135 }, { "epoch": 11.587447238025327, "grad_norm": 0.006895979400724173, "learning_rate": 4.488412582081408e-06, "loss": 0.0478, "num_input_tokens_seen": 136168352, "step": 63140 }, { "epoch": 11.588364837584878, "grad_norm": 0.0008535425295121968, "learning_rate": 4.48761603450471e-06, "loss": 0.0001, "num_input_tokens_seen": 136180416, "step": 63145 }, { "epoch": 11.58928243714443, "grad_norm": 0.00995340384542942, "learning_rate": 4.486819500069866e-06, "loss": 0.0, "num_input_tokens_seen": 136190304, "step": 63150 }, { "epoch": 11.590200036703983, "grad_norm": 0.0016217200318351388, "learning_rate": 4.486022978797308e-06, "loss": 0.0, "num_input_tokens_seen": 136200256, "step": 63155 }, { "epoch": 11.591117636263535, "grad_norm": 0.000353713781805709, "learning_rate": 4.4852264707074635e-06, "loss": 0.0, "num_input_tokens_seen": 136211168, "step": 63160 }, { "epoch": 11.592035235823086, "grad_norm": 0.0020864224061369896, "learning_rate": 4.484429975820763e-06, "loss": 0.0001, "num_input_tokens_seen": 136221248, "step": 63165 }, { "epoch": 11.59295283538264, "grad_norm": 0.0006142894853837788, "learning_rate": 4.483633494157632e-06, "loss": 0.0012, "num_input_tokens_seen": 136231584, "step": 63170 }, { "epoch": 11.593870434942191, "grad_norm": 0.0008854979532770813, "learning_rate": 4.482837025738504e-06, "loss": 0.0001, "num_input_tokens_seen": 136241696, "step": 63175 }, { "epoch": 11.594788034501743, "grad_norm": 0.0015231490833684802, "learning_rate": 4.482040570583805e-06, "loss": 0.0097, "num_input_tokens_seen": 136252544, "step": 63180 }, { "epoch": 11.595705634061297, "grad_norm": 0.007220268715173006, "learning_rate": 4.481244128713959e-06, "loss": 0.0001, "num_input_tokens_seen": 136263904, "step": 63185 }, { "epoch": 11.596623233620848, "grad_norm": 0.000318260834319517, "learning_rate": 4.4804477001494015e-06, "loss": 0.0, "num_input_tokens_seen": 136272864, "step": 63190 }, { "epoch": 11.5975408331804, "grad_norm": 0.006528065074235201, "learning_rate": 4.479651284910554e-06, "loss": 0.0002, "num_input_tokens_seen": 136283200, "step": 63195 }, { "epoch": 11.598458432739953, "grad_norm": 0.002276175655424595, "learning_rate": 4.478854883017845e-06, "loss": 0.0001, "num_input_tokens_seen": 136292672, "step": 63200 }, { "epoch": 11.599376032299505, "grad_norm": 0.004751764237880707, "learning_rate": 4.4780584944916975e-06, "loss": 0.0, "num_input_tokens_seen": 136304096, "step": 63205 }, { "epoch": 11.600293631859056, "grad_norm": 0.0012387869646772742, "learning_rate": 4.477262119352543e-06, "loss": 0.019, "num_input_tokens_seen": 136312896, "step": 63210 }, { "epoch": 11.60121123141861, "grad_norm": 0.0012442910810932517, "learning_rate": 4.476465757620806e-06, "loss": 0.3188, "num_input_tokens_seen": 136323328, "step": 63215 }, { "epoch": 11.602128830978161, "grad_norm": 56.089603424072266, "learning_rate": 4.4756694093169086e-06, "loss": 0.119, "num_input_tokens_seen": 136334240, "step": 63220 }, { "epoch": 11.603046430537713, "grad_norm": 0.0025357536505907774, "learning_rate": 4.47487307446128e-06, "loss": 0.0145, "num_input_tokens_seen": 136345536, "step": 63225 }, { "epoch": 11.603964030097266, "grad_norm": 0.15474557876586914, "learning_rate": 4.474076753074342e-06, "loss": 0.0002, "num_input_tokens_seen": 136358528, "step": 63230 }, { "epoch": 11.604881629656818, "grad_norm": 0.0006255483021959662, "learning_rate": 4.47328044517652e-06, "loss": 0.0, "num_input_tokens_seen": 136369408, "step": 63235 }, { "epoch": 11.60579922921637, "grad_norm": 0.0019164480036124587, "learning_rate": 4.47248415078824e-06, "loss": 0.0001, "num_input_tokens_seen": 136379936, "step": 63240 }, { "epoch": 11.606716828775923, "grad_norm": 134.5835418701172, "learning_rate": 4.471687869929923e-06, "loss": 0.048, "num_input_tokens_seen": 136391424, "step": 63245 }, { "epoch": 11.607634428335475, "grad_norm": 0.011424548923969269, "learning_rate": 4.470891602621994e-06, "loss": 0.0, "num_input_tokens_seen": 136402112, "step": 63250 }, { "epoch": 11.608552027895026, "grad_norm": 0.0004666948225349188, "learning_rate": 4.470095348884873e-06, "loss": 0.0001, "num_input_tokens_seen": 136411712, "step": 63255 }, { "epoch": 11.60946962745458, "grad_norm": 0.0009383769356645644, "learning_rate": 4.4692991087389866e-06, "loss": 0.0617, "num_input_tokens_seen": 136423392, "step": 63260 }, { "epoch": 11.610387227014131, "grad_norm": 0.004268843214958906, "learning_rate": 4.468502882204756e-06, "loss": 0.1813, "num_input_tokens_seen": 136435424, "step": 63265 }, { "epoch": 11.611304826573683, "grad_norm": 0.007953066378831863, "learning_rate": 4.467706669302601e-06, "loss": 0.0008, "num_input_tokens_seen": 136447712, "step": 63270 }, { "epoch": 11.612222426133236, "grad_norm": 0.0012225466780364513, "learning_rate": 4.466910470052947e-06, "loss": 0.0005, "num_input_tokens_seen": 136459104, "step": 63275 }, { "epoch": 11.613140025692788, "grad_norm": 84.20845794677734, "learning_rate": 4.466114284476213e-06, "loss": 0.0762, "num_input_tokens_seen": 136469696, "step": 63280 }, { "epoch": 11.61405762525234, "grad_norm": 3.989020824432373, "learning_rate": 4.465318112592818e-06, "loss": 0.2378, "num_input_tokens_seen": 136480992, "step": 63285 }, { "epoch": 11.614975224811893, "grad_norm": 0.00057815300533548, "learning_rate": 4.464521954423187e-06, "loss": 0.0001, "num_input_tokens_seen": 136491712, "step": 63290 }, { "epoch": 11.615892824371445, "grad_norm": 0.030580049380660057, "learning_rate": 4.463725809987738e-06, "loss": 0.0001, "num_input_tokens_seen": 136502048, "step": 63295 }, { "epoch": 11.616810423930996, "grad_norm": 0.0063770306296646595, "learning_rate": 4.46292967930689e-06, "loss": 0.2064, "num_input_tokens_seen": 136513536, "step": 63300 }, { "epoch": 11.61772802349055, "grad_norm": 0.004609372466802597, "learning_rate": 4.462133562401062e-06, "loss": 0.0002, "num_input_tokens_seen": 136523808, "step": 63305 }, { "epoch": 11.618645623050101, "grad_norm": 0.04079157114028931, "learning_rate": 4.461337459290677e-06, "loss": 0.0763, "num_input_tokens_seen": 136534816, "step": 63310 }, { "epoch": 11.619563222609653, "grad_norm": 0.018865332007408142, "learning_rate": 4.460541369996151e-06, "loss": 0.0001, "num_input_tokens_seen": 136545088, "step": 63315 }, { "epoch": 11.620480822169206, "grad_norm": 0.028811154887080193, "learning_rate": 4.459745294537901e-06, "loss": 0.0001, "num_input_tokens_seen": 136555680, "step": 63320 }, { "epoch": 11.621398421728758, "grad_norm": 0.12567342817783356, "learning_rate": 4.458949232936349e-06, "loss": 0.0002, "num_input_tokens_seen": 136565824, "step": 63325 }, { "epoch": 11.62231602128831, "grad_norm": 0.17453572154045105, "learning_rate": 4.458153185211909e-06, "loss": 0.0133, "num_input_tokens_seen": 136575904, "step": 63330 }, { "epoch": 11.623233620847863, "grad_norm": 0.0018346728757023811, "learning_rate": 4.457357151385e-06, "loss": 0.0002, "num_input_tokens_seen": 136585248, "step": 63335 }, { "epoch": 11.624151220407414, "grad_norm": 0.0020926676224917173, "learning_rate": 4.45656113147604e-06, "loss": 0.0002, "num_input_tokens_seen": 136597152, "step": 63340 }, { "epoch": 11.625068819966966, "grad_norm": 0.042737338691949844, "learning_rate": 4.455765125505444e-06, "loss": 0.0001, "num_input_tokens_seen": 136608288, "step": 63345 }, { "epoch": 11.62598641952652, "grad_norm": 0.17077675461769104, "learning_rate": 4.454969133493631e-06, "loss": 0.0002, "num_input_tokens_seen": 136618816, "step": 63350 }, { "epoch": 11.626904019086071, "grad_norm": 0.028969429433345795, "learning_rate": 4.454173155461011e-06, "loss": 0.0002, "num_input_tokens_seen": 136628736, "step": 63355 }, { "epoch": 11.627821618645623, "grad_norm": 0.001204594154842198, "learning_rate": 4.453377191428007e-06, "loss": 0.0, "num_input_tokens_seen": 136639488, "step": 63360 }, { "epoch": 11.628739218205176, "grad_norm": 0.31642037630081177, "learning_rate": 4.4525812414150304e-06, "loss": 0.0004, "num_input_tokens_seen": 136651040, "step": 63365 }, { "epoch": 11.629656817764728, "grad_norm": 0.004746817052364349, "learning_rate": 4.451785305442494e-06, "loss": 0.0002, "num_input_tokens_seen": 136662080, "step": 63370 }, { "epoch": 11.63057441732428, "grad_norm": 0.01978723518550396, "learning_rate": 4.450989383530817e-06, "loss": 0.0001, "num_input_tokens_seen": 136671968, "step": 63375 }, { "epoch": 11.631492016883833, "grad_norm": 0.002734564943239093, "learning_rate": 4.450193475700412e-06, "loss": 0.1813, "num_input_tokens_seen": 136682656, "step": 63380 }, { "epoch": 11.632409616443384, "grad_norm": 0.0008178101852536201, "learning_rate": 4.449397581971693e-06, "loss": 0.0001, "num_input_tokens_seen": 136694240, "step": 63385 }, { "epoch": 11.633327216002936, "grad_norm": 0.0022168096620589495, "learning_rate": 4.448601702365069e-06, "loss": 0.0014, "num_input_tokens_seen": 136703648, "step": 63390 }, { "epoch": 11.63424481556249, "grad_norm": 0.028635868802666664, "learning_rate": 4.44780583690096e-06, "loss": 0.0004, "num_input_tokens_seen": 136713312, "step": 63395 }, { "epoch": 11.635162415122041, "grad_norm": 0.00040773709770292044, "learning_rate": 4.447009985599775e-06, "loss": 0.001, "num_input_tokens_seen": 136723968, "step": 63400 }, { "epoch": 11.636080014681593, "grad_norm": 0.0023696215357631445, "learning_rate": 4.446214148481924e-06, "loss": 0.004, "num_input_tokens_seen": 136734464, "step": 63405 }, { "epoch": 11.636997614241146, "grad_norm": 0.002327646827325225, "learning_rate": 4.445418325567825e-06, "loss": 0.0, "num_input_tokens_seen": 136745728, "step": 63410 }, { "epoch": 11.637915213800698, "grad_norm": 0.0018022360745817423, "learning_rate": 4.444622516877886e-06, "loss": 0.0, "num_input_tokens_seen": 136756576, "step": 63415 }, { "epoch": 11.63883281336025, "grad_norm": 0.002888355404138565, "learning_rate": 4.443826722432517e-06, "loss": 0.0329, "num_input_tokens_seen": 136768032, "step": 63420 }, { "epoch": 11.639750412919803, "grad_norm": 0.022562000900506973, "learning_rate": 4.4430309422521315e-06, "loss": 0.0001, "num_input_tokens_seen": 136778272, "step": 63425 }, { "epoch": 11.640668012479354, "grad_norm": 0.0014129288028925657, "learning_rate": 4.442235176357139e-06, "loss": 0.0, "num_input_tokens_seen": 136789184, "step": 63430 }, { "epoch": 11.641585612038906, "grad_norm": 0.022486446425318718, "learning_rate": 4.441439424767951e-06, "loss": 0.0, "num_input_tokens_seen": 136800352, "step": 63435 }, { "epoch": 11.64250321159846, "grad_norm": 0.14012224972248077, "learning_rate": 4.440643687504975e-06, "loss": 0.0002, "num_input_tokens_seen": 136811072, "step": 63440 }, { "epoch": 11.64342081115801, "grad_norm": 0.036588069051504135, "learning_rate": 4.439847964588622e-06, "loss": 0.0001, "num_input_tokens_seen": 136820864, "step": 63445 }, { "epoch": 11.644338410717562, "grad_norm": 0.03673824667930603, "learning_rate": 4.4390522560393e-06, "loss": 0.046, "num_input_tokens_seen": 136831424, "step": 63450 }, { "epoch": 11.645256010277116, "grad_norm": 0.0007839978206902742, "learning_rate": 4.438256561877417e-06, "loss": 0.0, "num_input_tokens_seen": 136842368, "step": 63455 }, { "epoch": 11.646173609836667, "grad_norm": 6.829383373260498, "learning_rate": 4.437460882123384e-06, "loss": 0.0029, "num_input_tokens_seen": 136853248, "step": 63460 }, { "epoch": 11.64709120939622, "grad_norm": 0.011334037408232689, "learning_rate": 4.436665216797609e-06, "loss": 0.0051, "num_input_tokens_seen": 136863872, "step": 63465 }, { "epoch": 11.648008808955773, "grad_norm": 0.24225887656211853, "learning_rate": 4.435869565920497e-06, "loss": 0.001, "num_input_tokens_seen": 136875072, "step": 63470 }, { "epoch": 11.648926408515324, "grad_norm": 0.007656459230929613, "learning_rate": 4.435073929512454e-06, "loss": 0.0002, "num_input_tokens_seen": 136886464, "step": 63475 }, { "epoch": 11.649844008074876, "grad_norm": 0.0038546384312212467, "learning_rate": 4.4342783075938915e-06, "loss": 0.0001, "num_input_tokens_seen": 136897696, "step": 63480 }, { "epoch": 11.65076160763443, "grad_norm": 0.003197185695171356, "learning_rate": 4.433482700185213e-06, "loss": 0.0, "num_input_tokens_seen": 136907552, "step": 63485 }, { "epoch": 11.65167920719398, "grad_norm": 0.00992753729224205, "learning_rate": 4.432687107306825e-06, "loss": 0.0, "num_input_tokens_seen": 136919072, "step": 63490 }, { "epoch": 11.652596806753532, "grad_norm": 0.0011476523941382766, "learning_rate": 4.431891528979133e-06, "loss": 0.0007, "num_input_tokens_seen": 136929632, "step": 63495 }, { "epoch": 11.653514406313086, "grad_norm": 0.002104280050843954, "learning_rate": 4.431095965222545e-06, "loss": 0.1875, "num_input_tokens_seen": 136940672, "step": 63500 }, { "epoch": 11.654432005872637, "grad_norm": 0.00032589572947472334, "learning_rate": 4.430300416057461e-06, "loss": 0.0001, "num_input_tokens_seen": 136951328, "step": 63505 }, { "epoch": 11.655349605432189, "grad_norm": 0.0013265387387946248, "learning_rate": 4.42950488150429e-06, "loss": 0.0, "num_input_tokens_seen": 136962496, "step": 63510 }, { "epoch": 11.656267204991742, "grad_norm": 0.000438955903518945, "learning_rate": 4.428709361583435e-06, "loss": 0.1066, "num_input_tokens_seen": 136973984, "step": 63515 }, { "epoch": 11.657184804551294, "grad_norm": 0.059135157614946365, "learning_rate": 4.427913856315299e-06, "loss": 0.0001, "num_input_tokens_seen": 136984800, "step": 63520 }, { "epoch": 11.658102404110846, "grad_norm": 0.051547423005104065, "learning_rate": 4.427118365720285e-06, "loss": 0.1514, "num_input_tokens_seen": 136995392, "step": 63525 }, { "epoch": 11.659020003670399, "grad_norm": 0.0013159937225282192, "learning_rate": 4.426322889818798e-06, "loss": 0.0, "num_input_tokens_seen": 137006560, "step": 63530 }, { "epoch": 11.65993760322995, "grad_norm": 0.00031761167338117957, "learning_rate": 4.42552742863124e-06, "loss": 0.0007, "num_input_tokens_seen": 137017664, "step": 63535 }, { "epoch": 11.660855202789502, "grad_norm": 0.0015589608810842037, "learning_rate": 4.424731982178012e-06, "loss": 0.1532, "num_input_tokens_seen": 137029600, "step": 63540 }, { "epoch": 11.661772802349056, "grad_norm": 70.74942016601562, "learning_rate": 4.423936550479519e-06, "loss": 0.1565, "num_input_tokens_seen": 137041152, "step": 63545 }, { "epoch": 11.662690401908607, "grad_norm": 0.009457966312766075, "learning_rate": 4.423141133556158e-06, "loss": 0.0, "num_input_tokens_seen": 137052128, "step": 63550 }, { "epoch": 11.663608001468159, "grad_norm": 0.007796932011842728, "learning_rate": 4.4223457314283335e-06, "loss": 0.0001, "num_input_tokens_seen": 137063104, "step": 63555 }, { "epoch": 11.664525601027712, "grad_norm": 0.7348083853721619, "learning_rate": 4.421550344116447e-06, "loss": 0.0006, "num_input_tokens_seen": 137074144, "step": 63560 }, { "epoch": 11.665443200587264, "grad_norm": 0.0013373313704505563, "learning_rate": 4.420754971640898e-06, "loss": 0.0001, "num_input_tokens_seen": 137085440, "step": 63565 }, { "epoch": 11.666360800146816, "grad_norm": 0.014282156713306904, "learning_rate": 4.419959614022086e-06, "loss": 0.0001, "num_input_tokens_seen": 137096992, "step": 63570 }, { "epoch": 11.667278399706369, "grad_norm": 0.0014354928862303495, "learning_rate": 4.419164271280408e-06, "loss": 0.1907, "num_input_tokens_seen": 137107424, "step": 63575 }, { "epoch": 11.66819599926592, "grad_norm": 0.0028518654871731997, "learning_rate": 4.418368943436269e-06, "loss": 0.0001, "num_input_tokens_seen": 137117984, "step": 63580 }, { "epoch": 11.669113598825472, "grad_norm": 0.00596822751685977, "learning_rate": 4.4175736305100656e-06, "loss": 0.002, "num_input_tokens_seen": 137129120, "step": 63585 }, { "epoch": 11.670031198385026, "grad_norm": 0.0005467783776111901, "learning_rate": 4.416778332522193e-06, "loss": 0.0, "num_input_tokens_seen": 137139808, "step": 63590 }, { "epoch": 11.670948797944577, "grad_norm": 0.002216626890003681, "learning_rate": 4.415983049493055e-06, "loss": 0.0057, "num_input_tokens_seen": 137150656, "step": 63595 }, { "epoch": 11.671866397504129, "grad_norm": 63.71171188354492, "learning_rate": 4.415187781443047e-06, "loss": 0.1629, "num_input_tokens_seen": 137161568, "step": 63600 }, { "epoch": 11.672783997063682, "grad_norm": 0.008286740630865097, "learning_rate": 4.414392528392563e-06, "loss": 0.0, "num_input_tokens_seen": 137171456, "step": 63605 }, { "epoch": 11.673701596623234, "grad_norm": 0.022236406803131104, "learning_rate": 4.413597290362005e-06, "loss": 0.0001, "num_input_tokens_seen": 137182848, "step": 63610 }, { "epoch": 11.674619196182785, "grad_norm": 0.0010762755991891026, "learning_rate": 4.412802067371768e-06, "loss": 0.0, "num_input_tokens_seen": 137192896, "step": 63615 }, { "epoch": 11.675536795742339, "grad_norm": 0.00494360039010644, "learning_rate": 4.4120068594422485e-06, "loss": 0.225, "num_input_tokens_seen": 137203904, "step": 63620 }, { "epoch": 11.67645439530189, "grad_norm": 0.023999985307455063, "learning_rate": 4.411211666593839e-06, "loss": 0.0001, "num_input_tokens_seen": 137213696, "step": 63625 }, { "epoch": 11.677371994861442, "grad_norm": 0.0004916585749015212, "learning_rate": 4.41041648884694e-06, "loss": 0.0001, "num_input_tokens_seen": 137224224, "step": 63630 }, { "epoch": 11.678289594420995, "grad_norm": 0.00044123013503849506, "learning_rate": 4.4096213262219436e-06, "loss": 0.1782, "num_input_tokens_seen": 137234560, "step": 63635 }, { "epoch": 11.679207193980547, "grad_norm": 0.0015155626460909843, "learning_rate": 4.408826178739245e-06, "loss": 0.002, "num_input_tokens_seen": 137244224, "step": 63640 }, { "epoch": 11.680124793540099, "grad_norm": 0.04066178575158119, "learning_rate": 4.408031046419239e-06, "loss": 0.0002, "num_input_tokens_seen": 137255008, "step": 63645 }, { "epoch": 11.681042393099652, "grad_norm": 0.006898963358253241, "learning_rate": 4.407235929282319e-06, "loss": 0.0, "num_input_tokens_seen": 137265920, "step": 63650 }, { "epoch": 11.681959992659204, "grad_norm": 0.0008184765465557575, "learning_rate": 4.406440827348879e-06, "loss": 0.0002, "num_input_tokens_seen": 137278048, "step": 63655 }, { "epoch": 11.682877592218755, "grad_norm": 0.0030774439219385386, "learning_rate": 4.405645740639312e-06, "loss": 0.0, "num_input_tokens_seen": 137290016, "step": 63660 }, { "epoch": 11.683795191778309, "grad_norm": 0.00020643562311306596, "learning_rate": 4.404850669174011e-06, "loss": 0.0001, "num_input_tokens_seen": 137302432, "step": 63665 }, { "epoch": 11.68471279133786, "grad_norm": 0.0011205017799511552, "learning_rate": 4.404055612973367e-06, "loss": 0.0, "num_input_tokens_seen": 137312928, "step": 63670 }, { "epoch": 11.685630390897412, "grad_norm": 0.003783889813348651, "learning_rate": 4.403260572057772e-06, "loss": 0.0002, "num_input_tokens_seen": 137324800, "step": 63675 }, { "epoch": 11.686547990456965, "grad_norm": 0.1294558346271515, "learning_rate": 4.4024655464476204e-06, "loss": 0.0003, "num_input_tokens_seen": 137336000, "step": 63680 }, { "epoch": 11.687465590016517, "grad_norm": 0.0007396818255074322, "learning_rate": 4.401670536163301e-06, "loss": 0.004, "num_input_tokens_seen": 137347584, "step": 63685 }, { "epoch": 11.688383189576069, "grad_norm": 0.0318339541554451, "learning_rate": 4.400875541225203e-06, "loss": 0.0002, "num_input_tokens_seen": 137358304, "step": 63690 }, { "epoch": 11.689300789135622, "grad_norm": 0.0005137884872965515, "learning_rate": 4.400080561653721e-06, "loss": 0.0029, "num_input_tokens_seen": 137369248, "step": 63695 }, { "epoch": 11.690218388695174, "grad_norm": 0.004395311698317528, "learning_rate": 4.399285597469242e-06, "loss": 0.0001, "num_input_tokens_seen": 137380512, "step": 63700 }, { "epoch": 11.691135988254725, "grad_norm": 0.007775548845529556, "learning_rate": 4.398490648692158e-06, "loss": 0.0119, "num_input_tokens_seen": 137391392, "step": 63705 }, { "epoch": 11.692053587814279, "grad_norm": 0.9916610717773438, "learning_rate": 4.397695715342854e-06, "loss": 0.1471, "num_input_tokens_seen": 137402624, "step": 63710 }, { "epoch": 11.69297118737383, "grad_norm": 0.0007244159933179617, "learning_rate": 4.396900797441723e-06, "loss": 0.0006, "num_input_tokens_seen": 137413664, "step": 63715 }, { "epoch": 11.693888786933382, "grad_norm": 0.0020569709595292807, "learning_rate": 4.396105895009151e-06, "loss": 0.0002, "num_input_tokens_seen": 137424672, "step": 63720 }, { "epoch": 11.694806386492935, "grad_norm": 0.002062194747850299, "learning_rate": 4.395311008065527e-06, "loss": 0.0001, "num_input_tokens_seen": 137435360, "step": 63725 }, { "epoch": 11.695723986052487, "grad_norm": 0.0006120766629464924, "learning_rate": 4.394516136631239e-06, "loss": 0.0, "num_input_tokens_seen": 137446400, "step": 63730 }, { "epoch": 11.696641585612038, "grad_norm": 0.0018809864996001124, "learning_rate": 4.393721280726674e-06, "loss": 0.0, "num_input_tokens_seen": 137457632, "step": 63735 }, { "epoch": 11.697559185171592, "grad_norm": 0.0003541560145094991, "learning_rate": 4.392926440372218e-06, "loss": 0.0, "num_input_tokens_seen": 137467328, "step": 63740 }, { "epoch": 11.698476784731143, "grad_norm": 0.08566688746213913, "learning_rate": 4.3921316155882575e-06, "loss": 0.0001, "num_input_tokens_seen": 137479168, "step": 63745 }, { "epoch": 11.699394384290695, "grad_norm": 0.06673788279294968, "learning_rate": 4.3913368063951795e-06, "loss": 0.0, "num_input_tokens_seen": 137490784, "step": 63750 }, { "epoch": 11.700311983850249, "grad_norm": 0.026561392471194267, "learning_rate": 4.390542012813369e-06, "loss": 0.0001, "num_input_tokens_seen": 137501408, "step": 63755 }, { "epoch": 11.7012295834098, "grad_norm": 0.0011229391675442457, "learning_rate": 4.389747234863211e-06, "loss": 0.0002, "num_input_tokens_seen": 137513120, "step": 63760 }, { "epoch": 11.702147182969352, "grad_norm": 0.08636396378278732, "learning_rate": 4.3889524725650915e-06, "loss": 0.0, "num_input_tokens_seen": 137522144, "step": 63765 }, { "epoch": 11.703064782528905, "grad_norm": 0.002662917133420706, "learning_rate": 4.388157725939392e-06, "loss": 0.0001, "num_input_tokens_seen": 137533856, "step": 63770 }, { "epoch": 11.703982382088457, "grad_norm": 0.0014494205825030804, "learning_rate": 4.3873629950065e-06, "loss": 0.0, "num_input_tokens_seen": 137542880, "step": 63775 }, { "epoch": 11.704899981648008, "grad_norm": 0.09851963073015213, "learning_rate": 4.386568279786799e-06, "loss": 0.0001, "num_input_tokens_seen": 137553600, "step": 63780 }, { "epoch": 11.705817581207562, "grad_norm": 0.0003465446934569627, "learning_rate": 4.3857735803006695e-06, "loss": 0.0, "num_input_tokens_seen": 137564384, "step": 63785 }, { "epoch": 11.706735180767113, "grad_norm": 0.0006272198515944183, "learning_rate": 4.384978896568496e-06, "loss": 0.0001, "num_input_tokens_seen": 137573248, "step": 63790 }, { "epoch": 11.707652780326665, "grad_norm": 0.001127283787354827, "learning_rate": 4.384184228610659e-06, "loss": 0.0, "num_input_tokens_seen": 137583616, "step": 63795 }, { "epoch": 11.708570379886218, "grad_norm": 0.0003236604970879853, "learning_rate": 4.383389576447544e-06, "loss": 0.1006, "num_input_tokens_seen": 137594784, "step": 63800 }, { "epoch": 11.70948797944577, "grad_norm": 0.0026825142558664083, "learning_rate": 4.382594940099531e-06, "loss": 0.0042, "num_input_tokens_seen": 137606464, "step": 63805 }, { "epoch": 11.710405579005322, "grad_norm": 0.009787969291210175, "learning_rate": 4.381800319586998e-06, "loss": 0.1097, "num_input_tokens_seen": 137617248, "step": 63810 }, { "epoch": 11.711323178564875, "grad_norm": 0.0006222163792699575, "learning_rate": 4.381005714930331e-06, "loss": 0.0, "num_input_tokens_seen": 137628992, "step": 63815 }, { "epoch": 11.712240778124427, "grad_norm": 0.0011335432063788176, "learning_rate": 4.380211126149909e-06, "loss": 0.1035, "num_input_tokens_seen": 137639264, "step": 63820 }, { "epoch": 11.713158377683978, "grad_norm": 0.001987623516470194, "learning_rate": 4.379416553266108e-06, "loss": 0.0, "num_input_tokens_seen": 137650720, "step": 63825 }, { "epoch": 11.714075977243532, "grad_norm": 0.0007215355290099978, "learning_rate": 4.378621996299313e-06, "loss": 0.0, "num_input_tokens_seen": 137661088, "step": 63830 }, { "epoch": 11.714993576803083, "grad_norm": 37.107177734375, "learning_rate": 4.377827455269901e-06, "loss": 0.244, "num_input_tokens_seen": 137671488, "step": 63835 }, { "epoch": 11.715911176362635, "grad_norm": 0.07881313562393188, "learning_rate": 4.37703293019825e-06, "loss": 0.0001, "num_input_tokens_seen": 137682080, "step": 63840 }, { "epoch": 11.716828775922188, "grad_norm": 0.0010121079394593835, "learning_rate": 4.3762384211047365e-06, "loss": 0.0, "num_input_tokens_seen": 137692832, "step": 63845 }, { "epoch": 11.71774637548174, "grad_norm": 0.0023548067547380924, "learning_rate": 4.375443928009743e-06, "loss": 0.0001, "num_input_tokens_seen": 137703520, "step": 63850 }, { "epoch": 11.718663975041292, "grad_norm": 0.003147422568872571, "learning_rate": 4.3746494509336455e-06, "loss": 0.0, "num_input_tokens_seen": 137712928, "step": 63855 }, { "epoch": 11.719581574600845, "grad_norm": 0.010133162140846252, "learning_rate": 4.373854989896819e-06, "loss": 0.0, "num_input_tokens_seen": 137723360, "step": 63860 }, { "epoch": 11.720499174160397, "grad_norm": 0.0018361564725637436, "learning_rate": 4.373060544919642e-06, "loss": 0.0001, "num_input_tokens_seen": 137733920, "step": 63865 }, { "epoch": 11.721416773719948, "grad_norm": 0.0002129646309185773, "learning_rate": 4.37226611602249e-06, "loss": 0.0001, "num_input_tokens_seen": 137745152, "step": 63870 }, { "epoch": 11.722334373279502, "grad_norm": 0.0026180176064372063, "learning_rate": 4.371471703225739e-06, "loss": 0.0, "num_input_tokens_seen": 137755168, "step": 63875 }, { "epoch": 11.723251972839053, "grad_norm": 0.0035294245462864637, "learning_rate": 4.370677306549766e-06, "loss": 0.0, "num_input_tokens_seen": 137765696, "step": 63880 }, { "epoch": 11.724169572398605, "grad_norm": 0.0014709861716255546, "learning_rate": 4.369882926014945e-06, "loss": 0.0, "num_input_tokens_seen": 137776160, "step": 63885 }, { "epoch": 11.725087171958158, "grad_norm": 0.00037107497337274253, "learning_rate": 4.369088561641649e-06, "loss": 0.0, "num_input_tokens_seen": 137787552, "step": 63890 }, { "epoch": 11.72600477151771, "grad_norm": 0.0004676393000409007, "learning_rate": 4.3682942134502525e-06, "loss": 0.0, "num_input_tokens_seen": 137797504, "step": 63895 }, { "epoch": 11.726922371077261, "grad_norm": 101.1525650024414, "learning_rate": 4.367499881461132e-06, "loss": 0.11, "num_input_tokens_seen": 137808480, "step": 63900 }, { "epoch": 11.727839970636815, "grad_norm": 0.07655997574329376, "learning_rate": 4.36670556569466e-06, "loss": 0.0401, "num_input_tokens_seen": 137819776, "step": 63905 }, { "epoch": 11.728757570196366, "grad_norm": 40.908199310302734, "learning_rate": 4.365911266171206e-06, "loss": 0.151, "num_input_tokens_seen": 137829056, "step": 63910 }, { "epoch": 11.729675169755918, "grad_norm": 0.0028432817198336124, "learning_rate": 4.365116982911147e-06, "loss": 0.0, "num_input_tokens_seen": 137840352, "step": 63915 }, { "epoch": 11.730592769315471, "grad_norm": 0.00493107782676816, "learning_rate": 4.364322715934854e-06, "loss": 0.0001, "num_input_tokens_seen": 137850496, "step": 63920 }, { "epoch": 11.731510368875023, "grad_norm": 0.020865153521299362, "learning_rate": 4.363528465262699e-06, "loss": 0.0007, "num_input_tokens_seen": 137861024, "step": 63925 }, { "epoch": 11.732427968434575, "grad_norm": 0.020300431177020073, "learning_rate": 4.3627342309150485e-06, "loss": 0.0011, "num_input_tokens_seen": 137871584, "step": 63930 }, { "epoch": 11.733345567994128, "grad_norm": 0.0017303598579019308, "learning_rate": 4.361940012912281e-06, "loss": 0.0003, "num_input_tokens_seen": 137883040, "step": 63935 }, { "epoch": 11.73426316755368, "grad_norm": 0.018181122839450836, "learning_rate": 4.361145811274762e-06, "loss": 0.0, "num_input_tokens_seen": 137893440, "step": 63940 }, { "epoch": 11.735180767113231, "grad_norm": 0.0004561736132018268, "learning_rate": 4.360351626022861e-06, "loss": 0.197, "num_input_tokens_seen": 137905376, "step": 63945 }, { "epoch": 11.736098366672785, "grad_norm": 0.002431460889056325, "learning_rate": 4.359557457176952e-06, "loss": 0.0002, "num_input_tokens_seen": 137916224, "step": 63950 }, { "epoch": 11.737015966232336, "grad_norm": 0.0008365586982108653, "learning_rate": 4.358763304757402e-06, "loss": 0.1407, "num_input_tokens_seen": 137926752, "step": 63955 }, { "epoch": 11.737933565791888, "grad_norm": 0.027408041059970856, "learning_rate": 4.357969168784577e-06, "loss": 0.0001, "num_input_tokens_seen": 137936960, "step": 63960 }, { "epoch": 11.738851165351441, "grad_norm": 0.0027655374724417925, "learning_rate": 4.357175049278849e-06, "loss": 0.0001, "num_input_tokens_seen": 137947904, "step": 63965 }, { "epoch": 11.739768764910993, "grad_norm": 8.349921226501465, "learning_rate": 4.356380946260585e-06, "loss": 0.1271, "num_input_tokens_seen": 137957664, "step": 63970 }, { "epoch": 11.740686364470545, "grad_norm": 0.003550566965714097, "learning_rate": 4.355586859750153e-06, "loss": 0.001, "num_input_tokens_seen": 137968384, "step": 63975 }, { "epoch": 11.741603964030098, "grad_norm": 0.03632103651762009, "learning_rate": 4.354792789767918e-06, "loss": 0.0, "num_input_tokens_seen": 137980448, "step": 63980 }, { "epoch": 11.74252156358965, "grad_norm": 0.0035218968987464905, "learning_rate": 4.353998736334248e-06, "loss": 0.0001, "num_input_tokens_seen": 137990176, "step": 63985 }, { "epoch": 11.743439163149201, "grad_norm": 0.005261069163680077, "learning_rate": 4.35320469946951e-06, "loss": 0.0, "num_input_tokens_seen": 137998496, "step": 63990 }, { "epoch": 11.744356762708755, "grad_norm": 0.016660869121551514, "learning_rate": 4.352410679194067e-06, "loss": 0.0, "num_input_tokens_seen": 138008864, "step": 63995 }, { "epoch": 11.745274362268306, "grad_norm": 0.0052801985293626785, "learning_rate": 4.351616675528288e-06, "loss": 0.0, "num_input_tokens_seen": 138019104, "step": 64000 }, { "epoch": 11.746191961827858, "grad_norm": 0.002909925766289234, "learning_rate": 4.350822688492537e-06, "loss": 0.0, "num_input_tokens_seen": 138029568, "step": 64005 }, { "epoch": 11.747109561387411, "grad_norm": 0.0007272708462551236, "learning_rate": 4.350028718107175e-06, "loss": 0.0, "num_input_tokens_seen": 138040928, "step": 64010 }, { "epoch": 11.748027160946963, "grad_norm": 0.003060869872570038, "learning_rate": 4.349234764392572e-06, "loss": 0.0, "num_input_tokens_seen": 138052032, "step": 64015 }, { "epoch": 11.748944760506514, "grad_norm": 0.0008751862915232778, "learning_rate": 4.348440827369088e-06, "loss": 0.0001, "num_input_tokens_seen": 138061536, "step": 64020 }, { "epoch": 11.749862360066068, "grad_norm": 0.0006023073801770806, "learning_rate": 4.347646907057088e-06, "loss": 0.1007, "num_input_tokens_seen": 138072832, "step": 64025 }, { "epoch": 11.75077995962562, "grad_norm": 0.0010484247468411922, "learning_rate": 4.346853003476931e-06, "loss": 0.0007, "num_input_tokens_seen": 138084992, "step": 64030 }, { "epoch": 11.751697559185171, "grad_norm": 0.0005790591822005808, "learning_rate": 4.346059116648984e-06, "loss": 0.1097, "num_input_tokens_seen": 138095648, "step": 64035 }, { "epoch": 11.752615158744725, "grad_norm": 0.0004089509602636099, "learning_rate": 4.3452652465936075e-06, "loss": 0.0009, "num_input_tokens_seen": 138106016, "step": 64040 }, { "epoch": 11.753532758304276, "grad_norm": 0.0018944531911984086, "learning_rate": 4.344471393331161e-06, "loss": 0.0, "num_input_tokens_seen": 138116896, "step": 64045 }, { "epoch": 11.754450357863828, "grad_norm": 0.010758241638541222, "learning_rate": 4.3436775568820085e-06, "loss": 0.0001, "num_input_tokens_seen": 138127552, "step": 64050 }, { "epoch": 11.755367957423381, "grad_norm": 0.00029085762798786163, "learning_rate": 4.34288373726651e-06, "loss": 0.0, "num_input_tokens_seen": 138139616, "step": 64055 }, { "epoch": 11.756285556982933, "grad_norm": 0.001834411988966167, "learning_rate": 4.3420899345050235e-06, "loss": 0.0, "num_input_tokens_seen": 138150144, "step": 64060 }, { "epoch": 11.757203156542484, "grad_norm": 0.0008423994295299053, "learning_rate": 4.341296148617911e-06, "loss": 0.0001, "num_input_tokens_seen": 138161504, "step": 64065 }, { "epoch": 11.758120756102038, "grad_norm": 0.0008144775638356805, "learning_rate": 4.340502379625533e-06, "loss": 0.0, "num_input_tokens_seen": 138172896, "step": 64070 }, { "epoch": 11.75903835566159, "grad_norm": 0.0009076342103071511, "learning_rate": 4.339708627548246e-06, "loss": 0.016, "num_input_tokens_seen": 138184064, "step": 64075 }, { "epoch": 11.759955955221141, "grad_norm": 0.0023159326519817114, "learning_rate": 4.338914892406408e-06, "loss": 0.0328, "num_input_tokens_seen": 138194880, "step": 64080 }, { "epoch": 11.760873554780694, "grad_norm": 0.05819559097290039, "learning_rate": 4.33812117422038e-06, "loss": 0.0002, "num_input_tokens_seen": 138204480, "step": 64085 }, { "epoch": 11.761791154340246, "grad_norm": 0.23751425743103027, "learning_rate": 4.337327473010518e-06, "loss": 0.0002, "num_input_tokens_seen": 138216416, "step": 64090 }, { "epoch": 11.762708753899798, "grad_norm": 0.000876867794431746, "learning_rate": 4.3365337887971775e-06, "loss": 0.0244, "num_input_tokens_seen": 138226848, "step": 64095 }, { "epoch": 11.763626353459351, "grad_norm": 0.1751318871974945, "learning_rate": 4.335740121600719e-06, "loss": 0.0003, "num_input_tokens_seen": 138237568, "step": 64100 }, { "epoch": 11.764543953018903, "grad_norm": 0.01951194368302822, "learning_rate": 4.334946471441497e-06, "loss": 0.2094, "num_input_tokens_seen": 138247840, "step": 64105 }, { "epoch": 11.765461552578454, "grad_norm": 0.0007908559055067599, "learning_rate": 4.334152838339867e-06, "loss": 0.0, "num_input_tokens_seen": 138258752, "step": 64110 }, { "epoch": 11.766379152138008, "grad_norm": 0.0017968440661206841, "learning_rate": 4.3333592223161826e-06, "loss": 0.0001, "num_input_tokens_seen": 138270688, "step": 64115 }, { "epoch": 11.76729675169756, "grad_norm": 0.03694406524300575, "learning_rate": 4.332565623390803e-06, "loss": 0.0002, "num_input_tokens_seen": 138282432, "step": 64120 }, { "epoch": 11.768214351257111, "grad_norm": 0.0032046292908489704, "learning_rate": 4.331772041584081e-06, "loss": 0.2844, "num_input_tokens_seen": 138294720, "step": 64125 }, { "epoch": 11.769131950816664, "grad_norm": 0.002802202245220542, "learning_rate": 4.3309784769163685e-06, "loss": 0.0001, "num_input_tokens_seen": 138306880, "step": 64130 }, { "epoch": 11.770049550376216, "grad_norm": 0.0005892461049370468, "learning_rate": 4.330184929408023e-06, "loss": 0.0, "num_input_tokens_seen": 138318528, "step": 64135 }, { "epoch": 11.770967149935768, "grad_norm": 0.00344651797786355, "learning_rate": 4.329391399079396e-06, "loss": 0.0, "num_input_tokens_seen": 138329792, "step": 64140 }, { "epoch": 11.771884749495321, "grad_norm": 0.0009022516314871609, "learning_rate": 4.328597885950838e-06, "loss": 0.0005, "num_input_tokens_seen": 138340096, "step": 64145 }, { "epoch": 11.772802349054873, "grad_norm": 0.7010807394981384, "learning_rate": 4.327804390042706e-06, "loss": 0.0011, "num_input_tokens_seen": 138350496, "step": 64150 }, { "epoch": 11.773719948614424, "grad_norm": 0.00031484634382650256, "learning_rate": 4.327010911375349e-06, "loss": 0.0, "num_input_tokens_seen": 138360416, "step": 64155 }, { "epoch": 11.774637548173978, "grad_norm": 0.004523958545178175, "learning_rate": 4.3262174499691195e-06, "loss": 0.0001, "num_input_tokens_seen": 138371936, "step": 64160 }, { "epoch": 11.77555514773353, "grad_norm": 0.00021541859314311296, "learning_rate": 4.3254240058443655e-06, "loss": 0.0, "num_input_tokens_seen": 138383168, "step": 64165 }, { "epoch": 11.77647274729308, "grad_norm": 47.543678283691406, "learning_rate": 4.324630579021444e-06, "loss": 0.0108, "num_input_tokens_seen": 138394240, "step": 64170 }, { "epoch": 11.777390346852634, "grad_norm": 0.0006823369185440242, "learning_rate": 4.3238371695207e-06, "loss": 0.0, "num_input_tokens_seen": 138404608, "step": 64175 }, { "epoch": 11.778307946412186, "grad_norm": 0.00408218102529645, "learning_rate": 4.323043777362483e-06, "loss": 0.0, "num_input_tokens_seen": 138414912, "step": 64180 }, { "epoch": 11.779225545971737, "grad_norm": 0.0002886704751290381, "learning_rate": 4.3222504025671466e-06, "loss": 0.0, "num_input_tokens_seen": 138425664, "step": 64185 }, { "epoch": 11.78014314553129, "grad_norm": 0.005799994338303804, "learning_rate": 4.321457045155035e-06, "loss": 0.0005, "num_input_tokens_seen": 138436576, "step": 64190 }, { "epoch": 11.781060745090842, "grad_norm": 0.0016285423189401627, "learning_rate": 4.3206637051464985e-06, "loss": 0.0, "num_input_tokens_seen": 138446816, "step": 64195 }, { "epoch": 11.781978344650394, "grad_norm": 0.0006250526639632881, "learning_rate": 4.319870382561887e-06, "loss": 0.0853, "num_input_tokens_seen": 138457248, "step": 64200 }, { "epoch": 11.782895944209947, "grad_norm": 0.0004445445956662297, "learning_rate": 4.319077077421545e-06, "loss": 0.0, "num_input_tokens_seen": 138468160, "step": 64205 }, { "epoch": 11.783813543769499, "grad_norm": 0.00028114355518482625, "learning_rate": 4.318283789745821e-06, "loss": 0.0, "num_input_tokens_seen": 138478080, "step": 64210 }, { "epoch": 11.78473114332905, "grad_norm": 0.0006965109496377409, "learning_rate": 4.31749051955506e-06, "loss": 0.0001, "num_input_tokens_seen": 138490208, "step": 64215 }, { "epoch": 11.785648742888604, "grad_norm": 0.0005733639700338244, "learning_rate": 4.31669726686961e-06, "loss": 0.0, "num_input_tokens_seen": 138499840, "step": 64220 }, { "epoch": 11.786566342448156, "grad_norm": 0.03873635455965996, "learning_rate": 4.315904031709817e-06, "loss": 0.0003, "num_input_tokens_seen": 138511680, "step": 64225 }, { "epoch": 11.787483942007707, "grad_norm": 0.0001769085502019152, "learning_rate": 4.315110814096023e-06, "loss": 0.0001, "num_input_tokens_seen": 138522592, "step": 64230 }, { "epoch": 11.78840154156726, "grad_norm": 0.0005687656230293214, "learning_rate": 4.314317614048577e-06, "loss": 0.0, "num_input_tokens_seen": 138531936, "step": 64235 }, { "epoch": 11.789319141126812, "grad_norm": 0.001347091281786561, "learning_rate": 4.313524431587822e-06, "loss": 0.0, "num_input_tokens_seen": 138542432, "step": 64240 }, { "epoch": 11.790236740686364, "grad_norm": 0.00037630824954248965, "learning_rate": 4.312731266734101e-06, "loss": 0.0, "num_input_tokens_seen": 138553408, "step": 64245 }, { "epoch": 11.791154340245917, "grad_norm": 0.0038929448928683996, "learning_rate": 4.311938119507756e-06, "loss": 0.0, "num_input_tokens_seen": 138564352, "step": 64250 }, { "epoch": 11.792071939805469, "grad_norm": 0.00025147272390313447, "learning_rate": 4.311144989929134e-06, "loss": 0.0006, "num_input_tokens_seen": 138575520, "step": 64255 }, { "epoch": 11.79298953936502, "grad_norm": 0.00056948367273435, "learning_rate": 4.310351878018575e-06, "loss": 0.0002, "num_input_tokens_seen": 138586304, "step": 64260 }, { "epoch": 11.793907138924574, "grad_norm": 0.001854029018431902, "learning_rate": 4.30955878379642e-06, "loss": 0.0, "num_input_tokens_seen": 138596160, "step": 64265 }, { "epoch": 11.794824738484126, "grad_norm": 0.002357390010729432, "learning_rate": 4.3087657072830136e-06, "loss": 0.0, "num_input_tokens_seen": 138606464, "step": 64270 }, { "epoch": 11.795742338043677, "grad_norm": 0.0002210888487752527, "learning_rate": 4.307972648498696e-06, "loss": 0.0001, "num_input_tokens_seen": 138616512, "step": 64275 }, { "epoch": 11.79665993760323, "grad_norm": 0.01606392301619053, "learning_rate": 4.307179607463806e-06, "loss": 0.0, "num_input_tokens_seen": 138626784, "step": 64280 }, { "epoch": 11.797577537162782, "grad_norm": 0.010493641719222069, "learning_rate": 4.306386584198685e-06, "loss": 0.1005, "num_input_tokens_seen": 138637408, "step": 64285 }, { "epoch": 11.798495136722334, "grad_norm": 0.0002649406378623098, "learning_rate": 4.305593578723674e-06, "loss": 0.2594, "num_input_tokens_seen": 138646752, "step": 64290 }, { "epoch": 11.799412736281887, "grad_norm": 0.01804940588772297, "learning_rate": 4.304800591059113e-06, "loss": 0.2876, "num_input_tokens_seen": 138658208, "step": 64295 }, { "epoch": 11.800330335841439, "grad_norm": 0.004574453923851252, "learning_rate": 4.304007621225336e-06, "loss": 0.0, "num_input_tokens_seen": 138668512, "step": 64300 }, { "epoch": 11.80124793540099, "grad_norm": 0.00016521118232049048, "learning_rate": 4.303214669242688e-06, "loss": 0.0, "num_input_tokens_seen": 138679456, "step": 64305 }, { "epoch": 11.802165534960544, "grad_norm": 0.0380806103348732, "learning_rate": 4.302421735131502e-06, "loss": 0.0, "num_input_tokens_seen": 138688608, "step": 64310 }, { "epoch": 11.803083134520095, "grad_norm": 0.0004198167589493096, "learning_rate": 4.301628818912117e-06, "loss": 0.0005, "num_input_tokens_seen": 138699776, "step": 64315 }, { "epoch": 11.804000734079647, "grad_norm": 0.0004544124531093985, "learning_rate": 4.3008359206048716e-06, "loss": 0.0, "num_input_tokens_seen": 138710848, "step": 64320 }, { "epoch": 11.8049183336392, "grad_norm": 0.004734738264232874, "learning_rate": 4.300043040230101e-06, "loss": 0.098, "num_input_tokens_seen": 138721376, "step": 64325 }, { "epoch": 11.805835933198752, "grad_norm": 0.0037715870421379805, "learning_rate": 4.29925017780814e-06, "loss": 0.0002, "num_input_tokens_seen": 138733120, "step": 64330 }, { "epoch": 11.806753532758304, "grad_norm": 0.0010479434859007597, "learning_rate": 4.298457333359328e-06, "loss": 0.0001, "num_input_tokens_seen": 138744352, "step": 64335 }, { "epoch": 11.807671132317857, "grad_norm": 1.3797767162322998, "learning_rate": 4.297664506903998e-06, "loss": 0.0001, "num_input_tokens_seen": 138755328, "step": 64340 }, { "epoch": 11.808588731877409, "grad_norm": 0.004895865451544523, "learning_rate": 4.296871698462485e-06, "loss": 0.0001, "num_input_tokens_seen": 138765824, "step": 64345 }, { "epoch": 11.80950633143696, "grad_norm": 0.0015523659531027079, "learning_rate": 4.296078908055121e-06, "loss": 0.0097, "num_input_tokens_seen": 138776320, "step": 64350 }, { "epoch": 11.810423930996514, "grad_norm": 0.00033097233972512186, "learning_rate": 4.295286135702243e-06, "loss": 0.0772, "num_input_tokens_seen": 138787712, "step": 64355 }, { "epoch": 11.811341530556065, "grad_norm": 0.015950297936797142, "learning_rate": 4.294493381424185e-06, "loss": 0.0001, "num_input_tokens_seen": 138798464, "step": 64360 }, { "epoch": 11.812259130115617, "grad_norm": 0.0035025603137910366, "learning_rate": 4.293700645241276e-06, "loss": 0.0002, "num_input_tokens_seen": 138809088, "step": 64365 }, { "epoch": 11.81317672967517, "grad_norm": 0.00011970376363024116, "learning_rate": 4.292907927173853e-06, "loss": 0.0001, "num_input_tokens_seen": 138819424, "step": 64370 }, { "epoch": 11.814094329234722, "grad_norm": 0.04573151096701622, "learning_rate": 4.292115227242245e-06, "loss": 0.0001, "num_input_tokens_seen": 138829728, "step": 64375 }, { "epoch": 11.815011928794274, "grad_norm": 0.007538917474448681, "learning_rate": 4.2913225454667844e-06, "loss": 0.0001, "num_input_tokens_seen": 138840384, "step": 64380 }, { "epoch": 11.815929528353827, "grad_norm": 0.01716538891196251, "learning_rate": 4.290529881867801e-06, "loss": 0.0, "num_input_tokens_seen": 138850592, "step": 64385 }, { "epoch": 11.816847127913379, "grad_norm": 0.0007235525990836322, "learning_rate": 4.289737236465627e-06, "loss": 0.0, "num_input_tokens_seen": 138860416, "step": 64390 }, { "epoch": 11.81776472747293, "grad_norm": 0.0005072933272458613, "learning_rate": 4.288944609280593e-06, "loss": 0.0001, "num_input_tokens_seen": 138871840, "step": 64395 }, { "epoch": 11.818682327032484, "grad_norm": 0.003014527028426528, "learning_rate": 4.2881520003330265e-06, "loss": 0.0002, "num_input_tokens_seen": 138883936, "step": 64400 }, { "epoch": 11.819599926592035, "grad_norm": 249.30657958984375, "learning_rate": 4.28735940964326e-06, "loss": 0.0812, "num_input_tokens_seen": 138894848, "step": 64405 }, { "epoch": 11.820517526151587, "grad_norm": 0.0019657276570796967, "learning_rate": 4.286566837231617e-06, "loss": 0.0, "num_input_tokens_seen": 138906720, "step": 64410 }, { "epoch": 11.82143512571114, "grad_norm": 0.0038452374283224344, "learning_rate": 4.285774283118431e-06, "loss": 0.136, "num_input_tokens_seen": 138918304, "step": 64415 }, { "epoch": 11.822352725270692, "grad_norm": 0.00033816564246080816, "learning_rate": 4.284981747324028e-06, "loss": 0.0001, "num_input_tokens_seen": 138928928, "step": 64420 }, { "epoch": 11.823270324830244, "grad_norm": 0.0014991568168625236, "learning_rate": 4.284189229868735e-06, "loss": 0.0001, "num_input_tokens_seen": 138940416, "step": 64425 }, { "epoch": 11.824187924389797, "grad_norm": 0.2533988356590271, "learning_rate": 4.283396730772879e-06, "loss": 0.0003, "num_input_tokens_seen": 138951040, "step": 64430 }, { "epoch": 11.825105523949349, "grad_norm": 0.0015938283177092671, "learning_rate": 4.282604250056784e-06, "loss": 0.0001, "num_input_tokens_seen": 138962048, "step": 64435 }, { "epoch": 11.8260231235089, "grad_norm": 0.0003228118584956974, "learning_rate": 4.281811787740779e-06, "loss": 0.0, "num_input_tokens_seen": 138971968, "step": 64440 }, { "epoch": 11.826940723068454, "grad_norm": 0.0020912077743560076, "learning_rate": 4.28101934384519e-06, "loss": 0.0, "num_input_tokens_seen": 138982880, "step": 64445 }, { "epoch": 11.827858322628005, "grad_norm": 0.15222854912281036, "learning_rate": 4.280226918390338e-06, "loss": 0.1945, "num_input_tokens_seen": 138994016, "step": 64450 }, { "epoch": 11.828775922187557, "grad_norm": 0.0006449712673202157, "learning_rate": 4.2794345113965524e-06, "loss": 0.0001, "num_input_tokens_seen": 139003552, "step": 64455 }, { "epoch": 11.82969352174711, "grad_norm": 0.0003908253274857998, "learning_rate": 4.278642122884154e-06, "loss": 0.0, "num_input_tokens_seen": 139013728, "step": 64460 }, { "epoch": 11.830611121306662, "grad_norm": 0.005947769619524479, "learning_rate": 4.277849752873466e-06, "loss": 0.0087, "num_input_tokens_seen": 139023328, "step": 64465 }, { "epoch": 11.831528720866213, "grad_norm": 5.721101760864258, "learning_rate": 4.277057401384815e-06, "loss": 0.0009, "num_input_tokens_seen": 139034624, "step": 64470 }, { "epoch": 11.832446320425767, "grad_norm": 1.368201732635498, "learning_rate": 4.276265068438521e-06, "loss": 0.0012, "num_input_tokens_seen": 139044640, "step": 64475 }, { "epoch": 11.833363919985318, "grad_norm": 0.002173070563003421, "learning_rate": 4.2754727540549056e-06, "loss": 0.2094, "num_input_tokens_seen": 139056032, "step": 64480 }, { "epoch": 11.83428151954487, "grad_norm": 0.005961637012660503, "learning_rate": 4.2746804582542886e-06, "loss": 0.0, "num_input_tokens_seen": 139066848, "step": 64485 }, { "epoch": 11.835199119104423, "grad_norm": 0.004432208836078644, "learning_rate": 4.273888181056997e-06, "loss": 0.0001, "num_input_tokens_seen": 139077376, "step": 64490 }, { "epoch": 11.836116718663975, "grad_norm": 0.0002239512250525877, "learning_rate": 4.2730959224833475e-06, "loss": 0.0003, "num_input_tokens_seen": 139088640, "step": 64495 }, { "epoch": 11.837034318223527, "grad_norm": 0.01180440653115511, "learning_rate": 4.27230368255366e-06, "loss": 0.0001, "num_input_tokens_seen": 139099648, "step": 64500 }, { "epoch": 11.83795191778308, "grad_norm": 0.1138940081000328, "learning_rate": 4.271511461288255e-06, "loss": 0.0001, "num_input_tokens_seen": 139110688, "step": 64505 }, { "epoch": 11.838869517342632, "grad_norm": 0.0018535827985033393, "learning_rate": 4.270719258707454e-06, "loss": 0.0, "num_input_tokens_seen": 139121728, "step": 64510 }, { "epoch": 11.839787116902183, "grad_norm": 0.00844375416636467, "learning_rate": 4.269927074831571e-06, "loss": 0.0002, "num_input_tokens_seen": 139133280, "step": 64515 }, { "epoch": 11.840704716461737, "grad_norm": 0.06331195682287216, "learning_rate": 4.26913490968093e-06, "loss": 0.0001, "num_input_tokens_seen": 139143584, "step": 64520 }, { "epoch": 11.841622316021288, "grad_norm": 0.0002692550770007074, "learning_rate": 4.268342763275844e-06, "loss": 0.0, "num_input_tokens_seen": 139153344, "step": 64525 }, { "epoch": 11.84253991558084, "grad_norm": 0.0024570971727371216, "learning_rate": 4.2675506356366336e-06, "loss": 0.0, "num_input_tokens_seen": 139165024, "step": 64530 }, { "epoch": 11.843457515140393, "grad_norm": 362.1375427246094, "learning_rate": 4.2667585267836114e-06, "loss": 0.0264, "num_input_tokens_seen": 139176224, "step": 64535 }, { "epoch": 11.844375114699945, "grad_norm": 0.00801877211779356, "learning_rate": 4.265966436737099e-06, "loss": 0.0001, "num_input_tokens_seen": 139187008, "step": 64540 }, { "epoch": 11.845292714259497, "grad_norm": 0.0005078769172541797, "learning_rate": 4.2651743655174096e-06, "loss": 0.0616, "num_input_tokens_seen": 139198240, "step": 64545 }, { "epoch": 11.84621031381905, "grad_norm": 27.15200424194336, "learning_rate": 4.264382313144857e-06, "loss": 0.2625, "num_input_tokens_seen": 139209408, "step": 64550 }, { "epoch": 11.847127913378602, "grad_norm": 0.0004378423618618399, "learning_rate": 4.26359027963976e-06, "loss": 0.0, "num_input_tokens_seen": 139219200, "step": 64555 }, { "epoch": 11.848045512938153, "grad_norm": 0.01039472222328186, "learning_rate": 4.262798265022431e-06, "loss": 0.0, "num_input_tokens_seen": 139229024, "step": 64560 }, { "epoch": 11.848963112497707, "grad_norm": 0.0017316951416432858, "learning_rate": 4.2620062693131834e-06, "loss": 0.0, "num_input_tokens_seen": 139240512, "step": 64565 }, { "epoch": 11.849880712057258, "grad_norm": 0.0006119619938544929, "learning_rate": 4.2612142925323294e-06, "loss": 0.0, "num_input_tokens_seen": 139251424, "step": 64570 }, { "epoch": 11.85079831161681, "grad_norm": 0.0015097620198503137, "learning_rate": 4.260422334700187e-06, "loss": 0.0, "num_input_tokens_seen": 139261824, "step": 64575 }, { "epoch": 11.851715911176363, "grad_norm": 0.0013658145908266306, "learning_rate": 4.2596303958370635e-06, "loss": 0.0001, "num_input_tokens_seen": 139272896, "step": 64580 }, { "epoch": 11.852633510735915, "grad_norm": 0.004801468923687935, "learning_rate": 4.258838475963273e-06, "loss": 0.0, "num_input_tokens_seen": 139283968, "step": 64585 }, { "epoch": 11.853551110295466, "grad_norm": 0.0013828298542648554, "learning_rate": 4.258046575099126e-06, "loss": 0.001, "num_input_tokens_seen": 139293664, "step": 64590 }, { "epoch": 11.85446870985502, "grad_norm": 0.0007280218414962292, "learning_rate": 4.257254693264937e-06, "loss": 0.0, "num_input_tokens_seen": 139305632, "step": 64595 }, { "epoch": 11.855386309414571, "grad_norm": 0.00043060260941274464, "learning_rate": 4.256462830481012e-06, "loss": 0.2188, "num_input_tokens_seen": 139315616, "step": 64600 }, { "epoch": 11.856303908974123, "grad_norm": 0.00023293303092941642, "learning_rate": 4.255670986767664e-06, "loss": 0.0, "num_input_tokens_seen": 139327200, "step": 64605 }, { "epoch": 11.857221508533677, "grad_norm": 0.0009926712373271585, "learning_rate": 4.254879162145201e-06, "loss": 0.0, "num_input_tokens_seen": 139339232, "step": 64610 }, { "epoch": 11.858139108093228, "grad_norm": 0.003484525252133608, "learning_rate": 4.254087356633934e-06, "loss": 0.0, "num_input_tokens_seen": 139350144, "step": 64615 }, { "epoch": 11.85905670765278, "grad_norm": 0.0007061387877911329, "learning_rate": 4.2532955702541686e-06, "loss": 0.0131, "num_input_tokens_seen": 139361568, "step": 64620 }, { "epoch": 11.859974307212333, "grad_norm": 0.0007924907840788364, "learning_rate": 4.2525038030262155e-06, "loss": 0.0, "num_input_tokens_seen": 139371520, "step": 64625 }, { "epoch": 11.860891906771885, "grad_norm": 0.01496801059693098, "learning_rate": 4.25171205497038e-06, "loss": 0.0, "num_input_tokens_seen": 139383456, "step": 64630 }, { "epoch": 11.861809506331436, "grad_norm": 0.0005495637888088822, "learning_rate": 4.250920326106971e-06, "loss": 0.0, "num_input_tokens_seen": 139394240, "step": 64635 }, { "epoch": 11.86272710589099, "grad_norm": 0.0039820303209125996, "learning_rate": 4.2501286164562965e-06, "loss": 0.2532, "num_input_tokens_seen": 139406048, "step": 64640 }, { "epoch": 11.863644705450541, "grad_norm": 0.23654328286647797, "learning_rate": 4.249336926038659e-06, "loss": 0.0099, "num_input_tokens_seen": 139416320, "step": 64645 }, { "epoch": 11.864562305010093, "grad_norm": 10.77101993560791, "learning_rate": 4.248545254874365e-06, "loss": 0.0119, "num_input_tokens_seen": 139426656, "step": 64650 }, { "epoch": 11.865479904569646, "grad_norm": 0.001122695510275662, "learning_rate": 4.247753602983722e-06, "loss": 0.0, "num_input_tokens_seen": 139438048, "step": 64655 }, { "epoch": 11.866397504129198, "grad_norm": 0.011115305125713348, "learning_rate": 4.246961970387034e-06, "loss": 0.0013, "num_input_tokens_seen": 139449472, "step": 64660 }, { "epoch": 11.86731510368875, "grad_norm": 0.0038815594743937254, "learning_rate": 4.246170357104604e-06, "loss": 0.0974, "num_input_tokens_seen": 139460128, "step": 64665 }, { "epoch": 11.868232703248303, "grad_norm": 0.0011723061325028539, "learning_rate": 4.245378763156734e-06, "loss": 0.0097, "num_input_tokens_seen": 139470912, "step": 64670 }, { "epoch": 11.869150302807855, "grad_norm": 0.0035965736024081707, "learning_rate": 4.2445871885637314e-06, "loss": 0.0, "num_input_tokens_seen": 139482144, "step": 64675 }, { "epoch": 11.870067902367406, "grad_norm": 0.0013995515182614326, "learning_rate": 4.2437956333458966e-06, "loss": 0.0, "num_input_tokens_seen": 139491328, "step": 64680 }, { "epoch": 11.87098550192696, "grad_norm": 0.0070448974147439, "learning_rate": 4.243004097523529e-06, "loss": 0.0001, "num_input_tokens_seen": 139502560, "step": 64685 }, { "epoch": 11.871903101486511, "grad_norm": 0.0283763837069273, "learning_rate": 4.242212581116936e-06, "loss": 0.0001, "num_input_tokens_seen": 139513664, "step": 64690 }, { "epoch": 11.872820701046063, "grad_norm": 0.09204452484846115, "learning_rate": 4.241421084146415e-06, "loss": 0.0002, "num_input_tokens_seen": 139524416, "step": 64695 }, { "epoch": 11.873738300605616, "grad_norm": 0.05593061074614525, "learning_rate": 4.240629606632268e-06, "loss": 0.025, "num_input_tokens_seen": 139536288, "step": 64700 }, { "epoch": 11.874655900165168, "grad_norm": 0.06455200165510178, "learning_rate": 4.239838148594793e-06, "loss": 0.0001, "num_input_tokens_seen": 139547872, "step": 64705 }, { "epoch": 11.87557349972472, "grad_norm": 1.6370110511779785, "learning_rate": 4.239046710054293e-06, "loss": 0.0003, "num_input_tokens_seen": 139559456, "step": 64710 }, { "epoch": 11.876491099284273, "grad_norm": 0.0011539787519723177, "learning_rate": 4.238255291031065e-06, "loss": 0.0, "num_input_tokens_seen": 139570016, "step": 64715 }, { "epoch": 11.877408698843825, "grad_norm": 0.0019612142350524664, "learning_rate": 4.237463891545408e-06, "loss": 0.0001, "num_input_tokens_seen": 139580480, "step": 64720 }, { "epoch": 11.878326298403376, "grad_norm": 0.0005120556452311575, "learning_rate": 4.236672511617621e-06, "loss": 0.0001, "num_input_tokens_seen": 139591360, "step": 64725 }, { "epoch": 11.87924389796293, "grad_norm": 0.000690300774294883, "learning_rate": 4.235881151268001e-06, "loss": 0.0, "num_input_tokens_seen": 139602176, "step": 64730 }, { "epoch": 11.880161497522481, "grad_norm": 0.0010725175961852074, "learning_rate": 4.235089810516844e-06, "loss": 0.0, "num_input_tokens_seen": 139613056, "step": 64735 }, { "epoch": 11.881079097082033, "grad_norm": 0.004218824207782745, "learning_rate": 4.234298489384449e-06, "loss": 0.0, "num_input_tokens_seen": 139623680, "step": 64740 }, { "epoch": 11.881996696641586, "grad_norm": 0.0010873812716454268, "learning_rate": 4.23350718789111e-06, "loss": 0.0004, "num_input_tokens_seen": 139633760, "step": 64745 }, { "epoch": 11.882914296201138, "grad_norm": 0.0012125408975407481, "learning_rate": 4.232715906057126e-06, "loss": 0.2535, "num_input_tokens_seen": 139644896, "step": 64750 }, { "epoch": 11.88383189576069, "grad_norm": 0.0019664522260427475, "learning_rate": 4.231924643902786e-06, "loss": 0.0028, "num_input_tokens_seen": 139656352, "step": 64755 }, { "epoch": 11.884749495320243, "grad_norm": 0.006777737755328417, "learning_rate": 4.231133401448391e-06, "loss": 0.0, "num_input_tokens_seen": 139667456, "step": 64760 }, { "epoch": 11.885667094879794, "grad_norm": 0.00035569138708524406, "learning_rate": 4.230342178714233e-06, "loss": 0.0, "num_input_tokens_seen": 139677728, "step": 64765 }, { "epoch": 11.886584694439346, "grad_norm": 0.0024076360277831554, "learning_rate": 4.229550975720603e-06, "loss": 0.0001, "num_input_tokens_seen": 139688640, "step": 64770 }, { "epoch": 11.8875022939989, "grad_norm": 0.000902629632037133, "learning_rate": 4.2287597924877986e-06, "loss": 0.0, "num_input_tokens_seen": 139700032, "step": 64775 }, { "epoch": 11.888419893558451, "grad_norm": 0.0020726462826132774, "learning_rate": 4.22796862903611e-06, "loss": 0.0029, "num_input_tokens_seen": 139711680, "step": 64780 }, { "epoch": 11.889337493118003, "grad_norm": 0.005445447284728289, "learning_rate": 4.227177485385827e-06, "loss": 0.1066, "num_input_tokens_seen": 139722176, "step": 64785 }, { "epoch": 11.890255092677556, "grad_norm": 0.0005422930116765201, "learning_rate": 4.226386361557247e-06, "loss": 0.0, "num_input_tokens_seen": 139733568, "step": 64790 }, { "epoch": 11.891172692237108, "grad_norm": 0.00030441582202911377, "learning_rate": 4.225595257570657e-06, "loss": 0.0, "num_input_tokens_seen": 139744256, "step": 64795 }, { "epoch": 11.89209029179666, "grad_norm": 0.0049569630064070225, "learning_rate": 4.224804173446349e-06, "loss": 0.0, "num_input_tokens_seen": 139755584, "step": 64800 }, { "epoch": 11.893007891356213, "grad_norm": 51.205570220947266, "learning_rate": 4.224013109204609e-06, "loss": 0.222, "num_input_tokens_seen": 139765120, "step": 64805 }, { "epoch": 11.893925490915764, "grad_norm": 0.0002727023675106466, "learning_rate": 4.223222064865733e-06, "loss": 0.0, "num_input_tokens_seen": 139776320, "step": 64810 }, { "epoch": 11.894843090475316, "grad_norm": 0.003535378025844693, "learning_rate": 4.222431040450008e-06, "loss": 0.0001, "num_input_tokens_seen": 139787008, "step": 64815 }, { "epoch": 11.89576069003487, "grad_norm": 22.608570098876953, "learning_rate": 4.221640035977719e-06, "loss": 0.2122, "num_input_tokens_seen": 139797888, "step": 64820 }, { "epoch": 11.896678289594421, "grad_norm": 0.0008685854263603687, "learning_rate": 4.220849051469158e-06, "loss": 0.0, "num_input_tokens_seen": 139810496, "step": 64825 }, { "epoch": 11.897595889153973, "grad_norm": 0.11075666546821594, "learning_rate": 4.220058086944611e-06, "loss": 0.0002, "num_input_tokens_seen": 139821344, "step": 64830 }, { "epoch": 11.898513488713526, "grad_norm": 0.0005668647936545312, "learning_rate": 4.219267142424367e-06, "loss": 0.0036, "num_input_tokens_seen": 139831392, "step": 64835 }, { "epoch": 11.899431088273078, "grad_norm": 0.00010974927863571793, "learning_rate": 4.218476217928709e-06, "loss": 0.0, "num_input_tokens_seen": 139842816, "step": 64840 }, { "epoch": 11.90034868783263, "grad_norm": 0.009147695265710354, "learning_rate": 4.217685313477926e-06, "loss": 0.0352, "num_input_tokens_seen": 139854080, "step": 64845 }, { "epoch": 11.901266287392183, "grad_norm": 0.00017935640062205493, "learning_rate": 4.216894429092301e-06, "loss": 0.0001, "num_input_tokens_seen": 139865088, "step": 64850 }, { "epoch": 11.902183886951734, "grad_norm": 0.0004924897220917046, "learning_rate": 4.21610356479212e-06, "loss": 0.2407, "num_input_tokens_seen": 139876224, "step": 64855 }, { "epoch": 11.903101486511286, "grad_norm": 0.012738503515720367, "learning_rate": 4.215312720597669e-06, "loss": 0.0001, "num_input_tokens_seen": 139887296, "step": 64860 }, { "epoch": 11.90401908607084, "grad_norm": 0.05335459113121033, "learning_rate": 4.21452189652923e-06, "loss": 0.0001, "num_input_tokens_seen": 139898368, "step": 64865 }, { "epoch": 11.90493668563039, "grad_norm": 0.007661490701138973, "learning_rate": 4.213731092607085e-06, "loss": 0.0, "num_input_tokens_seen": 139907904, "step": 64870 }, { "epoch": 11.905854285189942, "grad_norm": 46.933982849121094, "learning_rate": 4.212940308851521e-06, "loss": 0.1689, "num_input_tokens_seen": 139918048, "step": 64875 }, { "epoch": 11.906771884749496, "grad_norm": 0.023050712421536446, "learning_rate": 4.212149545282819e-06, "loss": 0.0174, "num_input_tokens_seen": 139928576, "step": 64880 }, { "epoch": 11.907689484309048, "grad_norm": 0.00273366691544652, "learning_rate": 4.211358801921259e-06, "loss": 0.0, "num_input_tokens_seen": 139938080, "step": 64885 }, { "epoch": 11.9086070838686, "grad_norm": 0.05158599093556404, "learning_rate": 4.210568078787122e-06, "loss": 0.1597, "num_input_tokens_seen": 139949536, "step": 64890 }, { "epoch": 11.909524683428153, "grad_norm": 0.013905227184295654, "learning_rate": 4.209777375900692e-06, "loss": 0.0002, "num_input_tokens_seen": 139960352, "step": 64895 }, { "epoch": 11.910442282987704, "grad_norm": 0.0032928111031651497, "learning_rate": 4.208986693282248e-06, "loss": 0.0004, "num_input_tokens_seen": 139972128, "step": 64900 }, { "epoch": 11.911359882547256, "grad_norm": 0.002137554343789816, "learning_rate": 4.2081960309520676e-06, "loss": 0.1117, "num_input_tokens_seen": 139983328, "step": 64905 }, { "epoch": 11.91227748210681, "grad_norm": 4.40167236328125, "learning_rate": 4.207405388930434e-06, "loss": 0.0012, "num_input_tokens_seen": 139993760, "step": 64910 }, { "epoch": 11.91319508166636, "grad_norm": 0.00396018847823143, "learning_rate": 4.206614767237622e-06, "loss": 0.0023, "num_input_tokens_seen": 140004480, "step": 64915 }, { "epoch": 11.914112681225912, "grad_norm": 0.0013896586606279016, "learning_rate": 4.205824165893912e-06, "loss": 0.0, "num_input_tokens_seen": 140014912, "step": 64920 }, { "epoch": 11.915030280785466, "grad_norm": 0.00064730184385553, "learning_rate": 4.205033584919581e-06, "loss": 0.0001, "num_input_tokens_seen": 140025056, "step": 64925 }, { "epoch": 11.915947880345017, "grad_norm": 0.02978380396962166, "learning_rate": 4.204243024334907e-06, "loss": 0.0001, "num_input_tokens_seen": 140036736, "step": 64930 }, { "epoch": 11.916865479904569, "grad_norm": 0.4551739990711212, "learning_rate": 4.203452484160167e-06, "loss": 0.0914, "num_input_tokens_seen": 140047360, "step": 64935 }, { "epoch": 11.917783079464122, "grad_norm": 0.0016709588235244155, "learning_rate": 4.202661964415635e-06, "loss": 0.0008, "num_input_tokens_seen": 140059008, "step": 64940 }, { "epoch": 11.918700679023674, "grad_norm": 0.026232440024614334, "learning_rate": 4.201871465121589e-06, "loss": 0.1377, "num_input_tokens_seen": 140070240, "step": 64945 }, { "epoch": 11.919618278583226, "grad_norm": 0.0015744083793833852, "learning_rate": 4.201080986298302e-06, "loss": 0.0002, "num_input_tokens_seen": 140081056, "step": 64950 }, { "epoch": 11.920535878142779, "grad_norm": 0.4117148518562317, "learning_rate": 4.200290527966048e-06, "loss": 0.0649, "num_input_tokens_seen": 140091232, "step": 64955 }, { "epoch": 11.92145347770233, "grad_norm": 0.0615648590028286, "learning_rate": 4.199500090145105e-06, "loss": 0.0003, "num_input_tokens_seen": 140103072, "step": 64960 }, { "epoch": 11.922371077261882, "grad_norm": 0.0010339019354432821, "learning_rate": 4.1987096728557435e-06, "loss": 0.0001, "num_input_tokens_seen": 140113440, "step": 64965 }, { "epoch": 11.923288676821436, "grad_norm": 0.0034634792245924473, "learning_rate": 4.197919276118235e-06, "loss": 0.0006, "num_input_tokens_seen": 140124736, "step": 64970 }, { "epoch": 11.924206276380987, "grad_norm": 0.03874046728014946, "learning_rate": 4.197128899952856e-06, "loss": 0.0001, "num_input_tokens_seen": 140135232, "step": 64975 }, { "epoch": 11.925123875940539, "grad_norm": 0.32056206464767456, "learning_rate": 4.196338544379877e-06, "loss": 0.0004, "num_input_tokens_seen": 140146752, "step": 64980 }, { "epoch": 11.926041475500092, "grad_norm": 0.017101574689149857, "learning_rate": 4.195548209419568e-06, "loss": 0.0001, "num_input_tokens_seen": 140157152, "step": 64985 }, { "epoch": 11.926959075059644, "grad_norm": 0.00878823734819889, "learning_rate": 4.194757895092199e-06, "loss": 0.0, "num_input_tokens_seen": 140168576, "step": 64990 }, { "epoch": 11.927876674619196, "grad_norm": 0.0020396430045366287, "learning_rate": 4.193967601418044e-06, "loss": 0.0002, "num_input_tokens_seen": 140178784, "step": 64995 }, { "epoch": 11.928794274178749, "grad_norm": 0.0004471739521250129, "learning_rate": 4.19317732841737e-06, "loss": 0.0, "num_input_tokens_seen": 140189536, "step": 65000 }, { "epoch": 11.9297118737383, "grad_norm": 0.001139461761340499, "learning_rate": 4.192387076110446e-06, "loss": 0.0, "num_input_tokens_seen": 140198912, "step": 65005 }, { "epoch": 11.930629473297852, "grad_norm": 0.0004440259363036603, "learning_rate": 4.191596844517544e-06, "loss": 0.0001, "num_input_tokens_seen": 140210048, "step": 65010 }, { "epoch": 11.931547072857406, "grad_norm": 0.008051238022744656, "learning_rate": 4.190806633658929e-06, "loss": 0.0001, "num_input_tokens_seen": 140221760, "step": 65015 }, { "epoch": 11.932464672416957, "grad_norm": 0.0067896535620093346, "learning_rate": 4.190016443554871e-06, "loss": 0.0, "num_input_tokens_seen": 140231904, "step": 65020 }, { "epoch": 11.933382271976509, "grad_norm": 0.0072014848701655865, "learning_rate": 4.1892262742256325e-06, "loss": 0.0019, "num_input_tokens_seen": 140244064, "step": 65025 }, { "epoch": 11.934299871536062, "grad_norm": 0.0007753059617243707, "learning_rate": 4.1884361256914864e-06, "loss": 0.0, "num_input_tokens_seen": 140255264, "step": 65030 }, { "epoch": 11.935217471095614, "grad_norm": 0.0016871888656169176, "learning_rate": 4.187645997972696e-06, "loss": 0.0001, "num_input_tokens_seen": 140265088, "step": 65035 }, { "epoch": 11.936135070655165, "grad_norm": 29.12805938720703, "learning_rate": 4.186855891089525e-06, "loss": 0.0025, "num_input_tokens_seen": 140275936, "step": 65040 }, { "epoch": 11.937052670214719, "grad_norm": 0.0006385621381923556, "learning_rate": 4.186065805062241e-06, "loss": 0.0913, "num_input_tokens_seen": 140286496, "step": 65045 }, { "epoch": 11.93797026977427, "grad_norm": 0.013015983626246452, "learning_rate": 4.185275739911107e-06, "loss": 0.0, "num_input_tokens_seen": 140298880, "step": 65050 }, { "epoch": 11.938887869333822, "grad_norm": 0.0002859231608454138, "learning_rate": 4.184485695656388e-06, "loss": 0.0001, "num_input_tokens_seen": 140309504, "step": 65055 }, { "epoch": 11.939805468893375, "grad_norm": 0.00037991569843143225, "learning_rate": 4.183695672318348e-06, "loss": 0.0, "num_input_tokens_seen": 140320704, "step": 65060 }, { "epoch": 11.940723068452927, "grad_norm": 0.11457973718643188, "learning_rate": 4.182905669917248e-06, "loss": 0.0001, "num_input_tokens_seen": 140332384, "step": 65065 }, { "epoch": 11.941640668012479, "grad_norm": 0.0054174140095710754, "learning_rate": 4.182115688473352e-06, "loss": 0.0006, "num_input_tokens_seen": 140343168, "step": 65070 }, { "epoch": 11.942558267572032, "grad_norm": 57.559608459472656, "learning_rate": 4.181325728006919e-06, "loss": 0.0193, "num_input_tokens_seen": 140353984, "step": 65075 }, { "epoch": 11.943475867131584, "grad_norm": 0.016173284500837326, "learning_rate": 4.180535788538213e-06, "loss": 0.0001, "num_input_tokens_seen": 140364256, "step": 65080 }, { "epoch": 11.944393466691135, "grad_norm": 0.0015748529694974422, "learning_rate": 4.179745870087495e-06, "loss": 0.0002, "num_input_tokens_seen": 140375488, "step": 65085 }, { "epoch": 11.945311066250689, "grad_norm": 6.875133514404297, "learning_rate": 4.178955972675022e-06, "loss": 0.0026, "num_input_tokens_seen": 140385472, "step": 65090 }, { "epoch": 11.94622866581024, "grad_norm": 0.009196281433105469, "learning_rate": 4.178166096321058e-06, "loss": 0.0002, "num_input_tokens_seen": 140396160, "step": 65095 }, { "epoch": 11.947146265369792, "grad_norm": 2.7122397422790527, "learning_rate": 4.17737624104586e-06, "loss": 0.001, "num_input_tokens_seen": 140406720, "step": 65100 }, { "epoch": 11.948063864929345, "grad_norm": 0.0014519277028739452, "learning_rate": 4.1765864068696834e-06, "loss": 0.0, "num_input_tokens_seen": 140418176, "step": 65105 }, { "epoch": 11.948981464488897, "grad_norm": 0.00193180947098881, "learning_rate": 4.175796593812792e-06, "loss": 0.0, "num_input_tokens_seen": 140430304, "step": 65110 }, { "epoch": 11.949899064048449, "grad_norm": 0.7718250751495361, "learning_rate": 4.175006801895441e-06, "loss": 0.0001, "num_input_tokens_seen": 140441664, "step": 65115 }, { "epoch": 11.950816663608002, "grad_norm": 0.00046993562136776745, "learning_rate": 4.174217031137886e-06, "loss": 0.0, "num_input_tokens_seen": 140452544, "step": 65120 }, { "epoch": 11.951734263167554, "grad_norm": 49.71900939941406, "learning_rate": 4.173427281560383e-06, "loss": 0.3505, "num_input_tokens_seen": 140463168, "step": 65125 }, { "epoch": 11.952651862727105, "grad_norm": 0.0003986993106082082, "learning_rate": 4.172637553183191e-06, "loss": 0.0207, "num_input_tokens_seen": 140473088, "step": 65130 }, { "epoch": 11.953569462286659, "grad_norm": 0.2341429889202118, "learning_rate": 4.1718478460265635e-06, "loss": 0.0002, "num_input_tokens_seen": 140484928, "step": 65135 }, { "epoch": 11.95448706184621, "grad_norm": 0.007349261082708836, "learning_rate": 4.171058160110754e-06, "loss": 0.0, "num_input_tokens_seen": 140495488, "step": 65140 }, { "epoch": 11.955404661405762, "grad_norm": 0.02573285810649395, "learning_rate": 4.170268495456018e-06, "loss": 0.0, "num_input_tokens_seen": 140506816, "step": 65145 }, { "epoch": 11.956322260965315, "grad_norm": 0.00038929597940295935, "learning_rate": 4.169478852082611e-06, "loss": 0.0, "num_input_tokens_seen": 140517280, "step": 65150 }, { "epoch": 11.957239860524867, "grad_norm": 0.0006833873339928687, "learning_rate": 4.168689230010783e-06, "loss": 0.0003, "num_input_tokens_seen": 140528864, "step": 65155 }, { "epoch": 11.958157460084418, "grad_norm": 50.54248046875, "learning_rate": 4.167899629260788e-06, "loss": 0.2313, "num_input_tokens_seen": 140540064, "step": 65160 }, { "epoch": 11.959075059643972, "grad_norm": 213.44923400878906, "learning_rate": 4.167110049852878e-06, "loss": 0.0822, "num_input_tokens_seen": 140551328, "step": 65165 }, { "epoch": 11.959992659203524, "grad_norm": 5.007748603820801, "learning_rate": 4.166320491807303e-06, "loss": 0.2235, "num_input_tokens_seen": 140562304, "step": 65170 }, { "epoch": 11.960910258763075, "grad_norm": 0.0019077666802331805, "learning_rate": 4.1655309551443165e-06, "loss": 0.0, "num_input_tokens_seen": 140574240, "step": 65175 }, { "epoch": 11.961827858322629, "grad_norm": 0.12747929990291595, "learning_rate": 4.164741439884168e-06, "loss": 0.0001, "num_input_tokens_seen": 140585248, "step": 65180 }, { "epoch": 11.96274545788218, "grad_norm": 0.0006500765448436141, "learning_rate": 4.163951946047107e-06, "loss": 0.0674, "num_input_tokens_seen": 140596320, "step": 65185 }, { "epoch": 11.963663057441732, "grad_norm": 0.0003037276037503034, "learning_rate": 4.163162473653381e-06, "loss": 0.04, "num_input_tokens_seen": 140605888, "step": 65190 }, { "epoch": 11.964580657001285, "grad_norm": 0.00040579214692115784, "learning_rate": 4.162373022723242e-06, "loss": 0.4157, "num_input_tokens_seen": 140615968, "step": 65195 }, { "epoch": 11.965498256560837, "grad_norm": 0.0008097590180113912, "learning_rate": 4.161583593276938e-06, "loss": 0.1284, "num_input_tokens_seen": 140629216, "step": 65200 }, { "epoch": 11.966415856120388, "grad_norm": 0.012511570006608963, "learning_rate": 4.160794185334715e-06, "loss": 0.0, "num_input_tokens_seen": 140639840, "step": 65205 }, { "epoch": 11.967333455679942, "grad_norm": 0.018552517518401146, "learning_rate": 4.160004798916817e-06, "loss": 0.0001, "num_input_tokens_seen": 140649312, "step": 65210 }, { "epoch": 11.968251055239493, "grad_norm": 0.0020428383722901344, "learning_rate": 4.1592154340434975e-06, "loss": 0.0001, "num_input_tokens_seen": 140660608, "step": 65215 }, { "epoch": 11.969168654799045, "grad_norm": 0.00022221320250537246, "learning_rate": 4.158426090734999e-06, "loss": 0.0, "num_input_tokens_seen": 140671968, "step": 65220 }, { "epoch": 11.970086254358598, "grad_norm": 0.0035282839089632034, "learning_rate": 4.157636769011564e-06, "loss": 0.0001, "num_input_tokens_seen": 140683904, "step": 65225 }, { "epoch": 11.97100385391815, "grad_norm": 0.011812186799943447, "learning_rate": 4.156847468893443e-06, "loss": 0.0, "num_input_tokens_seen": 140696320, "step": 65230 }, { "epoch": 11.971921453477702, "grad_norm": 16.235958099365234, "learning_rate": 4.156058190400878e-06, "loss": 0.0883, "num_input_tokens_seen": 140708000, "step": 65235 }, { "epoch": 11.972839053037255, "grad_norm": 0.00018270201690029353, "learning_rate": 4.15526893355411e-06, "loss": 0.0004, "num_input_tokens_seen": 140719040, "step": 65240 }, { "epoch": 11.973756652596807, "grad_norm": 0.2825738787651062, "learning_rate": 4.154479698373386e-06, "loss": 0.1316, "num_input_tokens_seen": 140728736, "step": 65245 }, { "epoch": 11.974674252156358, "grad_norm": 0.006154724862426519, "learning_rate": 4.153690484878949e-06, "loss": 0.0, "num_input_tokens_seen": 140739520, "step": 65250 }, { "epoch": 11.975591851715912, "grad_norm": 0.000995310372672975, "learning_rate": 4.152901293091038e-06, "loss": 0.0003, "num_input_tokens_seen": 140749632, "step": 65255 }, { "epoch": 11.976509451275463, "grad_norm": 0.0051101259887218475, "learning_rate": 4.152112123029896e-06, "loss": 0.0004, "num_input_tokens_seen": 140761056, "step": 65260 }, { "epoch": 11.977427050835015, "grad_norm": 0.002636990277096629, "learning_rate": 4.151322974715763e-06, "loss": 0.1252, "num_input_tokens_seen": 140771104, "step": 65265 }, { "epoch": 11.978344650394568, "grad_norm": 0.000870499643497169, "learning_rate": 4.150533848168881e-06, "loss": 0.0, "num_input_tokens_seen": 140781600, "step": 65270 }, { "epoch": 11.97926224995412, "grad_norm": 0.08416720479726791, "learning_rate": 4.149744743409489e-06, "loss": 0.0001, "num_input_tokens_seen": 140792960, "step": 65275 }, { "epoch": 11.980179849513672, "grad_norm": 0.015602207742631435, "learning_rate": 4.148955660457827e-06, "loss": 0.0001, "num_input_tokens_seen": 140804736, "step": 65280 }, { "epoch": 11.981097449073225, "grad_norm": 0.00032341352198272943, "learning_rate": 4.148166599334135e-06, "loss": 0.0002, "num_input_tokens_seen": 140817184, "step": 65285 }, { "epoch": 11.982015048632777, "grad_norm": 0.0007592664333060384, "learning_rate": 4.147377560058645e-06, "loss": 0.0, "num_input_tokens_seen": 140827104, "step": 65290 }, { "epoch": 11.982932648192328, "grad_norm": 2.058096170425415, "learning_rate": 4.1465885426516025e-06, "loss": 0.0013, "num_input_tokens_seen": 140838592, "step": 65295 }, { "epoch": 11.983850247751882, "grad_norm": 0.005687091965228319, "learning_rate": 4.145799547133242e-06, "loss": 0.0674, "num_input_tokens_seen": 140849024, "step": 65300 }, { "epoch": 11.984767847311433, "grad_norm": 0.10727890580892563, "learning_rate": 4.145010573523798e-06, "loss": 0.0009, "num_input_tokens_seen": 140859264, "step": 65305 }, { "epoch": 11.985685446870985, "grad_norm": 0.0028429494705051184, "learning_rate": 4.144221621843506e-06, "loss": 0.0009, "num_input_tokens_seen": 140870752, "step": 65310 }, { "epoch": 11.986603046430538, "grad_norm": 0.0014050232712179422, "learning_rate": 4.143432692112605e-06, "loss": 0.0, "num_input_tokens_seen": 140881120, "step": 65315 }, { "epoch": 11.98752064599009, "grad_norm": 0.6250331997871399, "learning_rate": 4.142643784351328e-06, "loss": 0.0031, "num_input_tokens_seen": 140891520, "step": 65320 }, { "epoch": 11.988438245549641, "grad_norm": 28.860639572143555, "learning_rate": 4.141854898579907e-06, "loss": 0.0207, "num_input_tokens_seen": 140902688, "step": 65325 }, { "epoch": 11.989355845109195, "grad_norm": 0.0008522611460648477, "learning_rate": 4.14106603481858e-06, "loss": 0.0, "num_input_tokens_seen": 140913984, "step": 65330 }, { "epoch": 11.990273444668746, "grad_norm": 0.002904311055317521, "learning_rate": 4.140277193087579e-06, "loss": 0.0, "num_input_tokens_seen": 140925856, "step": 65335 }, { "epoch": 11.991191044228298, "grad_norm": 0.000227574331802316, "learning_rate": 4.139488373407133e-06, "loss": 0.0, "num_input_tokens_seen": 140937280, "step": 65340 }, { "epoch": 11.992108643787851, "grad_norm": 0.00043943303171545267, "learning_rate": 4.138699575797477e-06, "loss": 0.0, "num_input_tokens_seen": 140947232, "step": 65345 }, { "epoch": 11.993026243347403, "grad_norm": 20.467144012451172, "learning_rate": 4.137910800278842e-06, "loss": 0.4407, "num_input_tokens_seen": 140958528, "step": 65350 }, { "epoch": 11.993943842906955, "grad_norm": 0.019228683784604073, "learning_rate": 4.137122046871461e-06, "loss": 0.0001, "num_input_tokens_seen": 140966528, "step": 65355 }, { "epoch": 11.994861442466508, "grad_norm": 0.001147075672633946, "learning_rate": 4.13633331559556e-06, "loss": 0.0001, "num_input_tokens_seen": 140976704, "step": 65360 }, { "epoch": 11.99577904202606, "grad_norm": 0.021811844781041145, "learning_rate": 4.13554460647137e-06, "loss": 0.172, "num_input_tokens_seen": 140987360, "step": 65365 }, { "epoch": 11.996696641585611, "grad_norm": 0.00029010666185058653, "learning_rate": 4.134755919519123e-06, "loss": 0.0, "num_input_tokens_seen": 140998848, "step": 65370 }, { "epoch": 11.997614241145165, "grad_norm": 0.019714023917913437, "learning_rate": 4.1339672547590454e-06, "loss": 0.0, "num_input_tokens_seen": 141010112, "step": 65375 }, { "epoch": 11.998531840704716, "grad_norm": 0.00506111653521657, "learning_rate": 4.133178612211366e-06, "loss": 0.0005, "num_input_tokens_seen": 141021344, "step": 65380 }, { "epoch": 11.999449440264268, "grad_norm": 0.00864278431981802, "learning_rate": 4.132389991896311e-06, "loss": 0.0002, "num_input_tokens_seen": 141032288, "step": 65385 }, { "epoch": 12.0, "eval_loss": 0.37321633100509644, "eval_runtime": 179.3943, "eval_samples_per_second": 30.374, "eval_steps_per_second": 7.598, "num_input_tokens_seen": 141036960, "step": 65388 }, { "epoch": 12.000367039823821, "grad_norm": 0.00262644630856812, "learning_rate": 4.131601393834108e-06, "loss": 0.0, "num_input_tokens_seen": 141041440, "step": 65390 }, { "epoch": 12.001284639383373, "grad_norm": 0.05482546240091324, "learning_rate": 4.1308128180449826e-06, "loss": 0.0002, "num_input_tokens_seen": 141052512, "step": 65395 }, { "epoch": 12.002202238942925, "grad_norm": 0.003986324183642864, "learning_rate": 4.130024264549162e-06, "loss": 0.0, "num_input_tokens_seen": 141064640, "step": 65400 }, { "epoch": 12.003119838502478, "grad_norm": 0.01294076070189476, "learning_rate": 4.129235733366872e-06, "loss": 0.0001, "num_input_tokens_seen": 141075872, "step": 65405 }, { "epoch": 12.00403743806203, "grad_norm": 0.0005690252291969955, "learning_rate": 4.128447224518333e-06, "loss": 0.0001, "num_input_tokens_seen": 141086816, "step": 65410 }, { "epoch": 12.004955037621581, "grad_norm": 0.08386623114347458, "learning_rate": 4.127658738023774e-06, "loss": 0.0006, "num_input_tokens_seen": 141096160, "step": 65415 }, { "epoch": 12.005872637181135, "grad_norm": 0.00040394384996034205, "learning_rate": 4.126870273903416e-06, "loss": 0.0002, "num_input_tokens_seen": 141105120, "step": 65420 }, { "epoch": 12.006790236740686, "grad_norm": 0.004400538746267557, "learning_rate": 4.126081832177481e-06, "loss": 0.0, "num_input_tokens_seen": 141114880, "step": 65425 }, { "epoch": 12.007707836300238, "grad_norm": 0.0025123993400484324, "learning_rate": 4.125293412866194e-06, "loss": 0.0001, "num_input_tokens_seen": 141124640, "step": 65430 }, { "epoch": 12.008625435859791, "grad_norm": 0.007635076530277729, "learning_rate": 4.124505015989775e-06, "loss": 0.0001, "num_input_tokens_seen": 141135616, "step": 65435 }, { "epoch": 12.009543035419343, "grad_norm": 0.0015636156313121319, "learning_rate": 4.123716641568447e-06, "loss": 0.0011, "num_input_tokens_seen": 141144416, "step": 65440 }, { "epoch": 12.010460634978894, "grad_norm": 0.004916323348879814, "learning_rate": 4.122928289622426e-06, "loss": 0.0, "num_input_tokens_seen": 141155328, "step": 65445 }, { "epoch": 12.011378234538448, "grad_norm": 0.0028708854224532843, "learning_rate": 4.122139960171937e-06, "loss": 0.0097, "num_input_tokens_seen": 141166400, "step": 65450 }, { "epoch": 12.012295834098, "grad_norm": 0.00039079622365534306, "learning_rate": 4.121351653237197e-06, "loss": 0.0, "num_input_tokens_seen": 141177664, "step": 65455 }, { "epoch": 12.013213433657551, "grad_norm": 0.004492935258895159, "learning_rate": 4.120563368838425e-06, "loss": 0.0, "num_input_tokens_seen": 141190048, "step": 65460 }, { "epoch": 12.014131033217105, "grad_norm": 31.67706871032715, "learning_rate": 4.119775106995839e-06, "loss": 0.0025, "num_input_tokens_seen": 141201440, "step": 65465 }, { "epoch": 12.015048632776656, "grad_norm": 0.00017541919078212231, "learning_rate": 4.1189868677296585e-06, "loss": 0.0, "num_input_tokens_seen": 141211648, "step": 65470 }, { "epoch": 12.015966232336208, "grad_norm": 0.5314239263534546, "learning_rate": 4.1181986510601e-06, "loss": 0.0003, "num_input_tokens_seen": 141221376, "step": 65475 }, { "epoch": 12.016883831895761, "grad_norm": 0.021688196808099747, "learning_rate": 4.1174104570073775e-06, "loss": 0.0, "num_input_tokens_seen": 141231616, "step": 65480 }, { "epoch": 12.017801431455313, "grad_norm": 0.0021235239692032337, "learning_rate": 4.11662228559171e-06, "loss": 0.0, "num_input_tokens_seen": 141242336, "step": 65485 }, { "epoch": 12.018719031014864, "grad_norm": 0.01500751357525587, "learning_rate": 4.115834136833312e-06, "loss": 0.0, "num_input_tokens_seen": 141253088, "step": 65490 }, { "epoch": 12.019636630574418, "grad_norm": 0.04993470385670662, "learning_rate": 4.115046010752397e-06, "loss": 0.0, "num_input_tokens_seen": 141262944, "step": 65495 }, { "epoch": 12.02055423013397, "grad_norm": 0.00016531170695088804, "learning_rate": 4.1142579073691815e-06, "loss": 0.0, "num_input_tokens_seen": 141272608, "step": 65500 }, { "epoch": 12.021471829693521, "grad_norm": 0.045593179762363434, "learning_rate": 4.113469826703878e-06, "loss": 0.0001, "num_input_tokens_seen": 141282912, "step": 65505 }, { "epoch": 12.022389429253074, "grad_norm": 0.0010828969534486532, "learning_rate": 4.1126817687766966e-06, "loss": 0.0361, "num_input_tokens_seen": 141294400, "step": 65510 }, { "epoch": 12.023307028812626, "grad_norm": 0.0036114007234573364, "learning_rate": 4.111893733607856e-06, "loss": 0.0, "num_input_tokens_seen": 141305984, "step": 65515 }, { "epoch": 12.024224628372178, "grad_norm": 0.0018131928518414497, "learning_rate": 4.111105721217563e-06, "loss": 0.0001, "num_input_tokens_seen": 141317760, "step": 65520 }, { "epoch": 12.025142227931731, "grad_norm": 0.006144732236862183, "learning_rate": 4.110317731626032e-06, "loss": 0.0001, "num_input_tokens_seen": 141328288, "step": 65525 }, { "epoch": 12.026059827491283, "grad_norm": 0.0038978583179414272, "learning_rate": 4.10952976485347e-06, "loss": 0.0285, "num_input_tokens_seen": 141338688, "step": 65530 }, { "epoch": 12.026977427050834, "grad_norm": 0.0013649051543325186, "learning_rate": 4.108741820920091e-06, "loss": 0.0002, "num_input_tokens_seen": 141349152, "step": 65535 }, { "epoch": 12.027895026610388, "grad_norm": 0.0013257660903036594, "learning_rate": 4.107953899846103e-06, "loss": 0.0, "num_input_tokens_seen": 141359936, "step": 65540 }, { "epoch": 12.02881262616994, "grad_norm": 0.001020615454763174, "learning_rate": 4.107166001651713e-06, "loss": 0.0, "num_input_tokens_seen": 141370144, "step": 65545 }, { "epoch": 12.029730225729491, "grad_norm": 3.4470221996307373, "learning_rate": 4.106378126357134e-06, "loss": 0.0006, "num_input_tokens_seen": 141380736, "step": 65550 }, { "epoch": 12.030647825289044, "grad_norm": 0.02332111820578575, "learning_rate": 4.10559027398257e-06, "loss": 0.0, "num_input_tokens_seen": 141391072, "step": 65555 }, { "epoch": 12.031565424848596, "grad_norm": 0.0005775229074060917, "learning_rate": 4.1048024445482286e-06, "loss": 0.0, "num_input_tokens_seen": 141401120, "step": 65560 }, { "epoch": 12.032483024408148, "grad_norm": 0.03215861693024635, "learning_rate": 4.104014638074319e-06, "loss": 0.0001, "num_input_tokens_seen": 141410816, "step": 65565 }, { "epoch": 12.033400623967701, "grad_norm": 0.0005606736522167921, "learning_rate": 4.103226854581044e-06, "loss": 0.0, "num_input_tokens_seen": 141421856, "step": 65570 }, { "epoch": 12.034318223527253, "grad_norm": 7.655165973119438e-05, "learning_rate": 4.10243909408861e-06, "loss": 0.0, "num_input_tokens_seen": 141432032, "step": 65575 }, { "epoch": 12.035235823086804, "grad_norm": 0.002707101171836257, "learning_rate": 4.101651356617223e-06, "loss": 0.0, "num_input_tokens_seen": 141443200, "step": 65580 }, { "epoch": 12.036153422646358, "grad_norm": 0.014443949796259403, "learning_rate": 4.100863642187085e-06, "loss": 0.0001, "num_input_tokens_seen": 141454336, "step": 65585 }, { "epoch": 12.03707102220591, "grad_norm": 0.0004265019088052213, "learning_rate": 4.1000759508184025e-06, "loss": 0.0, "num_input_tokens_seen": 141465856, "step": 65590 }, { "epoch": 12.03798862176546, "grad_norm": 0.01432296447455883, "learning_rate": 4.099288282531376e-06, "loss": 0.0, "num_input_tokens_seen": 141477568, "step": 65595 }, { "epoch": 12.038906221325014, "grad_norm": 0.0019771726801991463, "learning_rate": 4.09850063734621e-06, "loss": 0.0, "num_input_tokens_seen": 141487584, "step": 65600 }, { "epoch": 12.039823820884566, "grad_norm": 0.000433163222623989, "learning_rate": 4.0977130152831056e-06, "loss": 0.0, "num_input_tokens_seen": 141496480, "step": 65605 }, { "epoch": 12.040741420444117, "grad_norm": 0.0041350736282765865, "learning_rate": 4.096925416362264e-06, "loss": 0.0, "num_input_tokens_seen": 141507392, "step": 65610 }, { "epoch": 12.04165902000367, "grad_norm": 0.03631927818059921, "learning_rate": 4.096137840603883e-06, "loss": 0.0, "num_input_tokens_seen": 141518560, "step": 65615 }, { "epoch": 12.042576619563222, "grad_norm": 0.0018454925157129765, "learning_rate": 4.09535028802817e-06, "loss": 0.1782, "num_input_tokens_seen": 141529696, "step": 65620 }, { "epoch": 12.043494219122774, "grad_norm": 0.0020644706673920155, "learning_rate": 4.0945627586553176e-06, "loss": 0.0, "num_input_tokens_seen": 141541056, "step": 65625 }, { "epoch": 12.044411818682327, "grad_norm": 0.006926542613655329, "learning_rate": 4.0937752525055255e-06, "loss": 0.0, "num_input_tokens_seen": 141552256, "step": 65630 }, { "epoch": 12.045329418241879, "grad_norm": 0.0005199025617912412, "learning_rate": 4.092987769598996e-06, "loss": 0.0, "num_input_tokens_seen": 141563072, "step": 65635 }, { "epoch": 12.04624701780143, "grad_norm": 0.004291422199457884, "learning_rate": 4.092200309955925e-06, "loss": 0.0001, "num_input_tokens_seen": 141574624, "step": 65640 }, { "epoch": 12.047164617360984, "grad_norm": 0.0020148304756730795, "learning_rate": 4.091412873596507e-06, "loss": 0.0001, "num_input_tokens_seen": 141584480, "step": 65645 }, { "epoch": 12.048082216920536, "grad_norm": 0.001538950833491981, "learning_rate": 4.090625460540941e-06, "loss": 0.0, "num_input_tokens_seen": 141594464, "step": 65650 }, { "epoch": 12.048999816480087, "grad_norm": 0.003908862359821796, "learning_rate": 4.089838070809424e-06, "loss": 0.0797, "num_input_tokens_seen": 141604416, "step": 65655 }, { "epoch": 12.04991741603964, "grad_norm": 0.0049203927628695965, "learning_rate": 4.08905070442215e-06, "loss": 0.0004, "num_input_tokens_seen": 141615424, "step": 65660 }, { "epoch": 12.050835015599192, "grad_norm": 0.00016314632375724614, "learning_rate": 4.088263361399311e-06, "loss": 0.0, "num_input_tokens_seen": 141626944, "step": 65665 }, { "epoch": 12.051752615158744, "grad_norm": 0.0016069714911282063, "learning_rate": 4.087476041761106e-06, "loss": 0.0003, "num_input_tokens_seen": 141638400, "step": 65670 }, { "epoch": 12.052670214718297, "grad_norm": 0.01862994022667408, "learning_rate": 4.086688745527726e-06, "loss": 0.0, "num_input_tokens_seen": 141650176, "step": 65675 }, { "epoch": 12.053587814277849, "grad_norm": 0.0001995022175833583, "learning_rate": 4.0859014727193634e-06, "loss": 0.0, "num_input_tokens_seen": 141660896, "step": 65680 }, { "epoch": 12.0545054138374, "grad_norm": 0.0011026771971955895, "learning_rate": 4.085114223356211e-06, "loss": 0.0, "num_input_tokens_seen": 141671968, "step": 65685 }, { "epoch": 12.055423013396954, "grad_norm": 0.012822029180824757, "learning_rate": 4.084326997458462e-06, "loss": 0.0, "num_input_tokens_seen": 141681920, "step": 65690 }, { "epoch": 12.056340612956506, "grad_norm": 0.12790894508361816, "learning_rate": 4.083539795046305e-06, "loss": 0.0001, "num_input_tokens_seen": 141692928, "step": 65695 }, { "epoch": 12.057258212516057, "grad_norm": 0.0006760024116374552, "learning_rate": 4.082752616139934e-06, "loss": 0.0001, "num_input_tokens_seen": 141703904, "step": 65700 }, { "epoch": 12.05817581207561, "grad_norm": 0.012957734055817127, "learning_rate": 4.081965460759536e-06, "loss": 0.1626, "num_input_tokens_seen": 141714144, "step": 65705 }, { "epoch": 12.059093411635162, "grad_norm": 0.0004065275425091386, "learning_rate": 4.0811783289253e-06, "loss": 0.0, "num_input_tokens_seen": 141725568, "step": 65710 }, { "epoch": 12.060011011194714, "grad_norm": 0.0027221227064728737, "learning_rate": 4.080391220657416e-06, "loss": 0.0001, "num_input_tokens_seen": 141736576, "step": 65715 }, { "epoch": 12.060928610754267, "grad_norm": 0.0005361652001738548, "learning_rate": 4.079604135976073e-06, "loss": 0.0, "num_input_tokens_seen": 141748640, "step": 65720 }, { "epoch": 12.061846210313819, "grad_norm": 0.00038772926200181246, "learning_rate": 4.078817074901457e-06, "loss": 0.0, "num_input_tokens_seen": 141759168, "step": 65725 }, { "epoch": 12.06276380987337, "grad_norm": 0.0003107728553004563, "learning_rate": 4.078030037453753e-06, "loss": 0.0008, "num_input_tokens_seen": 141770016, "step": 65730 }, { "epoch": 12.063681409432924, "grad_norm": 0.00046794454101473093, "learning_rate": 4.077243023653153e-06, "loss": 0.0, "num_input_tokens_seen": 141780576, "step": 65735 }, { "epoch": 12.064599008992476, "grad_norm": 0.0012788347667083144, "learning_rate": 4.076456033519839e-06, "loss": 0.0, "num_input_tokens_seen": 141791232, "step": 65740 }, { "epoch": 12.065516608552027, "grad_norm": 0.0026998042594641447, "learning_rate": 4.075669067073994e-06, "loss": 0.0078, "num_input_tokens_seen": 141801248, "step": 65745 }, { "epoch": 12.06643420811158, "grad_norm": 0.0025500222109258175, "learning_rate": 4.074882124335806e-06, "loss": 0.0, "num_input_tokens_seen": 141812288, "step": 65750 }, { "epoch": 12.067351807671132, "grad_norm": 53.79450988769531, "learning_rate": 4.074095205325459e-06, "loss": 0.1782, "num_input_tokens_seen": 141822720, "step": 65755 }, { "epoch": 12.068269407230684, "grad_norm": 0.0003392895741853863, "learning_rate": 4.073308310063134e-06, "loss": 0.0792, "num_input_tokens_seen": 141831456, "step": 65760 }, { "epoch": 12.069187006790237, "grad_norm": 0.0012975841527804732, "learning_rate": 4.0725214385690135e-06, "loss": 0.0, "num_input_tokens_seen": 141841504, "step": 65765 }, { "epoch": 12.070104606349789, "grad_norm": 0.0013066512765362859, "learning_rate": 4.071734590863282e-06, "loss": 0.0, "num_input_tokens_seen": 141851008, "step": 65770 }, { "epoch": 12.07102220590934, "grad_norm": 0.003242823528125882, "learning_rate": 4.07094776696612e-06, "loss": 0.0, "num_input_tokens_seen": 141862976, "step": 65775 }, { "epoch": 12.071939805468894, "grad_norm": 0.0003963086346630007, "learning_rate": 4.070160966897705e-06, "loss": 0.0, "num_input_tokens_seen": 141872672, "step": 65780 }, { "epoch": 12.072857405028445, "grad_norm": 0.00016255327500402927, "learning_rate": 4.069374190678223e-06, "loss": 0.0284, "num_input_tokens_seen": 141883680, "step": 65785 }, { "epoch": 12.073775004587997, "grad_norm": 0.001038696151226759, "learning_rate": 4.06858743832785e-06, "loss": 0.0002, "num_input_tokens_seen": 141894784, "step": 65790 }, { "epoch": 12.07469260414755, "grad_norm": 0.0036351848393678665, "learning_rate": 4.0678007098667665e-06, "loss": 0.0, "num_input_tokens_seen": 141904192, "step": 65795 }, { "epoch": 12.075610203707102, "grad_norm": 0.006296927575021982, "learning_rate": 4.067014005315149e-06, "loss": 0.0306, "num_input_tokens_seen": 141915040, "step": 65800 }, { "epoch": 12.076527803266654, "grad_norm": 0.0010651849443092942, "learning_rate": 4.066227324693176e-06, "loss": 0.0, "num_input_tokens_seen": 141925120, "step": 65805 }, { "epoch": 12.077445402826207, "grad_norm": 0.007836904376745224, "learning_rate": 4.065440668021025e-06, "loss": 0.0, "num_input_tokens_seen": 141936096, "step": 65810 }, { "epoch": 12.078363002385759, "grad_norm": 0.0003291301254648715, "learning_rate": 4.064654035318872e-06, "loss": 0.0, "num_input_tokens_seen": 141946368, "step": 65815 }, { "epoch": 12.07928060194531, "grad_norm": 0.000512493948917836, "learning_rate": 4.063867426606894e-06, "loss": 0.0011, "num_input_tokens_seen": 141956512, "step": 65820 }, { "epoch": 12.080198201504864, "grad_norm": 0.00043617404298856854, "learning_rate": 4.063080841905267e-06, "loss": 0.0001, "num_input_tokens_seen": 141967488, "step": 65825 }, { "epoch": 12.081115801064415, "grad_norm": 0.0008411137969233096, "learning_rate": 4.0622942812341605e-06, "loss": 0.1408, "num_input_tokens_seen": 141976928, "step": 65830 }, { "epoch": 12.082033400623967, "grad_norm": 0.000249086762778461, "learning_rate": 4.061507744613756e-06, "loss": 0.0, "num_input_tokens_seen": 141987424, "step": 65835 }, { "epoch": 12.08295100018352, "grad_norm": 0.18770046532154083, "learning_rate": 4.060721232064223e-06, "loss": 0.0001, "num_input_tokens_seen": 141997920, "step": 65840 }, { "epoch": 12.083868599743072, "grad_norm": 0.0003739224630407989, "learning_rate": 4.059934743605734e-06, "loss": 0.0, "num_input_tokens_seen": 142009632, "step": 65845 }, { "epoch": 12.084786199302624, "grad_norm": 0.0009802711429074407, "learning_rate": 4.05914827925846e-06, "loss": 0.0002, "num_input_tokens_seen": 142020320, "step": 65850 }, { "epoch": 12.085703798862177, "grad_norm": 0.036590442061424255, "learning_rate": 4.058361839042576e-06, "loss": 0.0, "num_input_tokens_seen": 142032128, "step": 65855 }, { "epoch": 12.086621398421729, "grad_norm": 0.0029215302783995867, "learning_rate": 4.057575422978253e-06, "loss": 0.0, "num_input_tokens_seen": 142043328, "step": 65860 }, { "epoch": 12.08753899798128, "grad_norm": 0.017894970253109932, "learning_rate": 4.056789031085656e-06, "loss": 0.001, "num_input_tokens_seen": 142053920, "step": 65865 }, { "epoch": 12.088456597540834, "grad_norm": 0.0004910047282464802, "learning_rate": 4.056002663384961e-06, "loss": 0.0, "num_input_tokens_seen": 142064448, "step": 65870 }, { "epoch": 12.089374197100385, "grad_norm": 0.0032248550560325384, "learning_rate": 4.0552163198963355e-06, "loss": 0.0, "num_input_tokens_seen": 142073600, "step": 65875 }, { "epoch": 12.090291796659937, "grad_norm": 0.001611142186447978, "learning_rate": 4.0544300006399445e-06, "loss": 0.0, "num_input_tokens_seen": 142083616, "step": 65880 }, { "epoch": 12.09120939621949, "grad_norm": 319.3238525390625, "learning_rate": 4.053643705635961e-06, "loss": 0.0287, "num_input_tokens_seen": 142094400, "step": 65885 }, { "epoch": 12.092126995779042, "grad_norm": 0.002942885272204876, "learning_rate": 4.052857434904549e-06, "loss": 0.0, "num_input_tokens_seen": 142105536, "step": 65890 }, { "epoch": 12.093044595338593, "grad_norm": 0.0007143343100324273, "learning_rate": 4.0520711884658755e-06, "loss": 0.0001, "num_input_tokens_seen": 142117408, "step": 65895 }, { "epoch": 12.093962194898147, "grad_norm": 0.007749101147055626, "learning_rate": 4.051284966340107e-06, "loss": 0.0, "num_input_tokens_seen": 142128704, "step": 65900 }, { "epoch": 12.094879794457698, "grad_norm": 0.0022933264262974262, "learning_rate": 4.050498768547408e-06, "loss": 0.0, "num_input_tokens_seen": 142139552, "step": 65905 }, { "epoch": 12.09579739401725, "grad_norm": 0.008349195122718811, "learning_rate": 4.0497125951079455e-06, "loss": 0.0001, "num_input_tokens_seen": 142151648, "step": 65910 }, { "epoch": 12.096714993576803, "grad_norm": 0.0014371894067153335, "learning_rate": 4.048926446041881e-06, "loss": 0.0, "num_input_tokens_seen": 142160992, "step": 65915 }, { "epoch": 12.097632593136355, "grad_norm": 0.07562942057847977, "learning_rate": 4.0481403213693795e-06, "loss": 0.0002, "num_input_tokens_seen": 142171776, "step": 65920 }, { "epoch": 12.098550192695907, "grad_norm": 0.005432661157101393, "learning_rate": 4.047354221110604e-06, "loss": 0.0, "num_input_tokens_seen": 142183584, "step": 65925 }, { "epoch": 12.09946779225546, "grad_norm": 0.0015668882988393307, "learning_rate": 4.0465681452857155e-06, "loss": 0.0, "num_input_tokens_seen": 142194464, "step": 65930 }, { "epoch": 12.100385391815012, "grad_norm": 0.0011110440827906132, "learning_rate": 4.045782093914876e-06, "loss": 0.0, "num_input_tokens_seen": 142204320, "step": 65935 }, { "epoch": 12.101302991374563, "grad_norm": 0.015678081661462784, "learning_rate": 4.044996067018247e-06, "loss": 0.0, "num_input_tokens_seen": 142215072, "step": 65940 }, { "epoch": 12.102220590934117, "grad_norm": 0.06980777531862259, "learning_rate": 4.0442100646159906e-06, "loss": 0.1315, "num_input_tokens_seen": 142225568, "step": 65945 }, { "epoch": 12.103138190493668, "grad_norm": 0.030337553471326828, "learning_rate": 4.043424086728262e-06, "loss": 0.0001, "num_input_tokens_seen": 142236608, "step": 65950 }, { "epoch": 12.10405579005322, "grad_norm": 0.0007385381613858044, "learning_rate": 4.042638133375225e-06, "loss": 0.0, "num_input_tokens_seen": 142247616, "step": 65955 }, { "epoch": 12.104973389612773, "grad_norm": 0.001224203733727336, "learning_rate": 4.041852204577035e-06, "loss": 0.0, "num_input_tokens_seen": 142257888, "step": 65960 }, { "epoch": 12.105890989172325, "grad_norm": 7.5251922607421875, "learning_rate": 4.04106630035385e-06, "loss": 0.0063, "num_input_tokens_seen": 142268544, "step": 65965 }, { "epoch": 12.106808588731877, "grad_norm": 0.030873574316501617, "learning_rate": 4.040280420725831e-06, "loss": 0.0008, "num_input_tokens_seen": 142279456, "step": 65970 }, { "epoch": 12.10772618829143, "grad_norm": 0.001369584002532065, "learning_rate": 4.03949456571313e-06, "loss": 0.0, "num_input_tokens_seen": 142292192, "step": 65975 }, { "epoch": 12.108643787850982, "grad_norm": 0.00019397190771996975, "learning_rate": 4.038708735335906e-06, "loss": 0.0, "num_input_tokens_seen": 142302304, "step": 65980 }, { "epoch": 12.109561387410533, "grad_norm": 0.0017330523114651442, "learning_rate": 4.037922929614311e-06, "loss": 0.0451, "num_input_tokens_seen": 142314144, "step": 65985 }, { "epoch": 12.110478986970087, "grad_norm": 0.0010672420030459762, "learning_rate": 4.037137148568503e-06, "loss": 0.0002, "num_input_tokens_seen": 142323680, "step": 65990 }, { "epoch": 12.111396586529638, "grad_norm": 0.0035807315725833178, "learning_rate": 4.036351392218635e-06, "loss": 0.0001, "num_input_tokens_seen": 142333984, "step": 65995 }, { "epoch": 12.11231418608919, "grad_norm": 0.0006204121164046228, "learning_rate": 4.03556566058486e-06, "loss": 0.0159, "num_input_tokens_seen": 142344672, "step": 66000 }, { "epoch": 12.113231785648743, "grad_norm": 0.011793053708970547, "learning_rate": 4.03477995368733e-06, "loss": 0.0265, "num_input_tokens_seen": 142356096, "step": 66005 }, { "epoch": 12.114149385208295, "grad_norm": 0.001727165887132287, "learning_rate": 4.033994271546201e-06, "loss": 0.0003, "num_input_tokens_seen": 142367648, "step": 66010 }, { "epoch": 12.115066984767846, "grad_norm": 0.0019431854598224163, "learning_rate": 4.033208614181619e-06, "loss": 0.0191, "num_input_tokens_seen": 142378752, "step": 66015 }, { "epoch": 12.1159845843274, "grad_norm": 0.0015531250974163413, "learning_rate": 4.03242298161374e-06, "loss": 0.0, "num_input_tokens_seen": 142389824, "step": 66020 }, { "epoch": 12.116902183886952, "grad_norm": 0.0005253935814835131, "learning_rate": 4.031637373862711e-06, "loss": 0.0003, "num_input_tokens_seen": 142400608, "step": 66025 }, { "epoch": 12.117819783446503, "grad_norm": 0.0014281455660238862, "learning_rate": 4.030851790948681e-06, "loss": 0.0006, "num_input_tokens_seen": 142411936, "step": 66030 }, { "epoch": 12.118737383006057, "grad_norm": 0.0006018413696438074, "learning_rate": 4.030066232891801e-06, "loss": 0.2969, "num_input_tokens_seen": 142422304, "step": 66035 }, { "epoch": 12.119654982565608, "grad_norm": 0.0006462895544245839, "learning_rate": 4.029280699712221e-06, "loss": 0.0, "num_input_tokens_seen": 142432864, "step": 66040 }, { "epoch": 12.12057258212516, "grad_norm": 0.06609325855970383, "learning_rate": 4.028495191430085e-06, "loss": 0.0, "num_input_tokens_seen": 142442784, "step": 66045 }, { "epoch": 12.121490181684713, "grad_norm": 0.0002453222987242043, "learning_rate": 4.0277097080655405e-06, "loss": 0.0009, "num_input_tokens_seen": 142453600, "step": 66050 }, { "epoch": 12.122407781244265, "grad_norm": 0.00292931217700243, "learning_rate": 4.026924249638737e-06, "loss": 0.0001, "num_input_tokens_seen": 142464032, "step": 66055 }, { "epoch": 12.123325380803816, "grad_norm": 0.008405636064708233, "learning_rate": 4.026138816169819e-06, "loss": 0.0, "num_input_tokens_seen": 142475584, "step": 66060 }, { "epoch": 12.12424298036337, "grad_norm": 0.012681514956057072, "learning_rate": 4.02535340767893e-06, "loss": 0.0, "num_input_tokens_seen": 142484832, "step": 66065 }, { "epoch": 12.125160579922921, "grad_norm": 69.93367004394531, "learning_rate": 4.024568024186213e-06, "loss": 0.019, "num_input_tokens_seen": 142496288, "step": 66070 }, { "epoch": 12.126078179482473, "grad_norm": 148.01995849609375, "learning_rate": 4.023782665711818e-06, "loss": 0.0294, "num_input_tokens_seen": 142507488, "step": 66075 }, { "epoch": 12.126995779042026, "grad_norm": 0.024936208501458168, "learning_rate": 4.022997332275883e-06, "loss": 0.0001, "num_input_tokens_seen": 142517824, "step": 66080 }, { "epoch": 12.127913378601578, "grad_norm": 0.000816753541585058, "learning_rate": 4.0222120238985515e-06, "loss": 0.0, "num_input_tokens_seen": 142528320, "step": 66085 }, { "epoch": 12.12883097816113, "grad_norm": 0.039960362017154694, "learning_rate": 4.021426740599967e-06, "loss": 0.0001, "num_input_tokens_seen": 142538400, "step": 66090 }, { "epoch": 12.129748577720683, "grad_norm": 0.02056596428155899, "learning_rate": 4.020641482400272e-06, "loss": 0.0001, "num_input_tokens_seen": 142549472, "step": 66095 }, { "epoch": 12.130666177280235, "grad_norm": 0.0019074879819527268, "learning_rate": 4.019856249319601e-06, "loss": 0.0, "num_input_tokens_seen": 142559776, "step": 66100 }, { "epoch": 12.131583776839786, "grad_norm": 36.989295959472656, "learning_rate": 4.019071041378101e-06, "loss": 0.1221, "num_input_tokens_seen": 142570208, "step": 66105 }, { "epoch": 12.13250137639934, "grad_norm": 0.0011542481370270252, "learning_rate": 4.018285858595908e-06, "loss": 0.0002, "num_input_tokens_seen": 142579456, "step": 66110 }, { "epoch": 12.133418975958891, "grad_norm": 0.0025836164131760597, "learning_rate": 4.017500700993162e-06, "loss": 0.0, "num_input_tokens_seen": 142589024, "step": 66115 }, { "epoch": 12.134336575518443, "grad_norm": 0.004023305140435696, "learning_rate": 4.016715568589999e-06, "loss": 0.0, "num_input_tokens_seen": 142600288, "step": 66120 }, { "epoch": 12.135254175077996, "grad_norm": 73.13914489746094, "learning_rate": 4.015930461406557e-06, "loss": 0.2228, "num_input_tokens_seen": 142611168, "step": 66125 }, { "epoch": 12.136171774637548, "grad_norm": 0.04277192801237106, "learning_rate": 4.015145379462976e-06, "loss": 0.1657, "num_input_tokens_seen": 142620896, "step": 66130 }, { "epoch": 12.1370893741971, "grad_norm": 0.0027548647485673428, "learning_rate": 4.014360322779387e-06, "loss": 0.0079, "num_input_tokens_seen": 142632096, "step": 66135 }, { "epoch": 12.138006973756653, "grad_norm": 0.0012175810988992453, "learning_rate": 4.01357529137593e-06, "loss": 0.0, "num_input_tokens_seen": 142642368, "step": 66140 }, { "epoch": 12.138924573316205, "grad_norm": 0.0008623397443443537, "learning_rate": 4.012790285272738e-06, "loss": 0.0, "num_input_tokens_seen": 142653632, "step": 66145 }, { "epoch": 12.139842172875756, "grad_norm": 0.0006624201196245849, "learning_rate": 4.012005304489943e-06, "loss": 0.0, "num_input_tokens_seen": 142664352, "step": 66150 }, { "epoch": 12.14075977243531, "grad_norm": 0.07975921034812927, "learning_rate": 4.011220349047683e-06, "loss": 0.0001, "num_input_tokens_seen": 142674976, "step": 66155 }, { "epoch": 12.141677371994861, "grad_norm": 0.0017389474669471383, "learning_rate": 4.010435418966088e-06, "loss": 0.0, "num_input_tokens_seen": 142685856, "step": 66160 }, { "epoch": 12.142594971554413, "grad_norm": 0.0004006745293736458, "learning_rate": 4.0096505142652905e-06, "loss": 0.0, "num_input_tokens_seen": 142695712, "step": 66165 }, { "epoch": 12.143512571113966, "grad_norm": 0.0015325596323236823, "learning_rate": 4.0088656349654205e-06, "loss": 0.0, "num_input_tokens_seen": 142706304, "step": 66170 }, { "epoch": 12.144430170673518, "grad_norm": 0.05568377301096916, "learning_rate": 4.008080781086614e-06, "loss": 0.0015, "num_input_tokens_seen": 142717280, "step": 66175 }, { "epoch": 12.14534777023307, "grad_norm": 0.0032817255705595016, "learning_rate": 4.007295952648998e-06, "loss": 0.0032, "num_input_tokens_seen": 142728928, "step": 66180 }, { "epoch": 12.146265369792623, "grad_norm": 17.298568725585938, "learning_rate": 4.0065111496726985e-06, "loss": 0.0007, "num_input_tokens_seen": 142740096, "step": 66185 }, { "epoch": 12.147182969352174, "grad_norm": 0.001516456832177937, "learning_rate": 4.0057263721778515e-06, "loss": 0.0173, "num_input_tokens_seen": 142751200, "step": 66190 }, { "epoch": 12.148100568911726, "grad_norm": 0.008043952286243439, "learning_rate": 4.004941620184582e-06, "loss": 0.0, "num_input_tokens_seen": 142762464, "step": 66195 }, { "epoch": 12.14901816847128, "grad_norm": 0.00028214589110575616, "learning_rate": 4.004156893713016e-06, "loss": 0.0, "num_input_tokens_seen": 142774016, "step": 66200 }, { "epoch": 12.149935768030831, "grad_norm": 0.00043419169378466904, "learning_rate": 4.003372192783284e-06, "loss": 0.0, "num_input_tokens_seen": 142785248, "step": 66205 }, { "epoch": 12.150853367590383, "grad_norm": 0.0008663082262501121, "learning_rate": 4.00258751741551e-06, "loss": 0.0, "num_input_tokens_seen": 142795424, "step": 66210 }, { "epoch": 12.151770967149936, "grad_norm": 0.001976429019123316, "learning_rate": 4.001802867629821e-06, "loss": 0.0, "num_input_tokens_seen": 142806176, "step": 66215 }, { "epoch": 12.152688566709488, "grad_norm": 0.00202002190053463, "learning_rate": 4.00101824344634e-06, "loss": 0.0001, "num_input_tokens_seen": 142817376, "step": 66220 }, { "epoch": 12.15360616626904, "grad_norm": 104.6734619140625, "learning_rate": 4.000233644885193e-06, "loss": 0.1128, "num_input_tokens_seen": 142826368, "step": 66225 }, { "epoch": 12.154523765828593, "grad_norm": 0.0014114158693701029, "learning_rate": 3.999449071966505e-06, "loss": 0.0012, "num_input_tokens_seen": 142838080, "step": 66230 }, { "epoch": 12.155441365388144, "grad_norm": 0.011932561174035072, "learning_rate": 3.998664524710396e-06, "loss": 0.0, "num_input_tokens_seen": 142848608, "step": 66235 }, { "epoch": 12.156358964947696, "grad_norm": 0.0031005090568214655, "learning_rate": 3.997880003136991e-06, "loss": 0.0001, "num_input_tokens_seen": 142859168, "step": 66240 }, { "epoch": 12.15727656450725, "grad_norm": 0.00018326083954889327, "learning_rate": 3.9970955072664115e-06, "loss": 0.0, "num_input_tokens_seen": 142870592, "step": 66245 }, { "epoch": 12.158194164066801, "grad_norm": 0.0002545043535064906, "learning_rate": 3.996311037118776e-06, "loss": 0.0, "num_input_tokens_seen": 142880256, "step": 66250 }, { "epoch": 12.159111763626353, "grad_norm": 0.0008195178816094995, "learning_rate": 3.995526592714207e-06, "loss": 0.1198, "num_input_tokens_seen": 142890976, "step": 66255 }, { "epoch": 12.160029363185906, "grad_norm": 0.0012810347834601998, "learning_rate": 3.994742174072826e-06, "loss": 0.0005, "num_input_tokens_seen": 142901856, "step": 66260 }, { "epoch": 12.160946962745458, "grad_norm": 0.006296344567090273, "learning_rate": 3.993957781214749e-06, "loss": 0.0, "num_input_tokens_seen": 142912704, "step": 66265 }, { "epoch": 12.16186456230501, "grad_norm": 0.051365647464990616, "learning_rate": 3.993173414160094e-06, "loss": 0.0001, "num_input_tokens_seen": 142923456, "step": 66270 }, { "epoch": 12.162782161864563, "grad_norm": 0.010630324482917786, "learning_rate": 3.992389072928983e-06, "loss": 0.0426, "num_input_tokens_seen": 142934112, "step": 66275 }, { "epoch": 12.163699761424114, "grad_norm": 0.00017537405074108392, "learning_rate": 3.9916047575415304e-06, "loss": 0.194, "num_input_tokens_seen": 142945248, "step": 66280 }, { "epoch": 12.164617360983666, "grad_norm": 0.0006789241451770067, "learning_rate": 3.9908204680178505e-06, "loss": 0.0, "num_input_tokens_seen": 142956320, "step": 66285 }, { "epoch": 12.16553496054322, "grad_norm": 231.89695739746094, "learning_rate": 3.990036204378064e-06, "loss": 0.1782, "num_input_tokens_seen": 142968320, "step": 66290 }, { "epoch": 12.16645256010277, "grad_norm": 0.00020533536735456437, "learning_rate": 3.989251966642284e-06, "loss": 0.0, "num_input_tokens_seen": 142979392, "step": 66295 }, { "epoch": 12.167370159662322, "grad_norm": 0.006184027995914221, "learning_rate": 3.988467754830623e-06, "loss": 0.0, "num_input_tokens_seen": 142990112, "step": 66300 }, { "epoch": 12.168287759221876, "grad_norm": 0.0034253071062266827, "learning_rate": 3.9876835689631955e-06, "loss": 0.0, "num_input_tokens_seen": 143001952, "step": 66305 }, { "epoch": 12.169205358781428, "grad_norm": 0.07098422944545746, "learning_rate": 3.986899409060117e-06, "loss": 0.0001, "num_input_tokens_seen": 143013120, "step": 66310 }, { "epoch": 12.17012295834098, "grad_norm": 0.00032955664210021496, "learning_rate": 3.986115275141499e-06, "loss": 0.0001, "num_input_tokens_seen": 143024384, "step": 66315 }, { "epoch": 12.171040557900533, "grad_norm": 0.06742769479751587, "learning_rate": 3.98533116722745e-06, "loss": 0.0001, "num_input_tokens_seen": 143034560, "step": 66320 }, { "epoch": 12.171958157460084, "grad_norm": 0.0010266201570630074, "learning_rate": 3.984547085338087e-06, "loss": 0.0, "num_input_tokens_seen": 143046048, "step": 66325 }, { "epoch": 12.172875757019636, "grad_norm": 0.07627062499523163, "learning_rate": 3.983763029493517e-06, "loss": 0.0001, "num_input_tokens_seen": 143057504, "step": 66330 }, { "epoch": 12.17379335657919, "grad_norm": 0.00027912502991966903, "learning_rate": 3.982978999713849e-06, "loss": 0.2063, "num_input_tokens_seen": 143067648, "step": 66335 }, { "epoch": 12.17471095613874, "grad_norm": 139.7286834716797, "learning_rate": 3.9821949960191944e-06, "loss": 0.0119, "num_input_tokens_seen": 143078560, "step": 66340 }, { "epoch": 12.175628555698292, "grad_norm": 0.002280924003571272, "learning_rate": 3.981411018429661e-06, "loss": 0.0001, "num_input_tokens_seen": 143090240, "step": 66345 }, { "epoch": 12.176546155257846, "grad_norm": 0.0009428660850971937, "learning_rate": 3.980627066965356e-06, "loss": 0.0001, "num_input_tokens_seen": 143099808, "step": 66350 }, { "epoch": 12.177463754817397, "grad_norm": 0.0008576253894716501, "learning_rate": 3.979843141646385e-06, "loss": 0.0, "num_input_tokens_seen": 143109504, "step": 66355 }, { "epoch": 12.178381354376949, "grad_norm": 0.00033475153031758964, "learning_rate": 3.9790592424928596e-06, "loss": 0.0, "num_input_tokens_seen": 143119904, "step": 66360 }, { "epoch": 12.179298953936502, "grad_norm": 0.02064567618072033, "learning_rate": 3.978275369524881e-06, "loss": 0.0, "num_input_tokens_seen": 143131648, "step": 66365 }, { "epoch": 12.180216553496054, "grad_norm": 0.0008048558374866843, "learning_rate": 3.977491522762553e-06, "loss": 0.0, "num_input_tokens_seen": 143142560, "step": 66370 }, { "epoch": 12.181134153055606, "grad_norm": 0.04384078457951546, "learning_rate": 3.976707702225986e-06, "loss": 0.0001, "num_input_tokens_seen": 143153472, "step": 66375 }, { "epoch": 12.182051752615159, "grad_norm": 0.0040434286929667, "learning_rate": 3.97592390793528e-06, "loss": 0.0, "num_input_tokens_seen": 143164160, "step": 66380 }, { "epoch": 12.18296935217471, "grad_norm": 0.00839220080524683, "learning_rate": 3.975140139910538e-06, "loss": 0.0045, "num_input_tokens_seen": 143175200, "step": 66385 }, { "epoch": 12.183886951734262, "grad_norm": 0.012386121787130833, "learning_rate": 3.974356398171862e-06, "loss": 0.0, "num_input_tokens_seen": 143185152, "step": 66390 }, { "epoch": 12.184804551293816, "grad_norm": 0.019345438107848167, "learning_rate": 3.973572682739356e-06, "loss": 0.1221, "num_input_tokens_seen": 143196192, "step": 66395 }, { "epoch": 12.185722150853367, "grad_norm": 0.0006594655569642782, "learning_rate": 3.972788993633121e-06, "loss": 0.0, "num_input_tokens_seen": 143206880, "step": 66400 }, { "epoch": 12.186639750412919, "grad_norm": 0.00048247509403154254, "learning_rate": 3.972005330873253e-06, "loss": 0.0, "num_input_tokens_seen": 143218304, "step": 66405 }, { "epoch": 12.187557349972472, "grad_norm": 0.0008998529519885778, "learning_rate": 3.971221694479857e-06, "loss": 0.0, "num_input_tokens_seen": 143228672, "step": 66410 }, { "epoch": 12.188474949532024, "grad_norm": 0.007455230224877596, "learning_rate": 3.970438084473031e-06, "loss": 0.0, "num_input_tokens_seen": 143240544, "step": 66415 }, { "epoch": 12.189392549091576, "grad_norm": 0.0008627845090813935, "learning_rate": 3.96965450087287e-06, "loss": 0.0, "num_input_tokens_seen": 143252032, "step": 66420 }, { "epoch": 12.190310148651129, "grad_norm": 0.0013968003913760185, "learning_rate": 3.968870943699477e-06, "loss": 0.0, "num_input_tokens_seen": 143262560, "step": 66425 }, { "epoch": 12.19122774821068, "grad_norm": 0.004796986468136311, "learning_rate": 3.968087412972945e-06, "loss": 0.0, "num_input_tokens_seen": 143273760, "step": 66430 }, { "epoch": 12.192145347770232, "grad_norm": 0.0005606702761724591, "learning_rate": 3.967303908713372e-06, "loss": 0.0, "num_input_tokens_seen": 143284896, "step": 66435 }, { "epoch": 12.193062947329786, "grad_norm": 0.034595392644405365, "learning_rate": 3.966520430940852e-06, "loss": 0.0001, "num_input_tokens_seen": 143294528, "step": 66440 }, { "epoch": 12.193980546889337, "grad_norm": 0.0006001013680361211, "learning_rate": 3.965736979675481e-06, "loss": 0.0, "num_input_tokens_seen": 143305472, "step": 66445 }, { "epoch": 12.194898146448889, "grad_norm": 0.002791087608784437, "learning_rate": 3.9649535549373555e-06, "loss": 0.0, "num_input_tokens_seen": 143315584, "step": 66450 }, { "epoch": 12.195815746008442, "grad_norm": 0.003356133121997118, "learning_rate": 3.964170156746565e-06, "loss": 0.0, "num_input_tokens_seen": 143327008, "step": 66455 }, { "epoch": 12.196733345567994, "grad_norm": 0.025782311335206032, "learning_rate": 3.963386785123207e-06, "loss": 0.0, "num_input_tokens_seen": 143339296, "step": 66460 }, { "epoch": 12.197650945127545, "grad_norm": 0.014698652550578117, "learning_rate": 3.9626034400873695e-06, "loss": 0.0, "num_input_tokens_seen": 143350016, "step": 66465 }, { "epoch": 12.198568544687099, "grad_norm": 0.0005948166362941265, "learning_rate": 3.961820121659145e-06, "loss": 0.0, "num_input_tokens_seen": 143360704, "step": 66470 }, { "epoch": 12.19948614424665, "grad_norm": 0.014302907511591911, "learning_rate": 3.9610368298586275e-06, "loss": 0.0, "num_input_tokens_seen": 143372384, "step": 66475 }, { "epoch": 12.200403743806202, "grad_norm": 0.0006560453330166638, "learning_rate": 3.960253564705905e-06, "loss": 0.0, "num_input_tokens_seen": 143383392, "step": 66480 }, { "epoch": 12.201321343365755, "grad_norm": 0.0006171730346977711, "learning_rate": 3.959470326221066e-06, "loss": 0.0, "num_input_tokens_seen": 143394400, "step": 66485 }, { "epoch": 12.202238942925307, "grad_norm": 0.0017891150200739503, "learning_rate": 3.9586871144242e-06, "loss": 0.0, "num_input_tokens_seen": 143404544, "step": 66490 }, { "epoch": 12.20315654248486, "grad_norm": 0.001975721213966608, "learning_rate": 3.957903929335397e-06, "loss": 0.0, "num_input_tokens_seen": 143416192, "step": 66495 }, { "epoch": 12.204074142044412, "grad_norm": 0.00045929933548904955, "learning_rate": 3.957120770974743e-06, "loss": 0.0, "num_input_tokens_seen": 143427616, "step": 66500 }, { "epoch": 12.204991741603964, "grad_norm": 0.00020181669970043004, "learning_rate": 3.956337639362323e-06, "loss": 0.0, "num_input_tokens_seen": 143438752, "step": 66505 }, { "epoch": 12.205909341163517, "grad_norm": 0.0033742354717105627, "learning_rate": 3.955554534518227e-06, "loss": 0.0, "num_input_tokens_seen": 143449824, "step": 66510 }, { "epoch": 12.206826940723069, "grad_norm": 0.000918125850148499, "learning_rate": 3.954771456462538e-06, "loss": 0.0, "num_input_tokens_seen": 143460992, "step": 66515 }, { "epoch": 12.20774454028262, "grad_norm": 0.0037621748633682728, "learning_rate": 3.953988405215342e-06, "loss": 0.0, "num_input_tokens_seen": 143472416, "step": 66520 }, { "epoch": 12.208662139842174, "grad_norm": 0.02196151204407215, "learning_rate": 3.953205380796719e-06, "loss": 0.0, "num_input_tokens_seen": 143483552, "step": 66525 }, { "epoch": 12.209579739401725, "grad_norm": 0.0014446445275098085, "learning_rate": 3.952422383226759e-06, "loss": 0.0, "num_input_tokens_seen": 143494784, "step": 66530 }, { "epoch": 12.210497338961277, "grad_norm": 0.004212323110550642, "learning_rate": 3.951639412525541e-06, "loss": 0.0, "num_input_tokens_seen": 143506304, "step": 66535 }, { "epoch": 12.21141493852083, "grad_norm": 0.0037612332962453365, "learning_rate": 3.9508564687131465e-06, "loss": 0.0, "num_input_tokens_seen": 143518112, "step": 66540 }, { "epoch": 12.212332538080382, "grad_norm": 0.0018770896131172776, "learning_rate": 3.950073551809657e-06, "loss": 0.0, "num_input_tokens_seen": 143527264, "step": 66545 }, { "epoch": 12.213250137639934, "grad_norm": 0.0052648582495749, "learning_rate": 3.9492906618351545e-06, "loss": 0.0001, "num_input_tokens_seen": 143539488, "step": 66550 }, { "epoch": 12.214167737199487, "grad_norm": 0.00024247310648206621, "learning_rate": 3.948507798809718e-06, "loss": 0.0001, "num_input_tokens_seen": 143549792, "step": 66555 }, { "epoch": 12.215085336759039, "grad_norm": 0.0014780022902414203, "learning_rate": 3.9477249627534265e-06, "loss": 0.0, "num_input_tokens_seen": 143560192, "step": 66560 }, { "epoch": 12.21600293631859, "grad_norm": 0.0015837588580325246, "learning_rate": 3.9469421536863595e-06, "loss": 0.0, "num_input_tokens_seen": 143570944, "step": 66565 }, { "epoch": 12.216920535878144, "grad_norm": 0.001117031672038138, "learning_rate": 3.946159371628593e-06, "loss": 0.0, "num_input_tokens_seen": 143582592, "step": 66570 }, { "epoch": 12.217838135437695, "grad_norm": 0.0021667354740202427, "learning_rate": 3.945376616600205e-06, "loss": 0.0001, "num_input_tokens_seen": 143592320, "step": 66575 }, { "epoch": 12.218755734997247, "grad_norm": 0.009191574528813362, "learning_rate": 3.944593888621274e-06, "loss": 0.0001, "num_input_tokens_seen": 143602176, "step": 66580 }, { "epoch": 12.2196733345568, "grad_norm": 0.021878546103835106, "learning_rate": 3.943811187711873e-06, "loss": 0.0, "num_input_tokens_seen": 143613056, "step": 66585 }, { "epoch": 12.220590934116352, "grad_norm": 0.0005180682055652142, "learning_rate": 3.943028513892078e-06, "loss": 0.0, "num_input_tokens_seen": 143623712, "step": 66590 }, { "epoch": 12.221508533675904, "grad_norm": 0.0006809529149904847, "learning_rate": 3.942245867181964e-06, "loss": 0.0, "num_input_tokens_seen": 143634560, "step": 66595 }, { "epoch": 12.222426133235457, "grad_norm": 0.009020345285534859, "learning_rate": 3.941463247601604e-06, "loss": 0.0, "num_input_tokens_seen": 143643648, "step": 66600 }, { "epoch": 12.223343732795009, "grad_norm": 0.12838315963745117, "learning_rate": 3.940680655171069e-06, "loss": 0.0001, "num_input_tokens_seen": 143654528, "step": 66605 }, { "epoch": 12.22426133235456, "grad_norm": 0.0004015049780718982, "learning_rate": 3.939898089910436e-06, "loss": 0.0, "num_input_tokens_seen": 143666336, "step": 66610 }, { "epoch": 12.225178931914114, "grad_norm": 0.0012628381373360753, "learning_rate": 3.939115551839774e-06, "loss": 0.0, "num_input_tokens_seen": 143676544, "step": 66615 }, { "epoch": 12.226096531473665, "grad_norm": 0.013888636603951454, "learning_rate": 3.9383330409791545e-06, "loss": 0.0, "num_input_tokens_seen": 143688384, "step": 66620 }, { "epoch": 12.227014131033217, "grad_norm": 0.00021642718638759106, "learning_rate": 3.937550557348644e-06, "loss": 0.0, "num_input_tokens_seen": 143699072, "step": 66625 }, { "epoch": 12.22793173059277, "grad_norm": 0.00031537609174847603, "learning_rate": 3.936768100968317e-06, "loss": 0.0, "num_input_tokens_seen": 143707744, "step": 66630 }, { "epoch": 12.228849330152322, "grad_norm": 0.0021125716157257557, "learning_rate": 3.935985671858241e-06, "loss": 0.0, "num_input_tokens_seen": 143719680, "step": 66635 }, { "epoch": 12.229766929711873, "grad_norm": 0.00024703051894903183, "learning_rate": 3.935203270038481e-06, "loss": 0.0, "num_input_tokens_seen": 143730592, "step": 66640 }, { "epoch": 12.230684529271427, "grad_norm": 0.00022408868244383484, "learning_rate": 3.934420895529109e-06, "loss": 0.0, "num_input_tokens_seen": 143741888, "step": 66645 }, { "epoch": 12.231602128830978, "grad_norm": 0.0007616690127179027, "learning_rate": 3.933638548350189e-06, "loss": 0.0, "num_input_tokens_seen": 143751424, "step": 66650 }, { "epoch": 12.23251972839053, "grad_norm": 0.00024056047550402582, "learning_rate": 3.932856228521788e-06, "loss": 0.0005, "num_input_tokens_seen": 143762656, "step": 66655 }, { "epoch": 12.233437327950083, "grad_norm": 0.0006215209723450243, "learning_rate": 3.93207393606397e-06, "loss": 0.0, "num_input_tokens_seen": 143773120, "step": 66660 }, { "epoch": 12.234354927509635, "grad_norm": 449.5501708984375, "learning_rate": 3.931291670996801e-06, "loss": 0.0376, "num_input_tokens_seen": 143784352, "step": 66665 }, { "epoch": 12.235272527069187, "grad_norm": 0.00308351032435894, "learning_rate": 3.930509433340344e-06, "loss": 0.0, "num_input_tokens_seen": 143794784, "step": 66670 }, { "epoch": 12.23619012662874, "grad_norm": 0.001859593903645873, "learning_rate": 3.929727223114662e-06, "loss": 0.0, "num_input_tokens_seen": 143806176, "step": 66675 }, { "epoch": 12.237107726188292, "grad_norm": 0.00019693579815793782, "learning_rate": 3.928945040339819e-06, "loss": 0.0, "num_input_tokens_seen": 143816032, "step": 66680 }, { "epoch": 12.238025325747843, "grad_norm": 0.00019441389304120094, "learning_rate": 3.928162885035877e-06, "loss": 0.0, "num_input_tokens_seen": 143827072, "step": 66685 }, { "epoch": 12.238942925307397, "grad_norm": 0.0004257875552866608, "learning_rate": 3.927380757222892e-06, "loss": 0.0, "num_input_tokens_seen": 143837472, "step": 66690 }, { "epoch": 12.239860524866948, "grad_norm": 0.00010463520447956398, "learning_rate": 3.92659865692093e-06, "loss": 0.0, "num_input_tokens_seen": 143849376, "step": 66695 }, { "epoch": 12.2407781244265, "grad_norm": 0.0002391274319961667, "learning_rate": 3.92581658415005e-06, "loss": 0.0, "num_input_tokens_seen": 143858880, "step": 66700 }, { "epoch": 12.241695723986053, "grad_norm": 0.004870557691901922, "learning_rate": 3.925034538930309e-06, "loss": 0.0, "num_input_tokens_seen": 143871712, "step": 66705 }, { "epoch": 12.242613323545605, "grad_norm": 0.0024737429339438677, "learning_rate": 3.9242525212817645e-06, "loss": 0.0284, "num_input_tokens_seen": 143881760, "step": 66710 }, { "epoch": 12.243530923105157, "grad_norm": 0.00019497632456477731, "learning_rate": 3.923470531224478e-06, "loss": 0.0, "num_input_tokens_seen": 143892224, "step": 66715 }, { "epoch": 12.24444852266471, "grad_norm": 0.000745822791941464, "learning_rate": 3.9226885687785035e-06, "loss": 0.0, "num_input_tokens_seen": 143902144, "step": 66720 }, { "epoch": 12.245366122224262, "grad_norm": 0.000562158995307982, "learning_rate": 3.921906633963894e-06, "loss": 0.0, "num_input_tokens_seen": 143913280, "step": 66725 }, { "epoch": 12.246283721783813, "grad_norm": 0.015364328399300575, "learning_rate": 3.921124726800713e-06, "loss": 0.0, "num_input_tokens_seen": 143924224, "step": 66730 }, { "epoch": 12.247201321343367, "grad_norm": 0.00045953027438372374, "learning_rate": 3.920342847309009e-06, "loss": 0.0352, "num_input_tokens_seen": 143935648, "step": 66735 }, { "epoch": 12.248118920902918, "grad_norm": 0.008264818228781223, "learning_rate": 3.919560995508836e-06, "loss": 0.0, "num_input_tokens_seen": 143946080, "step": 66740 }, { "epoch": 12.24903652046247, "grad_norm": 53.509742736816406, "learning_rate": 3.918779171420251e-06, "loss": 0.0131, "num_input_tokens_seen": 143957952, "step": 66745 }, { "epoch": 12.249954120022023, "grad_norm": 0.000973895366769284, "learning_rate": 3.917997375063305e-06, "loss": 0.0, "num_input_tokens_seen": 143968672, "step": 66750 }, { "epoch": 12.250871719581575, "grad_norm": 0.0002736696333158761, "learning_rate": 3.917215606458049e-06, "loss": 0.0, "num_input_tokens_seen": 143979072, "step": 66755 }, { "epoch": 12.251789319141126, "grad_norm": 0.010114417411386967, "learning_rate": 3.916433865624533e-06, "loss": 0.0, "num_input_tokens_seen": 143990848, "step": 66760 }, { "epoch": 12.25270691870068, "grad_norm": 0.0014382610097527504, "learning_rate": 3.915652152582809e-06, "loss": 0.0, "num_input_tokens_seen": 144002720, "step": 66765 }, { "epoch": 12.253624518260231, "grad_norm": 0.0008825454860925674, "learning_rate": 3.914870467352928e-06, "loss": 0.0, "num_input_tokens_seen": 144011392, "step": 66770 }, { "epoch": 12.254542117819783, "grad_norm": 0.0004326452617533505, "learning_rate": 3.914088809954937e-06, "loss": 0.0, "num_input_tokens_seen": 144022944, "step": 66775 }, { "epoch": 12.255459717379336, "grad_norm": 0.0010776609415188432, "learning_rate": 3.913307180408886e-06, "loss": 0.0, "num_input_tokens_seen": 144032800, "step": 66780 }, { "epoch": 12.256377316938888, "grad_norm": 0.0011908153537660837, "learning_rate": 3.912525578734822e-06, "loss": 0.0001, "num_input_tokens_seen": 144043552, "step": 66785 }, { "epoch": 12.25729491649844, "grad_norm": 1.4236054420471191, "learning_rate": 3.9117440049527885e-06, "loss": 0.0002, "num_input_tokens_seen": 144055104, "step": 66790 }, { "epoch": 12.258212516057993, "grad_norm": 0.00020886381389573216, "learning_rate": 3.910962459082837e-06, "loss": 0.0001, "num_input_tokens_seen": 144066720, "step": 66795 }, { "epoch": 12.259130115617545, "grad_norm": 0.00040133082075044513, "learning_rate": 3.910180941145011e-06, "loss": 0.0025, "num_input_tokens_seen": 144078048, "step": 66800 }, { "epoch": 12.260047715177096, "grad_norm": 0.0014805539976805449, "learning_rate": 3.909399451159354e-06, "loss": 0.0006, "num_input_tokens_seen": 144088416, "step": 66805 }, { "epoch": 12.26096531473665, "grad_norm": 0.00020859359938185662, "learning_rate": 3.90861798914591e-06, "loss": 0.1254, "num_input_tokens_seen": 144099200, "step": 66810 }, { "epoch": 12.261882914296201, "grad_norm": 0.00022538415214512497, "learning_rate": 3.907836555124724e-06, "loss": 0.0, "num_input_tokens_seen": 144111424, "step": 66815 }, { "epoch": 12.262800513855753, "grad_norm": 0.0005841200472787023, "learning_rate": 3.907055149115838e-06, "loss": 0.056, "num_input_tokens_seen": 144122560, "step": 66820 }, { "epoch": 12.263718113415306, "grad_norm": 0.9379125237464905, "learning_rate": 3.906273771139291e-06, "loss": 0.0004, "num_input_tokens_seen": 144132448, "step": 66825 }, { "epoch": 12.264635712974858, "grad_norm": 11.486176490783691, "learning_rate": 3.905492421215129e-06, "loss": 0.0119, "num_input_tokens_seen": 144144320, "step": 66830 }, { "epoch": 12.26555331253441, "grad_norm": 0.00033405545400455594, "learning_rate": 3.9047110993633905e-06, "loss": 0.0, "num_input_tokens_seen": 144155136, "step": 66835 }, { "epoch": 12.266470912093963, "grad_norm": 0.0007224180735647678, "learning_rate": 3.9039298056041145e-06, "loss": 0.0, "num_input_tokens_seen": 144165696, "step": 66840 }, { "epoch": 12.267388511653515, "grad_norm": 0.00018828673637472093, "learning_rate": 3.903148539957339e-06, "loss": 0.0532, "num_input_tokens_seen": 144177056, "step": 66845 }, { "epoch": 12.268306111213066, "grad_norm": 0.005367327481508255, "learning_rate": 3.902367302443104e-06, "loss": 0.0002, "num_input_tokens_seen": 144187648, "step": 66850 }, { "epoch": 12.26922371077262, "grad_norm": 0.0005448778974823654, "learning_rate": 3.901586093081447e-06, "loss": 0.0001, "num_input_tokens_seen": 144197984, "step": 66855 }, { "epoch": 12.270141310332171, "grad_norm": 0.0024730160366743803, "learning_rate": 3.900804911892402e-06, "loss": 0.0, "num_input_tokens_seen": 144208288, "step": 66860 }, { "epoch": 12.271058909891723, "grad_norm": 0.0009656832553446293, "learning_rate": 3.900023758896011e-06, "loss": 0.0, "num_input_tokens_seen": 144218016, "step": 66865 }, { "epoch": 12.271976509451276, "grad_norm": 0.0008385420660488307, "learning_rate": 3.899242634112304e-06, "loss": 0.0002, "num_input_tokens_seen": 144228672, "step": 66870 }, { "epoch": 12.272894109010828, "grad_norm": 0.0018710995791479945, "learning_rate": 3.8984615375613175e-06, "loss": 0.0, "num_input_tokens_seen": 144239424, "step": 66875 }, { "epoch": 12.27381170857038, "grad_norm": 56.62446212768555, "learning_rate": 3.897680469263085e-06, "loss": 0.056, "num_input_tokens_seen": 144250624, "step": 66880 }, { "epoch": 12.274729308129933, "grad_norm": 0.00037078699097037315, "learning_rate": 3.896899429237641e-06, "loss": 0.0001, "num_input_tokens_seen": 144261728, "step": 66885 }, { "epoch": 12.275646907689485, "grad_norm": 0.004584047477692366, "learning_rate": 3.896118417505016e-06, "loss": 0.0, "num_input_tokens_seen": 144272832, "step": 66890 }, { "epoch": 12.276564507249036, "grad_norm": 0.004311373922973871, "learning_rate": 3.8953374340852435e-06, "loss": 0.0001, "num_input_tokens_seen": 144285088, "step": 66895 }, { "epoch": 12.27748210680859, "grad_norm": 0.00018112410907633603, "learning_rate": 3.8945564789983535e-06, "loss": 0.0002, "num_input_tokens_seen": 144296960, "step": 66900 }, { "epoch": 12.278399706368141, "grad_norm": 0.0011381194926798344, "learning_rate": 3.893775552264377e-06, "loss": 0.0009, "num_input_tokens_seen": 144306944, "step": 66905 }, { "epoch": 12.279317305927693, "grad_norm": 0.0011868259171023965, "learning_rate": 3.892994653903342e-06, "loss": 0.0001, "num_input_tokens_seen": 144318528, "step": 66910 }, { "epoch": 12.280234905487246, "grad_norm": 0.004834455903619528, "learning_rate": 3.892213783935279e-06, "loss": 0.0, "num_input_tokens_seen": 144329056, "step": 66915 }, { "epoch": 12.281152505046798, "grad_norm": 0.10572399944067001, "learning_rate": 3.891432942380215e-06, "loss": 0.0002, "num_input_tokens_seen": 144339200, "step": 66920 }, { "epoch": 12.28207010460635, "grad_norm": 0.00023902434622868896, "learning_rate": 3.8906521292581775e-06, "loss": 0.0, "num_input_tokens_seen": 144350752, "step": 66925 }, { "epoch": 12.282987704165903, "grad_norm": 0.03650106117129326, "learning_rate": 3.889871344589195e-06, "loss": 0.0005, "num_input_tokens_seen": 144361152, "step": 66930 }, { "epoch": 12.283905303725454, "grad_norm": 0.0014968918403610587, "learning_rate": 3.8890905883932926e-06, "loss": 0.0, "num_input_tokens_seen": 144372448, "step": 66935 }, { "epoch": 12.284822903285006, "grad_norm": 0.0004339124425314367, "learning_rate": 3.888309860690493e-06, "loss": 0.0001, "num_input_tokens_seen": 144383680, "step": 66940 }, { "epoch": 12.28574050284456, "grad_norm": 0.0011449706507846713, "learning_rate": 3.887529161500822e-06, "loss": 0.0001, "num_input_tokens_seen": 144396224, "step": 66945 }, { "epoch": 12.286658102404111, "grad_norm": 0.0026792448479682207, "learning_rate": 3.886748490844306e-06, "loss": 0.0146, "num_input_tokens_seen": 144408672, "step": 66950 }, { "epoch": 12.287575701963663, "grad_norm": 0.0003059382434003055, "learning_rate": 3.885967848740965e-06, "loss": 0.0001, "num_input_tokens_seen": 144418976, "step": 66955 }, { "epoch": 12.288493301523216, "grad_norm": 0.010616696439683437, "learning_rate": 3.885187235210821e-06, "loss": 0.0003, "num_input_tokens_seen": 144429088, "step": 66960 }, { "epoch": 12.289410901082768, "grad_norm": 0.00019817064458038658, "learning_rate": 3.884406650273897e-06, "loss": 0.0, "num_input_tokens_seen": 144439296, "step": 66965 }, { "epoch": 12.29032850064232, "grad_norm": 0.005737687461078167, "learning_rate": 3.883626093950215e-06, "loss": 0.0001, "num_input_tokens_seen": 144449856, "step": 66970 }, { "epoch": 12.291246100201873, "grad_norm": 9.77099480223842e-05, "learning_rate": 3.882845566259792e-06, "loss": 0.0001, "num_input_tokens_seen": 144461568, "step": 66975 }, { "epoch": 12.292163699761424, "grad_norm": 0.0027493222150951624, "learning_rate": 3.88206506722265e-06, "loss": 0.0, "num_input_tokens_seen": 144472192, "step": 66980 }, { "epoch": 12.293081299320976, "grad_norm": 0.00018914406246040016, "learning_rate": 3.881284596858805e-06, "loss": 0.0, "num_input_tokens_seen": 144483552, "step": 66985 }, { "epoch": 12.29399889888053, "grad_norm": 0.0006060993764549494, "learning_rate": 3.880504155188277e-06, "loss": 0.0, "num_input_tokens_seen": 144493984, "step": 66990 }, { "epoch": 12.294916498440081, "grad_norm": 0.00014989507326390594, "learning_rate": 3.879723742231082e-06, "loss": 0.0001, "num_input_tokens_seen": 144505824, "step": 66995 }, { "epoch": 12.295834097999633, "grad_norm": 0.006933178752660751, "learning_rate": 3.878943358007238e-06, "loss": 0.0, "num_input_tokens_seen": 144516384, "step": 67000 }, { "epoch": 12.296751697559186, "grad_norm": 0.00020265477360226214, "learning_rate": 3.878163002536759e-06, "loss": 0.0, "num_input_tokens_seen": 144527264, "step": 67005 }, { "epoch": 12.297669297118738, "grad_norm": 0.0008330722339451313, "learning_rate": 3.877382675839657e-06, "loss": 0.0, "num_input_tokens_seen": 144537824, "step": 67010 }, { "epoch": 12.29858689667829, "grad_norm": 0.315868616104126, "learning_rate": 3.876602377935953e-06, "loss": 0.0002, "num_input_tokens_seen": 144546272, "step": 67015 }, { "epoch": 12.299504496237843, "grad_norm": 0.0016221324913203716, "learning_rate": 3.875822108845657e-06, "loss": 0.0674, "num_input_tokens_seen": 144558048, "step": 67020 }, { "epoch": 12.300422095797394, "grad_norm": 0.0017346306703984737, "learning_rate": 3.87504186858878e-06, "loss": 0.0001, "num_input_tokens_seen": 144568672, "step": 67025 }, { "epoch": 12.301339695356946, "grad_norm": 0.0010296556865796447, "learning_rate": 3.8742616571853355e-06, "loss": 0.0, "num_input_tokens_seen": 144579392, "step": 67030 }, { "epoch": 12.3022572949165, "grad_norm": 0.0002829751465469599, "learning_rate": 3.873481474655336e-06, "loss": 0.0, "num_input_tokens_seen": 144590624, "step": 67035 }, { "epoch": 12.30317489447605, "grad_norm": 0.00019805383635684848, "learning_rate": 3.87270132101879e-06, "loss": 0.0, "num_input_tokens_seen": 144602912, "step": 67040 }, { "epoch": 12.304092494035602, "grad_norm": 0.00042169837979599833, "learning_rate": 3.871921196295706e-06, "loss": 0.0, "num_input_tokens_seen": 144613568, "step": 67045 }, { "epoch": 12.305010093595156, "grad_norm": 0.00035508241853676736, "learning_rate": 3.8711411005060985e-06, "loss": 0.0001, "num_input_tokens_seen": 144625568, "step": 67050 }, { "epoch": 12.305927693154707, "grad_norm": 0.0002485075674485415, "learning_rate": 3.870361033669971e-06, "loss": 0.0001, "num_input_tokens_seen": 144635520, "step": 67055 }, { "epoch": 12.306845292714259, "grad_norm": 0.00018749403534457088, "learning_rate": 3.869580995807331e-06, "loss": 0.0, "num_input_tokens_seen": 144647072, "step": 67060 }, { "epoch": 12.307762892273812, "grad_norm": 0.0004391293623484671, "learning_rate": 3.868800986938187e-06, "loss": 0.0002, "num_input_tokens_seen": 144657920, "step": 67065 }, { "epoch": 12.308680491833364, "grad_norm": 0.0012213471345603466, "learning_rate": 3.868021007082546e-06, "loss": 0.0, "num_input_tokens_seen": 144668256, "step": 67070 }, { "epoch": 12.309598091392916, "grad_norm": 0.00047475381870754063, "learning_rate": 3.867241056260411e-06, "loss": 0.0, "num_input_tokens_seen": 144678976, "step": 67075 }, { "epoch": 12.31051569095247, "grad_norm": 0.0003591535205487162, "learning_rate": 3.8664611344917865e-06, "loss": 0.0, "num_input_tokens_seen": 144688768, "step": 67080 }, { "epoch": 12.31143329051202, "grad_norm": 0.00014612464292440563, "learning_rate": 3.865681241796677e-06, "loss": 0.0, "num_input_tokens_seen": 144700032, "step": 67085 }, { "epoch": 12.312350890071572, "grad_norm": 0.0003686454729177058, "learning_rate": 3.864901378195086e-06, "loss": 0.0, "num_input_tokens_seen": 144711040, "step": 67090 }, { "epoch": 12.313268489631126, "grad_norm": 0.0008437538053840399, "learning_rate": 3.864121543707016e-06, "loss": 0.0532, "num_input_tokens_seen": 144722656, "step": 67095 }, { "epoch": 12.314186089190677, "grad_norm": 0.00047860396443866193, "learning_rate": 3.863341738352468e-06, "loss": 0.0, "num_input_tokens_seen": 144733344, "step": 67100 }, { "epoch": 12.315103688750229, "grad_norm": 0.0029053620528429747, "learning_rate": 3.862561962151442e-06, "loss": 0.0, "num_input_tokens_seen": 144743616, "step": 67105 }, { "epoch": 12.316021288309782, "grad_norm": 0.002197476802393794, "learning_rate": 3.8617822151239374e-06, "loss": 0.0, "num_input_tokens_seen": 144754432, "step": 67110 }, { "epoch": 12.316938887869334, "grad_norm": 0.00032175841624848545, "learning_rate": 3.861002497289957e-06, "loss": 0.0, "num_input_tokens_seen": 144765248, "step": 67115 }, { "epoch": 12.317856487428886, "grad_norm": 0.0006666153785772622, "learning_rate": 3.860222808669498e-06, "loss": 0.0, "num_input_tokens_seen": 144776608, "step": 67120 }, { "epoch": 12.318774086988439, "grad_norm": 0.00020218643476255238, "learning_rate": 3.859443149282556e-06, "loss": 0.0005, "num_input_tokens_seen": 144787488, "step": 67125 }, { "epoch": 12.31969168654799, "grad_norm": 0.0009361114935018122, "learning_rate": 3.858663519149128e-06, "loss": 0.0, "num_input_tokens_seen": 144798592, "step": 67130 }, { "epoch": 12.320609286107542, "grad_norm": 0.006491812411695719, "learning_rate": 3.857883918289215e-06, "loss": 0.0003, "num_input_tokens_seen": 144809632, "step": 67135 }, { "epoch": 12.321526885667096, "grad_norm": 0.0001452541328035295, "learning_rate": 3.857104346722808e-06, "loss": 0.0, "num_input_tokens_seen": 144819744, "step": 67140 }, { "epoch": 12.322444485226647, "grad_norm": 0.0007673038053326309, "learning_rate": 3.856324804469901e-06, "loss": 0.2094, "num_input_tokens_seen": 144830208, "step": 67145 }, { "epoch": 12.323362084786199, "grad_norm": 0.0044380431063473225, "learning_rate": 3.855545291550493e-06, "loss": 0.0119, "num_input_tokens_seen": 144841344, "step": 67150 }, { "epoch": 12.324279684345752, "grad_norm": 0.001726461574435234, "learning_rate": 3.854765807984575e-06, "loss": 0.0, "num_input_tokens_seen": 144851456, "step": 67155 }, { "epoch": 12.325197283905304, "grad_norm": 0.000443507160525769, "learning_rate": 3.853986353792138e-06, "loss": 0.0, "num_input_tokens_seen": 144861792, "step": 67160 }, { "epoch": 12.326114883464856, "grad_norm": 0.000565911119338125, "learning_rate": 3.853206928993174e-06, "loss": 0.0002, "num_input_tokens_seen": 144873728, "step": 67165 }, { "epoch": 12.327032483024409, "grad_norm": 0.0003915352572221309, "learning_rate": 3.852427533607676e-06, "loss": 0.0, "num_input_tokens_seen": 144885024, "step": 67170 }, { "epoch": 12.32795008258396, "grad_norm": 0.006030240561813116, "learning_rate": 3.851648167655634e-06, "loss": 0.0, "num_input_tokens_seen": 144895488, "step": 67175 }, { "epoch": 12.328867682143512, "grad_norm": 0.00022856882424093783, "learning_rate": 3.850868831157034e-06, "loss": 0.0, "num_input_tokens_seen": 144905760, "step": 67180 }, { "epoch": 12.329785281703066, "grad_norm": 0.013068396598100662, "learning_rate": 3.850089524131869e-06, "loss": 0.0, "num_input_tokens_seen": 144917312, "step": 67185 }, { "epoch": 12.330702881262617, "grad_norm": 0.0011531243799254298, "learning_rate": 3.849310246600126e-06, "loss": 0.0, "num_input_tokens_seen": 144927520, "step": 67190 }, { "epoch": 12.331620480822169, "grad_norm": 0.22274735569953918, "learning_rate": 3.848530998581792e-06, "loss": 0.0001, "num_input_tokens_seen": 144937280, "step": 67195 }, { "epoch": 12.332538080381722, "grad_norm": 0.0007229212787933648, "learning_rate": 3.847751780096852e-06, "loss": 0.0617, "num_input_tokens_seen": 144948320, "step": 67200 }, { "epoch": 12.333455679941274, "grad_norm": 0.0013621265534311533, "learning_rate": 3.846972591165293e-06, "loss": 0.0001, "num_input_tokens_seen": 144959200, "step": 67205 }, { "epoch": 12.334373279500825, "grad_norm": 33.58428192138672, "learning_rate": 3.846193431807102e-06, "loss": 0.2332, "num_input_tokens_seen": 144970528, "step": 67210 }, { "epoch": 12.335290879060379, "grad_norm": 36.2665901184082, "learning_rate": 3.845414302042259e-06, "loss": 0.0328, "num_input_tokens_seen": 144982336, "step": 67215 }, { "epoch": 12.33620847861993, "grad_norm": 0.1124655231833458, "learning_rate": 3.844635201890751e-06, "loss": 0.0002, "num_input_tokens_seen": 144992256, "step": 67220 }, { "epoch": 12.337126078179482, "grad_norm": 0.0030321748927235603, "learning_rate": 3.843856131372561e-06, "loss": 0.0, "num_input_tokens_seen": 145003488, "step": 67225 }, { "epoch": 12.338043677739035, "grad_norm": 0.0010534680914133787, "learning_rate": 3.843077090507664e-06, "loss": 0.2503, "num_input_tokens_seen": 145014880, "step": 67230 }, { "epoch": 12.338961277298587, "grad_norm": 0.004025212489068508, "learning_rate": 3.842298079316052e-06, "loss": 0.0284, "num_input_tokens_seen": 145024864, "step": 67235 }, { "epoch": 12.339878876858139, "grad_norm": 0.0005857559735886753, "learning_rate": 3.841519097817698e-06, "loss": 0.0003, "num_input_tokens_seen": 145035520, "step": 67240 }, { "epoch": 12.340796476417692, "grad_norm": 0.0034229126758873463, "learning_rate": 3.840740146032582e-06, "loss": 0.0001, "num_input_tokens_seen": 145045504, "step": 67245 }, { "epoch": 12.341714075977244, "grad_norm": 0.0007947820704430342, "learning_rate": 3.839961223980686e-06, "loss": 0.0, "num_input_tokens_seen": 145056192, "step": 67250 }, { "epoch": 12.342631675536795, "grad_norm": 0.0034539210610091686, "learning_rate": 3.8391823316819886e-06, "loss": 0.0, "num_input_tokens_seen": 145067424, "step": 67255 }, { "epoch": 12.343549275096349, "grad_norm": 0.0014075543731451035, "learning_rate": 3.838403469156464e-06, "loss": 0.0001, "num_input_tokens_seen": 145078144, "step": 67260 }, { "epoch": 12.3444668746559, "grad_norm": 0.00022103142691776156, "learning_rate": 3.837624636424088e-06, "loss": 0.0, "num_input_tokens_seen": 145088960, "step": 67265 }, { "epoch": 12.345384474215452, "grad_norm": 0.10315738618373871, "learning_rate": 3.836845833504841e-06, "loss": 0.0002, "num_input_tokens_seen": 145101024, "step": 67270 }, { "epoch": 12.346302073775005, "grad_norm": 0.0019720452837646008, "learning_rate": 3.836067060418695e-06, "loss": 0.0, "num_input_tokens_seen": 145111680, "step": 67275 }, { "epoch": 12.347219673334557, "grad_norm": 0.032733939588069916, "learning_rate": 3.835288317185623e-06, "loss": 0.0, "num_input_tokens_seen": 145122464, "step": 67280 }, { "epoch": 12.348137272894109, "grad_norm": 0.000656544347293675, "learning_rate": 3.8345096038256035e-06, "loss": 0.0003, "num_input_tokens_seen": 145132512, "step": 67285 }, { "epoch": 12.349054872453662, "grad_norm": 0.0007084101089276373, "learning_rate": 3.8337309203586055e-06, "loss": 0.0, "num_input_tokens_seen": 145143360, "step": 67290 }, { "epoch": 12.349972472013214, "grad_norm": 8.82004165649414, "learning_rate": 3.832952266804602e-06, "loss": 0.0054, "num_input_tokens_seen": 145153728, "step": 67295 }, { "epoch": 12.350890071572765, "grad_norm": 0.5014089345932007, "learning_rate": 3.832173643183564e-06, "loss": 0.0002, "num_input_tokens_seen": 145165280, "step": 67300 }, { "epoch": 12.351807671132319, "grad_norm": 0.00031409546500071883, "learning_rate": 3.831395049515461e-06, "loss": 0.0001, "num_input_tokens_seen": 145176416, "step": 67305 }, { "epoch": 12.35272527069187, "grad_norm": 0.0018042997689917684, "learning_rate": 3.830616485820264e-06, "loss": 0.0, "num_input_tokens_seen": 145188064, "step": 67310 }, { "epoch": 12.353642870251422, "grad_norm": 0.0008691922412253916, "learning_rate": 3.829837952117942e-06, "loss": 0.0001, "num_input_tokens_seen": 145199616, "step": 67315 }, { "epoch": 12.354560469810975, "grad_norm": 0.004795441869646311, "learning_rate": 3.829059448428465e-06, "loss": 0.0131, "num_input_tokens_seen": 145210304, "step": 67320 }, { "epoch": 12.355478069370527, "grad_norm": 0.0008677531150169671, "learning_rate": 3.828280974771796e-06, "loss": 0.1159, "num_input_tokens_seen": 145221504, "step": 67325 }, { "epoch": 12.356395668930078, "grad_norm": 0.04155537113547325, "learning_rate": 3.827502531167903e-06, "loss": 0.0, "num_input_tokens_seen": 145233248, "step": 67330 }, { "epoch": 12.357313268489632, "grad_norm": 0.0002303598157595843, "learning_rate": 3.826724117636756e-06, "loss": 0.0, "num_input_tokens_seen": 145244256, "step": 67335 }, { "epoch": 12.358230868049183, "grad_norm": 0.0001482227089582011, "learning_rate": 3.825945734198315e-06, "loss": 0.0, "num_input_tokens_seen": 145256000, "step": 67340 }, { "epoch": 12.359148467608735, "grad_norm": 0.0008322485955432057, "learning_rate": 3.825167380872547e-06, "loss": 0.0, "num_input_tokens_seen": 145266080, "step": 67345 }, { "epoch": 12.360066067168288, "grad_norm": 0.0005355315515771508, "learning_rate": 3.824389057679413e-06, "loss": 0.0, "num_input_tokens_seen": 145276768, "step": 67350 }, { "epoch": 12.36098366672784, "grad_norm": 0.0005788904963992536, "learning_rate": 3.82361076463888e-06, "loss": 0.0, "num_input_tokens_seen": 145287264, "step": 67355 }, { "epoch": 12.361901266287392, "grad_norm": 0.008725390769541264, "learning_rate": 3.822832501770908e-06, "loss": 0.0, "num_input_tokens_seen": 145298560, "step": 67360 }, { "epoch": 12.362818865846945, "grad_norm": 0.0003938661247957498, "learning_rate": 3.822054269095455e-06, "loss": 0.0, "num_input_tokens_seen": 145309280, "step": 67365 }, { "epoch": 12.363736465406497, "grad_norm": 0.004270039498806, "learning_rate": 3.821276066632487e-06, "loss": 0.0, "num_input_tokens_seen": 145320640, "step": 67370 }, { "epoch": 12.364654064966048, "grad_norm": 0.5662597417831421, "learning_rate": 3.82049789440196e-06, "loss": 0.0001, "num_input_tokens_seen": 145332096, "step": 67375 }, { "epoch": 12.365571664525602, "grad_norm": 0.0003834418603219092, "learning_rate": 3.819719752423833e-06, "loss": 0.0, "num_input_tokens_seen": 145341696, "step": 67380 }, { "epoch": 12.366489264085153, "grad_norm": 0.005057133734226227, "learning_rate": 3.8189416407180665e-06, "loss": 0.0, "num_input_tokens_seen": 145353504, "step": 67385 }, { "epoch": 12.367406863644705, "grad_norm": 0.0005336771719157696, "learning_rate": 3.8181635593046165e-06, "loss": 0.0001, "num_input_tokens_seen": 145364608, "step": 67390 }, { "epoch": 12.368324463204258, "grad_norm": 0.003036161884665489, "learning_rate": 3.81738550820344e-06, "loss": 0.0, "num_input_tokens_seen": 145374464, "step": 67395 }, { "epoch": 12.36924206276381, "grad_norm": 0.0029513321351259947, "learning_rate": 3.8166074874344895e-06, "loss": 0.0, "num_input_tokens_seen": 145385344, "step": 67400 }, { "epoch": 12.370159662323362, "grad_norm": 0.0004425362276379019, "learning_rate": 3.8158294970177256e-06, "loss": 0.0001, "num_input_tokens_seen": 145396480, "step": 67405 }, { "epoch": 12.371077261882915, "grad_norm": 0.0001561089011374861, "learning_rate": 3.8150515369731e-06, "loss": 0.0001, "num_input_tokens_seen": 145406880, "step": 67410 }, { "epoch": 12.371994861442467, "grad_norm": 0.00016604014672338963, "learning_rate": 3.814273607320564e-06, "loss": 0.0, "num_input_tokens_seen": 145418080, "step": 67415 }, { "epoch": 12.372912461002018, "grad_norm": 0.0003900544252246618, "learning_rate": 3.813495708080074e-06, "loss": 0.0, "num_input_tokens_seen": 145428416, "step": 67420 }, { "epoch": 12.373830060561572, "grad_norm": 0.0007405122159980237, "learning_rate": 3.8127178392715795e-06, "loss": 0.0532, "num_input_tokens_seen": 145439136, "step": 67425 }, { "epoch": 12.374747660121123, "grad_norm": 0.0006508594960905612, "learning_rate": 3.8119400009150308e-06, "loss": 0.0001, "num_input_tokens_seen": 145449728, "step": 67430 }, { "epoch": 12.375665259680675, "grad_norm": 0.05289441719651222, "learning_rate": 3.811162193030382e-06, "loss": 0.0, "num_input_tokens_seen": 145460768, "step": 67435 }, { "epoch": 12.376582859240228, "grad_norm": 0.001120634377002716, "learning_rate": 3.81038441563758e-06, "loss": 0.0, "num_input_tokens_seen": 145470752, "step": 67440 }, { "epoch": 12.37750045879978, "grad_norm": 0.0002921240811701864, "learning_rate": 3.809606668756574e-06, "loss": 0.0, "num_input_tokens_seen": 145480768, "step": 67445 }, { "epoch": 12.378418058359332, "grad_norm": 0.010747913271188736, "learning_rate": 3.80882895240731e-06, "loss": 0.0001, "num_input_tokens_seen": 145492096, "step": 67450 }, { "epoch": 12.379335657918885, "grad_norm": 0.00010966354602715, "learning_rate": 3.8080512666097393e-06, "loss": 0.0003, "num_input_tokens_seen": 145503072, "step": 67455 }, { "epoch": 12.380253257478437, "grad_norm": 0.0007337249116972089, "learning_rate": 3.8072736113838068e-06, "loss": 0.0, "num_input_tokens_seen": 145514048, "step": 67460 }, { "epoch": 12.381170857037988, "grad_norm": 0.0010169746819883585, "learning_rate": 3.8064959867494543e-06, "loss": 0.0, "num_input_tokens_seen": 145525440, "step": 67465 }, { "epoch": 12.382088456597542, "grad_norm": 0.02581070549786091, "learning_rate": 3.805718392726633e-06, "loss": 0.0, "num_input_tokens_seen": 145535744, "step": 67470 }, { "epoch": 12.383006056157093, "grad_norm": 0.0008445329149253666, "learning_rate": 3.804940829335284e-06, "loss": 0.0, "num_input_tokens_seen": 145544704, "step": 67475 }, { "epoch": 12.383923655716645, "grad_norm": 0.007666710764169693, "learning_rate": 3.80416329659535e-06, "loss": 0.0001, "num_input_tokens_seen": 145555264, "step": 67480 }, { "epoch": 12.384841255276198, "grad_norm": 0.0003364894655533135, "learning_rate": 3.8033857945267726e-06, "loss": 0.0, "num_input_tokens_seen": 145565888, "step": 67485 }, { "epoch": 12.38575885483575, "grad_norm": 0.00020716416474897414, "learning_rate": 3.802608323149497e-06, "loss": 0.0762, "num_input_tokens_seen": 145576800, "step": 67490 }, { "epoch": 12.386676454395301, "grad_norm": 0.00035454623866826296, "learning_rate": 3.8018308824834616e-06, "loss": 0.0, "num_input_tokens_seen": 145587520, "step": 67495 }, { "epoch": 12.387594053954855, "grad_norm": 0.0004919788334518671, "learning_rate": 3.801053472548605e-06, "loss": 0.0, "num_input_tokens_seen": 145597376, "step": 67500 }, { "epoch": 12.388511653514406, "grad_norm": 0.00028010315145365894, "learning_rate": 3.800276093364871e-06, "loss": 0.0, "num_input_tokens_seen": 145607744, "step": 67505 }, { "epoch": 12.389429253073958, "grad_norm": 0.010579828172922134, "learning_rate": 3.799498744952196e-06, "loss": 0.1969, "num_input_tokens_seen": 145618400, "step": 67510 }, { "epoch": 12.390346852633511, "grad_norm": 0.004666656721383333, "learning_rate": 3.798721427330516e-06, "loss": 0.0, "num_input_tokens_seen": 145629024, "step": 67515 }, { "epoch": 12.391264452193063, "grad_norm": 0.00018781617109198123, "learning_rate": 3.7979441405197713e-06, "loss": 0.119, "num_input_tokens_seen": 145640576, "step": 67520 }, { "epoch": 12.392182051752615, "grad_norm": 0.11936662346124649, "learning_rate": 3.797166884539895e-06, "loss": 0.0001, "num_input_tokens_seen": 145650048, "step": 67525 }, { "epoch": 12.393099651312168, "grad_norm": 0.000882372260093689, "learning_rate": 3.796389659410825e-06, "loss": 0.0, "num_input_tokens_seen": 145660512, "step": 67530 }, { "epoch": 12.39401725087172, "grad_norm": 0.0027412259951233864, "learning_rate": 3.7956124651524934e-06, "loss": 0.0, "num_input_tokens_seen": 145670464, "step": 67535 }, { "epoch": 12.394934850431271, "grad_norm": 0.0016721958527341485, "learning_rate": 3.794835301784837e-06, "loss": 0.0009, "num_input_tokens_seen": 145679840, "step": 67540 }, { "epoch": 12.395852449990825, "grad_norm": 0.0038968119770288467, "learning_rate": 3.7940581693277865e-06, "loss": 0.0001, "num_input_tokens_seen": 145690944, "step": 67545 }, { "epoch": 12.396770049550376, "grad_norm": 0.011293867602944374, "learning_rate": 3.793281067801274e-06, "loss": 0.0, "num_input_tokens_seen": 145701952, "step": 67550 }, { "epoch": 12.397687649109928, "grad_norm": 0.004940096288919449, "learning_rate": 3.792503997225233e-06, "loss": 0.0, "num_input_tokens_seen": 145712256, "step": 67555 }, { "epoch": 12.398605248669481, "grad_norm": 0.0004884297377429903, "learning_rate": 3.7917269576195935e-06, "loss": 0.2844, "num_input_tokens_seen": 145723392, "step": 67560 }, { "epoch": 12.399522848229033, "grad_norm": 0.0009475736878812313, "learning_rate": 3.7909499490042824e-06, "loss": 0.0, "num_input_tokens_seen": 145734208, "step": 67565 }, { "epoch": 12.400440447788585, "grad_norm": 0.0009468237403780222, "learning_rate": 3.7901729713992338e-06, "loss": 0.0, "num_input_tokens_seen": 145744896, "step": 67570 }, { "epoch": 12.401358047348138, "grad_norm": 0.0002578280109446496, "learning_rate": 3.789396024824373e-06, "loss": 0.0, "num_input_tokens_seen": 145755040, "step": 67575 }, { "epoch": 12.40227564690769, "grad_norm": 0.04980717599391937, "learning_rate": 3.7886191092996284e-06, "loss": 0.0001, "num_input_tokens_seen": 145766720, "step": 67580 }, { "epoch": 12.403193246467241, "grad_norm": 0.0001932811428559944, "learning_rate": 3.7878422248449237e-06, "loss": 0.2844, "num_input_tokens_seen": 145777792, "step": 67585 }, { "epoch": 12.404110846026795, "grad_norm": 0.0007902801735326648, "learning_rate": 3.7870653714801897e-06, "loss": 0.0005, "num_input_tokens_seen": 145789344, "step": 67590 }, { "epoch": 12.405028445586346, "grad_norm": 0.08371196687221527, "learning_rate": 3.786288549225349e-06, "loss": 0.0001, "num_input_tokens_seen": 145800064, "step": 67595 }, { "epoch": 12.405946045145898, "grad_norm": 0.0011920240940526128, "learning_rate": 3.785511758100323e-06, "loss": 0.001, "num_input_tokens_seen": 145809504, "step": 67600 }, { "epoch": 12.406863644705451, "grad_norm": 0.0006983862840570509, "learning_rate": 3.7847349981250408e-06, "loss": 0.0, "num_input_tokens_seen": 145819712, "step": 67605 }, { "epoch": 12.407781244265003, "grad_norm": 0.00031329193734563887, "learning_rate": 3.7839582693194227e-06, "loss": 0.0, "num_input_tokens_seen": 145830176, "step": 67610 }, { "epoch": 12.408698843824554, "grad_norm": 0.002453291555866599, "learning_rate": 3.78318157170339e-06, "loss": 0.0, "num_input_tokens_seen": 145841376, "step": 67615 }, { "epoch": 12.409616443384108, "grad_norm": 0.003008894622325897, "learning_rate": 3.7824049052968625e-06, "loss": 0.0003, "num_input_tokens_seen": 145852928, "step": 67620 }, { "epoch": 12.41053404294366, "grad_norm": 0.035758115351200104, "learning_rate": 3.781628270119763e-06, "loss": 0.0002, "num_input_tokens_seen": 145863648, "step": 67625 }, { "epoch": 12.411451642503211, "grad_norm": 0.001902711228467524, "learning_rate": 3.780851666192011e-06, "loss": 0.0, "num_input_tokens_seen": 145872544, "step": 67630 }, { "epoch": 12.412369242062764, "grad_norm": 0.009999926201999187, "learning_rate": 3.780075093533523e-06, "loss": 0.0, "num_input_tokens_seen": 145882432, "step": 67635 }, { "epoch": 12.413286841622316, "grad_norm": 0.012217207811772823, "learning_rate": 3.7792985521642187e-06, "loss": 0.0, "num_input_tokens_seen": 145892512, "step": 67640 }, { "epoch": 12.414204441181868, "grad_norm": 0.0005017432267777622, "learning_rate": 3.7785220421040153e-06, "loss": 0.0, "num_input_tokens_seen": 145903712, "step": 67645 }, { "epoch": 12.415122040741421, "grad_norm": 0.0003329915925860405, "learning_rate": 3.777745563372826e-06, "loss": 0.0001, "num_input_tokens_seen": 145913984, "step": 67650 }, { "epoch": 12.416039640300973, "grad_norm": 0.0002602466265670955, "learning_rate": 3.7769691159905706e-06, "loss": 0.0001, "num_input_tokens_seen": 145925824, "step": 67655 }, { "epoch": 12.416957239860524, "grad_norm": 0.00036445254227146506, "learning_rate": 3.7761926999771624e-06, "loss": 0.0, "num_input_tokens_seen": 145936864, "step": 67660 }, { "epoch": 12.417874839420078, "grad_norm": 0.0019570717122405767, "learning_rate": 3.7754163153525147e-06, "loss": 0.0001, "num_input_tokens_seen": 145947328, "step": 67665 }, { "epoch": 12.41879243897963, "grad_norm": 0.0002872916520573199, "learning_rate": 3.774639962136538e-06, "loss": 0.1719, "num_input_tokens_seen": 145957376, "step": 67670 }, { "epoch": 12.419710038539181, "grad_norm": 0.00013286697503644973, "learning_rate": 3.773863640349149e-06, "loss": 0.0005, "num_input_tokens_seen": 145968352, "step": 67675 }, { "epoch": 12.420627638098734, "grad_norm": 0.00042283200309611857, "learning_rate": 3.7730873500102584e-06, "loss": 0.0, "num_input_tokens_seen": 145979904, "step": 67680 }, { "epoch": 12.421545237658286, "grad_norm": 0.0007529548020102084, "learning_rate": 3.7723110911397727e-06, "loss": 0.0, "num_input_tokens_seen": 145990976, "step": 67685 }, { "epoch": 12.422462837217838, "grad_norm": 3.003878355026245, "learning_rate": 3.771534863757607e-06, "loss": 0.0011, "num_input_tokens_seen": 146000000, "step": 67690 }, { "epoch": 12.423380436777391, "grad_norm": 0.005514535121619701, "learning_rate": 3.7707586678836685e-06, "loss": 0.0, "num_input_tokens_seen": 146011008, "step": 67695 }, { "epoch": 12.424298036336943, "grad_norm": 0.00028632424073293805, "learning_rate": 3.769982503537862e-06, "loss": 0.0, "num_input_tokens_seen": 146021120, "step": 67700 }, { "epoch": 12.425215635896494, "grad_norm": 0.010988728143274784, "learning_rate": 3.7692063707401013e-06, "loss": 0.0, "num_input_tokens_seen": 146031648, "step": 67705 }, { "epoch": 12.426133235456048, "grad_norm": 0.0003952432598453015, "learning_rate": 3.768430269510289e-06, "loss": 0.0, "num_input_tokens_seen": 146042816, "step": 67710 }, { "epoch": 12.4270508350156, "grad_norm": 0.0011722970521077514, "learning_rate": 3.7676541998683315e-06, "loss": 0.0, "num_input_tokens_seen": 146052288, "step": 67715 }, { "epoch": 12.42796843457515, "grad_norm": 0.00024615071015432477, "learning_rate": 3.7668781618341315e-06, "loss": 0.0001, "num_input_tokens_seen": 146061920, "step": 67720 }, { "epoch": 12.428886034134704, "grad_norm": 0.0004677055694628507, "learning_rate": 3.766102155427598e-06, "loss": 0.0, "num_input_tokens_seen": 146072992, "step": 67725 }, { "epoch": 12.429803633694256, "grad_norm": 0.002280458575114608, "learning_rate": 3.7653261806686316e-06, "loss": 0.0, "num_input_tokens_seen": 146084192, "step": 67730 }, { "epoch": 12.430721233253808, "grad_norm": 0.00020901617244817317, "learning_rate": 3.7645502375771337e-06, "loss": 0.0008, "num_input_tokens_seen": 146095392, "step": 67735 }, { "epoch": 12.431638832813361, "grad_norm": 0.010883983224630356, "learning_rate": 3.763774326173009e-06, "loss": 0.0, "num_input_tokens_seen": 146105600, "step": 67740 }, { "epoch": 12.432556432372913, "grad_norm": 0.00024655283777974546, "learning_rate": 3.762998446476156e-06, "loss": 0.0, "num_input_tokens_seen": 146115648, "step": 67745 }, { "epoch": 12.433474031932464, "grad_norm": 0.0009217541082762182, "learning_rate": 3.7622225985064763e-06, "loss": 0.0, "num_input_tokens_seen": 146125568, "step": 67750 }, { "epoch": 12.434391631492018, "grad_norm": 0.00024926746846176684, "learning_rate": 3.761446782283867e-06, "loss": 0.0002, "num_input_tokens_seen": 146135264, "step": 67755 }, { "epoch": 12.43530923105157, "grad_norm": 0.00033303562668152153, "learning_rate": 3.76067099782823e-06, "loss": 0.0, "num_input_tokens_seen": 146144448, "step": 67760 }, { "epoch": 12.43622683061112, "grad_norm": 0.0006495064008049667, "learning_rate": 3.759895245159461e-06, "loss": 0.0, "num_input_tokens_seen": 146154560, "step": 67765 }, { "epoch": 12.437144430170674, "grad_norm": 0.0007404009229503572, "learning_rate": 3.759119524297455e-06, "loss": 0.0, "num_input_tokens_seen": 146166528, "step": 67770 }, { "epoch": 12.438062029730226, "grad_norm": 0.00019028587848879397, "learning_rate": 3.758343835262112e-06, "loss": 0.0, "num_input_tokens_seen": 146176480, "step": 67775 }, { "epoch": 12.438979629289777, "grad_norm": 0.0004217959358356893, "learning_rate": 3.7575681780733253e-06, "loss": 0.0, "num_input_tokens_seen": 146187616, "step": 67780 }, { "epoch": 12.43989722884933, "grad_norm": 0.0018531654495745897, "learning_rate": 3.756792552750987e-06, "loss": 0.0, "num_input_tokens_seen": 146198464, "step": 67785 }, { "epoch": 12.440814828408882, "grad_norm": 0.0006473429384641349, "learning_rate": 3.756016959314995e-06, "loss": 0.0, "num_input_tokens_seen": 146210240, "step": 67790 }, { "epoch": 12.441732427968434, "grad_norm": 0.0006533985724672675, "learning_rate": 3.75524139778524e-06, "loss": 0.0, "num_input_tokens_seen": 146219840, "step": 67795 }, { "epoch": 12.442650027527987, "grad_norm": 0.05168839544057846, "learning_rate": 3.7544658681816137e-06, "loss": 0.0, "num_input_tokens_seen": 146230688, "step": 67800 }, { "epoch": 12.443567627087539, "grad_norm": 0.00047074430040083826, "learning_rate": 3.7536903705240057e-06, "loss": 0.1938, "num_input_tokens_seen": 146241824, "step": 67805 }, { "epoch": 12.44448522664709, "grad_norm": 0.00019866734510287642, "learning_rate": 3.7529149048323098e-06, "loss": 0.0, "num_input_tokens_seen": 146252224, "step": 67810 }, { "epoch": 12.445402826206644, "grad_norm": 0.00039243747596628964, "learning_rate": 3.7521394711264148e-06, "loss": 0.0, "num_input_tokens_seen": 146263136, "step": 67815 }, { "epoch": 12.446320425766196, "grad_norm": 0.0006045978516340256, "learning_rate": 3.7513640694262056e-06, "loss": 0.0, "num_input_tokens_seen": 146274112, "step": 67820 }, { "epoch": 12.447238025325747, "grad_norm": 0.07044526189565659, "learning_rate": 3.7505886997515744e-06, "loss": 0.056, "num_input_tokens_seen": 146284672, "step": 67825 }, { "epoch": 12.4481556248853, "grad_norm": 0.00024113536346703768, "learning_rate": 3.7498133621224073e-06, "loss": 0.0, "num_input_tokens_seen": 146295968, "step": 67830 }, { "epoch": 12.449073224444852, "grad_norm": 0.0007111643208190799, "learning_rate": 3.7490380565585886e-06, "loss": 0.0001, "num_input_tokens_seen": 146306816, "step": 67835 }, { "epoch": 12.449990824004404, "grad_norm": 0.0013175944332033396, "learning_rate": 3.748262783080006e-06, "loss": 0.0, "num_input_tokens_seen": 146317504, "step": 67840 }, { "epoch": 12.450908423563957, "grad_norm": 0.0005103955045342445, "learning_rate": 3.747487541706542e-06, "loss": 0.0, "num_input_tokens_seen": 146327520, "step": 67845 }, { "epoch": 12.451826023123509, "grad_norm": 0.001465840032324195, "learning_rate": 3.746712332458082e-06, "loss": 0.0, "num_input_tokens_seen": 146338240, "step": 67850 }, { "epoch": 12.45274362268306, "grad_norm": 0.0008282000781036913, "learning_rate": 3.7459371553545076e-06, "loss": 0.0762, "num_input_tokens_seen": 146349632, "step": 67855 }, { "epoch": 12.453661222242614, "grad_norm": 0.0003803060972131789, "learning_rate": 3.745162010415703e-06, "loss": 0.0001, "num_input_tokens_seen": 146358912, "step": 67860 }, { "epoch": 12.454578821802166, "grad_norm": 0.0010578089859336615, "learning_rate": 3.744386897661547e-06, "loss": 0.0, "num_input_tokens_seen": 146369760, "step": 67865 }, { "epoch": 12.455496421361717, "grad_norm": 0.0001777109719114378, "learning_rate": 3.7436118171119198e-06, "loss": 0.0, "num_input_tokens_seen": 146379968, "step": 67870 }, { "epoch": 12.45641402092127, "grad_norm": 0.0003844760940410197, "learning_rate": 3.742836768786704e-06, "loss": 0.0, "num_input_tokens_seen": 146390880, "step": 67875 }, { "epoch": 12.457331620480822, "grad_norm": 0.02050071768462658, "learning_rate": 3.742061752705777e-06, "loss": 0.0268, "num_input_tokens_seen": 146402368, "step": 67880 }, { "epoch": 12.458249220040374, "grad_norm": 0.0036174310371279716, "learning_rate": 3.7412867688890144e-06, "loss": 0.0001, "num_input_tokens_seen": 146413280, "step": 67885 }, { "epoch": 12.459166819599927, "grad_norm": 0.00045411832979880273, "learning_rate": 3.7405118173562964e-06, "loss": 0.0, "num_input_tokens_seen": 146424448, "step": 67890 }, { "epoch": 12.460084419159479, "grad_norm": 0.003778046229854226, "learning_rate": 3.7397368981274994e-06, "loss": 0.0007, "num_input_tokens_seen": 146434112, "step": 67895 }, { "epoch": 12.46100201871903, "grad_norm": 0.003570667002350092, "learning_rate": 3.7389620112224966e-06, "loss": 0.0, "num_input_tokens_seen": 146445824, "step": 67900 }, { "epoch": 12.461919618278584, "grad_norm": 0.0005401878152042627, "learning_rate": 3.7381871566611626e-06, "loss": 0.0, "num_input_tokens_seen": 146455680, "step": 67905 }, { "epoch": 12.462837217838135, "grad_norm": 0.00020021187083330005, "learning_rate": 3.7374123344633745e-06, "loss": 0.0007, "num_input_tokens_seen": 146466944, "step": 67910 }, { "epoch": 12.463754817397687, "grad_norm": 0.004526499658823013, "learning_rate": 3.7366375446490028e-06, "loss": 0.002, "num_input_tokens_seen": 146478176, "step": 67915 }, { "epoch": 12.46467241695724, "grad_norm": 0.00093853595899418, "learning_rate": 3.7358627872379183e-06, "loss": 0.1557, "num_input_tokens_seen": 146488864, "step": 67920 }, { "epoch": 12.465590016516792, "grad_norm": 0.002529183169826865, "learning_rate": 3.7350880622499956e-06, "loss": 0.0, "num_input_tokens_seen": 146499968, "step": 67925 }, { "epoch": 12.466507616076344, "grad_norm": 0.0016169283771887422, "learning_rate": 3.734313369705104e-06, "loss": 0.0, "num_input_tokens_seen": 146510944, "step": 67930 }, { "epoch": 12.467425215635897, "grad_norm": 0.0002060256665572524, "learning_rate": 3.733538709623113e-06, "loss": 0.0, "num_input_tokens_seen": 146520928, "step": 67935 }, { "epoch": 12.468342815195449, "grad_norm": 0.0006485924241133034, "learning_rate": 3.7327640820238893e-06, "loss": 0.0, "num_input_tokens_seen": 146532832, "step": 67940 }, { "epoch": 12.469260414755, "grad_norm": 0.00029779699980281293, "learning_rate": 3.7319894869273043e-06, "loss": 0.0, "num_input_tokens_seen": 146544352, "step": 67945 }, { "epoch": 12.470178014314554, "grad_norm": 0.001027559512294829, "learning_rate": 3.731214924353224e-06, "loss": 0.0, "num_input_tokens_seen": 146555072, "step": 67950 }, { "epoch": 12.471095613874105, "grad_norm": 0.0004985338891856372, "learning_rate": 3.730440394321514e-06, "loss": 0.0, "num_input_tokens_seen": 146566016, "step": 67955 }, { "epoch": 12.472013213433657, "grad_norm": 0.00040879874723032117, "learning_rate": 3.7296658968520406e-06, "loss": 0.0, "num_input_tokens_seen": 146577088, "step": 67960 }, { "epoch": 12.47293081299321, "grad_norm": 0.0008972770883701742, "learning_rate": 3.7288914319646684e-06, "loss": 0.0015, "num_input_tokens_seen": 146586624, "step": 67965 }, { "epoch": 12.473848412552762, "grad_norm": 0.00029792904388159513, "learning_rate": 3.728116999679259e-06, "loss": 0.0, "num_input_tokens_seen": 146597504, "step": 67970 }, { "epoch": 12.474766012112314, "grad_norm": 0.0006297903601080179, "learning_rate": 3.727342600015679e-06, "loss": 0.0, "num_input_tokens_seen": 146606976, "step": 67975 }, { "epoch": 12.475683611671867, "grad_norm": 0.00016199306992348284, "learning_rate": 3.726568232993789e-06, "loss": 0.0002, "num_input_tokens_seen": 146618272, "step": 67980 }, { "epoch": 12.476601211231419, "grad_norm": 0.00130001874640584, "learning_rate": 3.72579389863345e-06, "loss": 0.0, "num_input_tokens_seen": 146628608, "step": 67985 }, { "epoch": 12.47751881079097, "grad_norm": 0.0008925769361667335, "learning_rate": 3.7250195969545213e-06, "loss": 0.0, "num_input_tokens_seen": 146639936, "step": 67990 }, { "epoch": 12.478436410350524, "grad_norm": 0.0006713999900966883, "learning_rate": 3.724245327976865e-06, "loss": 0.0, "num_input_tokens_seen": 146651904, "step": 67995 }, { "epoch": 12.479354009910075, "grad_norm": 0.0009227875270880759, "learning_rate": 3.7234710917203387e-06, "loss": 0.0, "num_input_tokens_seen": 146662624, "step": 68000 }, { "epoch": 12.480271609469627, "grad_norm": 0.0008793533779680729, "learning_rate": 3.722696888204799e-06, "loss": 0.0, "num_input_tokens_seen": 146673376, "step": 68005 }, { "epoch": 12.48118920902918, "grad_norm": 0.00017389492131769657, "learning_rate": 3.721922717450107e-06, "loss": 0.0004, "num_input_tokens_seen": 146683936, "step": 68010 }, { "epoch": 12.482106808588732, "grad_norm": 0.001641047652810812, "learning_rate": 3.721148579476116e-06, "loss": 0.0001, "num_input_tokens_seen": 146694816, "step": 68015 }, { "epoch": 12.483024408148284, "grad_norm": 0.00028692869818769395, "learning_rate": 3.720374474302679e-06, "loss": 0.0, "num_input_tokens_seen": 146704512, "step": 68020 }, { "epoch": 12.483942007707837, "grad_norm": 0.0003741731634363532, "learning_rate": 3.719600401949657e-06, "loss": 0.0, "num_input_tokens_seen": 146714688, "step": 68025 }, { "epoch": 12.484859607267389, "grad_norm": 0.0005116635002195835, "learning_rate": 3.718826362436899e-06, "loss": 0.0, "num_input_tokens_seen": 146726176, "step": 68030 }, { "epoch": 12.48577720682694, "grad_norm": 0.00021747636492364109, "learning_rate": 3.7180523557842603e-06, "loss": 0.0, "num_input_tokens_seen": 146735872, "step": 68035 }, { "epoch": 12.486694806386494, "grad_norm": 0.011943754740059376, "learning_rate": 3.717278382011589e-06, "loss": 0.0, "num_input_tokens_seen": 146745568, "step": 68040 }, { "epoch": 12.487612405946045, "grad_norm": 0.0002752227592281997, "learning_rate": 3.7165044411387417e-06, "loss": 0.0, "num_input_tokens_seen": 146756672, "step": 68045 }, { "epoch": 12.488530005505597, "grad_norm": 0.005507034715265036, "learning_rate": 3.715730533185566e-06, "loss": 0.0, "num_input_tokens_seen": 146766976, "step": 68050 }, { "epoch": 12.48944760506515, "grad_norm": 0.00017031459719873965, "learning_rate": 3.7149566581719105e-06, "loss": 0.0, "num_input_tokens_seen": 146778080, "step": 68055 }, { "epoch": 12.490365204624702, "grad_norm": 0.00041094201151281595, "learning_rate": 3.7141828161176265e-06, "loss": 0.0, "num_input_tokens_seen": 146787232, "step": 68060 }, { "epoch": 12.491282804184253, "grad_norm": 0.00022739512496627867, "learning_rate": 3.713409007042559e-06, "loss": 0.0, "num_input_tokens_seen": 146797856, "step": 68065 }, { "epoch": 12.492200403743807, "grad_norm": 0.002696293406188488, "learning_rate": 3.7126352309665577e-06, "loss": 0.0001, "num_input_tokens_seen": 146810432, "step": 68070 }, { "epoch": 12.493118003303358, "grad_norm": 0.0008489429019391537, "learning_rate": 3.711861487909466e-06, "loss": 0.0, "num_input_tokens_seen": 146820992, "step": 68075 }, { "epoch": 12.49403560286291, "grad_norm": 0.00046373094664886594, "learning_rate": 3.7110877778911324e-06, "loss": 0.0001, "num_input_tokens_seen": 146831360, "step": 68080 }, { "epoch": 12.494953202422463, "grad_norm": 0.003278193762525916, "learning_rate": 3.7103141009313993e-06, "loss": 0.0, "num_input_tokens_seen": 146841312, "step": 68085 }, { "epoch": 12.495870801982015, "grad_norm": 0.0003897347196470946, "learning_rate": 3.7095404570501087e-06, "loss": 0.0, "num_input_tokens_seen": 146852256, "step": 68090 }, { "epoch": 12.496788401541567, "grad_norm": 0.0003215594042558223, "learning_rate": 3.7087668462671074e-06, "loss": 0.0, "num_input_tokens_seen": 146864928, "step": 68095 }, { "epoch": 12.49770600110112, "grad_norm": 0.0003856861439999193, "learning_rate": 3.7079932686022354e-06, "loss": 0.0, "num_input_tokens_seen": 146874240, "step": 68100 }, { "epoch": 12.498623600660672, "grad_norm": 0.000222373433643952, "learning_rate": 3.7072197240753315e-06, "loss": 0.0, "num_input_tokens_seen": 146884320, "step": 68105 }, { "epoch": 12.499541200220223, "grad_norm": 0.0001664185692789033, "learning_rate": 3.7064462127062397e-06, "loss": 0.0, "num_input_tokens_seen": 146894944, "step": 68110 }, { "epoch": 12.500458799779777, "grad_norm": 0.0001852384302765131, "learning_rate": 3.705672734514798e-06, "loss": 0.0, "num_input_tokens_seen": 146904992, "step": 68115 }, { "epoch": 12.501376399339328, "grad_norm": 0.00046768790343776345, "learning_rate": 3.7048992895208445e-06, "loss": 0.0, "num_input_tokens_seen": 146916448, "step": 68120 }, { "epoch": 12.50229399889888, "grad_norm": 69.14262390136719, "learning_rate": 3.704125877744216e-06, "loss": 0.2281, "num_input_tokens_seen": 146928064, "step": 68125 }, { "epoch": 12.503211598458433, "grad_norm": 0.00022912290296517313, "learning_rate": 3.703352499204751e-06, "loss": 0.0, "num_input_tokens_seen": 146940064, "step": 68130 }, { "epoch": 12.504129198017985, "grad_norm": 0.0002688850509002805, "learning_rate": 3.7025791539222855e-06, "loss": 0.0, "num_input_tokens_seen": 146951072, "step": 68135 }, { "epoch": 12.505046797577537, "grad_norm": 0.0005418984219431877, "learning_rate": 3.701805841916651e-06, "loss": 0.0001, "num_input_tokens_seen": 146962112, "step": 68140 }, { "epoch": 12.50596439713709, "grad_norm": 0.0011290935799479485, "learning_rate": 3.7010325632076877e-06, "loss": 0.0004, "num_input_tokens_seen": 146973120, "step": 68145 }, { "epoch": 12.506881996696642, "grad_norm": 0.0006819161935709417, "learning_rate": 3.7002593178152257e-06, "loss": 0.0, "num_input_tokens_seen": 146984928, "step": 68150 }, { "epoch": 12.507799596256193, "grad_norm": 0.0007995701162144542, "learning_rate": 3.6994861057590963e-06, "loss": 0.0763, "num_input_tokens_seen": 146996192, "step": 68155 }, { "epoch": 12.508717195815747, "grad_norm": 0.004086831584572792, "learning_rate": 3.6987129270591337e-06, "loss": 0.0, "num_input_tokens_seen": 147006592, "step": 68160 }, { "epoch": 12.509634795375298, "grad_norm": 18.88407325744629, "learning_rate": 3.697939781735167e-06, "loss": 0.0119, "num_input_tokens_seen": 147017536, "step": 68165 }, { "epoch": 12.51055239493485, "grad_norm": 0.14577873051166534, "learning_rate": 3.6971666698070285e-06, "loss": 0.0001, "num_input_tokens_seen": 147027808, "step": 68170 }, { "epoch": 12.511469994494403, "grad_norm": 0.010658633895218372, "learning_rate": 3.696393591294544e-06, "loss": 0.0, "num_input_tokens_seen": 147037952, "step": 68175 }, { "epoch": 12.512387594053955, "grad_norm": 0.0011916907969862223, "learning_rate": 3.6956205462175443e-06, "loss": 0.0, "num_input_tokens_seen": 147049504, "step": 68180 }, { "epoch": 12.513305193613506, "grad_norm": 0.017732398584485054, "learning_rate": 3.694847534595857e-06, "loss": 0.0, "num_input_tokens_seen": 147058784, "step": 68185 }, { "epoch": 12.51422279317306, "grad_norm": 0.0019770509097725153, "learning_rate": 3.6940745564493043e-06, "loss": 0.0974, "num_input_tokens_seen": 147068704, "step": 68190 }, { "epoch": 12.515140392732611, "grad_norm": 0.002042395994067192, "learning_rate": 3.6933016117977177e-06, "loss": 0.0014, "num_input_tokens_seen": 147078944, "step": 68195 }, { "epoch": 12.516057992292163, "grad_norm": 0.00013283832231536508, "learning_rate": 3.69252870066092e-06, "loss": 0.0246, "num_input_tokens_seen": 147089120, "step": 68200 }, { "epoch": 12.516975591851716, "grad_norm": 0.0004909639828838408, "learning_rate": 3.691755823058734e-06, "loss": 0.0, "num_input_tokens_seen": 147100608, "step": 68205 }, { "epoch": 12.517893191411268, "grad_norm": 0.00018351497419644147, "learning_rate": 3.690982979010982e-06, "loss": 0.0, "num_input_tokens_seen": 147111136, "step": 68210 }, { "epoch": 12.51881079097082, "grad_norm": 0.0036502699367702007, "learning_rate": 3.6902101685374897e-06, "loss": 0.0, "num_input_tokens_seen": 147122144, "step": 68215 }, { "epoch": 12.519728390530373, "grad_norm": 0.0002268206444568932, "learning_rate": 3.689437391658077e-06, "loss": 0.0001, "num_input_tokens_seen": 147132864, "step": 68220 }, { "epoch": 12.520645990089925, "grad_norm": 0.17097030580043793, "learning_rate": 3.6886646483925613e-06, "loss": 0.0001, "num_input_tokens_seen": 147142976, "step": 68225 }, { "epoch": 12.521563589649476, "grad_norm": 0.0010979565558955073, "learning_rate": 3.687891938760767e-06, "loss": 0.0, "num_input_tokens_seen": 147152736, "step": 68230 }, { "epoch": 12.52248118920903, "grad_norm": 0.003371741157025099, "learning_rate": 3.6871192627825115e-06, "loss": 0.0, "num_input_tokens_seen": 147163712, "step": 68235 }, { "epoch": 12.523398788768581, "grad_norm": 0.00015020319551695138, "learning_rate": 3.68634662047761e-06, "loss": 0.0, "num_input_tokens_seen": 147174112, "step": 68240 }, { "epoch": 12.524316388328133, "grad_norm": 0.0002668405941221863, "learning_rate": 3.6855740118658834e-06, "loss": 0.0, "num_input_tokens_seen": 147185376, "step": 68245 }, { "epoch": 12.525233987887686, "grad_norm": 0.002990458160638809, "learning_rate": 3.6848014369671464e-06, "loss": 0.0056, "num_input_tokens_seen": 147196064, "step": 68250 }, { "epoch": 12.526151587447238, "grad_norm": 0.0004717234696727246, "learning_rate": 3.6840288958012137e-06, "loss": 0.0, "num_input_tokens_seen": 147206976, "step": 68255 }, { "epoch": 12.52706918700679, "grad_norm": 0.0003336250374559313, "learning_rate": 3.6832563883878993e-06, "loss": 0.0, "num_input_tokens_seen": 147218656, "step": 68260 }, { "epoch": 12.527986786566343, "grad_norm": 0.0003443045716267079, "learning_rate": 3.6824839147470193e-06, "loss": 0.0001, "num_input_tokens_seen": 147229824, "step": 68265 }, { "epoch": 12.528904386125895, "grad_norm": 0.00027177867013961077, "learning_rate": 3.6817114748983845e-06, "loss": 0.0, "num_input_tokens_seen": 147241344, "step": 68270 }, { "epoch": 12.529821985685446, "grad_norm": 0.00014771049609407783, "learning_rate": 3.680939068861806e-06, "loss": 0.0, "num_input_tokens_seen": 147252832, "step": 68275 }, { "epoch": 12.530739585245, "grad_norm": 287.419921875, "learning_rate": 3.6801666966570983e-06, "loss": 0.2031, "num_input_tokens_seen": 147262688, "step": 68280 }, { "epoch": 12.531657184804551, "grad_norm": 0.003772973082959652, "learning_rate": 3.679394358304067e-06, "loss": 0.0, "num_input_tokens_seen": 147272512, "step": 68285 }, { "epoch": 12.532574784364103, "grad_norm": 0.7219280004501343, "learning_rate": 3.678622053822524e-06, "loss": 0.0029, "num_input_tokens_seen": 147282496, "step": 68290 }, { "epoch": 12.533492383923656, "grad_norm": 0.0017794248415157199, "learning_rate": 3.677849783232279e-06, "loss": 0.0, "num_input_tokens_seen": 147293056, "step": 68295 }, { "epoch": 12.534409983483208, "grad_norm": 0.0013423989294096828, "learning_rate": 3.677077546553137e-06, "loss": 0.0002, "num_input_tokens_seen": 147303712, "step": 68300 }, { "epoch": 12.53532758304276, "grad_norm": 0.0028729215264320374, "learning_rate": 3.676305343804906e-06, "loss": 0.0, "num_input_tokens_seen": 147314208, "step": 68305 }, { "epoch": 12.536245182602313, "grad_norm": 0.01394700538367033, "learning_rate": 3.67553317500739e-06, "loss": 0.0, "num_input_tokens_seen": 147324736, "step": 68310 }, { "epoch": 12.537162782161865, "grad_norm": 0.0002456751244608313, "learning_rate": 3.674761040180396e-06, "loss": 0.0001, "num_input_tokens_seen": 147334400, "step": 68315 }, { "epoch": 12.538080381721416, "grad_norm": 0.0003521849284879863, "learning_rate": 3.673988939343728e-06, "loss": 0.0, "num_input_tokens_seen": 147345280, "step": 68320 }, { "epoch": 12.53899798128097, "grad_norm": 0.0006895328988321126, "learning_rate": 3.673216872517187e-06, "loss": 0.0001, "num_input_tokens_seen": 147356960, "step": 68325 }, { "epoch": 12.539915580840521, "grad_norm": 0.0005497458623722196, "learning_rate": 3.6724448397205785e-06, "loss": 0.0001, "num_input_tokens_seen": 147367936, "step": 68330 }, { "epoch": 12.540833180400073, "grad_norm": 0.0001616523804841563, "learning_rate": 3.671672840973702e-06, "loss": 0.0, "num_input_tokens_seen": 147379104, "step": 68335 }, { "epoch": 12.541750779959626, "grad_norm": 0.0005752627039328218, "learning_rate": 3.670900876296357e-06, "loss": 0.0, "num_input_tokens_seen": 147390368, "step": 68340 }, { "epoch": 12.542668379519178, "grad_norm": 0.0013863231288269162, "learning_rate": 3.6701289457083466e-06, "loss": 0.0, "num_input_tokens_seen": 147400864, "step": 68345 }, { "epoch": 12.54358597907873, "grad_norm": 0.0022578744683414698, "learning_rate": 3.669357049229468e-06, "loss": 0.0, "num_input_tokens_seen": 147412128, "step": 68350 }, { "epoch": 12.544503578638283, "grad_norm": 0.003783176885917783, "learning_rate": 3.668585186879518e-06, "loss": 0.0097, "num_input_tokens_seen": 147422720, "step": 68355 }, { "epoch": 12.545421178197834, "grad_norm": 0.042580392211675644, "learning_rate": 3.6678133586782928e-06, "loss": 0.0883, "num_input_tokens_seen": 147432800, "step": 68360 }, { "epoch": 12.546338777757386, "grad_norm": 0.00043937325244769454, "learning_rate": 3.667041564645592e-06, "loss": 0.0, "num_input_tokens_seen": 147444352, "step": 68365 }, { "epoch": 12.54725637731694, "grad_norm": 0.00016572167805861682, "learning_rate": 3.666269804801209e-06, "loss": 0.0, "num_input_tokens_seen": 147454784, "step": 68370 }, { "epoch": 12.548173976876491, "grad_norm": 0.00014034325431566685, "learning_rate": 3.665498079164938e-06, "loss": 0.0, "num_input_tokens_seen": 147463968, "step": 68375 }, { "epoch": 12.549091576436043, "grad_norm": 0.00012315243657212704, "learning_rate": 3.6647263877565736e-06, "loss": 0.0, "num_input_tokens_seen": 147474240, "step": 68380 }, { "epoch": 12.550009175995596, "grad_norm": 0.0037730266340076923, "learning_rate": 3.6639547305959066e-06, "loss": 0.0885, "num_input_tokens_seen": 147484768, "step": 68385 }, { "epoch": 12.550926775555148, "grad_norm": 0.002796561224386096, "learning_rate": 3.6631831077027317e-06, "loss": 0.0, "num_input_tokens_seen": 147496704, "step": 68390 }, { "epoch": 12.5518443751147, "grad_norm": 0.0036604932975023985, "learning_rate": 3.662411519096837e-06, "loss": 0.0, "num_input_tokens_seen": 147507616, "step": 68395 }, { "epoch": 12.552761974674253, "grad_norm": 0.0023372629657387733, "learning_rate": 3.661639964798015e-06, "loss": 0.0, "num_input_tokens_seen": 147519456, "step": 68400 }, { "epoch": 12.553679574233804, "grad_norm": 2.415282726287842, "learning_rate": 3.6608684448260535e-06, "loss": 0.0219, "num_input_tokens_seen": 147529728, "step": 68405 }, { "epoch": 12.554597173793356, "grad_norm": 0.008476455695927143, "learning_rate": 3.6600969592007383e-06, "loss": 0.0, "num_input_tokens_seen": 147541120, "step": 68410 }, { "epoch": 12.55551477335291, "grad_norm": 0.00016593006148468703, "learning_rate": 3.6593255079418622e-06, "loss": 0.0001, "num_input_tokens_seen": 147551584, "step": 68415 }, { "epoch": 12.556432372912461, "grad_norm": 0.0006532096303999424, "learning_rate": 3.6585540910692085e-06, "loss": 0.0, "num_input_tokens_seen": 147562048, "step": 68420 }, { "epoch": 12.557349972472013, "grad_norm": 24.331823348999023, "learning_rate": 3.6577827086025613e-06, "loss": 0.0028, "num_input_tokens_seen": 147573984, "step": 68425 }, { "epoch": 12.558267572031566, "grad_norm": 0.0012338646920397878, "learning_rate": 3.6570113605617095e-06, "loss": 0.0, "num_input_tokens_seen": 147585728, "step": 68430 }, { "epoch": 12.559185171591118, "grad_norm": 0.0023202765733003616, "learning_rate": 3.656240046966435e-06, "loss": 0.0012, "num_input_tokens_seen": 147598016, "step": 68435 }, { "epoch": 12.56010277115067, "grad_norm": 5.190972328186035, "learning_rate": 3.6554687678365207e-06, "loss": 0.0061, "num_input_tokens_seen": 147608448, "step": 68440 }, { "epoch": 12.561020370710223, "grad_norm": 0.0025111024733632803, "learning_rate": 3.6546975231917466e-06, "loss": 0.0, "num_input_tokens_seen": 147618784, "step": 68445 }, { "epoch": 12.561937970269774, "grad_norm": 0.0002868910669349134, "learning_rate": 3.6539263130518976e-06, "loss": 0.0001, "num_input_tokens_seen": 147629056, "step": 68450 }, { "epoch": 12.562855569829326, "grad_norm": 0.004949303809553385, "learning_rate": 3.653155137436753e-06, "loss": 0.1647, "num_input_tokens_seen": 147639200, "step": 68455 }, { "epoch": 12.56377316938888, "grad_norm": 0.0012727798894047737, "learning_rate": 3.6523839963660897e-06, "loss": 0.0, "num_input_tokens_seen": 147649888, "step": 68460 }, { "epoch": 12.56469076894843, "grad_norm": 0.0004925590474158525, "learning_rate": 3.6516128898596897e-06, "loss": 0.0, "num_input_tokens_seen": 147661280, "step": 68465 }, { "epoch": 12.565608368507982, "grad_norm": 0.00016316268010996282, "learning_rate": 3.6508418179373294e-06, "loss": 0.0, "num_input_tokens_seen": 147672832, "step": 68470 }, { "epoch": 12.566525968067536, "grad_norm": 0.00017026555724442005, "learning_rate": 3.650070780618785e-06, "loss": 0.0, "num_input_tokens_seen": 147684512, "step": 68475 }, { "epoch": 12.567443567627087, "grad_norm": 0.0002428110601613298, "learning_rate": 3.6492997779238325e-06, "loss": 0.0001, "num_input_tokens_seen": 147695648, "step": 68480 }, { "epoch": 12.568361167186639, "grad_norm": 0.00022440370230469853, "learning_rate": 3.6485288098722484e-06, "loss": 0.0, "num_input_tokens_seen": 147706752, "step": 68485 }, { "epoch": 12.569278766746192, "grad_norm": 0.00300949951633811, "learning_rate": 3.6477578764838063e-06, "loss": 0.0, "num_input_tokens_seen": 147718624, "step": 68490 }, { "epoch": 12.570196366305744, "grad_norm": 0.00026832151343114674, "learning_rate": 3.646986977778277e-06, "loss": 0.0001, "num_input_tokens_seen": 147729952, "step": 68495 }, { "epoch": 12.571113965865296, "grad_norm": 0.0009414084488525987, "learning_rate": 3.646216113775436e-06, "loss": 0.1563, "num_input_tokens_seen": 147741376, "step": 68500 }, { "epoch": 12.57203156542485, "grad_norm": 0.00175037095323205, "learning_rate": 3.6454452844950533e-06, "loss": 0.0, "num_input_tokens_seen": 147753408, "step": 68505 }, { "epoch": 12.5729491649844, "grad_norm": 0.00022199020895641297, "learning_rate": 3.6446744899568975e-06, "loss": 0.0, "num_input_tokens_seen": 147764224, "step": 68510 }, { "epoch": 12.573866764543952, "grad_norm": 0.000418456009356305, "learning_rate": 3.6439037301807434e-06, "loss": 0.0, "num_input_tokens_seen": 147774208, "step": 68515 }, { "epoch": 12.574784364103506, "grad_norm": 0.00020963442511856556, "learning_rate": 3.6431330051863567e-06, "loss": 0.0004, "num_input_tokens_seen": 147784352, "step": 68520 }, { "epoch": 12.575701963663057, "grad_norm": 0.0002562803856562823, "learning_rate": 3.642362314993505e-06, "loss": 0.0, "num_input_tokens_seen": 147794624, "step": 68525 }, { "epoch": 12.576619563222609, "grad_norm": 0.0006427150801755488, "learning_rate": 3.6415916596219538e-06, "loss": 0.0007, "num_input_tokens_seen": 147805280, "step": 68530 }, { "epoch": 12.577537162782162, "grad_norm": 0.0005029040039516985, "learning_rate": 3.6408210390914733e-06, "loss": 0.0, "num_input_tokens_seen": 147815936, "step": 68535 }, { "epoch": 12.578454762341714, "grad_norm": 0.00029726579668931663, "learning_rate": 3.640050453421826e-06, "loss": 0.0, "num_input_tokens_seen": 147827872, "step": 68540 }, { "epoch": 12.579372361901266, "grad_norm": 0.002791742328554392, "learning_rate": 3.6392799026327757e-06, "loss": 0.0051, "num_input_tokens_seen": 147838720, "step": 68545 }, { "epoch": 12.580289961460819, "grad_norm": 0.0002957258257083595, "learning_rate": 3.638509386744089e-06, "loss": 0.0001, "num_input_tokens_seen": 147849216, "step": 68550 }, { "epoch": 12.58120756102037, "grad_norm": 0.015442523173987865, "learning_rate": 3.637738905775526e-06, "loss": 0.0001, "num_input_tokens_seen": 147860064, "step": 68555 }, { "epoch": 12.582125160579922, "grad_norm": 0.0004990937886759639, "learning_rate": 3.636968459746847e-06, "loss": 0.0, "num_input_tokens_seen": 147870848, "step": 68560 }, { "epoch": 12.583042760139476, "grad_norm": 0.0003173711011186242, "learning_rate": 3.636198048677816e-06, "loss": 0.0, "num_input_tokens_seen": 147881760, "step": 68565 }, { "epoch": 12.583960359699027, "grad_norm": 0.06496620923280716, "learning_rate": 3.635427672588192e-06, "loss": 0.0533, "num_input_tokens_seen": 147892192, "step": 68570 }, { "epoch": 12.584877959258579, "grad_norm": 0.0006505810306407511, "learning_rate": 3.634657331497733e-06, "loss": 0.001, "num_input_tokens_seen": 147903072, "step": 68575 }, { "epoch": 12.585795558818132, "grad_norm": 0.00191919703502208, "learning_rate": 3.6338870254261947e-06, "loss": 0.0063, "num_input_tokens_seen": 147915008, "step": 68580 }, { "epoch": 12.586713158377684, "grad_norm": 0.0001887911494122818, "learning_rate": 3.63311675439334e-06, "loss": 0.0, "num_input_tokens_seen": 147926336, "step": 68585 }, { "epoch": 12.587630757937236, "grad_norm": 0.0001296576374443248, "learning_rate": 3.6323465184189217e-06, "loss": 0.0, "num_input_tokens_seen": 147937568, "step": 68590 }, { "epoch": 12.588548357496789, "grad_norm": 0.0026726231444627047, "learning_rate": 3.6315763175226948e-06, "loss": 0.0001, "num_input_tokens_seen": 147947936, "step": 68595 }, { "epoch": 12.58946595705634, "grad_norm": 0.0014804574893787503, "learning_rate": 3.630806151724415e-06, "loss": 0.0244, "num_input_tokens_seen": 147958592, "step": 68600 }, { "epoch": 12.590383556615892, "grad_norm": 0.0008279439061880112, "learning_rate": 3.6300360210438345e-06, "loss": 0.0, "num_input_tokens_seen": 147969696, "step": 68605 }, { "epoch": 12.591301156175446, "grad_norm": 0.0012200154596939683, "learning_rate": 3.6292659255007067e-06, "loss": 0.0, "num_input_tokens_seen": 147981376, "step": 68610 }, { "epoch": 12.592218755734997, "grad_norm": 0.0035451515577733517, "learning_rate": 3.628495865114785e-06, "loss": 0.0, "num_input_tokens_seen": 147992640, "step": 68615 }, { "epoch": 12.593136355294549, "grad_norm": 7.926203727722168, "learning_rate": 3.627725839905818e-06, "loss": 0.0007, "num_input_tokens_seen": 148003040, "step": 68620 }, { "epoch": 12.594053954854102, "grad_norm": 0.020977606996893883, "learning_rate": 3.6269558498935562e-06, "loss": 0.0, "num_input_tokens_seen": 148012800, "step": 68625 }, { "epoch": 12.594971554413654, "grad_norm": 0.011125266551971436, "learning_rate": 3.626185895097748e-06, "loss": 0.0, "num_input_tokens_seen": 148023168, "step": 68630 }, { "epoch": 12.595889153973205, "grad_norm": 0.00427880696952343, "learning_rate": 3.6254159755381434e-06, "loss": 0.0008, "num_input_tokens_seen": 148032224, "step": 68635 }, { "epoch": 12.596806753532759, "grad_norm": 0.037235479801893234, "learning_rate": 3.6246460912344892e-06, "loss": 0.0001, "num_input_tokens_seen": 148043776, "step": 68640 }, { "epoch": 12.59772435309231, "grad_norm": 0.028607435524463654, "learning_rate": 3.623876242206529e-06, "loss": 0.0, "num_input_tokens_seen": 148054112, "step": 68645 }, { "epoch": 12.598641952651862, "grad_norm": 0.0006576360901817679, "learning_rate": 3.623106428474012e-06, "loss": 0.0, "num_input_tokens_seen": 148066144, "step": 68650 }, { "epoch": 12.599559552211415, "grad_norm": 0.0005223475163802505, "learning_rate": 3.6223366500566815e-06, "loss": 0.0, "num_input_tokens_seen": 148077728, "step": 68655 }, { "epoch": 12.600477151770967, "grad_norm": 0.006126856431365013, "learning_rate": 3.6215669069742804e-06, "loss": 0.0, "num_input_tokens_seen": 148086880, "step": 68660 }, { "epoch": 12.601394751330519, "grad_norm": 0.01316374447196722, "learning_rate": 3.6207971992465495e-06, "loss": 0.0, "num_input_tokens_seen": 148096608, "step": 68665 }, { "epoch": 12.602312350890072, "grad_norm": 0.00014967423339840025, "learning_rate": 3.6200275268932362e-06, "loss": 0.0, "num_input_tokens_seen": 148109152, "step": 68670 }, { "epoch": 12.603229950449624, "grad_norm": 0.00015975750284269452, "learning_rate": 3.619257889934077e-06, "loss": 0.0, "num_input_tokens_seen": 148120960, "step": 68675 }, { "epoch": 12.604147550009175, "grad_norm": 0.371540904045105, "learning_rate": 3.6184882883888106e-06, "loss": 0.0002, "num_input_tokens_seen": 148131776, "step": 68680 }, { "epoch": 12.605065149568729, "grad_norm": 0.0001933667081175372, "learning_rate": 3.6177187222771814e-06, "loss": 0.0001, "num_input_tokens_seen": 148142368, "step": 68685 }, { "epoch": 12.60598274912828, "grad_norm": 0.0003141682827845216, "learning_rate": 3.6169491916189243e-06, "loss": 0.0, "num_input_tokens_seen": 148154432, "step": 68690 }, { "epoch": 12.606900348687832, "grad_norm": 0.0006676230696029961, "learning_rate": 3.616179696433776e-06, "loss": 0.0058, "num_input_tokens_seen": 148165344, "step": 68695 }, { "epoch": 12.607817948247385, "grad_norm": 0.0010413983836770058, "learning_rate": 3.6154102367414744e-06, "loss": 0.0002, "num_input_tokens_seen": 148176064, "step": 68700 }, { "epoch": 12.608735547806937, "grad_norm": 0.010430527850985527, "learning_rate": 3.614640812561754e-06, "loss": 0.0, "num_input_tokens_seen": 148187936, "step": 68705 }, { "epoch": 12.609653147366489, "grad_norm": 0.0002770928549580276, "learning_rate": 3.61387142391435e-06, "loss": 0.0, "num_input_tokens_seen": 148199424, "step": 68710 }, { "epoch": 12.610570746926042, "grad_norm": 8.963105938164517e-05, "learning_rate": 3.6131020708189947e-06, "loss": 0.0, "num_input_tokens_seen": 148208000, "step": 68715 }, { "epoch": 12.611488346485594, "grad_norm": 0.0003648559213615954, "learning_rate": 3.6123327532954233e-06, "loss": 0.0, "num_input_tokens_seen": 148218336, "step": 68720 }, { "epoch": 12.612405946045145, "grad_norm": 0.0011942732380703092, "learning_rate": 3.6115634713633653e-06, "loss": 0.0003, "num_input_tokens_seen": 148229920, "step": 68725 }, { "epoch": 12.613323545604699, "grad_norm": 0.00024217722238972783, "learning_rate": 3.610794225042551e-06, "loss": 0.0703, "num_input_tokens_seen": 148241184, "step": 68730 }, { "epoch": 12.61424114516425, "grad_norm": 0.000196057342691347, "learning_rate": 3.610025014352714e-06, "loss": 0.078, "num_input_tokens_seen": 148251872, "step": 68735 }, { "epoch": 12.615158744723802, "grad_norm": 0.00016007927479222417, "learning_rate": 3.60925583931358e-06, "loss": 0.0, "num_input_tokens_seen": 148263168, "step": 68740 }, { "epoch": 12.616076344283355, "grad_norm": 0.003767613088712096, "learning_rate": 3.608486699944877e-06, "loss": 0.0, "num_input_tokens_seen": 148272960, "step": 68745 }, { "epoch": 12.616993943842907, "grad_norm": 53.31307601928711, "learning_rate": 3.6077175962663356e-06, "loss": 0.2282, "num_input_tokens_seen": 148283360, "step": 68750 }, { "epoch": 12.617911543402458, "grad_norm": 0.0005457006627693772, "learning_rate": 3.6069485282976796e-06, "loss": 0.0004, "num_input_tokens_seen": 148293216, "step": 68755 }, { "epoch": 12.618829142962012, "grad_norm": 0.004032756667584181, "learning_rate": 3.606179496058635e-06, "loss": 0.0, "num_input_tokens_seen": 148304352, "step": 68760 }, { "epoch": 12.619746742521563, "grad_norm": 0.020015671849250793, "learning_rate": 3.6054104995689244e-06, "loss": 0.0, "num_input_tokens_seen": 148316512, "step": 68765 }, { "epoch": 12.620664342081115, "grad_norm": 0.13241299986839294, "learning_rate": 3.6046415388482746e-06, "loss": 0.0033, "num_input_tokens_seen": 148327104, "step": 68770 }, { "epoch": 12.621581941640668, "grad_norm": 0.00019310228526592255, "learning_rate": 3.603872613916407e-06, "loss": 0.0, "num_input_tokens_seen": 148337728, "step": 68775 }, { "epoch": 12.62249954120022, "grad_norm": 0.0006102079642005265, "learning_rate": 3.603103724793041e-06, "loss": 0.0, "num_input_tokens_seen": 148348992, "step": 68780 }, { "epoch": 12.623417140759772, "grad_norm": 0.23344159126281738, "learning_rate": 3.602334871497901e-06, "loss": 0.008, "num_input_tokens_seen": 148359008, "step": 68785 }, { "epoch": 12.624334740319325, "grad_norm": 0.0005798563361167908, "learning_rate": 3.601566054050706e-06, "loss": 0.0, "num_input_tokens_seen": 148369664, "step": 68790 }, { "epoch": 12.625252339878877, "grad_norm": 0.0001460972271161154, "learning_rate": 3.6007972724711737e-06, "loss": 0.1969, "num_input_tokens_seen": 148380672, "step": 68795 }, { "epoch": 12.626169939438428, "grad_norm": 0.00020591489737853408, "learning_rate": 3.6000285267790223e-06, "loss": 0.0, "num_input_tokens_seen": 148390368, "step": 68800 }, { "epoch": 12.627087538997982, "grad_norm": 0.0009168987744487822, "learning_rate": 3.5992598169939702e-06, "loss": 0.0, "num_input_tokens_seen": 148401728, "step": 68805 }, { "epoch": 12.628005138557533, "grad_norm": 0.00019446054648142308, "learning_rate": 3.598491143135733e-06, "loss": 0.0001, "num_input_tokens_seen": 148412544, "step": 68810 }, { "epoch": 12.628922738117085, "grad_norm": 0.00018660382193047553, "learning_rate": 3.597722505224025e-06, "loss": 0.0, "num_input_tokens_seen": 148422880, "step": 68815 }, { "epoch": 12.629840337676638, "grad_norm": 0.00020338893227744848, "learning_rate": 3.5969539032785622e-06, "loss": 0.0173, "num_input_tokens_seen": 148433696, "step": 68820 }, { "epoch": 12.63075793723619, "grad_norm": 0.002162800170481205, "learning_rate": 3.5961853373190566e-06, "loss": 0.0, "num_input_tokens_seen": 148444992, "step": 68825 }, { "epoch": 12.631675536795742, "grad_norm": 0.00033085778704844415, "learning_rate": 3.5954168073652207e-06, "loss": 0.0, "num_input_tokens_seen": 148455552, "step": 68830 }, { "epoch": 12.632593136355295, "grad_norm": 0.0009030346991494298, "learning_rate": 3.5946483134367684e-06, "loss": 0.0, "num_input_tokens_seen": 148466112, "step": 68835 }, { "epoch": 12.633510735914847, "grad_norm": 0.00012430001515895128, "learning_rate": 3.5938798555534084e-06, "loss": 0.0, "num_input_tokens_seen": 148476448, "step": 68840 }, { "epoch": 12.634428335474398, "grad_norm": 0.00023405799583997577, "learning_rate": 3.593111433734851e-06, "loss": 0.0, "num_input_tokens_seen": 148485472, "step": 68845 }, { "epoch": 12.635345935033952, "grad_norm": 0.014700379222631454, "learning_rate": 3.5923430480008028e-06, "loss": 0.0, "num_input_tokens_seen": 148495904, "step": 68850 }, { "epoch": 12.636263534593503, "grad_norm": 0.010210332460701466, "learning_rate": 3.591574698370976e-06, "loss": 0.0, "num_input_tokens_seen": 148505696, "step": 68855 }, { "epoch": 12.637181134153055, "grad_norm": 0.00221844925545156, "learning_rate": 3.5908063848650755e-06, "loss": 0.0004, "num_input_tokens_seen": 148517728, "step": 68860 }, { "epoch": 12.638098733712608, "grad_norm": 0.00024270315770991147, "learning_rate": 3.5900381075028045e-06, "loss": 0.0, "num_input_tokens_seen": 148528384, "step": 68865 }, { "epoch": 12.63901633327216, "grad_norm": 0.007257617544382811, "learning_rate": 3.589269866303873e-06, "loss": 0.0, "num_input_tokens_seen": 148537952, "step": 68870 }, { "epoch": 12.639933932831712, "grad_norm": 0.002098249737173319, "learning_rate": 3.588501661287983e-06, "loss": 0.0, "num_input_tokens_seen": 148548960, "step": 68875 }, { "epoch": 12.640851532391265, "grad_norm": 0.00063993816729635, "learning_rate": 3.587733492474835e-06, "loss": 0.0, "num_input_tokens_seen": 148558976, "step": 68880 }, { "epoch": 12.641769131950817, "grad_norm": 0.0008821804076433182, "learning_rate": 3.5869653598841376e-06, "loss": 0.0, "num_input_tokens_seen": 148569536, "step": 68885 }, { "epoch": 12.642686731510368, "grad_norm": 0.0003155019658152014, "learning_rate": 3.586197263535588e-06, "loss": 0.0532, "num_input_tokens_seen": 148580576, "step": 68890 }, { "epoch": 12.643604331069922, "grad_norm": 35.537132263183594, "learning_rate": 3.585429203448888e-06, "loss": 0.1977, "num_input_tokens_seen": 148591776, "step": 68895 }, { "epoch": 12.644521930629473, "grad_norm": 0.038491010665893555, "learning_rate": 3.5846611796437337e-06, "loss": 0.004, "num_input_tokens_seen": 148601984, "step": 68900 }, { "epoch": 12.645439530189025, "grad_norm": 19.92063331604004, "learning_rate": 3.583893192139829e-06, "loss": 0.0354, "num_input_tokens_seen": 148612928, "step": 68905 }, { "epoch": 12.646357129748578, "grad_norm": 0.005798513535410166, "learning_rate": 3.5831252409568683e-06, "loss": 0.0, "num_input_tokens_seen": 148624416, "step": 68910 }, { "epoch": 12.64727472930813, "grad_norm": 0.006141572259366512, "learning_rate": 3.5823573261145496e-06, "loss": 0.0, "num_input_tokens_seen": 148635776, "step": 68915 }, { "epoch": 12.648192328867681, "grad_norm": 0.0005209658993408084, "learning_rate": 3.5815894476325685e-06, "loss": 0.0, "num_input_tokens_seen": 148646528, "step": 68920 }, { "epoch": 12.649109928427235, "grad_norm": 0.014393694698810577, "learning_rate": 3.5808216055306187e-06, "loss": 0.0, "num_input_tokens_seen": 148657568, "step": 68925 }, { "epoch": 12.650027527986786, "grad_norm": 0.00015558567247353494, "learning_rate": 3.5800537998283957e-06, "loss": 0.0, "num_input_tokens_seen": 148668288, "step": 68930 }, { "epoch": 12.650945127546338, "grad_norm": 0.01937178149819374, "learning_rate": 3.5792860305455923e-06, "loss": 0.0, "num_input_tokens_seen": 148676960, "step": 68935 }, { "epoch": 12.651862727105891, "grad_norm": 0.08700564503669739, "learning_rate": 3.5785182977019005e-06, "loss": 0.0001, "num_input_tokens_seen": 148688384, "step": 68940 }, { "epoch": 12.652780326665443, "grad_norm": 0.0001871728600235656, "learning_rate": 3.5777506013170115e-06, "loss": 0.0, "num_input_tokens_seen": 148699712, "step": 68945 }, { "epoch": 12.653697926224995, "grad_norm": 0.004710102919489145, "learning_rate": 3.5769829414106137e-06, "loss": 0.002, "num_input_tokens_seen": 148709920, "step": 68950 }, { "epoch": 12.654615525784548, "grad_norm": 11.030747413635254, "learning_rate": 3.5762153180023997e-06, "loss": 0.0032, "num_input_tokens_seen": 148720768, "step": 68955 }, { "epoch": 12.6555331253441, "grad_norm": 0.7434595823287964, "learning_rate": 3.5754477311120562e-06, "loss": 0.0001, "num_input_tokens_seen": 148731872, "step": 68960 }, { "epoch": 12.656450724903651, "grad_norm": 0.0001416757149854675, "learning_rate": 3.574680180759268e-06, "loss": 0.0001, "num_input_tokens_seen": 148742880, "step": 68965 }, { "epoch": 12.657368324463205, "grad_norm": 0.00028992764418944716, "learning_rate": 3.573912666963726e-06, "loss": 0.0, "num_input_tokens_seen": 148755168, "step": 68970 }, { "epoch": 12.658285924022756, "grad_norm": 0.0007569091976620257, "learning_rate": 3.5731451897451146e-06, "loss": 0.0, "num_input_tokens_seen": 148767104, "step": 68975 }, { "epoch": 12.659203523582308, "grad_norm": 0.004517199005931616, "learning_rate": 3.572377749123117e-06, "loss": 0.0, "num_input_tokens_seen": 148777600, "step": 68980 }, { "epoch": 12.660121123141861, "grad_norm": 0.00012768736633006483, "learning_rate": 3.571610345117416e-06, "loss": 0.0001, "num_input_tokens_seen": 148788192, "step": 68985 }, { "epoch": 12.661038722701413, "grad_norm": 0.00016357922868337482, "learning_rate": 3.570842977747698e-06, "loss": 0.0097, "num_input_tokens_seen": 148798336, "step": 68990 }, { "epoch": 12.661956322260965, "grad_norm": 0.006032247561961412, "learning_rate": 3.5700756470336422e-06, "loss": 0.1688, "num_input_tokens_seen": 148808384, "step": 68995 }, { "epoch": 12.662873921820518, "grad_norm": 0.001115027000196278, "learning_rate": 3.569308352994928e-06, "loss": 0.0016, "num_input_tokens_seen": 148819392, "step": 69000 }, { "epoch": 12.66379152138007, "grad_norm": 0.0005413249018602073, "learning_rate": 3.568541095651239e-06, "loss": 0.1345, "num_input_tokens_seen": 148831072, "step": 69005 }, { "epoch": 12.664709120939621, "grad_norm": 0.0006785150617361069, "learning_rate": 3.5677738750222526e-06, "loss": 0.0, "num_input_tokens_seen": 148841536, "step": 69010 }, { "epoch": 12.665626720499175, "grad_norm": 48.37773513793945, "learning_rate": 3.567006691127646e-06, "loss": 0.2071, "num_input_tokens_seen": 148853248, "step": 69015 }, { "epoch": 12.666544320058726, "grad_norm": 0.001129640149883926, "learning_rate": 3.5662395439870956e-06, "loss": 0.0, "num_input_tokens_seen": 148864576, "step": 69020 }, { "epoch": 12.667461919618278, "grad_norm": 0.0004092328599654138, "learning_rate": 3.565472433620281e-06, "loss": 0.0, "num_input_tokens_seen": 148874144, "step": 69025 }, { "epoch": 12.668379519177831, "grad_norm": 0.0005988611374050379, "learning_rate": 3.5647053600468747e-06, "loss": 0.0, "num_input_tokens_seen": 148885312, "step": 69030 }, { "epoch": 12.669297118737383, "grad_norm": 0.005601395852863789, "learning_rate": 3.56393832328655e-06, "loss": 0.0, "num_input_tokens_seen": 148895808, "step": 69035 }, { "epoch": 12.670214718296934, "grad_norm": 0.004554182756692171, "learning_rate": 3.563171323358983e-06, "loss": 0.0, "num_input_tokens_seen": 148906848, "step": 69040 }, { "epoch": 12.671132317856488, "grad_norm": 0.0002378449571551755, "learning_rate": 3.5624043602838447e-06, "loss": 0.0, "num_input_tokens_seen": 148914656, "step": 69045 }, { "epoch": 12.67204991741604, "grad_norm": 0.006274988874793053, "learning_rate": 3.561637434080805e-06, "loss": 0.0, "num_input_tokens_seen": 148925920, "step": 69050 }, { "epoch": 12.672967516975591, "grad_norm": 0.07388883829116821, "learning_rate": 3.560870544769537e-06, "loss": 0.0001, "num_input_tokens_seen": 148936160, "step": 69055 }, { "epoch": 12.673885116535144, "grad_norm": 0.003241715021431446, "learning_rate": 3.560103692369709e-06, "loss": 0.0007, "num_input_tokens_seen": 148946400, "step": 69060 }, { "epoch": 12.674802716094696, "grad_norm": 0.0006032732198946178, "learning_rate": 3.5593368769009884e-06, "loss": 0.0, "num_input_tokens_seen": 148957440, "step": 69065 }, { "epoch": 12.675720315654248, "grad_norm": 0.000885714718606323, "learning_rate": 3.558570098383045e-06, "loss": 0.0, "num_input_tokens_seen": 148968128, "step": 69070 }, { "epoch": 12.676637915213801, "grad_norm": 0.012995138764381409, "learning_rate": 3.557803356835546e-06, "loss": 0.0, "num_input_tokens_seen": 148978592, "step": 69075 }, { "epoch": 12.677555514773353, "grad_norm": 0.00481317238882184, "learning_rate": 3.5570366522781546e-06, "loss": 0.0, "num_input_tokens_seen": 148989184, "step": 69080 }, { "epoch": 12.678473114332904, "grad_norm": 0.0038880258798599243, "learning_rate": 3.5562699847305347e-06, "loss": 0.0, "num_input_tokens_seen": 148999392, "step": 69085 }, { "epoch": 12.679390713892458, "grad_norm": 0.006596828810870647, "learning_rate": 3.555503354212355e-06, "loss": 0.0, "num_input_tokens_seen": 149010624, "step": 69090 }, { "epoch": 12.68030831345201, "grad_norm": 0.1548013538122177, "learning_rate": 3.5547367607432748e-06, "loss": 0.0004, "num_input_tokens_seen": 149021600, "step": 69095 }, { "epoch": 12.681225913011561, "grad_norm": 0.00015681938384659588, "learning_rate": 3.553970204342955e-06, "loss": 0.0, "num_input_tokens_seen": 149032256, "step": 69100 }, { "epoch": 12.682143512571114, "grad_norm": 0.4753044545650482, "learning_rate": 3.55320368503106e-06, "loss": 0.0002, "num_input_tokens_seen": 149042848, "step": 69105 }, { "epoch": 12.683061112130666, "grad_norm": 0.0005851627211086452, "learning_rate": 3.552437202827248e-06, "loss": 0.0001, "num_input_tokens_seen": 149053280, "step": 69110 }, { "epoch": 12.683978711690218, "grad_norm": 0.0003985703515354544, "learning_rate": 3.551670757751178e-06, "loss": 0.0, "num_input_tokens_seen": 149063808, "step": 69115 }, { "epoch": 12.684896311249771, "grad_norm": 0.0003022464516106993, "learning_rate": 3.5509043498225077e-06, "loss": 0.0, "num_input_tokens_seen": 149075680, "step": 69120 }, { "epoch": 12.685813910809323, "grad_norm": 0.0001411175908287987, "learning_rate": 3.5501379790608957e-06, "loss": 0.0647, "num_input_tokens_seen": 149085376, "step": 69125 }, { "epoch": 12.686731510368874, "grad_norm": 0.00028624929836951196, "learning_rate": 3.5493716454859985e-06, "loss": 0.0, "num_input_tokens_seen": 149096864, "step": 69130 }, { "epoch": 12.687649109928428, "grad_norm": 0.00026356184389442205, "learning_rate": 3.5486053491174687e-06, "loss": 0.0003, "num_input_tokens_seen": 149107968, "step": 69135 }, { "epoch": 12.68856670948798, "grad_norm": 0.06426367908716202, "learning_rate": 3.5478390899749633e-06, "loss": 0.0001, "num_input_tokens_seen": 149118752, "step": 69140 }, { "epoch": 12.68948430904753, "grad_norm": 0.0025135865435004234, "learning_rate": 3.5470728680781334e-06, "loss": 0.0376, "num_input_tokens_seen": 149130048, "step": 69145 }, { "epoch": 12.690401908607084, "grad_norm": 0.00011683908815030009, "learning_rate": 3.546306683446633e-06, "loss": 0.0, "num_input_tokens_seen": 149141824, "step": 69150 }, { "epoch": 12.691319508166636, "grad_norm": 0.0002665715874172747, "learning_rate": 3.5455405361001136e-06, "loss": 0.0, "num_input_tokens_seen": 149152096, "step": 69155 }, { "epoch": 12.692237107726188, "grad_norm": 0.02204212360084057, "learning_rate": 3.544774426058226e-06, "loss": 0.0, "num_input_tokens_seen": 149162432, "step": 69160 }, { "epoch": 12.693154707285741, "grad_norm": 0.04469712823629379, "learning_rate": 3.544008353340618e-06, "loss": 0.0, "num_input_tokens_seen": 149172608, "step": 69165 }, { "epoch": 12.694072306845293, "grad_norm": 0.0073383343406021595, "learning_rate": 3.5432423179669384e-06, "loss": 0.0, "num_input_tokens_seen": 149183072, "step": 69170 }, { "epoch": 12.694989906404844, "grad_norm": 0.00018405105220153928, "learning_rate": 3.542476319956837e-06, "loss": 0.1516, "num_input_tokens_seen": 149192608, "step": 69175 }, { "epoch": 12.695907505964398, "grad_norm": 0.007551101502031088, "learning_rate": 3.5417103593299586e-06, "loss": 0.0, "num_input_tokens_seen": 149203872, "step": 69180 }, { "epoch": 12.69682510552395, "grad_norm": 0.00026898382930085063, "learning_rate": 3.5409444361059474e-06, "loss": 0.0001, "num_input_tokens_seen": 149214400, "step": 69185 }, { "epoch": 12.6977427050835, "grad_norm": 0.00032092066248878837, "learning_rate": 3.5401785503044523e-06, "loss": 0.2406, "num_input_tokens_seen": 149224736, "step": 69190 }, { "epoch": 12.698660304643054, "grad_norm": 0.00028900758479721844, "learning_rate": 3.5394127019451153e-06, "loss": 0.0, "num_input_tokens_seen": 149235936, "step": 69195 }, { "epoch": 12.699577904202606, "grad_norm": 0.00031966116512194276, "learning_rate": 3.5386468910475756e-06, "loss": 0.0, "num_input_tokens_seen": 149247392, "step": 69200 }, { "epoch": 12.700495503762157, "grad_norm": 0.0006169757107272744, "learning_rate": 3.5378811176314813e-06, "loss": 0.0, "num_input_tokens_seen": 149258656, "step": 69205 }, { "epoch": 12.70141310332171, "grad_norm": 0.0003072044637519866, "learning_rate": 3.5371153817164706e-06, "loss": 0.0, "num_input_tokens_seen": 149269440, "step": 69210 }, { "epoch": 12.702330702881262, "grad_norm": 0.01380293257534504, "learning_rate": 3.5363496833221824e-06, "loss": 0.0, "num_input_tokens_seen": 149280896, "step": 69215 }, { "epoch": 12.703248302440814, "grad_norm": 0.0006874139071442187, "learning_rate": 3.5355840224682537e-06, "loss": 0.0, "num_input_tokens_seen": 149290784, "step": 69220 }, { "epoch": 12.704165902000367, "grad_norm": 0.0003792272473219782, "learning_rate": 3.534818399174328e-06, "loss": 0.1066, "num_input_tokens_seen": 149301152, "step": 69225 }, { "epoch": 12.705083501559919, "grad_norm": 0.019296659156680107, "learning_rate": 3.5340528134600393e-06, "loss": 0.0001, "num_input_tokens_seen": 149312640, "step": 69230 }, { "epoch": 12.70600110111947, "grad_norm": 0.0002573425881564617, "learning_rate": 3.5332872653450236e-06, "loss": 0.0, "num_input_tokens_seen": 149323616, "step": 69235 }, { "epoch": 12.706918700679024, "grad_norm": 0.00035332440165802836, "learning_rate": 3.5325217548489167e-06, "loss": 0.0001, "num_input_tokens_seen": 149334112, "step": 69240 }, { "epoch": 12.707836300238576, "grad_norm": 0.0005481525440700352, "learning_rate": 3.531756281991352e-06, "loss": 0.0, "num_input_tokens_seen": 149344384, "step": 69245 }, { "epoch": 12.708753899798127, "grad_norm": 0.003590100212022662, "learning_rate": 3.5309908467919616e-06, "loss": 0.0, "num_input_tokens_seen": 149355328, "step": 69250 }, { "epoch": 12.70967149935768, "grad_norm": 0.0005220033344812691, "learning_rate": 3.530225449270382e-06, "loss": 0.1813, "num_input_tokens_seen": 149366560, "step": 69255 }, { "epoch": 12.710589098917232, "grad_norm": 0.0030095442198216915, "learning_rate": 3.5294600894462405e-06, "loss": 0.0001, "num_input_tokens_seen": 149376704, "step": 69260 }, { "epoch": 12.711506698476784, "grad_norm": 0.04911840334534645, "learning_rate": 3.5286947673391693e-06, "loss": 0.0617, "num_input_tokens_seen": 149387968, "step": 69265 }, { "epoch": 12.712424298036337, "grad_norm": 0.0002657065633684397, "learning_rate": 3.527929482968795e-06, "loss": 0.0004, "num_input_tokens_seen": 149397952, "step": 69270 }, { "epoch": 12.713341897595889, "grad_norm": 0.00027822627453133464, "learning_rate": 3.52716423635475e-06, "loss": 0.0, "num_input_tokens_seen": 149406272, "step": 69275 }, { "epoch": 12.71425949715544, "grad_norm": 0.00014363454829435796, "learning_rate": 3.5263990275166585e-06, "loss": 0.0, "num_input_tokens_seen": 149416320, "step": 69280 }, { "epoch": 12.715177096714994, "grad_norm": 0.0004848047101404518, "learning_rate": 3.525633856474147e-06, "loss": 0.0, "num_input_tokens_seen": 149427680, "step": 69285 }, { "epoch": 12.716094696274546, "grad_norm": 0.0003677736676763743, "learning_rate": 3.5248687232468437e-06, "loss": 0.0, "num_input_tokens_seen": 149437888, "step": 69290 }, { "epoch": 12.717012295834097, "grad_norm": 0.004023442976176739, "learning_rate": 3.5241036278543716e-06, "loss": 0.0017, "num_input_tokens_seen": 149449120, "step": 69295 }, { "epoch": 12.71792989539365, "grad_norm": 0.00014879941591061652, "learning_rate": 3.5233385703163547e-06, "loss": 0.0, "num_input_tokens_seen": 149459840, "step": 69300 }, { "epoch": 12.718847494953202, "grad_norm": 0.00017575896345078945, "learning_rate": 3.522573550652412e-06, "loss": 0.0, "num_input_tokens_seen": 149471328, "step": 69305 }, { "epoch": 12.719765094512754, "grad_norm": 0.002228731522336602, "learning_rate": 3.52180856888217e-06, "loss": 0.0, "num_input_tokens_seen": 149480736, "step": 69310 }, { "epoch": 12.720682694072307, "grad_norm": 0.00119571085087955, "learning_rate": 3.521043625025248e-06, "loss": 0.0, "num_input_tokens_seen": 149491584, "step": 69315 }, { "epoch": 12.721600293631859, "grad_norm": 0.00031248535378836095, "learning_rate": 3.520278719101261e-06, "loss": 0.0, "num_input_tokens_seen": 149503552, "step": 69320 }, { "epoch": 12.72251789319141, "grad_norm": 0.002115576295182109, "learning_rate": 3.5195138511298356e-06, "loss": 0.0001, "num_input_tokens_seen": 149515040, "step": 69325 }, { "epoch": 12.723435492750964, "grad_norm": 0.00021368169109337032, "learning_rate": 3.5187490211305842e-06, "loss": 0.0, "num_input_tokens_seen": 149525472, "step": 69330 }, { "epoch": 12.724353092310515, "grad_norm": 0.00016580245574004948, "learning_rate": 3.5179842291231238e-06, "loss": 0.0, "num_input_tokens_seen": 149535584, "step": 69335 }, { "epoch": 12.725270691870067, "grad_norm": 0.01167367398738861, "learning_rate": 3.517219475127071e-06, "loss": 0.0, "num_input_tokens_seen": 149547104, "step": 69340 }, { "epoch": 12.72618829142962, "grad_norm": 0.0001717707491479814, "learning_rate": 3.5164547591620417e-06, "loss": 0.0003, "num_input_tokens_seen": 149557216, "step": 69345 }, { "epoch": 12.727105890989172, "grad_norm": 0.00018242448277305812, "learning_rate": 3.5156900812476487e-06, "loss": 0.2375, "num_input_tokens_seen": 149567648, "step": 69350 }, { "epoch": 12.728023490548724, "grad_norm": 0.0018055911641567945, "learning_rate": 3.5149254414035035e-06, "loss": 0.0, "num_input_tokens_seen": 149577280, "step": 69355 }, { "epoch": 12.728941090108277, "grad_norm": 0.0074947066605091095, "learning_rate": 3.51416083964922e-06, "loss": 0.0, "num_input_tokens_seen": 149586880, "step": 69360 }, { "epoch": 12.729858689667829, "grad_norm": 0.0009004290914162993, "learning_rate": 3.5133962760044073e-06, "loss": 0.0, "num_input_tokens_seen": 149596608, "step": 69365 }, { "epoch": 12.73077628922738, "grad_norm": 0.0024610937107354403, "learning_rate": 3.512631750488675e-06, "loss": 0.0, "num_input_tokens_seen": 149606624, "step": 69370 }, { "epoch": 12.731693888786934, "grad_norm": 0.00022431692923419178, "learning_rate": 3.5118672631216353e-06, "loss": 0.0, "num_input_tokens_seen": 149616160, "step": 69375 }, { "epoch": 12.732611488346485, "grad_norm": 0.005579677410423756, "learning_rate": 3.511102813922894e-06, "loss": 0.0005, "num_input_tokens_seen": 149626112, "step": 69380 }, { "epoch": 12.733529087906037, "grad_norm": 0.016601601615548134, "learning_rate": 3.5103384029120557e-06, "loss": 0.0, "num_input_tokens_seen": 149635936, "step": 69385 }, { "epoch": 12.73444668746559, "grad_norm": 0.005606118123978376, "learning_rate": 3.50957403010873e-06, "loss": 0.1405, "num_input_tokens_seen": 149647136, "step": 69390 }, { "epoch": 12.735364287025142, "grad_norm": 0.0005675202119164169, "learning_rate": 3.5088096955325215e-06, "loss": 0.0, "num_input_tokens_seen": 149658528, "step": 69395 }, { "epoch": 12.736281886584694, "grad_norm": 0.0006080363527871668, "learning_rate": 3.5080453992030327e-06, "loss": 0.0, "num_input_tokens_seen": 149670080, "step": 69400 }, { "epoch": 12.737199486144247, "grad_norm": 1.8330994844436646, "learning_rate": 3.5072811411398646e-06, "loss": 0.0228, "num_input_tokens_seen": 149681536, "step": 69405 }, { "epoch": 12.738117085703799, "grad_norm": 25.371423721313477, "learning_rate": 3.506516921362624e-06, "loss": 0.0041, "num_input_tokens_seen": 149692000, "step": 69410 }, { "epoch": 12.73903468526335, "grad_norm": 0.00023601205612067133, "learning_rate": 3.5057527398909103e-06, "loss": 0.0, "num_input_tokens_seen": 149703328, "step": 69415 }, { "epoch": 12.739952284822904, "grad_norm": 0.0009644082165323198, "learning_rate": 3.5049885967443205e-06, "loss": 0.2438, "num_input_tokens_seen": 149714656, "step": 69420 }, { "epoch": 12.740869884382455, "grad_norm": 0.007133982609957457, "learning_rate": 3.504224491942458e-06, "loss": 0.0, "num_input_tokens_seen": 149725248, "step": 69425 }, { "epoch": 12.741787483942007, "grad_norm": 0.0026451789308339357, "learning_rate": 3.503460425504919e-06, "loss": 0.0, "num_input_tokens_seen": 149735552, "step": 69430 }, { "epoch": 12.74270508350156, "grad_norm": 0.023995334282517433, "learning_rate": 3.5026963974513007e-06, "loss": 0.0208, "num_input_tokens_seen": 149745792, "step": 69435 }, { "epoch": 12.743622683061112, "grad_norm": 0.00035285198828205466, "learning_rate": 3.5019324078011973e-06, "loss": 0.0, "num_input_tokens_seen": 149756544, "step": 69440 }, { "epoch": 12.744540282620664, "grad_norm": 0.0011091436026617885, "learning_rate": 3.5011684565742083e-06, "loss": 0.0, "num_input_tokens_seen": 149767232, "step": 69445 }, { "epoch": 12.745457882180217, "grad_norm": 0.001304242992773652, "learning_rate": 3.5004045437899248e-06, "loss": 0.0, "num_input_tokens_seen": 149776576, "step": 69450 }, { "epoch": 12.746375481739769, "grad_norm": 0.0006034889374859631, "learning_rate": 3.4996406694679395e-06, "loss": 0.0, "num_input_tokens_seen": 149787968, "step": 69455 }, { "epoch": 12.74729308129932, "grad_norm": 0.03177417069673538, "learning_rate": 3.4988768336278473e-06, "loss": 0.0616, "num_input_tokens_seen": 149799072, "step": 69460 }, { "epoch": 12.748210680858874, "grad_norm": 0.020971419289708138, "learning_rate": 3.498113036289236e-06, "loss": 0.0005, "num_input_tokens_seen": 149810016, "step": 69465 }, { "epoch": 12.749128280418425, "grad_norm": 0.0003880949807353318, "learning_rate": 3.497349277471698e-06, "loss": 0.0001, "num_input_tokens_seen": 149819712, "step": 69470 }, { "epoch": 12.750045879977977, "grad_norm": 0.00016831577522680163, "learning_rate": 3.4965855571948236e-06, "loss": 0.0, "num_input_tokens_seen": 149830336, "step": 69475 }, { "epoch": 12.75096347953753, "grad_norm": 0.0006600709166377783, "learning_rate": 3.495821875478199e-06, "loss": 0.0, "num_input_tokens_seen": 149841184, "step": 69480 }, { "epoch": 12.751881079097082, "grad_norm": 0.0012724166736006737, "learning_rate": 3.495058232341412e-06, "loss": 0.0, "num_input_tokens_seen": 149852000, "step": 69485 }, { "epoch": 12.752798678656633, "grad_norm": 0.0003638928756117821, "learning_rate": 3.4942946278040475e-06, "loss": 0.0001, "num_input_tokens_seen": 149863008, "step": 69490 }, { "epoch": 12.753716278216187, "grad_norm": 0.0005443450063467026, "learning_rate": 3.4935310618856932e-06, "loss": 0.0, "num_input_tokens_seen": 149873504, "step": 69495 }, { "epoch": 12.754633877775738, "grad_norm": 0.0006443126476369798, "learning_rate": 3.492767534605933e-06, "loss": 0.0001, "num_input_tokens_seen": 149884864, "step": 69500 }, { "epoch": 12.755551477335292, "grad_norm": 0.03191991150379181, "learning_rate": 3.4920040459843475e-06, "loss": 0.0001, "num_input_tokens_seen": 149897184, "step": 69505 }, { "epoch": 12.756469076894843, "grad_norm": 0.0027224586810916662, "learning_rate": 3.4912405960405225e-06, "loss": 0.0001, "num_input_tokens_seen": 149909408, "step": 69510 }, { "epoch": 12.757386676454395, "grad_norm": 0.0003562222118489444, "learning_rate": 3.490477184794039e-06, "loss": 0.0001, "num_input_tokens_seen": 149919680, "step": 69515 }, { "epoch": 12.758304276013948, "grad_norm": 0.014793694019317627, "learning_rate": 3.4897138122644737e-06, "loss": 0.0004, "num_input_tokens_seen": 149931200, "step": 69520 }, { "epoch": 12.7592218755735, "grad_norm": 0.0004744746838696301, "learning_rate": 3.48895047847141e-06, "loss": 0.0, "num_input_tokens_seen": 149941632, "step": 69525 }, { "epoch": 12.760139475133052, "grad_norm": 0.004947966895997524, "learning_rate": 3.4881871834344242e-06, "loss": 0.1283, "num_input_tokens_seen": 149953472, "step": 69530 }, { "epoch": 12.761057074692605, "grad_norm": 0.0004327261121943593, "learning_rate": 3.487423927173095e-06, "loss": 0.0001, "num_input_tokens_seen": 149964640, "step": 69535 }, { "epoch": 12.761974674252157, "grad_norm": 0.001237307908013463, "learning_rate": 3.4866607097069948e-06, "loss": 0.0, "num_input_tokens_seen": 149976224, "step": 69540 }, { "epoch": 12.762892273811708, "grad_norm": 0.00022294450900517404, "learning_rate": 3.485897531055705e-06, "loss": 0.0, "num_input_tokens_seen": 149987200, "step": 69545 }, { "epoch": 12.763809873371262, "grad_norm": 0.07254984229803085, "learning_rate": 3.485134391238796e-06, "loss": 0.0004, "num_input_tokens_seen": 149998464, "step": 69550 }, { "epoch": 12.764727472930813, "grad_norm": 0.001027075806632638, "learning_rate": 3.484371290275842e-06, "loss": 0.0, "num_input_tokens_seen": 150009888, "step": 69555 }, { "epoch": 12.765645072490365, "grad_norm": 41.471675872802734, "learning_rate": 3.4836082281864148e-06, "loss": 0.0341, "num_input_tokens_seen": 150020960, "step": 69560 }, { "epoch": 12.766562672049918, "grad_norm": 0.0009627437102608383, "learning_rate": 3.4828452049900875e-06, "loss": 0.0, "num_input_tokens_seen": 150032192, "step": 69565 }, { "epoch": 12.76748027160947, "grad_norm": 0.0032285156194120646, "learning_rate": 3.4820822207064276e-06, "loss": 0.0, "num_input_tokens_seen": 150042080, "step": 69570 }, { "epoch": 12.768397871169022, "grad_norm": 0.0011784224770963192, "learning_rate": 3.4813192753550086e-06, "loss": 0.0, "num_input_tokens_seen": 150052800, "step": 69575 }, { "epoch": 12.769315470728575, "grad_norm": 1.7068753242492676, "learning_rate": 3.4805563689553954e-06, "loss": 0.0005, "num_input_tokens_seen": 150063776, "step": 69580 }, { "epoch": 12.770233070288127, "grad_norm": 0.0019174537155777216, "learning_rate": 3.4797935015271566e-06, "loss": 0.0, "num_input_tokens_seen": 150074720, "step": 69585 }, { "epoch": 12.771150669847678, "grad_norm": 0.0004646333400160074, "learning_rate": 3.479030673089856e-06, "loss": 0.0, "num_input_tokens_seen": 150085952, "step": 69590 }, { "epoch": 12.772068269407232, "grad_norm": 0.0009920945158228278, "learning_rate": 3.478267883663064e-06, "loss": 0.1097, "num_input_tokens_seen": 150095712, "step": 69595 }, { "epoch": 12.772985868966783, "grad_norm": 0.017008662223815918, "learning_rate": 3.4775051332663424e-06, "loss": 0.0011, "num_input_tokens_seen": 150105184, "step": 69600 }, { "epoch": 12.773903468526335, "grad_norm": 0.00013926149404142052, "learning_rate": 3.476742421919252e-06, "loss": 0.0, "num_input_tokens_seen": 150116352, "step": 69605 }, { "epoch": 12.774821068085888, "grad_norm": 0.009560164995491505, "learning_rate": 3.4759797496413593e-06, "loss": 0.0001, "num_input_tokens_seen": 150127360, "step": 69610 }, { "epoch": 12.77573866764544, "grad_norm": 0.0028631710447371006, "learning_rate": 3.4752171164522254e-06, "loss": 0.0, "num_input_tokens_seen": 150136896, "step": 69615 }, { "epoch": 12.776656267204991, "grad_norm": 0.0007884240476414561, "learning_rate": 3.4744545223714078e-06, "loss": 0.0, "num_input_tokens_seen": 150146752, "step": 69620 }, { "epoch": 12.777573866764545, "grad_norm": 0.0005515152588486671, "learning_rate": 3.4736919674184657e-06, "loss": 0.0, "num_input_tokens_seen": 150157696, "step": 69625 }, { "epoch": 12.778491466324096, "grad_norm": 0.0002995574031956494, "learning_rate": 3.4729294516129614e-06, "loss": 0.0, "num_input_tokens_seen": 150168608, "step": 69630 }, { "epoch": 12.779409065883648, "grad_norm": 28.586828231811523, "learning_rate": 3.4721669749744502e-06, "loss": 0.2938, "num_input_tokens_seen": 150178752, "step": 69635 }, { "epoch": 12.780326665443202, "grad_norm": 0.00036214230931364, "learning_rate": 3.4714045375224852e-06, "loss": 0.0002, "num_input_tokens_seen": 150190176, "step": 69640 }, { "epoch": 12.781244265002753, "grad_norm": 0.003994691651314497, "learning_rate": 3.4706421392766278e-06, "loss": 0.1345, "num_input_tokens_seen": 150201248, "step": 69645 }, { "epoch": 12.782161864562305, "grad_norm": 0.006232784129679203, "learning_rate": 3.469879780256428e-06, "loss": 0.0, "num_input_tokens_seen": 150212544, "step": 69650 }, { "epoch": 12.783079464121858, "grad_norm": 0.0016309890197589993, "learning_rate": 3.4691174604814406e-06, "loss": 0.0, "num_input_tokens_seen": 150223840, "step": 69655 }, { "epoch": 12.78399706368141, "grad_norm": 0.07423494756221771, "learning_rate": 3.4683551799712167e-06, "loss": 0.0001, "num_input_tokens_seen": 150234816, "step": 69660 }, { "epoch": 12.784914663240961, "grad_norm": 0.0005614803521893919, "learning_rate": 3.46759293874531e-06, "loss": 0.0, "num_input_tokens_seen": 150247456, "step": 69665 }, { "epoch": 12.785832262800515, "grad_norm": 0.2218882143497467, "learning_rate": 3.46683073682327e-06, "loss": 0.0002, "num_input_tokens_seen": 150258880, "step": 69670 }, { "epoch": 12.786749862360066, "grad_norm": 0.0009550096001476049, "learning_rate": 3.4660685742246436e-06, "loss": 0.0002, "num_input_tokens_seen": 150270048, "step": 69675 }, { "epoch": 12.787667461919618, "grad_norm": 0.7985938191413879, "learning_rate": 3.465306450968982e-06, "loss": 0.0003, "num_input_tokens_seen": 150278816, "step": 69680 }, { "epoch": 12.788585061479171, "grad_norm": 1.7061704397201538, "learning_rate": 3.4645443670758303e-06, "loss": 0.0009, "num_input_tokens_seen": 150289536, "step": 69685 }, { "epoch": 12.789502661038723, "grad_norm": 0.0011000679805874825, "learning_rate": 3.463782322564736e-06, "loss": 0.0, "num_input_tokens_seen": 150300992, "step": 69690 }, { "epoch": 12.790420260598275, "grad_norm": 0.00031408341601490974, "learning_rate": 3.4630203174552455e-06, "loss": 0.0, "num_input_tokens_seen": 150310464, "step": 69695 }, { "epoch": 12.791337860157828, "grad_norm": 0.018719200044870377, "learning_rate": 3.4622583517669016e-06, "loss": 0.0001, "num_input_tokens_seen": 150321728, "step": 69700 }, { "epoch": 12.79225545971738, "grad_norm": 28.68853759765625, "learning_rate": 3.461496425519246e-06, "loss": 0.0402, "num_input_tokens_seen": 150332928, "step": 69705 }, { "epoch": 12.793173059276931, "grad_norm": 0.005327257793396711, "learning_rate": 3.4607345387318236e-06, "loss": 0.0, "num_input_tokens_seen": 150343552, "step": 69710 }, { "epoch": 12.794090658836485, "grad_norm": 0.0006580498302355409, "learning_rate": 3.4599726914241755e-06, "loss": 0.0109, "num_input_tokens_seen": 150355232, "step": 69715 }, { "epoch": 12.795008258396036, "grad_norm": 0.009736272506415844, "learning_rate": 3.4592108836158413e-06, "loss": 0.0, "num_input_tokens_seen": 150366592, "step": 69720 }, { "epoch": 12.795925857955588, "grad_norm": 0.00017889276205096394, "learning_rate": 3.4584491153263565e-06, "loss": 0.0, "num_input_tokens_seen": 150377408, "step": 69725 }, { "epoch": 12.796843457515141, "grad_norm": 0.000399641168769449, "learning_rate": 3.457687386575266e-06, "loss": 0.0, "num_input_tokens_seen": 150387808, "step": 69730 }, { "epoch": 12.797761057074693, "grad_norm": 0.0004569461161736399, "learning_rate": 3.4569256973821036e-06, "loss": 0.0, "num_input_tokens_seen": 150399872, "step": 69735 }, { "epoch": 12.798678656634245, "grad_norm": 0.003013712354004383, "learning_rate": 3.4561640477664028e-06, "loss": 0.0, "num_input_tokens_seen": 150410144, "step": 69740 }, { "epoch": 12.799596256193798, "grad_norm": 0.0015825728187337518, "learning_rate": 3.455402437747704e-06, "loss": 0.0, "num_input_tokens_seen": 150420000, "step": 69745 }, { "epoch": 12.80051385575335, "grad_norm": 0.0004517220368143171, "learning_rate": 3.454640867345539e-06, "loss": 0.0001, "num_input_tokens_seen": 150430784, "step": 69750 }, { "epoch": 12.801431455312901, "grad_norm": 0.00019269908079877496, "learning_rate": 3.4538793365794397e-06, "loss": 0.0011, "num_input_tokens_seen": 150441248, "step": 69755 }, { "epoch": 12.802349054872455, "grad_norm": 0.0007130246376618743, "learning_rate": 3.453117845468938e-06, "loss": 0.0, "num_input_tokens_seen": 150452672, "step": 69760 }, { "epoch": 12.803266654432006, "grad_norm": 0.0006456011906266212, "learning_rate": 3.452356394033568e-06, "loss": 0.0, "num_input_tokens_seen": 150463456, "step": 69765 }, { "epoch": 12.804184253991558, "grad_norm": 0.004602834116667509, "learning_rate": 3.4515949822928573e-06, "loss": 0.0, "num_input_tokens_seen": 150472864, "step": 69770 }, { "epoch": 12.805101853551111, "grad_norm": 0.00019375470583327115, "learning_rate": 3.4508336102663353e-06, "loss": 0.0, "num_input_tokens_seen": 150484576, "step": 69775 }, { "epoch": 12.806019453110663, "grad_norm": 0.0009357211529277265, "learning_rate": 3.450072277973531e-06, "loss": 0.1283, "num_input_tokens_seen": 150496096, "step": 69780 }, { "epoch": 12.806937052670214, "grad_norm": 0.00030238853651098907, "learning_rate": 3.449310985433969e-06, "loss": 0.0119, "num_input_tokens_seen": 150506496, "step": 69785 }, { "epoch": 12.807854652229768, "grad_norm": 0.0002543533337302506, "learning_rate": 3.4485497326671774e-06, "loss": 0.0002, "num_input_tokens_seen": 150518176, "step": 69790 }, { "epoch": 12.80877225178932, "grad_norm": 0.0017822805093601346, "learning_rate": 3.4477885196926817e-06, "loss": 0.0, "num_input_tokens_seen": 150528480, "step": 69795 }, { "epoch": 12.809689851348871, "grad_norm": 0.0006432141526602209, "learning_rate": 3.4470273465300043e-06, "loss": 0.004, "num_input_tokens_seen": 150539808, "step": 69800 }, { "epoch": 12.810607450908424, "grad_norm": 0.0002627661742735654, "learning_rate": 3.446266213198669e-06, "loss": 0.0003, "num_input_tokens_seen": 150549824, "step": 69805 }, { "epoch": 12.811525050467976, "grad_norm": 0.002033109311014414, "learning_rate": 3.4455051197181953e-06, "loss": 0.0, "num_input_tokens_seen": 150560000, "step": 69810 }, { "epoch": 12.812442650027528, "grad_norm": 0.0004124848055653274, "learning_rate": 3.444744066108108e-06, "loss": 0.0004, "num_input_tokens_seen": 150569056, "step": 69815 }, { "epoch": 12.813360249587081, "grad_norm": 0.0003314663772471249, "learning_rate": 3.443983052387925e-06, "loss": 0.0, "num_input_tokens_seen": 150580768, "step": 69820 }, { "epoch": 12.814277849146633, "grad_norm": 0.00020024433615617454, "learning_rate": 3.443222078577162e-06, "loss": 0.0, "num_input_tokens_seen": 150591136, "step": 69825 }, { "epoch": 12.815195448706184, "grad_norm": 0.014509368687868118, "learning_rate": 3.442461144695343e-06, "loss": 0.0, "num_input_tokens_seen": 150601664, "step": 69830 }, { "epoch": 12.816113048265738, "grad_norm": 0.0007783291512168944, "learning_rate": 3.441700250761982e-06, "loss": 0.0, "num_input_tokens_seen": 150612768, "step": 69835 }, { "epoch": 12.81703064782529, "grad_norm": 0.0006349878385663033, "learning_rate": 3.4409393967965914e-06, "loss": 0.0, "num_input_tokens_seen": 150624256, "step": 69840 }, { "epoch": 12.817948247384841, "grad_norm": 0.0018534308765083551, "learning_rate": 3.4401785828186918e-06, "loss": 0.0, "num_input_tokens_seen": 150635488, "step": 69845 }, { "epoch": 12.818865846944394, "grad_norm": 0.0004565611598081887, "learning_rate": 3.4394178088477934e-06, "loss": 0.0, "num_input_tokens_seen": 150645824, "step": 69850 }, { "epoch": 12.819783446503946, "grad_norm": 0.02519473433494568, "learning_rate": 3.43865707490341e-06, "loss": 0.1036, "num_input_tokens_seen": 150656128, "step": 69855 }, { "epoch": 12.820701046063498, "grad_norm": 0.0041393497958779335, "learning_rate": 3.4378963810050505e-06, "loss": 0.0064, "num_input_tokens_seen": 150666848, "step": 69860 }, { "epoch": 12.821618645623051, "grad_norm": 0.03142070025205612, "learning_rate": 3.43713572717223e-06, "loss": 0.0001, "num_input_tokens_seen": 150676864, "step": 69865 }, { "epoch": 12.822536245182603, "grad_norm": 0.0007714987732470036, "learning_rate": 3.436375113424456e-06, "loss": 0.0, "num_input_tokens_seen": 150688320, "step": 69870 }, { "epoch": 12.823453844742154, "grad_norm": 0.00017615930119063705, "learning_rate": 3.4356145397812355e-06, "loss": 0.0, "num_input_tokens_seen": 150699104, "step": 69875 }, { "epoch": 12.824371444301708, "grad_norm": 0.00046518188901245594, "learning_rate": 3.4348540062620772e-06, "loss": 0.0, "num_input_tokens_seen": 150711616, "step": 69880 }, { "epoch": 12.82528904386126, "grad_norm": 17.269805908203125, "learning_rate": 3.4340935128864895e-06, "loss": 0.0189, "num_input_tokens_seen": 150721984, "step": 69885 }, { "epoch": 12.82620664342081, "grad_norm": 0.0001976375060621649, "learning_rate": 3.4333330596739765e-06, "loss": 0.1688, "num_input_tokens_seen": 150733088, "step": 69890 }, { "epoch": 12.827124242980364, "grad_norm": 0.14846792817115784, "learning_rate": 3.432572646644041e-06, "loss": 0.0001, "num_input_tokens_seen": 150745088, "step": 69895 }, { "epoch": 12.828041842539916, "grad_norm": 0.011893057264387608, "learning_rate": 3.4318122738161885e-06, "loss": 0.0001, "num_input_tokens_seen": 150754624, "step": 69900 }, { "epoch": 12.828959442099467, "grad_norm": 0.00031667735311202705, "learning_rate": 3.4310519412099203e-06, "loss": 0.0001, "num_input_tokens_seen": 150764864, "step": 69905 }, { "epoch": 12.82987704165902, "grad_norm": 0.00012203970982227474, "learning_rate": 3.430291648844738e-06, "loss": 0.0001, "num_input_tokens_seen": 150775072, "step": 69910 }, { "epoch": 12.830794641218572, "grad_norm": 65.28443145751953, "learning_rate": 3.429531396740143e-06, "loss": 0.2957, "num_input_tokens_seen": 150786080, "step": 69915 }, { "epoch": 12.831712240778124, "grad_norm": 0.0052034566178917885, "learning_rate": 3.4287711849156335e-06, "loss": 0.0328, "num_input_tokens_seen": 150797280, "step": 69920 }, { "epoch": 12.832629840337678, "grad_norm": 0.012170130386948586, "learning_rate": 3.4280110133907063e-06, "loss": 0.1438, "num_input_tokens_seen": 150807104, "step": 69925 }, { "epoch": 12.83354743989723, "grad_norm": 0.04010605439543724, "learning_rate": 3.4272508821848617e-06, "loss": 0.0001, "num_input_tokens_seen": 150818496, "step": 69930 }, { "epoch": 12.83446503945678, "grad_norm": 0.0204958263784647, "learning_rate": 3.426490791317595e-06, "loss": 0.0, "num_input_tokens_seen": 150829120, "step": 69935 }, { "epoch": 12.835382639016334, "grad_norm": 0.007248195819556713, "learning_rate": 3.425730740808401e-06, "loss": 0.0, "num_input_tokens_seen": 150839776, "step": 69940 }, { "epoch": 12.836300238575886, "grad_norm": 0.0009704299154691398, "learning_rate": 3.4249707306767706e-06, "loss": 0.0, "num_input_tokens_seen": 150849600, "step": 69945 }, { "epoch": 12.837217838135437, "grad_norm": 0.004275263752788305, "learning_rate": 3.4242107609422024e-06, "loss": 0.2188, "num_input_tokens_seen": 150859872, "step": 69950 }, { "epoch": 12.83813543769499, "grad_norm": 0.0005437820800580084, "learning_rate": 3.4234508316241853e-06, "loss": 0.222, "num_input_tokens_seen": 150871456, "step": 69955 }, { "epoch": 12.839053037254542, "grad_norm": 0.0004330238443799317, "learning_rate": 3.42269094274221e-06, "loss": 0.0001, "num_input_tokens_seen": 150879488, "step": 69960 }, { "epoch": 12.839970636814094, "grad_norm": 0.0053451755084097385, "learning_rate": 3.421931094315769e-06, "loss": 0.0001, "num_input_tokens_seen": 150890048, "step": 69965 }, { "epoch": 12.840888236373647, "grad_norm": 166.92999267578125, "learning_rate": 3.4211712863643497e-06, "loss": 0.0355, "num_input_tokens_seen": 150901376, "step": 69970 }, { "epoch": 12.841805835933199, "grad_norm": 0.10367987304925919, "learning_rate": 3.4204115189074386e-06, "loss": 0.0008, "num_input_tokens_seen": 150912480, "step": 69975 }, { "epoch": 12.84272343549275, "grad_norm": 0.0007940568029880524, "learning_rate": 3.4196517919645247e-06, "loss": 0.0036, "num_input_tokens_seen": 150923008, "step": 69980 }, { "epoch": 12.843641035052304, "grad_norm": 208.0224151611328, "learning_rate": 3.4188921055550934e-06, "loss": 0.1221, "num_input_tokens_seen": 150933792, "step": 69985 }, { "epoch": 12.844558634611856, "grad_norm": 0.000686321291141212, "learning_rate": 3.4181324596986294e-06, "loss": 0.0009, "num_input_tokens_seen": 150944736, "step": 69990 }, { "epoch": 12.845476234171407, "grad_norm": 0.012464548461139202, "learning_rate": 3.4173728544146147e-06, "loss": 0.0, "num_input_tokens_seen": 150956256, "step": 69995 }, { "epoch": 12.84639383373096, "grad_norm": 0.005341379903256893, "learning_rate": 3.4166132897225346e-06, "loss": 0.0, "num_input_tokens_seen": 150969280, "step": 70000 }, { "epoch": 12.847311433290512, "grad_norm": 0.0003224697429686785, "learning_rate": 3.415853765641869e-06, "loss": 0.0, "num_input_tokens_seen": 150978720, "step": 70005 }, { "epoch": 12.848229032850064, "grad_norm": 0.07031799107789993, "learning_rate": 3.4150942821920985e-06, "loss": 0.0001, "num_input_tokens_seen": 150989568, "step": 70010 }, { "epoch": 12.849146632409617, "grad_norm": 26.320045471191406, "learning_rate": 3.4143348393927043e-06, "loss": 0.1407, "num_input_tokens_seen": 151000256, "step": 70015 }, { "epoch": 12.850064231969169, "grad_norm": 0.01147841103374958, "learning_rate": 3.4135754372631646e-06, "loss": 0.1159, "num_input_tokens_seen": 151011872, "step": 70020 }, { "epoch": 12.85098183152872, "grad_norm": 0.00025069183902814984, "learning_rate": 3.4128160758229532e-06, "loss": 0.0001, "num_input_tokens_seen": 151022752, "step": 70025 }, { "epoch": 12.851899431088274, "grad_norm": 0.13099467754364014, "learning_rate": 3.4120567550915527e-06, "loss": 0.0002, "num_input_tokens_seen": 151033760, "step": 70030 }, { "epoch": 12.852817030647826, "grad_norm": 0.011484325863420963, "learning_rate": 3.411297475088434e-06, "loss": 0.0018, "num_input_tokens_seen": 151043904, "step": 70035 }, { "epoch": 12.853734630207377, "grad_norm": 0.0041751260869205, "learning_rate": 3.410538235833074e-06, "loss": 0.0, "num_input_tokens_seen": 151054272, "step": 70040 }, { "epoch": 12.85465222976693, "grad_norm": 0.0002245087525807321, "learning_rate": 3.4097790373449423e-06, "loss": 0.0426, "num_input_tokens_seen": 151064704, "step": 70045 }, { "epoch": 12.855569829326482, "grad_norm": 0.002065424108877778, "learning_rate": 3.409019879643516e-06, "loss": 0.0016, "num_input_tokens_seen": 151074016, "step": 70050 }, { "epoch": 12.856487428886034, "grad_norm": 0.0002546049654483795, "learning_rate": 3.408260762748263e-06, "loss": 0.0001, "num_input_tokens_seen": 151085952, "step": 70055 }, { "epoch": 12.857405028445587, "grad_norm": 0.003945198375731707, "learning_rate": 3.4075016866786538e-06, "loss": 0.0007, "num_input_tokens_seen": 151096800, "step": 70060 }, { "epoch": 12.858322628005139, "grad_norm": 0.02129482850432396, "learning_rate": 3.4067426514541597e-06, "loss": 0.0, "num_input_tokens_seen": 151106752, "step": 70065 }, { "epoch": 12.85924022756469, "grad_norm": 0.0001280852302443236, "learning_rate": 3.4059836570942472e-06, "loss": 0.1039, "num_input_tokens_seen": 151118240, "step": 70070 }, { "epoch": 12.860157827124244, "grad_norm": 0.00021582805493380874, "learning_rate": 3.4052247036183827e-06, "loss": 0.0, "num_input_tokens_seen": 151128384, "step": 70075 }, { "epoch": 12.861075426683795, "grad_norm": 0.0008530291961506009, "learning_rate": 3.4044657910460323e-06, "loss": 0.0, "num_input_tokens_seen": 151138720, "step": 70080 }, { "epoch": 12.861993026243347, "grad_norm": 0.014321491122245789, "learning_rate": 3.403706919396662e-06, "loss": 0.0, "num_input_tokens_seen": 151150656, "step": 70085 }, { "epoch": 12.8629106258029, "grad_norm": 0.0005678480374626815, "learning_rate": 3.402948088689736e-06, "loss": 0.0001, "num_input_tokens_seen": 151161824, "step": 70090 }, { "epoch": 12.863828225362452, "grad_norm": 0.0006225141696631908, "learning_rate": 3.402189298944716e-06, "loss": 0.1066, "num_input_tokens_seen": 151172544, "step": 70095 }, { "epoch": 12.864745824922004, "grad_norm": 0.98709636926651, "learning_rate": 3.401430550181063e-06, "loss": 0.0005, "num_input_tokens_seen": 151182816, "step": 70100 }, { "epoch": 12.865663424481557, "grad_norm": 0.0042510866187512875, "learning_rate": 3.40067184241824e-06, "loss": 0.0001, "num_input_tokens_seen": 151194400, "step": 70105 }, { "epoch": 12.866581024041109, "grad_norm": 0.007864415645599365, "learning_rate": 3.3999131756757043e-06, "loss": 0.0, "num_input_tokens_seen": 151205632, "step": 70110 }, { "epoch": 12.86749862360066, "grad_norm": 2.384124994277954, "learning_rate": 3.3991545499729175e-06, "loss": 0.0017, "num_input_tokens_seen": 151217568, "step": 70115 }, { "epoch": 12.868416223160214, "grad_norm": 0.049155041575431824, "learning_rate": 3.3983959653293353e-06, "loss": 0.0001, "num_input_tokens_seen": 151228928, "step": 70120 }, { "epoch": 12.869333822719765, "grad_norm": 0.00035758657031692564, "learning_rate": 3.3976374217644138e-06, "loss": 0.0, "num_input_tokens_seen": 151239008, "step": 70125 }, { "epoch": 12.870251422279317, "grad_norm": 0.0028597041964530945, "learning_rate": 3.396878919297607e-06, "loss": 0.0, "num_input_tokens_seen": 151250976, "step": 70130 }, { "epoch": 12.87116902183887, "grad_norm": 0.007492059376090765, "learning_rate": 3.3961204579483736e-06, "loss": 0.0, "num_input_tokens_seen": 151262016, "step": 70135 }, { "epoch": 12.872086621398422, "grad_norm": 0.06849531829357147, "learning_rate": 3.3953620377361648e-06, "loss": 0.0001, "num_input_tokens_seen": 151273056, "step": 70140 }, { "epoch": 12.873004220957974, "grad_norm": 0.0008065891452133656, "learning_rate": 3.3946036586804305e-06, "loss": 0.0, "num_input_tokens_seen": 151283136, "step": 70145 }, { "epoch": 12.873921820517527, "grad_norm": 0.0012516106944531202, "learning_rate": 3.3938453208006262e-06, "loss": 0.0018, "num_input_tokens_seen": 151293472, "step": 70150 }, { "epoch": 12.874839420077079, "grad_norm": 0.0007301251753233373, "learning_rate": 3.3930870241162e-06, "loss": 0.0, "num_input_tokens_seen": 151304096, "step": 70155 }, { "epoch": 12.87575701963663, "grad_norm": 0.0005567360203713179, "learning_rate": 3.3923287686465994e-06, "loss": 0.0014, "num_input_tokens_seen": 151314688, "step": 70160 }, { "epoch": 12.876674619196184, "grad_norm": 0.006262137554585934, "learning_rate": 3.3915705544112764e-06, "loss": 0.0, "num_input_tokens_seen": 151325568, "step": 70165 }, { "epoch": 12.877592218755735, "grad_norm": 0.05433957278728485, "learning_rate": 3.390812381429676e-06, "loss": 0.0001, "num_input_tokens_seen": 151336704, "step": 70170 }, { "epoch": 12.878509818315287, "grad_norm": 0.017413334921002388, "learning_rate": 3.390054249721243e-06, "loss": 0.2068, "num_input_tokens_seen": 151348992, "step": 70175 }, { "epoch": 12.87942741787484, "grad_norm": 0.0056189075112342834, "learning_rate": 3.389296159305422e-06, "loss": 0.0002, "num_input_tokens_seen": 151358592, "step": 70180 }, { "epoch": 12.880345017434392, "grad_norm": 0.004920678213238716, "learning_rate": 3.38853811020166e-06, "loss": 0.001, "num_input_tokens_seen": 151369824, "step": 70185 }, { "epoch": 12.881262616993943, "grad_norm": 0.0007534993346780539, "learning_rate": 3.387780102429398e-06, "loss": 0.0, "num_input_tokens_seen": 151381152, "step": 70190 }, { "epoch": 12.882180216553497, "grad_norm": 0.002877391641959548, "learning_rate": 3.3870221360080766e-06, "loss": 0.0, "num_input_tokens_seen": 151391488, "step": 70195 }, { "epoch": 12.883097816113048, "grad_norm": 0.0042854598723351955, "learning_rate": 3.3862642109571376e-06, "loss": 0.0207, "num_input_tokens_seen": 151401248, "step": 70200 }, { "epoch": 12.8840154156726, "grad_norm": 0.001215594820678234, "learning_rate": 3.385506327296021e-06, "loss": 0.0075, "num_input_tokens_seen": 151413312, "step": 70205 }, { "epoch": 12.884933015232154, "grad_norm": 0.005555022042244673, "learning_rate": 3.384748485044166e-06, "loss": 0.0001, "num_input_tokens_seen": 151424832, "step": 70210 }, { "epoch": 12.885850614791705, "grad_norm": 0.21326370537281036, "learning_rate": 3.3839906842210068e-06, "loss": 0.0001, "num_input_tokens_seen": 151436000, "step": 70215 }, { "epoch": 12.886768214351257, "grad_norm": 0.014246045611798763, "learning_rate": 3.383232924845984e-06, "loss": 0.0003, "num_input_tokens_seen": 151446656, "step": 70220 }, { "epoch": 12.88768581391081, "grad_norm": 0.004464289639145136, "learning_rate": 3.3824752069385293e-06, "loss": 0.1408, "num_input_tokens_seen": 151457344, "step": 70225 }, { "epoch": 12.888603413470362, "grad_norm": 0.001453312928788364, "learning_rate": 3.3817175305180784e-06, "loss": 0.0, "num_input_tokens_seen": 151467872, "step": 70230 }, { "epoch": 12.889521013029913, "grad_norm": 0.0003024667385034263, "learning_rate": 3.3809598956040656e-06, "loss": 0.0041, "num_input_tokens_seen": 151479584, "step": 70235 }, { "epoch": 12.890438612589467, "grad_norm": 0.002257216488942504, "learning_rate": 3.380202302215923e-06, "loss": 0.0, "num_input_tokens_seen": 151489952, "step": 70240 }, { "epoch": 12.891356212149018, "grad_norm": 0.0007338427240028977, "learning_rate": 3.3794447503730787e-06, "loss": 0.0002, "num_input_tokens_seen": 151501152, "step": 70245 }, { "epoch": 12.89227381170857, "grad_norm": 0.0005229536909610033, "learning_rate": 3.3786872400949666e-06, "loss": 0.0, "num_input_tokens_seen": 151511936, "step": 70250 }, { "epoch": 12.893191411268123, "grad_norm": 0.0006623272784054279, "learning_rate": 3.377929771401014e-06, "loss": 0.005, "num_input_tokens_seen": 151523424, "step": 70255 }, { "epoch": 12.894109010827675, "grad_norm": 0.002985229017212987, "learning_rate": 3.3771723443106486e-06, "loss": 0.0, "num_input_tokens_seen": 151535136, "step": 70260 }, { "epoch": 12.895026610387227, "grad_norm": 0.004590536467730999, "learning_rate": 3.376414958843296e-06, "loss": 0.0001, "num_input_tokens_seen": 151545152, "step": 70265 }, { "epoch": 12.89594420994678, "grad_norm": 0.004176843445748091, "learning_rate": 3.375657615018385e-06, "loss": 0.0, "num_input_tokens_seen": 151557728, "step": 70270 }, { "epoch": 12.896861809506332, "grad_norm": 0.0010959352366626263, "learning_rate": 3.374900312855339e-06, "loss": 0.0005, "num_input_tokens_seen": 151568864, "step": 70275 }, { "epoch": 12.897779409065883, "grad_norm": 0.04433906450867653, "learning_rate": 3.3741430523735787e-06, "loss": 0.0, "num_input_tokens_seen": 151579872, "step": 70280 }, { "epoch": 12.898697008625437, "grad_norm": 0.025145063176751137, "learning_rate": 3.3733858335925317e-06, "loss": 0.0, "num_input_tokens_seen": 151590752, "step": 70285 }, { "epoch": 12.899614608184988, "grad_norm": 0.003204175503924489, "learning_rate": 3.372628656531617e-06, "loss": 0.0, "num_input_tokens_seen": 151602144, "step": 70290 }, { "epoch": 12.90053220774454, "grad_norm": 0.0050011430867016315, "learning_rate": 3.371871521210253e-06, "loss": 0.0, "num_input_tokens_seen": 151613504, "step": 70295 }, { "epoch": 12.901449807304093, "grad_norm": 4.935776710510254, "learning_rate": 3.371114427647863e-06, "loss": 0.0014, "num_input_tokens_seen": 151623872, "step": 70300 }, { "epoch": 12.902367406863645, "grad_norm": 0.000548415060620755, "learning_rate": 3.3703573758638635e-06, "loss": 0.0, "num_input_tokens_seen": 151634368, "step": 70305 }, { "epoch": 12.903285006423197, "grad_norm": 0.09212254732847214, "learning_rate": 3.3696003658776717e-06, "loss": 0.0001, "num_input_tokens_seen": 151645440, "step": 70310 }, { "epoch": 12.90420260598275, "grad_norm": 0.0010354851838201284, "learning_rate": 3.368843397708702e-06, "loss": 0.0003, "num_input_tokens_seen": 151655168, "step": 70315 }, { "epoch": 12.905120205542302, "grad_norm": 0.00016487443645019084, "learning_rate": 3.368086471376373e-06, "loss": 0.0, "num_input_tokens_seen": 151664992, "step": 70320 }, { "epoch": 12.906037805101853, "grad_norm": 0.056730516254901886, "learning_rate": 3.3673295869000956e-06, "loss": 0.0001, "num_input_tokens_seen": 151676160, "step": 70325 }, { "epoch": 12.906955404661407, "grad_norm": 0.0005868096486665308, "learning_rate": 3.3665727442992833e-06, "loss": 0.0, "num_input_tokens_seen": 151687520, "step": 70330 }, { "epoch": 12.907873004220958, "grad_norm": 0.00015526835341006517, "learning_rate": 3.3658159435933503e-06, "loss": 0.056, "num_input_tokens_seen": 151698592, "step": 70335 }, { "epoch": 12.90879060378051, "grad_norm": 0.0015602674102410674, "learning_rate": 3.365059184801705e-06, "loss": 0.0001, "num_input_tokens_seen": 151709184, "step": 70340 }, { "epoch": 12.909708203340063, "grad_norm": 0.0011751772835850716, "learning_rate": 3.364302467943758e-06, "loss": 0.0, "num_input_tokens_seen": 151719936, "step": 70345 }, { "epoch": 12.910625802899615, "grad_norm": 0.14105257391929626, "learning_rate": 3.3635457930389153e-06, "loss": 0.0002, "num_input_tokens_seen": 151731648, "step": 70350 }, { "epoch": 12.911543402459166, "grad_norm": 0.33552682399749756, "learning_rate": 3.362789160106589e-06, "loss": 0.0002, "num_input_tokens_seen": 151743392, "step": 70355 }, { "epoch": 12.91246100201872, "grad_norm": 0.049943506717681885, "learning_rate": 3.3620325691661833e-06, "loss": 0.0001, "num_input_tokens_seen": 151753952, "step": 70360 }, { "epoch": 12.913378601578271, "grad_norm": 0.004368698224425316, "learning_rate": 3.3612760202371008e-06, "loss": 0.0, "num_input_tokens_seen": 151764832, "step": 70365 }, { "epoch": 12.914296201137823, "grad_norm": 0.0017579065170139074, "learning_rate": 3.3605195133387516e-06, "loss": 0.0, "num_input_tokens_seen": 151776864, "step": 70370 }, { "epoch": 12.915213800697376, "grad_norm": 0.000994154717773199, "learning_rate": 3.3597630484905356e-06, "loss": 0.0001, "num_input_tokens_seen": 151787232, "step": 70375 }, { "epoch": 12.916131400256928, "grad_norm": 0.0005175431142561138, "learning_rate": 3.359006625711854e-06, "loss": 0.0883, "num_input_tokens_seen": 151797824, "step": 70380 }, { "epoch": 12.91704899981648, "grad_norm": 0.005412801168859005, "learning_rate": 3.35825024502211e-06, "loss": 0.0, "num_input_tokens_seen": 151808576, "step": 70385 }, { "epoch": 12.917966599376033, "grad_norm": 0.009959348477423191, "learning_rate": 3.357493906440703e-06, "loss": 0.1283, "num_input_tokens_seen": 151820160, "step": 70390 }, { "epoch": 12.918884198935585, "grad_norm": 0.008235971443355083, "learning_rate": 3.3567376099870318e-06, "loss": 0.0009, "num_input_tokens_seen": 151830240, "step": 70395 }, { "epoch": 12.919801798495136, "grad_norm": 0.0037630994338542223, "learning_rate": 3.3559813556804922e-06, "loss": 0.0001, "num_input_tokens_seen": 151841088, "step": 70400 }, { "epoch": 12.92071939805469, "grad_norm": 0.004881796892732382, "learning_rate": 3.3552251435404844e-06, "loss": 0.0001, "num_input_tokens_seen": 151851488, "step": 70405 }, { "epoch": 12.921636997614241, "grad_norm": 0.00025752896908670664, "learning_rate": 3.354468973586403e-06, "loss": 0.0002, "num_input_tokens_seen": 151863264, "step": 70410 }, { "epoch": 12.922554597173793, "grad_norm": 0.0003534155257511884, "learning_rate": 3.3537128458376407e-06, "loss": 0.0, "num_input_tokens_seen": 151873312, "step": 70415 }, { "epoch": 12.923472196733346, "grad_norm": 0.0008057120139710605, "learning_rate": 3.3529567603135925e-06, "loss": 0.0001, "num_input_tokens_seen": 151884480, "step": 70420 }, { "epoch": 12.924389796292898, "grad_norm": 0.00045463812421076, "learning_rate": 3.352200717033652e-06, "loss": 0.0002, "num_input_tokens_seen": 151894400, "step": 70425 }, { "epoch": 12.92530739585245, "grad_norm": 0.000146481252158992, "learning_rate": 3.3514447160172077e-06, "loss": 0.0, "num_input_tokens_seen": 151905248, "step": 70430 }, { "epoch": 12.926224995412003, "grad_norm": 0.005011278670281172, "learning_rate": 3.350688757283653e-06, "loss": 0.0, "num_input_tokens_seen": 151917536, "step": 70435 }, { "epoch": 12.927142594971555, "grad_norm": 12.794739723205566, "learning_rate": 3.3499328408523748e-06, "loss": 0.0025, "num_input_tokens_seen": 151928384, "step": 70440 }, { "epoch": 12.928060194531106, "grad_norm": 0.00032143996213562787, "learning_rate": 3.349176966742761e-06, "loss": 0.0, "num_input_tokens_seen": 151938912, "step": 70445 }, { "epoch": 12.92897779409066, "grad_norm": 0.0010866356315091252, "learning_rate": 3.3484211349742003e-06, "loss": 0.0, "num_input_tokens_seen": 151950144, "step": 70450 }, { "epoch": 12.929895393650211, "grad_norm": 0.025065084919333458, "learning_rate": 3.3476653455660777e-06, "loss": 0.0, "num_input_tokens_seen": 151960224, "step": 70455 }, { "epoch": 12.930812993209763, "grad_norm": 0.05372791364789009, "learning_rate": 3.3469095985377786e-06, "loss": 0.1035, "num_input_tokens_seen": 151971744, "step": 70460 }, { "epoch": 12.931730592769316, "grad_norm": 0.00105217972304672, "learning_rate": 3.3461538939086844e-06, "loss": 0.0007, "num_input_tokens_seen": 151983616, "step": 70465 }, { "epoch": 12.932648192328868, "grad_norm": 0.0011912090703845024, "learning_rate": 3.3453982316981815e-06, "loss": 0.0, "num_input_tokens_seen": 151994560, "step": 70470 }, { "epoch": 12.93356579188842, "grad_norm": 0.00048769600107334554, "learning_rate": 3.3446426119256493e-06, "loss": 0.0, "num_input_tokens_seen": 152005536, "step": 70475 }, { "epoch": 12.934483391447973, "grad_norm": 0.0005196155980229378, "learning_rate": 3.343887034610467e-06, "loss": 0.0, "num_input_tokens_seen": 152016384, "step": 70480 }, { "epoch": 12.935400991007524, "grad_norm": 0.00030019544647075236, "learning_rate": 3.343131499772017e-06, "loss": 0.0, "num_input_tokens_seen": 152027648, "step": 70485 }, { "epoch": 12.936318590567076, "grad_norm": 0.00018837083189282566, "learning_rate": 3.3423760074296764e-06, "loss": 0.0001, "num_input_tokens_seen": 152038816, "step": 70490 }, { "epoch": 12.93723619012663, "grad_norm": 0.018475519493222237, "learning_rate": 3.341620557602822e-06, "loss": 0.0, "num_input_tokens_seen": 152048768, "step": 70495 }, { "epoch": 12.938153789686181, "grad_norm": 0.08888403326272964, "learning_rate": 3.340865150310828e-06, "loss": 0.0002, "num_input_tokens_seen": 152058784, "step": 70500 }, { "epoch": 12.939071389245733, "grad_norm": 0.0008728248649276793, "learning_rate": 3.3401097855730735e-06, "loss": 0.0119, "num_input_tokens_seen": 152070688, "step": 70505 }, { "epoch": 12.939988988805286, "grad_norm": 0.0008557557011954486, "learning_rate": 3.3393544634089304e-06, "loss": 0.0, "num_input_tokens_seen": 152080960, "step": 70510 }, { "epoch": 12.940906588364838, "grad_norm": 0.0004806608776561916, "learning_rate": 3.338599183837771e-06, "loss": 0.0, "num_input_tokens_seen": 152092000, "step": 70515 }, { "epoch": 12.94182418792439, "grad_norm": 0.0003957160224672407, "learning_rate": 3.337843946878967e-06, "loss": 0.0, "num_input_tokens_seen": 152102144, "step": 70520 }, { "epoch": 12.942741787483943, "grad_norm": 0.007856546901166439, "learning_rate": 3.337088752551891e-06, "loss": 0.0, "num_input_tokens_seen": 152112512, "step": 70525 }, { "epoch": 12.943659387043494, "grad_norm": 0.00041937732021324337, "learning_rate": 3.3363336008759113e-06, "loss": 0.0003, "num_input_tokens_seen": 152124032, "step": 70530 }, { "epoch": 12.944576986603046, "grad_norm": 0.02033270336687565, "learning_rate": 3.335578491870395e-06, "loss": 0.0, "num_input_tokens_seen": 152134784, "step": 70535 }, { "epoch": 12.9454945861626, "grad_norm": 0.0007070584106259048, "learning_rate": 3.3348234255547117e-06, "loss": 0.0001, "num_input_tokens_seen": 152145792, "step": 70540 }, { "epoch": 12.946412185722151, "grad_norm": 0.00018599066243041307, "learning_rate": 3.3340684019482263e-06, "loss": 0.0, "num_input_tokens_seen": 152155296, "step": 70545 }, { "epoch": 12.947329785281703, "grad_norm": 0.00021499207650776953, "learning_rate": 3.333313421070303e-06, "loss": 0.0, "num_input_tokens_seen": 152165088, "step": 70550 }, { "epoch": 12.948247384841256, "grad_norm": 0.0002296057646162808, "learning_rate": 3.33255848294031e-06, "loss": 0.0, "num_input_tokens_seen": 152175616, "step": 70555 }, { "epoch": 12.949164984400808, "grad_norm": 0.0009029562352225184, "learning_rate": 3.3318035875776066e-06, "loss": 0.0, "num_input_tokens_seen": 152186720, "step": 70560 }, { "epoch": 12.95008258396036, "grad_norm": 0.003571348264813423, "learning_rate": 3.331048735001554e-06, "loss": 0.0, "num_input_tokens_seen": 152197056, "step": 70565 }, { "epoch": 12.951000183519913, "grad_norm": 0.00036968974745832384, "learning_rate": 3.330293925231517e-06, "loss": 0.0, "num_input_tokens_seen": 152207776, "step": 70570 }, { "epoch": 12.951917783079464, "grad_norm": 0.0018940826412290335, "learning_rate": 3.3295391582868532e-06, "loss": 0.0, "num_input_tokens_seen": 152218240, "step": 70575 }, { "epoch": 12.952835382639016, "grad_norm": 0.0010412555420771241, "learning_rate": 3.32878443418692e-06, "loss": 0.0032, "num_input_tokens_seen": 152228160, "step": 70580 }, { "epoch": 12.95375298219857, "grad_norm": 0.06858919560909271, "learning_rate": 3.3280297529510754e-06, "loss": 0.0, "num_input_tokens_seen": 152239392, "step": 70585 }, { "epoch": 12.954670581758121, "grad_norm": 0.0005969590274617076, "learning_rate": 3.327275114598677e-06, "loss": 0.0, "num_input_tokens_seen": 152250624, "step": 70590 }, { "epoch": 12.955588181317673, "grad_norm": 0.0012134596472606063, "learning_rate": 3.32652051914908e-06, "loss": 0.0002, "num_input_tokens_seen": 152262176, "step": 70595 }, { "epoch": 12.956505780877226, "grad_norm": 0.0006664580432698131, "learning_rate": 3.3257659666216358e-06, "loss": 0.0, "num_input_tokens_seen": 152273152, "step": 70600 }, { "epoch": 12.957423380436778, "grad_norm": 0.0035244780592620373, "learning_rate": 3.325011457035702e-06, "loss": 0.0, "num_input_tokens_seen": 152284160, "step": 70605 }, { "epoch": 12.95834097999633, "grad_norm": 2.2426648139953613, "learning_rate": 3.324256990410628e-06, "loss": 0.0008, "num_input_tokens_seen": 152295840, "step": 70610 }, { "epoch": 12.959258579555883, "grad_norm": 0.0171796977519989, "learning_rate": 3.323502566765763e-06, "loss": 0.0001, "num_input_tokens_seen": 152308000, "step": 70615 }, { "epoch": 12.960176179115434, "grad_norm": 0.00025974703021347523, "learning_rate": 3.322748186120461e-06, "loss": 0.0, "num_input_tokens_seen": 152318336, "step": 70620 }, { "epoch": 12.961093778674986, "grad_norm": 0.0005826981505379081, "learning_rate": 3.3219938484940682e-06, "loss": 0.0001, "num_input_tokens_seen": 152329696, "step": 70625 }, { "epoch": 12.96201137823454, "grad_norm": 0.0006125592044554651, "learning_rate": 3.321239553905933e-06, "loss": 0.0, "num_input_tokens_seen": 152341600, "step": 70630 }, { "epoch": 12.96292897779409, "grad_norm": 0.0007404187344945967, "learning_rate": 3.3204853023754004e-06, "loss": 0.0, "num_input_tokens_seen": 152352928, "step": 70635 }, { "epoch": 12.963846577353642, "grad_norm": 0.00018391877529211342, "learning_rate": 3.3197310939218164e-06, "loss": 0.0, "num_input_tokens_seen": 152363712, "step": 70640 }, { "epoch": 12.964764176913196, "grad_norm": 0.0008175958064384758, "learning_rate": 3.3189769285645268e-06, "loss": 0.2945, "num_input_tokens_seen": 152374688, "step": 70645 }, { "epoch": 12.965681776472747, "grad_norm": 0.0004794804553966969, "learning_rate": 3.3182228063228726e-06, "loss": 0.0, "num_input_tokens_seen": 152384160, "step": 70650 }, { "epoch": 12.966599376032299, "grad_norm": 0.007717987522482872, "learning_rate": 3.317468727216198e-06, "loss": 0.0, "num_input_tokens_seen": 152395136, "step": 70655 }, { "epoch": 12.967516975591852, "grad_norm": 0.007851507514715195, "learning_rate": 3.316714691263843e-06, "loss": 0.0001, "num_input_tokens_seen": 152405952, "step": 70660 }, { "epoch": 12.968434575151404, "grad_norm": 0.0013170003658160567, "learning_rate": 3.315960698485147e-06, "loss": 0.0, "num_input_tokens_seen": 152418080, "step": 70665 }, { "epoch": 12.969352174710956, "grad_norm": 0.00042361748637631536, "learning_rate": 3.3152067488994477e-06, "loss": 0.0023, "num_input_tokens_seen": 152429632, "step": 70670 }, { "epoch": 12.970269774270509, "grad_norm": 0.020389826968312263, "learning_rate": 3.3144528425260854e-06, "loss": 0.0001, "num_input_tokens_seen": 152439584, "step": 70675 }, { "epoch": 12.97118737383006, "grad_norm": 0.0027885250747203827, "learning_rate": 3.3136989793843953e-06, "loss": 0.0, "num_input_tokens_seen": 152451072, "step": 70680 }, { "epoch": 12.972104973389612, "grad_norm": 0.0001716077676974237, "learning_rate": 3.312945159493711e-06, "loss": 0.0, "num_input_tokens_seen": 152461408, "step": 70685 }, { "epoch": 12.973022572949166, "grad_norm": 0.0006118517485447228, "learning_rate": 3.312191382873371e-06, "loss": 0.0, "num_input_tokens_seen": 152470784, "step": 70690 }, { "epoch": 12.973940172508717, "grad_norm": 0.0006967115332372487, "learning_rate": 3.3114376495427057e-06, "loss": 0.0, "num_input_tokens_seen": 152481376, "step": 70695 }, { "epoch": 12.974857772068269, "grad_norm": 0.006794114597141743, "learning_rate": 3.3106839595210462e-06, "loss": 0.0, "num_input_tokens_seen": 152492896, "step": 70700 }, { "epoch": 12.975775371627822, "grad_norm": 0.0012575555592775345, "learning_rate": 3.3099303128277266e-06, "loss": 0.001, "num_input_tokens_seen": 152501440, "step": 70705 }, { "epoch": 12.976692971187374, "grad_norm": 0.00019433714624028653, "learning_rate": 3.309176709482075e-06, "loss": 0.0, "num_input_tokens_seen": 152511648, "step": 70710 }, { "epoch": 12.977610570746926, "grad_norm": 0.0003588880645111203, "learning_rate": 3.3084231495034204e-06, "loss": 0.0, "num_input_tokens_seen": 152521952, "step": 70715 }, { "epoch": 12.978528170306479, "grad_norm": 0.0007599809905514121, "learning_rate": 3.307669632911088e-06, "loss": 0.0, "num_input_tokens_seen": 152531808, "step": 70720 }, { "epoch": 12.97944576986603, "grad_norm": 0.00020829227287322283, "learning_rate": 3.306916159724409e-06, "loss": 0.0, "num_input_tokens_seen": 152542400, "step": 70725 }, { "epoch": 12.980363369425582, "grad_norm": 0.00022656723740510643, "learning_rate": 3.3061627299627063e-06, "loss": 0.0001, "num_input_tokens_seen": 152552960, "step": 70730 }, { "epoch": 12.981280968985136, "grad_norm": 0.0007311895024031401, "learning_rate": 3.305409343645304e-06, "loss": 0.0376, "num_input_tokens_seen": 152564160, "step": 70735 }, { "epoch": 12.982198568544687, "grad_norm": 0.005190527997910976, "learning_rate": 3.304656000791525e-06, "loss": 0.0, "num_input_tokens_seen": 152575008, "step": 70740 }, { "epoch": 12.983116168104239, "grad_norm": 43.7509765625, "learning_rate": 3.303902701420693e-06, "loss": 0.2663, "num_input_tokens_seen": 152584480, "step": 70745 }, { "epoch": 12.984033767663792, "grad_norm": 0.0009398841648362577, "learning_rate": 3.3031494455521273e-06, "loss": 0.0, "num_input_tokens_seen": 152595328, "step": 70750 }, { "epoch": 12.984951367223344, "grad_norm": 0.0006059270235709846, "learning_rate": 3.3023962332051494e-06, "loss": 0.0, "num_input_tokens_seen": 152606176, "step": 70755 }, { "epoch": 12.985868966782895, "grad_norm": 0.0006616376340389252, "learning_rate": 3.3016430643990773e-06, "loss": 0.0, "num_input_tokens_seen": 152616864, "step": 70760 }, { "epoch": 12.986786566342449, "grad_norm": 0.000968037755228579, "learning_rate": 3.3008899391532266e-06, "loss": 0.0, "num_input_tokens_seen": 152626464, "step": 70765 }, { "epoch": 12.987704165902, "grad_norm": 0.17668001353740692, "learning_rate": 3.3001368574869158e-06, "loss": 0.0019, "num_input_tokens_seen": 152636032, "step": 70770 }, { "epoch": 12.988621765461552, "grad_norm": 22.608224868774414, "learning_rate": 3.2993838194194617e-06, "loss": 0.0007, "num_input_tokens_seen": 152647168, "step": 70775 }, { "epoch": 12.989539365021106, "grad_norm": 0.00016241749108303338, "learning_rate": 3.298630824970176e-06, "loss": 0.0, "num_input_tokens_seen": 152657344, "step": 70780 }, { "epoch": 12.990456964580657, "grad_norm": 0.000265798851614818, "learning_rate": 3.2978778741583717e-06, "loss": 0.0, "num_input_tokens_seen": 152668192, "step": 70785 }, { "epoch": 12.991374564140209, "grad_norm": 0.00018383623682893813, "learning_rate": 3.2971249670033633e-06, "loss": 0.0, "num_input_tokens_seen": 152679488, "step": 70790 }, { "epoch": 12.992292163699762, "grad_norm": 0.014338061213493347, "learning_rate": 3.29637210352446e-06, "loss": 0.0, "num_input_tokens_seen": 152690272, "step": 70795 }, { "epoch": 12.993209763259314, "grad_norm": 0.000612053438089788, "learning_rate": 3.2956192837409705e-06, "loss": 0.0913, "num_input_tokens_seen": 152700384, "step": 70800 }, { "epoch": 12.994127362818865, "grad_norm": 0.0016838941955938935, "learning_rate": 3.2948665076722064e-06, "loss": 0.0, "num_input_tokens_seen": 152711168, "step": 70805 }, { "epoch": 12.995044962378419, "grad_norm": 0.0005451967590488493, "learning_rate": 3.294113775337474e-06, "loss": 0.0, "num_input_tokens_seen": 152721664, "step": 70810 }, { "epoch": 12.99596256193797, "grad_norm": 0.006054308731108904, "learning_rate": 3.2933610867560796e-06, "loss": 0.0001, "num_input_tokens_seen": 152732032, "step": 70815 }, { "epoch": 12.996880161497522, "grad_norm": 0.028415413573384285, "learning_rate": 3.292608441947326e-06, "loss": 0.0001, "num_input_tokens_seen": 152743520, "step": 70820 }, { "epoch": 12.997797761057075, "grad_norm": 0.0037086186930537224, "learning_rate": 3.2918558409305213e-06, "loss": 0.0, "num_input_tokens_seen": 152753824, "step": 70825 }, { "epoch": 12.998715360616627, "grad_norm": 0.0011249257950112224, "learning_rate": 3.291103283724967e-06, "loss": 0.0018, "num_input_tokens_seen": 152764352, "step": 70830 }, { "epoch": 12.999632960176179, "grad_norm": 0.01771349459886551, "learning_rate": 3.2903507703499625e-06, "loss": 0.0, "num_input_tokens_seen": 152774784, "step": 70835 }, { "epoch": 13.000550559735732, "grad_norm": 0.002093605464324355, "learning_rate": 3.2895983008248144e-06, "loss": 0.0, "num_input_tokens_seen": 152784240, "step": 70840 }, { "epoch": 13.001468159295284, "grad_norm": 0.0003122987982351333, "learning_rate": 3.2888458751688177e-06, "loss": 0.0002, "num_input_tokens_seen": 152794800, "step": 70845 }, { "epoch": 13.002385758854835, "grad_norm": 0.0007108231075108051, "learning_rate": 3.2880934934012733e-06, "loss": 0.0, "num_input_tokens_seen": 152805392, "step": 70850 }, { "epoch": 13.003303358414389, "grad_norm": 0.003199406899511814, "learning_rate": 3.2873411555414757e-06, "loss": 0.0001, "num_input_tokens_seen": 152815984, "step": 70855 }, { "epoch": 13.00422095797394, "grad_norm": 0.07058833539485931, "learning_rate": 3.286588861608724e-06, "loss": 0.0002, "num_input_tokens_seen": 152826320, "step": 70860 }, { "epoch": 13.005138557533492, "grad_norm": 0.0005756879108957946, "learning_rate": 3.2858366116223124e-06, "loss": 0.0001, "num_input_tokens_seen": 152836464, "step": 70865 }, { "epoch": 13.006056157093045, "grad_norm": 0.010351709090173244, "learning_rate": 3.285084405601534e-06, "loss": 0.0, "num_input_tokens_seen": 152848592, "step": 70870 }, { "epoch": 13.006973756652597, "grad_norm": 0.0037684435956180096, "learning_rate": 3.2843322435656844e-06, "loss": 0.0, "num_input_tokens_seen": 152860016, "step": 70875 }, { "epoch": 13.007891356212149, "grad_norm": 0.0007991405436769128, "learning_rate": 3.283580125534054e-06, "loss": 0.0001, "num_input_tokens_seen": 152871504, "step": 70880 }, { "epoch": 13.008808955771702, "grad_norm": 0.002735954010859132, "learning_rate": 3.2828280515259303e-06, "loss": 0.0, "num_input_tokens_seen": 152881904, "step": 70885 }, { "epoch": 13.009726555331254, "grad_norm": 0.0012966160429641604, "learning_rate": 3.282076021560608e-06, "loss": 0.0097, "num_input_tokens_seen": 152892976, "step": 70890 }, { "epoch": 13.010644154890805, "grad_norm": 0.0002558068372309208, "learning_rate": 3.2813240356573732e-06, "loss": 0.1969, "num_input_tokens_seen": 152903504, "step": 70895 }, { "epoch": 13.011561754450359, "grad_norm": 0.0006914164987392724, "learning_rate": 3.2805720938355136e-06, "loss": 0.0, "num_input_tokens_seen": 152914000, "step": 70900 }, { "epoch": 13.01247935400991, "grad_norm": 0.0004968271823599935, "learning_rate": 3.279820196114312e-06, "loss": 0.0, "num_input_tokens_seen": 152925488, "step": 70905 }, { "epoch": 13.013396953569462, "grad_norm": 0.00037894712295383215, "learning_rate": 3.279068342513059e-06, "loss": 0.0, "num_input_tokens_seen": 152936048, "step": 70910 }, { "epoch": 13.014314553129015, "grad_norm": 0.0013120127841830254, "learning_rate": 3.2783165330510356e-06, "loss": 0.0, "num_input_tokens_seen": 152946256, "step": 70915 }, { "epoch": 13.015232152688567, "grad_norm": 0.0002765356039162725, "learning_rate": 3.277564767747523e-06, "loss": 0.0, "num_input_tokens_seen": 152957360, "step": 70920 }, { "epoch": 13.016149752248118, "grad_norm": 0.0012110961833968759, "learning_rate": 3.2768130466218063e-06, "loss": 0.0001, "num_input_tokens_seen": 152967472, "step": 70925 }, { "epoch": 13.017067351807672, "grad_norm": 0.020175954326987267, "learning_rate": 3.276061369693165e-06, "loss": 0.0, "num_input_tokens_seen": 152978576, "step": 70930 }, { "epoch": 13.017984951367223, "grad_norm": 0.0012143146013841033, "learning_rate": 3.275309736980875e-06, "loss": 0.0001, "num_input_tokens_seen": 152988048, "step": 70935 }, { "epoch": 13.018902550926775, "grad_norm": 0.003853460540995002, "learning_rate": 3.274558148504219e-06, "loss": 0.0, "num_input_tokens_seen": 152999344, "step": 70940 }, { "epoch": 13.019820150486328, "grad_norm": 0.0007291637011803687, "learning_rate": 3.273806604282473e-06, "loss": 0.0, "num_input_tokens_seen": 153010416, "step": 70945 }, { "epoch": 13.02073775004588, "grad_norm": 0.004692634101957083, "learning_rate": 3.273055104334911e-06, "loss": 0.0, "num_input_tokens_seen": 153021456, "step": 70950 }, { "epoch": 13.021655349605432, "grad_norm": 0.0002798406349029392, "learning_rate": 3.2723036486808096e-06, "loss": 0.0974, "num_input_tokens_seen": 153033200, "step": 70955 }, { "epoch": 13.022572949164985, "grad_norm": 0.0013325595064088702, "learning_rate": 3.2715522373394417e-06, "loss": 0.0, "num_input_tokens_seen": 153043344, "step": 70960 }, { "epoch": 13.023490548724537, "grad_norm": 0.018874887377023697, "learning_rate": 3.27080087033008e-06, "loss": 0.0132, "num_input_tokens_seen": 153053520, "step": 70965 }, { "epoch": 13.024408148284088, "grad_norm": 0.0003323715354781598, "learning_rate": 3.2700495476719956e-06, "loss": 0.0, "num_input_tokens_seen": 153065072, "step": 70970 }, { "epoch": 13.025325747843642, "grad_norm": 0.2043018341064453, "learning_rate": 3.26929826938446e-06, "loss": 0.0002, "num_input_tokens_seen": 153075088, "step": 70975 }, { "epoch": 13.026243347403193, "grad_norm": 0.7790470123291016, "learning_rate": 3.2685470354867417e-06, "loss": 0.0003, "num_input_tokens_seen": 153086480, "step": 70980 }, { "epoch": 13.027160946962745, "grad_norm": 0.0066949729807674885, "learning_rate": 3.2677958459981076e-06, "loss": 0.0, "num_input_tokens_seen": 153098000, "step": 70985 }, { "epoch": 13.028078546522298, "grad_norm": 0.010288679040968418, "learning_rate": 3.267044700937825e-06, "loss": 0.0002, "num_input_tokens_seen": 153109392, "step": 70990 }, { "epoch": 13.02899614608185, "grad_norm": 0.0002819158835336566, "learning_rate": 3.266293600325161e-06, "loss": 0.0, "num_input_tokens_seen": 153119536, "step": 70995 }, { "epoch": 13.029913745641402, "grad_norm": 0.2749241292476654, "learning_rate": 3.2655425441793788e-06, "loss": 0.0001, "num_input_tokens_seen": 153128368, "step": 71000 }, { "epoch": 13.030831345200955, "grad_norm": 0.000988077255897224, "learning_rate": 3.264791532519741e-06, "loss": 0.0002, "num_input_tokens_seen": 153138672, "step": 71005 }, { "epoch": 13.031748944760507, "grad_norm": 0.0027723105158656836, "learning_rate": 3.264040565365512e-06, "loss": 0.0008, "num_input_tokens_seen": 153149680, "step": 71010 }, { "epoch": 13.032666544320058, "grad_norm": 0.006047102622687817, "learning_rate": 3.2632896427359527e-06, "loss": 0.0, "num_input_tokens_seen": 153161200, "step": 71015 }, { "epoch": 13.033584143879612, "grad_norm": 0.002573334379121661, "learning_rate": 3.2625387646503202e-06, "loss": 0.0001, "num_input_tokens_seen": 153171536, "step": 71020 }, { "epoch": 13.034501743439163, "grad_norm": 0.0006848388002254069, "learning_rate": 3.2617879311278776e-06, "loss": 0.0001, "num_input_tokens_seen": 153180880, "step": 71025 }, { "epoch": 13.035419342998715, "grad_norm": 0.07843419164419174, "learning_rate": 3.2610371421878813e-06, "loss": 0.0001, "num_input_tokens_seen": 153191760, "step": 71030 }, { "epoch": 13.036336942558268, "grad_norm": 0.0002198181755375117, "learning_rate": 3.2602863978495864e-06, "loss": 0.0, "num_input_tokens_seen": 153203024, "step": 71035 }, { "epoch": 13.03725454211782, "grad_norm": 0.0012465008767321706, "learning_rate": 3.259535698132247e-06, "loss": 0.0, "num_input_tokens_seen": 153214096, "step": 71040 }, { "epoch": 13.038172141677371, "grad_norm": 0.0015062594320625067, "learning_rate": 3.2587850430551216e-06, "loss": 0.0, "num_input_tokens_seen": 153223760, "step": 71045 }, { "epoch": 13.039089741236925, "grad_norm": 0.0007234315271489322, "learning_rate": 3.2580344326374613e-06, "loss": 0.0, "num_input_tokens_seen": 153234416, "step": 71050 }, { "epoch": 13.040007340796476, "grad_norm": 0.0057275728322565556, "learning_rate": 3.2572838668985176e-06, "loss": 0.0, "num_input_tokens_seen": 153246064, "step": 71055 }, { "epoch": 13.040924940356028, "grad_norm": 0.00046193573507480323, "learning_rate": 3.256533345857541e-06, "loss": 0.0001, "num_input_tokens_seen": 153255504, "step": 71060 }, { "epoch": 13.041842539915582, "grad_norm": 0.032806430011987686, "learning_rate": 3.255782869533783e-06, "loss": 0.0, "num_input_tokens_seen": 153266800, "step": 71065 }, { "epoch": 13.042760139475133, "grad_norm": 0.005149275064468384, "learning_rate": 3.25503243794649e-06, "loss": 0.0006, "num_input_tokens_seen": 153276848, "step": 71070 }, { "epoch": 13.043677739034685, "grad_norm": 0.0004990362795069814, "learning_rate": 3.254282051114912e-06, "loss": 0.0, "num_input_tokens_seen": 153289296, "step": 71075 }, { "epoch": 13.044595338594238, "grad_norm": 0.00022822791652288288, "learning_rate": 3.253531709058293e-06, "loss": 0.0, "num_input_tokens_seen": 153299312, "step": 71080 }, { "epoch": 13.04551293815379, "grad_norm": 0.03858164697885513, "learning_rate": 3.2527814117958785e-06, "loss": 0.0071, "num_input_tokens_seen": 153310320, "step": 71085 }, { "epoch": 13.046430537713341, "grad_norm": 0.00020055162895005196, "learning_rate": 3.252031159346912e-06, "loss": 0.0, "num_input_tokens_seen": 153321168, "step": 71090 }, { "epoch": 13.047348137272895, "grad_norm": 0.000350077694747597, "learning_rate": 3.2512809517306398e-06, "loss": 0.0, "num_input_tokens_seen": 153331184, "step": 71095 }, { "epoch": 13.048265736832446, "grad_norm": 0.16603779792785645, "learning_rate": 3.2505307889662998e-06, "loss": 0.0001, "num_input_tokens_seen": 153340080, "step": 71100 }, { "epoch": 13.049183336391998, "grad_norm": 0.014468430541455746, "learning_rate": 3.2497806710731316e-06, "loss": 0.0, "num_input_tokens_seen": 153349808, "step": 71105 }, { "epoch": 13.050100935951551, "grad_norm": 0.0029287554789334536, "learning_rate": 3.2490305980703787e-06, "loss": 0.0, "num_input_tokens_seen": 153360464, "step": 71110 }, { "epoch": 13.051018535511103, "grad_norm": 0.0004256962856743485, "learning_rate": 3.2482805699772774e-06, "loss": 0.0, "num_input_tokens_seen": 153371472, "step": 71115 }, { "epoch": 13.051936135070655, "grad_norm": 0.0005668209632858634, "learning_rate": 3.247530586813065e-06, "loss": 0.0, "num_input_tokens_seen": 153380816, "step": 71120 }, { "epoch": 13.052853734630208, "grad_norm": 0.2466602921485901, "learning_rate": 3.2467806485969737e-06, "loss": 0.0005, "num_input_tokens_seen": 153390544, "step": 71125 }, { "epoch": 13.05377133418976, "grad_norm": 0.00019824202172458172, "learning_rate": 3.2460307553482447e-06, "loss": 0.0, "num_input_tokens_seen": 153401712, "step": 71130 }, { "epoch": 13.054688933749311, "grad_norm": 0.0008382601663470268, "learning_rate": 3.245280907086108e-06, "loss": 0.0, "num_input_tokens_seen": 153413360, "step": 71135 }, { "epoch": 13.055606533308865, "grad_norm": 0.00023877009516581893, "learning_rate": 3.2445311038297944e-06, "loss": 0.0, "num_input_tokens_seen": 153423952, "step": 71140 }, { "epoch": 13.056524132868416, "grad_norm": 0.29821598529815674, "learning_rate": 3.243781345598539e-06, "loss": 0.0001, "num_input_tokens_seen": 153435440, "step": 71145 }, { "epoch": 13.057441732427968, "grad_norm": 0.0001633449282962829, "learning_rate": 3.243031632411571e-06, "loss": 0.0, "num_input_tokens_seen": 153446640, "step": 71150 }, { "epoch": 13.058359331987521, "grad_norm": 0.0006013578386045992, "learning_rate": 3.2422819642881154e-06, "loss": 0.0007, "num_input_tokens_seen": 153457552, "step": 71155 }, { "epoch": 13.059276931547073, "grad_norm": 0.0022771370131522417, "learning_rate": 3.2415323412474066e-06, "loss": 0.0, "num_input_tokens_seen": 153468592, "step": 71160 }, { "epoch": 13.060194531106625, "grad_norm": 0.009569021873176098, "learning_rate": 3.2407827633086662e-06, "loss": 0.0, "num_input_tokens_seen": 153478032, "step": 71165 }, { "epoch": 13.061112130666178, "grad_norm": 0.19712716341018677, "learning_rate": 3.240033230491123e-06, "loss": 0.0001, "num_input_tokens_seen": 153489456, "step": 71170 }, { "epoch": 13.06202973022573, "grad_norm": 0.0005026665166951716, "learning_rate": 3.239283742813998e-06, "loss": 0.0005, "num_input_tokens_seen": 153500560, "step": 71175 }, { "epoch": 13.062947329785281, "grad_norm": 0.0005988843622617424, "learning_rate": 3.2385343002965156e-06, "loss": 0.0, "num_input_tokens_seen": 153511824, "step": 71180 }, { "epoch": 13.063864929344835, "grad_norm": 0.00037830433575436473, "learning_rate": 3.2377849029579e-06, "loss": 0.0, "num_input_tokens_seen": 153522960, "step": 71185 }, { "epoch": 13.064782528904386, "grad_norm": 1.3353017568588257, "learning_rate": 3.23703555081737e-06, "loss": 0.0001, "num_input_tokens_seen": 153534800, "step": 71190 }, { "epoch": 13.065700128463938, "grad_norm": 0.0032332362607121468, "learning_rate": 3.2362862438941458e-06, "loss": 0.0, "num_input_tokens_seen": 153543888, "step": 71195 }, { "epoch": 13.066617728023491, "grad_norm": 0.0003263189282733947, "learning_rate": 3.2355369822074467e-06, "loss": 0.0, "num_input_tokens_seen": 153554768, "step": 71200 }, { "epoch": 13.067535327583043, "grad_norm": 0.00556076318025589, "learning_rate": 3.234787765776487e-06, "loss": 0.0, "num_input_tokens_seen": 153565264, "step": 71205 }, { "epoch": 13.068452927142594, "grad_norm": 0.0017699648160487413, "learning_rate": 3.2340385946204867e-06, "loss": 0.0, "num_input_tokens_seen": 153575856, "step": 71210 }, { "epoch": 13.069370526702148, "grad_norm": 0.054471906274557114, "learning_rate": 3.2332894687586602e-06, "loss": 0.0, "num_input_tokens_seen": 153587120, "step": 71215 }, { "epoch": 13.0702881262617, "grad_norm": 0.0025337031111121178, "learning_rate": 3.2325403882102204e-06, "loss": 0.0107, "num_input_tokens_seen": 153596816, "step": 71220 }, { "epoch": 13.071205725821251, "grad_norm": 0.00042385587585158646, "learning_rate": 3.2317913529943782e-06, "loss": 0.0, "num_input_tokens_seen": 153607952, "step": 71225 }, { "epoch": 13.072123325380804, "grad_norm": 0.025783119723200798, "learning_rate": 3.23104236313035e-06, "loss": 0.1719, "num_input_tokens_seen": 153617936, "step": 71230 }, { "epoch": 13.073040924940356, "grad_norm": 0.0016759430291131139, "learning_rate": 3.2302934186373426e-06, "loss": 0.0, "num_input_tokens_seen": 153628400, "step": 71235 }, { "epoch": 13.073958524499908, "grad_norm": 0.0005824611289426684, "learning_rate": 3.229544519534565e-06, "loss": 0.0, "num_input_tokens_seen": 153639920, "step": 71240 }, { "epoch": 13.074876124059461, "grad_norm": 0.00026445332332514226, "learning_rate": 3.228795665841228e-06, "loss": 0.0, "num_input_tokens_seen": 153650768, "step": 71245 }, { "epoch": 13.075793723619013, "grad_norm": 0.0002573594683781266, "learning_rate": 3.228046857576537e-06, "loss": 0.0, "num_input_tokens_seen": 153660848, "step": 71250 }, { "epoch": 13.076711323178564, "grad_norm": 0.0008505262667313218, "learning_rate": 3.2272980947596967e-06, "loss": 0.0, "num_input_tokens_seen": 153671440, "step": 71255 }, { "epoch": 13.077628922738118, "grad_norm": 0.0018396947998553514, "learning_rate": 3.2265493774099138e-06, "loss": 0.0001, "num_input_tokens_seen": 153682928, "step": 71260 }, { "epoch": 13.07854652229767, "grad_norm": 0.0011843959800899029, "learning_rate": 3.2258007055463913e-06, "loss": 0.0, "num_input_tokens_seen": 153693456, "step": 71265 }, { "epoch": 13.079464121857221, "grad_norm": 0.001083493698388338, "learning_rate": 3.225052079188331e-06, "loss": 0.0, "num_input_tokens_seen": 153704304, "step": 71270 }, { "epoch": 13.080381721416774, "grad_norm": 155.90545654296875, "learning_rate": 3.2243034983549326e-06, "loss": 0.0853, "num_input_tokens_seen": 153715056, "step": 71275 }, { "epoch": 13.081299320976326, "grad_norm": 0.002550252480432391, "learning_rate": 3.2235549630653974e-06, "loss": 0.0001, "num_input_tokens_seen": 153724784, "step": 71280 }, { "epoch": 13.082216920535878, "grad_norm": 0.1409725844860077, "learning_rate": 3.2228064733389254e-06, "loss": 0.0001, "num_input_tokens_seen": 153734896, "step": 71285 }, { "epoch": 13.083134520095431, "grad_norm": 0.00037494447315111756, "learning_rate": 3.222058029194712e-06, "loss": 0.0, "num_input_tokens_seen": 153746672, "step": 71290 }, { "epoch": 13.084052119654983, "grad_norm": 0.0002493351639714092, "learning_rate": 3.2213096306519553e-06, "loss": 0.0, "num_input_tokens_seen": 153757456, "step": 71295 }, { "epoch": 13.084969719214534, "grad_norm": 0.0002442034019622952, "learning_rate": 3.22056127772985e-06, "loss": 0.0, "num_input_tokens_seen": 153768336, "step": 71300 }, { "epoch": 13.085887318774088, "grad_norm": 0.0025495262816548347, "learning_rate": 3.219812970447589e-06, "loss": 0.0005, "num_input_tokens_seen": 153778000, "step": 71305 }, { "epoch": 13.08680491833364, "grad_norm": 0.0004095171170774847, "learning_rate": 3.2190647088243665e-06, "loss": 0.0002, "num_input_tokens_seen": 153789200, "step": 71310 }, { "epoch": 13.08772251789319, "grad_norm": 0.00036134241963736713, "learning_rate": 3.2183164928793746e-06, "loss": 0.0, "num_input_tokens_seen": 153800528, "step": 71315 }, { "epoch": 13.088640117452744, "grad_norm": 0.009366744197905064, "learning_rate": 3.217568322631803e-06, "loss": 0.0, "num_input_tokens_seen": 153810384, "step": 71320 }, { "epoch": 13.089557717012296, "grad_norm": 0.003026447957381606, "learning_rate": 3.2168201981008406e-06, "loss": 0.1694, "num_input_tokens_seen": 153821712, "step": 71325 }, { "epoch": 13.090475316571847, "grad_norm": 0.0003128335520159453, "learning_rate": 3.2160721193056774e-06, "loss": 0.0, "num_input_tokens_seen": 153832048, "step": 71330 }, { "epoch": 13.0913929161314, "grad_norm": 0.0027845054864883423, "learning_rate": 3.2153240862655e-06, "loss": 0.0001, "num_input_tokens_seen": 153842608, "step": 71335 }, { "epoch": 13.092310515690952, "grad_norm": 0.005609443411231041, "learning_rate": 3.2145760989994917e-06, "loss": 0.0, "num_input_tokens_seen": 153852240, "step": 71340 }, { "epoch": 13.093228115250504, "grad_norm": 0.0015229049604386091, "learning_rate": 3.2138281575268414e-06, "loss": 0.0, "num_input_tokens_seen": 153863760, "step": 71345 }, { "epoch": 13.094145714810058, "grad_norm": 0.001427484443411231, "learning_rate": 3.2130802618667308e-06, "loss": 0.0, "num_input_tokens_seen": 153874704, "step": 71350 }, { "epoch": 13.09506331436961, "grad_norm": 0.0002449706371407956, "learning_rate": 3.2123324120383414e-06, "loss": 0.0001, "num_input_tokens_seen": 153885904, "step": 71355 }, { "epoch": 13.09598091392916, "grad_norm": 0.000521383248269558, "learning_rate": 3.2115846080608533e-06, "loss": 0.0, "num_input_tokens_seen": 153896912, "step": 71360 }, { "epoch": 13.096898513488714, "grad_norm": 0.01899254322052002, "learning_rate": 3.2108368499534503e-06, "loss": 0.0001, "num_input_tokens_seen": 153906864, "step": 71365 }, { "epoch": 13.097816113048266, "grad_norm": 0.000274989812169224, "learning_rate": 3.2100891377353083e-06, "loss": 0.0001, "num_input_tokens_seen": 153918128, "step": 71370 }, { "epoch": 13.098733712607817, "grad_norm": 0.01643146201968193, "learning_rate": 3.209341471425605e-06, "loss": 0.0, "num_input_tokens_seen": 153927824, "step": 71375 }, { "epoch": 13.09965131216737, "grad_norm": 0.0005027088918723166, "learning_rate": 3.2085938510435188e-06, "loss": 0.0001, "num_input_tokens_seen": 153939376, "step": 71380 }, { "epoch": 13.100568911726922, "grad_norm": 0.008543255738914013, "learning_rate": 3.207846276608224e-06, "loss": 0.0, "num_input_tokens_seen": 153949712, "step": 71385 }, { "epoch": 13.101486511286474, "grad_norm": 0.00021826270676683635, "learning_rate": 3.2070987481388942e-06, "loss": 0.0012, "num_input_tokens_seen": 153959472, "step": 71390 }, { "epoch": 13.102404110846027, "grad_norm": 0.0005797802587039769, "learning_rate": 3.2063512656547036e-06, "loss": 0.0, "num_input_tokens_seen": 153970928, "step": 71395 }, { "epoch": 13.103321710405579, "grad_norm": 0.0004142640973441303, "learning_rate": 3.205603829174823e-06, "loss": 0.0, "num_input_tokens_seen": 153981168, "step": 71400 }, { "epoch": 13.10423930996513, "grad_norm": 0.00043086925870738924, "learning_rate": 3.204856438718422e-06, "loss": 0.0002, "num_input_tokens_seen": 153992208, "step": 71405 }, { "epoch": 13.105156909524684, "grad_norm": 0.0002294795704074204, "learning_rate": 3.2041090943046715e-06, "loss": 0.0, "num_input_tokens_seen": 154002288, "step": 71410 }, { "epoch": 13.106074509084236, "grad_norm": 260.5286560058594, "learning_rate": 3.20336179595274e-06, "loss": 0.2844, "num_input_tokens_seen": 154012336, "step": 71415 }, { "epoch": 13.106992108643787, "grad_norm": 0.002269033109769225, "learning_rate": 3.202614543681794e-06, "loss": 0.0006, "num_input_tokens_seen": 154023312, "step": 71420 }, { "epoch": 13.10790970820334, "grad_norm": 0.0001579128875164315, "learning_rate": 3.201867337510997e-06, "loss": 0.0, "num_input_tokens_seen": 154034384, "step": 71425 }, { "epoch": 13.108827307762892, "grad_norm": 0.0024827171582728624, "learning_rate": 3.2011201774595187e-06, "loss": 0.0, "num_input_tokens_seen": 154045360, "step": 71430 }, { "epoch": 13.109744907322444, "grad_norm": 0.00026053490000776947, "learning_rate": 3.2003730635465193e-06, "loss": 0.0002, "num_input_tokens_seen": 154054096, "step": 71435 }, { "epoch": 13.110662506881997, "grad_norm": 0.00028265820583328605, "learning_rate": 3.199625995791161e-06, "loss": 0.0, "num_input_tokens_seen": 154065552, "step": 71440 }, { "epoch": 13.111580106441549, "grad_norm": 0.0002099773846566677, "learning_rate": 3.1988789742126046e-06, "loss": 0.0, "num_input_tokens_seen": 154077456, "step": 71445 }, { "epoch": 13.1124977060011, "grad_norm": 0.0007751444354653358, "learning_rate": 3.198131998830013e-06, "loss": 0.0, "num_input_tokens_seen": 154088336, "step": 71450 }, { "epoch": 13.113415305560654, "grad_norm": 0.0004491574363783002, "learning_rate": 3.1973850696625424e-06, "loss": 0.0, "num_input_tokens_seen": 154098800, "step": 71455 }, { "epoch": 13.114332905120206, "grad_norm": 0.0003013068635482341, "learning_rate": 3.1966381867293494e-06, "loss": 0.0, "num_input_tokens_seen": 154109168, "step": 71460 }, { "epoch": 13.115250504679757, "grad_norm": 0.0029892560560256243, "learning_rate": 3.1958913500495937e-06, "loss": 0.0, "num_input_tokens_seen": 154120272, "step": 71465 }, { "epoch": 13.11616810423931, "grad_norm": 0.00022514774173032492, "learning_rate": 3.1951445596424293e-06, "loss": 0.0, "num_input_tokens_seen": 154131760, "step": 71470 }, { "epoch": 13.117085703798862, "grad_norm": 0.000167927733855322, "learning_rate": 3.1943978155270066e-06, "loss": 0.0, "num_input_tokens_seen": 154143376, "step": 71475 }, { "epoch": 13.118003303358414, "grad_norm": 0.00043643650133162737, "learning_rate": 3.193651117722484e-06, "loss": 0.0, "num_input_tokens_seen": 154154576, "step": 71480 }, { "epoch": 13.118920902917967, "grad_norm": 0.0004624301800504327, "learning_rate": 3.1929044662480115e-06, "loss": 0.004, "num_input_tokens_seen": 154164432, "step": 71485 }, { "epoch": 13.119838502477519, "grad_norm": 0.00030588876688852906, "learning_rate": 3.1921578611227377e-06, "loss": 0.0003, "num_input_tokens_seen": 154176304, "step": 71490 }, { "epoch": 13.12075610203707, "grad_norm": 0.00031640371889807284, "learning_rate": 3.191411302365812e-06, "loss": 0.0, "num_input_tokens_seen": 154186384, "step": 71495 }, { "epoch": 13.121673701596624, "grad_norm": 0.013730793260037899, "learning_rate": 3.1906647899963834e-06, "loss": 0.0, "num_input_tokens_seen": 154197392, "step": 71500 }, { "epoch": 13.122591301156175, "grad_norm": 0.027647998183965683, "learning_rate": 3.1899183240335994e-06, "loss": 0.0131, "num_input_tokens_seen": 154209776, "step": 71505 }, { "epoch": 13.123508900715727, "grad_norm": 0.0005404418334364891, "learning_rate": 3.1891719044966044e-06, "loss": 0.0, "num_input_tokens_seen": 154219696, "step": 71510 }, { "epoch": 13.12442650027528, "grad_norm": 0.00018202062346972525, "learning_rate": 3.188425531404545e-06, "loss": 0.0, "num_input_tokens_seen": 154229360, "step": 71515 }, { "epoch": 13.125344099834832, "grad_norm": 0.0014904693234711885, "learning_rate": 3.1876792047765627e-06, "loss": 0.0, "num_input_tokens_seen": 154240336, "step": 71520 }, { "epoch": 13.126261699394384, "grad_norm": 0.00034619553480297327, "learning_rate": 3.186932924631797e-06, "loss": 0.0, "num_input_tokens_seen": 154250256, "step": 71525 }, { "epoch": 13.127179298953937, "grad_norm": 0.0008541785646229982, "learning_rate": 3.1861866909893953e-06, "loss": 0.0, "num_input_tokens_seen": 154262288, "step": 71530 }, { "epoch": 13.128096898513489, "grad_norm": 0.000518042768817395, "learning_rate": 3.185440503868493e-06, "loss": 0.0, "num_input_tokens_seen": 154273904, "step": 71535 }, { "epoch": 13.12901449807304, "grad_norm": 0.008512627333402634, "learning_rate": 3.1846943632882294e-06, "loss": 0.0, "num_input_tokens_seen": 154286384, "step": 71540 }, { "epoch": 13.129932097632594, "grad_norm": 0.0033419274259358644, "learning_rate": 3.1839482692677405e-06, "loss": 0.0001, "num_input_tokens_seen": 154298960, "step": 71545 }, { "epoch": 13.130849697192145, "grad_norm": 0.00035058686626143754, "learning_rate": 3.1832022218261648e-06, "loss": 0.0, "num_input_tokens_seen": 154309168, "step": 71550 }, { "epoch": 13.131767296751697, "grad_norm": 0.000492084538564086, "learning_rate": 3.182456220982637e-06, "loss": 0.0, "num_input_tokens_seen": 154320208, "step": 71555 }, { "epoch": 13.13268489631125, "grad_norm": 0.013226146809756756, "learning_rate": 3.1817102667562883e-06, "loss": 0.0, "num_input_tokens_seen": 154330544, "step": 71560 }, { "epoch": 13.133602495870802, "grad_norm": 0.0004448300751391798, "learning_rate": 3.1809643591662553e-06, "loss": 0.0, "num_input_tokens_seen": 154341648, "step": 71565 }, { "epoch": 13.134520095430354, "grad_norm": 0.00020687669166363776, "learning_rate": 3.180218498231667e-06, "loss": 0.0, "num_input_tokens_seen": 154352560, "step": 71570 }, { "epoch": 13.135437694989907, "grad_norm": 0.0002180405572289601, "learning_rate": 3.179472683971654e-06, "loss": 0.0, "num_input_tokens_seen": 154362320, "step": 71575 }, { "epoch": 13.136355294549459, "grad_norm": 0.0005083325086161494, "learning_rate": 3.1787269164053425e-06, "loss": 0.0, "num_input_tokens_seen": 154372656, "step": 71580 }, { "epoch": 13.13727289410901, "grad_norm": 0.0003007485647685826, "learning_rate": 3.1779811955518652e-06, "loss": 0.0, "num_input_tokens_seen": 154383920, "step": 71585 }, { "epoch": 13.138190493668564, "grad_norm": 0.0007071916479617357, "learning_rate": 3.1772355214303464e-06, "loss": 0.0001, "num_input_tokens_seen": 154393584, "step": 71590 }, { "epoch": 13.139108093228115, "grad_norm": 0.00027405767468735576, "learning_rate": 3.176489894059911e-06, "loss": 0.0001, "num_input_tokens_seen": 154404176, "step": 71595 }, { "epoch": 13.140025692787667, "grad_norm": 0.00048530142521485686, "learning_rate": 3.1757443134596827e-06, "loss": 0.0, "num_input_tokens_seen": 154415664, "step": 71600 }, { "epoch": 13.14094329234722, "grad_norm": 0.0001933918392751366, "learning_rate": 3.174998779648787e-06, "loss": 0.0, "num_input_tokens_seen": 154427504, "step": 71605 }, { "epoch": 13.141860891906772, "grad_norm": 0.0016449473332613707, "learning_rate": 3.1742532926463427e-06, "loss": 0.0, "num_input_tokens_seen": 154437584, "step": 71610 }, { "epoch": 13.142778491466323, "grad_norm": 0.00028968427795916796, "learning_rate": 3.173507852471473e-06, "loss": 0.0, "num_input_tokens_seen": 154447408, "step": 71615 }, { "epoch": 13.143696091025877, "grad_norm": 0.0002547217591200024, "learning_rate": 3.1727624591432958e-06, "loss": 0.0, "num_input_tokens_seen": 154457488, "step": 71620 }, { "epoch": 13.144613690585429, "grad_norm": 0.007401247508823872, "learning_rate": 3.172017112680929e-06, "loss": 0.0004, "num_input_tokens_seen": 154468048, "step": 71625 }, { "epoch": 13.14553129014498, "grad_norm": 0.0005015247734263539, "learning_rate": 3.1712718131034902e-06, "loss": 0.0001, "num_input_tokens_seen": 154479312, "step": 71630 }, { "epoch": 13.146448889704534, "grad_norm": 0.0002508772595319897, "learning_rate": 3.170526560430096e-06, "loss": 0.0, "num_input_tokens_seen": 154489360, "step": 71635 }, { "epoch": 13.147366489264085, "grad_norm": 0.0014855522895231843, "learning_rate": 3.16978135467986e-06, "loss": 0.0, "num_input_tokens_seen": 154500080, "step": 71640 }, { "epoch": 13.148284088823637, "grad_norm": 0.0007042267243377864, "learning_rate": 3.1690361958718935e-06, "loss": 0.0, "num_input_tokens_seen": 154511920, "step": 71645 }, { "epoch": 13.14920168838319, "grad_norm": 0.00012652447912842035, "learning_rate": 3.1682910840253132e-06, "loss": 0.0, "num_input_tokens_seen": 154521648, "step": 71650 }, { "epoch": 13.150119287942742, "grad_norm": 0.00029858361813239753, "learning_rate": 3.1675460191592277e-06, "loss": 0.0, "num_input_tokens_seen": 154532368, "step": 71655 }, { "epoch": 13.151036887502293, "grad_norm": 0.003018938470631838, "learning_rate": 3.166801001292744e-06, "loss": 0.0, "num_input_tokens_seen": 154543504, "step": 71660 }, { "epoch": 13.151954487061847, "grad_norm": 0.0002726232632994652, "learning_rate": 3.166056030444976e-06, "loss": 0.0, "num_input_tokens_seen": 154554352, "step": 71665 }, { "epoch": 13.152872086621398, "grad_norm": 0.0010005718795582652, "learning_rate": 3.165311106635029e-06, "loss": 0.0, "num_input_tokens_seen": 154565168, "step": 71670 }, { "epoch": 13.15378968618095, "grad_norm": 0.0023409409914165735, "learning_rate": 3.1645662298820077e-06, "loss": 0.0, "num_input_tokens_seen": 154576208, "step": 71675 }, { "epoch": 13.154707285740503, "grad_norm": 0.00023306820366997272, "learning_rate": 3.1638214002050165e-06, "loss": 0.0, "num_input_tokens_seen": 154587152, "step": 71680 }, { "epoch": 13.155624885300055, "grad_norm": 0.005076393950730562, "learning_rate": 3.1630766176231626e-06, "loss": 0.0, "num_input_tokens_seen": 154598064, "step": 71685 }, { "epoch": 13.156542484859607, "grad_norm": 0.09523360431194305, "learning_rate": 3.162331882155546e-06, "loss": 0.0, "num_input_tokens_seen": 154608368, "step": 71690 }, { "epoch": 13.15746008441916, "grad_norm": 0.0002369001740589738, "learning_rate": 3.161587193821267e-06, "loss": 0.0, "num_input_tokens_seen": 154619472, "step": 71695 }, { "epoch": 13.158377683978712, "grad_norm": 0.0003290343447588384, "learning_rate": 3.1608425526394286e-06, "loss": 0.0, "num_input_tokens_seen": 154630128, "step": 71700 }, { "epoch": 13.159295283538263, "grad_norm": 0.0006876154220663011, "learning_rate": 3.160097958629129e-06, "loss": 0.0, "num_input_tokens_seen": 154640688, "step": 71705 }, { "epoch": 13.160212883097817, "grad_norm": 0.001903394004330039, "learning_rate": 3.159353411809464e-06, "loss": 0.0, "num_input_tokens_seen": 154652080, "step": 71710 }, { "epoch": 13.161130482657368, "grad_norm": 0.0006735174683853984, "learning_rate": 3.1586089121995316e-06, "loss": 0.0, "num_input_tokens_seen": 154663760, "step": 71715 }, { "epoch": 13.16204808221692, "grad_norm": 0.00016225125000346452, "learning_rate": 3.157864459818426e-06, "loss": 0.2281, "num_input_tokens_seen": 154673744, "step": 71720 }, { "epoch": 13.162965681776473, "grad_norm": 0.0009307858417741954, "learning_rate": 3.1571200546852432e-06, "loss": 0.0, "num_input_tokens_seen": 154684048, "step": 71725 }, { "epoch": 13.163883281336025, "grad_norm": 0.00019762884767260402, "learning_rate": 3.156375696819074e-06, "loss": 0.0, "num_input_tokens_seen": 154695696, "step": 71730 }, { "epoch": 13.164800880895577, "grad_norm": 0.00026868702843785286, "learning_rate": 3.1556313862390116e-06, "loss": 0.0, "num_input_tokens_seen": 154706064, "step": 71735 }, { "epoch": 13.16571848045513, "grad_norm": 0.0005656382418237627, "learning_rate": 3.154887122964145e-06, "loss": 0.0, "num_input_tokens_seen": 154716336, "step": 71740 }, { "epoch": 13.166636080014682, "grad_norm": 0.0003241809899918735, "learning_rate": 3.154142907013563e-06, "loss": 0.0, "num_input_tokens_seen": 154726928, "step": 71745 }, { "epoch": 13.167553679574233, "grad_norm": 0.014554634690284729, "learning_rate": 3.1533987384063565e-06, "loss": 0.0, "num_input_tokens_seen": 154738672, "step": 71750 }, { "epoch": 13.168471279133787, "grad_norm": 0.00019020758918486536, "learning_rate": 3.15265461716161e-06, "loss": 0.0, "num_input_tokens_seen": 154749808, "step": 71755 }, { "epoch": 13.169388878693338, "grad_norm": 0.0002740818599704653, "learning_rate": 3.1519105432984098e-06, "loss": 0.0, "num_input_tokens_seen": 154759760, "step": 71760 }, { "epoch": 13.17030647825289, "grad_norm": 0.0002039500541286543, "learning_rate": 3.1511665168358374e-06, "loss": 0.0, "num_input_tokens_seen": 154770800, "step": 71765 }, { "epoch": 13.171224077812443, "grad_norm": 0.002503913827240467, "learning_rate": 3.150422537792981e-06, "loss": 0.0, "num_input_tokens_seen": 154780400, "step": 71770 }, { "epoch": 13.172141677371995, "grad_norm": 0.00014583767915610224, "learning_rate": 3.149678606188919e-06, "loss": 0.0, "num_input_tokens_seen": 154790128, "step": 71775 }, { "epoch": 13.173059276931546, "grad_norm": 0.00021442738943733275, "learning_rate": 3.148934722042731e-06, "loss": 0.0, "num_input_tokens_seen": 154800592, "step": 71780 }, { "epoch": 13.1739768764911, "grad_norm": 0.0005743603105656803, "learning_rate": 3.1481908853735018e-06, "loss": 0.0, "num_input_tokens_seen": 154810768, "step": 71785 }, { "epoch": 13.174894476050651, "grad_norm": 0.000754152424633503, "learning_rate": 3.147447096200306e-06, "loss": 0.0, "num_input_tokens_seen": 154820976, "step": 71790 }, { "epoch": 13.175812075610203, "grad_norm": 0.0003389399207662791, "learning_rate": 3.1467033545422184e-06, "loss": 0.0, "num_input_tokens_seen": 154830672, "step": 71795 }, { "epoch": 13.176729675169756, "grad_norm": 30.22688865661621, "learning_rate": 3.1459596604183197e-06, "loss": 0.0588, "num_input_tokens_seen": 154840560, "step": 71800 }, { "epoch": 13.177647274729308, "grad_norm": 0.000297157239401713, "learning_rate": 3.1452160138476817e-06, "loss": 0.0, "num_input_tokens_seen": 154850608, "step": 71805 }, { "epoch": 13.17856487428886, "grad_norm": 0.0022828709334135056, "learning_rate": 3.1444724148493786e-06, "loss": 0.0, "num_input_tokens_seen": 154862384, "step": 71810 }, { "epoch": 13.179482473848413, "grad_norm": 0.00031396059785038233, "learning_rate": 3.1437288634424814e-06, "loss": 0.0, "num_input_tokens_seen": 154872496, "step": 71815 }, { "epoch": 13.180400073407965, "grad_norm": 0.0004599719541147351, "learning_rate": 3.142985359646062e-06, "loss": 0.0, "num_input_tokens_seen": 154883760, "step": 71820 }, { "epoch": 13.181317672967516, "grad_norm": 0.003514477051794529, "learning_rate": 3.1422419034791905e-06, "loss": 0.0, "num_input_tokens_seen": 154894128, "step": 71825 }, { "epoch": 13.18223527252707, "grad_norm": 0.0001648108009248972, "learning_rate": 3.1414984949609345e-06, "loss": 0.0, "num_input_tokens_seen": 154905232, "step": 71830 }, { "epoch": 13.183152872086621, "grad_norm": 0.0003019506111741066, "learning_rate": 3.1407551341103626e-06, "loss": 0.0, "num_input_tokens_seen": 154915632, "step": 71835 }, { "epoch": 13.184070471646173, "grad_norm": 0.001286466489546001, "learning_rate": 3.1400118209465395e-06, "loss": 0.0, "num_input_tokens_seen": 154925904, "step": 71840 }, { "epoch": 13.184988071205726, "grad_norm": 0.0005900863907299936, "learning_rate": 3.139268555488529e-06, "loss": 0.0, "num_input_tokens_seen": 154936336, "step": 71845 }, { "epoch": 13.185905670765278, "grad_norm": 2.9355170726776123, "learning_rate": 3.138525337755398e-06, "loss": 0.0009, "num_input_tokens_seen": 154947056, "step": 71850 }, { "epoch": 13.18682327032483, "grad_norm": 0.0006061337189748883, "learning_rate": 3.137782167766207e-06, "loss": 0.0144, "num_input_tokens_seen": 154957072, "step": 71855 }, { "epoch": 13.187740869884383, "grad_norm": 0.0007761304150335491, "learning_rate": 3.137039045540017e-06, "loss": 0.0, "num_input_tokens_seen": 154966384, "step": 71860 }, { "epoch": 13.188658469443935, "grad_norm": 0.10927262902259827, "learning_rate": 3.136295971095886e-06, "loss": 0.0001, "num_input_tokens_seen": 154977968, "step": 71865 }, { "epoch": 13.189576069003486, "grad_norm": 0.00743782427161932, "learning_rate": 3.1355529444528777e-06, "loss": 0.0, "num_input_tokens_seen": 154989104, "step": 71870 }, { "epoch": 13.19049366856304, "grad_norm": 0.00027192875859327614, "learning_rate": 3.134809965630047e-06, "loss": 0.0001, "num_input_tokens_seen": 154999600, "step": 71875 }, { "epoch": 13.191411268122591, "grad_norm": 0.0008275046129710972, "learning_rate": 3.1340670346464465e-06, "loss": 0.0001, "num_input_tokens_seen": 155011376, "step": 71880 }, { "epoch": 13.192328867682143, "grad_norm": 0.00025454373098909855, "learning_rate": 3.1333241515211376e-06, "loss": 0.0, "num_input_tokens_seen": 155022256, "step": 71885 }, { "epoch": 13.193246467241696, "grad_norm": 0.00031788868363946676, "learning_rate": 3.1325813162731705e-06, "loss": 0.0, "num_input_tokens_seen": 155032848, "step": 71890 }, { "epoch": 13.194164066801248, "grad_norm": 0.00047644099686294794, "learning_rate": 3.131838528921599e-06, "loss": 0.0, "num_input_tokens_seen": 155043280, "step": 71895 }, { "epoch": 13.1950816663608, "grad_norm": 0.0002578867133706808, "learning_rate": 3.1310957894854717e-06, "loss": 0.0, "num_input_tokens_seen": 155054352, "step": 71900 }, { "epoch": 13.195999265920353, "grad_norm": 0.0005380377406254411, "learning_rate": 3.1303530979838425e-06, "loss": 0.0011, "num_input_tokens_seen": 155064368, "step": 71905 }, { "epoch": 13.196916865479905, "grad_norm": 0.0027148050721734762, "learning_rate": 3.1296104544357587e-06, "loss": 0.0001, "num_input_tokens_seen": 155075280, "step": 71910 }, { "epoch": 13.197834465039456, "grad_norm": 0.00032379422918893397, "learning_rate": 3.128867858860266e-06, "loss": 0.0, "num_input_tokens_seen": 155086096, "step": 71915 }, { "epoch": 13.19875206459901, "grad_norm": 0.0004104592662770301, "learning_rate": 3.1281253112764154e-06, "loss": 0.2781, "num_input_tokens_seen": 155097136, "step": 71920 }, { "epoch": 13.199669664158561, "grad_norm": 0.0006722742109559476, "learning_rate": 3.127382811703249e-06, "loss": 0.0, "num_input_tokens_seen": 155107088, "step": 71925 }, { "epoch": 13.200587263718113, "grad_norm": 0.003631036728620529, "learning_rate": 3.1266403601598094e-06, "loss": 0.0, "num_input_tokens_seen": 155118128, "step": 71930 }, { "epoch": 13.201504863277666, "grad_norm": 0.0004694582603406161, "learning_rate": 3.1258979566651426e-06, "loss": 0.0, "num_input_tokens_seen": 155127408, "step": 71935 }, { "epoch": 13.202422462837218, "grad_norm": 0.00023288650845643133, "learning_rate": 3.125155601238289e-06, "loss": 0.0, "num_input_tokens_seen": 155138064, "step": 71940 }, { "epoch": 13.20334006239677, "grad_norm": 133.6735076904297, "learning_rate": 3.1244132938982873e-06, "loss": 0.0817, "num_input_tokens_seen": 155148496, "step": 71945 }, { "epoch": 13.204257661956323, "grad_norm": 0.00027824228163808584, "learning_rate": 3.1236710346641776e-06, "loss": 0.0032, "num_input_tokens_seen": 155159344, "step": 71950 }, { "epoch": 13.205175261515874, "grad_norm": 0.02316988632082939, "learning_rate": 3.122928823554999e-06, "loss": 0.0, "num_input_tokens_seen": 155170192, "step": 71955 }, { "epoch": 13.206092861075426, "grad_norm": 0.00017746757657732815, "learning_rate": 3.1221866605897868e-06, "loss": 0.0, "num_input_tokens_seen": 155180688, "step": 71960 }, { "epoch": 13.20701046063498, "grad_norm": 0.0033195987343788147, "learning_rate": 3.121444545787574e-06, "loss": 0.0, "num_input_tokens_seen": 155192048, "step": 71965 }, { "epoch": 13.207928060194531, "grad_norm": 0.010280540212988853, "learning_rate": 3.1207024791674e-06, "loss": 0.0, "num_input_tokens_seen": 155203696, "step": 71970 }, { "epoch": 13.208845659754083, "grad_norm": 0.00025026712683029473, "learning_rate": 3.1199604607482942e-06, "loss": 0.0, "num_input_tokens_seen": 155214160, "step": 71975 }, { "epoch": 13.209763259313636, "grad_norm": 0.00021791443577967584, "learning_rate": 3.1192184905492865e-06, "loss": 0.0, "num_input_tokens_seen": 155226800, "step": 71980 }, { "epoch": 13.210680858873188, "grad_norm": 0.0706014558672905, "learning_rate": 3.1184765685894125e-06, "loss": 0.0001, "num_input_tokens_seen": 155237904, "step": 71985 }, { "epoch": 13.21159845843274, "grad_norm": 0.0001943620154634118, "learning_rate": 3.1177346948876974e-06, "loss": 0.0, "num_input_tokens_seen": 155249616, "step": 71990 }, { "epoch": 13.212516057992293, "grad_norm": 0.007951498031616211, "learning_rate": 3.1169928694631706e-06, "loss": 0.0, "num_input_tokens_seen": 155259728, "step": 71995 }, { "epoch": 13.213433657551844, "grad_norm": 0.008335846476256847, "learning_rate": 3.1162510923348564e-06, "loss": 0.0001, "num_input_tokens_seen": 155270192, "step": 72000 }, { "epoch": 13.214351257111396, "grad_norm": 0.0002497293462511152, "learning_rate": 3.1155093635217836e-06, "loss": 0.0452, "num_input_tokens_seen": 155280592, "step": 72005 }, { "epoch": 13.21526885667095, "grad_norm": 0.018578525632619858, "learning_rate": 3.114767683042976e-06, "loss": 0.0001, "num_input_tokens_seen": 155292144, "step": 72010 }, { "epoch": 13.216186456230501, "grad_norm": 0.0003339347313158214, "learning_rate": 3.1140260509174523e-06, "loss": 0.0001, "num_input_tokens_seen": 155303920, "step": 72015 }, { "epoch": 13.217104055790053, "grad_norm": 0.0002755435125436634, "learning_rate": 3.1132844671642405e-06, "loss": 0.0, "num_input_tokens_seen": 155315856, "step": 72020 }, { "epoch": 13.218021655349606, "grad_norm": 0.017132554203271866, "learning_rate": 3.112542931802357e-06, "loss": 0.0, "num_input_tokens_seen": 155327056, "step": 72025 }, { "epoch": 13.218939254909158, "grad_norm": 0.00015939748845994473, "learning_rate": 3.1118014448508223e-06, "loss": 0.0, "num_input_tokens_seen": 155337904, "step": 72030 }, { "epoch": 13.21985685446871, "grad_norm": 0.0005400498048402369, "learning_rate": 3.111060006328653e-06, "loss": 0.0, "num_input_tokens_seen": 155348304, "step": 72035 }, { "epoch": 13.220774454028263, "grad_norm": 0.0002582167217042297, "learning_rate": 3.110318616254867e-06, "loss": 0.0, "num_input_tokens_seen": 155359696, "step": 72040 }, { "epoch": 13.221692053587814, "grad_norm": 0.012312129139900208, "learning_rate": 3.109577274648481e-06, "loss": 0.0, "num_input_tokens_seen": 155371216, "step": 72045 }, { "epoch": 13.222609653147366, "grad_norm": 0.0006725293933413923, "learning_rate": 3.108835981528507e-06, "loss": 0.0, "num_input_tokens_seen": 155382000, "step": 72050 }, { "epoch": 13.22352725270692, "grad_norm": 0.012599000707268715, "learning_rate": 3.1080947369139603e-06, "loss": 0.0, "num_input_tokens_seen": 155392976, "step": 72055 }, { "epoch": 13.22444485226647, "grad_norm": 0.002991404617205262, "learning_rate": 3.107353540823851e-06, "loss": 0.0, "num_input_tokens_seen": 155404048, "step": 72060 }, { "epoch": 13.225362451826022, "grad_norm": 0.0006654493045061827, "learning_rate": 3.1066123932771873e-06, "loss": 0.0, "num_input_tokens_seen": 155414800, "step": 72065 }, { "epoch": 13.226280051385576, "grad_norm": 0.00025417309370823205, "learning_rate": 3.105871294292985e-06, "loss": 0.0002, "num_input_tokens_seen": 155424112, "step": 72070 }, { "epoch": 13.227197650945127, "grad_norm": 0.00019916942983400077, "learning_rate": 3.1051302438902463e-06, "loss": 0.0, "num_input_tokens_seen": 155434224, "step": 72075 }, { "epoch": 13.228115250504679, "grad_norm": 0.005369399208575487, "learning_rate": 3.1043892420879818e-06, "loss": 0.0173, "num_input_tokens_seen": 155443600, "step": 72080 }, { "epoch": 13.229032850064232, "grad_norm": 0.003537686774507165, "learning_rate": 3.1036482889051924e-06, "loss": 0.0002, "num_input_tokens_seen": 155454736, "step": 72085 }, { "epoch": 13.229950449623784, "grad_norm": 0.0009829222690314054, "learning_rate": 3.1029073843608874e-06, "loss": 0.0, "num_input_tokens_seen": 155466160, "step": 72090 }, { "epoch": 13.230868049183336, "grad_norm": 0.00027913390658795834, "learning_rate": 3.102166528474068e-06, "loss": 0.0, "num_input_tokens_seen": 155476432, "step": 72095 }, { "epoch": 13.231785648742889, "grad_norm": 0.017977211624383926, "learning_rate": 3.101425721263734e-06, "loss": 0.0, "num_input_tokens_seen": 155487664, "step": 72100 }, { "epoch": 13.23270324830244, "grad_norm": 0.00521581107750535, "learning_rate": 3.100684962748889e-06, "loss": 0.0, "num_input_tokens_seen": 155498224, "step": 72105 }, { "epoch": 13.233620847861992, "grad_norm": 0.00024005427258089185, "learning_rate": 3.0999442529485314e-06, "loss": 0.0, "num_input_tokens_seen": 155507920, "step": 72110 }, { "epoch": 13.234538447421546, "grad_norm": 0.00013296808174345642, "learning_rate": 3.099203591881657e-06, "loss": 0.0, "num_input_tokens_seen": 155518512, "step": 72115 }, { "epoch": 13.235456046981097, "grad_norm": 0.0007317315321415663, "learning_rate": 3.0984629795672666e-06, "loss": 0.0, "num_input_tokens_seen": 155529584, "step": 72120 }, { "epoch": 13.236373646540649, "grad_norm": 0.000499760324601084, "learning_rate": 3.0977224160243535e-06, "loss": 0.0, "num_input_tokens_seen": 155538576, "step": 72125 }, { "epoch": 13.237291246100202, "grad_norm": 0.00021252462465781718, "learning_rate": 3.096981901271912e-06, "loss": 0.0, "num_input_tokens_seen": 155549552, "step": 72130 }, { "epoch": 13.238208845659754, "grad_norm": 0.0005724888760596514, "learning_rate": 3.096241435328935e-06, "loss": 0.0, "num_input_tokens_seen": 155560752, "step": 72135 }, { "epoch": 13.239126445219306, "grad_norm": 0.000419376214267686, "learning_rate": 3.095501018214414e-06, "loss": 0.0, "num_input_tokens_seen": 155572176, "step": 72140 }, { "epoch": 13.240044044778859, "grad_norm": 0.0013696146197617054, "learning_rate": 3.0947606499473414e-06, "loss": 0.0, "num_input_tokens_seen": 155583056, "step": 72145 }, { "epoch": 13.24096164433841, "grad_norm": 0.0008284359937533736, "learning_rate": 3.0940203305467036e-06, "loss": 0.0, "num_input_tokens_seen": 155592944, "step": 72150 }, { "epoch": 13.241879243897962, "grad_norm": 0.0002509350888431072, "learning_rate": 3.093280060031492e-06, "loss": 0.0, "num_input_tokens_seen": 155602768, "step": 72155 }, { "epoch": 13.242796843457516, "grad_norm": 0.00017775422020349652, "learning_rate": 3.0925398384206915e-06, "loss": 0.0, "num_input_tokens_seen": 155613744, "step": 72160 }, { "epoch": 13.243714443017067, "grad_norm": 0.005256118252873421, "learning_rate": 3.091799665733286e-06, "loss": 0.0, "num_input_tokens_seen": 155624144, "step": 72165 }, { "epoch": 13.244632042576619, "grad_norm": 0.0002600186562631279, "learning_rate": 3.091059541988264e-06, "loss": 0.0, "num_input_tokens_seen": 155634352, "step": 72170 }, { "epoch": 13.245549642136172, "grad_norm": 0.0001929034333443269, "learning_rate": 3.0903194672046053e-06, "loss": 0.0, "num_input_tokens_seen": 155646352, "step": 72175 }, { "epoch": 13.246467241695724, "grad_norm": 0.0036113981623202562, "learning_rate": 3.089579441401293e-06, "loss": 0.0, "num_input_tokens_seen": 155657104, "step": 72180 }, { "epoch": 13.247384841255275, "grad_norm": 0.00018606981029734015, "learning_rate": 3.088839464597305e-06, "loss": 0.0, "num_input_tokens_seen": 155666640, "step": 72185 }, { "epoch": 13.248302440814829, "grad_norm": 0.00020277661678846925, "learning_rate": 3.088099536811625e-06, "loss": 0.0, "num_input_tokens_seen": 155677968, "step": 72190 }, { "epoch": 13.24922004037438, "grad_norm": 0.0002974118397105485, "learning_rate": 3.0873596580632287e-06, "loss": 0.0, "num_input_tokens_seen": 155688720, "step": 72195 }, { "epoch": 13.250137639933932, "grad_norm": 12.95589542388916, "learning_rate": 3.0866198283710904e-06, "loss": 0.0028, "num_input_tokens_seen": 155699120, "step": 72200 }, { "epoch": 13.251055239493486, "grad_norm": 0.002898627892136574, "learning_rate": 3.0858800477541906e-06, "loss": 0.0, "num_input_tokens_seen": 155710256, "step": 72205 }, { "epoch": 13.251972839053037, "grad_norm": 60.898780822753906, "learning_rate": 3.085140316231501e-06, "loss": 0.1252, "num_input_tokens_seen": 155720528, "step": 72210 }, { "epoch": 13.252890438612589, "grad_norm": 0.001237034099176526, "learning_rate": 3.0844006338219935e-06, "loss": 0.0, "num_input_tokens_seen": 155731088, "step": 72215 }, { "epoch": 13.253808038172142, "grad_norm": 0.00044494366738945246, "learning_rate": 3.08366100054464e-06, "loss": 0.0, "num_input_tokens_seen": 155741552, "step": 72220 }, { "epoch": 13.254725637731694, "grad_norm": 0.0001498489873483777, "learning_rate": 3.082921416418413e-06, "loss": 0.0, "num_input_tokens_seen": 155752720, "step": 72225 }, { "epoch": 13.255643237291245, "grad_norm": 0.0026265522465109825, "learning_rate": 3.0821818814622806e-06, "loss": 0.0, "num_input_tokens_seen": 155763408, "step": 72230 }, { "epoch": 13.256560836850799, "grad_norm": 0.00040097880992107093, "learning_rate": 3.081442395695209e-06, "loss": 0.0, "num_input_tokens_seen": 155774800, "step": 72235 }, { "epoch": 13.25747843641035, "grad_norm": 0.0002708738611545414, "learning_rate": 3.080702959136168e-06, "loss": 0.0, "num_input_tokens_seen": 155785808, "step": 72240 }, { "epoch": 13.258396035969902, "grad_norm": 0.020657366141676903, "learning_rate": 3.079963571804122e-06, "loss": 0.0, "num_input_tokens_seen": 155796848, "step": 72245 }, { "epoch": 13.259313635529455, "grad_norm": 0.0002454736677464098, "learning_rate": 3.0792242337180334e-06, "loss": 0.0, "num_input_tokens_seen": 155807216, "step": 72250 }, { "epoch": 13.260231235089007, "grad_norm": 0.00031270828912965953, "learning_rate": 3.0784849448968667e-06, "loss": 0.0, "num_input_tokens_seen": 155818480, "step": 72255 }, { "epoch": 13.261148834648559, "grad_norm": 0.002536717103794217, "learning_rate": 3.0777457053595827e-06, "loss": 0.0, "num_input_tokens_seen": 155828336, "step": 72260 }, { "epoch": 13.262066434208112, "grad_norm": 0.00018078138236887753, "learning_rate": 3.0770065151251427e-06, "loss": 0.0, "num_input_tokens_seen": 155838320, "step": 72265 }, { "epoch": 13.262984033767664, "grad_norm": 0.00040345502202399075, "learning_rate": 3.076267374212505e-06, "loss": 0.0, "num_input_tokens_seen": 155849136, "step": 72270 }, { "epoch": 13.263901633327215, "grad_norm": 0.0009202650981023908, "learning_rate": 3.0755282826406275e-06, "loss": 0.0, "num_input_tokens_seen": 155859888, "step": 72275 }, { "epoch": 13.264819232886769, "grad_norm": 0.004542147275060415, "learning_rate": 3.0747892404284675e-06, "loss": 0.0, "num_input_tokens_seen": 155870736, "step": 72280 }, { "epoch": 13.26573683244632, "grad_norm": 0.0015080509474501014, "learning_rate": 3.0740502475949775e-06, "loss": 0.0, "num_input_tokens_seen": 155882224, "step": 72285 }, { "epoch": 13.266654432005872, "grad_norm": 0.00015815006918273866, "learning_rate": 3.073311304159116e-06, "loss": 0.0, "num_input_tokens_seen": 155893392, "step": 72290 }, { "epoch": 13.267572031565425, "grad_norm": 0.0006667032721452415, "learning_rate": 3.0725724101398334e-06, "loss": 0.0, "num_input_tokens_seen": 155902224, "step": 72295 }, { "epoch": 13.268489631124977, "grad_norm": 0.0011166727636009455, "learning_rate": 3.0718335655560793e-06, "loss": 0.0, "num_input_tokens_seen": 155912368, "step": 72300 }, { "epoch": 13.269407230684529, "grad_norm": 0.0008692815899848938, "learning_rate": 3.071094770426808e-06, "loss": 0.0, "num_input_tokens_seen": 155923216, "step": 72305 }, { "epoch": 13.270324830244082, "grad_norm": 0.000452280743047595, "learning_rate": 3.0703560247709656e-06, "loss": 0.0, "num_input_tokens_seen": 155935568, "step": 72310 }, { "epoch": 13.271242429803634, "grad_norm": 0.04261482506990433, "learning_rate": 3.069617328607501e-06, "loss": 0.0018, "num_input_tokens_seen": 155945808, "step": 72315 }, { "epoch": 13.272160029363185, "grad_norm": 0.0015494957333430648, "learning_rate": 3.068878681955358e-06, "loss": 0.0001, "num_input_tokens_seen": 155957360, "step": 72320 }, { "epoch": 13.273077628922739, "grad_norm": 0.0056314412504434586, "learning_rate": 3.068140084833486e-06, "loss": 0.0, "num_input_tokens_seen": 155967088, "step": 72325 }, { "epoch": 13.27399522848229, "grad_norm": 0.0001838279131334275, "learning_rate": 3.067401537260826e-06, "loss": 0.0, "num_input_tokens_seen": 155977168, "step": 72330 }, { "epoch": 13.274912828041842, "grad_norm": 0.00023789280385244638, "learning_rate": 3.0666630392563203e-06, "loss": 0.0, "num_input_tokens_seen": 155987792, "step": 72335 }, { "epoch": 13.275830427601395, "grad_norm": 0.0007365027558989823, "learning_rate": 3.0659245908389122e-06, "loss": 0.0, "num_input_tokens_seen": 155998960, "step": 72340 }, { "epoch": 13.276748027160947, "grad_norm": 0.0009055021801032126, "learning_rate": 3.0651861920275415e-06, "loss": 0.0, "num_input_tokens_seen": 156009872, "step": 72345 }, { "epoch": 13.277665626720498, "grad_norm": 0.0013041780330240726, "learning_rate": 3.064447842841147e-06, "loss": 0.0, "num_input_tokens_seen": 156020720, "step": 72350 }, { "epoch": 13.278583226280052, "grad_norm": 0.000588389637414366, "learning_rate": 3.063709543298663e-06, "loss": 0.0, "num_input_tokens_seen": 156032432, "step": 72355 }, { "epoch": 13.279500825839603, "grad_norm": 0.022168321534991264, "learning_rate": 3.0629712934190294e-06, "loss": 0.0028, "num_input_tokens_seen": 156042032, "step": 72360 }, { "epoch": 13.280418425399155, "grad_norm": 0.05830270051956177, "learning_rate": 3.062233093221181e-06, "loss": 0.0, "num_input_tokens_seen": 156052848, "step": 72365 }, { "epoch": 13.281336024958708, "grad_norm": 0.00017416424816474319, "learning_rate": 3.0614949427240483e-06, "loss": 0.0, "num_input_tokens_seen": 156063312, "step": 72370 }, { "epoch": 13.28225362451826, "grad_norm": 0.0012123851338401437, "learning_rate": 3.060756841946568e-06, "loss": 0.0, "num_input_tokens_seen": 156074288, "step": 72375 }, { "epoch": 13.283171224077812, "grad_norm": 35.74408721923828, "learning_rate": 3.0600187909076683e-06, "loss": 0.1035, "num_input_tokens_seen": 156084816, "step": 72380 }, { "epoch": 13.284088823637365, "grad_norm": 0.00019643237465061247, "learning_rate": 3.059280789626279e-06, "loss": 0.0, "num_input_tokens_seen": 156096208, "step": 72385 }, { "epoch": 13.285006423196917, "grad_norm": 0.0001779358135536313, "learning_rate": 3.0585428381213305e-06, "loss": 0.0, "num_input_tokens_seen": 156106544, "step": 72390 }, { "epoch": 13.285924022756468, "grad_norm": 0.0010651153279468417, "learning_rate": 3.0578049364117502e-06, "loss": 0.1314, "num_input_tokens_seen": 156116752, "step": 72395 }, { "epoch": 13.286841622316022, "grad_norm": 0.00017330943956039846, "learning_rate": 3.057067084516462e-06, "loss": 0.0, "num_input_tokens_seen": 156127440, "step": 72400 }, { "epoch": 13.287759221875573, "grad_norm": 0.0005887112929485738, "learning_rate": 3.0563292824543912e-06, "loss": 0.0, "num_input_tokens_seen": 156139216, "step": 72405 }, { "epoch": 13.288676821435125, "grad_norm": 0.00023529419559054077, "learning_rate": 3.0555915302444626e-06, "loss": 0.0, "num_input_tokens_seen": 156149712, "step": 72410 }, { "epoch": 13.289594420994678, "grad_norm": 0.001242712140083313, "learning_rate": 3.0548538279055986e-06, "loss": 0.0, "num_input_tokens_seen": 156159760, "step": 72415 }, { "epoch": 13.29051202055423, "grad_norm": 0.00024079055583570153, "learning_rate": 3.054116175456717e-06, "loss": 0.0, "num_input_tokens_seen": 156170096, "step": 72420 }, { "epoch": 13.291429620113782, "grad_norm": 0.01785963959991932, "learning_rate": 3.053378572916741e-06, "loss": 0.0, "num_input_tokens_seen": 156182416, "step": 72425 }, { "epoch": 13.292347219673335, "grad_norm": 0.003127181902527809, "learning_rate": 3.0526410203045888e-06, "loss": 0.0, "num_input_tokens_seen": 156192592, "step": 72430 }, { "epoch": 13.293264819232887, "grad_norm": 0.0002806179691106081, "learning_rate": 3.051903517639173e-06, "loss": 0.0, "num_input_tokens_seen": 156202896, "step": 72435 }, { "epoch": 13.294182418792438, "grad_norm": 0.00048199729644693434, "learning_rate": 3.0511660649394153e-06, "loss": 0.0, "num_input_tokens_seen": 156214000, "step": 72440 }, { "epoch": 13.295100018351992, "grad_norm": 0.0005211273673921824, "learning_rate": 3.050428662224228e-06, "loss": 0.0, "num_input_tokens_seen": 156225616, "step": 72445 }, { "epoch": 13.296017617911543, "grad_norm": 0.0008697884622961283, "learning_rate": 3.0496913095125235e-06, "loss": 0.0, "num_input_tokens_seen": 156235120, "step": 72450 }, { "epoch": 13.296935217471095, "grad_norm": 0.00013090016727801412, "learning_rate": 3.0489540068232124e-06, "loss": 0.0, "num_input_tokens_seen": 156245968, "step": 72455 }, { "epoch": 13.297852817030648, "grad_norm": 0.00019455650181043893, "learning_rate": 3.048216754175209e-06, "loss": 0.0, "num_input_tokens_seen": 156255440, "step": 72460 }, { "epoch": 13.2987704165902, "grad_norm": 0.00025232075131498277, "learning_rate": 3.0474795515874212e-06, "loss": 0.0, "num_input_tokens_seen": 156265552, "step": 72465 }, { "epoch": 13.299688016149751, "grad_norm": 0.00018418222316540778, "learning_rate": 3.0467423990787547e-06, "loss": 0.0, "num_input_tokens_seen": 156276688, "step": 72470 }, { "epoch": 13.300605615709305, "grad_norm": 0.00013236385711934417, "learning_rate": 3.046005296668121e-06, "loss": 0.0, "num_input_tokens_seen": 156287344, "step": 72475 }, { "epoch": 13.301523215268857, "grad_norm": 0.002597155049443245, "learning_rate": 3.045268244374422e-06, "loss": 0.0, "num_input_tokens_seen": 156298864, "step": 72480 }, { "epoch": 13.302440814828408, "grad_norm": 0.0031456046272069216, "learning_rate": 3.0445312422165616e-06, "loss": 0.0, "num_input_tokens_seen": 156308784, "step": 72485 }, { "epoch": 13.303358414387962, "grad_norm": 0.0005680195172317326, "learning_rate": 3.0437942902134453e-06, "loss": 0.0, "num_input_tokens_seen": 156320144, "step": 72490 }, { "epoch": 13.304276013947513, "grad_norm": 0.00011811372678494081, "learning_rate": 3.043057388383974e-06, "loss": 0.0, "num_input_tokens_seen": 156330416, "step": 72495 }, { "epoch": 13.305193613507065, "grad_norm": 0.00020071868493687361, "learning_rate": 3.0423205367470475e-06, "loss": 0.0, "num_input_tokens_seen": 156340752, "step": 72500 }, { "epoch": 13.306111213066618, "grad_norm": 0.015591629780828953, "learning_rate": 3.041583735321564e-06, "loss": 0.0, "num_input_tokens_seen": 156351152, "step": 72505 }, { "epoch": 13.30702881262617, "grad_norm": 0.00015857242397032678, "learning_rate": 3.0408469841264234e-06, "loss": 0.0, "num_input_tokens_seen": 156360944, "step": 72510 }, { "epoch": 13.307946412185721, "grad_norm": 0.0009324162383563817, "learning_rate": 3.040110283180522e-06, "loss": 0.0, "num_input_tokens_seen": 156371920, "step": 72515 }, { "epoch": 13.308864011745275, "grad_norm": 0.003903219010680914, "learning_rate": 3.039373632502751e-06, "loss": 0.0101, "num_input_tokens_seen": 156383088, "step": 72520 }, { "epoch": 13.309781611304826, "grad_norm": 0.0002446901926305145, "learning_rate": 3.0386370321120105e-06, "loss": 0.0, "num_input_tokens_seen": 156392848, "step": 72525 }, { "epoch": 13.310699210864378, "grad_norm": 0.0020953963976353407, "learning_rate": 3.0379004820271906e-06, "loss": 0.0, "num_input_tokens_seen": 156404144, "step": 72530 }, { "epoch": 13.311616810423931, "grad_norm": 0.003295666305348277, "learning_rate": 3.037163982267182e-06, "loss": 0.0, "num_input_tokens_seen": 156414768, "step": 72535 }, { "epoch": 13.312534409983483, "grad_norm": 0.0003687766438815743, "learning_rate": 3.0364275328508736e-06, "loss": 0.0002, "num_input_tokens_seen": 156426896, "step": 72540 }, { "epoch": 13.313452009543035, "grad_norm": 0.0001367339282296598, "learning_rate": 3.0356911337971575e-06, "loss": 0.0, "num_input_tokens_seen": 156436496, "step": 72545 }, { "epoch": 13.314369609102588, "grad_norm": 0.0004443309735506773, "learning_rate": 3.034954785124919e-06, "loss": 0.0, "num_input_tokens_seen": 156449360, "step": 72550 }, { "epoch": 13.31528720866214, "grad_norm": 0.00015067571075633168, "learning_rate": 3.0342184868530435e-06, "loss": 0.0, "num_input_tokens_seen": 156459344, "step": 72555 }, { "epoch": 13.316204808221691, "grad_norm": 0.0020296424627304077, "learning_rate": 3.0334822390004183e-06, "loss": 0.0097, "num_input_tokens_seen": 156469680, "step": 72560 }, { "epoch": 13.317122407781245, "grad_norm": 0.0002600681036710739, "learning_rate": 3.0327460415859255e-06, "loss": 0.0532, "num_input_tokens_seen": 156479792, "step": 72565 }, { "epoch": 13.318040007340796, "grad_norm": 0.00046493226545862854, "learning_rate": 3.0320098946284477e-06, "loss": 0.0, "num_input_tokens_seen": 156490736, "step": 72570 }, { "epoch": 13.318957606900348, "grad_norm": 0.0028731608763337135, "learning_rate": 3.0312737981468663e-06, "loss": 0.0478, "num_input_tokens_seen": 156500112, "step": 72575 }, { "epoch": 13.319875206459901, "grad_norm": 0.007833151146769524, "learning_rate": 3.030537752160061e-06, "loss": 0.0, "num_input_tokens_seen": 156509840, "step": 72580 }, { "epoch": 13.320792806019453, "grad_norm": 0.000198078210814856, "learning_rate": 3.0298017566869096e-06, "loss": 0.0822, "num_input_tokens_seen": 156521584, "step": 72585 }, { "epoch": 13.321710405579005, "grad_norm": 0.0001724018802633509, "learning_rate": 3.029065811746289e-06, "loss": 0.0733, "num_input_tokens_seen": 156532208, "step": 72590 }, { "epoch": 13.322628005138558, "grad_norm": 0.00012744523701258004, "learning_rate": 3.0283299173570768e-06, "loss": 0.0, "num_input_tokens_seen": 156543184, "step": 72595 }, { "epoch": 13.32354560469811, "grad_norm": 0.010394629091024399, "learning_rate": 3.0275940735381463e-06, "loss": 0.0, "num_input_tokens_seen": 156553808, "step": 72600 }, { "epoch": 13.324463204257661, "grad_norm": 0.000320484337862581, "learning_rate": 3.026858280308369e-06, "loss": 0.0, "num_input_tokens_seen": 156566352, "step": 72605 }, { "epoch": 13.325380803817215, "grad_norm": 0.05608925223350525, "learning_rate": 3.026122537686621e-06, "loss": 0.0004, "num_input_tokens_seen": 156576688, "step": 72610 }, { "epoch": 13.326298403376766, "grad_norm": 0.0002012178156292066, "learning_rate": 3.02538684569177e-06, "loss": 0.0, "num_input_tokens_seen": 156587408, "step": 72615 }, { "epoch": 13.327216002936318, "grad_norm": 0.0002207919314969331, "learning_rate": 3.0246512043426846e-06, "loss": 0.0, "num_input_tokens_seen": 156598160, "step": 72620 }, { "epoch": 13.328133602495871, "grad_norm": 0.0001429350522812456, "learning_rate": 3.023915613658236e-06, "loss": 0.0022, "num_input_tokens_seen": 156608592, "step": 72625 }, { "epoch": 13.329051202055423, "grad_norm": 0.0002524307928979397, "learning_rate": 3.0231800736572893e-06, "loss": 0.0, "num_input_tokens_seen": 156618736, "step": 72630 }, { "epoch": 13.329968801614974, "grad_norm": 0.000141923752380535, "learning_rate": 3.0224445843587104e-06, "loss": 0.0006, "num_input_tokens_seen": 156629840, "step": 72635 }, { "epoch": 13.330886401174528, "grad_norm": 0.028381172567605972, "learning_rate": 3.0217091457813598e-06, "loss": 0.002, "num_input_tokens_seen": 156640752, "step": 72640 }, { "epoch": 13.33180400073408, "grad_norm": 0.0035965421702712774, "learning_rate": 3.0209737579441067e-06, "loss": 0.0, "num_input_tokens_seen": 156650640, "step": 72645 }, { "epoch": 13.332721600293631, "grad_norm": 0.06593447178602219, "learning_rate": 3.0202384208658086e-06, "loss": 0.0001, "num_input_tokens_seen": 156662864, "step": 72650 }, { "epoch": 13.333639199853184, "grad_norm": 0.00037955716834403574, "learning_rate": 3.0195031345653252e-06, "loss": 0.0, "num_input_tokens_seen": 156673552, "step": 72655 }, { "epoch": 13.334556799412736, "grad_norm": 0.13562718033790588, "learning_rate": 3.0187678990615187e-06, "loss": 0.0001, "num_input_tokens_seen": 156684720, "step": 72660 }, { "epoch": 13.335474398972288, "grad_norm": 0.0004684117157012224, "learning_rate": 3.018032714373245e-06, "loss": 0.0, "num_input_tokens_seen": 156694064, "step": 72665 }, { "epoch": 13.336391998531841, "grad_norm": 0.00016684534784872085, "learning_rate": 3.0172975805193604e-06, "loss": 0.0, "num_input_tokens_seen": 156705264, "step": 72670 }, { "epoch": 13.337309598091393, "grad_norm": 0.058800630271434784, "learning_rate": 3.016562497518719e-06, "loss": 0.0001, "num_input_tokens_seen": 156716528, "step": 72675 }, { "epoch": 13.338227197650944, "grad_norm": 0.00032278557773679495, "learning_rate": 3.0158274653901756e-06, "loss": 0.275, "num_input_tokens_seen": 156729008, "step": 72680 }, { "epoch": 13.339144797210498, "grad_norm": 0.02429022267460823, "learning_rate": 3.0150924841525837e-06, "loss": 0.0, "num_input_tokens_seen": 156739760, "step": 72685 }, { "epoch": 13.34006239677005, "grad_norm": 0.007919017225503922, "learning_rate": 3.0143575538247915e-06, "loss": 0.0, "num_input_tokens_seen": 156750544, "step": 72690 }, { "epoch": 13.340979996329601, "grad_norm": 0.01321782823652029, "learning_rate": 3.0136226744256524e-06, "loss": 0.0, "num_input_tokens_seen": 156760592, "step": 72695 }, { "epoch": 13.341897595889154, "grad_norm": 0.00037682897527702153, "learning_rate": 3.0128878459740128e-06, "loss": 0.0001, "num_input_tokens_seen": 156771472, "step": 72700 }, { "epoch": 13.342815195448706, "grad_norm": 0.000860765459947288, "learning_rate": 3.012153068488718e-06, "loss": 0.0, "num_input_tokens_seen": 156783056, "step": 72705 }, { "epoch": 13.343732795008258, "grad_norm": 0.11405835300683975, "learning_rate": 3.0114183419886183e-06, "loss": 0.0097, "num_input_tokens_seen": 156794192, "step": 72710 }, { "epoch": 13.344650394567811, "grad_norm": 0.002412385307252407, "learning_rate": 3.0106836664925565e-06, "loss": 0.0001, "num_input_tokens_seen": 156805584, "step": 72715 }, { "epoch": 13.345567994127363, "grad_norm": 0.026708083227276802, "learning_rate": 3.0099490420193746e-06, "loss": 0.0, "num_input_tokens_seen": 156816912, "step": 72720 }, { "epoch": 13.346485593686914, "grad_norm": 0.00027528320788405836, "learning_rate": 3.0092144685879144e-06, "loss": 0.0, "num_input_tokens_seen": 156827248, "step": 72725 }, { "epoch": 13.347403193246468, "grad_norm": 0.00930674560368061, "learning_rate": 3.0084799462170187e-06, "loss": 0.0, "num_input_tokens_seen": 156838096, "step": 72730 }, { "epoch": 13.34832079280602, "grad_norm": 0.00021309226576704532, "learning_rate": 3.0077454749255262e-06, "loss": 0.0, "num_input_tokens_seen": 156848368, "step": 72735 }, { "epoch": 13.34923839236557, "grad_norm": 0.0030049853958189487, "learning_rate": 3.007011054732273e-06, "loss": 0.0, "num_input_tokens_seen": 156858192, "step": 72740 }, { "epoch": 13.350155991925124, "grad_norm": 0.00036263983929529786, "learning_rate": 3.006276685656099e-06, "loss": 0.0001, "num_input_tokens_seen": 156868784, "step": 72745 }, { "epoch": 13.351073591484676, "grad_norm": 0.0006379205151461065, "learning_rate": 3.005542367715838e-06, "loss": 0.0, "num_input_tokens_seen": 156879408, "step": 72750 }, { "epoch": 13.351991191044227, "grad_norm": 0.0003853125963360071, "learning_rate": 3.004808100930322e-06, "loss": 0.0, "num_input_tokens_seen": 156891120, "step": 72755 }, { "epoch": 13.35290879060378, "grad_norm": 0.008243446238338947, "learning_rate": 3.004073885318388e-06, "loss": 0.0, "num_input_tokens_seen": 156899920, "step": 72760 }, { "epoch": 13.353826390163333, "grad_norm": 0.0008469125023111701, "learning_rate": 3.0033397208988656e-06, "loss": 0.0, "num_input_tokens_seen": 156912080, "step": 72765 }, { "epoch": 13.354743989722884, "grad_norm": 0.00048174418043345213, "learning_rate": 3.002605607690585e-06, "loss": 0.0, "num_input_tokens_seen": 156922224, "step": 72770 }, { "epoch": 13.355661589282438, "grad_norm": 0.0008208011859096587, "learning_rate": 3.0018715457123725e-06, "loss": 0.0, "num_input_tokens_seen": 156931408, "step": 72775 }, { "epoch": 13.35657918884199, "grad_norm": 0.008944203145802021, "learning_rate": 3.001137534983061e-06, "loss": 0.0, "num_input_tokens_seen": 156941008, "step": 72780 }, { "epoch": 13.35749678840154, "grad_norm": 0.0006166907260194421, "learning_rate": 3.000403575521472e-06, "loss": 0.0, "num_input_tokens_seen": 156951280, "step": 72785 }, { "epoch": 13.358414387961094, "grad_norm": 0.0002559540735092014, "learning_rate": 2.999669667346432e-06, "loss": 0.0, "num_input_tokens_seen": 156961168, "step": 72790 }, { "epoch": 13.359331987520646, "grad_norm": 0.0001965896663023159, "learning_rate": 2.9989358104767663e-06, "loss": 0.0, "num_input_tokens_seen": 156972304, "step": 72795 }, { "epoch": 13.360249587080197, "grad_norm": 0.00027623679488897324, "learning_rate": 2.9982020049312945e-06, "loss": 0.0001, "num_input_tokens_seen": 156983824, "step": 72800 }, { "epoch": 13.36116718663975, "grad_norm": 0.0005337633192539215, "learning_rate": 2.99746825072884e-06, "loss": 0.0001, "num_input_tokens_seen": 156994352, "step": 72805 }, { "epoch": 13.362084786199302, "grad_norm": 0.005866335704922676, "learning_rate": 2.996734547888219e-06, "loss": 0.0, "num_input_tokens_seen": 157005584, "step": 72810 }, { "epoch": 13.363002385758854, "grad_norm": 0.00036380128585733473, "learning_rate": 2.9960008964282544e-06, "loss": 0.0, "num_input_tokens_seen": 157016720, "step": 72815 }, { "epoch": 13.363919985318407, "grad_norm": 0.0007323671015910804, "learning_rate": 2.9952672963677604e-06, "loss": 0.0, "num_input_tokens_seen": 157026448, "step": 72820 }, { "epoch": 13.364837584877959, "grad_norm": 0.00039634172571823, "learning_rate": 2.994533747725551e-06, "loss": 0.0, "num_input_tokens_seen": 157037584, "step": 72825 }, { "epoch": 13.36575518443751, "grad_norm": 0.00013748278433922678, "learning_rate": 2.9938002505204457e-06, "loss": 0.0, "num_input_tokens_seen": 157048624, "step": 72830 }, { "epoch": 13.366672783997064, "grad_norm": 0.0023641360457986593, "learning_rate": 2.9930668047712536e-06, "loss": 0.0001, "num_input_tokens_seen": 157058736, "step": 72835 }, { "epoch": 13.367590383556616, "grad_norm": 0.00023373891599476337, "learning_rate": 2.992333410496786e-06, "loss": 0.0, "num_input_tokens_seen": 157070608, "step": 72840 }, { "epoch": 13.368507983116167, "grad_norm": 0.06627237796783447, "learning_rate": 2.991600067715856e-06, "loss": 0.0, "num_input_tokens_seen": 157081520, "step": 72845 }, { "epoch": 13.36942558267572, "grad_norm": 0.00020133574435021728, "learning_rate": 2.990866776447272e-06, "loss": 0.0, "num_input_tokens_seen": 157092368, "step": 72850 }, { "epoch": 13.370343182235272, "grad_norm": 0.0005903664859943092, "learning_rate": 2.9901335367098416e-06, "loss": 0.0, "num_input_tokens_seen": 157102736, "step": 72855 }, { "epoch": 13.371260781794824, "grad_norm": 55.87839889526367, "learning_rate": 2.9894003485223687e-06, "loss": 0.1689, "num_input_tokens_seen": 157113360, "step": 72860 }, { "epoch": 13.372178381354377, "grad_norm": 0.00025060330517590046, "learning_rate": 2.988667211903663e-06, "loss": 0.0, "num_input_tokens_seen": 157124432, "step": 72865 }, { "epoch": 13.373095980913929, "grad_norm": 0.0011661077151075006, "learning_rate": 2.987934126872526e-06, "loss": 0.1221, "num_input_tokens_seen": 157134896, "step": 72870 }, { "epoch": 13.37401358047348, "grad_norm": 0.0002033884811680764, "learning_rate": 2.987201093447758e-06, "loss": 0.0001, "num_input_tokens_seen": 157146576, "step": 72875 }, { "epoch": 13.374931180033034, "grad_norm": 0.00012272577441763133, "learning_rate": 2.9864681116481655e-06, "loss": 0.0063, "num_input_tokens_seen": 157157104, "step": 72880 }, { "epoch": 13.375848779592586, "grad_norm": 0.0002835276536643505, "learning_rate": 2.985735181492544e-06, "loss": 0.0, "num_input_tokens_seen": 157168752, "step": 72885 }, { "epoch": 13.376766379152137, "grad_norm": 0.0003948794037569314, "learning_rate": 2.9850023029996923e-06, "loss": 0.0, "num_input_tokens_seen": 157180048, "step": 72890 }, { "epoch": 13.37768397871169, "grad_norm": 0.0003516105061862618, "learning_rate": 2.9842694761884095e-06, "loss": 0.3969, "num_input_tokens_seen": 157190288, "step": 72895 }, { "epoch": 13.378601578271242, "grad_norm": 0.0003780663828365505, "learning_rate": 2.9835367010774903e-06, "loss": 0.0, "num_input_tokens_seen": 157200656, "step": 72900 }, { "epoch": 13.379519177830794, "grad_norm": 0.00028543968801386654, "learning_rate": 2.98280397768573e-06, "loss": 0.0, "num_input_tokens_seen": 157211376, "step": 72905 }, { "epoch": 13.380436777390347, "grad_norm": 0.0003133130958303809, "learning_rate": 2.98207130603192e-06, "loss": 0.0, "num_input_tokens_seen": 157220176, "step": 72910 }, { "epoch": 13.381354376949899, "grad_norm": 0.0024775867350399494, "learning_rate": 2.981338686134855e-06, "loss": 0.0001, "num_input_tokens_seen": 157230160, "step": 72915 }, { "epoch": 13.38227197650945, "grad_norm": 0.03220295161008835, "learning_rate": 2.980606118013324e-06, "loss": 0.0, "num_input_tokens_seen": 157240240, "step": 72920 }, { "epoch": 13.383189576069004, "grad_norm": 0.00278524705208838, "learning_rate": 2.979873601686114e-06, "loss": 0.0, "num_input_tokens_seen": 157250896, "step": 72925 }, { "epoch": 13.384107175628555, "grad_norm": 0.0009604545775800943, "learning_rate": 2.9791411371720168e-06, "loss": 0.0001, "num_input_tokens_seen": 157260976, "step": 72930 }, { "epoch": 13.385024775188107, "grad_norm": 0.00044174937647767365, "learning_rate": 2.9784087244898184e-06, "loss": 0.0, "num_input_tokens_seen": 157269936, "step": 72935 }, { "epoch": 13.38594237474766, "grad_norm": 0.006937554571777582, "learning_rate": 2.9776763636583007e-06, "loss": 0.0, "num_input_tokens_seen": 157280688, "step": 72940 }, { "epoch": 13.386859974307212, "grad_norm": 0.0006725976709276438, "learning_rate": 2.976944054696252e-06, "loss": 0.0, "num_input_tokens_seen": 157292368, "step": 72945 }, { "epoch": 13.387777573866764, "grad_norm": 0.02295728586614132, "learning_rate": 2.9762117976224526e-06, "loss": 0.0, "num_input_tokens_seen": 157303632, "step": 72950 }, { "epoch": 13.388695173426317, "grad_norm": 0.005038758274167776, "learning_rate": 2.975479592455684e-06, "loss": 0.0, "num_input_tokens_seen": 157313744, "step": 72955 }, { "epoch": 13.389612772985869, "grad_norm": 0.00035397123428992927, "learning_rate": 2.974747439214724e-06, "loss": 0.0, "num_input_tokens_seen": 157324304, "step": 72960 }, { "epoch": 13.39053037254542, "grad_norm": 0.00019587641872931272, "learning_rate": 2.9740153379183555e-06, "loss": 0.0, "num_input_tokens_seen": 157334416, "step": 72965 }, { "epoch": 13.391447972104974, "grad_norm": 0.026425106450915337, "learning_rate": 2.9732832885853535e-06, "loss": 0.0001, "num_input_tokens_seen": 157345360, "step": 72970 }, { "epoch": 13.392365571664525, "grad_norm": 0.004346094559878111, "learning_rate": 2.9725512912344923e-06, "loss": 0.0, "num_input_tokens_seen": 157356656, "step": 72975 }, { "epoch": 13.393283171224077, "grad_norm": 0.0002756238682195544, "learning_rate": 2.97181934588455e-06, "loss": 0.0, "num_input_tokens_seen": 157367248, "step": 72980 }, { "epoch": 13.39420077078363, "grad_norm": 0.0027298261411488056, "learning_rate": 2.971087452554299e-06, "loss": 0.0, "num_input_tokens_seen": 157378096, "step": 72985 }, { "epoch": 13.395118370343182, "grad_norm": 0.00029741012258455157, "learning_rate": 2.9703556112625086e-06, "loss": 0.0, "num_input_tokens_seen": 157389168, "step": 72990 }, { "epoch": 13.396035969902734, "grad_norm": 0.016221901401877403, "learning_rate": 2.9696238220279505e-06, "loss": 0.0, "num_input_tokens_seen": 157400528, "step": 72995 }, { "epoch": 13.396953569462287, "grad_norm": 0.00014064052083995193, "learning_rate": 2.968892084869396e-06, "loss": 0.0, "num_input_tokens_seen": 157412400, "step": 73000 }, { "epoch": 13.397871169021839, "grad_norm": 28.76251983642578, "learning_rate": 2.968160399805612e-06, "loss": 0.0822, "num_input_tokens_seen": 157424560, "step": 73005 }, { "epoch": 13.39878876858139, "grad_norm": 0.23612767457962036, "learning_rate": 2.9674287668553624e-06, "loss": 0.0004, "num_input_tokens_seen": 157434320, "step": 73010 }, { "epoch": 13.399706368140944, "grad_norm": 0.00025346307666040957, "learning_rate": 2.9666971860374173e-06, "loss": 0.0001, "num_input_tokens_seen": 157445936, "step": 73015 }, { "epoch": 13.400623967700495, "grad_norm": 0.003592663211748004, "learning_rate": 2.9659656573705374e-06, "loss": 0.0, "num_input_tokens_seen": 157455344, "step": 73020 }, { "epoch": 13.401541567260047, "grad_norm": 0.00032316482975147665, "learning_rate": 2.965234180873484e-06, "loss": 0.0, "num_input_tokens_seen": 157465744, "step": 73025 }, { "epoch": 13.4024591668196, "grad_norm": 0.0001858249888755381, "learning_rate": 2.964502756565022e-06, "loss": 0.0, "num_input_tokens_seen": 157475024, "step": 73030 }, { "epoch": 13.403376766379152, "grad_norm": 0.037610992789268494, "learning_rate": 2.9637713844639092e-06, "loss": 0.0001, "num_input_tokens_seen": 157485744, "step": 73035 }, { "epoch": 13.404294365938703, "grad_norm": 0.0016912161372601986, "learning_rate": 2.9630400645889055e-06, "loss": 0.0329, "num_input_tokens_seen": 157496848, "step": 73040 }, { "epoch": 13.405211965498257, "grad_norm": 0.0004917022306472063, "learning_rate": 2.9623087969587648e-06, "loss": 0.0, "num_input_tokens_seen": 157508400, "step": 73045 }, { "epoch": 13.406129565057809, "grad_norm": 0.0004350666713435203, "learning_rate": 2.961577581592247e-06, "loss": 0.0284, "num_input_tokens_seen": 157518832, "step": 73050 }, { "epoch": 13.40704716461736, "grad_norm": 0.0023879664950072765, "learning_rate": 2.9608464185081055e-06, "loss": 0.0, "num_input_tokens_seen": 157530160, "step": 73055 }, { "epoch": 13.407964764176914, "grad_norm": 0.014470046386122704, "learning_rate": 2.9601153077250907e-06, "loss": 0.0, "num_input_tokens_seen": 157539920, "step": 73060 }, { "epoch": 13.408882363736465, "grad_norm": 0.0009870543144643307, "learning_rate": 2.9593842492619584e-06, "loss": 0.0, "num_input_tokens_seen": 157551152, "step": 73065 }, { "epoch": 13.409799963296017, "grad_norm": 0.00019593043543864042, "learning_rate": 2.9586532431374583e-06, "loss": 0.0001, "num_input_tokens_seen": 157561648, "step": 73070 }, { "epoch": 13.41071756285557, "grad_norm": 0.005996497813612223, "learning_rate": 2.957922289370335e-06, "loss": 0.0001, "num_input_tokens_seen": 157572240, "step": 73075 }, { "epoch": 13.411635162415122, "grad_norm": 0.00022252551571000367, "learning_rate": 2.9571913879793433e-06, "loss": 0.0, "num_input_tokens_seen": 157581744, "step": 73080 }, { "epoch": 13.412552761974673, "grad_norm": 0.0011266723740845919, "learning_rate": 2.9564605389832267e-06, "loss": 0.0, "num_input_tokens_seen": 157593776, "step": 73085 }, { "epoch": 13.413470361534227, "grad_norm": 0.003019080962985754, "learning_rate": 2.9557297424007296e-06, "loss": 0.0, "num_input_tokens_seen": 157604816, "step": 73090 }, { "epoch": 13.414387961093778, "grad_norm": 0.00033550095395185053, "learning_rate": 2.9549989982505943e-06, "loss": 0.0, "num_input_tokens_seen": 157615728, "step": 73095 }, { "epoch": 13.41530556065333, "grad_norm": 0.0002746653335634619, "learning_rate": 2.9542683065515678e-06, "loss": 0.0, "num_input_tokens_seen": 157626352, "step": 73100 }, { "epoch": 13.416223160212883, "grad_norm": 0.0002521701098885387, "learning_rate": 2.953537667322388e-06, "loss": 0.0, "num_input_tokens_seen": 157637104, "step": 73105 }, { "epoch": 13.417140759772435, "grad_norm": 0.0002687309170141816, "learning_rate": 2.9528070805817945e-06, "loss": 0.0001, "num_input_tokens_seen": 157647088, "step": 73110 }, { "epoch": 13.418058359331987, "grad_norm": 0.031004328280687332, "learning_rate": 2.952076546348527e-06, "loss": 0.0, "num_input_tokens_seen": 157658224, "step": 73115 }, { "epoch": 13.41897595889154, "grad_norm": 467.398193359375, "learning_rate": 2.9513460646413215e-06, "loss": 0.0284, "num_input_tokens_seen": 157668272, "step": 73120 }, { "epoch": 13.419893558451092, "grad_norm": 0.00015144023927859962, "learning_rate": 2.9506156354789156e-06, "loss": 0.0, "num_input_tokens_seen": 157679888, "step": 73125 }, { "epoch": 13.420811158010643, "grad_norm": 0.0001417254825355485, "learning_rate": 2.949885258880041e-06, "loss": 0.0, "num_input_tokens_seen": 157691088, "step": 73130 }, { "epoch": 13.421728757570197, "grad_norm": 0.00021863591973669827, "learning_rate": 2.9491549348634335e-06, "loss": 0.0, "num_input_tokens_seen": 157702256, "step": 73135 }, { "epoch": 13.422646357129748, "grad_norm": 0.0011898797238245606, "learning_rate": 2.948424663447823e-06, "loss": 0.0, "num_input_tokens_seen": 157712976, "step": 73140 }, { "epoch": 13.4235639566893, "grad_norm": 0.01241689920425415, "learning_rate": 2.9476944446519383e-06, "loss": 0.0, "num_input_tokens_seen": 157722960, "step": 73145 }, { "epoch": 13.424481556248853, "grad_norm": 0.07091832160949707, "learning_rate": 2.946964278494513e-06, "loss": 0.0001, "num_input_tokens_seen": 157733808, "step": 73150 }, { "epoch": 13.425399155808405, "grad_norm": 0.0002349046990275383, "learning_rate": 2.946234164994271e-06, "loss": 0.0, "num_input_tokens_seen": 157745360, "step": 73155 }, { "epoch": 13.426316755367957, "grad_norm": 0.0014604080934077501, "learning_rate": 2.945504104169938e-06, "loss": 0.0, "num_input_tokens_seen": 157756272, "step": 73160 }, { "epoch": 13.42723435492751, "grad_norm": 0.00046178718912415206, "learning_rate": 2.9447740960402428e-06, "loss": 0.0, "num_input_tokens_seen": 157767472, "step": 73165 }, { "epoch": 13.428151954487062, "grad_norm": 0.0003116592124570161, "learning_rate": 2.9440441406239064e-06, "loss": 0.0, "num_input_tokens_seen": 157777680, "step": 73170 }, { "epoch": 13.429069554046613, "grad_norm": 0.00032315996941179037, "learning_rate": 2.943314237939652e-06, "loss": 0.0, "num_input_tokens_seen": 157787984, "step": 73175 }, { "epoch": 13.429987153606167, "grad_norm": 0.00017381682118866593, "learning_rate": 2.9425843880061966e-06, "loss": 0.0001, "num_input_tokens_seen": 157800048, "step": 73180 }, { "epoch": 13.430904753165718, "grad_norm": 0.00036406569415703416, "learning_rate": 2.941854590842266e-06, "loss": 0.0, "num_input_tokens_seen": 157811472, "step": 73185 }, { "epoch": 13.43182235272527, "grad_norm": 0.005360319744795561, "learning_rate": 2.9411248464665748e-06, "loss": 0.0, "num_input_tokens_seen": 157821904, "step": 73190 }, { "epoch": 13.432739952284823, "grad_norm": 0.00023984788276720792, "learning_rate": 2.9403951548978382e-06, "loss": 0.0, "num_input_tokens_seen": 157832176, "step": 73195 }, { "epoch": 13.433657551844375, "grad_norm": 0.0002733927103690803, "learning_rate": 2.939665516154776e-06, "loss": 0.0, "num_input_tokens_seen": 157843280, "step": 73200 }, { "epoch": 13.434575151403926, "grad_norm": 0.00021450076019391418, "learning_rate": 2.9389359302561004e-06, "loss": 0.0002, "num_input_tokens_seen": 157853872, "step": 73205 }, { "epoch": 13.43549275096348, "grad_norm": 0.0001665269664954394, "learning_rate": 2.938206397220523e-06, "loss": 0.0, "num_input_tokens_seen": 157866064, "step": 73210 }, { "epoch": 13.436410350523031, "grad_norm": 0.0006923821056261659, "learning_rate": 2.937476917066756e-06, "loss": 0.0, "num_input_tokens_seen": 157877200, "step": 73215 }, { "epoch": 13.437327950082583, "grad_norm": 0.003482334315776825, "learning_rate": 2.9367474898135095e-06, "loss": 0.0, "num_input_tokens_seen": 157887824, "step": 73220 }, { "epoch": 13.438245549642136, "grad_norm": 0.0025405168998986483, "learning_rate": 2.9360181154794927e-06, "loss": 0.0, "num_input_tokens_seen": 157897840, "step": 73225 }, { "epoch": 13.439163149201688, "grad_norm": 0.00011957171955145895, "learning_rate": 2.9352887940834115e-06, "loss": 0.0, "num_input_tokens_seen": 157909968, "step": 73230 }, { "epoch": 13.44008074876124, "grad_norm": 0.0006800955161452293, "learning_rate": 2.9345595256439727e-06, "loss": 0.0, "num_input_tokens_seen": 157919952, "step": 73235 }, { "epoch": 13.440998348320793, "grad_norm": 0.00048564589815214276, "learning_rate": 2.9338303101798825e-06, "loss": 0.2438, "num_input_tokens_seen": 157930192, "step": 73240 }, { "epoch": 13.441915947880345, "grad_norm": 0.000162370823090896, "learning_rate": 2.93310114770984e-06, "loss": 0.0, "num_input_tokens_seen": 157941104, "step": 73245 }, { "epoch": 13.442833547439896, "grad_norm": 0.0038268098141998053, "learning_rate": 2.932372038252551e-06, "loss": 0.0, "num_input_tokens_seen": 157952080, "step": 73250 }, { "epoch": 13.44375114699945, "grad_norm": 0.00026197408442385495, "learning_rate": 2.9316429818267156e-06, "loss": 0.0, "num_input_tokens_seen": 157961808, "step": 73255 }, { "epoch": 13.444668746559001, "grad_norm": 0.025314755737781525, "learning_rate": 2.9309139784510313e-06, "loss": 0.0, "num_input_tokens_seen": 157973072, "step": 73260 }, { "epoch": 13.445586346118553, "grad_norm": 0.0006449577631428838, "learning_rate": 2.9301850281441953e-06, "loss": 0.0, "num_input_tokens_seen": 157983280, "step": 73265 }, { "epoch": 13.446503945678106, "grad_norm": 0.0002314950543222949, "learning_rate": 2.929456130924907e-06, "loss": 0.0, "num_input_tokens_seen": 157994224, "step": 73270 }, { "epoch": 13.447421545237658, "grad_norm": 0.0010797781869769096, "learning_rate": 2.92872728681186e-06, "loss": 0.0, "num_input_tokens_seen": 158004496, "step": 73275 }, { "epoch": 13.44833914479721, "grad_norm": 0.0011244314955547452, "learning_rate": 2.9279984958237462e-06, "loss": 0.0, "num_input_tokens_seen": 158015088, "step": 73280 }, { "epoch": 13.449256744356763, "grad_norm": 0.00023290797253139317, "learning_rate": 2.927269757979261e-06, "loss": 0.0012, "num_input_tokens_seen": 158026288, "step": 73285 }, { "epoch": 13.450174343916315, "grad_norm": 0.000392169167753309, "learning_rate": 2.9265410732970943e-06, "loss": 0.0, "num_input_tokens_seen": 158038096, "step": 73290 }, { "epoch": 13.451091943475866, "grad_norm": 0.0007269567577168345, "learning_rate": 2.9258124417959337e-06, "loss": 0.0, "num_input_tokens_seen": 158048336, "step": 73295 }, { "epoch": 13.45200954303542, "grad_norm": 0.0014408663846552372, "learning_rate": 2.9250838634944713e-06, "loss": 0.0, "num_input_tokens_seen": 158058096, "step": 73300 }, { "epoch": 13.452927142594971, "grad_norm": 0.00017472227045800537, "learning_rate": 2.924355338411392e-06, "loss": 0.0, "num_input_tokens_seen": 158066704, "step": 73305 }, { "epoch": 13.453844742154523, "grad_norm": 0.0007304511382244527, "learning_rate": 2.923626866565381e-06, "loss": 0.0, "num_input_tokens_seen": 158077488, "step": 73310 }, { "epoch": 13.454762341714076, "grad_norm": 0.0006652242154814303, "learning_rate": 2.922898447975121e-06, "loss": 0.0, "num_input_tokens_seen": 158089008, "step": 73315 }, { "epoch": 13.455679941273628, "grad_norm": 0.0006938709411770105, "learning_rate": 2.922170082659299e-06, "loss": 0.3063, "num_input_tokens_seen": 158099248, "step": 73320 }, { "epoch": 13.45659754083318, "grad_norm": 0.0003598584153223783, "learning_rate": 2.9214417706365933e-06, "loss": 0.0, "num_input_tokens_seen": 158109712, "step": 73325 }, { "epoch": 13.457515140392733, "grad_norm": 0.000643118575681001, "learning_rate": 2.920713511925684e-06, "loss": 0.0, "num_input_tokens_seen": 158120624, "step": 73330 }, { "epoch": 13.458432739952285, "grad_norm": 0.0012139249593019485, "learning_rate": 2.9199853065452515e-06, "loss": 0.0, "num_input_tokens_seen": 158131088, "step": 73335 }, { "epoch": 13.459350339511836, "grad_norm": 45.772117614746094, "learning_rate": 2.9192571545139715e-06, "loss": 0.019, "num_input_tokens_seen": 158142352, "step": 73340 }, { "epoch": 13.46026793907139, "grad_norm": 0.000989639782346785, "learning_rate": 2.918529055850519e-06, "loss": 0.0, "num_input_tokens_seen": 158153360, "step": 73345 }, { "epoch": 13.461185538630941, "grad_norm": 0.0002024302666541189, "learning_rate": 2.9178010105735725e-06, "loss": 0.0, "num_input_tokens_seen": 158164912, "step": 73350 }, { "epoch": 13.462103138190493, "grad_norm": 0.0004636548110283911, "learning_rate": 2.917073018701804e-06, "loss": 0.0, "num_input_tokens_seen": 158176304, "step": 73355 }, { "epoch": 13.463020737750046, "grad_norm": 0.013159075751900673, "learning_rate": 2.916345080253883e-06, "loss": 0.0023, "num_input_tokens_seen": 158187472, "step": 73360 }, { "epoch": 13.463938337309598, "grad_norm": 0.017193051055073738, "learning_rate": 2.915617195248479e-06, "loss": 0.0001, "num_input_tokens_seen": 158197712, "step": 73365 }, { "epoch": 13.46485593686915, "grad_norm": 0.00035896318149752915, "learning_rate": 2.9148893637042663e-06, "loss": 0.0, "num_input_tokens_seen": 158208624, "step": 73370 }, { "epoch": 13.465773536428703, "grad_norm": 0.00014091355842538178, "learning_rate": 2.9141615856399095e-06, "loss": 0.0016, "num_input_tokens_seen": 158219568, "step": 73375 }, { "epoch": 13.466691135988254, "grad_norm": 2.246687412261963, "learning_rate": 2.9134338610740754e-06, "loss": 0.0003, "num_input_tokens_seen": 158231184, "step": 73380 }, { "epoch": 13.467608735547808, "grad_norm": 0.00031807596678845584, "learning_rate": 2.9127061900254295e-06, "loss": 0.0, "num_input_tokens_seen": 158241520, "step": 73385 }, { "epoch": 13.46852633510736, "grad_norm": 0.0012379152467474341, "learning_rate": 2.9119785725126316e-06, "loss": 0.0, "num_input_tokens_seen": 158251632, "step": 73390 }, { "epoch": 13.469443934666911, "grad_norm": 0.07546044141054153, "learning_rate": 2.9112510085543497e-06, "loss": 0.0002, "num_input_tokens_seen": 158261712, "step": 73395 }, { "epoch": 13.470361534226464, "grad_norm": 0.01593788154423237, "learning_rate": 2.910523498169242e-06, "loss": 0.0, "num_input_tokens_seen": 158273072, "step": 73400 }, { "epoch": 13.471279133786016, "grad_norm": 0.0003267330175731331, "learning_rate": 2.9097960413759683e-06, "loss": 0.0, "num_input_tokens_seen": 158282224, "step": 73405 }, { "epoch": 13.472196733345568, "grad_norm": 0.012677961029112339, "learning_rate": 2.9090686381931876e-06, "loss": 0.0001, "num_input_tokens_seen": 158293520, "step": 73410 }, { "epoch": 13.473114332905121, "grad_norm": 0.003019575960934162, "learning_rate": 2.9083412886395522e-06, "loss": 0.0, "num_input_tokens_seen": 158304784, "step": 73415 }, { "epoch": 13.474031932464673, "grad_norm": 0.0004942941013723612, "learning_rate": 2.907613992733724e-06, "loss": 0.0001, "num_input_tokens_seen": 158316016, "step": 73420 }, { "epoch": 13.474949532024224, "grad_norm": 0.005554075352847576, "learning_rate": 2.906886750494353e-06, "loss": 0.0, "num_input_tokens_seen": 158326832, "step": 73425 }, { "epoch": 13.475867131583778, "grad_norm": 0.0042525785975158215, "learning_rate": 2.9061595619400918e-06, "loss": 0.0001, "num_input_tokens_seen": 158338544, "step": 73430 }, { "epoch": 13.47678473114333, "grad_norm": 0.0005743250367231667, "learning_rate": 2.905432427089594e-06, "loss": 0.0, "num_input_tokens_seen": 158350928, "step": 73435 }, { "epoch": 13.477702330702881, "grad_norm": 0.06076057255268097, "learning_rate": 2.9047053459615083e-06, "loss": 0.2985, "num_input_tokens_seen": 158362224, "step": 73440 }, { "epoch": 13.478619930262434, "grad_norm": 0.00021542432659771293, "learning_rate": 2.903978318574483e-06, "loss": 0.0005, "num_input_tokens_seen": 158371632, "step": 73445 }, { "epoch": 13.479537529821986, "grad_norm": 0.0012153981951996684, "learning_rate": 2.903251344947164e-06, "loss": 0.0, "num_input_tokens_seen": 158382480, "step": 73450 }, { "epoch": 13.480455129381538, "grad_norm": 0.06366627663373947, "learning_rate": 2.9025244250982e-06, "loss": 0.0, "num_input_tokens_seen": 158392464, "step": 73455 }, { "epoch": 13.481372728941091, "grad_norm": 0.0002737441973295063, "learning_rate": 2.9017975590462332e-06, "loss": 0.0001, "num_input_tokens_seen": 158404208, "step": 73460 }, { "epoch": 13.482290328500643, "grad_norm": 0.00028448188095353544, "learning_rate": 2.9010707468099054e-06, "loss": 0.0, "num_input_tokens_seen": 158415568, "step": 73465 }, { "epoch": 13.483207928060194, "grad_norm": 0.003496627788990736, "learning_rate": 2.9003439884078615e-06, "loss": 0.0002, "num_input_tokens_seen": 158426928, "step": 73470 }, { "epoch": 13.484125527619748, "grad_norm": 0.04376363381743431, "learning_rate": 2.899617283858741e-06, "loss": 0.0, "num_input_tokens_seen": 158437648, "step": 73475 }, { "epoch": 13.4850431271793, "grad_norm": 0.00019616201461758465, "learning_rate": 2.8988906331811788e-06, "loss": 0.0004, "num_input_tokens_seen": 158449168, "step": 73480 }, { "epoch": 13.48596072673885, "grad_norm": 0.00017952370399143547, "learning_rate": 2.898164036393818e-06, "loss": 0.0001, "num_input_tokens_seen": 158459856, "step": 73485 }, { "epoch": 13.486878326298404, "grad_norm": 0.004107474349439144, "learning_rate": 2.897437493515293e-06, "loss": 0.0, "num_input_tokens_seen": 158470064, "step": 73490 }, { "epoch": 13.487795925857956, "grad_norm": 0.00035221161670051515, "learning_rate": 2.896711004564236e-06, "loss": 0.0, "num_input_tokens_seen": 158480528, "step": 73495 }, { "epoch": 13.488713525417507, "grad_norm": 0.000249865697696805, "learning_rate": 2.8959845695592807e-06, "loss": 0.0, "num_input_tokens_seen": 158490832, "step": 73500 }, { "epoch": 13.48963112497706, "grad_norm": 12.996072769165039, "learning_rate": 2.895258188519062e-06, "loss": 0.007, "num_input_tokens_seen": 158501104, "step": 73505 }, { "epoch": 13.490548724536612, "grad_norm": 0.004281848203390837, "learning_rate": 2.894531861462209e-06, "loss": 0.0, "num_input_tokens_seen": 158513168, "step": 73510 }, { "epoch": 13.491466324096164, "grad_norm": 0.002632303861901164, "learning_rate": 2.8938055884073492e-06, "loss": 0.0, "num_input_tokens_seen": 158522384, "step": 73515 }, { "epoch": 13.492383923655717, "grad_norm": 0.0003499349986668676, "learning_rate": 2.893079369373113e-06, "loss": 0.0, "num_input_tokens_seen": 158533008, "step": 73520 }, { "epoch": 13.493301523215269, "grad_norm": 0.00011755975720006973, "learning_rate": 2.8923532043781254e-06, "loss": 0.0, "num_input_tokens_seen": 158544464, "step": 73525 }, { "epoch": 13.49421912277482, "grad_norm": 0.00032660618308000267, "learning_rate": 2.8916270934410097e-06, "loss": 0.0, "num_input_tokens_seen": 158556368, "step": 73530 }, { "epoch": 13.495136722334374, "grad_norm": 0.00013189991295803338, "learning_rate": 2.8909010365803934e-06, "loss": 0.0, "num_input_tokens_seen": 158567664, "step": 73535 }, { "epoch": 13.496054321893926, "grad_norm": 0.001120201195590198, "learning_rate": 2.890175033814897e-06, "loss": 0.0, "num_input_tokens_seen": 158578576, "step": 73540 }, { "epoch": 13.496971921453477, "grad_norm": 0.000310817762510851, "learning_rate": 2.8894490851631405e-06, "loss": 0.0, "num_input_tokens_seen": 158589008, "step": 73545 }, { "epoch": 13.49788952101303, "grad_norm": 236.04873657226562, "learning_rate": 2.8887231906437417e-06, "loss": 0.056, "num_input_tokens_seen": 158599472, "step": 73550 }, { "epoch": 13.498807120572582, "grad_norm": 0.0017012981697916985, "learning_rate": 2.887997350275324e-06, "loss": 0.0532, "num_input_tokens_seen": 158610480, "step": 73555 }, { "epoch": 13.499724720132134, "grad_norm": 0.01667126454412937, "learning_rate": 2.8872715640765003e-06, "loss": 0.0, "num_input_tokens_seen": 158622032, "step": 73560 }, { "epoch": 13.500642319691687, "grad_norm": 0.00520982313901186, "learning_rate": 2.8865458320658844e-06, "loss": 0.0, "num_input_tokens_seen": 158632848, "step": 73565 }, { "epoch": 13.501559919251239, "grad_norm": 0.0029757979791611433, "learning_rate": 2.8858201542620945e-06, "loss": 0.0, "num_input_tokens_seen": 158645200, "step": 73570 }, { "epoch": 13.50247751881079, "grad_norm": 0.00010357663268223405, "learning_rate": 2.8850945306837406e-06, "loss": 0.0, "num_input_tokens_seen": 158656752, "step": 73575 }, { "epoch": 13.503395118370344, "grad_norm": 0.0016228621825575829, "learning_rate": 2.8843689613494352e-06, "loss": 0.0, "num_input_tokens_seen": 158666768, "step": 73580 }, { "epoch": 13.504312717929896, "grad_norm": 0.0002941351558547467, "learning_rate": 2.883643446277784e-06, "loss": 0.0, "num_input_tokens_seen": 158677968, "step": 73585 }, { "epoch": 13.505230317489447, "grad_norm": 0.0011852228781208396, "learning_rate": 2.8829179854874013e-06, "loss": 0.0, "num_input_tokens_seen": 158688944, "step": 73590 }, { "epoch": 13.506147917049, "grad_norm": 0.0051525854505598545, "learning_rate": 2.88219257899689e-06, "loss": 0.0146, "num_input_tokens_seen": 158699280, "step": 73595 }, { "epoch": 13.507065516608552, "grad_norm": 0.00019382365280762315, "learning_rate": 2.881467226824858e-06, "loss": 0.0, "num_input_tokens_seen": 158708368, "step": 73600 }, { "epoch": 13.507983116168104, "grad_norm": 0.0010035745799541473, "learning_rate": 2.880741928989907e-06, "loss": 0.0001, "num_input_tokens_seen": 158719248, "step": 73605 }, { "epoch": 13.508900715727657, "grad_norm": 0.000213602528674528, "learning_rate": 2.880016685510639e-06, "loss": 0.0, "num_input_tokens_seen": 158730352, "step": 73610 }, { "epoch": 13.509818315287209, "grad_norm": 0.0006126342341303825, "learning_rate": 2.87929149640566e-06, "loss": 0.0, "num_input_tokens_seen": 158741616, "step": 73615 }, { "epoch": 13.51073591484676, "grad_norm": 0.0007409100653603673, "learning_rate": 2.878566361693567e-06, "loss": 0.0, "num_input_tokens_seen": 158753232, "step": 73620 }, { "epoch": 13.511653514406314, "grad_norm": 0.00020079953537788242, "learning_rate": 2.877841281392959e-06, "loss": 0.0, "num_input_tokens_seen": 158764560, "step": 73625 }, { "epoch": 13.512571113965866, "grad_norm": 0.00012109029194107279, "learning_rate": 2.877116255522433e-06, "loss": 0.0, "num_input_tokens_seen": 158774640, "step": 73630 }, { "epoch": 13.513488713525417, "grad_norm": 0.017631081864237785, "learning_rate": 2.8763912841005833e-06, "loss": 0.0, "num_input_tokens_seen": 158785232, "step": 73635 }, { "epoch": 13.51440631308497, "grad_norm": 0.001087866141460836, "learning_rate": 2.8756663671460072e-06, "loss": 0.0, "num_input_tokens_seen": 158796112, "step": 73640 }, { "epoch": 13.515323912644522, "grad_norm": 0.00037783783045597374, "learning_rate": 2.8749415046772964e-06, "loss": 0.0, "num_input_tokens_seen": 158806064, "step": 73645 }, { "epoch": 13.516241512204074, "grad_norm": 0.001656028558500111, "learning_rate": 2.874216696713041e-06, "loss": 0.0, "num_input_tokens_seen": 158816464, "step": 73650 }, { "epoch": 13.517159111763627, "grad_norm": 0.00023018232604954392, "learning_rate": 2.8734919432718343e-06, "loss": 0.0, "num_input_tokens_seen": 158827280, "step": 73655 }, { "epoch": 13.518076711323179, "grad_norm": 0.00042346815462224185, "learning_rate": 2.8727672443722642e-06, "loss": 0.0001, "num_input_tokens_seen": 158839184, "step": 73660 }, { "epoch": 13.51899431088273, "grad_norm": 0.017677100375294685, "learning_rate": 2.872042600032915e-06, "loss": 0.0, "num_input_tokens_seen": 158848944, "step": 73665 }, { "epoch": 13.519911910442284, "grad_norm": 0.00012098193110432476, "learning_rate": 2.8713180102723764e-06, "loss": 0.0002, "num_input_tokens_seen": 158859760, "step": 73670 }, { "epoch": 13.520829510001835, "grad_norm": 0.06282507628202438, "learning_rate": 2.8705934751092323e-06, "loss": 0.0, "num_input_tokens_seen": 158869840, "step": 73675 }, { "epoch": 13.521747109561387, "grad_norm": 0.0011908470187336206, "learning_rate": 2.869868994562065e-06, "loss": 0.0, "num_input_tokens_seen": 158881040, "step": 73680 }, { "epoch": 13.52266470912094, "grad_norm": 0.00017541814304422587, "learning_rate": 2.8691445686494545e-06, "loss": 0.0, "num_input_tokens_seen": 158892112, "step": 73685 }, { "epoch": 13.523582308680492, "grad_norm": 0.24235787987709045, "learning_rate": 2.8684201973899856e-06, "loss": 0.0001, "num_input_tokens_seen": 158901904, "step": 73690 }, { "epoch": 13.524499908240044, "grad_norm": 0.0002818233915604651, "learning_rate": 2.8676958808022346e-06, "loss": 0.0001, "num_input_tokens_seen": 158912272, "step": 73695 }, { "epoch": 13.525417507799597, "grad_norm": 0.00013799963926430792, "learning_rate": 2.866971618904778e-06, "loss": 0.0, "num_input_tokens_seen": 158923184, "step": 73700 }, { "epoch": 13.526335107359149, "grad_norm": 42.71271514892578, "learning_rate": 2.8662474117161955e-06, "loss": 0.0051, "num_input_tokens_seen": 158935280, "step": 73705 }, { "epoch": 13.5272527069187, "grad_norm": 0.004381751641631126, "learning_rate": 2.865523259255059e-06, "loss": 0.0, "num_input_tokens_seen": 158945968, "step": 73710 }, { "epoch": 13.528170306478254, "grad_norm": 0.0003687803982757032, "learning_rate": 2.8647991615399436e-06, "loss": 0.0735, "num_input_tokens_seen": 158956528, "step": 73715 }, { "epoch": 13.529087906037805, "grad_norm": 0.0021725057158619165, "learning_rate": 2.8640751185894176e-06, "loss": 0.0, "num_input_tokens_seen": 158968080, "step": 73720 }, { "epoch": 13.530005505597357, "grad_norm": 0.000165327379363589, "learning_rate": 2.8633511304220574e-06, "loss": 0.0001, "num_input_tokens_seen": 158978896, "step": 73725 }, { "epoch": 13.53092310515691, "grad_norm": 0.0004838282475247979, "learning_rate": 2.862627197056429e-06, "loss": 0.0, "num_input_tokens_seen": 158990160, "step": 73730 }, { "epoch": 13.531840704716462, "grad_norm": 0.008256298489868641, "learning_rate": 2.8619033185110976e-06, "loss": 0.0, "num_input_tokens_seen": 159001744, "step": 73735 }, { "epoch": 13.532758304276014, "grad_norm": 0.006698652170598507, "learning_rate": 2.8611794948046357e-06, "loss": 0.0, "num_input_tokens_seen": 159012752, "step": 73740 }, { "epoch": 13.533675903835567, "grad_norm": 0.0005622858298011124, "learning_rate": 2.8604557259556037e-06, "loss": 0.0, "num_input_tokens_seen": 159024176, "step": 73745 }, { "epoch": 13.534593503395119, "grad_norm": 0.00018738367361947894, "learning_rate": 2.8597320119825642e-06, "loss": 0.0, "num_input_tokens_seen": 159034768, "step": 73750 }, { "epoch": 13.53551110295467, "grad_norm": 0.001543321879580617, "learning_rate": 2.8590083529040847e-06, "loss": 0.0, "num_input_tokens_seen": 159045328, "step": 73755 }, { "epoch": 13.536428702514224, "grad_norm": 0.0013468519318848848, "learning_rate": 2.8582847487387224e-06, "loss": 0.0, "num_input_tokens_seen": 159056560, "step": 73760 }, { "epoch": 13.537346302073775, "grad_norm": 0.00020789087284356356, "learning_rate": 2.857561199505036e-06, "loss": 0.0, "num_input_tokens_seen": 159067024, "step": 73765 }, { "epoch": 13.538263901633327, "grad_norm": 0.00031633200705982745, "learning_rate": 2.8568377052215828e-06, "loss": 0.0, "num_input_tokens_seen": 159077712, "step": 73770 }, { "epoch": 13.53918150119288, "grad_norm": 0.023347962647676468, "learning_rate": 2.856114265906923e-06, "loss": 0.0, "num_input_tokens_seen": 159088432, "step": 73775 }, { "epoch": 13.540099100752432, "grad_norm": 0.00029502439429052174, "learning_rate": 2.8553908815796095e-06, "loss": 0.0478, "num_input_tokens_seen": 159100624, "step": 73780 }, { "epoch": 13.541016700311983, "grad_norm": 0.008660533465445042, "learning_rate": 2.8546675522581947e-06, "loss": 0.0, "num_input_tokens_seen": 159110896, "step": 73785 }, { "epoch": 13.541934299871537, "grad_norm": 0.0028687696903944016, "learning_rate": 2.8539442779612332e-06, "loss": 0.2094, "num_input_tokens_seen": 159122096, "step": 73790 }, { "epoch": 13.542851899431088, "grad_norm": 0.0013834185665473342, "learning_rate": 2.853221058707275e-06, "loss": 0.0, "num_input_tokens_seen": 159133200, "step": 73795 }, { "epoch": 13.54376949899064, "grad_norm": 0.00032612396171316504, "learning_rate": 2.8524978945148702e-06, "loss": 0.0, "num_input_tokens_seen": 159144368, "step": 73800 }, { "epoch": 13.544687098550193, "grad_norm": 0.0009096155408769846, "learning_rate": 2.8517747854025633e-06, "loss": 0.0, "num_input_tokens_seen": 159154992, "step": 73805 }, { "epoch": 13.545604698109745, "grad_norm": 0.0016028714599087834, "learning_rate": 2.8510517313889063e-06, "loss": 0.0, "num_input_tokens_seen": 159164912, "step": 73810 }, { "epoch": 13.546522297669297, "grad_norm": 0.0003849141066893935, "learning_rate": 2.8503287324924413e-06, "loss": 0.0, "num_input_tokens_seen": 159175760, "step": 73815 }, { "epoch": 13.54743989722885, "grad_norm": 0.0032154610380530357, "learning_rate": 2.849605788731713e-06, "loss": 0.0, "num_input_tokens_seen": 159185968, "step": 73820 }, { "epoch": 13.548357496788402, "grad_norm": 0.0009770068572834134, "learning_rate": 2.8488829001252632e-06, "loss": 0.0, "num_input_tokens_seen": 159196688, "step": 73825 }, { "epoch": 13.549275096347953, "grad_norm": 0.015646493062376976, "learning_rate": 2.848160066691633e-06, "loss": 0.0, "num_input_tokens_seen": 159206832, "step": 73830 }, { "epoch": 13.550192695907507, "grad_norm": 0.00013600672536995262, "learning_rate": 2.8474372884493605e-06, "loss": 0.0, "num_input_tokens_seen": 159218608, "step": 73835 }, { "epoch": 13.551110295467058, "grad_norm": 0.00022686955344397575, "learning_rate": 2.846714565416987e-06, "loss": 0.0, "num_input_tokens_seen": 159230192, "step": 73840 }, { "epoch": 13.55202789502661, "grad_norm": 0.006607542745769024, "learning_rate": 2.8459918976130474e-06, "loss": 0.0, "num_input_tokens_seen": 159242224, "step": 73845 }, { "epoch": 13.552945494586163, "grad_norm": 1.3046951293945312, "learning_rate": 2.845269285056076e-06, "loss": 0.0016, "num_input_tokens_seen": 159252560, "step": 73850 }, { "epoch": 13.553863094145715, "grad_norm": 0.00048676820006221533, "learning_rate": 2.844546727764609e-06, "loss": 0.0, "num_input_tokens_seen": 159263952, "step": 73855 }, { "epoch": 13.554780693705267, "grad_norm": 0.0021422547288239002, "learning_rate": 2.843824225757178e-06, "loss": 0.0, "num_input_tokens_seen": 159275056, "step": 73860 }, { "epoch": 13.55569829326482, "grad_norm": 0.00023060620878823102, "learning_rate": 2.843101779052314e-06, "loss": 0.0, "num_input_tokens_seen": 159285488, "step": 73865 }, { "epoch": 13.556615892824372, "grad_norm": 0.0011404710821807384, "learning_rate": 2.8423793876685444e-06, "loss": 0.0, "num_input_tokens_seen": 159294800, "step": 73870 }, { "epoch": 13.557533492383923, "grad_norm": 0.0004828579258173704, "learning_rate": 2.8416570516244018e-06, "loss": 0.0, "num_input_tokens_seen": 159305200, "step": 73875 }, { "epoch": 13.558451091943477, "grad_norm": 0.00768750486895442, "learning_rate": 2.8409347709384103e-06, "loss": 0.0, "num_input_tokens_seen": 159316240, "step": 73880 }, { "epoch": 13.559368691503028, "grad_norm": 0.0010728772031143308, "learning_rate": 2.840212545629094e-06, "loss": 0.0, "num_input_tokens_seen": 159325808, "step": 73885 }, { "epoch": 13.56028629106258, "grad_norm": 0.00027357490034773946, "learning_rate": 2.8394903757149805e-06, "loss": 0.0, "num_input_tokens_seen": 159335920, "step": 73890 }, { "epoch": 13.561203890622133, "grad_norm": 0.0018484073225408792, "learning_rate": 2.8387682612145905e-06, "loss": 0.0, "num_input_tokens_seen": 159346672, "step": 73895 }, { "epoch": 13.562121490181685, "grad_norm": 0.00014186277985572815, "learning_rate": 2.838046202146445e-06, "loss": 0.0, "num_input_tokens_seen": 159358800, "step": 73900 }, { "epoch": 13.563039089741237, "grad_norm": 0.0005597265553660691, "learning_rate": 2.8373241985290613e-06, "loss": 0.0, "num_input_tokens_seen": 159369104, "step": 73905 }, { "epoch": 13.56395668930079, "grad_norm": 0.00909017026424408, "learning_rate": 2.836602250380962e-06, "loss": 0.0001, "num_input_tokens_seen": 159380240, "step": 73910 }, { "epoch": 13.564874288860342, "grad_norm": 0.0001628659520065412, "learning_rate": 2.8358803577206624e-06, "loss": 0.0, "num_input_tokens_seen": 159389296, "step": 73915 }, { "epoch": 13.565791888419893, "grad_norm": 0.0018182845087721944, "learning_rate": 2.8351585205666755e-06, "loss": 0.0, "num_input_tokens_seen": 159400848, "step": 73920 }, { "epoch": 13.566709487979447, "grad_norm": 35.98306655883789, "learning_rate": 2.8344367389375193e-06, "loss": 0.0328, "num_input_tokens_seen": 159412528, "step": 73925 }, { "epoch": 13.567627087538998, "grad_norm": 0.000328697613440454, "learning_rate": 2.8337150128517042e-06, "loss": 0.0, "num_input_tokens_seen": 159424688, "step": 73930 }, { "epoch": 13.56854468709855, "grad_norm": 0.006467697210609913, "learning_rate": 2.83299334232774e-06, "loss": 0.0004, "num_input_tokens_seen": 159435920, "step": 73935 }, { "epoch": 13.569462286658103, "grad_norm": 0.00026906185667030513, "learning_rate": 2.832271727384139e-06, "loss": 0.0, "num_input_tokens_seen": 159447088, "step": 73940 }, { "epoch": 13.570379886217655, "grad_norm": 0.003786693559959531, "learning_rate": 2.8315501680394097e-06, "loss": 0.0001, "num_input_tokens_seen": 159457776, "step": 73945 }, { "epoch": 13.571297485777206, "grad_norm": 0.00022804317995905876, "learning_rate": 2.8308286643120574e-06, "loss": 0.0001, "num_input_tokens_seen": 159468848, "step": 73950 }, { "epoch": 13.57221508533676, "grad_norm": 0.003364011412486434, "learning_rate": 2.8301072162205857e-06, "loss": 0.0913, "num_input_tokens_seen": 159480144, "step": 73955 }, { "epoch": 13.573132684896311, "grad_norm": 0.001023861113935709, "learning_rate": 2.8293858237835037e-06, "loss": 0.0, "num_input_tokens_seen": 159491088, "step": 73960 }, { "epoch": 13.574050284455863, "grad_norm": 0.03456631302833557, "learning_rate": 2.8286644870193104e-06, "loss": 0.0001, "num_input_tokens_seen": 159502704, "step": 73965 }, { "epoch": 13.574967884015416, "grad_norm": 0.00041510589653626084, "learning_rate": 2.8279432059465055e-06, "loss": 0.0082, "num_input_tokens_seen": 159514096, "step": 73970 }, { "epoch": 13.575885483574968, "grad_norm": 0.0003028768696822226, "learning_rate": 2.8272219805835933e-06, "loss": 0.147, "num_input_tokens_seen": 159522928, "step": 73975 }, { "epoch": 13.57680308313452, "grad_norm": 0.006885784678161144, "learning_rate": 2.82650081094907e-06, "loss": 0.0, "num_input_tokens_seen": 159533808, "step": 73980 }, { "epoch": 13.577720682694073, "grad_norm": 0.00014062263653613627, "learning_rate": 2.8257796970614303e-06, "loss": 0.0, "num_input_tokens_seen": 159544592, "step": 73985 }, { "epoch": 13.578638282253625, "grad_norm": 0.00029739554156549275, "learning_rate": 2.825058638939173e-06, "loss": 0.175, "num_input_tokens_seen": 159555472, "step": 73990 }, { "epoch": 13.579555881813176, "grad_norm": 0.0005706217489205301, "learning_rate": 2.824337636600792e-06, "loss": 0.0, "num_input_tokens_seen": 159566064, "step": 73995 }, { "epoch": 13.58047348137273, "grad_norm": 0.0005210343515500426, "learning_rate": 2.823616690064778e-06, "loss": 0.0, "num_input_tokens_seen": 159577456, "step": 74000 }, { "epoch": 13.581391080932281, "grad_norm": 0.00019111928122583777, "learning_rate": 2.8228957993496207e-06, "loss": 0.0, "num_input_tokens_seen": 159588304, "step": 74005 }, { "epoch": 13.582308680491833, "grad_norm": 0.015369596891105175, "learning_rate": 2.822174964473814e-06, "loss": 0.0, "num_input_tokens_seen": 159599184, "step": 74010 }, { "epoch": 13.583226280051386, "grad_norm": 0.0011504514841362834, "learning_rate": 2.821454185455844e-06, "loss": 0.147, "num_input_tokens_seen": 159610896, "step": 74015 }, { "epoch": 13.584143879610938, "grad_norm": 0.00019971022265963256, "learning_rate": 2.820733462314198e-06, "loss": 0.0, "num_input_tokens_seen": 159621456, "step": 74020 }, { "epoch": 13.58506147917049, "grad_norm": 0.008856051601469517, "learning_rate": 2.8200127950673608e-06, "loss": 0.0, "num_input_tokens_seen": 159632112, "step": 74025 }, { "epoch": 13.585979078730043, "grad_norm": 0.004187545273452997, "learning_rate": 2.819292183733815e-06, "loss": 0.0006, "num_input_tokens_seen": 159643728, "step": 74030 }, { "epoch": 13.586896678289595, "grad_norm": 0.0005844204570166767, "learning_rate": 2.8185716283320462e-06, "loss": 0.0, "num_input_tokens_seen": 159654512, "step": 74035 }, { "epoch": 13.587814277849146, "grad_norm": 0.0004955296753905714, "learning_rate": 2.8178511288805355e-06, "loss": 0.0, "num_input_tokens_seen": 159666128, "step": 74040 }, { "epoch": 13.5887318774087, "grad_norm": 0.00012862717267125845, "learning_rate": 2.8171306853977602e-06, "loss": 0.1688, "num_input_tokens_seen": 159677552, "step": 74045 }, { "epoch": 13.589649476968251, "grad_norm": 0.0009241526713594794, "learning_rate": 2.8164102979022e-06, "loss": 0.0, "num_input_tokens_seen": 159689040, "step": 74050 }, { "epoch": 13.590567076527803, "grad_norm": 0.00033484608866274357, "learning_rate": 2.8156899664123295e-06, "loss": 0.0, "num_input_tokens_seen": 159699536, "step": 74055 }, { "epoch": 13.591484676087356, "grad_norm": 0.000245788658503443, "learning_rate": 2.8149696909466285e-06, "loss": 0.0, "num_input_tokens_seen": 159711024, "step": 74060 }, { "epoch": 13.592402275646908, "grad_norm": 0.0003300231765024364, "learning_rate": 2.814249471523568e-06, "loss": 0.0, "num_input_tokens_seen": 159723152, "step": 74065 }, { "epoch": 13.59331987520646, "grad_norm": 0.000240561697864905, "learning_rate": 2.813529308161619e-06, "loss": 0.0, "num_input_tokens_seen": 159734256, "step": 74070 }, { "epoch": 13.594237474766013, "grad_norm": 0.0005125095485709608, "learning_rate": 2.812809200879256e-06, "loss": 0.0, "num_input_tokens_seen": 159743952, "step": 74075 }, { "epoch": 13.595155074325564, "grad_norm": 0.0006011732039041817, "learning_rate": 2.812089149694948e-06, "loss": 0.0, "num_input_tokens_seen": 159754736, "step": 74080 }, { "epoch": 13.596072673885116, "grad_norm": 0.11067583411931992, "learning_rate": 2.8113691546271614e-06, "loss": 0.0, "num_input_tokens_seen": 159764976, "step": 74085 }, { "epoch": 13.59699027344467, "grad_norm": 0.00036535237450152636, "learning_rate": 2.810649215694362e-06, "loss": 0.0, "num_input_tokens_seen": 159775120, "step": 74090 }, { "epoch": 13.597907873004221, "grad_norm": 0.00015535119746346027, "learning_rate": 2.80992933291502e-06, "loss": 0.0001, "num_input_tokens_seen": 159785808, "step": 74095 }, { "epoch": 13.598825472563773, "grad_norm": 0.0005492976633831859, "learning_rate": 2.8092095063075955e-06, "loss": 0.0002, "num_input_tokens_seen": 159797744, "step": 74100 }, { "epoch": 13.599743072123326, "grad_norm": 0.00015509448712691665, "learning_rate": 2.8084897358905506e-06, "loss": 0.0, "num_input_tokens_seen": 159808048, "step": 74105 }, { "epoch": 13.600660671682878, "grad_norm": 0.0001894938905024901, "learning_rate": 2.807770021682348e-06, "loss": 0.0, "num_input_tokens_seen": 159818064, "step": 74110 }, { "epoch": 13.60157827124243, "grad_norm": 0.0017346666427329183, "learning_rate": 2.8070503637014477e-06, "loss": 0.0, "num_input_tokens_seen": 159827952, "step": 74115 }, { "epoch": 13.602495870801983, "grad_norm": 0.01188530120998621, "learning_rate": 2.8063307619663047e-06, "loss": 0.0, "num_input_tokens_seen": 159838864, "step": 74120 }, { "epoch": 13.603413470361534, "grad_norm": 0.0002368203568039462, "learning_rate": 2.80561121649538e-06, "loss": 0.0, "num_input_tokens_seen": 159849072, "step": 74125 }, { "epoch": 13.604331069921086, "grad_norm": 0.00022355953115038574, "learning_rate": 2.8048917273071263e-06, "loss": 0.0, "num_input_tokens_seen": 159859472, "step": 74130 }, { "epoch": 13.60524866948064, "grad_norm": 0.000930488167796284, "learning_rate": 2.8041722944199977e-06, "loss": 0.0, "num_input_tokens_seen": 159868560, "step": 74135 }, { "epoch": 13.606166269040191, "grad_norm": 0.009242312051355839, "learning_rate": 2.803452917852445e-06, "loss": 0.0, "num_input_tokens_seen": 159879248, "step": 74140 }, { "epoch": 13.607083868599743, "grad_norm": 0.16533413529396057, "learning_rate": 2.802733597622922e-06, "loss": 0.0, "num_input_tokens_seen": 159890288, "step": 74145 }, { "epoch": 13.608001468159296, "grad_norm": 0.013752171769738197, "learning_rate": 2.802014333749877e-06, "loss": 0.0, "num_input_tokens_seen": 159901488, "step": 74150 }, { "epoch": 13.608919067718848, "grad_norm": 0.016555845737457275, "learning_rate": 2.801295126251755e-06, "loss": 0.0, "num_input_tokens_seen": 159912272, "step": 74155 }, { "epoch": 13.6098366672784, "grad_norm": 0.00019251834601163864, "learning_rate": 2.8005759751470086e-06, "loss": 0.0063, "num_input_tokens_seen": 159924048, "step": 74160 }, { "epoch": 13.610754266837953, "grad_norm": 0.0009258015197701752, "learning_rate": 2.7998568804540786e-06, "loss": 0.0, "num_input_tokens_seen": 159934544, "step": 74165 }, { "epoch": 13.611671866397504, "grad_norm": 0.005196136422455311, "learning_rate": 2.7991378421914107e-06, "loss": 0.0023, "num_input_tokens_seen": 159945104, "step": 74170 }, { "epoch": 13.612589465957056, "grad_norm": 0.05165807157754898, "learning_rate": 2.798418860377443e-06, "loss": 0.019, "num_input_tokens_seen": 159956816, "step": 74175 }, { "epoch": 13.61350706551661, "grad_norm": 0.0013601422542706132, "learning_rate": 2.797699935030622e-06, "loss": 0.0, "num_input_tokens_seen": 159968048, "step": 74180 }, { "epoch": 13.614424665076161, "grad_norm": 0.00012023017916362733, "learning_rate": 2.7969810661693848e-06, "loss": 0.0, "num_input_tokens_seen": 159979056, "step": 74185 }, { "epoch": 13.615342264635713, "grad_norm": 0.0001799306774046272, "learning_rate": 2.7962622538121665e-06, "loss": 0.0, "num_input_tokens_seen": 159990352, "step": 74190 }, { "epoch": 13.616259864195266, "grad_norm": 0.0055801523849368095, "learning_rate": 2.7955434979774077e-06, "loss": 0.0, "num_input_tokens_seen": 160001328, "step": 74195 }, { "epoch": 13.617177463754818, "grad_norm": 0.00013271512580104172, "learning_rate": 2.794824798683542e-06, "loss": 0.0, "num_input_tokens_seen": 160011760, "step": 74200 }, { "epoch": 13.61809506331437, "grad_norm": 0.000399614917114377, "learning_rate": 2.794106155949e-06, "loss": 0.0, "num_input_tokens_seen": 160022576, "step": 74205 }, { "epoch": 13.619012662873923, "grad_norm": 0.0009429337806068361, "learning_rate": 2.7933875697922184e-06, "loss": 0.0, "num_input_tokens_seen": 160033296, "step": 74210 }, { "epoch": 13.619930262433474, "grad_norm": 0.3575546145439148, "learning_rate": 2.7926690402316257e-06, "loss": 0.0002, "num_input_tokens_seen": 160043600, "step": 74215 }, { "epoch": 13.620847861993026, "grad_norm": 0.004129322245717049, "learning_rate": 2.7919505672856518e-06, "loss": 0.0001, "num_input_tokens_seen": 160055216, "step": 74220 }, { "epoch": 13.62176546155258, "grad_norm": 0.004845069721341133, "learning_rate": 2.7912321509727208e-06, "loss": 0.0001, "num_input_tokens_seen": 160066928, "step": 74225 }, { "epoch": 13.62268306111213, "grad_norm": 0.0009297266369685531, "learning_rate": 2.7905137913112647e-06, "loss": 0.0, "num_input_tokens_seen": 160076880, "step": 74230 }, { "epoch": 13.623600660671682, "grad_norm": 0.15558519959449768, "learning_rate": 2.7897954883197042e-06, "loss": 0.0001, "num_input_tokens_seen": 160088272, "step": 74235 }, { "epoch": 13.624518260231236, "grad_norm": 0.007132509723305702, "learning_rate": 2.7890772420164646e-06, "loss": 0.0144, "num_input_tokens_seen": 160099120, "step": 74240 }, { "epoch": 13.625435859790787, "grad_norm": 0.0019920601043850183, "learning_rate": 2.788359052419968e-06, "loss": 0.0, "num_input_tokens_seen": 160109392, "step": 74245 }, { "epoch": 13.626353459350339, "grad_norm": 0.0002381590602453798, "learning_rate": 2.7876409195486305e-06, "loss": 0.0, "num_input_tokens_seen": 160120304, "step": 74250 }, { "epoch": 13.627271058909892, "grad_norm": 0.0002187720383517444, "learning_rate": 2.7869228434208774e-06, "loss": 0.0001, "num_input_tokens_seen": 160131280, "step": 74255 }, { "epoch": 13.628188658469444, "grad_norm": 0.00012911749945487827, "learning_rate": 2.786204824055123e-06, "loss": 0.0, "num_input_tokens_seen": 160141584, "step": 74260 }, { "epoch": 13.629106258028996, "grad_norm": 0.0013775428524240851, "learning_rate": 2.785486861469784e-06, "loss": 0.0, "num_input_tokens_seen": 160151792, "step": 74265 }, { "epoch": 13.630023857588549, "grad_norm": 0.0005986993201076984, "learning_rate": 2.7847689556832745e-06, "loss": 0.1688, "num_input_tokens_seen": 160163248, "step": 74270 }, { "epoch": 13.6309414571481, "grad_norm": 0.0003635157481767237, "learning_rate": 2.7840511067140065e-06, "loss": 0.0, "num_input_tokens_seen": 160173840, "step": 74275 }, { "epoch": 13.631859056707652, "grad_norm": 0.00018235959578305483, "learning_rate": 2.7833333145803946e-06, "loss": 0.0, "num_input_tokens_seen": 160184784, "step": 74280 }, { "epoch": 13.632776656267206, "grad_norm": 0.0015749905724078417, "learning_rate": 2.782615579300848e-06, "loss": 0.0, "num_input_tokens_seen": 160196112, "step": 74285 }, { "epoch": 13.633694255826757, "grad_norm": 0.0023591138888150454, "learning_rate": 2.7818979008937735e-06, "loss": 0.0, "num_input_tokens_seen": 160207408, "step": 74290 }, { "epoch": 13.634611855386309, "grad_norm": 0.003030261257663369, "learning_rate": 2.781180279377582e-06, "loss": 0.0, "num_input_tokens_seen": 160218800, "step": 74295 }, { "epoch": 13.635529454945862, "grad_norm": 0.001214170246385038, "learning_rate": 2.7804627147706775e-06, "loss": 0.0588, "num_input_tokens_seen": 160229264, "step": 74300 }, { "epoch": 13.636447054505414, "grad_norm": 0.0031161238439381123, "learning_rate": 2.7797452070914622e-06, "loss": 0.0, "num_input_tokens_seen": 160239920, "step": 74305 }, { "epoch": 13.637364654064966, "grad_norm": 0.00014182327140588313, "learning_rate": 2.7790277563583427e-06, "loss": 0.0, "num_input_tokens_seen": 160250832, "step": 74310 }, { "epoch": 13.638282253624519, "grad_norm": 0.0001633901847526431, "learning_rate": 2.7783103625897194e-06, "loss": 0.0, "num_input_tokens_seen": 160262384, "step": 74315 }, { "epoch": 13.63919985318407, "grad_norm": 0.00644253846257925, "learning_rate": 2.7775930258039925e-06, "loss": 0.0, "num_input_tokens_seen": 160273424, "step": 74320 }, { "epoch": 13.640117452743622, "grad_norm": 0.000341626291628927, "learning_rate": 2.776875746019558e-06, "loss": 0.0, "num_input_tokens_seen": 160283984, "step": 74325 }, { "epoch": 13.641035052303176, "grad_norm": 0.003508468857035041, "learning_rate": 2.7761585232548165e-06, "loss": 0.0001, "num_input_tokens_seen": 160295216, "step": 74330 }, { "epoch": 13.641952651862727, "grad_norm": 0.0001506085245637223, "learning_rate": 2.7754413575281624e-06, "loss": 0.0, "num_input_tokens_seen": 160307088, "step": 74335 }, { "epoch": 13.642870251422279, "grad_norm": 0.00021209745318628848, "learning_rate": 2.7747242488579882e-06, "loss": 0.0, "num_input_tokens_seen": 160317936, "step": 74340 }, { "epoch": 13.643787850981832, "grad_norm": 0.0006226488039828837, "learning_rate": 2.7740071972626897e-06, "loss": 0.0, "num_input_tokens_seen": 160327440, "step": 74345 }, { "epoch": 13.644705450541384, "grad_norm": 0.00026513240300118923, "learning_rate": 2.7732902027606568e-06, "loss": 0.0, "num_input_tokens_seen": 160337744, "step": 74350 }, { "epoch": 13.645623050100935, "grad_norm": 0.021301481872797012, "learning_rate": 2.7725732653702786e-06, "loss": 0.0, "num_input_tokens_seen": 160348816, "step": 74355 }, { "epoch": 13.646540649660489, "grad_norm": 0.0009505526395514607, "learning_rate": 2.771856385109943e-06, "loss": 0.0, "num_input_tokens_seen": 160359088, "step": 74360 }, { "epoch": 13.64745824922004, "grad_norm": 0.03544970229268074, "learning_rate": 2.7711395619980385e-06, "loss": 0.0001, "num_input_tokens_seen": 160370864, "step": 74365 }, { "epoch": 13.648375848779592, "grad_norm": 0.0001284794125240296, "learning_rate": 2.7704227960529504e-06, "loss": 0.2157, "num_input_tokens_seen": 160382128, "step": 74370 }, { "epoch": 13.649293448339145, "grad_norm": 0.0007600185927003622, "learning_rate": 2.7697060872930608e-06, "loss": 0.0002, "num_input_tokens_seen": 160392784, "step": 74375 }, { "epoch": 13.650211047898697, "grad_norm": 0.003636206965893507, "learning_rate": 2.7689894357367547e-06, "loss": 0.0, "num_input_tokens_seen": 160404240, "step": 74380 }, { "epoch": 13.651128647458249, "grad_norm": 0.00011488800373626873, "learning_rate": 2.7682728414024117e-06, "loss": 0.0, "num_input_tokens_seen": 160414416, "step": 74385 }, { "epoch": 13.652046247017802, "grad_norm": 0.00017831336299423128, "learning_rate": 2.7675563043084096e-06, "loss": 0.0, "num_input_tokens_seen": 160424752, "step": 74390 }, { "epoch": 13.652963846577354, "grad_norm": 0.0030392934568226337, "learning_rate": 2.766839824473131e-06, "loss": 0.0376, "num_input_tokens_seen": 160436048, "step": 74395 }, { "epoch": 13.653881446136905, "grad_norm": 0.0002132465597242117, "learning_rate": 2.766123401914949e-06, "loss": 0.0, "num_input_tokens_seen": 160445712, "step": 74400 }, { "epoch": 13.654799045696459, "grad_norm": 0.0009393370128236711, "learning_rate": 2.7654070366522403e-06, "loss": 0.0, "num_input_tokens_seen": 160456624, "step": 74405 }, { "epoch": 13.65571664525601, "grad_norm": 0.00013252976350486279, "learning_rate": 2.7646907287033747e-06, "loss": 0.225, "num_input_tokens_seen": 160467728, "step": 74410 }, { "epoch": 13.656634244815562, "grad_norm": 0.0004908700357191265, "learning_rate": 2.76397447808673e-06, "loss": 0.0, "num_input_tokens_seen": 160476400, "step": 74415 }, { "epoch": 13.657551844375115, "grad_norm": 0.0002003447007155046, "learning_rate": 2.7632582848206747e-06, "loss": 0.0352, "num_input_tokens_seen": 160486384, "step": 74420 }, { "epoch": 13.658469443934667, "grad_norm": 0.00024257935001514852, "learning_rate": 2.7625421489235753e-06, "loss": 0.0, "num_input_tokens_seen": 160496528, "step": 74425 }, { "epoch": 13.659387043494219, "grad_norm": 0.03432874009013176, "learning_rate": 2.7618260704138043e-06, "loss": 0.0, "num_input_tokens_seen": 160507248, "step": 74430 }, { "epoch": 13.660304643053772, "grad_norm": 0.0005091950297355652, "learning_rate": 2.7611100493097253e-06, "loss": 0.0, "num_input_tokens_seen": 160517840, "step": 74435 }, { "epoch": 13.661222242613324, "grad_norm": 0.0015088346553966403, "learning_rate": 2.760394085629704e-06, "loss": 0.0, "num_input_tokens_seen": 160529936, "step": 74440 }, { "epoch": 13.662139842172875, "grad_norm": 0.0021858259569853544, "learning_rate": 2.759678179392102e-06, "loss": 0.0014, "num_input_tokens_seen": 160542160, "step": 74445 }, { "epoch": 13.663057441732429, "grad_norm": 0.046305883675813675, "learning_rate": 2.7589623306152836e-06, "loss": 0.0974, "num_input_tokens_seen": 160553008, "step": 74450 }, { "epoch": 13.66397504129198, "grad_norm": 0.00012105984933441505, "learning_rate": 2.758246539317608e-06, "loss": 0.0, "num_input_tokens_seen": 160562864, "step": 74455 }, { "epoch": 13.664892640851532, "grad_norm": 0.004917386453598738, "learning_rate": 2.7575308055174348e-06, "loss": 0.0, "num_input_tokens_seen": 160573008, "step": 74460 }, { "epoch": 13.665810240411085, "grad_norm": 0.0021652637515217066, "learning_rate": 2.756815129233121e-06, "loss": 0.0, "num_input_tokens_seen": 160583216, "step": 74465 }, { "epoch": 13.666727839970637, "grad_norm": 0.0002187026693718508, "learning_rate": 2.75609951048302e-06, "loss": 0.0, "num_input_tokens_seen": 160593360, "step": 74470 }, { "epoch": 13.667645439530189, "grad_norm": 0.0006434901733882725, "learning_rate": 2.755383949285491e-06, "loss": 0.0004, "num_input_tokens_seen": 160605008, "step": 74475 }, { "epoch": 13.668563039089742, "grad_norm": 0.03998681157827377, "learning_rate": 2.754668445658885e-06, "loss": 0.0001, "num_input_tokens_seen": 160616240, "step": 74480 }, { "epoch": 13.669480638649294, "grad_norm": 0.000314148172037676, "learning_rate": 2.753952999621553e-06, "loss": 0.0, "num_input_tokens_seen": 160627952, "step": 74485 }, { "epoch": 13.670398238208845, "grad_norm": 0.00015357516531366855, "learning_rate": 2.753237611191846e-06, "loss": 0.0, "num_input_tokens_seen": 160639120, "step": 74490 }, { "epoch": 13.671315837768399, "grad_norm": 0.0036709520500153303, "learning_rate": 2.7525222803881103e-06, "loss": 0.0, "num_input_tokens_seen": 160649232, "step": 74495 }, { "epoch": 13.67223343732795, "grad_norm": 0.0020680155139416456, "learning_rate": 2.751807007228696e-06, "loss": 0.0, "num_input_tokens_seen": 160660880, "step": 74500 }, { "epoch": 13.673151036887502, "grad_norm": 0.00070912268711254, "learning_rate": 2.7510917917319485e-06, "loss": 0.0, "num_input_tokens_seen": 160672144, "step": 74505 }, { "epoch": 13.674068636447055, "grad_norm": 0.00023907718423288316, "learning_rate": 2.7503766339162086e-06, "loss": 0.0, "num_input_tokens_seen": 160683728, "step": 74510 }, { "epoch": 13.674986236006607, "grad_norm": 1.8344366550445557, "learning_rate": 2.7496615337998234e-06, "loss": 0.0008, "num_input_tokens_seen": 160695280, "step": 74515 }, { "epoch": 13.675903835566158, "grad_norm": 1.180642008781433, "learning_rate": 2.748946491401132e-06, "loss": 0.0001, "num_input_tokens_seen": 160705392, "step": 74520 }, { "epoch": 13.676821435125712, "grad_norm": 0.0013725798344239593, "learning_rate": 2.7482315067384725e-06, "loss": 0.0, "num_input_tokens_seen": 160715440, "step": 74525 }, { "epoch": 13.677739034685263, "grad_norm": 0.0036843556445091963, "learning_rate": 2.7475165798301872e-06, "loss": 0.0001, "num_input_tokens_seen": 160727504, "step": 74530 }, { "epoch": 13.678656634244815, "grad_norm": 0.0003685281262733042, "learning_rate": 2.74680171069461e-06, "loss": 0.0, "num_input_tokens_seen": 160739344, "step": 74535 }, { "epoch": 13.679574233804368, "grad_norm": 0.00012786916340701282, "learning_rate": 2.7460868993500773e-06, "loss": 0.0, "num_input_tokens_seen": 160749040, "step": 74540 }, { "epoch": 13.68049183336392, "grad_norm": 0.0002218788577010855, "learning_rate": 2.7453721458149203e-06, "loss": 0.1071, "num_input_tokens_seen": 160759120, "step": 74545 }, { "epoch": 13.681409432923472, "grad_norm": 0.00023627410701010376, "learning_rate": 2.744657450107475e-06, "loss": 0.0, "num_input_tokens_seen": 160769680, "step": 74550 }, { "epoch": 13.682327032483025, "grad_norm": 0.0037342687137424946, "learning_rate": 2.743942812246071e-06, "loss": 0.0025, "num_input_tokens_seen": 160781072, "step": 74555 }, { "epoch": 13.683244632042577, "grad_norm": 0.0003124468494206667, "learning_rate": 2.7432282322490355e-06, "loss": 0.0, "num_input_tokens_seen": 160792912, "step": 74560 }, { "epoch": 13.684162231602128, "grad_norm": 0.0013778312131762505, "learning_rate": 2.7425137101347e-06, "loss": 0.0002, "num_input_tokens_seen": 160803248, "step": 74565 }, { "epoch": 13.685079831161682, "grad_norm": 11.099722862243652, "learning_rate": 2.7417992459213883e-06, "loss": 0.001, "num_input_tokens_seen": 160814480, "step": 74570 }, { "epoch": 13.685997430721233, "grad_norm": 0.0008239605813287199, "learning_rate": 2.741084839627425e-06, "loss": 0.0, "num_input_tokens_seen": 160825200, "step": 74575 }, { "epoch": 13.686915030280785, "grad_norm": 0.000981036340817809, "learning_rate": 2.740370491271136e-06, "loss": 0.0, "num_input_tokens_seen": 160835632, "step": 74580 }, { "epoch": 13.687832629840338, "grad_norm": 0.07828327268362045, "learning_rate": 2.7396562008708423e-06, "loss": 0.0001, "num_input_tokens_seen": 160846832, "step": 74585 }, { "epoch": 13.68875022939989, "grad_norm": 0.022306500002741814, "learning_rate": 2.7389419684448637e-06, "loss": 0.0, "num_input_tokens_seen": 160858512, "step": 74590 }, { "epoch": 13.689667828959442, "grad_norm": 141.12725830078125, "learning_rate": 2.7382277940115172e-06, "loss": 0.1938, "num_input_tokens_seen": 160869744, "step": 74595 }, { "epoch": 13.690585428518995, "grad_norm": 0.5220515727996826, "learning_rate": 2.737513677589124e-06, "loss": 0.0004, "num_input_tokens_seen": 160880496, "step": 74600 }, { "epoch": 13.691503028078547, "grad_norm": 0.02489219419658184, "learning_rate": 2.736799619195999e-06, "loss": 0.0, "num_input_tokens_seen": 160890192, "step": 74605 }, { "epoch": 13.692420627638098, "grad_norm": 0.006353751756250858, "learning_rate": 2.7360856188504538e-06, "loss": 0.0, "num_input_tokens_seen": 160901456, "step": 74610 }, { "epoch": 13.693338227197652, "grad_norm": 0.00478421151638031, "learning_rate": 2.735371676570806e-06, "loss": 0.0063, "num_input_tokens_seen": 160913040, "step": 74615 }, { "epoch": 13.694255826757203, "grad_norm": 0.021145354956388474, "learning_rate": 2.7346577923753644e-06, "loss": 0.0, "num_input_tokens_seen": 160924976, "step": 74620 }, { "epoch": 13.695173426316755, "grad_norm": 0.020916953682899475, "learning_rate": 2.7339439662824396e-06, "loss": 0.0001, "num_input_tokens_seen": 160935056, "step": 74625 }, { "epoch": 13.696091025876308, "grad_norm": 0.029649192467331886, "learning_rate": 2.733230198310338e-06, "loss": 0.0, "num_input_tokens_seen": 160945584, "step": 74630 }, { "epoch": 13.69700862543586, "grad_norm": 0.21177715063095093, "learning_rate": 2.732516488477371e-06, "loss": 0.0002, "num_input_tokens_seen": 160955824, "step": 74635 }, { "epoch": 13.697926224995411, "grad_norm": 0.00022692147467751056, "learning_rate": 2.731802836801841e-06, "loss": 0.0, "num_input_tokens_seen": 160964304, "step": 74640 }, { "epoch": 13.698843824554965, "grad_norm": 28.067121505737305, "learning_rate": 2.731089243302052e-06, "loss": 0.0808, "num_input_tokens_seen": 160974992, "step": 74645 }, { "epoch": 13.699761424114516, "grad_norm": 0.0012134264688938856, "learning_rate": 2.7303757079963083e-06, "loss": 0.0, "num_input_tokens_seen": 160986128, "step": 74650 }, { "epoch": 13.700679023674068, "grad_norm": 0.002478515962138772, "learning_rate": 2.7296622309029107e-06, "loss": 0.0, "num_input_tokens_seen": 160996848, "step": 74655 }, { "epoch": 13.701596623233621, "grad_norm": 0.0065271565690636635, "learning_rate": 2.728948812040158e-06, "loss": 0.0, "num_input_tokens_seen": 161009584, "step": 74660 }, { "epoch": 13.702514222793173, "grad_norm": 0.0008102373103611171, "learning_rate": 2.7282354514263464e-06, "loss": 0.0, "num_input_tokens_seen": 161019568, "step": 74665 }, { "epoch": 13.703431822352725, "grad_norm": 0.07110866904258728, "learning_rate": 2.7275221490797764e-06, "loss": 0.0001, "num_input_tokens_seen": 161030160, "step": 74670 }, { "epoch": 13.704349421912278, "grad_norm": 0.0031565192621201277, "learning_rate": 2.7268089050187418e-06, "loss": 0.0, "num_input_tokens_seen": 161040496, "step": 74675 }, { "epoch": 13.70526702147183, "grad_norm": 0.06474661082029343, "learning_rate": 2.7260957192615357e-06, "loss": 0.002, "num_input_tokens_seen": 161050704, "step": 74680 }, { "epoch": 13.706184621031381, "grad_norm": 0.03565619885921478, "learning_rate": 2.72538259182645e-06, "loss": 0.0, "num_input_tokens_seen": 161062384, "step": 74685 }, { "epoch": 13.707102220590935, "grad_norm": 0.0002488793106749654, "learning_rate": 2.724669522731773e-06, "loss": 0.1657, "num_input_tokens_seen": 161072624, "step": 74690 }, { "epoch": 13.708019820150486, "grad_norm": 0.0002445442951284349, "learning_rate": 2.723956511995799e-06, "loss": 0.0002, "num_input_tokens_seen": 161085488, "step": 74695 }, { "epoch": 13.708937419710038, "grad_norm": 0.0011815628968179226, "learning_rate": 2.7232435596368123e-06, "loss": 0.0, "num_input_tokens_seen": 161096240, "step": 74700 }, { "epoch": 13.709855019269591, "grad_norm": 0.0005115693202242255, "learning_rate": 2.7225306656730998e-06, "loss": 0.0, "num_input_tokens_seen": 161106864, "step": 74705 }, { "epoch": 13.710772618829143, "grad_norm": 0.00032407775870524347, "learning_rate": 2.7218178301229435e-06, "loss": 0.0, "num_input_tokens_seen": 161117328, "step": 74710 }, { "epoch": 13.711690218388695, "grad_norm": 0.029977913945913315, "learning_rate": 2.7211050530046325e-06, "loss": 0.0002, "num_input_tokens_seen": 161128624, "step": 74715 }, { "epoch": 13.712607817948248, "grad_norm": 0.26541629433631897, "learning_rate": 2.7203923343364434e-06, "loss": 0.0002, "num_input_tokens_seen": 161137264, "step": 74720 }, { "epoch": 13.7135254175078, "grad_norm": 0.00033117615384981036, "learning_rate": 2.7196796741366583e-06, "loss": 0.0, "num_input_tokens_seen": 161149072, "step": 74725 }, { "epoch": 13.714443017067351, "grad_norm": 0.001298984745517373, "learning_rate": 2.718967072423554e-06, "loss": 0.0036, "num_input_tokens_seen": 161161072, "step": 74730 }, { "epoch": 13.715360616626905, "grad_norm": 0.00047295488184317946, "learning_rate": 2.7182545292154106e-06, "loss": 0.0, "num_input_tokens_seen": 161170512, "step": 74735 }, { "epoch": 13.716278216186456, "grad_norm": 0.00274993060156703, "learning_rate": 2.7175420445305017e-06, "loss": 0.0, "num_input_tokens_seen": 161182416, "step": 74740 }, { "epoch": 13.717195815746008, "grad_norm": 0.00045620990567840636, "learning_rate": 2.7168296183871e-06, "loss": 0.0, "num_input_tokens_seen": 161192784, "step": 74745 }, { "epoch": 13.718113415305561, "grad_norm": 0.00015485098992940038, "learning_rate": 2.7161172508034826e-06, "loss": 0.0, "num_input_tokens_seen": 161203632, "step": 74750 }, { "epoch": 13.719031014865113, "grad_norm": 0.0018884983146563172, "learning_rate": 2.7154049417979176e-06, "loss": 0.0003, "num_input_tokens_seen": 161213264, "step": 74755 }, { "epoch": 13.719948614424665, "grad_norm": 0.25950828194618225, "learning_rate": 2.714692691388673e-06, "loss": 0.0002, "num_input_tokens_seen": 161224400, "step": 74760 }, { "epoch": 13.720866213984218, "grad_norm": 0.00030631080153398216, "learning_rate": 2.713980499594021e-06, "loss": 0.0, "num_input_tokens_seen": 161234192, "step": 74765 }, { "epoch": 13.72178381354377, "grad_norm": 0.0002117586409440264, "learning_rate": 2.7132683664322262e-06, "loss": 0.0, "num_input_tokens_seen": 161245968, "step": 74770 }, { "epoch": 13.722701413103321, "grad_norm": 0.010198964737355709, "learning_rate": 2.7125562919215537e-06, "loss": 0.0, "num_input_tokens_seen": 161255728, "step": 74775 }, { "epoch": 13.723619012662875, "grad_norm": 0.0003732717887032777, "learning_rate": 2.7118442760802654e-06, "loss": 0.0, "num_input_tokens_seen": 161268240, "step": 74780 }, { "epoch": 13.724536612222426, "grad_norm": 0.0011248700320720673, "learning_rate": 2.711132318926627e-06, "loss": 0.0, "num_input_tokens_seen": 161279792, "step": 74785 }, { "epoch": 13.725454211781978, "grad_norm": 0.0002679823955986649, "learning_rate": 2.710420420478896e-06, "loss": 0.0, "num_input_tokens_seen": 161291568, "step": 74790 }, { "epoch": 13.726371811341531, "grad_norm": 0.0010193781927227974, "learning_rate": 2.7097085807553326e-06, "loss": 0.0, "num_input_tokens_seen": 161302960, "step": 74795 }, { "epoch": 13.727289410901083, "grad_norm": 0.0007824130007065833, "learning_rate": 2.708996799774195e-06, "loss": 0.0, "num_input_tokens_seen": 161313968, "step": 74800 }, { "epoch": 13.728207010460634, "grad_norm": 0.18240399658679962, "learning_rate": 2.7082850775537397e-06, "loss": 0.0003, "num_input_tokens_seen": 161324464, "step": 74805 }, { "epoch": 13.729124610020188, "grad_norm": 0.0003549364337231964, "learning_rate": 2.70757341411222e-06, "loss": 0.0, "num_input_tokens_seen": 161334448, "step": 74810 }, { "epoch": 13.73004220957974, "grad_norm": 0.00016357483400497586, "learning_rate": 2.7068618094678867e-06, "loss": 0.0, "num_input_tokens_seen": 161346000, "step": 74815 }, { "epoch": 13.730959809139291, "grad_norm": 0.00016215647337958217, "learning_rate": 2.7061502636389967e-06, "loss": 0.0, "num_input_tokens_seen": 161357264, "step": 74820 }, { "epoch": 13.731877408698844, "grad_norm": 0.017560258507728577, "learning_rate": 2.705438776643797e-06, "loss": 0.0, "num_input_tokens_seen": 161368976, "step": 74825 }, { "epoch": 13.732795008258396, "grad_norm": 0.00019056849123444408, "learning_rate": 2.7047273485005344e-06, "loss": 0.0, "num_input_tokens_seen": 161379728, "step": 74830 }, { "epoch": 13.733712607817948, "grad_norm": 0.0005775768659077585, "learning_rate": 2.70401597922746e-06, "loss": 0.0, "num_input_tokens_seen": 161390384, "step": 74835 }, { "epoch": 13.734630207377501, "grad_norm": 0.7759271860122681, "learning_rate": 2.7033046688428177e-06, "loss": 0.0002, "num_input_tokens_seen": 161402320, "step": 74840 }, { "epoch": 13.735547806937053, "grad_norm": 0.00033541303128004074, "learning_rate": 2.7025934173648488e-06, "loss": 0.0005, "num_input_tokens_seen": 161412560, "step": 74845 }, { "epoch": 13.736465406496604, "grad_norm": 0.0024784463457763195, "learning_rate": 2.7018822248118e-06, "loss": 0.0, "num_input_tokens_seen": 161424272, "step": 74850 }, { "epoch": 13.737383006056158, "grad_norm": 0.00023599642736371607, "learning_rate": 2.7011710912019106e-06, "loss": 0.0, "num_input_tokens_seen": 161435792, "step": 74855 }, { "epoch": 13.73830060561571, "grad_norm": 0.0011854125186800957, "learning_rate": 2.7004600165534188e-06, "loss": 0.0, "num_input_tokens_seen": 161446992, "step": 74860 }, { "epoch": 13.739218205175261, "grad_norm": 0.00012079121370334178, "learning_rate": 2.699749000884563e-06, "loss": 0.0, "num_input_tokens_seen": 161456240, "step": 74865 }, { "epoch": 13.740135804734814, "grad_norm": 0.006391852628439665, "learning_rate": 2.6990380442135817e-06, "loss": 0.0, "num_input_tokens_seen": 161467696, "step": 74870 }, { "epoch": 13.741053404294366, "grad_norm": 0.0002618670114316046, "learning_rate": 2.698327146558708e-06, "loss": 0.0003, "num_input_tokens_seen": 161479088, "step": 74875 }, { "epoch": 13.741971003853918, "grad_norm": 0.00018396372615825385, "learning_rate": 2.697616307938177e-06, "loss": 0.0, "num_input_tokens_seen": 161489648, "step": 74880 }, { "epoch": 13.742888603413471, "grad_norm": 0.00025468168314546347, "learning_rate": 2.696905528370216e-06, "loss": 0.0, "num_input_tokens_seen": 161501072, "step": 74885 }, { "epoch": 13.743806202973023, "grad_norm": 0.00013758910063188523, "learning_rate": 2.6961948078730614e-06, "loss": 0.0, "num_input_tokens_seen": 161512560, "step": 74890 }, { "epoch": 13.744723802532574, "grad_norm": 0.0002172715903725475, "learning_rate": 2.695484146464939e-06, "loss": 0.0001, "num_input_tokens_seen": 161522480, "step": 74895 }, { "epoch": 13.745641402092128, "grad_norm": 0.0001509862777311355, "learning_rate": 2.6947735441640764e-06, "loss": 0.0, "num_input_tokens_seen": 161533264, "step": 74900 }, { "epoch": 13.74655900165168, "grad_norm": 0.00020263450278434902, "learning_rate": 2.6940630009887003e-06, "loss": 0.0001, "num_input_tokens_seen": 161544912, "step": 74905 }, { "epoch": 13.74747660121123, "grad_norm": 0.0001519692741567269, "learning_rate": 2.693352516957034e-06, "loss": 0.0284, "num_input_tokens_seen": 161554544, "step": 74910 }, { "epoch": 13.748394200770784, "grad_norm": 0.00022175903723109514, "learning_rate": 2.6926420920872987e-06, "loss": 0.0, "num_input_tokens_seen": 161566672, "step": 74915 }, { "epoch": 13.749311800330336, "grad_norm": 0.0033236180897802114, "learning_rate": 2.6919317263977198e-06, "loss": 0.0, "num_input_tokens_seen": 161577136, "step": 74920 }, { "epoch": 13.750229399889887, "grad_norm": 0.0006394777446985245, "learning_rate": 2.6912214199065146e-06, "loss": 0.0, "num_input_tokens_seen": 161588048, "step": 74925 }, { "epoch": 13.75114699944944, "grad_norm": 0.00027828323072753847, "learning_rate": 2.6905111726319e-06, "loss": 0.0, "num_input_tokens_seen": 161599120, "step": 74930 }, { "epoch": 13.752064599008992, "grad_norm": 0.0002253521088277921, "learning_rate": 2.6898009845920958e-06, "loss": 0.0001, "num_input_tokens_seen": 161609328, "step": 74935 }, { "epoch": 13.752982198568544, "grad_norm": 0.00021457749244291335, "learning_rate": 2.6890908558053163e-06, "loss": 0.0, "num_input_tokens_seen": 161621136, "step": 74940 }, { "epoch": 13.753899798128097, "grad_norm": 0.001504827756434679, "learning_rate": 2.688380786289775e-06, "loss": 0.0, "num_input_tokens_seen": 161632496, "step": 74945 }, { "epoch": 13.754817397687649, "grad_norm": 0.0008979163249023259, "learning_rate": 2.687670776063682e-06, "loss": 0.0001, "num_input_tokens_seen": 161644272, "step": 74950 }, { "epoch": 13.7557349972472, "grad_norm": 0.00018570625979918987, "learning_rate": 2.6869608251452517e-06, "loss": 0.0, "num_input_tokens_seen": 161655632, "step": 74955 }, { "epoch": 13.756652596806754, "grad_norm": 0.0006736476207152009, "learning_rate": 2.686250933552691e-06, "loss": 0.0, "num_input_tokens_seen": 161666608, "step": 74960 }, { "epoch": 13.757570196366306, "grad_norm": 0.00014650891534984112, "learning_rate": 2.6855411013042054e-06, "loss": 0.0032, "num_input_tokens_seen": 161678384, "step": 74965 }, { "epoch": 13.758487795925857, "grad_norm": 0.01397217158228159, "learning_rate": 2.684831328418006e-06, "loss": 0.0, "num_input_tokens_seen": 161687024, "step": 74970 }, { "epoch": 13.75940539548541, "grad_norm": 3.0280473232269287, "learning_rate": 2.6841216149122953e-06, "loss": 0.0005, "num_input_tokens_seen": 161698192, "step": 74975 }, { "epoch": 13.760322995044962, "grad_norm": 0.0002571178483776748, "learning_rate": 2.683411960805273e-06, "loss": 0.0, "num_input_tokens_seen": 161709776, "step": 74980 }, { "epoch": 13.761240594604514, "grad_norm": 0.00021687240223400295, "learning_rate": 2.682702366115146e-06, "loss": 0.0, "num_input_tokens_seen": 161720624, "step": 74985 }, { "epoch": 13.762158194164067, "grad_norm": 0.0002580390719231218, "learning_rate": 2.6819928308601123e-06, "loss": 0.0, "num_input_tokens_seen": 161730288, "step": 74990 }, { "epoch": 13.763075793723619, "grad_norm": 0.000464963901322335, "learning_rate": 2.6812833550583694e-06, "loss": 0.0, "num_input_tokens_seen": 161741968, "step": 74995 }, { "epoch": 13.76399339328317, "grad_norm": 0.017555177211761475, "learning_rate": 2.680573938728113e-06, "loss": 0.0, "num_input_tokens_seen": 161752592, "step": 75000 }, { "epoch": 13.764910992842724, "grad_norm": 0.0004326548660174012, "learning_rate": 2.6798645818875424e-06, "loss": 0.0, "num_input_tokens_seen": 161762608, "step": 75005 }, { "epoch": 13.765828592402276, "grad_norm": 0.00026843996602110565, "learning_rate": 2.6791552845548486e-06, "loss": 0.0, "num_input_tokens_seen": 161772976, "step": 75010 }, { "epoch": 13.766746191961827, "grad_norm": 0.001973339356482029, "learning_rate": 2.678446046748223e-06, "loss": 0.0, "num_input_tokens_seen": 161785488, "step": 75015 }, { "epoch": 13.76766379152138, "grad_norm": 0.00038168870378285646, "learning_rate": 2.6777368684858608e-06, "loss": 0.0, "num_input_tokens_seen": 161796688, "step": 75020 }, { "epoch": 13.768581391080932, "grad_norm": 78.59539031982422, "learning_rate": 2.677027749785949e-06, "loss": 0.2688, "num_input_tokens_seen": 161807760, "step": 75025 }, { "epoch": 13.769498990640484, "grad_norm": 0.00026651815278455615, "learning_rate": 2.676318690666672e-06, "loss": 0.0, "num_input_tokens_seen": 161816656, "step": 75030 }, { "epoch": 13.770416590200037, "grad_norm": 0.0002333016018383205, "learning_rate": 2.6756096911462216e-06, "loss": 0.0, "num_input_tokens_seen": 161828016, "step": 75035 }, { "epoch": 13.771334189759589, "grad_norm": 0.0011445690179243684, "learning_rate": 2.6749007512427807e-06, "loss": 0.0, "num_input_tokens_seen": 161838960, "step": 75040 }, { "epoch": 13.77225178931914, "grad_norm": 0.0001357407309114933, "learning_rate": 2.6741918709745314e-06, "loss": 0.0, "num_input_tokens_seen": 161848208, "step": 75045 }, { "epoch": 13.773169388878694, "grad_norm": 0.006486441008746624, "learning_rate": 2.6734830503596545e-06, "loss": 0.0, "num_input_tokens_seen": 161859344, "step": 75050 }, { "epoch": 13.774086988438246, "grad_norm": 0.00014294745051302016, "learning_rate": 2.6727742894163326e-06, "loss": 0.0, "num_input_tokens_seen": 161870576, "step": 75055 }, { "epoch": 13.775004587997797, "grad_norm": 0.042393047362565994, "learning_rate": 2.6720655881627437e-06, "loss": 0.0, "num_input_tokens_seen": 161880784, "step": 75060 }, { "epoch": 13.77592218755735, "grad_norm": 0.0013393474509939551, "learning_rate": 2.671356946617063e-06, "loss": 0.0, "num_input_tokens_seen": 161891952, "step": 75065 }, { "epoch": 13.776839787116902, "grad_norm": 0.00965321809053421, "learning_rate": 2.6706483647974692e-06, "loss": 0.0, "num_input_tokens_seen": 161904496, "step": 75070 }, { "epoch": 13.777757386676454, "grad_norm": 0.00012969599629286677, "learning_rate": 2.6699398427221345e-06, "loss": 0.0, "num_input_tokens_seen": 161915952, "step": 75075 }, { "epoch": 13.778674986236007, "grad_norm": 0.0006111705442890525, "learning_rate": 2.6692313804092297e-06, "loss": 0.0, "num_input_tokens_seen": 161925616, "step": 75080 }, { "epoch": 13.779592585795559, "grad_norm": 0.00031963677611202, "learning_rate": 2.6685229778769296e-06, "loss": 0.0, "num_input_tokens_seen": 161936944, "step": 75085 }, { "epoch": 13.78051018535511, "grad_norm": 0.0002162879245588556, "learning_rate": 2.667814635143402e-06, "loss": 0.0, "num_input_tokens_seen": 161947632, "step": 75090 }, { "epoch": 13.781427784914664, "grad_norm": 0.015989111736416817, "learning_rate": 2.6671063522268143e-06, "loss": 0.0, "num_input_tokens_seen": 161958352, "step": 75095 }, { "epoch": 13.782345384474215, "grad_norm": 0.00019696402887348086, "learning_rate": 2.666398129145333e-06, "loss": 0.0, "num_input_tokens_seen": 161970992, "step": 75100 }, { "epoch": 13.783262984033767, "grad_norm": 0.00012874614913016558, "learning_rate": 2.6656899659171225e-06, "loss": 0.0, "num_input_tokens_seen": 161982960, "step": 75105 }, { "epoch": 13.78418058359332, "grad_norm": 0.000503328803461045, "learning_rate": 2.6649818625603453e-06, "loss": 0.0, "num_input_tokens_seen": 161994768, "step": 75110 }, { "epoch": 13.785098183152872, "grad_norm": 0.0019230323377996683, "learning_rate": 2.6642738190931656e-06, "loss": 0.0, "num_input_tokens_seen": 162006480, "step": 75115 }, { "epoch": 13.786015782712424, "grad_norm": 0.0008687510271556675, "learning_rate": 2.663565835533742e-06, "loss": 0.0, "num_input_tokens_seen": 162017712, "step": 75120 }, { "epoch": 13.786933382271977, "grad_norm": 0.00011764749797293916, "learning_rate": 2.662857911900235e-06, "loss": 0.0, "num_input_tokens_seen": 162027376, "step": 75125 }, { "epoch": 13.787850981831529, "grad_norm": 0.00012635976599995047, "learning_rate": 2.6621500482108e-06, "loss": 0.0, "num_input_tokens_seen": 162037744, "step": 75130 }, { "epoch": 13.78876858139108, "grad_norm": 0.0008467682637274265, "learning_rate": 2.6614422444835897e-06, "loss": 0.0, "num_input_tokens_seen": 162048112, "step": 75135 }, { "epoch": 13.789686180950634, "grad_norm": 0.0034196048509329557, "learning_rate": 2.6607345007367645e-06, "loss": 0.1813, "num_input_tokens_seen": 162057872, "step": 75140 }, { "epoch": 13.790603780510185, "grad_norm": 0.0005374309839680791, "learning_rate": 2.6600268169884737e-06, "loss": 0.0, "num_input_tokens_seen": 162069328, "step": 75145 }, { "epoch": 13.791521380069737, "grad_norm": 0.00010643384302966297, "learning_rate": 2.6593191932568663e-06, "loss": 0.0, "num_input_tokens_seen": 162080560, "step": 75150 }, { "epoch": 13.79243897962929, "grad_norm": 0.0021459099370986223, "learning_rate": 2.6586116295600963e-06, "loss": 0.0, "num_input_tokens_seen": 162088880, "step": 75155 }, { "epoch": 13.793356579188842, "grad_norm": 0.00018790508329402655, "learning_rate": 2.657904125916308e-06, "loss": 0.0, "num_input_tokens_seen": 162099504, "step": 75160 }, { "epoch": 13.794274178748394, "grad_norm": 9.76803494268097e-05, "learning_rate": 2.657196682343648e-06, "loss": 0.0, "num_input_tokens_seen": 162109968, "step": 75165 }, { "epoch": 13.795191778307947, "grad_norm": 0.09087912738323212, "learning_rate": 2.6564892988602634e-06, "loss": 0.0, "num_input_tokens_seen": 162120240, "step": 75170 }, { "epoch": 13.796109377867499, "grad_norm": 0.0002477386442478746, "learning_rate": 2.6557819754842966e-06, "loss": 0.0, "num_input_tokens_seen": 162130160, "step": 75175 }, { "epoch": 13.79702697742705, "grad_norm": 0.00025270605692639947, "learning_rate": 2.6550747122338886e-06, "loss": 0.1283, "num_input_tokens_seen": 162141360, "step": 75180 }, { "epoch": 13.797944576986604, "grad_norm": 0.00010849031241377816, "learning_rate": 2.654367509127178e-06, "loss": 0.0, "num_input_tokens_seen": 162149488, "step": 75185 }, { "epoch": 13.798862176546155, "grad_norm": 0.0027077298145741224, "learning_rate": 2.653660366182308e-06, "loss": 0.1345, "num_input_tokens_seen": 162160656, "step": 75190 }, { "epoch": 13.799779776105707, "grad_norm": 0.006418003235012293, "learning_rate": 2.6529532834174126e-06, "loss": 0.0, "num_input_tokens_seen": 162171248, "step": 75195 }, { "epoch": 13.80069737566526, "grad_norm": 0.00046197554911486804, "learning_rate": 2.652246260850626e-06, "loss": 0.0, "num_input_tokens_seen": 162183760, "step": 75200 }, { "epoch": 13.801614975224812, "grad_norm": 0.0019343990134075284, "learning_rate": 2.651539298500086e-06, "loss": 0.1098, "num_input_tokens_seen": 162194992, "step": 75205 }, { "epoch": 13.802532574784363, "grad_norm": 0.00018680236826185137, "learning_rate": 2.6508323963839235e-06, "loss": 0.0, "num_input_tokens_seen": 162204272, "step": 75210 }, { "epoch": 13.803450174343917, "grad_norm": 0.00045485704322345555, "learning_rate": 2.6501255545202663e-06, "loss": 0.0, "num_input_tokens_seen": 162214736, "step": 75215 }, { "epoch": 13.804367773903468, "grad_norm": 0.00020689930533990264, "learning_rate": 2.64941877292725e-06, "loss": 0.0, "num_input_tokens_seen": 162226672, "step": 75220 }, { "epoch": 13.80528537346302, "grad_norm": 0.4387584924697876, "learning_rate": 2.648712051622998e-06, "loss": 0.1657, "num_input_tokens_seen": 162238384, "step": 75225 }, { "epoch": 13.806202973022573, "grad_norm": 7.063715747790411e-05, "learning_rate": 2.648005390625638e-06, "loss": 0.0, "num_input_tokens_seen": 162248848, "step": 75230 }, { "epoch": 13.807120572582125, "grad_norm": 0.00011325405648676679, "learning_rate": 2.647298789953293e-06, "loss": 0.0, "num_input_tokens_seen": 162259632, "step": 75235 }, { "epoch": 13.808038172141677, "grad_norm": 0.00018193248251918703, "learning_rate": 2.646592249624089e-06, "loss": 0.0, "num_input_tokens_seen": 162270864, "step": 75240 }, { "epoch": 13.80895577170123, "grad_norm": 0.0001776413555489853, "learning_rate": 2.6458857696561468e-06, "loss": 0.0, "num_input_tokens_seen": 162282256, "step": 75245 }, { "epoch": 13.809873371260782, "grad_norm": 0.00019584795518312603, "learning_rate": 2.645179350067584e-06, "loss": 0.0, "num_input_tokens_seen": 162291600, "step": 75250 }, { "epoch": 13.810790970820333, "grad_norm": 9.465440234635025e-05, "learning_rate": 2.6444729908765227e-06, "loss": 0.0, "num_input_tokens_seen": 162302032, "step": 75255 }, { "epoch": 13.811708570379887, "grad_norm": 0.002090906258672476, "learning_rate": 2.6437666921010784e-06, "loss": 0.0, "num_input_tokens_seen": 162313136, "step": 75260 }, { "epoch": 13.812626169939438, "grad_norm": 0.0004183159035164863, "learning_rate": 2.6430604537593673e-06, "loss": 0.375, "num_input_tokens_seen": 162323184, "step": 75265 }, { "epoch": 13.81354376949899, "grad_norm": 0.00043547654058784246, "learning_rate": 2.642354275869501e-06, "loss": 0.0, "num_input_tokens_seen": 162333840, "step": 75270 }, { "epoch": 13.814461369058543, "grad_norm": 0.00014305958757176995, "learning_rate": 2.6416481584495947e-06, "loss": 0.0, "num_input_tokens_seen": 162343952, "step": 75275 }, { "epoch": 13.815378968618095, "grad_norm": 0.00040830476791597903, "learning_rate": 2.6409421015177583e-06, "loss": 0.0, "num_input_tokens_seen": 162356976, "step": 75280 }, { "epoch": 13.816296568177647, "grad_norm": 0.0002675672876648605, "learning_rate": 2.640236105092099e-06, "loss": 0.0, "num_input_tokens_seen": 162368304, "step": 75285 }, { "epoch": 13.8172141677372, "grad_norm": 0.0014627197524532676, "learning_rate": 2.639530169190727e-06, "loss": 0.0, "num_input_tokens_seen": 162378640, "step": 75290 }, { "epoch": 13.818131767296752, "grad_norm": 0.00015489694487769157, "learning_rate": 2.6388242938317486e-06, "loss": 0.0, "num_input_tokens_seen": 162388944, "step": 75295 }, { "epoch": 13.819049366856303, "grad_norm": 0.005554951261729002, "learning_rate": 2.638118479033268e-06, "loss": 0.0, "num_input_tokens_seen": 162399952, "step": 75300 }, { "epoch": 13.819966966415857, "grad_norm": 0.0010773962130770087, "learning_rate": 2.6374127248133858e-06, "loss": 0.0, "num_input_tokens_seen": 162411376, "step": 75305 }, { "epoch": 13.820884565975408, "grad_norm": 0.00018166967493016273, "learning_rate": 2.6367070311902075e-06, "loss": 0.1625, "num_input_tokens_seen": 162421456, "step": 75310 }, { "epoch": 13.82180216553496, "grad_norm": 0.0019496199674904346, "learning_rate": 2.636001398181831e-06, "loss": 0.0032, "num_input_tokens_seen": 162431984, "step": 75315 }, { "epoch": 13.822719765094513, "grad_norm": 0.0024073247332125902, "learning_rate": 2.635295825806354e-06, "loss": 0.0, "num_input_tokens_seen": 162442192, "step": 75320 }, { "epoch": 13.823637364654065, "grad_norm": 0.00013730548380408436, "learning_rate": 2.634590314081875e-06, "loss": 0.1813, "num_input_tokens_seen": 162452592, "step": 75325 }, { "epoch": 13.824554964213617, "grad_norm": 0.00210692104883492, "learning_rate": 2.6338848630264864e-06, "loss": 0.2594, "num_input_tokens_seen": 162462416, "step": 75330 }, { "epoch": 13.82547256377317, "grad_norm": 0.00013611078611575067, "learning_rate": 2.6331794726582853e-06, "loss": 0.0, "num_input_tokens_seen": 162473456, "step": 75335 }, { "epoch": 13.826390163332722, "grad_norm": 0.00015656070900149643, "learning_rate": 2.6324741429953626e-06, "loss": 0.0008, "num_input_tokens_seen": 162484656, "step": 75340 }, { "epoch": 13.827307762892273, "grad_norm": 0.00034922180930152535, "learning_rate": 2.6317688740558096e-06, "loss": 0.0244, "num_input_tokens_seen": 162496176, "step": 75345 }, { "epoch": 13.828225362451827, "grad_norm": 0.00016970376600511372, "learning_rate": 2.6310636658577114e-06, "loss": 0.0, "num_input_tokens_seen": 162506448, "step": 75350 }, { "epoch": 13.829142962011378, "grad_norm": 0.0001402337074978277, "learning_rate": 2.6303585184191614e-06, "loss": 0.0, "num_input_tokens_seen": 162517584, "step": 75355 }, { "epoch": 13.83006056157093, "grad_norm": 0.00014469129382632673, "learning_rate": 2.629653431758243e-06, "loss": 0.1103, "num_input_tokens_seen": 162528464, "step": 75360 }, { "epoch": 13.830978161130483, "grad_norm": 0.0029165938030928373, "learning_rate": 2.6289484058930405e-06, "loss": 0.0, "num_input_tokens_seen": 162539984, "step": 75365 }, { "epoch": 13.831895760690035, "grad_norm": 0.03506956249475479, "learning_rate": 2.6282434408416337e-06, "loss": 0.0007, "num_input_tokens_seen": 162552272, "step": 75370 }, { "epoch": 13.832813360249586, "grad_norm": 0.00033382215769961476, "learning_rate": 2.627538536622109e-06, "loss": 0.0, "num_input_tokens_seen": 162563440, "step": 75375 }, { "epoch": 13.83373095980914, "grad_norm": 0.00018938854918815196, "learning_rate": 2.626833693252544e-06, "loss": 0.0, "num_input_tokens_seen": 162575408, "step": 75380 }, { "epoch": 13.834648559368691, "grad_norm": 0.014811787754297256, "learning_rate": 2.6261289107510148e-06, "loss": 0.0, "num_input_tokens_seen": 162587024, "step": 75385 }, { "epoch": 13.835566158928243, "grad_norm": 0.0010768600041046739, "learning_rate": 2.6254241891356014e-06, "loss": 0.0001, "num_input_tokens_seen": 162598416, "step": 75390 }, { "epoch": 13.836483758487796, "grad_norm": 0.0016142207896336913, "learning_rate": 2.6247195284243776e-06, "loss": 0.0, "num_input_tokens_seen": 162609584, "step": 75395 }, { "epoch": 13.837401358047348, "grad_norm": 0.00023589284683112055, "learning_rate": 2.6240149286354167e-06, "loss": 0.0, "num_input_tokens_seen": 162620176, "step": 75400 }, { "epoch": 13.8383189576069, "grad_norm": 0.0012188099790364504, "learning_rate": 2.6233103897867884e-06, "loss": 0.0, "num_input_tokens_seen": 162631408, "step": 75405 }, { "epoch": 13.839236557166453, "grad_norm": 3.118089199066162, "learning_rate": 2.6226059118965675e-06, "loss": 0.0009, "num_input_tokens_seen": 162642672, "step": 75410 }, { "epoch": 13.840154156726005, "grad_norm": 0.0019422332989051938, "learning_rate": 2.62190149498282e-06, "loss": 0.0, "num_input_tokens_seen": 162653648, "step": 75415 }, { "epoch": 13.841071756285556, "grad_norm": 0.0005697017186321318, "learning_rate": 2.621197139063611e-06, "loss": 0.0, "num_input_tokens_seen": 162663728, "step": 75420 }, { "epoch": 13.84198935584511, "grad_norm": 0.00011967102182097733, "learning_rate": 2.620492844157011e-06, "loss": 0.0001, "num_input_tokens_seen": 162673936, "step": 75425 }, { "epoch": 13.842906955404661, "grad_norm": 0.0001465812383685261, "learning_rate": 2.619788610281081e-06, "loss": 0.0677, "num_input_tokens_seen": 162684080, "step": 75430 }, { "epoch": 13.843824554964213, "grad_norm": 0.0010101583320647478, "learning_rate": 2.619084437453883e-06, "loss": 0.0, "num_input_tokens_seen": 162693840, "step": 75435 }, { "epoch": 13.844742154523766, "grad_norm": 1.0244988203048706, "learning_rate": 2.61838032569348e-06, "loss": 0.0003, "num_input_tokens_seen": 162705168, "step": 75440 }, { "epoch": 13.845659754083318, "grad_norm": 0.0028990488499403, "learning_rate": 2.617676275017932e-06, "loss": 0.0, "num_input_tokens_seen": 162717168, "step": 75445 }, { "epoch": 13.84657735364287, "grad_norm": 0.0004227877070661634, "learning_rate": 2.6169722854452944e-06, "loss": 0.0, "num_input_tokens_seen": 162728400, "step": 75450 }, { "epoch": 13.847494953202423, "grad_norm": 0.0010018389439210296, "learning_rate": 2.6162683569936224e-06, "loss": 0.0, "num_input_tokens_seen": 162739152, "step": 75455 }, { "epoch": 13.848412552761975, "grad_norm": 0.00016998629143927246, "learning_rate": 2.6155644896809745e-06, "loss": 0.0, "num_input_tokens_seen": 162749872, "step": 75460 }, { "epoch": 13.849330152321526, "grad_norm": 0.000357729266397655, "learning_rate": 2.614860683525402e-06, "loss": 0.0, "num_input_tokens_seen": 162760304, "step": 75465 }, { "epoch": 13.85024775188108, "grad_norm": 0.00015784944116603583, "learning_rate": 2.6141569385449545e-06, "loss": 0.0, "num_input_tokens_seen": 162771024, "step": 75470 }, { "epoch": 13.851165351440631, "grad_norm": 0.00010681898857001215, "learning_rate": 2.613453254757686e-06, "loss": 0.0, "num_input_tokens_seen": 162782800, "step": 75475 }, { "epoch": 13.852082951000183, "grad_norm": 0.0007690622005611658, "learning_rate": 2.612749632181642e-06, "loss": 0.0001, "num_input_tokens_seen": 162792848, "step": 75480 }, { "epoch": 13.853000550559736, "grad_norm": 9.722854883875698e-05, "learning_rate": 2.6120460708348685e-06, "loss": 0.0, "num_input_tokens_seen": 162804208, "step": 75485 }, { "epoch": 13.853918150119288, "grad_norm": 0.00027887633768841624, "learning_rate": 2.6113425707354147e-06, "loss": 0.1283, "num_input_tokens_seen": 162815728, "step": 75490 }, { "epoch": 13.85483574967884, "grad_norm": 0.0011704707285389304, "learning_rate": 2.6106391319013208e-06, "loss": 0.0, "num_input_tokens_seen": 162825616, "step": 75495 }, { "epoch": 13.855753349238393, "grad_norm": 0.00011510807235026732, "learning_rate": 2.6099357543506302e-06, "loss": 0.0144, "num_input_tokens_seen": 162837328, "step": 75500 }, { "epoch": 13.856670948797944, "grad_norm": 0.00014299752365332097, "learning_rate": 2.6092324381013823e-06, "loss": 0.0001, "num_input_tokens_seen": 162847888, "step": 75505 }, { "epoch": 13.857588548357496, "grad_norm": 0.0002667316293809563, "learning_rate": 2.6085291831716175e-06, "loss": 0.0, "num_input_tokens_seen": 162859664, "step": 75510 }, { "epoch": 13.85850614791705, "grad_norm": 0.00010721038415795192, "learning_rate": 2.607825989579374e-06, "loss": 0.0, "num_input_tokens_seen": 162871184, "step": 75515 }, { "epoch": 13.859423747476601, "grad_norm": 9.50164976529777e-05, "learning_rate": 2.6071228573426856e-06, "loss": 0.0001, "num_input_tokens_seen": 162881936, "step": 75520 }, { "epoch": 13.860341347036153, "grad_norm": 0.0017210367368534207, "learning_rate": 2.606419786479586e-06, "loss": 0.0, "num_input_tokens_seen": 162893488, "step": 75525 }, { "epoch": 13.861258946595706, "grad_norm": 0.00016514587332494557, "learning_rate": 2.6057167770081104e-06, "loss": 0.0001, "num_input_tokens_seen": 162903376, "step": 75530 }, { "epoch": 13.862176546155258, "grad_norm": 7.554906915174797e-05, "learning_rate": 2.605013828946289e-06, "loss": 0.0, "num_input_tokens_seen": 162914352, "step": 75535 }, { "epoch": 13.86309414571481, "grad_norm": 0.0005288418033160269, "learning_rate": 2.6043109423121506e-06, "loss": 0.0, "num_input_tokens_seen": 162925808, "step": 75540 }, { "epoch": 13.864011745274363, "grad_norm": 0.00019074346346314996, "learning_rate": 2.6036081171237236e-06, "loss": 0.0, "num_input_tokens_seen": 162937520, "step": 75545 }, { "epoch": 13.864929344833914, "grad_norm": 0.00023712108668405563, "learning_rate": 2.6029053533990333e-06, "loss": 0.0001, "num_input_tokens_seen": 162948528, "step": 75550 }, { "epoch": 13.865846944393466, "grad_norm": 0.0014206906780600548, "learning_rate": 2.6022026511561067e-06, "loss": 0.0, "num_input_tokens_seen": 162958832, "step": 75555 }, { "epoch": 13.86676454395302, "grad_norm": 0.0001096729320124723, "learning_rate": 2.601500010412966e-06, "loss": 0.0, "num_input_tokens_seen": 162968752, "step": 75560 }, { "epoch": 13.867682143512571, "grad_norm": 0.006131162401288748, "learning_rate": 2.600797431187633e-06, "loss": 0.0, "num_input_tokens_seen": 162980528, "step": 75565 }, { "epoch": 13.868599743072123, "grad_norm": 0.0012002827133983374, "learning_rate": 2.600094913498125e-06, "loss": 0.2938, "num_input_tokens_seen": 162990000, "step": 75570 }, { "epoch": 13.869517342631676, "grad_norm": 0.0001411353296134621, "learning_rate": 2.599392457362465e-06, "loss": 0.1594, "num_input_tokens_seen": 163000592, "step": 75575 }, { "epoch": 13.870434942191228, "grad_norm": 0.0016058373730629683, "learning_rate": 2.5986900627986677e-06, "loss": 0.0, "num_input_tokens_seen": 163012688, "step": 75580 }, { "epoch": 13.87135254175078, "grad_norm": 0.00029938234365545213, "learning_rate": 2.597987729824749e-06, "loss": 0.0002, "num_input_tokens_seen": 163023888, "step": 75585 }, { "epoch": 13.872270141310333, "grad_norm": 0.00013034880976192653, "learning_rate": 2.5972854584587205e-06, "loss": 0.0, "num_input_tokens_seen": 163032912, "step": 75590 }, { "epoch": 13.873187740869884, "grad_norm": 0.0003927359648514539, "learning_rate": 2.596583248718597e-06, "loss": 0.0003, "num_input_tokens_seen": 163043600, "step": 75595 }, { "epoch": 13.874105340429436, "grad_norm": 0.008549832738935947, "learning_rate": 2.5958811006223893e-06, "loss": 0.0, "num_input_tokens_seen": 163054512, "step": 75600 }, { "epoch": 13.87502293998899, "grad_norm": 9.856963151833043e-05, "learning_rate": 2.5951790141881028e-06, "loss": 0.0003, "num_input_tokens_seen": 163065392, "step": 75605 }, { "epoch": 13.875940539548541, "grad_norm": 20.27092742919922, "learning_rate": 2.5944769894337496e-06, "loss": 0.0097, "num_input_tokens_seen": 163074192, "step": 75610 }, { "epoch": 13.876858139108093, "grad_norm": 0.006417185999453068, "learning_rate": 2.5937750263773336e-06, "loss": 0.0, "num_input_tokens_seen": 163085328, "step": 75615 }, { "epoch": 13.877775738667646, "grad_norm": 0.0002886164584197104, "learning_rate": 2.593073125036857e-06, "loss": 0.0, "num_input_tokens_seen": 163095696, "step": 75620 }, { "epoch": 13.878693338227198, "grad_norm": 0.10624943673610687, "learning_rate": 2.5923712854303256e-06, "loss": 0.0001, "num_input_tokens_seen": 163106096, "step": 75625 }, { "epoch": 13.87961093778675, "grad_norm": 0.0001315514964517206, "learning_rate": 2.59166950757574e-06, "loss": 0.0, "num_input_tokens_seen": 163115760, "step": 75630 }, { "epoch": 13.880528537346303, "grad_norm": 0.0005616341368295252, "learning_rate": 2.5909677914910987e-06, "loss": 0.0, "num_input_tokens_seen": 163126096, "step": 75635 }, { "epoch": 13.881446136905854, "grad_norm": 0.00021110901434440166, "learning_rate": 2.5902661371943977e-06, "loss": 0.0, "num_input_tokens_seen": 163137488, "step": 75640 }, { "epoch": 13.882363736465406, "grad_norm": 133.51626586914062, "learning_rate": 2.5895645447036378e-06, "loss": 0.1969, "num_input_tokens_seen": 163148944, "step": 75645 }, { "epoch": 13.88328133602496, "grad_norm": 0.00042788489372469485, "learning_rate": 2.588863014036811e-06, "loss": 0.0003, "num_input_tokens_seen": 163159760, "step": 75650 }, { "epoch": 13.88419893558451, "grad_norm": 0.0008300464251078665, "learning_rate": 2.5881615452119092e-06, "loss": 0.0, "num_input_tokens_seen": 163170256, "step": 75655 }, { "epoch": 13.885116535144062, "grad_norm": 0.00022151802841108292, "learning_rate": 2.5874601382469277e-06, "loss": 0.0001, "num_input_tokens_seen": 163180368, "step": 75660 }, { "epoch": 13.886034134703616, "grad_norm": 0.00010076833132188767, "learning_rate": 2.586758793159855e-06, "loss": 0.0001, "num_input_tokens_seen": 163190576, "step": 75665 }, { "epoch": 13.886951734263167, "grad_norm": 0.0021651897113770247, "learning_rate": 2.586057509968677e-06, "loss": 0.0, "num_input_tokens_seen": 163200272, "step": 75670 }, { "epoch": 13.887869333822719, "grad_norm": 0.005689302459359169, "learning_rate": 2.585356288691384e-06, "loss": 0.0, "num_input_tokens_seen": 163210512, "step": 75675 }, { "epoch": 13.888786933382272, "grad_norm": 0.0006192360888235271, "learning_rate": 2.58465512934596e-06, "loss": 0.0, "num_input_tokens_seen": 163220720, "step": 75680 }, { "epoch": 13.889704532941824, "grad_norm": 0.0001472167787142098, "learning_rate": 2.583954031950389e-06, "loss": 0.0, "num_input_tokens_seen": 163230704, "step": 75685 }, { "epoch": 13.890622132501376, "grad_norm": 8.97441859706305e-05, "learning_rate": 2.5832529965226503e-06, "loss": 0.0, "num_input_tokens_seen": 163241520, "step": 75690 }, { "epoch": 13.891539732060929, "grad_norm": 0.00035251135705038905, "learning_rate": 2.5825520230807288e-06, "loss": 0.0, "num_input_tokens_seen": 163252560, "step": 75695 }, { "epoch": 13.89245733162048, "grad_norm": 0.0009371038759127259, "learning_rate": 2.581851111642601e-06, "loss": 0.0, "num_input_tokens_seen": 163263152, "step": 75700 }, { "epoch": 13.893374931180032, "grad_norm": 0.019957246258854866, "learning_rate": 2.581150262226242e-06, "loss": 0.1688, "num_input_tokens_seen": 163273392, "step": 75705 }, { "epoch": 13.894292530739586, "grad_norm": 0.0004296228289604187, "learning_rate": 2.580449474849632e-06, "loss": 0.0, "num_input_tokens_seen": 163283952, "step": 75710 }, { "epoch": 13.895210130299137, "grad_norm": 0.0005640293238684535, "learning_rate": 2.579748749530744e-06, "loss": 0.0, "num_input_tokens_seen": 163294480, "step": 75715 }, { "epoch": 13.896127729858689, "grad_norm": 0.00016212588525377214, "learning_rate": 2.579048086287549e-06, "loss": 0.0, "num_input_tokens_seen": 163304208, "step": 75720 }, { "epoch": 13.897045329418242, "grad_norm": 0.00028126026154495776, "learning_rate": 2.5783474851380157e-06, "loss": 0.0001, "num_input_tokens_seen": 163314960, "step": 75725 }, { "epoch": 13.897962928977794, "grad_norm": 9.632320143282413e-05, "learning_rate": 2.5776469461001184e-06, "loss": 0.0, "num_input_tokens_seen": 163325616, "step": 75730 }, { "epoch": 13.898880528537346, "grad_norm": 0.0040659671649336815, "learning_rate": 2.5769464691918235e-06, "loss": 0.0, "num_input_tokens_seen": 163337136, "step": 75735 }, { "epoch": 13.899798128096899, "grad_norm": 0.002031522337347269, "learning_rate": 2.5762460544310957e-06, "loss": 0.0, "num_input_tokens_seen": 163348464, "step": 75740 }, { "epoch": 13.90071572765645, "grad_norm": 0.00010567167191766202, "learning_rate": 2.575545701835898e-06, "loss": 0.0, "num_input_tokens_seen": 163359536, "step": 75745 }, { "epoch": 13.901633327216002, "grad_norm": 0.0003334736102260649, "learning_rate": 2.574845411424198e-06, "loss": 0.0425, "num_input_tokens_seen": 163371312, "step": 75750 }, { "epoch": 13.902550926775556, "grad_norm": 8.407198038185015e-05, "learning_rate": 2.5741451832139543e-06, "loss": 0.0, "num_input_tokens_seen": 163381904, "step": 75755 }, { "epoch": 13.903468526335107, "grad_norm": 0.014535538852214813, "learning_rate": 2.573445017223126e-06, "loss": 0.0, "num_input_tokens_seen": 163393680, "step": 75760 }, { "epoch": 13.904386125894659, "grad_norm": 0.007461522705852985, "learning_rate": 2.5727449134696736e-06, "loss": 0.0, "num_input_tokens_seen": 163403856, "step": 75765 }, { "epoch": 13.905303725454212, "grad_norm": 0.00573060242459178, "learning_rate": 2.5720448719715497e-06, "loss": 0.0001, "num_input_tokens_seen": 163414480, "step": 75770 }, { "epoch": 13.906221325013764, "grad_norm": 0.00014576411922462285, "learning_rate": 2.5713448927467134e-06, "loss": 0.0, "num_input_tokens_seen": 163426608, "step": 75775 }, { "epoch": 13.907138924573315, "grad_norm": 0.0066603426821529865, "learning_rate": 2.570644975813117e-06, "loss": 0.0, "num_input_tokens_seen": 163438032, "step": 75780 }, { "epoch": 13.908056524132869, "grad_norm": 0.00025960960192605853, "learning_rate": 2.5699451211887116e-06, "loss": 0.0001, "num_input_tokens_seen": 163449072, "step": 75785 }, { "epoch": 13.90897412369242, "grad_norm": 0.0009582490311004221, "learning_rate": 2.569245328891446e-06, "loss": 0.0, "num_input_tokens_seen": 163459184, "step": 75790 }, { "epoch": 13.909891723251972, "grad_norm": 0.0001408881798852235, "learning_rate": 2.568545598939272e-06, "loss": 0.0, "num_input_tokens_seen": 163470032, "step": 75795 }, { "epoch": 13.910809322811525, "grad_norm": 0.00039869220927357674, "learning_rate": 2.567845931350135e-06, "loss": 0.0, "num_input_tokens_seen": 163482384, "step": 75800 }, { "epoch": 13.911726922371077, "grad_norm": 0.0008166339830495417, "learning_rate": 2.567146326141979e-06, "loss": 0.0, "num_input_tokens_seen": 163492336, "step": 75805 }, { "epoch": 13.912644521930629, "grad_norm": 0.001960785361006856, "learning_rate": 2.5664467833327498e-06, "loss": 0.0, "num_input_tokens_seen": 163502672, "step": 75810 }, { "epoch": 13.913562121490182, "grad_norm": 0.00034738844260573387, "learning_rate": 2.56574730294039e-06, "loss": 0.0, "num_input_tokens_seen": 163513520, "step": 75815 }, { "epoch": 13.914479721049734, "grad_norm": 0.0001177136437036097, "learning_rate": 2.565047884982839e-06, "loss": 0.0001, "num_input_tokens_seen": 163524656, "step": 75820 }, { "epoch": 13.915397320609285, "grad_norm": 0.0016803100006654859, "learning_rate": 2.564348529478034e-06, "loss": 0.0002, "num_input_tokens_seen": 163535088, "step": 75825 }, { "epoch": 13.916314920168839, "grad_norm": 0.003108939388766885, "learning_rate": 2.5636492364439158e-06, "loss": 0.0, "num_input_tokens_seen": 163545520, "step": 75830 }, { "epoch": 13.91723251972839, "grad_norm": 0.000211047736229375, "learning_rate": 2.562950005898419e-06, "loss": 0.0, "num_input_tokens_seen": 163556880, "step": 75835 }, { "epoch": 13.918150119287942, "grad_norm": 0.00014413784083444625, "learning_rate": 2.5622508378594757e-06, "loss": 0.0, "num_input_tokens_seen": 163567728, "step": 75840 }, { "epoch": 13.919067718847495, "grad_norm": 0.0011619544820860028, "learning_rate": 2.5615517323450223e-06, "loss": 0.0, "num_input_tokens_seen": 163578640, "step": 75845 }, { "epoch": 13.919985318407047, "grad_norm": 0.00021306394774001092, "learning_rate": 2.560852689372987e-06, "loss": 0.0026, "num_input_tokens_seen": 163589040, "step": 75850 }, { "epoch": 13.920902917966599, "grad_norm": 0.0014188821660354733, "learning_rate": 2.5601537089613005e-06, "loss": 0.0, "num_input_tokens_seen": 163599376, "step": 75855 }, { "epoch": 13.921820517526152, "grad_norm": 0.020729990676045418, "learning_rate": 2.559454791127888e-06, "loss": 0.0, "num_input_tokens_seen": 163611472, "step": 75860 }, { "epoch": 13.922738117085704, "grad_norm": 0.00044353134580887854, "learning_rate": 2.5587559358906788e-06, "loss": 0.0, "num_input_tokens_seen": 163622224, "step": 75865 }, { "epoch": 13.923655716645255, "grad_norm": 0.002376631135120988, "learning_rate": 2.558057143267597e-06, "loss": 0.0, "num_input_tokens_seen": 163632144, "step": 75870 }, { "epoch": 13.924573316204809, "grad_norm": 0.00015972094843164086, "learning_rate": 2.5573584132765627e-06, "loss": 0.0, "num_input_tokens_seen": 163642512, "step": 75875 }, { "epoch": 13.92549091576436, "grad_norm": 9.263788524549454e-05, "learning_rate": 2.5566597459355013e-06, "loss": 0.0007, "num_input_tokens_seen": 163652560, "step": 75880 }, { "epoch": 13.926408515323912, "grad_norm": 0.26558718085289, "learning_rate": 2.555961141262331e-06, "loss": 0.0, "num_input_tokens_seen": 163663984, "step": 75885 }, { "epoch": 13.927326114883465, "grad_norm": 0.0002688198001123965, "learning_rate": 2.555262599274967e-06, "loss": 0.0, "num_input_tokens_seen": 163674800, "step": 75890 }, { "epoch": 13.928243714443017, "grad_norm": 0.00024305695842485875, "learning_rate": 2.5545641199913297e-06, "loss": 0.0, "num_input_tokens_seen": 163685296, "step": 75895 }, { "epoch": 13.929161314002569, "grad_norm": 0.0002899712126236409, "learning_rate": 2.5538657034293335e-06, "loss": 0.0063, "num_input_tokens_seen": 163696560, "step": 75900 }, { "epoch": 13.930078913562122, "grad_norm": 0.00011877741781063378, "learning_rate": 2.553167349606891e-06, "loss": 0.0, "num_input_tokens_seen": 163707376, "step": 75905 }, { "epoch": 13.930996513121674, "grad_norm": 0.004178804811090231, "learning_rate": 2.552469058541911e-06, "loss": 0.3016, "num_input_tokens_seen": 163718448, "step": 75910 }, { "epoch": 13.931914112681225, "grad_norm": 0.0019945392850786448, "learning_rate": 2.5517708302523092e-06, "loss": 0.0, "num_input_tokens_seen": 163728272, "step": 75915 }, { "epoch": 13.932831712240779, "grad_norm": 0.00017610400391276926, "learning_rate": 2.5510726647559904e-06, "loss": 0.0, "num_input_tokens_seen": 163738608, "step": 75920 }, { "epoch": 13.93374931180033, "grad_norm": 0.0005939681432209909, "learning_rate": 2.5503745620708607e-06, "loss": 0.0, "num_input_tokens_seen": 163750224, "step": 75925 }, { "epoch": 13.934666911359882, "grad_norm": 0.00013687406317330897, "learning_rate": 2.549676522214829e-06, "loss": 0.0, "num_input_tokens_seen": 163760720, "step": 75930 }, { "epoch": 13.935584510919435, "grad_norm": 8.930795593187213e-05, "learning_rate": 2.5489785452057965e-06, "loss": 0.0, "num_input_tokens_seen": 163772400, "step": 75935 }, { "epoch": 13.936502110478987, "grad_norm": 0.00041686996701173484, "learning_rate": 2.5482806310616635e-06, "loss": 0.0, "num_input_tokens_seen": 163783536, "step": 75940 }, { "epoch": 13.937419710038538, "grad_norm": 0.0005969146732240915, "learning_rate": 2.547582779800335e-06, "loss": 0.0, "num_input_tokens_seen": 163796048, "step": 75945 }, { "epoch": 13.938337309598092, "grad_norm": 0.0018251418368890882, "learning_rate": 2.5468849914397067e-06, "loss": 0.0, "num_input_tokens_seen": 163806896, "step": 75950 }, { "epoch": 13.939254909157643, "grad_norm": 0.00016218170640058815, "learning_rate": 2.5461872659976766e-06, "loss": 0.0, "num_input_tokens_seen": 163817392, "step": 75955 }, { "epoch": 13.940172508717195, "grad_norm": 0.0008080117404460907, "learning_rate": 2.5454896034921402e-06, "loss": 0.0, "num_input_tokens_seen": 163825904, "step": 75960 }, { "epoch": 13.941090108276748, "grad_norm": 0.00017679392476566136, "learning_rate": 2.544792003940989e-06, "loss": 0.0001, "num_input_tokens_seen": 163837328, "step": 75965 }, { "epoch": 13.9420077078363, "grad_norm": 0.00018181624182034284, "learning_rate": 2.5440944673621204e-06, "loss": 0.0, "num_input_tokens_seen": 163847248, "step": 75970 }, { "epoch": 13.942925307395852, "grad_norm": 0.0013277595862746239, "learning_rate": 2.5433969937734216e-06, "loss": 0.0, "num_input_tokens_seen": 163858128, "step": 75975 }, { "epoch": 13.943842906955405, "grad_norm": 0.00028608247521333396, "learning_rate": 2.5426995831927827e-06, "loss": 0.0, "num_input_tokens_seen": 163868208, "step": 75980 }, { "epoch": 13.944760506514957, "grad_norm": 0.0003238593926653266, "learning_rate": 2.5420022356380912e-06, "loss": 0.0, "num_input_tokens_seen": 163879984, "step": 75985 }, { "epoch": 13.945678106074508, "grad_norm": 0.012964833527803421, "learning_rate": 2.5413049511272307e-06, "loss": 0.147, "num_input_tokens_seen": 163891152, "step": 75990 }, { "epoch": 13.946595705634062, "grad_norm": 0.00019883541972376406, "learning_rate": 2.5406077296780895e-06, "loss": 0.0, "num_input_tokens_seen": 163901968, "step": 75995 }, { "epoch": 13.947513305193613, "grad_norm": 0.0012015813263133168, "learning_rate": 2.5399105713085486e-06, "loss": 0.0, "num_input_tokens_seen": 163912112, "step": 76000 }, { "epoch": 13.948430904753165, "grad_norm": 0.00022842857288196683, "learning_rate": 2.539213476036489e-06, "loss": 0.0002, "num_input_tokens_seen": 163923664, "step": 76005 }, { "epoch": 13.949348504312718, "grad_norm": 0.00031607059645466506, "learning_rate": 2.5385164438797872e-06, "loss": 0.0, "num_input_tokens_seen": 163934896, "step": 76010 }, { "epoch": 13.95026610387227, "grad_norm": 0.0032028595451265574, "learning_rate": 2.5378194748563264e-06, "loss": 0.0, "num_input_tokens_seen": 163945392, "step": 76015 }, { "epoch": 13.951183703431822, "grad_norm": 0.0018219905905425549, "learning_rate": 2.5371225689839795e-06, "loss": 0.0, "num_input_tokens_seen": 163955248, "step": 76020 }, { "epoch": 13.952101302991375, "grad_norm": 0.03689805045723915, "learning_rate": 2.536425726280619e-06, "loss": 0.1129, "num_input_tokens_seen": 163966000, "step": 76025 }, { "epoch": 13.953018902550927, "grad_norm": 0.28412553668022156, "learning_rate": 2.535728946764123e-06, "loss": 0.0003, "num_input_tokens_seen": 163976112, "step": 76030 }, { "epoch": 13.953936502110478, "grad_norm": 0.00019234386854805052, "learning_rate": 2.535032230452361e-06, "loss": 0.0, "num_input_tokens_seen": 163986640, "step": 76035 }, { "epoch": 13.954854101670032, "grad_norm": 0.0004292753292247653, "learning_rate": 2.534335577363201e-06, "loss": 0.0, "num_input_tokens_seen": 163996016, "step": 76040 }, { "epoch": 13.955771701229583, "grad_norm": 1.7950069904327393, "learning_rate": 2.5336389875145105e-06, "loss": 0.0004, "num_input_tokens_seen": 164005200, "step": 76045 }, { "epoch": 13.956689300789135, "grad_norm": 0.01663125678896904, "learning_rate": 2.5329424609241593e-06, "loss": 0.0, "num_input_tokens_seen": 164014192, "step": 76050 }, { "epoch": 13.957606900348688, "grad_norm": 0.0001774149714037776, "learning_rate": 2.53224599761001e-06, "loss": 0.0, "num_input_tokens_seen": 164025296, "step": 76055 }, { "epoch": 13.95852449990824, "grad_norm": 0.020539676770567894, "learning_rate": 2.531549597589925e-06, "loss": 0.0, "num_input_tokens_seen": 164036144, "step": 76060 }, { "epoch": 13.959442099467791, "grad_norm": 0.009657595306634903, "learning_rate": 2.530853260881768e-06, "loss": 0.0, "num_input_tokens_seen": 164045456, "step": 76065 }, { "epoch": 13.960359699027345, "grad_norm": 20.91965103149414, "learning_rate": 2.530156987503399e-06, "loss": 0.0451, "num_input_tokens_seen": 164056944, "step": 76070 }, { "epoch": 13.961277298586896, "grad_norm": 0.0008390630828216672, "learning_rate": 2.529460777472673e-06, "loss": 0.0, "num_input_tokens_seen": 164068752, "step": 76075 }, { "epoch": 13.962194898146448, "grad_norm": 0.00010046411625808105, "learning_rate": 2.5287646308074507e-06, "loss": 0.0, "num_input_tokens_seen": 164079600, "step": 76080 }, { "epoch": 13.963112497706001, "grad_norm": 0.00012428282934706658, "learning_rate": 2.528068547525586e-06, "loss": 0.0, "num_input_tokens_seen": 164091056, "step": 76085 }, { "epoch": 13.964030097265553, "grad_norm": 0.000723593810107559, "learning_rate": 2.5273725276449323e-06, "loss": 0.0, "num_input_tokens_seen": 164102672, "step": 76090 }, { "epoch": 13.964947696825105, "grad_norm": 0.018210623413324356, "learning_rate": 2.5266765711833387e-06, "loss": 0.0003, "num_input_tokens_seen": 164113776, "step": 76095 }, { "epoch": 13.965865296384658, "grad_norm": 0.0051294006407260895, "learning_rate": 2.5259806781586595e-06, "loss": 0.0, "num_input_tokens_seen": 164125200, "step": 76100 }, { "epoch": 13.96678289594421, "grad_norm": 0.002917303703725338, "learning_rate": 2.5252848485887416e-06, "loss": 0.0, "num_input_tokens_seen": 164135856, "step": 76105 }, { "epoch": 13.967700495503761, "grad_norm": 0.0008961131679825485, "learning_rate": 2.52458908249143e-06, "loss": 0.0006, "num_input_tokens_seen": 164147344, "step": 76110 }, { "epoch": 13.968618095063315, "grad_norm": 0.0011893085902556777, "learning_rate": 2.5238933798845733e-06, "loss": 0.0001, "num_input_tokens_seen": 164157744, "step": 76115 }, { "epoch": 13.969535694622866, "grad_norm": 0.0011057488154619932, "learning_rate": 2.523197740786014e-06, "loss": 0.0, "num_input_tokens_seen": 164167824, "step": 76120 }, { "epoch": 13.970453294182418, "grad_norm": 0.0006779703544452786, "learning_rate": 2.522502165213593e-06, "loss": 0.0, "num_input_tokens_seen": 164179568, "step": 76125 }, { "epoch": 13.971370893741971, "grad_norm": 0.05278955399990082, "learning_rate": 2.5218066531851525e-06, "loss": 0.0001, "num_input_tokens_seen": 164190192, "step": 76130 }, { "epoch": 13.972288493301523, "grad_norm": 0.004341667052358389, "learning_rate": 2.521111204718531e-06, "loss": 0.0, "num_input_tokens_seen": 164202576, "step": 76135 }, { "epoch": 13.973206092861075, "grad_norm": 0.00428536394611001, "learning_rate": 2.5204158198315652e-06, "loss": 0.0, "num_input_tokens_seen": 164213200, "step": 76140 }, { "epoch": 13.974123692420628, "grad_norm": 0.00011535468365764245, "learning_rate": 2.5197204985420886e-06, "loss": 0.0003, "num_input_tokens_seen": 164223344, "step": 76145 }, { "epoch": 13.97504129198018, "grad_norm": 0.0006612797733396292, "learning_rate": 2.519025240867938e-06, "loss": 0.0, "num_input_tokens_seen": 164234160, "step": 76150 }, { "epoch": 13.975958891539731, "grad_norm": 0.055821195244789124, "learning_rate": 2.518330046826947e-06, "loss": 0.0002, "num_input_tokens_seen": 164243184, "step": 76155 }, { "epoch": 13.976876491099285, "grad_norm": 0.00027219814364798367, "learning_rate": 2.5176349164369405e-06, "loss": 0.0001, "num_input_tokens_seen": 164254064, "step": 76160 }, { "epoch": 13.977794090658836, "grad_norm": 0.0005493871285580099, "learning_rate": 2.516939849715754e-06, "loss": 0.0, "num_input_tokens_seen": 164264336, "step": 76165 }, { "epoch": 13.978711690218388, "grad_norm": 0.009053459390997887, "learning_rate": 2.5162448466812106e-06, "loss": 0.0, "num_input_tokens_seen": 164275440, "step": 76170 }, { "epoch": 13.979629289777941, "grad_norm": 0.03620783984661102, "learning_rate": 2.515549907351138e-06, "loss": 0.0, "num_input_tokens_seen": 164286352, "step": 76175 }, { "epoch": 13.980546889337493, "grad_norm": 0.0009446987533010542, "learning_rate": 2.5148550317433606e-06, "loss": 0.0735, "num_input_tokens_seen": 164297104, "step": 76180 }, { "epoch": 13.981464488897045, "grad_norm": 0.0071288119070231915, "learning_rate": 2.5141602198756993e-06, "loss": 0.0264, "num_input_tokens_seen": 164308048, "step": 76185 }, { "epoch": 13.982382088456598, "grad_norm": 0.0030401817057281733, "learning_rate": 2.5134654717659735e-06, "loss": 0.0002, "num_input_tokens_seen": 164319248, "step": 76190 }, { "epoch": 13.98329968801615, "grad_norm": 0.00022993011225480586, "learning_rate": 2.5127707874320066e-06, "loss": 0.0057, "num_input_tokens_seen": 164329744, "step": 76195 }, { "epoch": 13.984217287575701, "grad_norm": 0.0002564409514889121, "learning_rate": 2.512076166891615e-06, "loss": 0.0, "num_input_tokens_seen": 164339952, "step": 76200 }, { "epoch": 13.985134887135255, "grad_norm": 0.13082584738731384, "learning_rate": 2.5113816101626127e-06, "loss": 0.0, "num_input_tokens_seen": 164351504, "step": 76205 }, { "epoch": 13.986052486694806, "grad_norm": 0.0013955890899524093, "learning_rate": 2.5106871172628133e-06, "loss": 0.0, "num_input_tokens_seen": 164361488, "step": 76210 }, { "epoch": 13.986970086254358, "grad_norm": 0.000135029258672148, "learning_rate": 2.5099926882100335e-06, "loss": 0.0, "num_input_tokens_seen": 164371728, "step": 76215 }, { "epoch": 13.987887685813911, "grad_norm": 0.0005844796542078257, "learning_rate": 2.5092983230220824e-06, "loss": 0.0, "num_input_tokens_seen": 164382288, "step": 76220 }, { "epoch": 13.988805285373463, "grad_norm": 0.017590554431080818, "learning_rate": 2.5086040217167683e-06, "loss": 0.0, "num_input_tokens_seen": 164392880, "step": 76225 }, { "epoch": 13.989722884933014, "grad_norm": 0.00012390837946441025, "learning_rate": 2.5079097843118984e-06, "loss": 0.0, "num_input_tokens_seen": 164402800, "step": 76230 }, { "epoch": 13.990640484492568, "grad_norm": 0.00041545447311364114, "learning_rate": 2.507215610825282e-06, "loss": 0.0, "num_input_tokens_seen": 164413744, "step": 76235 }, { "epoch": 13.99155808405212, "grad_norm": 0.0008793519227765501, "learning_rate": 2.506521501274722e-06, "loss": 0.0974, "num_input_tokens_seen": 164424656, "step": 76240 }, { "epoch": 13.992475683611671, "grad_norm": 0.023930255323648453, "learning_rate": 2.505827455678018e-06, "loss": 0.0001, "num_input_tokens_seen": 164435824, "step": 76245 }, { "epoch": 13.993393283171224, "grad_norm": 46.50698471069336, "learning_rate": 2.505133474052977e-06, "loss": 0.0426, "num_input_tokens_seen": 164447312, "step": 76250 }, { "epoch": 13.994310882730776, "grad_norm": 0.001117779640480876, "learning_rate": 2.504439556417395e-06, "loss": 0.0, "num_input_tokens_seen": 164458736, "step": 76255 }, { "epoch": 13.995228482290328, "grad_norm": 0.0013055165763944387, "learning_rate": 2.50374570278907e-06, "loss": 0.0, "num_input_tokens_seen": 164469616, "step": 76260 }, { "epoch": 13.996146081849881, "grad_norm": 0.00016277293616440147, "learning_rate": 2.5030519131857994e-06, "loss": 0.0, "num_input_tokens_seen": 164480080, "step": 76265 }, { "epoch": 13.997063681409433, "grad_norm": 0.0008217521826736629, "learning_rate": 2.5023581876253776e-06, "loss": 0.0004, "num_input_tokens_seen": 164491120, "step": 76270 }, { "epoch": 13.997981280968984, "grad_norm": 0.0003385292657185346, "learning_rate": 2.501664526125598e-06, "loss": 0.147, "num_input_tokens_seen": 164501168, "step": 76275 }, { "epoch": 13.998898880528538, "grad_norm": 0.0002614146505948156, "learning_rate": 2.5009709287042485e-06, "loss": 0.0, "num_input_tokens_seen": 164512368, "step": 76280 }, { "epoch": 13.99981648008809, "grad_norm": 0.0010643735295161605, "learning_rate": 2.5002773953791238e-06, "loss": 0.0, "num_input_tokens_seen": 164522832, "step": 76285 }, { "epoch": 14.0, "eval_loss": 0.5068264603614807, "eval_runtime": 179.02, "eval_samples_per_second": 30.438, "eval_steps_per_second": 7.614, "num_input_tokens_seen": 164523360, "step": 76286 }, { "epoch": 14.000734079647641, "grad_norm": 0.00610599247738719, "learning_rate": 2.49958392616801e-06, "loss": 0.0, "num_input_tokens_seen": 164530816, "step": 76290 }, { "epoch": 14.001651679207194, "grad_norm": 0.00400211475789547, "learning_rate": 2.4988905210886904e-06, "loss": 0.0, "num_input_tokens_seen": 164541248, "step": 76295 }, { "epoch": 14.002569278766746, "grad_norm": 0.10996038466691971, "learning_rate": 2.498197180158955e-06, "loss": 0.0001, "num_input_tokens_seen": 164552192, "step": 76300 }, { "epoch": 14.003486878326298, "grad_norm": 0.0003328394377604127, "learning_rate": 2.4975039033965847e-06, "loss": 0.0, "num_input_tokens_seen": 164563008, "step": 76305 }, { "epoch": 14.004404477885851, "grad_norm": 0.00029958909726701677, "learning_rate": 2.496810690819361e-06, "loss": 0.0, "num_input_tokens_seen": 164572992, "step": 76310 }, { "epoch": 14.005322077445403, "grad_norm": 0.0006310667959041893, "learning_rate": 2.4961175424450608e-06, "loss": 0.0, "num_input_tokens_seen": 164585088, "step": 76315 }, { "epoch": 14.006239677004954, "grad_norm": 0.003446887945756316, "learning_rate": 2.4954244582914673e-06, "loss": 0.0, "num_input_tokens_seen": 164596736, "step": 76320 }, { "epoch": 14.007157276564508, "grad_norm": 0.002877471735700965, "learning_rate": 2.4947314383763544e-06, "loss": 0.0, "num_input_tokens_seen": 164607744, "step": 76325 }, { "epoch": 14.00807487612406, "grad_norm": 0.0001568294974276796, "learning_rate": 2.4940384827174956e-06, "loss": 0.0001, "num_input_tokens_seen": 164617184, "step": 76330 }, { "epoch": 14.00899247568361, "grad_norm": 0.0019906102679669857, "learning_rate": 2.4933455913326678e-06, "loss": 0.0, "num_input_tokens_seen": 164628224, "step": 76335 }, { "epoch": 14.009910075243164, "grad_norm": 0.0018928140634670854, "learning_rate": 2.492652764239641e-06, "loss": 0.0, "num_input_tokens_seen": 164639776, "step": 76340 }, { "epoch": 14.010827674802716, "grad_norm": 0.00020728832168970257, "learning_rate": 2.4919600014561824e-06, "loss": 0.0001, "num_input_tokens_seen": 164650816, "step": 76345 }, { "epoch": 14.011745274362267, "grad_norm": 0.00037659198278561234, "learning_rate": 2.4912673030000646e-06, "loss": 0.0, "num_input_tokens_seen": 164660896, "step": 76350 }, { "epoch": 14.01266287392182, "grad_norm": 9.94750444078818e-05, "learning_rate": 2.490574668889052e-06, "loss": 0.04, "num_input_tokens_seen": 164670976, "step": 76355 }, { "epoch": 14.013580473481372, "grad_norm": 0.00017903590924106538, "learning_rate": 2.48988209914091e-06, "loss": 0.0002, "num_input_tokens_seen": 164681088, "step": 76360 }, { "epoch": 14.014498073040924, "grad_norm": 0.0028858077712357044, "learning_rate": 2.4891895937734e-06, "loss": 0.0, "num_input_tokens_seen": 164692192, "step": 76365 }, { "epoch": 14.015415672600477, "grad_norm": 0.00018922587332781404, "learning_rate": 2.4884971528042877e-06, "loss": 0.0001, "num_input_tokens_seen": 164702560, "step": 76370 }, { "epoch": 14.016333272160029, "grad_norm": 0.002631609793752432, "learning_rate": 2.487804776251331e-06, "loss": 0.0, "num_input_tokens_seen": 164712512, "step": 76375 }, { "epoch": 14.01725087171958, "grad_norm": 0.00025011113029904664, "learning_rate": 2.487112464132288e-06, "loss": 0.0001, "num_input_tokens_seen": 164725344, "step": 76380 }, { "epoch": 14.018168471279134, "grad_norm": 0.00131167471408844, "learning_rate": 2.4864202164649136e-06, "loss": 0.3469, "num_input_tokens_seen": 164735584, "step": 76385 }, { "epoch": 14.019086070838686, "grad_norm": 0.00023806377430446446, "learning_rate": 2.485728033266967e-06, "loss": 0.0, "num_input_tokens_seen": 164746432, "step": 76390 }, { "epoch": 14.020003670398237, "grad_norm": 0.0026336070150136948, "learning_rate": 2.4850359145562e-06, "loss": 0.0, "num_input_tokens_seen": 164756256, "step": 76395 }, { "epoch": 14.02092126995779, "grad_norm": 0.006523351650685072, "learning_rate": 2.4843438603503633e-06, "loss": 0.0001, "num_input_tokens_seen": 164767200, "step": 76400 }, { "epoch": 14.021838869517342, "grad_norm": 0.004686594940721989, "learning_rate": 2.4836518706672076e-06, "loss": 0.0, "num_input_tokens_seen": 164778784, "step": 76405 }, { "epoch": 14.022756469076894, "grad_norm": 0.00013460828631650656, "learning_rate": 2.4829599455244803e-06, "loss": 0.0, "num_input_tokens_seen": 164789760, "step": 76410 }, { "epoch": 14.023674068636447, "grad_norm": 0.0005548296612687409, "learning_rate": 2.4822680849399306e-06, "loss": 0.0, "num_input_tokens_seen": 164801024, "step": 76415 }, { "epoch": 14.024591668195999, "grad_norm": 0.0068453033454716206, "learning_rate": 2.481576288931302e-06, "loss": 0.0, "num_input_tokens_seen": 164812000, "step": 76420 }, { "epoch": 14.02550926775555, "grad_norm": 0.006751914508640766, "learning_rate": 2.4808845575163395e-06, "loss": 0.0, "num_input_tokens_seen": 164823968, "step": 76425 }, { "epoch": 14.026426867315104, "grad_norm": 0.0007623389828950167, "learning_rate": 2.4801928907127814e-06, "loss": 0.0, "num_input_tokens_seen": 164835488, "step": 76430 }, { "epoch": 14.027344466874656, "grad_norm": 0.005018539261072874, "learning_rate": 2.479501288538372e-06, "loss": 0.0, "num_input_tokens_seen": 164847136, "step": 76435 }, { "epoch": 14.028262066434207, "grad_norm": 0.0003391382342670113, "learning_rate": 2.478809751010848e-06, "loss": 0.0, "num_input_tokens_seen": 164857408, "step": 76440 }, { "epoch": 14.02917966599376, "grad_norm": 0.002857967047020793, "learning_rate": 2.478118278147945e-06, "loss": 0.0001, "num_input_tokens_seen": 164866560, "step": 76445 }, { "epoch": 14.030097265553312, "grad_norm": 0.0003126736846752465, "learning_rate": 2.4774268699674016e-06, "loss": 0.0, "num_input_tokens_seen": 164877152, "step": 76450 }, { "epoch": 14.031014865112864, "grad_norm": 0.0001795166899682954, "learning_rate": 2.4767355264869493e-06, "loss": 0.0, "num_input_tokens_seen": 164889472, "step": 76455 }, { "epoch": 14.031932464672417, "grad_norm": 0.00030281409271992743, "learning_rate": 2.4760442477243197e-06, "loss": 0.0002, "num_input_tokens_seen": 164899360, "step": 76460 }, { "epoch": 14.032850064231969, "grad_norm": 0.0017525871517136693, "learning_rate": 2.4753530336972413e-06, "loss": 0.0, "num_input_tokens_seen": 164910528, "step": 76465 }, { "epoch": 14.03376766379152, "grad_norm": 0.0029263587202876806, "learning_rate": 2.474661884423447e-06, "loss": 0.0352, "num_input_tokens_seen": 164922400, "step": 76470 }, { "epoch": 14.034685263351074, "grad_norm": 0.059128306806087494, "learning_rate": 2.4739707999206613e-06, "loss": 0.0, "num_input_tokens_seen": 164934112, "step": 76475 }, { "epoch": 14.035602862910626, "grad_norm": 0.0010225680889561772, "learning_rate": 2.473279780206608e-06, "loss": 0.0003, "num_input_tokens_seen": 164945408, "step": 76480 }, { "epoch": 14.036520462470177, "grad_norm": 0.0021004201844334602, "learning_rate": 2.472588825299014e-06, "loss": 0.0008, "num_input_tokens_seen": 164956416, "step": 76485 }, { "epoch": 14.03743806202973, "grad_norm": 0.0004222586576361209, "learning_rate": 2.4718979352155993e-06, "loss": 0.0, "num_input_tokens_seen": 164966656, "step": 76490 }, { "epoch": 14.038355661589282, "grad_norm": 0.006368235684931278, "learning_rate": 2.471207109974085e-06, "loss": 0.0, "num_input_tokens_seen": 164977120, "step": 76495 }, { "epoch": 14.039273261148834, "grad_norm": 0.002753910841420293, "learning_rate": 2.4705163495921864e-06, "loss": 0.0, "num_input_tokens_seen": 164988672, "step": 76500 }, { "epoch": 14.040190860708387, "grad_norm": 0.00031065772054716945, "learning_rate": 2.469825654087625e-06, "loss": 0.0376, "num_input_tokens_seen": 164999200, "step": 76505 }, { "epoch": 14.041108460267939, "grad_norm": 0.0012949665542691946, "learning_rate": 2.469135023478114e-06, "loss": 0.0, "num_input_tokens_seen": 165010368, "step": 76510 }, { "epoch": 14.04202605982749, "grad_norm": 0.00015576825535390526, "learning_rate": 2.468444457781366e-06, "loss": 0.0, "num_input_tokens_seen": 165021664, "step": 76515 }, { "epoch": 14.042943659387044, "grad_norm": 0.000135554262669757, "learning_rate": 2.4677539570150955e-06, "loss": 0.0, "num_input_tokens_seen": 165032448, "step": 76520 }, { "epoch": 14.043861258946595, "grad_norm": 0.0002518024994060397, "learning_rate": 2.4670635211970116e-06, "loss": 0.0, "num_input_tokens_seen": 165043584, "step": 76525 }, { "epoch": 14.044778858506147, "grad_norm": 0.0001972404570551589, "learning_rate": 2.4663731503448208e-06, "loss": 0.0, "num_input_tokens_seen": 165054464, "step": 76530 }, { "epoch": 14.0456964580657, "grad_norm": 0.00016627807053737342, "learning_rate": 2.4656828444762337e-06, "loss": 0.0, "num_input_tokens_seen": 165066208, "step": 76535 }, { "epoch": 14.046614057625252, "grad_norm": 0.00022340698342304677, "learning_rate": 2.464992603608954e-06, "loss": 0.0, "num_input_tokens_seen": 165077184, "step": 76540 }, { "epoch": 14.047531657184804, "grad_norm": 0.0007902688812464476, "learning_rate": 2.4643024277606846e-06, "loss": 0.0, "num_input_tokens_seen": 165087584, "step": 76545 }, { "epoch": 14.048449256744357, "grad_norm": 0.0005933790234848857, "learning_rate": 2.4636123169491265e-06, "loss": 0.0001, "num_input_tokens_seen": 165099456, "step": 76550 }, { "epoch": 14.049366856303909, "grad_norm": 0.00020568682521115988, "learning_rate": 2.4629222711919836e-06, "loss": 0.0, "num_input_tokens_seen": 165110624, "step": 76555 }, { "epoch": 14.05028445586346, "grad_norm": 0.0007276576943695545, "learning_rate": 2.4622322905069517e-06, "loss": 0.0, "num_input_tokens_seen": 165121408, "step": 76560 }, { "epoch": 14.051202055423014, "grad_norm": 0.00018458823615219444, "learning_rate": 2.4615423749117266e-06, "loss": 0.0, "num_input_tokens_seen": 165132288, "step": 76565 }, { "epoch": 14.052119654982565, "grad_norm": 0.00010810370440594852, "learning_rate": 2.460852524424008e-06, "loss": 0.0, "num_input_tokens_seen": 165143776, "step": 76570 }, { "epoch": 14.053037254542117, "grad_norm": 0.0010640942491590977, "learning_rate": 2.460162739061486e-06, "loss": 0.0, "num_input_tokens_seen": 165154912, "step": 76575 }, { "epoch": 14.05395485410167, "grad_norm": 0.00043931687832809985, "learning_rate": 2.4594730188418513e-06, "loss": 0.2188, "num_input_tokens_seen": 165166240, "step": 76580 }, { "epoch": 14.054872453661222, "grad_norm": 0.0008721821359358728, "learning_rate": 2.4587833637827986e-06, "loss": 0.0, "num_input_tokens_seen": 165175232, "step": 76585 }, { "epoch": 14.055790053220774, "grad_norm": 0.0008190061198547482, "learning_rate": 2.458093773902014e-06, "loss": 0.0, "num_input_tokens_seen": 165186688, "step": 76590 }, { "epoch": 14.056707652780327, "grad_norm": 0.0006743734702467918, "learning_rate": 2.4574042492171844e-06, "loss": 0.0, "num_input_tokens_seen": 165197152, "step": 76595 }, { "epoch": 14.057625252339879, "grad_norm": 0.002839938271790743, "learning_rate": 2.4567147897459954e-06, "loss": 0.0003, "num_input_tokens_seen": 165208832, "step": 76600 }, { "epoch": 14.05854285189943, "grad_norm": 0.0007513671298511326, "learning_rate": 2.456025395506128e-06, "loss": 0.0, "num_input_tokens_seen": 165219840, "step": 76605 }, { "epoch": 14.059460451458984, "grad_norm": 0.000814327911939472, "learning_rate": 2.4553360665152685e-06, "loss": 0.0, "num_input_tokens_seen": 165231072, "step": 76610 }, { "epoch": 14.060378051018535, "grad_norm": 0.00010335948900319636, "learning_rate": 2.4546468027910952e-06, "loss": 0.0, "num_input_tokens_seen": 165241472, "step": 76615 }, { "epoch": 14.061295650578089, "grad_norm": 0.02189691737294197, "learning_rate": 2.4539576043512862e-06, "loss": 0.0, "num_input_tokens_seen": 165251584, "step": 76620 }, { "epoch": 14.06221325013764, "grad_norm": 0.0007332430104725063, "learning_rate": 2.453268471213519e-06, "loss": 0.0, "num_input_tokens_seen": 165261440, "step": 76625 }, { "epoch": 14.063130849697192, "grad_norm": 0.0004157390503678471, "learning_rate": 2.4525794033954657e-06, "loss": 0.0005, "num_input_tokens_seen": 165272384, "step": 76630 }, { "epoch": 14.064048449256745, "grad_norm": 0.01238133292645216, "learning_rate": 2.4518904009148054e-06, "loss": 0.0, "num_input_tokens_seen": 165283520, "step": 76635 }, { "epoch": 14.064966048816297, "grad_norm": 0.0014367287512868643, "learning_rate": 2.4512014637892067e-06, "loss": 0.0, "num_input_tokens_seen": 165294080, "step": 76640 }, { "epoch": 14.065883648375848, "grad_norm": 0.0010111260926350951, "learning_rate": 2.4505125920363403e-06, "loss": 0.0, "num_input_tokens_seen": 165304960, "step": 76645 }, { "epoch": 14.066801247935402, "grad_norm": 0.0028428847435861826, "learning_rate": 2.4498237856738728e-06, "loss": 0.0, "num_input_tokens_seen": 165317088, "step": 76650 }, { "epoch": 14.067718847494953, "grad_norm": 0.0005062037380412221, "learning_rate": 2.449135044719474e-06, "loss": 0.0, "num_input_tokens_seen": 165327648, "step": 76655 }, { "epoch": 14.068636447054505, "grad_norm": 0.00034187737037427723, "learning_rate": 2.4484463691908082e-06, "loss": 0.0, "num_input_tokens_seen": 165338464, "step": 76660 }, { "epoch": 14.069554046614059, "grad_norm": 0.00018393735808786005, "learning_rate": 2.4477577591055368e-06, "loss": 0.0, "num_input_tokens_seen": 165348832, "step": 76665 }, { "epoch": 14.07047164617361, "grad_norm": 0.000569843512494117, "learning_rate": 2.4470692144813254e-06, "loss": 0.0, "num_input_tokens_seen": 165359776, "step": 76670 }, { "epoch": 14.071389245733162, "grad_norm": 0.0009227341506630182, "learning_rate": 2.4463807353358317e-06, "loss": 0.0, "num_input_tokens_seen": 165371616, "step": 76675 }, { "epoch": 14.072306845292715, "grad_norm": 0.0004890271811746061, "learning_rate": 2.445692321686714e-06, "loss": 0.0, "num_input_tokens_seen": 165383264, "step": 76680 }, { "epoch": 14.073224444852267, "grad_norm": 0.0002739429473876953, "learning_rate": 2.445003973551628e-06, "loss": 0.0, "num_input_tokens_seen": 165393312, "step": 76685 }, { "epoch": 14.074142044411818, "grad_norm": 0.00024931415100581944, "learning_rate": 2.4443156909482318e-06, "loss": 0.0, "num_input_tokens_seen": 165404064, "step": 76690 }, { "epoch": 14.075059643971372, "grad_norm": 0.0021251977887004614, "learning_rate": 2.4436274738941773e-06, "loss": 0.0, "num_input_tokens_seen": 165413664, "step": 76695 }, { "epoch": 14.075977243530923, "grad_norm": 0.0001483107771491632, "learning_rate": 2.442939322407114e-06, "loss": 0.0, "num_input_tokens_seen": 165424704, "step": 76700 }, { "epoch": 14.076894843090475, "grad_norm": 0.0013121600495651364, "learning_rate": 2.4422512365046957e-06, "loss": 0.0, "num_input_tokens_seen": 165435328, "step": 76705 }, { "epoch": 14.077812442650028, "grad_norm": 0.00024607201339676976, "learning_rate": 2.4415632162045695e-06, "loss": 0.0, "num_input_tokens_seen": 165445984, "step": 76710 }, { "epoch": 14.07873004220958, "grad_norm": 0.00013248741743154824, "learning_rate": 2.4408752615243796e-06, "loss": 0.0, "num_input_tokens_seen": 165457632, "step": 76715 }, { "epoch": 14.079647641769132, "grad_norm": 0.0011037063086405396, "learning_rate": 2.440187372481775e-06, "loss": 0.0, "num_input_tokens_seen": 165468224, "step": 76720 }, { "epoch": 14.080565241328685, "grad_norm": 0.000576032092794776, "learning_rate": 2.439499549094397e-06, "loss": 0.0, "num_input_tokens_seen": 165479264, "step": 76725 }, { "epoch": 14.081482840888237, "grad_norm": 0.0004277914995327592, "learning_rate": 2.4388117913798866e-06, "loss": 0.0, "num_input_tokens_seen": 165490752, "step": 76730 }, { "epoch": 14.082400440447788, "grad_norm": 0.0009018063428811729, "learning_rate": 2.4381240993558824e-06, "loss": 0.0, "num_input_tokens_seen": 165501024, "step": 76735 }, { "epoch": 14.083318040007342, "grad_norm": 0.0032547798473387957, "learning_rate": 2.4374364730400268e-06, "loss": 0.0, "num_input_tokens_seen": 165512480, "step": 76740 }, { "epoch": 14.084235639566893, "grad_norm": 0.00031237880466505885, "learning_rate": 2.4367489124499544e-06, "loss": 0.0, "num_input_tokens_seen": 165523328, "step": 76745 }, { "epoch": 14.085153239126445, "grad_norm": 0.011747317388653755, "learning_rate": 2.436061417603297e-06, "loss": 0.0, "num_input_tokens_seen": 165534976, "step": 76750 }, { "epoch": 14.086070838685998, "grad_norm": 0.00044234885717742145, "learning_rate": 2.435373988517693e-06, "loss": 0.0003, "num_input_tokens_seen": 165546496, "step": 76755 }, { "epoch": 14.08698843824555, "grad_norm": 0.00017745840887073427, "learning_rate": 2.434686625210771e-06, "loss": 0.0, "num_input_tokens_seen": 165557344, "step": 76760 }, { "epoch": 14.087906037805102, "grad_norm": 0.014742636121809483, "learning_rate": 2.4339993277001597e-06, "loss": 0.0, "num_input_tokens_seen": 165568096, "step": 76765 }, { "epoch": 14.088823637364655, "grad_norm": 0.00018200372869614512, "learning_rate": 2.43331209600349e-06, "loss": 0.0, "num_input_tokens_seen": 165577920, "step": 76770 }, { "epoch": 14.089741236924207, "grad_norm": 0.00043111032573506236, "learning_rate": 2.4326249301383876e-06, "loss": 0.0, "num_input_tokens_seen": 165588288, "step": 76775 }, { "epoch": 14.090658836483758, "grad_norm": 168.66456604003906, "learning_rate": 2.431937830122476e-06, "loss": 0.0264, "num_input_tokens_seen": 165598464, "step": 76780 }, { "epoch": 14.091576436043312, "grad_norm": 0.00020071770995855331, "learning_rate": 2.431250795973378e-06, "loss": 0.0, "num_input_tokens_seen": 165609472, "step": 76785 }, { "epoch": 14.092494035602863, "grad_norm": 0.0053346590138971806, "learning_rate": 2.430563827708717e-06, "loss": 0.0078, "num_input_tokens_seen": 165619648, "step": 76790 }, { "epoch": 14.093411635162415, "grad_norm": 0.37909236550331116, "learning_rate": 2.429876925346112e-06, "loss": 0.0001, "num_input_tokens_seen": 165629440, "step": 76795 }, { "epoch": 14.094329234721968, "grad_norm": 0.00029982064734213054, "learning_rate": 2.429190088903178e-06, "loss": 0.0, "num_input_tokens_seen": 165639680, "step": 76800 }, { "epoch": 14.09524683428152, "grad_norm": 0.005970670375972986, "learning_rate": 2.4285033183975364e-06, "loss": 0.0, "num_input_tokens_seen": 165650368, "step": 76805 }, { "epoch": 14.096164433841071, "grad_norm": 0.0013193573104217649, "learning_rate": 2.427816613846799e-06, "loss": 0.0082, "num_input_tokens_seen": 165661856, "step": 76810 }, { "epoch": 14.097082033400625, "grad_norm": 0.00012828891340177506, "learning_rate": 2.427129975268579e-06, "loss": 0.004, "num_input_tokens_seen": 165673056, "step": 76815 }, { "epoch": 14.097999632960176, "grad_norm": 0.00018703496607486159, "learning_rate": 2.426443402680487e-06, "loss": 0.0, "num_input_tokens_seen": 165683840, "step": 76820 }, { "epoch": 14.098917232519728, "grad_norm": 0.00012084706395398825, "learning_rate": 2.4257568961001316e-06, "loss": 0.0, "num_input_tokens_seen": 165695168, "step": 76825 }, { "epoch": 14.099834832079281, "grad_norm": 0.000609980255831033, "learning_rate": 2.4250704555451245e-06, "loss": 0.0, "num_input_tokens_seen": 165706048, "step": 76830 }, { "epoch": 14.100752431638833, "grad_norm": 0.0012862836010754108, "learning_rate": 2.424384081033069e-06, "loss": 0.0, "num_input_tokens_seen": 165714592, "step": 76835 }, { "epoch": 14.101670031198385, "grad_norm": 0.07329823821783066, "learning_rate": 2.4236977725815696e-06, "loss": 0.0001, "num_input_tokens_seen": 165724864, "step": 76840 }, { "epoch": 14.102587630757938, "grad_norm": 0.0009456327534280717, "learning_rate": 2.4230115302082295e-06, "loss": 0.0, "num_input_tokens_seen": 165735392, "step": 76845 }, { "epoch": 14.10350523031749, "grad_norm": 46.94102478027344, "learning_rate": 2.4223253539306487e-06, "loss": 0.0144, "num_input_tokens_seen": 165745600, "step": 76850 }, { "epoch": 14.104422829877041, "grad_norm": 0.00032014897442422807, "learning_rate": 2.4216392437664284e-06, "loss": 0.0, "num_input_tokens_seen": 165755744, "step": 76855 }, { "epoch": 14.105340429436595, "grad_norm": 0.0002288944087922573, "learning_rate": 2.420953199733166e-06, "loss": 0.1005, "num_input_tokens_seen": 165766528, "step": 76860 }, { "epoch": 14.106258028996146, "grad_norm": 0.005050556734204292, "learning_rate": 2.4202672218484563e-06, "loss": 0.0, "num_input_tokens_seen": 165776384, "step": 76865 }, { "epoch": 14.107175628555698, "grad_norm": 0.00024728989228606224, "learning_rate": 2.4195813101298928e-06, "loss": 0.0, "num_input_tokens_seen": 165787776, "step": 76870 }, { "epoch": 14.108093228115251, "grad_norm": 0.0004928613197989762, "learning_rate": 2.4188954645950715e-06, "loss": 0.0, "num_input_tokens_seen": 165797952, "step": 76875 }, { "epoch": 14.109010827674803, "grad_norm": 0.0006658806232735515, "learning_rate": 2.4182096852615806e-06, "loss": 0.0, "num_input_tokens_seen": 165809952, "step": 76880 }, { "epoch": 14.109928427234355, "grad_norm": 0.0006912517128512263, "learning_rate": 2.417523972147008e-06, "loss": 0.0, "num_input_tokens_seen": 165821088, "step": 76885 }, { "epoch": 14.110846026793908, "grad_norm": 0.00010723964078351855, "learning_rate": 2.4168383252689447e-06, "loss": 0.0, "num_input_tokens_seen": 165832448, "step": 76890 }, { "epoch": 14.11176362635346, "grad_norm": 0.0008839343790896237, "learning_rate": 2.4161527446449757e-06, "loss": 0.0001, "num_input_tokens_seen": 165844352, "step": 76895 }, { "epoch": 14.112681225913011, "grad_norm": 0.04858752340078354, "learning_rate": 2.415467230292681e-06, "loss": 0.0, "num_input_tokens_seen": 165853856, "step": 76900 }, { "epoch": 14.113598825472565, "grad_norm": 0.0021523023024201393, "learning_rate": 2.414781782229649e-06, "loss": 0.0, "num_input_tokens_seen": 165864672, "step": 76905 }, { "epoch": 14.114516425032116, "grad_norm": 0.012904506176710129, "learning_rate": 2.414096400473458e-06, "loss": 0.0, "num_input_tokens_seen": 165875808, "step": 76910 }, { "epoch": 14.115434024591668, "grad_norm": 0.0002869867894332856, "learning_rate": 2.413411085041686e-06, "loss": 0.0, "num_input_tokens_seen": 165886496, "step": 76915 }, { "epoch": 14.116351624151221, "grad_norm": 0.009415250271558762, "learning_rate": 2.4127258359519083e-06, "loss": 0.0002, "num_input_tokens_seen": 165897568, "step": 76920 }, { "epoch": 14.117269223710773, "grad_norm": 0.0017058118246495724, "learning_rate": 2.412040653221706e-06, "loss": 0.0, "num_input_tokens_seen": 165909024, "step": 76925 }, { "epoch": 14.118186823270324, "grad_norm": 0.0009151251870207489, "learning_rate": 2.411355536868649e-06, "loss": 0.0, "num_input_tokens_seen": 165920032, "step": 76930 }, { "epoch": 14.119104422829878, "grad_norm": 0.0007831567781977355, "learning_rate": 2.410670486910309e-06, "loss": 0.0, "num_input_tokens_seen": 165931424, "step": 76935 }, { "epoch": 14.12002202238943, "grad_norm": 0.000891899864654988, "learning_rate": 2.40998550336426e-06, "loss": 0.0, "num_input_tokens_seen": 165941792, "step": 76940 }, { "epoch": 14.120939621948981, "grad_norm": 0.0019574305042624474, "learning_rate": 2.409300586248069e-06, "loss": 0.0, "num_input_tokens_seen": 165952512, "step": 76945 }, { "epoch": 14.121857221508535, "grad_norm": 0.00011060161341447383, "learning_rate": 2.408615735579302e-06, "loss": 0.0, "num_input_tokens_seen": 165963200, "step": 76950 }, { "epoch": 14.122774821068086, "grad_norm": 0.0005821256781928241, "learning_rate": 2.407930951375523e-06, "loss": 0.0, "num_input_tokens_seen": 165974784, "step": 76955 }, { "epoch": 14.123692420627638, "grad_norm": 0.00023404551029670984, "learning_rate": 2.4072462336543007e-06, "loss": 0.0, "num_input_tokens_seen": 165986080, "step": 76960 }, { "epoch": 14.124610020187191, "grad_norm": 0.0016297785332426429, "learning_rate": 2.4065615824331936e-06, "loss": 0.0, "num_input_tokens_seen": 165997088, "step": 76965 }, { "epoch": 14.125527619746743, "grad_norm": 0.00035483395913615823, "learning_rate": 2.4058769977297604e-06, "loss": 0.0, "num_input_tokens_seen": 166007552, "step": 76970 }, { "epoch": 14.126445219306294, "grad_norm": 0.0005986317992210388, "learning_rate": 2.405192479561563e-06, "loss": 0.0, "num_input_tokens_seen": 166018976, "step": 76975 }, { "epoch": 14.127362818865848, "grad_norm": 0.0003452564124017954, "learning_rate": 2.404508027946158e-06, "loss": 0.0131, "num_input_tokens_seen": 166028608, "step": 76980 }, { "epoch": 14.1282804184254, "grad_norm": 0.0003233450115658343, "learning_rate": 2.403823642901097e-06, "loss": 0.0, "num_input_tokens_seen": 166039968, "step": 76985 }, { "epoch": 14.129198017984951, "grad_norm": 0.000381540710804984, "learning_rate": 2.403139324443938e-06, "loss": 0.0, "num_input_tokens_seen": 166051104, "step": 76990 }, { "epoch": 14.130115617544504, "grad_norm": 0.20461907982826233, "learning_rate": 2.40245507259223e-06, "loss": 0.0001, "num_input_tokens_seen": 166062560, "step": 76995 }, { "epoch": 14.131033217104056, "grad_norm": 0.00499329250305891, "learning_rate": 2.401770887363524e-06, "loss": 0.0, "num_input_tokens_seen": 166072896, "step": 77000 }, { "epoch": 14.131950816663608, "grad_norm": 0.0007012530695647001, "learning_rate": 2.401086768775366e-06, "loss": 0.0, "num_input_tokens_seen": 166084320, "step": 77005 }, { "epoch": 14.132868416223161, "grad_norm": 0.00544271944090724, "learning_rate": 2.4004027168453063e-06, "loss": 0.0, "num_input_tokens_seen": 166095936, "step": 77010 }, { "epoch": 14.133786015782713, "grad_norm": 0.00042828585719689727, "learning_rate": 2.3997187315908876e-06, "loss": 0.0001, "num_input_tokens_seen": 166106368, "step": 77015 }, { "epoch": 14.134703615342264, "grad_norm": 0.00018785403517540544, "learning_rate": 2.399034813029652e-06, "loss": 0.0, "num_input_tokens_seen": 166117088, "step": 77020 }, { "epoch": 14.135621214901818, "grad_norm": 1.3027896881103516, "learning_rate": 2.3983509611791437e-06, "loss": 0.0002, "num_input_tokens_seen": 166127744, "step": 77025 }, { "epoch": 14.13653881446137, "grad_norm": 0.00020415215112734586, "learning_rate": 2.3976671760569014e-06, "loss": 0.0, "num_input_tokens_seen": 166138912, "step": 77030 }, { "epoch": 14.137456414020921, "grad_norm": 0.042994286864995956, "learning_rate": 2.3969834576804623e-06, "loss": 0.0001, "num_input_tokens_seen": 166149664, "step": 77035 }, { "epoch": 14.138374013580474, "grad_norm": 0.0025497928727418184, "learning_rate": 2.396299806067364e-06, "loss": 0.0, "num_input_tokens_seen": 166160000, "step": 77040 }, { "epoch": 14.139291613140026, "grad_norm": 0.00036111942608840764, "learning_rate": 2.395616221235138e-06, "loss": 0.0, "num_input_tokens_seen": 166169536, "step": 77045 }, { "epoch": 14.140209212699578, "grad_norm": 0.0002729441039264202, "learning_rate": 2.3949327032013214e-06, "loss": 0.0, "num_input_tokens_seen": 166180864, "step": 77050 }, { "epoch": 14.141126812259131, "grad_norm": 0.0011832326417788863, "learning_rate": 2.3942492519834433e-06, "loss": 0.0, "num_input_tokens_seen": 166192128, "step": 77055 }, { "epoch": 14.142044411818683, "grad_norm": 0.00019213283667340875, "learning_rate": 2.393565867599033e-06, "loss": 0.0, "num_input_tokens_seen": 166202944, "step": 77060 }, { "epoch": 14.142962011378234, "grad_norm": 6.186504364013672, "learning_rate": 2.3928825500656192e-06, "loss": 0.0025, "num_input_tokens_seen": 166214368, "step": 77065 }, { "epoch": 14.143879610937788, "grad_norm": 0.07449424266815186, "learning_rate": 2.392199299400725e-06, "loss": 0.0001, "num_input_tokens_seen": 166224192, "step": 77070 }, { "epoch": 14.14479721049734, "grad_norm": 0.006366999354213476, "learning_rate": 2.3915161156218797e-06, "loss": 0.0, "num_input_tokens_seen": 166235360, "step": 77075 }, { "epoch": 14.14571481005689, "grad_norm": 0.0012350697070360184, "learning_rate": 2.390832998746603e-06, "loss": 0.0001, "num_input_tokens_seen": 166246144, "step": 77080 }, { "epoch": 14.146632409616444, "grad_norm": 0.013057304546236992, "learning_rate": 2.3901499487924155e-06, "loss": 0.0, "num_input_tokens_seen": 166255712, "step": 77085 }, { "epoch": 14.147550009175996, "grad_norm": 0.0023574333172291517, "learning_rate": 2.3894669657768356e-06, "loss": 0.0001, "num_input_tokens_seen": 166267008, "step": 77090 }, { "epoch": 14.148467608735547, "grad_norm": 0.00016398169100284576, "learning_rate": 2.3887840497173835e-06, "loss": 0.0, "num_input_tokens_seen": 166277856, "step": 77095 }, { "epoch": 14.1493852082951, "grad_norm": 0.000373669812688604, "learning_rate": 2.3881012006315734e-06, "loss": 0.0, "num_input_tokens_seen": 166288608, "step": 77100 }, { "epoch": 14.150302807854652, "grad_norm": 0.0015586822992190719, "learning_rate": 2.387418418536918e-06, "loss": 0.0001, "num_input_tokens_seen": 166298912, "step": 77105 }, { "epoch": 14.151220407414204, "grad_norm": 0.00024984305491670966, "learning_rate": 2.386735703450933e-06, "loss": 0.0, "num_input_tokens_seen": 166310080, "step": 77110 }, { "epoch": 14.152138006973757, "grad_norm": 0.0018475037068128586, "learning_rate": 2.3860530553911263e-06, "loss": 0.0, "num_input_tokens_seen": 166322336, "step": 77115 }, { "epoch": 14.153055606533309, "grad_norm": 0.0002220296737505123, "learning_rate": 2.385370474375007e-06, "loss": 0.0078, "num_input_tokens_seen": 166333472, "step": 77120 }, { "epoch": 14.15397320609286, "grad_norm": 0.00011206116323592141, "learning_rate": 2.3846879604200828e-06, "loss": 0.0, "num_input_tokens_seen": 166343232, "step": 77125 }, { "epoch": 14.154890805652414, "grad_norm": 0.0037230271846055984, "learning_rate": 2.38400551354386e-06, "loss": 0.0, "num_input_tokens_seen": 166354400, "step": 77130 }, { "epoch": 14.155808405211966, "grad_norm": 0.0010035751620307565, "learning_rate": 2.3833231337638413e-06, "loss": 0.0, "num_input_tokens_seen": 166365504, "step": 77135 }, { "epoch": 14.156726004771517, "grad_norm": 0.0001580521056894213, "learning_rate": 2.382640821097527e-06, "loss": 0.0001, "num_input_tokens_seen": 166377504, "step": 77140 }, { "epoch": 14.15764360433107, "grad_norm": 0.025351209565997124, "learning_rate": 2.381958575562421e-06, "loss": 0.0, "num_input_tokens_seen": 166386976, "step": 77145 }, { "epoch": 14.158561203890622, "grad_norm": 0.013965587131679058, "learning_rate": 2.3812763971760196e-06, "loss": 0.0, "num_input_tokens_seen": 166397184, "step": 77150 }, { "epoch": 14.159478803450174, "grad_norm": 0.0006347520975396037, "learning_rate": 2.3805942859558183e-06, "loss": 0.0, "num_input_tokens_seen": 166408160, "step": 77155 }, { "epoch": 14.160396403009727, "grad_norm": 0.002712325192987919, "learning_rate": 2.3799122419193155e-06, "loss": 0.0001, "num_input_tokens_seen": 166419360, "step": 77160 }, { "epoch": 14.161314002569279, "grad_norm": 0.03806911036372185, "learning_rate": 2.3792302650840032e-06, "loss": 0.0, "num_input_tokens_seen": 166428480, "step": 77165 }, { "epoch": 14.16223160212883, "grad_norm": 0.0005469050374813378, "learning_rate": 2.3785483554673707e-06, "loss": 0.0, "num_input_tokens_seen": 166439392, "step": 77170 }, { "epoch": 14.163149201688384, "grad_norm": 0.00022239497047849, "learning_rate": 2.377866513086912e-06, "loss": 0.0, "num_input_tokens_seen": 166450784, "step": 77175 }, { "epoch": 14.164066801247936, "grad_norm": 0.00022402932518161833, "learning_rate": 2.377184737960113e-06, "loss": 0.0, "num_input_tokens_seen": 166461824, "step": 77180 }, { "epoch": 14.164984400807487, "grad_norm": 0.009755020029842854, "learning_rate": 2.376503030104461e-06, "loss": 0.0, "num_input_tokens_seen": 166472128, "step": 77185 }, { "epoch": 14.16590200036704, "grad_norm": 0.0001337893190793693, "learning_rate": 2.3758213895374383e-06, "loss": 0.0, "num_input_tokens_seen": 166482496, "step": 77190 }, { "epoch": 14.166819599926592, "grad_norm": 2.169645309448242, "learning_rate": 2.375139816276531e-06, "loss": 0.0003, "num_input_tokens_seen": 166493344, "step": 77195 }, { "epoch": 14.167737199486144, "grad_norm": 0.0010775229893624783, "learning_rate": 2.37445831033922e-06, "loss": 0.0, "num_input_tokens_seen": 166505184, "step": 77200 }, { "epoch": 14.168654799045697, "grad_norm": 0.00011029876623069867, "learning_rate": 2.3737768717429823e-06, "loss": 0.0, "num_input_tokens_seen": 166515520, "step": 77205 }, { "epoch": 14.169572398605249, "grad_norm": 0.0003866110055241734, "learning_rate": 2.373095500505299e-06, "loss": 0.0002, "num_input_tokens_seen": 166526240, "step": 77210 }, { "epoch": 14.1704899981648, "grad_norm": 0.0001689133350737393, "learning_rate": 2.372414196643645e-06, "loss": 0.0, "num_input_tokens_seen": 166537568, "step": 77215 }, { "epoch": 14.171407597724354, "grad_norm": 0.00013832013064529747, "learning_rate": 2.3717329601754923e-06, "loss": 0.0, "num_input_tokens_seen": 166547840, "step": 77220 }, { "epoch": 14.172325197283905, "grad_norm": 0.00024344857956748456, "learning_rate": 2.371051791118318e-06, "loss": 0.0, "num_input_tokens_seen": 166559968, "step": 77225 }, { "epoch": 14.173242796843457, "grad_norm": 0.5006993412971497, "learning_rate": 2.3703706894895906e-06, "loss": 0.0015, "num_input_tokens_seen": 166570944, "step": 77230 }, { "epoch": 14.17416039640301, "grad_norm": 0.0007595432689413428, "learning_rate": 2.3696896553067795e-06, "loss": 0.0, "num_input_tokens_seen": 166582176, "step": 77235 }, { "epoch": 14.175077995962562, "grad_norm": 0.002419531811028719, "learning_rate": 2.36900868858735e-06, "loss": 0.0, "num_input_tokens_seen": 166593600, "step": 77240 }, { "epoch": 14.175995595522114, "grad_norm": 0.00034421388409100473, "learning_rate": 2.3683277893487723e-06, "loss": 0.0004, "num_input_tokens_seen": 166605280, "step": 77245 }, { "epoch": 14.176913195081667, "grad_norm": 0.0025433609262108803, "learning_rate": 2.3676469576085076e-06, "loss": 0.0, "num_input_tokens_seen": 166615872, "step": 77250 }, { "epoch": 14.177830794641219, "grad_norm": 0.00019105273531749845, "learning_rate": 2.366966193384019e-06, "loss": 0.0131, "num_input_tokens_seen": 166625920, "step": 77255 }, { "epoch": 14.17874839420077, "grad_norm": 0.012206773273646832, "learning_rate": 2.3662854966927662e-06, "loss": 0.0, "num_input_tokens_seen": 166635808, "step": 77260 }, { "epoch": 14.179665993760324, "grad_norm": 0.00010669480252545327, "learning_rate": 2.3656048675522094e-06, "loss": 0.0, "num_input_tokens_seen": 166645728, "step": 77265 }, { "epoch": 14.180583593319875, "grad_norm": 198.39280700683594, "learning_rate": 2.364924305979802e-06, "loss": 0.2813, "num_input_tokens_seen": 166656000, "step": 77270 }, { "epoch": 14.181501192879427, "grad_norm": 0.000250770099228248, "learning_rate": 2.3642438119930046e-06, "loss": 0.0, "num_input_tokens_seen": 166667744, "step": 77275 }, { "epoch": 14.18241879243898, "grad_norm": 0.000379076023818925, "learning_rate": 2.3635633856092684e-06, "loss": 0.0, "num_input_tokens_seen": 166678240, "step": 77280 }, { "epoch": 14.183336391998532, "grad_norm": 0.00040904051274992526, "learning_rate": 2.3628830268460452e-06, "loss": 0.0, "num_input_tokens_seen": 166689696, "step": 77285 }, { "epoch": 14.184253991558084, "grad_norm": 0.01728394255042076, "learning_rate": 2.3622027357207826e-06, "loss": 0.0, "num_input_tokens_seen": 166701056, "step": 77290 }, { "epoch": 14.185171591117637, "grad_norm": 0.00043541518971323967, "learning_rate": 2.3615225122509345e-06, "loss": 0.0, "num_input_tokens_seen": 166712864, "step": 77295 }, { "epoch": 14.186089190677189, "grad_norm": 0.0002213485713582486, "learning_rate": 2.360842356453944e-06, "loss": 0.0, "num_input_tokens_seen": 166724000, "step": 77300 }, { "epoch": 14.18700679023674, "grad_norm": 0.0001551120076328516, "learning_rate": 2.3601622683472553e-06, "loss": 0.0, "num_input_tokens_seen": 166735264, "step": 77305 }, { "epoch": 14.187924389796294, "grad_norm": 0.0002128092892235145, "learning_rate": 2.359482247948315e-06, "loss": 0.0, "num_input_tokens_seen": 166743904, "step": 77310 }, { "epoch": 14.188841989355845, "grad_norm": 0.00027302931994199753, "learning_rate": 2.358802295274562e-06, "loss": 0.0, "num_input_tokens_seen": 166754336, "step": 77315 }, { "epoch": 14.189759588915397, "grad_norm": 0.000232212376431562, "learning_rate": 2.3581224103434377e-06, "loss": 0.0, "num_input_tokens_seen": 166764768, "step": 77320 }, { "epoch": 14.19067718847495, "grad_norm": 0.0003675605694297701, "learning_rate": 2.357442593172376e-06, "loss": 0.0, "num_input_tokens_seen": 166775232, "step": 77325 }, { "epoch": 14.191594788034502, "grad_norm": 0.0008461938123218715, "learning_rate": 2.356762843778819e-06, "loss": 0.0, "num_input_tokens_seen": 166786752, "step": 77330 }, { "epoch": 14.192512387594054, "grad_norm": 0.0005859449156560004, "learning_rate": 2.3560831621801977e-06, "loss": 0.0001, "num_input_tokens_seen": 166797952, "step": 77335 }, { "epoch": 14.193429987153607, "grad_norm": 0.01844685710966587, "learning_rate": 2.3554035483939437e-06, "loss": 0.0001, "num_input_tokens_seen": 166809248, "step": 77340 }, { "epoch": 14.194347586713159, "grad_norm": 0.0002295779122505337, "learning_rate": 2.3547240024374922e-06, "loss": 0.0, "num_input_tokens_seen": 166820640, "step": 77345 }, { "epoch": 14.19526518627271, "grad_norm": 0.006550342310220003, "learning_rate": 2.35404452432827e-06, "loss": 0.0, "num_input_tokens_seen": 166832416, "step": 77350 }, { "epoch": 14.196182785832264, "grad_norm": 0.017081525176763535, "learning_rate": 2.3533651140837034e-06, "loss": 0.0, "num_input_tokens_seen": 166842592, "step": 77355 }, { "epoch": 14.197100385391815, "grad_norm": 0.00018687000556383282, "learning_rate": 2.352685771721221e-06, "loss": 0.0, "num_input_tokens_seen": 166853824, "step": 77360 }, { "epoch": 14.198017984951367, "grad_norm": 0.0003379150293767452, "learning_rate": 2.3520064972582458e-06, "loss": 0.0, "num_input_tokens_seen": 166864384, "step": 77365 }, { "epoch": 14.19893558451092, "grad_norm": 0.00022009668464306742, "learning_rate": 2.3513272907122005e-06, "loss": 0.0004, "num_input_tokens_seen": 166876608, "step": 77370 }, { "epoch": 14.199853184070472, "grad_norm": 0.012716555036604404, "learning_rate": 2.3506481521005028e-06, "loss": 0.0, "num_input_tokens_seen": 166885696, "step": 77375 }, { "epoch": 14.200770783630023, "grad_norm": 0.00021941830345895141, "learning_rate": 2.349969081440575e-06, "loss": 0.0, "num_input_tokens_seen": 166895872, "step": 77380 }, { "epoch": 14.201688383189577, "grad_norm": 0.0012182643404230475, "learning_rate": 2.349290078749834e-06, "loss": 0.0, "num_input_tokens_seen": 166906848, "step": 77385 }, { "epoch": 14.202605982749128, "grad_norm": 0.00014078831009101123, "learning_rate": 2.348611144045692e-06, "loss": 0.0, "num_input_tokens_seen": 166918656, "step": 77390 }, { "epoch": 14.20352358230868, "grad_norm": 0.015359017066657543, "learning_rate": 2.347932277345567e-06, "loss": 0.0, "num_input_tokens_seen": 166929376, "step": 77395 }, { "epoch": 14.204441181868233, "grad_norm": 0.005536680109798908, "learning_rate": 2.347253478666869e-06, "loss": 0.0, "num_input_tokens_seen": 166940096, "step": 77400 }, { "epoch": 14.205358781427785, "grad_norm": 0.00015821219130884856, "learning_rate": 2.3465747480270072e-06, "loss": 0.0, "num_input_tokens_seen": 166951616, "step": 77405 }, { "epoch": 14.206276380987337, "grad_norm": 0.0009561188635416329, "learning_rate": 2.3458960854433895e-06, "loss": 0.0, "num_input_tokens_seen": 166960896, "step": 77410 }, { "epoch": 14.20719398054689, "grad_norm": 0.0008464340353384614, "learning_rate": 2.3452174909334254e-06, "loss": 0.0, "num_input_tokens_seen": 166972128, "step": 77415 }, { "epoch": 14.208111580106442, "grad_norm": 0.011064102873206139, "learning_rate": 2.344538964514518e-06, "loss": 0.0, "num_input_tokens_seen": 166983008, "step": 77420 }, { "epoch": 14.209029179665993, "grad_norm": 0.0001760333398124203, "learning_rate": 2.3438605062040687e-06, "loss": 0.2125, "num_input_tokens_seen": 166993248, "step": 77425 }, { "epoch": 14.209946779225547, "grad_norm": 0.009867300279438496, "learning_rate": 2.3431821160194824e-06, "loss": 0.0, "num_input_tokens_seen": 167005248, "step": 77430 }, { "epoch": 14.210864378785098, "grad_norm": 0.0016541712684556842, "learning_rate": 2.3425037939781576e-06, "loss": 0.0, "num_input_tokens_seen": 167014656, "step": 77435 }, { "epoch": 14.21178197834465, "grad_norm": 0.003303976496681571, "learning_rate": 2.3418255400974893e-06, "loss": 0.0, "num_input_tokens_seen": 167025376, "step": 77440 }, { "epoch": 14.212699577904203, "grad_norm": 0.00015507776697631925, "learning_rate": 2.3411473543948787e-06, "loss": 0.0001, "num_input_tokens_seen": 167035776, "step": 77445 }, { "epoch": 14.213617177463755, "grad_norm": 0.0008237289148382843, "learning_rate": 2.340469236887717e-06, "loss": 0.0, "num_input_tokens_seen": 167046624, "step": 77450 }, { "epoch": 14.214534777023307, "grad_norm": 0.0004040111671201885, "learning_rate": 2.3397911875933978e-06, "loss": 0.0, "num_input_tokens_seen": 167057152, "step": 77455 }, { "epoch": 14.21545237658286, "grad_norm": 0.00021711642330046743, "learning_rate": 2.3391132065293113e-06, "loss": 0.0, "num_input_tokens_seen": 167067968, "step": 77460 }, { "epoch": 14.216369976142412, "grad_norm": 0.00016846905054990202, "learning_rate": 2.338435293712846e-06, "loss": 0.0, "num_input_tokens_seen": 167079104, "step": 77465 }, { "epoch": 14.217287575701963, "grad_norm": 0.0011157792760059237, "learning_rate": 2.3377574491613915e-06, "loss": 0.0, "num_input_tokens_seen": 167090784, "step": 77470 }, { "epoch": 14.218205175261517, "grad_norm": 0.0022356633562594652, "learning_rate": 2.3370796728923323e-06, "loss": 0.0, "num_input_tokens_seen": 167102496, "step": 77475 }, { "epoch": 14.219122774821068, "grad_norm": 0.002558192005380988, "learning_rate": 2.3364019649230526e-06, "loss": 0.0, "num_input_tokens_seen": 167112192, "step": 77480 }, { "epoch": 14.22004037438062, "grad_norm": 0.0012606515083462, "learning_rate": 2.3357243252709345e-06, "loss": 0.0, "num_input_tokens_seen": 167123200, "step": 77485 }, { "epoch": 14.220957973940173, "grad_norm": 0.0002687499509193003, "learning_rate": 2.3350467539533557e-06, "loss": 0.0, "num_input_tokens_seen": 167132576, "step": 77490 }, { "epoch": 14.221875573499725, "grad_norm": 0.00035292928805574775, "learning_rate": 2.3343692509877e-06, "loss": 0.0011, "num_input_tokens_seen": 167144096, "step": 77495 }, { "epoch": 14.222793173059276, "grad_norm": 0.0008438330260105431, "learning_rate": 2.333691816391341e-06, "loss": 0.0, "num_input_tokens_seen": 167155232, "step": 77500 }, { "epoch": 14.22371077261883, "grad_norm": 0.0004977012285962701, "learning_rate": 2.3330144501816547e-06, "loss": 0.0001, "num_input_tokens_seen": 167167328, "step": 77505 }, { "epoch": 14.224628372178381, "grad_norm": 0.0671195238828659, "learning_rate": 2.3323371523760125e-06, "loss": 0.0, "num_input_tokens_seen": 167178368, "step": 77510 }, { "epoch": 14.225545971737933, "grad_norm": 0.0007688554469496012, "learning_rate": 2.3316599229917898e-06, "loss": 0.0, "num_input_tokens_seen": 167188544, "step": 77515 }, { "epoch": 14.226463571297487, "grad_norm": 0.0036399229429662228, "learning_rate": 2.3309827620463545e-06, "loss": 0.0822, "num_input_tokens_seen": 167200288, "step": 77520 }, { "epoch": 14.227381170857038, "grad_norm": 0.011077816598117352, "learning_rate": 2.330305669557073e-06, "loss": 0.0, "num_input_tokens_seen": 167211040, "step": 77525 }, { "epoch": 14.22829877041659, "grad_norm": 0.014982087537646294, "learning_rate": 2.3296286455413148e-06, "loss": 0.0, "num_input_tokens_seen": 167220640, "step": 77530 }, { "epoch": 14.229216369976143, "grad_norm": 0.015032616443932056, "learning_rate": 2.328951690016444e-06, "loss": 0.0, "num_input_tokens_seen": 167231808, "step": 77535 }, { "epoch": 14.230133969535695, "grad_norm": 0.00023023174435365945, "learning_rate": 2.3282748029998213e-06, "loss": 0.0001, "num_input_tokens_seen": 167242752, "step": 77540 }, { "epoch": 14.231051569095246, "grad_norm": 0.003344785887748003, "learning_rate": 2.3275979845088083e-06, "loss": 0.0, "num_input_tokens_seen": 167253856, "step": 77545 }, { "epoch": 14.2319691686548, "grad_norm": 0.061861392110586166, "learning_rate": 2.3269212345607666e-06, "loss": 0.0, "num_input_tokens_seen": 167264288, "step": 77550 }, { "epoch": 14.232886768214351, "grad_norm": 0.00012325766147114336, "learning_rate": 2.3262445531730526e-06, "loss": 0.0, "num_input_tokens_seen": 167275584, "step": 77555 }, { "epoch": 14.233804367773903, "grad_norm": 0.0002449967432767153, "learning_rate": 2.32556794036302e-06, "loss": 0.0, "num_input_tokens_seen": 167287104, "step": 77560 }, { "epoch": 14.234721967333456, "grad_norm": 0.19150102138519287, "learning_rate": 2.3248913961480263e-06, "loss": 0.0001, "num_input_tokens_seen": 167298176, "step": 77565 }, { "epoch": 14.235639566893008, "grad_norm": 0.0009279934456571937, "learning_rate": 2.3242149205454223e-06, "loss": 0.0, "num_input_tokens_seen": 167308928, "step": 77570 }, { "epoch": 14.23655716645256, "grad_norm": 0.00018837617244571447, "learning_rate": 2.3235385135725567e-06, "loss": 0.0, "num_input_tokens_seen": 167320224, "step": 77575 }, { "epoch": 14.237474766012113, "grad_norm": 0.0002432834153296426, "learning_rate": 2.322862175246782e-06, "loss": 0.0, "num_input_tokens_seen": 167331584, "step": 77580 }, { "epoch": 14.238392365571665, "grad_norm": 0.14795421063899994, "learning_rate": 2.3221859055854433e-06, "loss": 0.0001, "num_input_tokens_seen": 167342080, "step": 77585 }, { "epoch": 14.239309965131216, "grad_norm": 0.003492456628009677, "learning_rate": 2.321509704605886e-06, "loss": 0.001, "num_input_tokens_seen": 167352160, "step": 77590 }, { "epoch": 14.24022756469077, "grad_norm": 0.0002799858048092574, "learning_rate": 2.3208335723254518e-06, "loss": 0.0, "num_input_tokens_seen": 167364064, "step": 77595 }, { "epoch": 14.241145164250321, "grad_norm": 0.00022051826817914844, "learning_rate": 2.3201575087614854e-06, "loss": 0.0, "num_input_tokens_seen": 167375104, "step": 77600 }, { "epoch": 14.242062763809873, "grad_norm": 0.0002549534838180989, "learning_rate": 2.319481513931326e-06, "loss": 0.0, "num_input_tokens_seen": 167384800, "step": 77605 }, { "epoch": 14.242980363369426, "grad_norm": 0.0005396444466896355, "learning_rate": 2.3188055878523093e-06, "loss": 0.0, "num_input_tokens_seen": 167395360, "step": 77610 }, { "epoch": 14.243897962928978, "grad_norm": 0.0006513712578453124, "learning_rate": 2.3181297305417753e-06, "loss": 0.0, "num_input_tokens_seen": 167405888, "step": 77615 }, { "epoch": 14.24481556248853, "grad_norm": 9.588171815266833e-05, "learning_rate": 2.317453942017058e-06, "loss": 0.0, "num_input_tokens_seen": 167416448, "step": 77620 }, { "epoch": 14.245733162048083, "grad_norm": 0.0002957340329885483, "learning_rate": 2.316778222295487e-06, "loss": 0.0, "num_input_tokens_seen": 167425312, "step": 77625 }, { "epoch": 14.246650761607635, "grad_norm": 0.3326758146286011, "learning_rate": 2.316102571394398e-06, "loss": 0.0001, "num_input_tokens_seen": 167436928, "step": 77630 }, { "epoch": 14.247568361167186, "grad_norm": 0.0015016699908301234, "learning_rate": 2.3154269893311186e-06, "loss": 0.0002, "num_input_tokens_seen": 167447808, "step": 77635 }, { "epoch": 14.24848596072674, "grad_norm": 0.0812179371714592, "learning_rate": 2.314751476122976e-06, "loss": 0.0001, "num_input_tokens_seen": 167457248, "step": 77640 }, { "epoch": 14.249403560286291, "grad_norm": 0.0002978197589982301, "learning_rate": 2.3140760317872947e-06, "loss": 0.0, "num_input_tokens_seen": 167468032, "step": 77645 }, { "epoch": 14.250321159845843, "grad_norm": 0.00012750203313771635, "learning_rate": 2.3134006563414017e-06, "loss": 0.0, "num_input_tokens_seen": 167479648, "step": 77650 }, { "epoch": 14.251238759405396, "grad_norm": 0.0003165163507219404, "learning_rate": 2.312725349802618e-06, "loss": 0.0, "num_input_tokens_seen": 167490848, "step": 77655 }, { "epoch": 14.252156358964948, "grad_norm": 0.0006921936292201281, "learning_rate": 2.3120501121882634e-06, "loss": 0.0, "num_input_tokens_seen": 167500960, "step": 77660 }, { "epoch": 14.2530739585245, "grad_norm": 0.017015507444739342, "learning_rate": 2.311374943515658e-06, "loss": 0.0001, "num_input_tokens_seen": 167511104, "step": 77665 }, { "epoch": 14.253991558084053, "grad_norm": 0.00038292555836960673, "learning_rate": 2.3106998438021187e-06, "loss": 0.0, "num_input_tokens_seen": 167520544, "step": 77670 }, { "epoch": 14.254909157643604, "grad_norm": 0.002816392807289958, "learning_rate": 2.310024813064961e-06, "loss": 0.0, "num_input_tokens_seen": 167531808, "step": 77675 }, { "epoch": 14.255826757203156, "grad_norm": 0.00012983937631361187, "learning_rate": 2.3093498513214974e-06, "loss": 0.0, "num_input_tokens_seen": 167543456, "step": 77680 }, { "epoch": 14.25674435676271, "grad_norm": 0.0024930983781814575, "learning_rate": 2.3086749585890377e-06, "loss": 0.0, "num_input_tokens_seen": 167554176, "step": 77685 }, { "epoch": 14.257661956322261, "grad_norm": 0.000360259844455868, "learning_rate": 2.3080001348848966e-06, "loss": 0.0, "num_input_tokens_seen": 167564928, "step": 77690 }, { "epoch": 14.258579555881813, "grad_norm": 0.0011942400597035885, "learning_rate": 2.3073253802263794e-06, "loss": 0.0001, "num_input_tokens_seen": 167576416, "step": 77695 }, { "epoch": 14.259497155441366, "grad_norm": 0.0026141901034861803, "learning_rate": 2.306650694630793e-06, "loss": 0.0, "num_input_tokens_seen": 167587296, "step": 77700 }, { "epoch": 14.260414755000918, "grad_norm": 0.0006980804610066116, "learning_rate": 2.3059760781154424e-06, "loss": 0.0, "num_input_tokens_seen": 167596864, "step": 77705 }, { "epoch": 14.26133235456047, "grad_norm": 0.0020019791554659605, "learning_rate": 2.305301530697628e-06, "loss": 0.0, "num_input_tokens_seen": 167606688, "step": 77710 }, { "epoch": 14.262249954120023, "grad_norm": 0.0009334956994280219, "learning_rate": 2.3046270523946545e-06, "loss": 0.0, "num_input_tokens_seen": 167616800, "step": 77715 }, { "epoch": 14.263167553679574, "grad_norm": 7.53910280764103e-05, "learning_rate": 2.3039526432238197e-06, "loss": 0.0, "num_input_tokens_seen": 167627808, "step": 77720 }, { "epoch": 14.264085153239126, "grad_norm": 0.00028799090068787336, "learning_rate": 2.3032783032024208e-06, "loss": 0.0, "num_input_tokens_seen": 167639040, "step": 77725 }, { "epoch": 14.26500275279868, "grad_norm": 0.011039011180400848, "learning_rate": 2.3026040323477528e-06, "loss": 0.0, "num_input_tokens_seen": 167650624, "step": 77730 }, { "epoch": 14.265920352358231, "grad_norm": 0.009314808063209057, "learning_rate": 2.301929830677112e-06, "loss": 0.0, "num_input_tokens_seen": 167661888, "step": 77735 }, { "epoch": 14.266837951917783, "grad_norm": 0.0008691576076671481, "learning_rate": 2.30125569820779e-06, "loss": 0.0, "num_input_tokens_seen": 167670656, "step": 77740 }, { "epoch": 14.267755551477336, "grad_norm": 0.0001666234020376578, "learning_rate": 2.300581634957074e-06, "loss": 0.0, "num_input_tokens_seen": 167678112, "step": 77745 }, { "epoch": 14.268673151036888, "grad_norm": 0.001053196843713522, "learning_rate": 2.2999076409422585e-06, "loss": 0.0, "num_input_tokens_seen": 167688416, "step": 77750 }, { "epoch": 14.26959075059644, "grad_norm": 0.0001360296009806916, "learning_rate": 2.2992337161806262e-06, "loss": 0.0, "num_input_tokens_seen": 167700160, "step": 77755 }, { "epoch": 14.270508350155993, "grad_norm": 0.00022831781825516373, "learning_rate": 2.2985598606894615e-06, "loss": 0.0087, "num_input_tokens_seen": 167710336, "step": 77760 }, { "epoch": 14.271425949715544, "grad_norm": 9.776375372894108e-05, "learning_rate": 2.2978860744860514e-06, "loss": 0.0284, "num_input_tokens_seen": 167722368, "step": 77765 }, { "epoch": 14.272343549275096, "grad_norm": 0.0007222101557999849, "learning_rate": 2.2972123575876757e-06, "loss": 0.0, "num_input_tokens_seen": 167731744, "step": 77770 }, { "epoch": 14.27326114883465, "grad_norm": 0.00041039599454961717, "learning_rate": 2.2965387100116145e-06, "loss": 0.0003, "num_input_tokens_seen": 167743840, "step": 77775 }, { "epoch": 14.2741787483942, "grad_norm": 0.00031785210012458265, "learning_rate": 2.295865131775143e-06, "loss": 0.0, "num_input_tokens_seen": 167755872, "step": 77780 }, { "epoch": 14.275096347953752, "grad_norm": 0.012456696480512619, "learning_rate": 2.2951916228955416e-06, "loss": 0.0, "num_input_tokens_seen": 167766784, "step": 77785 }, { "epoch": 14.276013947513306, "grad_norm": 6.794530054321513e-05, "learning_rate": 2.294518183390083e-06, "loss": 0.0, "num_input_tokens_seen": 167776416, "step": 77790 }, { "epoch": 14.276931547072857, "grad_norm": 0.00010176311479881406, "learning_rate": 2.293844813276039e-06, "loss": 0.0, "num_input_tokens_seen": 167787712, "step": 77795 }, { "epoch": 14.27784914663241, "grad_norm": 6.34004027233459e-05, "learning_rate": 2.293171512570682e-06, "loss": 0.0, "num_input_tokens_seen": 167798176, "step": 77800 }, { "epoch": 14.278766746191963, "grad_norm": 0.00023995808442123234, "learning_rate": 2.292498281291281e-06, "loss": 0.0, "num_input_tokens_seen": 167808224, "step": 77805 }, { "epoch": 14.279684345751514, "grad_norm": 0.00010287114855600521, "learning_rate": 2.291825119455101e-06, "loss": 0.0, "num_input_tokens_seen": 167819168, "step": 77810 }, { "epoch": 14.280601945311066, "grad_norm": 0.0017311503179371357, "learning_rate": 2.2911520270794114e-06, "loss": 0.0173, "num_input_tokens_seen": 167830880, "step": 77815 }, { "epoch": 14.28151954487062, "grad_norm": 0.00038453275919891894, "learning_rate": 2.2904790041814734e-06, "loss": 0.0, "num_input_tokens_seen": 167842656, "step": 77820 }, { "epoch": 14.28243714443017, "grad_norm": 0.011858507990837097, "learning_rate": 2.28980605077855e-06, "loss": 0.0, "num_input_tokens_seen": 167851936, "step": 77825 }, { "epoch": 14.283354743989722, "grad_norm": 0.0011509901378303766, "learning_rate": 2.289133166887899e-06, "loss": 0.0, "num_input_tokens_seen": 167862208, "step": 77830 }, { "epoch": 14.284272343549276, "grad_norm": 0.0004183028358966112, "learning_rate": 2.288460352526783e-06, "loss": 0.0, "num_input_tokens_seen": 167873184, "step": 77835 }, { "epoch": 14.285189943108827, "grad_norm": 0.007480005733668804, "learning_rate": 2.287787607712456e-06, "loss": 0.0, "num_input_tokens_seen": 167884256, "step": 77840 }, { "epoch": 14.286107542668379, "grad_norm": 0.00022333260858431458, "learning_rate": 2.287114932462172e-06, "loss": 0.0, "num_input_tokens_seen": 167895648, "step": 77845 }, { "epoch": 14.287025142227932, "grad_norm": 0.00016828475054353476, "learning_rate": 2.286442326793187e-06, "loss": 0.0, "num_input_tokens_seen": 167906240, "step": 77850 }, { "epoch": 14.287942741787484, "grad_norm": 0.00011641836317721754, "learning_rate": 2.2857697907227504e-06, "loss": 0.0001, "num_input_tokens_seen": 167916864, "step": 77855 }, { "epoch": 14.288860341347036, "grad_norm": 0.0008386684930883348, "learning_rate": 2.285097324268112e-06, "loss": 0.0001, "num_input_tokens_seen": 167927872, "step": 77860 }, { "epoch": 14.289777940906589, "grad_norm": 0.00012841180432587862, "learning_rate": 2.284424927446518e-06, "loss": 0.0, "num_input_tokens_seen": 167937184, "step": 77865 }, { "epoch": 14.29069554046614, "grad_norm": 8.875128696672618e-05, "learning_rate": 2.2837526002752176e-06, "loss": 0.0, "num_input_tokens_seen": 167946784, "step": 77870 }, { "epoch": 14.291613140025692, "grad_norm": 16.829172134399414, "learning_rate": 2.2830803427714533e-06, "loss": 0.0414, "num_input_tokens_seen": 167956832, "step": 77875 }, { "epoch": 14.292530739585246, "grad_norm": 7.66416997066699e-05, "learning_rate": 2.2824081549524654e-06, "loss": 0.0, "num_input_tokens_seen": 167967616, "step": 77880 }, { "epoch": 14.293448339144797, "grad_norm": 0.0001347633369732648, "learning_rate": 2.281736036835498e-06, "loss": 0.0, "num_input_tokens_seen": 167977760, "step": 77885 }, { "epoch": 14.294365938704349, "grad_norm": 0.0005856581847183406, "learning_rate": 2.281063988437789e-06, "loss": 0.0, "num_input_tokens_seen": 167988448, "step": 77890 }, { "epoch": 14.295283538263902, "grad_norm": 0.001701501663774252, "learning_rate": 2.280392009776574e-06, "loss": 0.0, "num_input_tokens_seen": 167998976, "step": 77895 }, { "epoch": 14.296201137823454, "grad_norm": 0.00014397451013792306, "learning_rate": 2.2797201008690893e-06, "loss": 0.0, "num_input_tokens_seen": 168009952, "step": 77900 }, { "epoch": 14.297118737383006, "grad_norm": 0.0002084140869555995, "learning_rate": 2.279048261732566e-06, "loss": 0.0, "num_input_tokens_seen": 168020544, "step": 77905 }, { "epoch": 14.298036336942559, "grad_norm": 8.291862468468025e-05, "learning_rate": 2.27837649238424e-06, "loss": 0.0, "num_input_tokens_seen": 168031552, "step": 77910 }, { "epoch": 14.29895393650211, "grad_norm": 0.00021453166846185923, "learning_rate": 2.277704792841338e-06, "loss": 0.0, "num_input_tokens_seen": 168041920, "step": 77915 }, { "epoch": 14.299871536061662, "grad_norm": 0.06589197367429733, "learning_rate": 2.2770331631210894e-06, "loss": 0.0001, "num_input_tokens_seen": 168053344, "step": 77920 }, { "epoch": 14.300789135621216, "grad_norm": 0.00047537259524688125, "learning_rate": 2.27636160324072e-06, "loss": 0.0, "num_input_tokens_seen": 168063776, "step": 77925 }, { "epoch": 14.301706735180767, "grad_norm": 0.00013461369962897152, "learning_rate": 2.2756901132174525e-06, "loss": 0.0, "num_input_tokens_seen": 168075744, "step": 77930 }, { "epoch": 14.302624334740319, "grad_norm": 0.00010535174078540877, "learning_rate": 2.2750186930685124e-06, "loss": 0.0, "num_input_tokens_seen": 168086176, "step": 77935 }, { "epoch": 14.303541934299872, "grad_norm": 0.00010208294406766072, "learning_rate": 2.27434734281112e-06, "loss": 0.0, "num_input_tokens_seen": 168096832, "step": 77940 }, { "epoch": 14.304459533859424, "grad_norm": 0.00011614854156505316, "learning_rate": 2.273676062462492e-06, "loss": 0.0, "num_input_tokens_seen": 168107712, "step": 77945 }, { "epoch": 14.305377133418975, "grad_norm": 6.85060367686674e-05, "learning_rate": 2.2730048520398494e-06, "loss": 0.0, "num_input_tokens_seen": 168117472, "step": 77950 }, { "epoch": 14.306294732978529, "grad_norm": 0.121061310172081, "learning_rate": 2.272333711560406e-06, "loss": 0.0078, "num_input_tokens_seen": 168128960, "step": 77955 }, { "epoch": 14.30721233253808, "grad_norm": 0.0025492054410278797, "learning_rate": 2.2716626410413755e-06, "loss": 0.0, "num_input_tokens_seen": 168140928, "step": 77960 }, { "epoch": 14.308129932097632, "grad_norm": 0.00018861489661503583, "learning_rate": 2.2709916404999677e-06, "loss": 0.0, "num_input_tokens_seen": 168152896, "step": 77965 }, { "epoch": 14.309047531657185, "grad_norm": 0.0005181496380828321, "learning_rate": 2.2703207099533963e-06, "loss": 0.0, "num_input_tokens_seen": 168164000, "step": 77970 }, { "epoch": 14.309965131216737, "grad_norm": 9.622705692891032e-05, "learning_rate": 2.2696498494188685e-06, "loss": 0.0, "num_input_tokens_seen": 168174208, "step": 77975 }, { "epoch": 14.310882730776289, "grad_norm": 9.064798359759152e-05, "learning_rate": 2.2689790589135884e-06, "loss": 0.0, "num_input_tokens_seen": 168185792, "step": 77980 }, { "epoch": 14.311800330335842, "grad_norm": 0.0030565713532269, "learning_rate": 2.268308338454765e-06, "loss": 0.0, "num_input_tokens_seen": 168197248, "step": 77985 }, { "epoch": 14.312717929895394, "grad_norm": 9.534825949231163e-05, "learning_rate": 2.2676376880595985e-06, "loss": 0.0, "num_input_tokens_seen": 168208224, "step": 77990 }, { "epoch": 14.313635529454945, "grad_norm": 9.24965861486271e-05, "learning_rate": 2.2669671077452906e-06, "loss": 0.0, "num_input_tokens_seen": 168218400, "step": 77995 }, { "epoch": 14.314553129014499, "grad_norm": 0.0007453422294929624, "learning_rate": 2.2662965975290386e-06, "loss": 0.0, "num_input_tokens_seen": 168228864, "step": 78000 }, { "epoch": 14.31547072857405, "grad_norm": 0.00034804351162165403, "learning_rate": 2.265626157428044e-06, "loss": 0.0, "num_input_tokens_seen": 168238944, "step": 78005 }, { "epoch": 14.316388328133602, "grad_norm": 0.00017342987121082842, "learning_rate": 2.2649557874595007e-06, "loss": 0.0007, "num_input_tokens_seen": 168250080, "step": 78010 }, { "epoch": 14.317305927693155, "grad_norm": 0.005039252806454897, "learning_rate": 2.2642854876406e-06, "loss": 0.0, "num_input_tokens_seen": 168261280, "step": 78015 }, { "epoch": 14.318223527252707, "grad_norm": 0.0001722440792946145, "learning_rate": 2.2636152579885395e-06, "loss": 0.0, "num_input_tokens_seen": 168271744, "step": 78020 }, { "epoch": 14.319141126812259, "grad_norm": 0.00046185273095034063, "learning_rate": 2.262945098520506e-06, "loss": 0.0, "num_input_tokens_seen": 168282624, "step": 78025 }, { "epoch": 14.320058726371812, "grad_norm": 0.0002960295241791755, "learning_rate": 2.2622750092536866e-06, "loss": 0.0974, "num_input_tokens_seen": 168292768, "step": 78030 }, { "epoch": 14.320976325931364, "grad_norm": 0.00010892502177739516, "learning_rate": 2.2616049902052723e-06, "loss": 0.0, "num_input_tokens_seen": 168304416, "step": 78035 }, { "epoch": 14.321893925490915, "grad_norm": 0.07006517052650452, "learning_rate": 2.260935041392446e-06, "loss": 0.0, "num_input_tokens_seen": 168315424, "step": 78040 }, { "epoch": 14.322811525050469, "grad_norm": 0.0467032864689827, "learning_rate": 2.2602651628323905e-06, "loss": 0.0, "num_input_tokens_seen": 168326944, "step": 78045 }, { "epoch": 14.32372912461002, "grad_norm": 0.0003612255968619138, "learning_rate": 2.2595953545422855e-06, "loss": 0.0, "num_input_tokens_seen": 168338816, "step": 78050 }, { "epoch": 14.324646724169572, "grad_norm": 0.00042115183896385133, "learning_rate": 2.258925616539314e-06, "loss": 0.0, "num_input_tokens_seen": 168348704, "step": 78055 }, { "epoch": 14.325564323729125, "grad_norm": 0.0002209583908552304, "learning_rate": 2.258255948840653e-06, "loss": 0.0, "num_input_tokens_seen": 168358720, "step": 78060 }, { "epoch": 14.326481923288677, "grad_norm": 0.002293252618983388, "learning_rate": 2.2575863514634748e-06, "loss": 0.0, "num_input_tokens_seen": 168368768, "step": 78065 }, { "epoch": 14.327399522848228, "grad_norm": 0.0004168440937064588, "learning_rate": 2.256916824424959e-06, "loss": 0.0001, "num_input_tokens_seen": 168379904, "step": 78070 }, { "epoch": 14.328317122407782, "grad_norm": 0.019245019182562828, "learning_rate": 2.2562473677422745e-06, "loss": 0.0, "num_input_tokens_seen": 168390944, "step": 78075 }, { "epoch": 14.329234721967333, "grad_norm": 0.000819750945083797, "learning_rate": 2.2555779814325913e-06, "loss": 0.0004, "num_input_tokens_seen": 168401472, "step": 78080 }, { "epoch": 14.330152321526885, "grad_norm": 8.631976379547268e-05, "learning_rate": 2.2549086655130815e-06, "loss": 0.0, "num_input_tokens_seen": 168412736, "step": 78085 }, { "epoch": 14.331069921086439, "grad_norm": 0.00011231624375795946, "learning_rate": 2.2542394200009095e-06, "loss": 0.0001, "num_input_tokens_seen": 168424992, "step": 78090 }, { "epoch": 14.33198752064599, "grad_norm": 0.00010541308438405395, "learning_rate": 2.253570244913241e-06, "loss": 0.0, "num_input_tokens_seen": 168435264, "step": 78095 }, { "epoch": 14.332905120205542, "grad_norm": 0.0003086425713263452, "learning_rate": 2.2529011402672367e-06, "loss": 0.0, "num_input_tokens_seen": 168446112, "step": 78100 }, { "epoch": 14.333822719765095, "grad_norm": 7.864580402383581e-05, "learning_rate": 2.252232106080063e-06, "loss": 0.0001, "num_input_tokens_seen": 168456512, "step": 78105 }, { "epoch": 14.334740319324647, "grad_norm": 0.00010837020818144083, "learning_rate": 2.2515631423688766e-06, "loss": 0.0, "num_input_tokens_seen": 168466656, "step": 78110 }, { "epoch": 14.335657918884198, "grad_norm": 0.005334517452865839, "learning_rate": 2.2508942491508364e-06, "loss": 0.0, "num_input_tokens_seen": 168477088, "step": 78115 }, { "epoch": 14.336575518443752, "grad_norm": 0.002718315925449133, "learning_rate": 2.250225426443098e-06, "loss": 0.1035, "num_input_tokens_seen": 168486912, "step": 78120 }, { "epoch": 14.337493118003303, "grad_norm": 0.0001300590083701536, "learning_rate": 2.2495566742628133e-06, "loss": 0.0, "num_input_tokens_seen": 168496640, "step": 78125 }, { "epoch": 14.338410717562855, "grad_norm": 8.802016964182258e-05, "learning_rate": 2.2488879926271396e-06, "loss": 0.0, "num_input_tokens_seen": 168508704, "step": 78130 }, { "epoch": 14.339328317122408, "grad_norm": 0.00018030348292086273, "learning_rate": 2.2482193815532246e-06, "loss": 0.0, "num_input_tokens_seen": 168519616, "step": 78135 }, { "epoch": 14.34024591668196, "grad_norm": 0.0010371593525633216, "learning_rate": 2.2475508410582176e-06, "loss": 0.0, "num_input_tokens_seen": 168530848, "step": 78140 }, { "epoch": 14.341163516241512, "grad_norm": 0.0002320960775250569, "learning_rate": 2.2468823711592656e-06, "loss": 0.0, "num_input_tokens_seen": 168542336, "step": 78145 }, { "epoch": 14.342081115801065, "grad_norm": 0.00018917536363005638, "learning_rate": 2.246213971873512e-06, "loss": 0.0, "num_input_tokens_seen": 168553312, "step": 78150 }, { "epoch": 14.342998715360617, "grad_norm": 0.004399335943162441, "learning_rate": 2.245545643218104e-06, "loss": 0.0018, "num_input_tokens_seen": 168564864, "step": 78155 }, { "epoch": 14.343916314920168, "grad_norm": 0.00015069308574311435, "learning_rate": 2.2448773852101814e-06, "loss": 0.0, "num_input_tokens_seen": 168576224, "step": 78160 }, { "epoch": 14.344833914479722, "grad_norm": 0.00023800793860573322, "learning_rate": 2.2442091978668817e-06, "loss": 0.0, "num_input_tokens_seen": 168587168, "step": 78165 }, { "epoch": 14.345751514039273, "grad_norm": 5.228468216955662e-05, "learning_rate": 2.243541081205347e-06, "loss": 0.0, "num_input_tokens_seen": 168596928, "step": 78170 }, { "epoch": 14.346669113598825, "grad_norm": 0.0006203648517839611, "learning_rate": 2.2428730352427112e-06, "loss": 0.0, "num_input_tokens_seen": 168606720, "step": 78175 }, { "epoch": 14.347586713158378, "grad_norm": 0.0009778348030522466, "learning_rate": 2.242205059996109e-06, "loss": 0.0, "num_input_tokens_seen": 168617536, "step": 78180 }, { "epoch": 14.34850431271793, "grad_norm": 8.498935494571924e-05, "learning_rate": 2.2415371554826714e-06, "loss": 0.0, "num_input_tokens_seen": 168628480, "step": 78185 }, { "epoch": 14.349421912277482, "grad_norm": 0.0007927282713353634, "learning_rate": 2.240869321719532e-06, "loss": 0.0, "num_input_tokens_seen": 168637760, "step": 78190 }, { "epoch": 14.350339511837035, "grad_norm": 0.00016742594016250223, "learning_rate": 2.240201558723818e-06, "loss": 0.0, "num_input_tokens_seen": 168649408, "step": 78195 }, { "epoch": 14.351257111396587, "grad_norm": 0.00012623195652849972, "learning_rate": 2.2395338665126554e-06, "loss": 0.0, "num_input_tokens_seen": 168659968, "step": 78200 }, { "epoch": 14.352174710956138, "grad_norm": 0.0001849279215093702, "learning_rate": 2.238866245103172e-06, "loss": 0.0, "num_input_tokens_seen": 168671648, "step": 78205 }, { "epoch": 14.353092310515692, "grad_norm": 0.00010141571692656726, "learning_rate": 2.2381986945124907e-06, "loss": 0.0, "num_input_tokens_seen": 168683296, "step": 78210 }, { "epoch": 14.354009910075243, "grad_norm": 0.0017784159863367677, "learning_rate": 2.23753121475773e-06, "loss": 0.0, "num_input_tokens_seen": 168692800, "step": 78215 }, { "epoch": 14.354927509634795, "grad_norm": 8.22315487312153e-05, "learning_rate": 2.2368638058560145e-06, "loss": 0.0, "num_input_tokens_seen": 168702080, "step": 78220 }, { "epoch": 14.355845109194348, "grad_norm": 0.002625558292493224, "learning_rate": 2.23619646782446e-06, "loss": 0.0, "num_input_tokens_seen": 168712352, "step": 78225 }, { "epoch": 14.3567627087539, "grad_norm": 0.0013942478690296412, "learning_rate": 2.235529200680182e-06, "loss": 0.0, "num_input_tokens_seen": 168722144, "step": 78230 }, { "epoch": 14.357680308313451, "grad_norm": 0.0011249256785959005, "learning_rate": 2.2348620044402943e-06, "loss": 0.0, "num_input_tokens_seen": 168732928, "step": 78235 }, { "epoch": 14.358597907873005, "grad_norm": 0.0007311392691917717, "learning_rate": 2.234194879121912e-06, "loss": 0.0, "num_input_tokens_seen": 168743968, "step": 78240 }, { "epoch": 14.359515507432556, "grad_norm": 0.003420777851715684, "learning_rate": 2.233527824742145e-06, "loss": 0.0, "num_input_tokens_seen": 168755232, "step": 78245 }, { "epoch": 14.360433106992108, "grad_norm": 0.00012094805424567312, "learning_rate": 2.232860841318099e-06, "loss": 0.0, "num_input_tokens_seen": 168766688, "step": 78250 }, { "epoch": 14.361350706551661, "grad_norm": 9.320136450696737e-05, "learning_rate": 2.232193928866886e-06, "loss": 0.0, "num_input_tokens_seen": 168777536, "step": 78255 }, { "epoch": 14.362268306111213, "grad_norm": 0.00037624654942192137, "learning_rate": 2.2315270874056088e-06, "loss": 0.0, "num_input_tokens_seen": 168788288, "step": 78260 }, { "epoch": 14.363185905670765, "grad_norm": 0.0010898245964199305, "learning_rate": 2.2308603169513698e-06, "loss": 0.0, "num_input_tokens_seen": 168798368, "step": 78265 }, { "epoch": 14.364103505230318, "grad_norm": 0.0004315005789976567, "learning_rate": 2.2301936175212737e-06, "loss": 0.0, "num_input_tokens_seen": 168809408, "step": 78270 }, { "epoch": 14.36502110478987, "grad_norm": 0.00015338700904976577, "learning_rate": 2.229526989132418e-06, "loss": 0.0004, "num_input_tokens_seen": 168820576, "step": 78275 }, { "epoch": 14.365938704349421, "grad_norm": 0.00017503017443232238, "learning_rate": 2.228860431801901e-06, "loss": 0.0, "num_input_tokens_seen": 168832768, "step": 78280 }, { "epoch": 14.366856303908975, "grad_norm": 0.0011434382759034634, "learning_rate": 2.228193945546818e-06, "loss": 0.0001, "num_input_tokens_seen": 168843200, "step": 78285 }, { "epoch": 14.367773903468526, "grad_norm": 0.00016888396930880845, "learning_rate": 2.2275275303842654e-06, "loss": 0.0, "num_input_tokens_seen": 168854080, "step": 78290 }, { "epoch": 14.368691503028078, "grad_norm": 0.0018584787612780929, "learning_rate": 2.226861186331335e-06, "loss": 0.0913, "num_input_tokens_seen": 168865216, "step": 78295 }, { "epoch": 14.369609102587631, "grad_norm": 0.00017054261115845293, "learning_rate": 2.226194913405115e-06, "loss": 0.0, "num_input_tokens_seen": 168875776, "step": 78300 }, { "epoch": 14.370526702147183, "grad_norm": 0.00010983858373947442, "learning_rate": 2.2255287116226994e-06, "loss": 0.0, "num_input_tokens_seen": 168884960, "step": 78305 }, { "epoch": 14.371444301706735, "grad_norm": 0.00014380126958712935, "learning_rate": 2.2248625810011716e-06, "loss": 0.0, "num_input_tokens_seen": 168896352, "step": 78310 }, { "epoch": 14.372361901266288, "grad_norm": 0.007465083617717028, "learning_rate": 2.2241965215576173e-06, "loss": 0.0, "num_input_tokens_seen": 168906304, "step": 78315 }, { "epoch": 14.37327950082584, "grad_norm": 0.0006090252427384257, "learning_rate": 2.223530533309119e-06, "loss": 0.0, "num_input_tokens_seen": 168917280, "step": 78320 }, { "epoch": 14.374197100385391, "grad_norm": 8.01559435785748e-05, "learning_rate": 2.2228646162727606e-06, "loss": 0.0, "num_input_tokens_seen": 168928064, "step": 78325 }, { "epoch": 14.375114699944945, "grad_norm": 0.0003501579340081662, "learning_rate": 2.2221987704656204e-06, "loss": 0.0, "num_input_tokens_seen": 168938784, "step": 78330 }, { "epoch": 14.376032299504496, "grad_norm": 0.0001014798108371906, "learning_rate": 2.221532995904777e-06, "loss": 0.0, "num_input_tokens_seen": 168949472, "step": 78335 }, { "epoch": 14.376949899064048, "grad_norm": 0.1405559629201889, "learning_rate": 2.2208672926073062e-06, "loss": 0.0, "num_input_tokens_seen": 168960896, "step": 78340 }, { "epoch": 14.377867498623601, "grad_norm": 0.00017896119970828295, "learning_rate": 2.2202016605902816e-06, "loss": 0.0, "num_input_tokens_seen": 168970240, "step": 78345 }, { "epoch": 14.378785098183153, "grad_norm": 0.2024925947189331, "learning_rate": 2.2195360998707747e-06, "loss": 0.0029, "num_input_tokens_seen": 168981760, "step": 78350 }, { "epoch": 14.379702697742704, "grad_norm": 0.00020024128025397658, "learning_rate": 2.218870610465859e-06, "loss": 0.0, "num_input_tokens_seen": 168993280, "step": 78355 }, { "epoch": 14.380620297302258, "grad_norm": 9.12073883228004e-05, "learning_rate": 2.218205192392602e-06, "loss": 0.0, "num_input_tokens_seen": 169003872, "step": 78360 }, { "epoch": 14.38153789686181, "grad_norm": 7.43162163416855e-05, "learning_rate": 2.21753984566807e-06, "loss": 0.0, "num_input_tokens_seen": 169014432, "step": 78365 }, { "epoch": 14.382455496421361, "grad_norm": 0.00040955544682219625, "learning_rate": 2.2168745703093273e-06, "loss": 0.0, "num_input_tokens_seen": 169023776, "step": 78370 }, { "epoch": 14.383373095980915, "grad_norm": 0.0002267317904625088, "learning_rate": 2.21620936633344e-06, "loss": 0.0, "num_input_tokens_seen": 169033792, "step": 78375 }, { "epoch": 14.384290695540466, "grad_norm": 0.0011231284588575363, "learning_rate": 2.2155442337574677e-06, "loss": 0.0, "num_input_tokens_seen": 169043904, "step": 78380 }, { "epoch": 14.385208295100018, "grad_norm": 0.00015687712584622204, "learning_rate": 2.214879172598469e-06, "loss": 0.0, "num_input_tokens_seen": 169053632, "step": 78385 }, { "epoch": 14.386125894659571, "grad_norm": 0.000222003465751186, "learning_rate": 2.2142141828735047e-06, "loss": 0.0, "num_input_tokens_seen": 169064224, "step": 78390 }, { "epoch": 14.387043494219123, "grad_norm": 0.0010387786896899343, "learning_rate": 2.2135492645996286e-06, "loss": 0.0, "num_input_tokens_seen": 169075488, "step": 78395 }, { "epoch": 14.387961093778674, "grad_norm": 0.00016365278861485422, "learning_rate": 2.212884417793894e-06, "loss": 0.0, "num_input_tokens_seen": 169086528, "step": 78400 }, { "epoch": 14.388878693338228, "grad_norm": 0.0002221369359176606, "learning_rate": 2.212219642473356e-06, "loss": 0.0, "num_input_tokens_seen": 169097408, "step": 78405 }, { "epoch": 14.38979629289778, "grad_norm": 0.0002429636660963297, "learning_rate": 2.2115549386550635e-06, "loss": 0.0, "num_input_tokens_seen": 169110272, "step": 78410 }, { "epoch": 14.390713892457331, "grad_norm": 0.00041892388253472745, "learning_rate": 2.2108903063560648e-06, "loss": 0.0, "num_input_tokens_seen": 169122016, "step": 78415 }, { "epoch": 14.391631492016884, "grad_norm": 0.000291888922220096, "learning_rate": 2.2102257455934056e-06, "loss": 0.0, "num_input_tokens_seen": 169133632, "step": 78420 }, { "epoch": 14.392549091576436, "grad_norm": 0.00042153079994022846, "learning_rate": 2.209561256384134e-06, "loss": 0.0, "num_input_tokens_seen": 169145728, "step": 78425 }, { "epoch": 14.393466691135988, "grad_norm": 0.0002909711329266429, "learning_rate": 2.2088968387452915e-06, "loss": 0.0, "num_input_tokens_seen": 169156416, "step": 78430 }, { "epoch": 14.394384290695541, "grad_norm": 0.00011939576506847516, "learning_rate": 2.208232492693917e-06, "loss": 0.0, "num_input_tokens_seen": 169165184, "step": 78435 }, { "epoch": 14.395301890255093, "grad_norm": 0.00013847197988070548, "learning_rate": 2.207568218247054e-06, "loss": 0.0, "num_input_tokens_seen": 169176640, "step": 78440 }, { "epoch": 14.396219489814644, "grad_norm": 0.00022339816496241838, "learning_rate": 2.2069040154217392e-06, "loss": 0.0, "num_input_tokens_seen": 169187008, "step": 78445 }, { "epoch": 14.397137089374198, "grad_norm": 0.19349448382854462, "learning_rate": 2.2062398842350067e-06, "loss": 0.0001, "num_input_tokens_seen": 169197120, "step": 78450 }, { "epoch": 14.39805468893375, "grad_norm": 0.00015810148033779114, "learning_rate": 2.20557582470389e-06, "loss": 0.0, "num_input_tokens_seen": 169208096, "step": 78455 }, { "epoch": 14.398972288493301, "grad_norm": 0.0004107766435481608, "learning_rate": 2.2049118368454236e-06, "loss": 0.0, "num_input_tokens_seen": 169220448, "step": 78460 }, { "epoch": 14.399889888052854, "grad_norm": 0.00011683193588396534, "learning_rate": 2.2042479206766372e-06, "loss": 0.0, "num_input_tokens_seen": 169231616, "step": 78465 }, { "epoch": 14.400807487612406, "grad_norm": 0.0003957404987886548, "learning_rate": 2.2035840762145566e-06, "loss": 0.0, "num_input_tokens_seen": 169242784, "step": 78470 }, { "epoch": 14.401725087171958, "grad_norm": 74.96866607666016, "learning_rate": 2.202920303476212e-06, "loss": 0.1625, "num_input_tokens_seen": 169253216, "step": 78475 }, { "epoch": 14.402642686731511, "grad_norm": 6.946848588995636e-05, "learning_rate": 2.202256602478627e-06, "loss": 0.0, "num_input_tokens_seen": 169262528, "step": 78480 }, { "epoch": 14.403560286291063, "grad_norm": 0.0010956041514873505, "learning_rate": 2.2015929732388214e-06, "loss": 0.0, "num_input_tokens_seen": 169272992, "step": 78485 }, { "epoch": 14.404477885850614, "grad_norm": 0.00015336225624196231, "learning_rate": 2.2009294157738214e-06, "loss": 0.0, "num_input_tokens_seen": 169284544, "step": 78490 }, { "epoch": 14.405395485410168, "grad_norm": 0.0001705841423245147, "learning_rate": 2.2002659301006434e-06, "loss": 0.0001, "num_input_tokens_seen": 169296960, "step": 78495 }, { "epoch": 14.40631308496972, "grad_norm": 0.0004096483171451837, "learning_rate": 2.1996025162363056e-06, "loss": 0.0, "num_input_tokens_seen": 169307648, "step": 78500 }, { "epoch": 14.40723068452927, "grad_norm": 0.0002464264980517328, "learning_rate": 2.1989391741978206e-06, "loss": 0.0674, "num_input_tokens_seen": 169319264, "step": 78505 }, { "epoch": 14.408148284088824, "grad_norm": 0.00033577054273337126, "learning_rate": 2.1982759040022066e-06, "loss": 0.0, "num_input_tokens_seen": 169330624, "step": 78510 }, { "epoch": 14.409065883648376, "grad_norm": 0.0004058087361045182, "learning_rate": 2.1976127056664732e-06, "loss": 0.0, "num_input_tokens_seen": 169342336, "step": 78515 }, { "epoch": 14.409983483207927, "grad_norm": 0.00012004023301415145, "learning_rate": 2.196949579207629e-06, "loss": 0.0, "num_input_tokens_seen": 169353632, "step": 78520 }, { "epoch": 14.41090108276748, "grad_norm": 7.957922935020179e-05, "learning_rate": 2.1962865246426857e-06, "loss": 0.0, "num_input_tokens_seen": 169363264, "step": 78525 }, { "epoch": 14.411818682327032, "grad_norm": 0.0001380223111482337, "learning_rate": 2.1956235419886475e-06, "loss": 0.0, "num_input_tokens_seen": 169373632, "step": 78530 }, { "epoch": 14.412736281886584, "grad_norm": 0.00011975293455179781, "learning_rate": 2.194960631262519e-06, "loss": 0.0, "num_input_tokens_seen": 169383040, "step": 78535 }, { "epoch": 14.413653881446137, "grad_norm": 0.00031990883871912956, "learning_rate": 2.194297792481303e-06, "loss": 0.0, "num_input_tokens_seen": 169394560, "step": 78540 }, { "epoch": 14.414571481005689, "grad_norm": 0.00029735136195085943, "learning_rate": 2.193635025661998e-06, "loss": 0.0, "num_input_tokens_seen": 169405376, "step": 78545 }, { "epoch": 14.41548908056524, "grad_norm": 0.01034974493086338, "learning_rate": 2.1929723308216074e-06, "loss": 0.0, "num_input_tokens_seen": 169416224, "step": 78550 }, { "epoch": 14.416406680124794, "grad_norm": 8.155217801686376e-05, "learning_rate": 2.192309707977126e-06, "loss": 0.0, "num_input_tokens_seen": 169426912, "step": 78555 }, { "epoch": 14.417324279684346, "grad_norm": 0.004114620853215456, "learning_rate": 2.191647157145549e-06, "loss": 0.0, "num_input_tokens_seen": 169437248, "step": 78560 }, { "epoch": 14.418241879243897, "grad_norm": 8.265646465588361e-05, "learning_rate": 2.19098467834387e-06, "loss": 0.0, "num_input_tokens_seen": 169447424, "step": 78565 }, { "epoch": 14.41915947880345, "grad_norm": 0.008266872726380825, "learning_rate": 2.190322271589078e-06, "loss": 0.0, "num_input_tokens_seen": 169458112, "step": 78570 }, { "epoch": 14.420077078363002, "grad_norm": 0.0007707077311351895, "learning_rate": 2.1896599368981674e-06, "loss": 0.0, "num_input_tokens_seen": 169469056, "step": 78575 }, { "epoch": 14.420994677922554, "grad_norm": 0.00016157441132236272, "learning_rate": 2.1889976742881237e-06, "loss": 0.0, "num_input_tokens_seen": 169479680, "step": 78580 }, { "epoch": 14.421912277482107, "grad_norm": 9.715111082186922e-05, "learning_rate": 2.1883354837759312e-06, "loss": 0.0, "num_input_tokens_seen": 169491040, "step": 78585 }, { "epoch": 14.422829877041659, "grad_norm": 0.0019003871129825711, "learning_rate": 2.1876733653785776e-06, "loss": 0.0, "num_input_tokens_seen": 169501440, "step": 78590 }, { "epoch": 14.42374747660121, "grad_norm": 0.00020635731925722212, "learning_rate": 2.187011319113044e-06, "loss": 0.0, "num_input_tokens_seen": 169512096, "step": 78595 }, { "epoch": 14.424665076160764, "grad_norm": 9.7144533356186e-05, "learning_rate": 2.1863493449963098e-06, "loss": 0.0, "num_input_tokens_seen": 169521920, "step": 78600 }, { "epoch": 14.425582675720316, "grad_norm": 0.0010365248890593648, "learning_rate": 2.1856874430453522e-06, "loss": 0.0002, "num_input_tokens_seen": 169532256, "step": 78605 }, { "epoch": 14.426500275279867, "grad_norm": 0.004072026815265417, "learning_rate": 2.185025613277152e-06, "loss": 0.0, "num_input_tokens_seen": 169543328, "step": 78610 }, { "epoch": 14.42741787483942, "grad_norm": 0.0002328126720385626, "learning_rate": 2.1843638557086816e-06, "loss": 0.0, "num_input_tokens_seen": 169553504, "step": 78615 }, { "epoch": 14.428335474398972, "grad_norm": 0.00017719804600346833, "learning_rate": 2.1837021703569134e-06, "loss": 0.0, "num_input_tokens_seen": 169563584, "step": 78620 }, { "epoch": 14.429253073958524, "grad_norm": 0.00021758608636446297, "learning_rate": 2.1830405572388207e-06, "loss": 0.0, "num_input_tokens_seen": 169575136, "step": 78625 }, { "epoch": 14.430170673518077, "grad_norm": 0.00019537264597602189, "learning_rate": 2.182379016371372e-06, "loss": 0.0, "num_input_tokens_seen": 169587072, "step": 78630 }, { "epoch": 14.431088273077629, "grad_norm": 0.0004363157495390624, "learning_rate": 2.1817175477715352e-06, "loss": 0.0, "num_input_tokens_seen": 169598368, "step": 78635 }, { "epoch": 14.43200587263718, "grad_norm": 0.00040509868995286524, "learning_rate": 2.181056151456273e-06, "loss": 0.0, "num_input_tokens_seen": 169609920, "step": 78640 }, { "epoch": 14.432923472196734, "grad_norm": 0.009176546707749367, "learning_rate": 2.1803948274425534e-06, "loss": 0.0, "num_input_tokens_seen": 169620384, "step": 78645 }, { "epoch": 14.433841071756286, "grad_norm": 7.220563566079363e-05, "learning_rate": 2.1797335757473363e-06, "loss": 0.0, "num_input_tokens_seen": 169630784, "step": 78650 }, { "epoch": 14.434758671315837, "grad_norm": 0.003933950327336788, "learning_rate": 2.1790723963875805e-06, "loss": 0.0, "num_input_tokens_seen": 169641696, "step": 78655 }, { "epoch": 14.43567627087539, "grad_norm": 0.018045956268906593, "learning_rate": 2.1784112893802474e-06, "loss": 0.0, "num_input_tokens_seen": 169652224, "step": 78660 }, { "epoch": 14.436593870434942, "grad_norm": 0.0019371317466720939, "learning_rate": 2.1777502547422917e-06, "loss": 0.0, "num_input_tokens_seen": 169662496, "step": 78665 }, { "epoch": 14.437511469994494, "grad_norm": 0.00026562873972579837, "learning_rate": 2.1770892924906663e-06, "loss": 0.0, "num_input_tokens_seen": 169672960, "step": 78670 }, { "epoch": 14.438429069554047, "grad_norm": 0.00014685821952298284, "learning_rate": 2.1764284026423266e-06, "loss": 0.0, "num_input_tokens_seen": 169684320, "step": 78675 }, { "epoch": 14.439346669113599, "grad_norm": 0.00024275967734865844, "learning_rate": 2.175767585214223e-06, "loss": 0.0, "num_input_tokens_seen": 169695776, "step": 78680 }, { "epoch": 14.44026426867315, "grad_norm": 0.06337327510118484, "learning_rate": 2.1751068402233033e-06, "loss": 0.0, "num_input_tokens_seen": 169707008, "step": 78685 }, { "epoch": 14.441181868232704, "grad_norm": 0.00029589232872240245, "learning_rate": 2.174446167686513e-06, "loss": 0.0007, "num_input_tokens_seen": 169717408, "step": 78690 }, { "epoch": 14.442099467792255, "grad_norm": 0.0010248059406876564, "learning_rate": 2.1737855676208016e-06, "loss": 0.0, "num_input_tokens_seen": 169729248, "step": 78695 }, { "epoch": 14.443017067351807, "grad_norm": 0.03024495579302311, "learning_rate": 2.173125040043109e-06, "loss": 0.1221, "num_input_tokens_seen": 169739808, "step": 78700 }, { "epoch": 14.44393466691136, "grad_norm": 0.00034525152295827866, "learning_rate": 2.1724645849703773e-06, "loss": 0.0, "num_input_tokens_seen": 169751904, "step": 78705 }, { "epoch": 14.444852266470912, "grad_norm": 0.001211386639624834, "learning_rate": 2.171804202419548e-06, "loss": 0.0, "num_input_tokens_seen": 169762720, "step": 78710 }, { "epoch": 14.445769866030464, "grad_norm": 0.0001429829135304317, "learning_rate": 2.1711438924075578e-06, "loss": 0.0, "num_input_tokens_seen": 169773024, "step": 78715 }, { "epoch": 14.446687465590017, "grad_norm": 0.0002221425238531083, "learning_rate": 2.1704836549513404e-06, "loss": 0.0, "num_input_tokens_seen": 169784608, "step": 78720 }, { "epoch": 14.447605065149569, "grad_norm": 0.003072976367548108, "learning_rate": 2.169823490067834e-06, "loss": 0.0, "num_input_tokens_seen": 169795936, "step": 78725 }, { "epoch": 14.44852266470912, "grad_norm": 0.0003492860123515129, "learning_rate": 2.1691633977739683e-06, "loss": 0.0, "num_input_tokens_seen": 169806304, "step": 78730 }, { "epoch": 14.449440264268674, "grad_norm": 0.0007917634211480618, "learning_rate": 2.168503378086674e-06, "loss": 0.0, "num_input_tokens_seen": 169817440, "step": 78735 }, { "epoch": 14.450357863828225, "grad_norm": 0.007573324721306562, "learning_rate": 2.1678434310228787e-06, "loss": 0.0, "num_input_tokens_seen": 169827488, "step": 78740 }, { "epoch": 14.451275463387777, "grad_norm": 0.0009977854788303375, "learning_rate": 2.1671835565995107e-06, "loss": 0.0, "num_input_tokens_seen": 169838816, "step": 78745 }, { "epoch": 14.45219306294733, "grad_norm": 0.00021704798564314842, "learning_rate": 2.1665237548334943e-06, "loss": 0.0, "num_input_tokens_seen": 169850368, "step": 78750 }, { "epoch": 14.453110662506882, "grad_norm": 0.0007393238483928144, "learning_rate": 2.1658640257417524e-06, "loss": 0.0, "num_input_tokens_seen": 169861088, "step": 78755 }, { "epoch": 14.454028262066434, "grad_norm": 9.896951814880595e-05, "learning_rate": 2.1652043693412057e-06, "loss": 0.0, "num_input_tokens_seen": 169872768, "step": 78760 }, { "epoch": 14.454945861625987, "grad_norm": 0.0025666868314146996, "learning_rate": 2.1645447856487713e-06, "loss": 0.0, "num_input_tokens_seen": 169884544, "step": 78765 }, { "epoch": 14.455863461185539, "grad_norm": 0.06252943724393845, "learning_rate": 2.1638852746813706e-06, "loss": 0.0, "num_input_tokens_seen": 169896192, "step": 78770 }, { "epoch": 14.45678106074509, "grad_norm": 0.0002388783759670332, "learning_rate": 2.163225836455917e-06, "loss": 0.0, "num_input_tokens_seen": 169907456, "step": 78775 }, { "epoch": 14.457698660304644, "grad_norm": 0.00033353999606333673, "learning_rate": 2.162566470989324e-06, "loss": 0.0, "num_input_tokens_seen": 169918560, "step": 78780 }, { "epoch": 14.458616259864195, "grad_norm": 0.18974360823631287, "learning_rate": 2.1619071782985033e-06, "loss": 0.0001, "num_input_tokens_seen": 169929344, "step": 78785 }, { "epoch": 14.459533859423747, "grad_norm": 0.00029664652538485825, "learning_rate": 2.161247958400363e-06, "loss": 0.0, "num_input_tokens_seen": 169940768, "step": 78790 }, { "epoch": 14.4604514589833, "grad_norm": 0.00011711679690051824, "learning_rate": 2.160588811311815e-06, "loss": 0.0, "num_input_tokens_seen": 169951904, "step": 78795 }, { "epoch": 14.461369058542852, "grad_norm": 9.010559733724222e-05, "learning_rate": 2.1599297370497637e-06, "loss": 0.0, "num_input_tokens_seen": 169963200, "step": 78800 }, { "epoch": 14.462286658102403, "grad_norm": 0.00019357874407432973, "learning_rate": 2.1592707356311103e-06, "loss": 0.0, "num_input_tokens_seen": 169973440, "step": 78805 }, { "epoch": 14.463204257661957, "grad_norm": 0.00028836840647272766, "learning_rate": 2.158611807072762e-06, "loss": 0.0, "num_input_tokens_seen": 169983200, "step": 78810 }, { "epoch": 14.464121857221508, "grad_norm": 0.00011949852341786027, "learning_rate": 2.157952951391617e-06, "loss": 0.0, "num_input_tokens_seen": 169994112, "step": 78815 }, { "epoch": 14.46503945678106, "grad_norm": 0.001971574267372489, "learning_rate": 2.157294168604574e-06, "loss": 0.0, "num_input_tokens_seen": 170006208, "step": 78820 }, { "epoch": 14.465957056340613, "grad_norm": 8.724095823708922e-05, "learning_rate": 2.1566354587285283e-06, "loss": 0.0, "num_input_tokens_seen": 170017856, "step": 78825 }, { "epoch": 14.466874655900165, "grad_norm": 0.00020194536773487926, "learning_rate": 2.1559768217803777e-06, "loss": 0.0, "num_input_tokens_seen": 170028960, "step": 78830 }, { "epoch": 14.467792255459717, "grad_norm": 0.000162989686941728, "learning_rate": 2.155318257777014e-06, "loss": 0.0, "num_input_tokens_seen": 170039936, "step": 78835 }, { "epoch": 14.46870985501927, "grad_norm": 0.0004016923485323787, "learning_rate": 2.1546597667353257e-06, "loss": 0.0, "num_input_tokens_seen": 170050656, "step": 78840 }, { "epoch": 14.469627454578822, "grad_norm": 0.004036819562315941, "learning_rate": 2.1540013486722073e-06, "loss": 0.0, "num_input_tokens_seen": 170061280, "step": 78845 }, { "epoch": 14.470545054138373, "grad_norm": 0.00016833929112181067, "learning_rate": 2.1533430036045427e-06, "loss": 0.0, "num_input_tokens_seen": 170072064, "step": 78850 }, { "epoch": 14.471462653697927, "grad_norm": 0.00011731069389497861, "learning_rate": 2.1526847315492165e-06, "loss": 0.0, "num_input_tokens_seen": 170082816, "step": 78855 }, { "epoch": 14.472380253257478, "grad_norm": 0.00025212348555214703, "learning_rate": 2.152026532523116e-06, "loss": 0.0, "num_input_tokens_seen": 170094048, "step": 78860 }, { "epoch": 14.47329785281703, "grad_norm": 0.019334647804498672, "learning_rate": 2.1513684065431207e-06, "loss": 0.0, "num_input_tokens_seen": 170104544, "step": 78865 }, { "epoch": 14.474215452376583, "grad_norm": 0.00016589117876719683, "learning_rate": 2.150710353626111e-06, "loss": 0.0, "num_input_tokens_seen": 170115520, "step": 78870 }, { "epoch": 14.475133051936135, "grad_norm": 0.00011769995762733743, "learning_rate": 2.150052373788963e-06, "loss": 0.0, "num_input_tokens_seen": 170124768, "step": 78875 }, { "epoch": 14.476050651495687, "grad_norm": 0.0018644804367795587, "learning_rate": 2.1493944670485562e-06, "loss": 0.0, "num_input_tokens_seen": 170135584, "step": 78880 }, { "epoch": 14.47696825105524, "grad_norm": 0.00012992376287002116, "learning_rate": 2.1487366334217628e-06, "loss": 0.0, "num_input_tokens_seen": 170146304, "step": 78885 }, { "epoch": 14.477885850614792, "grad_norm": 0.0020646294578909874, "learning_rate": 2.148078872925455e-06, "loss": 0.0, "num_input_tokens_seen": 170157312, "step": 78890 }, { "epoch": 14.478803450174343, "grad_norm": 0.0014065839350223541, "learning_rate": 2.1474211855765055e-06, "loss": 0.0, "num_input_tokens_seen": 170167808, "step": 78895 }, { "epoch": 14.479721049733897, "grad_norm": 7.756448030704632e-05, "learning_rate": 2.1467635713917807e-06, "loss": 0.0733, "num_input_tokens_seen": 170179072, "step": 78900 }, { "epoch": 14.480638649293448, "grad_norm": 0.00011890393943758681, "learning_rate": 2.146106030388147e-06, "loss": 0.0, "num_input_tokens_seen": 170189696, "step": 78905 }, { "epoch": 14.481556248853, "grad_norm": 0.0008908385061658919, "learning_rate": 2.1454485625824724e-06, "loss": 0.0, "num_input_tokens_seen": 170201152, "step": 78910 }, { "epoch": 14.482473848412553, "grad_norm": 6.876506085973233e-05, "learning_rate": 2.1447911679916177e-06, "loss": 0.0, "num_input_tokens_seen": 170211552, "step": 78915 }, { "epoch": 14.483391447972105, "grad_norm": 0.00040165611426346004, "learning_rate": 2.1441338466324445e-06, "loss": 0.0, "num_input_tokens_seen": 170222144, "step": 78920 }, { "epoch": 14.484309047531656, "grad_norm": 0.0016768219647929072, "learning_rate": 2.1434765985218103e-06, "loss": 0.0, "num_input_tokens_seen": 170231328, "step": 78925 }, { "epoch": 14.48522664709121, "grad_norm": 9.469147335039452e-05, "learning_rate": 2.142819423676576e-06, "loss": 0.0, "num_input_tokens_seen": 170240896, "step": 78930 }, { "epoch": 14.486144246650762, "grad_norm": 0.0002832971222233027, "learning_rate": 2.1421623221135947e-06, "loss": 0.0, "num_input_tokens_seen": 170251936, "step": 78935 }, { "epoch": 14.487061846210313, "grad_norm": 0.0004394223215058446, "learning_rate": 2.1415052938497195e-06, "loss": 0.0, "num_input_tokens_seen": 170262464, "step": 78940 }, { "epoch": 14.487979445769867, "grad_norm": 0.0002740457421168685, "learning_rate": 2.1408483389018043e-06, "loss": 0.0, "num_input_tokens_seen": 170273504, "step": 78945 }, { "epoch": 14.488897045329418, "grad_norm": 0.00011690844257827848, "learning_rate": 2.1401914572866983e-06, "loss": 0.0001, "num_input_tokens_seen": 170283840, "step": 78950 }, { "epoch": 14.48981464488897, "grad_norm": 0.00013596484495792538, "learning_rate": 2.1395346490212493e-06, "loss": 0.0, "num_input_tokens_seen": 170293376, "step": 78955 }, { "epoch": 14.490732244448523, "grad_norm": 0.0020820132922381163, "learning_rate": 2.138877914122301e-06, "loss": 0.0, "num_input_tokens_seen": 170304416, "step": 78960 }, { "epoch": 14.491649844008075, "grad_norm": 8.303370123030618e-05, "learning_rate": 2.138221252606702e-06, "loss": 0.0007, "num_input_tokens_seen": 170316096, "step": 78965 }, { "epoch": 14.492567443567626, "grad_norm": 7.03735277056694e-05, "learning_rate": 2.1375646644912925e-06, "loss": 0.0, "num_input_tokens_seen": 170326752, "step": 78970 }, { "epoch": 14.49348504312718, "grad_norm": 0.00033013365464285016, "learning_rate": 2.1369081497929127e-06, "loss": 0.0, "num_input_tokens_seen": 170338016, "step": 78975 }, { "epoch": 14.494402642686731, "grad_norm": 7.173359335865825e-05, "learning_rate": 2.136251708528402e-06, "loss": 0.0, "num_input_tokens_seen": 170348704, "step": 78980 }, { "epoch": 14.495320242246283, "grad_norm": 9.623181540518999e-05, "learning_rate": 2.1355953407145947e-06, "loss": 0.0, "num_input_tokens_seen": 170358912, "step": 78985 }, { "epoch": 14.496237841805836, "grad_norm": 0.0009333427296951413, "learning_rate": 2.134939046368329e-06, "loss": 0.0, "num_input_tokens_seen": 170369408, "step": 78990 }, { "epoch": 14.497155441365388, "grad_norm": 0.00020871576271019876, "learning_rate": 2.1342828255064362e-06, "loss": 0.0002, "num_input_tokens_seen": 170379296, "step": 78995 }, { "epoch": 14.49807304092494, "grad_norm": 9.691608283901587e-05, "learning_rate": 2.1336266781457475e-06, "loss": 0.0, "num_input_tokens_seen": 170389728, "step": 79000 }, { "epoch": 14.498990640484493, "grad_norm": 0.0002872894692700356, "learning_rate": 2.1329706043030924e-06, "loss": 0.0, "num_input_tokens_seen": 170400576, "step": 79005 }, { "epoch": 14.499908240044045, "grad_norm": 0.00027236444293521345, "learning_rate": 2.132314603995296e-06, "loss": 0.0, "num_input_tokens_seen": 170411968, "step": 79010 }, { "epoch": 14.500825839603596, "grad_norm": 0.00039756717160344124, "learning_rate": 2.1316586772391866e-06, "loss": 0.0, "num_input_tokens_seen": 170423072, "step": 79015 }, { "epoch": 14.50174343916315, "grad_norm": 0.0002854490594472736, "learning_rate": 2.1310028240515874e-06, "loss": 0.0, "num_input_tokens_seen": 170434592, "step": 79020 }, { "epoch": 14.502661038722701, "grad_norm": 0.0026291462127119303, "learning_rate": 2.130347044449317e-06, "loss": 0.0, "num_input_tokens_seen": 170446144, "step": 79025 }, { "epoch": 14.503578638282253, "grad_norm": 0.0011558531550690532, "learning_rate": 2.129691338449199e-06, "loss": 0.0, "num_input_tokens_seen": 170455936, "step": 79030 }, { "epoch": 14.504496237841806, "grad_norm": 0.00017532429774291813, "learning_rate": 2.1290357060680498e-06, "loss": 0.0, "num_input_tokens_seen": 170466432, "step": 79035 }, { "epoch": 14.505413837401358, "grad_norm": 0.021693069487810135, "learning_rate": 2.1283801473226835e-06, "loss": 0.0, "num_input_tokens_seen": 170478208, "step": 79040 }, { "epoch": 14.50633143696091, "grad_norm": 0.00010179505625274032, "learning_rate": 2.1277246622299176e-06, "loss": 0.1532, "num_input_tokens_seen": 170488768, "step": 79045 }, { "epoch": 14.507249036520463, "grad_norm": 9.707760909805074e-05, "learning_rate": 2.127069250806562e-06, "loss": 0.0, "num_input_tokens_seen": 170500320, "step": 79050 }, { "epoch": 14.508166636080015, "grad_norm": 0.004287825897336006, "learning_rate": 2.126413913069428e-06, "loss": 0.0001, "num_input_tokens_seen": 170512128, "step": 79055 }, { "epoch": 14.509084235639566, "grad_norm": 0.004493345506489277, "learning_rate": 2.1257586490353216e-06, "loss": 0.0, "num_input_tokens_seen": 170523776, "step": 79060 }, { "epoch": 14.51000183519912, "grad_norm": 0.0009932690300047398, "learning_rate": 2.1251034587210527e-06, "loss": 0.0, "num_input_tokens_seen": 170533760, "step": 79065 }, { "epoch": 14.510919434758671, "grad_norm": 0.0033425569999963045, "learning_rate": 2.124448342143425e-06, "loss": 0.0, "num_input_tokens_seen": 170545216, "step": 79070 }, { "epoch": 14.511837034318223, "grad_norm": 0.0009082922479137778, "learning_rate": 2.1237932993192385e-06, "loss": 0.0, "num_input_tokens_seen": 170555808, "step": 79075 }, { "epoch": 14.512754633877776, "grad_norm": 0.011020172387361526, "learning_rate": 2.1231383302652975e-06, "loss": 0.0, "num_input_tokens_seen": 170566976, "step": 79080 }, { "epoch": 14.513672233437328, "grad_norm": 0.0051450589671730995, "learning_rate": 2.1224834349984e-06, "loss": 0.0, "num_input_tokens_seen": 170578528, "step": 79085 }, { "epoch": 14.51458983299688, "grad_norm": 0.0016700325068086386, "learning_rate": 2.1218286135353427e-06, "loss": 0.0, "num_input_tokens_seen": 170589888, "step": 79090 }, { "epoch": 14.515507432556433, "grad_norm": 18.131689071655273, "learning_rate": 2.121173865892919e-06, "loss": 0.0674, "num_input_tokens_seen": 170600512, "step": 79095 }, { "epoch": 14.516425032115984, "grad_norm": 0.0049507152289152145, "learning_rate": 2.1205191920879254e-06, "loss": 0.147, "num_input_tokens_seen": 170611552, "step": 79100 }, { "epoch": 14.517342631675536, "grad_norm": 0.003112174803391099, "learning_rate": 2.1198645921371517e-06, "loss": 0.0, "num_input_tokens_seen": 170622560, "step": 79105 }, { "epoch": 14.51826023123509, "grad_norm": 8.357961633009836e-05, "learning_rate": 2.119210066057386e-06, "loss": 0.0, "num_input_tokens_seen": 170632928, "step": 79110 }, { "epoch": 14.519177830794641, "grad_norm": 0.0024178428575396538, "learning_rate": 2.1185556138654184e-06, "loss": 0.0, "num_input_tokens_seen": 170644768, "step": 79115 }, { "epoch": 14.520095430354193, "grad_norm": 0.001468223868869245, "learning_rate": 2.1179012355780344e-06, "loss": 0.0, "num_input_tokens_seen": 170656064, "step": 79120 }, { "epoch": 14.521013029913746, "grad_norm": 0.0001958037173608318, "learning_rate": 2.1172469312120144e-06, "loss": 0.0, "num_input_tokens_seen": 170666880, "step": 79125 }, { "epoch": 14.521930629473298, "grad_norm": 0.00015221562352962792, "learning_rate": 2.1165927007841445e-06, "loss": 0.0, "num_input_tokens_seen": 170678016, "step": 79130 }, { "epoch": 14.52284822903285, "grad_norm": 6.045810005161911e-05, "learning_rate": 2.1159385443112033e-06, "loss": 0.0, "num_input_tokens_seen": 170689408, "step": 79135 }, { "epoch": 14.523765828592403, "grad_norm": 0.00048165878979489207, "learning_rate": 2.1152844618099682e-06, "loss": 0.0001, "num_input_tokens_seen": 170699040, "step": 79140 }, { "epoch": 14.524683428151954, "grad_norm": 0.0005830719601362944, "learning_rate": 2.1146304532972144e-06, "loss": 0.0, "num_input_tokens_seen": 170709056, "step": 79145 }, { "epoch": 14.525601027711506, "grad_norm": 6.564205250469968e-05, "learning_rate": 2.1139765187897195e-06, "loss": 0.0, "num_input_tokens_seen": 170720192, "step": 79150 }, { "epoch": 14.52651862727106, "grad_norm": 211.25282287597656, "learning_rate": 2.1133226583042534e-06, "loss": 0.0207, "num_input_tokens_seen": 170730784, "step": 79155 }, { "epoch": 14.527436226830611, "grad_norm": 0.00011574511881917715, "learning_rate": 2.1126688718575857e-06, "loss": 0.0, "num_input_tokens_seen": 170739296, "step": 79160 }, { "epoch": 14.528353826390163, "grad_norm": 0.00048307940596714616, "learning_rate": 2.112015159466488e-06, "loss": 0.0, "num_input_tokens_seen": 170748192, "step": 79165 }, { "epoch": 14.529271425949716, "grad_norm": 0.00031586724799126387, "learning_rate": 2.111361521147725e-06, "loss": 0.0, "num_input_tokens_seen": 170758880, "step": 79170 }, { "epoch": 14.530189025509268, "grad_norm": 0.00011738674220396206, "learning_rate": 2.1107079569180626e-06, "loss": 0.0, "num_input_tokens_seen": 170769504, "step": 79175 }, { "epoch": 14.53110662506882, "grad_norm": 0.0003421303117647767, "learning_rate": 2.1100544667942617e-06, "loss": 0.0, "num_input_tokens_seen": 170779936, "step": 79180 }, { "epoch": 14.532024224628373, "grad_norm": 0.01942855305969715, "learning_rate": 2.109401050793086e-06, "loss": 0.0, "num_input_tokens_seen": 170790304, "step": 79185 }, { "epoch": 14.532941824187924, "grad_norm": 0.00017255032435059547, "learning_rate": 2.1087477089312938e-06, "loss": 0.0, "num_input_tokens_seen": 170800960, "step": 79190 }, { "epoch": 14.533859423747476, "grad_norm": 0.00012503372272476554, "learning_rate": 2.108094441225641e-06, "loss": 0.0, "num_input_tokens_seen": 170812096, "step": 79195 }, { "epoch": 14.53477702330703, "grad_norm": 0.009822435677051544, "learning_rate": 2.1074412476928845e-06, "loss": 0.0, "num_input_tokens_seen": 170821920, "step": 79200 }, { "epoch": 14.53569462286658, "grad_norm": 0.0005669983220286667, "learning_rate": 2.1067881283497763e-06, "loss": 0.0, "num_input_tokens_seen": 170831456, "step": 79205 }, { "epoch": 14.536612222426132, "grad_norm": 0.0025343834422528744, "learning_rate": 2.1061350832130673e-06, "loss": 0.0, "num_input_tokens_seen": 170843136, "step": 79210 }, { "epoch": 14.537529821985686, "grad_norm": 0.0003540784236975014, "learning_rate": 2.10548211229951e-06, "loss": 0.0001, "num_input_tokens_seen": 170852672, "step": 79215 }, { "epoch": 14.538447421545238, "grad_norm": 0.0011386810801923275, "learning_rate": 2.1048292156258506e-06, "loss": 0.0, "num_input_tokens_seen": 170863392, "step": 79220 }, { "epoch": 14.53936502110479, "grad_norm": 0.00010348793875891715, "learning_rate": 2.104176393208834e-06, "loss": 0.0003, "num_input_tokens_seen": 170873184, "step": 79225 }, { "epoch": 14.540282620664343, "grad_norm": 0.00010526156984269619, "learning_rate": 2.1035236450652037e-06, "loss": 0.0, "num_input_tokens_seen": 170882944, "step": 79230 }, { "epoch": 14.541200220223894, "grad_norm": 0.00036315061151981354, "learning_rate": 2.1028709712117045e-06, "loss": 0.0, "num_input_tokens_seen": 170895104, "step": 79235 }, { "epoch": 14.542117819783446, "grad_norm": 7.601279503433034e-05, "learning_rate": 2.1022183716650744e-06, "loss": 0.0, "num_input_tokens_seen": 170904960, "step": 79240 }, { "epoch": 14.543035419343, "grad_norm": 0.00010049612319562584, "learning_rate": 2.1015658464420503e-06, "loss": 0.0, "num_input_tokens_seen": 170914880, "step": 79245 }, { "epoch": 14.54395301890255, "grad_norm": 0.0001420042826794088, "learning_rate": 2.1009133955593717e-06, "loss": 0.0012, "num_input_tokens_seen": 170926656, "step": 79250 }, { "epoch": 14.544870618462102, "grad_norm": 0.0006657313206233084, "learning_rate": 2.100261019033772e-06, "loss": 0.0, "num_input_tokens_seen": 170937120, "step": 79255 }, { "epoch": 14.545788218021656, "grad_norm": 0.0003042370663024485, "learning_rate": 2.0996087168819803e-06, "loss": 0.0, "num_input_tokens_seen": 170948192, "step": 79260 }, { "epoch": 14.546705817581207, "grad_norm": 0.0011268139351159334, "learning_rate": 2.098956489120732e-06, "loss": 0.0, "num_input_tokens_seen": 170959072, "step": 79265 }, { "epoch": 14.547623417140759, "grad_norm": 0.00010953119635814801, "learning_rate": 2.0983043357667537e-06, "loss": 0.0, "num_input_tokens_seen": 170970432, "step": 79270 }, { "epoch": 14.548541016700312, "grad_norm": 155.11077880859375, "learning_rate": 2.097652256836772e-06, "loss": 0.0822, "num_input_tokens_seen": 170980704, "step": 79275 }, { "epoch": 14.549458616259864, "grad_norm": 0.0023252281825989485, "learning_rate": 2.0970002523475093e-06, "loss": 0.0, "num_input_tokens_seen": 170990304, "step": 79280 }, { "epoch": 14.550376215819416, "grad_norm": 0.00013290272909216583, "learning_rate": 2.0963483223156933e-06, "loss": 0.0, "num_input_tokens_seen": 171002144, "step": 79285 }, { "epoch": 14.551293815378969, "grad_norm": 0.0004129843437112868, "learning_rate": 2.095696466758042e-06, "loss": 0.0, "num_input_tokens_seen": 171013792, "step": 79290 }, { "epoch": 14.55221141493852, "grad_norm": 9.624019730836153e-05, "learning_rate": 2.095044685691274e-06, "loss": 0.0, "num_input_tokens_seen": 171024544, "step": 79295 }, { "epoch": 14.553129014498072, "grad_norm": 0.011059225536882877, "learning_rate": 2.0943929791321086e-06, "loss": 0.0001, "num_input_tokens_seen": 171036000, "step": 79300 }, { "epoch": 14.554046614057626, "grad_norm": 0.0006688061403110623, "learning_rate": 2.0937413470972603e-06, "loss": 0.0, "num_input_tokens_seen": 171046656, "step": 79305 }, { "epoch": 14.554964213617177, "grad_norm": 46.199527740478516, "learning_rate": 2.0930897896034403e-06, "loss": 0.0733, "num_input_tokens_seen": 171057536, "step": 79310 }, { "epoch": 14.555881813176729, "grad_norm": 0.0003320390242151916, "learning_rate": 2.0924383066673636e-06, "loss": 0.0, "num_input_tokens_seen": 171068000, "step": 79315 }, { "epoch": 14.556799412736282, "grad_norm": 8.036263170652092e-05, "learning_rate": 2.091786898305739e-06, "loss": 0.0, "num_input_tokens_seen": 171079136, "step": 79320 }, { "epoch": 14.557717012295834, "grad_norm": 7.670791092095897e-05, "learning_rate": 2.091135564535272e-06, "loss": 0.0, "num_input_tokens_seen": 171089504, "step": 79325 }, { "epoch": 14.558634611855386, "grad_norm": 0.00010416894656373188, "learning_rate": 2.090484305372668e-06, "loss": 0.0, "num_input_tokens_seen": 171099680, "step": 79330 }, { "epoch": 14.559552211414939, "grad_norm": 7.32695116312243e-05, "learning_rate": 2.089833120834634e-06, "loss": 0.0, "num_input_tokens_seen": 171111392, "step": 79335 }, { "epoch": 14.56046981097449, "grad_norm": 0.0002610218361951411, "learning_rate": 2.08918201093787e-06, "loss": 0.0, "num_input_tokens_seen": 171123136, "step": 79340 }, { "epoch": 14.561387410534042, "grad_norm": 0.0010027140378952026, "learning_rate": 2.0885309756990747e-06, "loss": 0.0, "num_input_tokens_seen": 171134976, "step": 79345 }, { "epoch": 14.562305010093596, "grad_norm": 0.00035698252031579614, "learning_rate": 2.087880015134949e-06, "loss": 0.0, "num_input_tokens_seen": 171145920, "step": 79350 }, { "epoch": 14.563222609653147, "grad_norm": 0.0014296078588813543, "learning_rate": 2.087229129262187e-06, "loss": 0.0, "num_input_tokens_seen": 171157472, "step": 79355 }, { "epoch": 14.564140209212699, "grad_norm": 0.009871464222669601, "learning_rate": 2.0865783180974825e-06, "loss": 0.0, "num_input_tokens_seen": 171167584, "step": 79360 }, { "epoch": 14.565057808772252, "grad_norm": 0.0002879579842556268, "learning_rate": 2.08592758165753e-06, "loss": 0.0, "num_input_tokens_seen": 171177600, "step": 79365 }, { "epoch": 14.565975408331804, "grad_norm": 0.00014069188910070807, "learning_rate": 2.0852769199590187e-06, "loss": 0.0, "num_input_tokens_seen": 171188000, "step": 79370 }, { "epoch": 14.566893007891355, "grad_norm": 9.048303763847798e-05, "learning_rate": 2.0846263330186373e-06, "loss": 0.0, "num_input_tokens_seen": 171199808, "step": 79375 }, { "epoch": 14.567810607450909, "grad_norm": 0.0009130730177275836, "learning_rate": 2.0839758208530704e-06, "loss": 0.0, "num_input_tokens_seen": 171210528, "step": 79380 }, { "epoch": 14.56872820701046, "grad_norm": 0.0001573185290908441, "learning_rate": 2.083325383479005e-06, "loss": 0.0, "num_input_tokens_seen": 171219872, "step": 79385 }, { "epoch": 14.569645806570012, "grad_norm": 0.0002748886763583869, "learning_rate": 2.082675020913124e-06, "loss": 0.0, "num_input_tokens_seen": 171230432, "step": 79390 }, { "epoch": 14.570563406129565, "grad_norm": 0.0011578156845644116, "learning_rate": 2.0820247331721073e-06, "loss": 0.0002, "num_input_tokens_seen": 171242048, "step": 79395 }, { "epoch": 14.571481005689117, "grad_norm": 0.00010304059105692431, "learning_rate": 2.0813745202726315e-06, "loss": 0.0, "num_input_tokens_seen": 171253600, "step": 79400 }, { "epoch": 14.572398605248669, "grad_norm": 0.0002625719062052667, "learning_rate": 2.0807243822313776e-06, "loss": 0.0011, "num_input_tokens_seen": 171264672, "step": 79405 }, { "epoch": 14.573316204808222, "grad_norm": 0.005027096718549728, "learning_rate": 2.08007431906502e-06, "loss": 0.0, "num_input_tokens_seen": 171274592, "step": 79410 }, { "epoch": 14.574233804367774, "grad_norm": 9.272785246139392e-05, "learning_rate": 2.079424330790229e-06, "loss": 0.0, "num_input_tokens_seen": 171285312, "step": 79415 }, { "epoch": 14.575151403927325, "grad_norm": 0.00016508316912222654, "learning_rate": 2.0787744174236784e-06, "loss": 0.0, "num_input_tokens_seen": 171296192, "step": 79420 }, { "epoch": 14.576069003486879, "grad_norm": 0.00010962288797600195, "learning_rate": 2.078124578982036e-06, "loss": 0.0, "num_input_tokens_seen": 171306432, "step": 79425 }, { "epoch": 14.57698660304643, "grad_norm": 0.0007132433820515871, "learning_rate": 2.077474815481968e-06, "loss": 0.0, "num_input_tokens_seen": 171316992, "step": 79430 }, { "epoch": 14.577904202605982, "grad_norm": 0.004278321284800768, "learning_rate": 2.0768251269401435e-06, "loss": 0.0, "num_input_tokens_seen": 171327168, "step": 79435 }, { "epoch": 14.578821802165535, "grad_norm": 0.0025361517909914255, "learning_rate": 2.0761755133732236e-06, "loss": 0.0, "num_input_tokens_seen": 171336864, "step": 79440 }, { "epoch": 14.579739401725087, "grad_norm": 0.00011514002835610881, "learning_rate": 2.075525974797869e-06, "loss": 0.0, "num_input_tokens_seen": 171346688, "step": 79445 }, { "epoch": 14.580657001284639, "grad_norm": 0.0005165610346011817, "learning_rate": 2.074876511230742e-06, "loss": 0.0, "num_input_tokens_seen": 171357088, "step": 79450 }, { "epoch": 14.581574600844192, "grad_norm": 0.00010419452883070335, "learning_rate": 2.074227122688499e-06, "loss": 0.0, "num_input_tokens_seen": 171369056, "step": 79455 }, { "epoch": 14.582492200403744, "grad_norm": 0.0018113930709660053, "learning_rate": 2.0735778091877963e-06, "loss": 0.0, "num_input_tokens_seen": 171380544, "step": 79460 }, { "epoch": 14.583409799963295, "grad_norm": 0.0001571895700180903, "learning_rate": 2.0729285707452846e-06, "loss": 0.0, "num_input_tokens_seen": 171391680, "step": 79465 }, { "epoch": 14.584327399522849, "grad_norm": 0.0001433429861208424, "learning_rate": 2.072279407377621e-06, "loss": 0.0, "num_input_tokens_seen": 171402496, "step": 79470 }, { "epoch": 14.5852449990824, "grad_norm": 0.012917676940560341, "learning_rate": 2.0716303191014527e-06, "loss": 0.0, "num_input_tokens_seen": 171413280, "step": 79475 }, { "epoch": 14.586162598641952, "grad_norm": 0.00010132156603503972, "learning_rate": 2.070981305933426e-06, "loss": 0.0004, "num_input_tokens_seen": 171424608, "step": 79480 }, { "epoch": 14.587080198201505, "grad_norm": 0.0010478977346792817, "learning_rate": 2.0703323678901915e-06, "loss": 0.0, "num_input_tokens_seen": 171435424, "step": 79485 }, { "epoch": 14.587997797761057, "grad_norm": 0.00021687941625714302, "learning_rate": 2.069683504988391e-06, "loss": 0.0, "num_input_tokens_seen": 171445312, "step": 79490 }, { "epoch": 14.588915397320608, "grad_norm": 0.0001881760690594092, "learning_rate": 2.0690347172446655e-06, "loss": 0.0, "num_input_tokens_seen": 171456000, "step": 79495 }, { "epoch": 14.589832996880162, "grad_norm": 0.0068831900134682655, "learning_rate": 2.0683860046756587e-06, "loss": 0.0, "num_input_tokens_seen": 171467616, "step": 79500 }, { "epoch": 14.590750596439714, "grad_norm": 0.0016087462427094579, "learning_rate": 2.067737367298007e-06, "loss": 0.0, "num_input_tokens_seen": 171478688, "step": 79505 }, { "epoch": 14.591668195999265, "grad_norm": 0.0007673100917600095, "learning_rate": 2.067088805128348e-06, "loss": 0.0, "num_input_tokens_seen": 171488096, "step": 79510 }, { "epoch": 14.592585795558819, "grad_norm": 0.0022598726209253073, "learning_rate": 2.0664403181833125e-06, "loss": 0.0, "num_input_tokens_seen": 171499072, "step": 79515 }, { "epoch": 14.59350339511837, "grad_norm": 0.0002376632037339732, "learning_rate": 2.065791906479539e-06, "loss": 0.0, "num_input_tokens_seen": 171509312, "step": 79520 }, { "epoch": 14.594420994677922, "grad_norm": 0.00022299862757790834, "learning_rate": 2.0651435700336554e-06, "loss": 0.0, "num_input_tokens_seen": 171520352, "step": 79525 }, { "epoch": 14.595338594237475, "grad_norm": 0.00025588812422938645, "learning_rate": 2.0644953088622882e-06, "loss": 0.0, "num_input_tokens_seen": 171531840, "step": 79530 }, { "epoch": 14.596256193797027, "grad_norm": 0.00035583024146035314, "learning_rate": 2.0638471229820687e-06, "loss": 0.0, "num_input_tokens_seen": 171542592, "step": 79535 }, { "epoch": 14.597173793356578, "grad_norm": 0.00021915996330790222, "learning_rate": 2.0631990124096203e-06, "loss": 0.0, "num_input_tokens_seen": 171553632, "step": 79540 }, { "epoch": 14.598091392916132, "grad_norm": 0.0001623809803277254, "learning_rate": 2.0625509771615655e-06, "loss": 0.0, "num_input_tokens_seen": 171564544, "step": 79545 }, { "epoch": 14.599008992475683, "grad_norm": 0.019702009856700897, "learning_rate": 2.0619030172545236e-06, "loss": 0.0, "num_input_tokens_seen": 171575584, "step": 79550 }, { "epoch": 14.599926592035235, "grad_norm": 0.0005811397568322718, "learning_rate": 2.061255132705117e-06, "loss": 0.0, "num_input_tokens_seen": 171586944, "step": 79555 }, { "epoch": 14.600844191594788, "grad_norm": 0.014321229420602322, "learning_rate": 2.0606073235299625e-06, "loss": 0.0, "num_input_tokens_seen": 171597472, "step": 79560 }, { "epoch": 14.60176179115434, "grad_norm": 0.0010750812944024801, "learning_rate": 2.059959589745672e-06, "loss": 0.0, "num_input_tokens_seen": 171607232, "step": 79565 }, { "epoch": 14.602679390713892, "grad_norm": 0.0005461118998937309, "learning_rate": 2.0593119313688635e-06, "loss": 0.0, "num_input_tokens_seen": 171618272, "step": 79570 }, { "epoch": 14.603596990273445, "grad_norm": 0.0023580873385071754, "learning_rate": 2.058664348416146e-06, "loss": 0.0, "num_input_tokens_seen": 171629280, "step": 79575 }, { "epoch": 14.604514589832997, "grad_norm": 0.00022774837270844728, "learning_rate": 2.0580168409041278e-06, "loss": 0.0056, "num_input_tokens_seen": 171640320, "step": 79580 }, { "epoch": 14.605432189392548, "grad_norm": 9.096774738281965e-05, "learning_rate": 2.05736940884942e-06, "loss": 0.0, "num_input_tokens_seen": 171651360, "step": 79585 }, { "epoch": 14.606349788952102, "grad_norm": 0.0001236108801094815, "learning_rate": 2.056722052268626e-06, "loss": 0.0001, "num_input_tokens_seen": 171661888, "step": 79590 }, { "epoch": 14.607267388511653, "grad_norm": 0.00017876214405987412, "learning_rate": 2.0560747711783497e-06, "loss": 0.0, "num_input_tokens_seen": 171672352, "step": 79595 }, { "epoch": 14.608184988071205, "grad_norm": 0.008554517291486263, "learning_rate": 2.0554275655951903e-06, "loss": 0.0, "num_input_tokens_seen": 171684000, "step": 79600 }, { "epoch": 14.609102587630758, "grad_norm": 0.004892663098871708, "learning_rate": 2.054780435535753e-06, "loss": 0.0, "num_input_tokens_seen": 171695328, "step": 79605 }, { "epoch": 14.61002018719031, "grad_norm": 0.00022879964672029018, "learning_rate": 2.0541333810166326e-06, "loss": 0.0, "num_input_tokens_seen": 171706400, "step": 79610 }, { "epoch": 14.610937786749862, "grad_norm": 0.0003142578643746674, "learning_rate": 2.0534864020544247e-06, "loss": 0.0, "num_input_tokens_seen": 171717184, "step": 79615 }, { "epoch": 14.611855386309415, "grad_norm": 0.0007924332749098539, "learning_rate": 2.0528394986657247e-06, "loss": 0.0, "num_input_tokens_seen": 171727200, "step": 79620 }, { "epoch": 14.612772985868967, "grad_norm": 0.0009346547303721309, "learning_rate": 2.0521926708671215e-06, "loss": 0.0, "num_input_tokens_seen": 171737632, "step": 79625 }, { "epoch": 14.613690585428518, "grad_norm": 0.00026130740297958255, "learning_rate": 2.0515459186752094e-06, "loss": 0.0, "num_input_tokens_seen": 171748448, "step": 79630 }, { "epoch": 14.614608184988072, "grad_norm": 0.0004674301890190691, "learning_rate": 2.0508992421065755e-06, "loss": 0.1594, "num_input_tokens_seen": 171760256, "step": 79635 }, { "epoch": 14.615525784547623, "grad_norm": 0.00017284687783103436, "learning_rate": 2.0502526411778046e-06, "loss": 0.0, "num_input_tokens_seen": 171771456, "step": 79640 }, { "epoch": 14.616443384107175, "grad_norm": 0.00025534225278533995, "learning_rate": 2.049606115905482e-06, "loss": 0.0006, "num_input_tokens_seen": 171782432, "step": 79645 }, { "epoch": 14.617360983666728, "grad_norm": 32.30659103393555, "learning_rate": 2.0489596663061882e-06, "loss": 0.1045, "num_input_tokens_seen": 171793568, "step": 79650 }, { "epoch": 14.61827858322628, "grad_norm": 0.018617698922753334, "learning_rate": 2.048313292396507e-06, "loss": 0.0, "num_input_tokens_seen": 171804000, "step": 79655 }, { "epoch": 14.619196182785831, "grad_norm": 0.00017316559387836605, "learning_rate": 2.047666994193015e-06, "loss": 0.2156, "num_input_tokens_seen": 171814272, "step": 79660 }, { "epoch": 14.620113782345385, "grad_norm": 0.0012990875402465463, "learning_rate": 2.0470207717122875e-06, "loss": 0.0, "num_input_tokens_seen": 171825664, "step": 79665 }, { "epoch": 14.621031381904936, "grad_norm": 0.0013936106115579605, "learning_rate": 2.0463746249709016e-06, "loss": 0.0, "num_input_tokens_seen": 171836768, "step": 79670 }, { "epoch": 14.621948981464488, "grad_norm": 0.00020235010015312582, "learning_rate": 2.0457285539854295e-06, "loss": 0.0001, "num_input_tokens_seen": 171848320, "step": 79675 }, { "epoch": 14.622866581024041, "grad_norm": 0.9442126750946045, "learning_rate": 2.045082558772441e-06, "loss": 0.0006, "num_input_tokens_seen": 171859488, "step": 79680 }, { "epoch": 14.623784180583593, "grad_norm": 0.0005437283543869853, "learning_rate": 2.044436639348503e-06, "loss": 0.0, "num_input_tokens_seen": 171871168, "step": 79685 }, { "epoch": 14.624701780143145, "grad_norm": 8.532466017641127e-05, "learning_rate": 2.0437907957301873e-06, "loss": 0.0, "num_input_tokens_seen": 171883488, "step": 79690 }, { "epoch": 14.625619379702698, "grad_norm": 0.00027593248523771763, "learning_rate": 2.0431450279340554e-06, "loss": 0.0007, "num_input_tokens_seen": 171893344, "step": 79695 }, { "epoch": 14.62653697926225, "grad_norm": 0.0019642910920083523, "learning_rate": 2.0424993359766685e-06, "loss": 0.0, "num_input_tokens_seen": 171903520, "step": 79700 }, { "epoch": 14.627454578821801, "grad_norm": 0.0003745472931768745, "learning_rate": 2.0418537198745927e-06, "loss": 0.0, "num_input_tokens_seen": 171914240, "step": 79705 }, { "epoch": 14.628372178381355, "grad_norm": 0.00021861329150851816, "learning_rate": 2.0412081796443834e-06, "loss": 0.0001, "num_input_tokens_seen": 171925408, "step": 79710 }, { "epoch": 14.629289777940906, "grad_norm": 0.0005068815080448985, "learning_rate": 2.0405627153025974e-06, "loss": 0.1097, "num_input_tokens_seen": 171935872, "step": 79715 }, { "epoch": 14.630207377500458, "grad_norm": 0.0008648235816508532, "learning_rate": 2.0399173268657923e-06, "loss": 0.0, "num_input_tokens_seen": 171947136, "step": 79720 }, { "epoch": 14.631124977060011, "grad_norm": 0.06531298905611038, "learning_rate": 2.039272014350521e-06, "loss": 0.0, "num_input_tokens_seen": 171957504, "step": 79725 }, { "epoch": 14.632042576619563, "grad_norm": 0.00024406160810030997, "learning_rate": 2.0386267777733325e-06, "loss": 0.0, "num_input_tokens_seen": 171968928, "step": 79730 }, { "epoch": 14.632960176179115, "grad_norm": 0.015978865325450897, "learning_rate": 2.0379816171507764e-06, "loss": 0.0, "num_input_tokens_seen": 171979904, "step": 79735 }, { "epoch": 14.633877775738668, "grad_norm": 8.477843221044168e-05, "learning_rate": 2.0373365324994033e-06, "loss": 0.0, "num_input_tokens_seen": 171990560, "step": 79740 }, { "epoch": 14.63479537529822, "grad_norm": 0.026287741959095, "learning_rate": 2.036691523835756e-06, "loss": 0.0, "num_input_tokens_seen": 172002080, "step": 79745 }, { "epoch": 14.635712974857771, "grad_norm": 8.282971975859255e-05, "learning_rate": 2.036046591176376e-06, "loss": 0.0, "num_input_tokens_seen": 172012832, "step": 79750 }, { "epoch": 14.636630574417325, "grad_norm": 0.0012717965291813016, "learning_rate": 2.0354017345378098e-06, "loss": 0.0, "num_input_tokens_seen": 172023616, "step": 79755 }, { "epoch": 14.637548173976876, "grad_norm": 0.00017489648598711938, "learning_rate": 2.034756953936594e-06, "loss": 0.0029, "num_input_tokens_seen": 172034720, "step": 79760 }, { "epoch": 14.638465773536428, "grad_norm": 0.0017466078279539943, "learning_rate": 2.034112249389265e-06, "loss": 0.0616, "num_input_tokens_seen": 172045440, "step": 79765 }, { "epoch": 14.639383373095981, "grad_norm": 0.000174034052179195, "learning_rate": 2.033467620912362e-06, "loss": 0.0, "num_input_tokens_seen": 172056032, "step": 79770 }, { "epoch": 14.640300972655533, "grad_norm": 0.00014264267520047724, "learning_rate": 2.032823068522417e-06, "loss": 0.0, "num_input_tokens_seen": 172067072, "step": 79775 }, { "epoch": 14.641218572215084, "grad_norm": 0.00024649655097164214, "learning_rate": 2.032178592235961e-06, "loss": 0.0, "num_input_tokens_seen": 172078112, "step": 79780 }, { "epoch": 14.642136171774638, "grad_norm": 0.0002387812564847991, "learning_rate": 2.031534192069523e-06, "loss": 0.0, "num_input_tokens_seen": 172086848, "step": 79785 }, { "epoch": 14.64305377133419, "grad_norm": 0.0003688460565172136, "learning_rate": 2.030889868039634e-06, "loss": 0.0, "num_input_tokens_seen": 172096928, "step": 79790 }, { "epoch": 14.643971370893741, "grad_norm": 0.00043951201951131225, "learning_rate": 2.030245620162818e-06, "loss": 0.0, "num_input_tokens_seen": 172107328, "step": 79795 }, { "epoch": 14.644888970453295, "grad_norm": 0.00012675371544901282, "learning_rate": 2.0296014484555976e-06, "loss": 0.0, "num_input_tokens_seen": 172117856, "step": 79800 }, { "epoch": 14.645806570012846, "grad_norm": 0.003488334594294429, "learning_rate": 2.028957352934498e-06, "loss": 0.0, "num_input_tokens_seen": 172128576, "step": 79805 }, { "epoch": 14.646724169572398, "grad_norm": 0.00033412716584280133, "learning_rate": 2.028313333616038e-06, "loss": 0.0, "num_input_tokens_seen": 172139200, "step": 79810 }, { "epoch": 14.647641769131951, "grad_norm": 0.00045260967453941703, "learning_rate": 2.0276693905167344e-06, "loss": 0.0, "num_input_tokens_seen": 172150528, "step": 79815 }, { "epoch": 14.648559368691503, "grad_norm": 0.00033696461468935013, "learning_rate": 2.0270255236531032e-06, "loss": 0.0, "num_input_tokens_seen": 172162400, "step": 79820 }, { "epoch": 14.649476968251054, "grad_norm": 0.0002253675484098494, "learning_rate": 2.0263817330416612e-06, "loss": 0.0, "num_input_tokens_seen": 172172672, "step": 79825 }, { "epoch": 14.650394567810608, "grad_norm": 0.0021515104454010725, "learning_rate": 2.025738018698919e-06, "loss": 0.0, "num_input_tokens_seen": 172183520, "step": 79830 }, { "epoch": 14.65131216737016, "grad_norm": 0.00023827815311960876, "learning_rate": 2.025094380641387e-06, "loss": 0.0, "num_input_tokens_seen": 172192768, "step": 79835 }, { "epoch": 14.652229766929711, "grad_norm": 0.0282472874969244, "learning_rate": 2.0244508188855733e-06, "loss": 0.0001, "num_input_tokens_seen": 172203808, "step": 79840 }, { "epoch": 14.653147366489264, "grad_norm": 0.0005484744906425476, "learning_rate": 2.023807333447983e-06, "loss": 0.0001, "num_input_tokens_seen": 172214400, "step": 79845 }, { "epoch": 14.654064966048816, "grad_norm": 0.024366935715079308, "learning_rate": 2.0231639243451235e-06, "loss": 0.0, "num_input_tokens_seen": 172224960, "step": 79850 }, { "epoch": 14.654982565608368, "grad_norm": 0.0010749977082014084, "learning_rate": 2.022520591593495e-06, "loss": 0.0, "num_input_tokens_seen": 172236544, "step": 79855 }, { "epoch": 14.655900165167921, "grad_norm": 0.0002368008135817945, "learning_rate": 2.0218773352096e-06, "loss": 0.0, "num_input_tokens_seen": 172247584, "step": 79860 }, { "epoch": 14.656817764727473, "grad_norm": 0.000729003397282213, "learning_rate": 2.0212341552099347e-06, "loss": 0.0, "num_input_tokens_seen": 172257984, "step": 79865 }, { "epoch": 14.657735364287024, "grad_norm": 0.0009336767834611237, "learning_rate": 2.020591051610995e-06, "loss": 0.0, "num_input_tokens_seen": 172267840, "step": 79870 }, { "epoch": 14.658652963846578, "grad_norm": 0.005705502349883318, "learning_rate": 2.019948024429279e-06, "loss": 0.0, "num_input_tokens_seen": 172278624, "step": 79875 }, { "epoch": 14.65957056340613, "grad_norm": 0.00023930729366838932, "learning_rate": 2.019305073681278e-06, "loss": 0.0, "num_input_tokens_seen": 172288576, "step": 79880 }, { "epoch": 14.660488162965681, "grad_norm": 0.0007362107280641794, "learning_rate": 2.01866219938348e-06, "loss": 0.0, "num_input_tokens_seen": 172300416, "step": 79885 }, { "epoch": 14.661405762525234, "grad_norm": 0.0013934164308011532, "learning_rate": 2.0180194015523772e-06, "loss": 0.0, "num_input_tokens_seen": 172311904, "step": 79890 }, { "epoch": 14.662323362084786, "grad_norm": 0.00022502432693727314, "learning_rate": 2.017376680204456e-06, "loss": 0.0, "num_input_tokens_seen": 172322464, "step": 79895 }, { "epoch": 14.663240961644338, "grad_norm": 8.05337549536489e-05, "learning_rate": 2.016734035356198e-06, "loss": 0.0, "num_input_tokens_seen": 172331872, "step": 79900 }, { "epoch": 14.664158561203891, "grad_norm": 183.3745880126953, "learning_rate": 2.0160914670240906e-06, "loss": 0.0341, "num_input_tokens_seen": 172342624, "step": 79905 }, { "epoch": 14.665076160763443, "grad_norm": 0.0007415074505843222, "learning_rate": 2.015448975224612e-06, "loss": 0.0, "num_input_tokens_seen": 172353504, "step": 79910 }, { "epoch": 14.665993760322994, "grad_norm": 0.0001288859057240188, "learning_rate": 2.0148065599742416e-06, "loss": 0.0, "num_input_tokens_seen": 172365088, "step": 79915 }, { "epoch": 14.666911359882548, "grad_norm": 0.000293582008453086, "learning_rate": 2.0141642212894547e-06, "loss": 0.0, "num_input_tokens_seen": 172375968, "step": 79920 }, { "epoch": 14.6678289594421, "grad_norm": 0.0014724860666319728, "learning_rate": 2.0135219591867293e-06, "loss": 0.0, "num_input_tokens_seen": 172386336, "step": 79925 }, { "epoch": 14.66874655900165, "grad_norm": 0.0006768403109163046, "learning_rate": 2.0128797736825375e-06, "loss": 0.0, "num_input_tokens_seen": 172398144, "step": 79930 }, { "epoch": 14.669664158561204, "grad_norm": 0.002742734272032976, "learning_rate": 2.012237664793348e-06, "loss": 0.0, "num_input_tokens_seen": 172409632, "step": 79935 }, { "epoch": 14.670581758120756, "grad_norm": 0.0034524030052125454, "learning_rate": 2.011595632535633e-06, "loss": 0.0001, "num_input_tokens_seen": 172419904, "step": 79940 }, { "epoch": 14.671499357680307, "grad_norm": 0.00014439094229601324, "learning_rate": 2.010953676925859e-06, "loss": 0.0, "num_input_tokens_seen": 172430784, "step": 79945 }, { "epoch": 14.67241695723986, "grad_norm": 0.000630885420832783, "learning_rate": 2.010311797980489e-06, "loss": 0.0, "num_input_tokens_seen": 172442048, "step": 79950 }, { "epoch": 14.673334556799412, "grad_norm": 0.0015523708425462246, "learning_rate": 2.0096699957159886e-06, "loss": 0.0, "num_input_tokens_seen": 172453088, "step": 79955 }, { "epoch": 14.674252156358964, "grad_norm": 0.00035600681439973414, "learning_rate": 2.009028270148819e-06, "loss": 0.0, "num_input_tokens_seen": 172464160, "step": 79960 }, { "epoch": 14.675169755918517, "grad_norm": 1.2125047445297241, "learning_rate": 2.008386621295439e-06, "loss": 0.0001, "num_input_tokens_seen": 172475264, "step": 79965 }, { "epoch": 14.676087355478069, "grad_norm": 20.08113670349121, "learning_rate": 2.007745049172303e-06, "loss": 0.1469, "num_input_tokens_seen": 172484800, "step": 79970 }, { "epoch": 14.67700495503762, "grad_norm": 0.0001605239958735183, "learning_rate": 2.007103553795871e-06, "loss": 0.0002, "num_input_tokens_seen": 172494976, "step": 79975 }, { "epoch": 14.677922554597174, "grad_norm": 0.0024217613972723484, "learning_rate": 2.006462135182594e-06, "loss": 0.0, "num_input_tokens_seen": 172506528, "step": 79980 }, { "epoch": 14.678840154156726, "grad_norm": 0.0012176387244835496, "learning_rate": 2.005820793348923e-06, "loss": 0.0, "num_input_tokens_seen": 172517280, "step": 79985 }, { "epoch": 14.679757753716277, "grad_norm": 0.00023601172142662108, "learning_rate": 2.0051795283113085e-06, "loss": 0.0, "num_input_tokens_seen": 172527168, "step": 79990 }, { "epoch": 14.68067535327583, "grad_norm": 0.00018581307085696608, "learning_rate": 2.0045383400861985e-06, "loss": 0.0, "num_input_tokens_seen": 172537472, "step": 79995 }, { "epoch": 14.681592952835382, "grad_norm": 0.00033028339385055006, "learning_rate": 2.003897228690037e-06, "loss": 0.0, "num_input_tokens_seen": 172548768, "step": 80000 }, { "epoch": 14.682510552394934, "grad_norm": 0.026730341836810112, "learning_rate": 2.0032561941392663e-06, "loss": 0.0, "num_input_tokens_seen": 172560224, "step": 80005 }, { "epoch": 14.683428151954487, "grad_norm": 0.0027484640013426542, "learning_rate": 2.0026152364503313e-06, "loss": 0.0, "num_input_tokens_seen": 172571968, "step": 80010 }, { "epoch": 14.684345751514039, "grad_norm": 0.01091448962688446, "learning_rate": 2.0019743556396703e-06, "loss": 0.0, "num_input_tokens_seen": 172582880, "step": 80015 }, { "epoch": 14.68526335107359, "grad_norm": 0.00011038102820748463, "learning_rate": 2.0013335517237194e-06, "loss": 0.0006, "num_input_tokens_seen": 172594112, "step": 80020 }, { "epoch": 14.686180950633144, "grad_norm": 0.0003618699556682259, "learning_rate": 2.0006928247189162e-06, "loss": 0.0, "num_input_tokens_seen": 172605568, "step": 80025 }, { "epoch": 14.687098550192696, "grad_norm": 0.0007188970339484513, "learning_rate": 2.000052174641694e-06, "loss": 0.0, "num_input_tokens_seen": 172617056, "step": 80030 }, { "epoch": 14.688016149752247, "grad_norm": 0.0001396294974256307, "learning_rate": 1.9994116015084835e-06, "loss": 0.0, "num_input_tokens_seen": 172627648, "step": 80035 }, { "epoch": 14.6889337493118, "grad_norm": 0.00197772728279233, "learning_rate": 1.9987711053357134e-06, "loss": 0.0, "num_input_tokens_seen": 172639840, "step": 80040 }, { "epoch": 14.689851348871352, "grad_norm": 0.0001832916314015165, "learning_rate": 1.998130686139815e-06, "loss": 0.0, "num_input_tokens_seen": 172650944, "step": 80045 }, { "epoch": 14.690768948430904, "grad_norm": 0.00012109145609429106, "learning_rate": 1.9974903439372116e-06, "loss": 0.0, "num_input_tokens_seen": 172660352, "step": 80050 }, { "epoch": 14.691686547990457, "grad_norm": 0.014379207044839859, "learning_rate": 1.996850078744328e-06, "loss": 0.0, "num_input_tokens_seen": 172670880, "step": 80055 }, { "epoch": 14.692604147550009, "grad_norm": 0.0007117484346963465, "learning_rate": 1.996209890577585e-06, "loss": 0.0, "num_input_tokens_seen": 172681248, "step": 80060 }, { "epoch": 14.69352174710956, "grad_norm": 0.00033720486680977046, "learning_rate": 1.9955697794534012e-06, "loss": 0.0, "num_input_tokens_seen": 172692288, "step": 80065 }, { "epoch": 14.694439346669114, "grad_norm": 0.00012956347200088203, "learning_rate": 1.9949297453881977e-06, "loss": 0.0111, "num_input_tokens_seen": 172702976, "step": 80070 }, { "epoch": 14.695356946228666, "grad_norm": 0.0014637868152931333, "learning_rate": 1.994289788398389e-06, "loss": 0.0, "num_input_tokens_seen": 172714304, "step": 80075 }, { "epoch": 14.696274545788217, "grad_norm": 0.0015309583395719528, "learning_rate": 1.993649908500389e-06, "loss": 0.0, "num_input_tokens_seen": 172724192, "step": 80080 }, { "epoch": 14.69719214534777, "grad_norm": 0.00012837362010031939, "learning_rate": 1.9930101057106076e-06, "loss": 0.0, "num_input_tokens_seen": 172734112, "step": 80085 }, { "epoch": 14.698109744907322, "grad_norm": 8.66971822688356e-05, "learning_rate": 1.992370380045458e-06, "loss": 0.0733, "num_input_tokens_seen": 172745440, "step": 80090 }, { "epoch": 14.699027344466874, "grad_norm": 0.0032943999394774437, "learning_rate": 1.9917307315213468e-06, "loss": 0.0, "num_input_tokens_seen": 172756608, "step": 80095 }, { "epoch": 14.699944944026427, "grad_norm": 0.0029650635551661253, "learning_rate": 1.99109116015468e-06, "loss": 0.0, "num_input_tokens_seen": 172766240, "step": 80100 }, { "epoch": 14.700862543585979, "grad_norm": 0.002459977986291051, "learning_rate": 1.99045166596186e-06, "loss": 0.0883, "num_input_tokens_seen": 172776256, "step": 80105 }, { "epoch": 14.70178014314553, "grad_norm": 0.0018979761516675353, "learning_rate": 1.989812248959292e-06, "loss": 0.0, "num_input_tokens_seen": 172786880, "step": 80110 }, { "epoch": 14.702697742705084, "grad_norm": 0.0007522857049480081, "learning_rate": 1.9891729091633743e-06, "loss": 0.0, "num_input_tokens_seen": 172798432, "step": 80115 }, { "epoch": 14.703615342264635, "grad_norm": 0.000985983177088201, "learning_rate": 1.9885336465905035e-06, "loss": 0.0, "num_input_tokens_seen": 172809280, "step": 80120 }, { "epoch": 14.704532941824187, "grad_norm": 0.004720374010503292, "learning_rate": 1.9878944612570793e-06, "loss": 0.0, "num_input_tokens_seen": 172820480, "step": 80125 }, { "epoch": 14.70545054138374, "grad_norm": 0.00014480565732810646, "learning_rate": 1.9872553531794936e-06, "loss": 0.0, "num_input_tokens_seen": 172830336, "step": 80130 }, { "epoch": 14.706368140943292, "grad_norm": 0.0004288386844564229, "learning_rate": 1.9866163223741386e-06, "loss": 0.0, "num_input_tokens_seen": 172841952, "step": 80135 }, { "epoch": 14.707285740502844, "grad_norm": 0.00046917659346945584, "learning_rate": 1.985977368857403e-06, "loss": 0.0, "num_input_tokens_seen": 172851584, "step": 80140 }, { "epoch": 14.708203340062397, "grad_norm": 0.0023530307225883007, "learning_rate": 1.9853384926456785e-06, "loss": 0.0, "num_input_tokens_seen": 172862144, "step": 80145 }, { "epoch": 14.709120939621949, "grad_norm": 0.0004157471121288836, "learning_rate": 1.984699693755349e-06, "loss": 0.0, "num_input_tokens_seen": 172873504, "step": 80150 }, { "epoch": 14.7100385391815, "grad_norm": 0.0004585337592288852, "learning_rate": 1.9840609722027976e-06, "loss": 0.0, "num_input_tokens_seen": 172883616, "step": 80155 }, { "epoch": 14.710956138741054, "grad_norm": 0.4500128924846649, "learning_rate": 1.9834223280044097e-06, "loss": 0.0001, "num_input_tokens_seen": 172893152, "step": 80160 }, { "epoch": 14.711873738300605, "grad_norm": 0.0030562051106244326, "learning_rate": 1.9827837611765638e-06, "loss": 0.0, "num_input_tokens_seen": 172903872, "step": 80165 }, { "epoch": 14.712791337860157, "grad_norm": 0.0014030078891664743, "learning_rate": 1.9821452717356367e-06, "loss": 0.0, "num_input_tokens_seen": 172914368, "step": 80170 }, { "epoch": 14.71370893741971, "grad_norm": 0.0019223594572395086, "learning_rate": 1.9815068596980077e-06, "loss": 0.0, "num_input_tokens_seen": 172925312, "step": 80175 }, { "epoch": 14.714626536979262, "grad_norm": 0.0015276808990165591, "learning_rate": 1.9808685250800493e-06, "loss": 0.0001, "num_input_tokens_seen": 172935744, "step": 80180 }, { "epoch": 14.715544136538814, "grad_norm": 0.00014678531442768872, "learning_rate": 1.980230267898134e-06, "loss": 0.0, "num_input_tokens_seen": 172946336, "step": 80185 }, { "epoch": 14.716461736098367, "grad_norm": 0.07473605126142502, "learning_rate": 1.9795920881686305e-06, "loss": 0.0001, "num_input_tokens_seen": 172956704, "step": 80190 }, { "epoch": 14.717379335657919, "grad_norm": 0.000426117709139362, "learning_rate": 1.9789539859079103e-06, "loss": 0.0, "num_input_tokens_seen": 172967232, "step": 80195 }, { "epoch": 14.71829693521747, "grad_norm": 0.0004296113329473883, "learning_rate": 1.9783159611323383e-06, "loss": 0.0, "num_input_tokens_seen": 172978144, "step": 80200 }, { "epoch": 14.719214534777024, "grad_norm": 0.006034179590642452, "learning_rate": 1.9776780138582768e-06, "loss": 0.0, "num_input_tokens_seen": 172987840, "step": 80205 }, { "epoch": 14.720132134336575, "grad_norm": 0.00012582798080984503, "learning_rate": 1.977040144102092e-06, "loss": 0.0, "num_input_tokens_seen": 172998304, "step": 80210 }, { "epoch": 14.721049733896127, "grad_norm": 0.00047148167504929006, "learning_rate": 1.976402351880142e-06, "loss": 0.0, "num_input_tokens_seen": 173009632, "step": 80215 }, { "epoch": 14.72196733345568, "grad_norm": 0.00010994128388119861, "learning_rate": 1.9757646372087845e-06, "loss": 0.0, "num_input_tokens_seen": 173020736, "step": 80220 }, { "epoch": 14.722884933015232, "grad_norm": 0.0013488823315128684, "learning_rate": 1.9751270001043782e-06, "loss": 0.0002, "num_input_tokens_seen": 173031424, "step": 80225 }, { "epoch": 14.723802532574783, "grad_norm": 0.002392376074567437, "learning_rate": 1.974489440583276e-06, "loss": 0.0, "num_input_tokens_seen": 173041568, "step": 80230 }, { "epoch": 14.724720132134337, "grad_norm": 0.00041076564230024815, "learning_rate": 1.973851958661831e-06, "loss": 0.0, "num_input_tokens_seen": 173052352, "step": 80235 }, { "epoch": 14.725637731693888, "grad_norm": 0.00025699925026856363, "learning_rate": 1.9732145543563913e-06, "loss": 0.0, "num_input_tokens_seen": 173063456, "step": 80240 }, { "epoch": 14.72655533125344, "grad_norm": 0.01440435741096735, "learning_rate": 1.9725772276833087e-06, "loss": 0.0, "num_input_tokens_seen": 173073600, "step": 80245 }, { "epoch": 14.727472930812993, "grad_norm": 0.0003226289409212768, "learning_rate": 1.971939978658928e-06, "loss": 0.0, "num_input_tokens_seen": 173082624, "step": 80250 }, { "epoch": 14.728390530372545, "grad_norm": 0.0009611611603759229, "learning_rate": 1.9713028072995945e-06, "loss": 0.0, "num_input_tokens_seen": 173092928, "step": 80255 }, { "epoch": 14.729308129932097, "grad_norm": 0.0012916127452626824, "learning_rate": 1.9706657136216477e-06, "loss": 0.0, "num_input_tokens_seen": 173105056, "step": 80260 }, { "epoch": 14.73022572949165, "grad_norm": 0.0004979968070983887, "learning_rate": 1.970028697641432e-06, "loss": 0.0, "num_input_tokens_seen": 173116256, "step": 80265 }, { "epoch": 14.731143329051202, "grad_norm": 0.000690632383339107, "learning_rate": 1.9693917593752843e-06, "loss": 0.0, "num_input_tokens_seen": 173126112, "step": 80270 }, { "epoch": 14.732060928610753, "grad_norm": 0.001100728870369494, "learning_rate": 1.968754898839541e-06, "loss": 0.0, "num_input_tokens_seen": 173137376, "step": 80275 }, { "epoch": 14.732978528170307, "grad_norm": 0.0002393128233961761, "learning_rate": 1.9681181160505364e-06, "loss": 0.0, "num_input_tokens_seen": 173148544, "step": 80280 }, { "epoch": 14.733896127729858, "grad_norm": 0.00105368928052485, "learning_rate": 1.967481411024603e-06, "loss": 0.0, "num_input_tokens_seen": 173159392, "step": 80285 }, { "epoch": 14.73481372728941, "grad_norm": 0.0007911354186944664, "learning_rate": 1.9668447837780704e-06, "loss": 0.0002, "num_input_tokens_seen": 173170656, "step": 80290 }, { "epoch": 14.735731326848963, "grad_norm": 0.002123347483575344, "learning_rate": 1.9662082343272693e-06, "loss": 0.0, "num_input_tokens_seen": 173181280, "step": 80295 }, { "epoch": 14.736648926408515, "grad_norm": 9.909442451316863e-05, "learning_rate": 1.9655717626885244e-06, "loss": 0.0, "num_input_tokens_seen": 173192576, "step": 80300 }, { "epoch": 14.737566525968067, "grad_norm": 0.0004906078684143722, "learning_rate": 1.96493536887816e-06, "loss": 0.0, "num_input_tokens_seen": 173202432, "step": 80305 }, { "epoch": 14.73848412552762, "grad_norm": 0.00011099874245701358, "learning_rate": 1.9642990529125013e-06, "loss": 0.0056, "num_input_tokens_seen": 173213728, "step": 80310 }, { "epoch": 14.739401725087172, "grad_norm": 0.00012064661859767511, "learning_rate": 1.963662814807867e-06, "loss": 0.0, "num_input_tokens_seen": 173224128, "step": 80315 }, { "epoch": 14.740319324646723, "grad_norm": 0.0005358452326618135, "learning_rate": 1.963026654580575e-06, "loss": 0.0, "num_input_tokens_seen": 173235616, "step": 80320 }, { "epoch": 14.741236924206277, "grad_norm": 8.588933997089043e-05, "learning_rate": 1.962390572246941e-06, "loss": 0.0, "num_input_tokens_seen": 173247680, "step": 80325 }, { "epoch": 14.742154523765828, "grad_norm": 0.00914919376373291, "learning_rate": 1.9617545678232826e-06, "loss": 0.0011, "num_input_tokens_seen": 173259104, "step": 80330 }, { "epoch": 14.74307212332538, "grad_norm": 0.06756418198347092, "learning_rate": 1.961118641325911e-06, "loss": 0.0, "num_input_tokens_seen": 173270496, "step": 80335 }, { "epoch": 14.743989722884933, "grad_norm": 0.018064213916659355, "learning_rate": 1.960482792771134e-06, "loss": 0.0352, "num_input_tokens_seen": 173279808, "step": 80340 }, { "epoch": 14.744907322444485, "grad_norm": 0.01126941479742527, "learning_rate": 1.9598470221752646e-06, "loss": 0.0, "num_input_tokens_seen": 173290400, "step": 80345 }, { "epoch": 14.745824922004036, "grad_norm": 0.00041953710024245083, "learning_rate": 1.9592113295546077e-06, "loss": 0.0, "num_input_tokens_seen": 173302048, "step": 80350 }, { "epoch": 14.74674252156359, "grad_norm": 0.0008284881478175521, "learning_rate": 1.958575714925465e-06, "loss": 0.0, "num_input_tokens_seen": 173313344, "step": 80355 }, { "epoch": 14.747660121123142, "grad_norm": 0.00032921467209234834, "learning_rate": 1.9579401783041435e-06, "loss": 0.0, "num_input_tokens_seen": 173324448, "step": 80360 }, { "epoch": 14.748577720682695, "grad_norm": 0.00021857788669876754, "learning_rate": 1.9573047197069415e-06, "loss": 0.0, "num_input_tokens_seen": 173334848, "step": 80365 }, { "epoch": 14.749495320242247, "grad_norm": 0.00014466377615462989, "learning_rate": 1.9566693391501583e-06, "loss": 0.0, "num_input_tokens_seen": 173345184, "step": 80370 }, { "epoch": 14.750412919801798, "grad_norm": 0.0003186320827808231, "learning_rate": 1.9560340366500874e-06, "loss": 0.0, "num_input_tokens_seen": 173355808, "step": 80375 }, { "epoch": 14.751330519361352, "grad_norm": 8.794367749942467e-05, "learning_rate": 1.955398812223028e-06, "loss": 0.0, "num_input_tokens_seen": 173366976, "step": 80380 }, { "epoch": 14.752248118920903, "grad_norm": 0.00017599169223103672, "learning_rate": 1.95476366588527e-06, "loss": 0.0, "num_input_tokens_seen": 173377120, "step": 80385 }, { "epoch": 14.753165718480455, "grad_norm": 0.00021774152992293239, "learning_rate": 1.9541285976531026e-06, "loss": 0.0, "num_input_tokens_seen": 173387776, "step": 80390 }, { "epoch": 14.754083318040008, "grad_norm": 0.000353815034031868, "learning_rate": 1.953493607542818e-06, "loss": 0.0, "num_input_tokens_seen": 173399008, "step": 80395 }, { "epoch": 14.75500091759956, "grad_norm": 0.00014918617671355605, "learning_rate": 1.9528586955707e-06, "loss": 0.0425, "num_input_tokens_seen": 173409024, "step": 80400 }, { "epoch": 14.755918517159111, "grad_norm": 0.0001184102802653797, "learning_rate": 1.9522238617530324e-06, "loss": 0.0, "num_input_tokens_seen": 173419520, "step": 80405 }, { "epoch": 14.756836116718665, "grad_norm": 0.0022569044958800077, "learning_rate": 1.9515891061061016e-06, "loss": 0.0, "num_input_tokens_seen": 173429792, "step": 80410 }, { "epoch": 14.757753716278216, "grad_norm": 0.006578864995390177, "learning_rate": 1.9509544286461852e-06, "loss": 0.0, "num_input_tokens_seen": 173440416, "step": 80415 }, { "epoch": 14.758671315837768, "grad_norm": 0.001594206434674561, "learning_rate": 1.9503198293895615e-06, "loss": 0.0, "num_input_tokens_seen": 173451520, "step": 80420 }, { "epoch": 14.759588915397321, "grad_norm": 0.0029355662409216166, "learning_rate": 1.9496853083525065e-06, "loss": 0.001, "num_input_tokens_seen": 173461280, "step": 80425 }, { "epoch": 14.760506514956873, "grad_norm": 0.00036446796730160713, "learning_rate": 1.9490508655512974e-06, "loss": 0.0, "num_input_tokens_seen": 173472768, "step": 80430 }, { "epoch": 14.761424114516425, "grad_norm": 0.001339743030257523, "learning_rate": 1.948416501002205e-06, "loss": 0.0, "num_input_tokens_seen": 173483776, "step": 80435 }, { "epoch": 14.762341714075978, "grad_norm": 0.0004018152831122279, "learning_rate": 1.9477822147214983e-06, "loss": 0.0, "num_input_tokens_seen": 173494624, "step": 80440 }, { "epoch": 14.76325931363553, "grad_norm": 0.0008409059955738485, "learning_rate": 1.9471480067254484e-06, "loss": 0.0, "num_input_tokens_seen": 173505952, "step": 80445 }, { "epoch": 14.764176913195081, "grad_norm": 0.0010329469805583358, "learning_rate": 1.946513877030321e-06, "loss": 0.0, "num_input_tokens_seen": 173517472, "step": 80450 }, { "epoch": 14.765094512754635, "grad_norm": 0.0002590034273453057, "learning_rate": 1.9458798256523804e-06, "loss": 0.0, "num_input_tokens_seen": 173528512, "step": 80455 }, { "epoch": 14.766012112314186, "grad_norm": 0.00012014457024633884, "learning_rate": 1.9452458526078867e-06, "loss": 0.0, "num_input_tokens_seen": 173538720, "step": 80460 }, { "epoch": 14.766929711873738, "grad_norm": 0.00013557802594732493, "learning_rate": 1.9446119579131045e-06, "loss": 0.0, "num_input_tokens_seen": 173550208, "step": 80465 }, { "epoch": 14.767847311433291, "grad_norm": 0.0002332335861865431, "learning_rate": 1.9439781415842903e-06, "loss": 0.0, "num_input_tokens_seen": 173561408, "step": 80470 }, { "epoch": 14.768764910992843, "grad_norm": 0.0002374570758547634, "learning_rate": 1.9433444036376997e-06, "loss": 0.0097, "num_input_tokens_seen": 173572224, "step": 80475 }, { "epoch": 14.769682510552395, "grad_norm": 0.00025782297598198056, "learning_rate": 1.9427107440895865e-06, "loss": 0.0, "num_input_tokens_seen": 173583616, "step": 80480 }, { "epoch": 14.770600110111948, "grad_norm": 0.0004113771428819746, "learning_rate": 1.9420771629562057e-06, "loss": 0.0005, "num_input_tokens_seen": 173594848, "step": 80485 }, { "epoch": 14.7715177096715, "grad_norm": 8.526971214450896e-05, "learning_rate": 1.941443660253807e-06, "loss": 0.0, "num_input_tokens_seen": 173604384, "step": 80490 }, { "epoch": 14.772435309231051, "grad_norm": 0.0012817594688385725, "learning_rate": 1.9408102359986375e-06, "loss": 0.0, "num_input_tokens_seen": 173616064, "step": 80495 }, { "epoch": 14.773352908790605, "grad_norm": 9.505576599622145e-05, "learning_rate": 1.940176890206944e-06, "loss": 0.0, "num_input_tokens_seen": 173626400, "step": 80500 }, { "epoch": 14.774270508350156, "grad_norm": 0.000723024713806808, "learning_rate": 1.9395436228949715e-06, "loss": 0.0, "num_input_tokens_seen": 173635648, "step": 80505 }, { "epoch": 14.775188107909708, "grad_norm": 0.0017584143206477165, "learning_rate": 1.93891043407896e-06, "loss": 0.0, "num_input_tokens_seen": 173647360, "step": 80510 }, { "epoch": 14.776105707469261, "grad_norm": 0.00014338248001877218, "learning_rate": 1.938277323775153e-06, "loss": 0.0, "num_input_tokens_seen": 173658144, "step": 80515 }, { "epoch": 14.777023307028813, "grad_norm": 0.00972991157323122, "learning_rate": 1.937644291999788e-06, "loss": 0.0, "num_input_tokens_seen": 173668160, "step": 80520 }, { "epoch": 14.777940906588364, "grad_norm": 0.00012826306920032948, "learning_rate": 1.937011338769098e-06, "loss": 0.0, "num_input_tokens_seen": 173679424, "step": 80525 }, { "epoch": 14.778858506147918, "grad_norm": 0.7376605272293091, "learning_rate": 1.9363784640993223e-06, "loss": 0.0001, "num_input_tokens_seen": 173691520, "step": 80530 }, { "epoch": 14.77977610570747, "grad_norm": 0.0008677936275489628, "learning_rate": 1.935745668006691e-06, "loss": 0.0, "num_input_tokens_seen": 173702880, "step": 80535 }, { "epoch": 14.780693705267021, "grad_norm": 0.0250571109354496, "learning_rate": 1.9351129505074317e-06, "loss": 0.0, "num_input_tokens_seen": 173714880, "step": 80540 }, { "epoch": 14.781611304826574, "grad_norm": 0.00026016769697889686, "learning_rate": 1.9344803116177772e-06, "loss": 0.0, "num_input_tokens_seen": 173724768, "step": 80545 }, { "epoch": 14.782528904386126, "grad_norm": 0.0013995693298056722, "learning_rate": 1.9338477513539523e-06, "loss": 0.0, "num_input_tokens_seen": 173735424, "step": 80550 }, { "epoch": 14.783446503945678, "grad_norm": 0.0004329232906457037, "learning_rate": 1.9332152697321793e-06, "loss": 0.0, "num_input_tokens_seen": 173747168, "step": 80555 }, { "epoch": 14.784364103505231, "grad_norm": 0.0002996204129885882, "learning_rate": 1.932582866768681e-06, "loss": 0.0, "num_input_tokens_seen": 173757152, "step": 80560 }, { "epoch": 14.785281703064783, "grad_norm": 0.00016048279940150678, "learning_rate": 1.9319505424796784e-06, "loss": 0.0, "num_input_tokens_seen": 173768224, "step": 80565 }, { "epoch": 14.786199302624334, "grad_norm": 0.00022628098668064922, "learning_rate": 1.9313182968813902e-06, "loss": 0.0, "num_input_tokens_seen": 173779136, "step": 80570 }, { "epoch": 14.787116902183888, "grad_norm": 0.0005559824639931321, "learning_rate": 1.9306861299900303e-06, "loss": 0.0451, "num_input_tokens_seen": 173790784, "step": 80575 }, { "epoch": 14.78803450174344, "grad_norm": 7.697237015236169e-05, "learning_rate": 1.9300540418218155e-06, "loss": 0.0, "num_input_tokens_seen": 173801760, "step": 80580 }, { "epoch": 14.788952101302991, "grad_norm": 0.005422461312264204, "learning_rate": 1.9294220323929564e-06, "loss": 0.0, "num_input_tokens_seen": 173812992, "step": 80585 }, { "epoch": 14.789869700862544, "grad_norm": 0.0001584331039339304, "learning_rate": 1.9287901017196613e-06, "loss": 0.0, "num_input_tokens_seen": 173823872, "step": 80590 }, { "epoch": 14.790787300422096, "grad_norm": 0.0003195503377355635, "learning_rate": 1.9281582498181424e-06, "loss": 0.04, "num_input_tokens_seen": 173835072, "step": 80595 }, { "epoch": 14.791704899981648, "grad_norm": 0.00020422144734766334, "learning_rate": 1.927526476704603e-06, "loss": 0.0001, "num_input_tokens_seen": 173845856, "step": 80600 }, { "epoch": 14.792622499541201, "grad_norm": 0.0004501910007093102, "learning_rate": 1.9268947823952476e-06, "loss": 0.0, "num_input_tokens_seen": 173855552, "step": 80605 }, { "epoch": 14.793540099100753, "grad_norm": 0.0002847796131391078, "learning_rate": 1.926263166906277e-06, "loss": 0.0, "num_input_tokens_seen": 173866976, "step": 80610 }, { "epoch": 14.794457698660304, "grad_norm": 0.1228926032781601, "learning_rate": 1.9256316302538935e-06, "loss": 0.0001, "num_input_tokens_seen": 173876512, "step": 80615 }, { "epoch": 14.795375298219858, "grad_norm": 0.0005195586127229035, "learning_rate": 1.925000172454294e-06, "loss": 0.0, "num_input_tokens_seen": 173888448, "step": 80620 }, { "epoch": 14.79629289777941, "grad_norm": 0.0004061188083142042, "learning_rate": 1.9243687935236725e-06, "loss": 0.0, "num_input_tokens_seen": 173899072, "step": 80625 }, { "epoch": 14.79721049733896, "grad_norm": 0.0014333492144942284, "learning_rate": 1.9237374934782266e-06, "loss": 0.0, "num_input_tokens_seen": 173909088, "step": 80630 }, { "epoch": 14.798128096898514, "grad_norm": 0.00028873104020021856, "learning_rate": 1.9231062723341458e-06, "loss": 0.0, "num_input_tokens_seen": 173920224, "step": 80635 }, { "epoch": 14.799045696458066, "grad_norm": 0.00017973539070226252, "learning_rate": 1.9224751301076206e-06, "loss": 0.0, "num_input_tokens_seen": 173930880, "step": 80640 }, { "epoch": 14.799963296017618, "grad_norm": 7.358068251051009e-05, "learning_rate": 1.9218440668148367e-06, "loss": 0.0, "num_input_tokens_seen": 173942592, "step": 80645 }, { "epoch": 14.800880895577171, "grad_norm": 0.0010909978300333023, "learning_rate": 1.921213082471984e-06, "loss": 0.0, "num_input_tokens_seen": 173953248, "step": 80650 }, { "epoch": 14.801798495136723, "grad_norm": 0.0004314104444347322, "learning_rate": 1.9205821770952433e-06, "loss": 0.0, "num_input_tokens_seen": 173963328, "step": 80655 }, { "epoch": 14.802716094696274, "grad_norm": 0.0003704352711793035, "learning_rate": 1.9199513507007954e-06, "loss": 0.0, "num_input_tokens_seen": 173974720, "step": 80660 }, { "epoch": 14.803633694255828, "grad_norm": 0.0002700005134101957, "learning_rate": 1.919320603304824e-06, "loss": 0.0008, "num_input_tokens_seen": 173985696, "step": 80665 }, { "epoch": 14.80455129381538, "grad_norm": 0.0001755407574819401, "learning_rate": 1.9186899349235044e-06, "loss": 0.0, "num_input_tokens_seen": 173997120, "step": 80670 }, { "epoch": 14.80546889337493, "grad_norm": 0.0003778722893912345, "learning_rate": 1.9180593455730107e-06, "loss": 0.0, "num_input_tokens_seen": 174009184, "step": 80675 }, { "epoch": 14.806386492934484, "grad_norm": 0.003192183095961809, "learning_rate": 1.9174288352695197e-06, "loss": 0.0, "num_input_tokens_seen": 174020032, "step": 80680 }, { "epoch": 14.807304092494036, "grad_norm": 0.00014300469774752855, "learning_rate": 1.9167984040292016e-06, "loss": 0.0, "num_input_tokens_seen": 174029056, "step": 80685 }, { "epoch": 14.808221692053587, "grad_norm": 0.0007125247502699494, "learning_rate": 1.916168051868226e-06, "loss": 0.0, "num_input_tokens_seen": 174040096, "step": 80690 }, { "epoch": 14.80913929161314, "grad_norm": 0.0004298123240005225, "learning_rate": 1.91553777880276e-06, "loss": 0.0, "num_input_tokens_seen": 174050112, "step": 80695 }, { "epoch": 14.810056891172692, "grad_norm": 0.000152234744746238, "learning_rate": 1.9149075848489698e-06, "loss": 0.0006, "num_input_tokens_seen": 174060960, "step": 80700 }, { "epoch": 14.810974490732244, "grad_norm": 0.00032129770261235535, "learning_rate": 1.9142774700230167e-06, "loss": 0.0, "num_input_tokens_seen": 174070944, "step": 80705 }, { "epoch": 14.811892090291797, "grad_norm": 0.0003133592545054853, "learning_rate": 1.913647434341066e-06, "loss": 0.0, "num_input_tokens_seen": 174081632, "step": 80710 }, { "epoch": 14.812809689851349, "grad_norm": 0.0001562369434395805, "learning_rate": 1.913017477819275e-06, "loss": 0.0, "num_input_tokens_seen": 174091200, "step": 80715 }, { "epoch": 14.8137272894109, "grad_norm": 0.007006433326750994, "learning_rate": 1.912387600473801e-06, "loss": 0.0, "num_input_tokens_seen": 174101696, "step": 80720 }, { "epoch": 14.814644888970454, "grad_norm": 0.000869409297592938, "learning_rate": 1.911757802320799e-06, "loss": 0.0, "num_input_tokens_seen": 174112480, "step": 80725 }, { "epoch": 14.815562488530006, "grad_norm": 0.01298054214566946, "learning_rate": 1.911128083376424e-06, "loss": 0.0, "num_input_tokens_seen": 174123936, "step": 80730 }, { "epoch": 14.816480088089557, "grad_norm": 0.00025218838709406555, "learning_rate": 1.9104984436568263e-06, "loss": 0.0, "num_input_tokens_seen": 174135104, "step": 80735 }, { "epoch": 14.81739768764911, "grad_norm": 0.0002421640238026157, "learning_rate": 1.909868883178155e-06, "loss": 0.0, "num_input_tokens_seen": 174145376, "step": 80740 }, { "epoch": 14.818315287208662, "grad_norm": 0.0026957399677485228, "learning_rate": 1.9092394019565564e-06, "loss": 0.0, "num_input_tokens_seen": 174155744, "step": 80745 }, { "epoch": 14.819232886768214, "grad_norm": 0.001445862464606762, "learning_rate": 1.9086100000081786e-06, "loss": 0.0, "num_input_tokens_seen": 174165376, "step": 80750 }, { "epoch": 14.820150486327767, "grad_norm": 0.0002858456573449075, "learning_rate": 1.9079806773491625e-06, "loss": 0.0001, "num_input_tokens_seen": 174175456, "step": 80755 }, { "epoch": 14.821068085887319, "grad_norm": 5.9164314734516665e-05, "learning_rate": 1.9073514339956487e-06, "loss": 0.0, "num_input_tokens_seen": 174184800, "step": 80760 }, { "epoch": 14.82198568544687, "grad_norm": 0.00042750081047415733, "learning_rate": 1.9067222699637794e-06, "loss": 0.0, "num_input_tokens_seen": 174196128, "step": 80765 }, { "epoch": 14.822903285006424, "grad_norm": 0.017021991312503815, "learning_rate": 1.906093185269689e-06, "loss": 0.0, "num_input_tokens_seen": 174206016, "step": 80770 }, { "epoch": 14.823820884565976, "grad_norm": 0.0019158001523464918, "learning_rate": 1.9054641799295136e-06, "loss": 0.0, "num_input_tokens_seen": 174216128, "step": 80775 }, { "epoch": 14.824738484125527, "grad_norm": 0.00021090268273837864, "learning_rate": 1.9048352539593845e-06, "loss": 0.0, "num_input_tokens_seen": 174226144, "step": 80780 }, { "epoch": 14.82565608368508, "grad_norm": 0.0018962783506140113, "learning_rate": 1.9042064073754352e-06, "loss": 0.0, "num_input_tokens_seen": 174238592, "step": 80785 }, { "epoch": 14.826573683244632, "grad_norm": 0.0001628616446396336, "learning_rate": 1.9035776401937938e-06, "loss": 0.0, "num_input_tokens_seen": 174250208, "step": 80790 }, { "epoch": 14.827491282804184, "grad_norm": 0.00013635458890348673, "learning_rate": 1.9029489524305855e-06, "loss": 0.0, "num_input_tokens_seen": 174261088, "step": 80795 }, { "epoch": 14.828408882363737, "grad_norm": 0.0005930267507210374, "learning_rate": 1.9023203441019377e-06, "loss": 0.0, "num_input_tokens_seen": 174272448, "step": 80800 }, { "epoch": 14.829326481923289, "grad_norm": 0.00023470490123145282, "learning_rate": 1.9016918152239722e-06, "loss": 0.1906, "num_input_tokens_seen": 174283392, "step": 80805 }, { "epoch": 14.83024408148284, "grad_norm": 0.006959888152778149, "learning_rate": 1.901063365812808e-06, "loss": 0.0, "num_input_tokens_seen": 174294400, "step": 80810 }, { "epoch": 14.831161681042394, "grad_norm": 0.00033649610122665763, "learning_rate": 1.9004349958845676e-06, "loss": 0.0, "num_input_tokens_seen": 174305024, "step": 80815 }, { "epoch": 14.832079280601945, "grad_norm": 0.00018533562251832336, "learning_rate": 1.8998067054553654e-06, "loss": 0.0, "num_input_tokens_seen": 174315456, "step": 80820 }, { "epoch": 14.832996880161497, "grad_norm": 0.11984232813119888, "learning_rate": 1.8991784945413166e-06, "loss": 0.0, "num_input_tokens_seen": 174326080, "step": 80825 }, { "epoch": 14.83391447972105, "grad_norm": 0.03475876525044441, "learning_rate": 1.8985503631585317e-06, "loss": 0.0, "num_input_tokens_seen": 174336832, "step": 80830 }, { "epoch": 14.834832079280602, "grad_norm": 9.988027159124613e-05, "learning_rate": 1.8979223113231249e-06, "loss": 0.0, "num_input_tokens_seen": 174347360, "step": 80835 }, { "epoch": 14.835749678840154, "grad_norm": 0.0011804038658738136, "learning_rate": 1.8972943390512026e-06, "loss": 0.0, "num_input_tokens_seen": 174357664, "step": 80840 }, { "epoch": 14.836667278399707, "grad_norm": 0.04522591456770897, "learning_rate": 1.8966664463588707e-06, "loss": 0.0, "num_input_tokens_seen": 174368640, "step": 80845 }, { "epoch": 14.837584877959259, "grad_norm": 9.976553701562807e-05, "learning_rate": 1.896038633262236e-06, "loss": 0.0, "num_input_tokens_seen": 174378752, "step": 80850 }, { "epoch": 14.83850247751881, "grad_norm": 0.0002780865179374814, "learning_rate": 1.8954108997774002e-06, "loss": 0.0009, "num_input_tokens_seen": 174389440, "step": 80855 }, { "epoch": 14.839420077078364, "grad_norm": 0.0002940724662039429, "learning_rate": 1.8947832459204607e-06, "loss": 0.0131, "num_input_tokens_seen": 174399456, "step": 80860 }, { "epoch": 14.840337676637915, "grad_norm": 0.00012591946870088577, "learning_rate": 1.8941556717075205e-06, "loss": 0.0, "num_input_tokens_seen": 174409376, "step": 80865 }, { "epoch": 14.841255276197467, "grad_norm": 0.0001630118058528751, "learning_rate": 1.8935281771546737e-06, "loss": 0.0, "num_input_tokens_seen": 174420480, "step": 80870 }, { "epoch": 14.84217287575702, "grad_norm": 0.00011566329339984804, "learning_rate": 1.8929007622780143e-06, "loss": 0.0, "num_input_tokens_seen": 174432544, "step": 80875 }, { "epoch": 14.843090475316572, "grad_norm": 0.000333745643729344, "learning_rate": 1.8922734270936333e-06, "loss": 0.0003, "num_input_tokens_seen": 174442624, "step": 80880 }, { "epoch": 14.844008074876124, "grad_norm": 0.00015995133435353637, "learning_rate": 1.8916461716176237e-06, "loss": 0.0, "num_input_tokens_seen": 174454464, "step": 80885 }, { "epoch": 14.844925674435677, "grad_norm": 0.0007649665931239724, "learning_rate": 1.8910189958660725e-06, "loss": 0.0, "num_input_tokens_seen": 174465120, "step": 80890 }, { "epoch": 14.845843273995229, "grad_norm": 0.00024205409863498062, "learning_rate": 1.8903918998550659e-06, "loss": 0.0376, "num_input_tokens_seen": 174475680, "step": 80895 }, { "epoch": 14.84676087355478, "grad_norm": 9.705415141070262e-05, "learning_rate": 1.8897648836006854e-06, "loss": 0.0, "num_input_tokens_seen": 174486624, "step": 80900 }, { "epoch": 14.847678473114334, "grad_norm": 0.000478406494949013, "learning_rate": 1.889137947119017e-06, "loss": 0.0, "num_input_tokens_seen": 174499232, "step": 80905 }, { "epoch": 14.848596072673885, "grad_norm": 0.00035742053296417, "learning_rate": 1.8885110904261389e-06, "loss": 0.0, "num_input_tokens_seen": 174510304, "step": 80910 }, { "epoch": 14.849513672233437, "grad_norm": 0.0004228524339850992, "learning_rate": 1.8878843135381293e-06, "loss": 0.0, "num_input_tokens_seen": 174520704, "step": 80915 }, { "epoch": 14.85043127179299, "grad_norm": 0.00040238804649561644, "learning_rate": 1.8872576164710633e-06, "loss": 0.0, "num_input_tokens_seen": 174531264, "step": 80920 }, { "epoch": 14.851348871352542, "grad_norm": 0.0006745629943907261, "learning_rate": 1.8866309992410137e-06, "loss": 0.0, "num_input_tokens_seen": 174541376, "step": 80925 }, { "epoch": 14.852266470912094, "grad_norm": 0.00010534500324865803, "learning_rate": 1.886004461864055e-06, "loss": 0.0, "num_input_tokens_seen": 174552384, "step": 80930 }, { "epoch": 14.853184070471647, "grad_norm": 0.005684832576662302, "learning_rate": 1.885378004356256e-06, "loss": 0.0, "num_input_tokens_seen": 174562976, "step": 80935 }, { "epoch": 14.854101670031199, "grad_norm": 0.002792028710246086, "learning_rate": 1.884751626733684e-06, "loss": 0.0001, "num_input_tokens_seen": 174573536, "step": 80940 }, { "epoch": 14.85501926959075, "grad_norm": 0.0017758633475750685, "learning_rate": 1.8841253290124022e-06, "loss": 0.0, "num_input_tokens_seen": 174584960, "step": 80945 }, { "epoch": 14.855936869150304, "grad_norm": 0.0003571969864424318, "learning_rate": 1.8834991112084788e-06, "loss": 0.0001, "num_input_tokens_seen": 174595488, "step": 80950 }, { "epoch": 14.856854468709855, "grad_norm": 0.0003636049514170736, "learning_rate": 1.882872973337973e-06, "loss": 0.0, "num_input_tokens_seen": 174606912, "step": 80955 }, { "epoch": 14.857772068269407, "grad_norm": 0.00016682842397131026, "learning_rate": 1.8822469154169448e-06, "loss": 0.0, "num_input_tokens_seen": 174617472, "step": 80960 }, { "epoch": 14.85868966782896, "grad_norm": 0.00020252882677596062, "learning_rate": 1.8816209374614487e-06, "loss": 0.0, "num_input_tokens_seen": 174628224, "step": 80965 }, { "epoch": 14.859607267388512, "grad_norm": 0.000186480741831474, "learning_rate": 1.8809950394875443e-06, "loss": 0.0, "num_input_tokens_seen": 174638720, "step": 80970 }, { "epoch": 14.860524866948063, "grad_norm": 0.00015453962259925902, "learning_rate": 1.8803692215112834e-06, "loss": 0.0, "num_input_tokens_seen": 174649184, "step": 80975 }, { "epoch": 14.861442466507617, "grad_norm": 0.00018689960415940732, "learning_rate": 1.879743483548715e-06, "loss": 0.0, "num_input_tokens_seen": 174661280, "step": 80980 }, { "epoch": 14.862360066067168, "grad_norm": 0.00020944868447259068, "learning_rate": 1.8791178256158926e-06, "loss": 0.0, "num_input_tokens_seen": 174670656, "step": 80985 }, { "epoch": 14.86327766562672, "grad_norm": 0.0002611639501992613, "learning_rate": 1.8784922477288602e-06, "loss": 0.0, "num_input_tokens_seen": 174681184, "step": 80990 }, { "epoch": 14.864195265186273, "grad_norm": 0.0010268765036016703, "learning_rate": 1.8778667499036624e-06, "loss": 0.0284, "num_input_tokens_seen": 174692640, "step": 80995 }, { "epoch": 14.865112864745825, "grad_norm": 9.07915600691922e-05, "learning_rate": 1.8772413321563453e-06, "loss": 0.0, "num_input_tokens_seen": 174702848, "step": 81000 }, { "epoch": 14.866030464305377, "grad_norm": 0.002457273658365011, "learning_rate": 1.8766159945029482e-06, "loss": 0.0, "num_input_tokens_seen": 174714496, "step": 81005 }, { "epoch": 14.86694806386493, "grad_norm": 0.00021498111891560256, "learning_rate": 1.8759907369595104e-06, "loss": 0.0, "num_input_tokens_seen": 174724896, "step": 81010 }, { "epoch": 14.867865663424482, "grad_norm": 0.0008960638078860939, "learning_rate": 1.8753655595420661e-06, "loss": 0.0, "num_input_tokens_seen": 174735392, "step": 81015 }, { "epoch": 14.868783262984033, "grad_norm": 0.0005480890395119786, "learning_rate": 1.874740462266655e-06, "loss": 0.0, "num_input_tokens_seen": 174744736, "step": 81020 }, { "epoch": 14.869700862543587, "grad_norm": 0.0001057658955687657, "learning_rate": 1.8741154451493065e-06, "loss": 0.0, "num_input_tokens_seen": 174755744, "step": 81025 }, { "epoch": 14.870618462103138, "grad_norm": 0.0002633697004057467, "learning_rate": 1.8734905082060505e-06, "loss": 0.0, "num_input_tokens_seen": 174767456, "step": 81030 }, { "epoch": 14.87153606166269, "grad_norm": 0.0002517063985578716, "learning_rate": 1.8728656514529192e-06, "loss": 0.0, "num_input_tokens_seen": 174777664, "step": 81035 }, { "epoch": 14.872453661222243, "grad_norm": 0.022766318172216415, "learning_rate": 1.8722408749059374e-06, "loss": 0.0, "num_input_tokens_seen": 174788288, "step": 81040 }, { "epoch": 14.873371260781795, "grad_norm": 0.002413966925814748, "learning_rate": 1.8716161785811277e-06, "loss": 0.0, "num_input_tokens_seen": 174799712, "step": 81045 }, { "epoch": 14.874288860341347, "grad_norm": 0.00026242100284434855, "learning_rate": 1.8709915624945163e-06, "loss": 0.0, "num_input_tokens_seen": 174810112, "step": 81050 }, { "epoch": 14.8752064599009, "grad_norm": 0.00013745774049311876, "learning_rate": 1.8703670266621222e-06, "loss": 0.0, "num_input_tokens_seen": 174821536, "step": 81055 }, { "epoch": 14.876124059460452, "grad_norm": 0.00019162682292517275, "learning_rate": 1.8697425710999628e-06, "loss": 0.0, "num_input_tokens_seen": 174832320, "step": 81060 }, { "epoch": 14.877041659020003, "grad_norm": 0.0002076152595691383, "learning_rate": 1.8691181958240534e-06, "loss": 0.0, "num_input_tokens_seen": 174844032, "step": 81065 }, { "epoch": 14.877959258579557, "grad_norm": 0.010197984054684639, "learning_rate": 1.8684939008504116e-06, "loss": 0.0, "num_input_tokens_seen": 174855008, "step": 81070 }, { "epoch": 14.878876858139108, "grad_norm": 0.0007840080070309341, "learning_rate": 1.8678696861950478e-06, "loss": 0.0, "num_input_tokens_seen": 174865216, "step": 81075 }, { "epoch": 14.87979445769866, "grad_norm": 0.0001141713437391445, "learning_rate": 1.8672455518739708e-06, "loss": 0.0, "num_input_tokens_seen": 174876128, "step": 81080 }, { "epoch": 14.880712057258213, "grad_norm": 0.00021508919599000365, "learning_rate": 1.866621497903191e-06, "loss": 0.0, "num_input_tokens_seen": 174886944, "step": 81085 }, { "epoch": 14.881629656817765, "grad_norm": 0.00030979840084910393, "learning_rate": 1.8659975242987143e-06, "loss": 0.0, "num_input_tokens_seen": 174897824, "step": 81090 }, { "epoch": 14.882547256377316, "grad_norm": 0.0003136041050311178, "learning_rate": 1.8653736310765435e-06, "loss": 0.0, "num_input_tokens_seen": 174907712, "step": 81095 }, { "epoch": 14.88346485593687, "grad_norm": 9.937679715221748e-05, "learning_rate": 1.864749818252679e-06, "loss": 0.0, "num_input_tokens_seen": 174917824, "step": 81100 }, { "epoch": 14.884382455496421, "grad_norm": 0.0001719977444736287, "learning_rate": 1.8641260858431243e-06, "loss": 0.0451, "num_input_tokens_seen": 174928864, "step": 81105 }, { "epoch": 14.885300055055973, "grad_norm": 0.0007172218174673617, "learning_rate": 1.8635024338638758e-06, "loss": 0.0, "num_input_tokens_seen": 174939680, "step": 81110 }, { "epoch": 14.886217654615526, "grad_norm": 7.136740896385163e-05, "learning_rate": 1.862878862330928e-06, "loss": 0.0, "num_input_tokens_seen": 174949824, "step": 81115 }, { "epoch": 14.887135254175078, "grad_norm": 255.80844116210938, "learning_rate": 1.8622553712602737e-06, "loss": 0.1594, "num_input_tokens_seen": 174960096, "step": 81120 }, { "epoch": 14.88805285373463, "grad_norm": 0.00010373163968324661, "learning_rate": 1.861631960667908e-06, "loss": 0.2594, "num_input_tokens_seen": 174969856, "step": 81125 }, { "epoch": 14.888970453294183, "grad_norm": 0.009292623959481716, "learning_rate": 1.8610086305698184e-06, "loss": 0.0, "num_input_tokens_seen": 174979584, "step": 81130 }, { "epoch": 14.889888052853735, "grad_norm": 0.00014802833902649581, "learning_rate": 1.8603853809819927e-06, "loss": 0.0001, "num_input_tokens_seen": 174989696, "step": 81135 }, { "epoch": 14.890805652413286, "grad_norm": 0.0011007606517523527, "learning_rate": 1.8597622119204156e-06, "loss": 0.0, "num_input_tokens_seen": 175001568, "step": 81140 }, { "epoch": 14.89172325197284, "grad_norm": 0.00016490048437844962, "learning_rate": 1.859139123401069e-06, "loss": 0.0, "num_input_tokens_seen": 175012000, "step": 81145 }, { "epoch": 14.892640851532391, "grad_norm": 0.00016240075638052076, "learning_rate": 1.8585161154399383e-06, "loss": 0.0, "num_input_tokens_seen": 175022720, "step": 81150 }, { "epoch": 14.893558451091943, "grad_norm": 0.0012338303495198488, "learning_rate": 1.8578931880529998e-06, "loss": 0.0, "num_input_tokens_seen": 175033600, "step": 81155 }, { "epoch": 14.894476050651496, "grad_norm": 0.001269436557777226, "learning_rate": 1.857270341256232e-06, "loss": 0.0, "num_input_tokens_seen": 175044544, "step": 81160 }, { "epoch": 14.895393650211048, "grad_norm": 0.00013044613297097385, "learning_rate": 1.8566475750656066e-06, "loss": 0.0, "num_input_tokens_seen": 175054400, "step": 81165 }, { "epoch": 14.8963112497706, "grad_norm": 0.00014324365474749357, "learning_rate": 1.856024889497101e-06, "loss": 0.0, "num_input_tokens_seen": 175064992, "step": 81170 }, { "epoch": 14.897228849330153, "grad_norm": 0.009119568392634392, "learning_rate": 1.8554022845666846e-06, "loss": 0.0, "num_input_tokens_seen": 175076160, "step": 81175 }, { "epoch": 14.898146448889705, "grad_norm": 0.0005032362532801926, "learning_rate": 1.8547797602903244e-06, "loss": 0.0, "num_input_tokens_seen": 175086432, "step": 81180 }, { "epoch": 14.899064048449256, "grad_norm": 0.004684883635491133, "learning_rate": 1.8541573166839898e-06, "loss": 0.0, "num_input_tokens_seen": 175096672, "step": 81185 }, { "epoch": 14.89998164800881, "grad_norm": 0.00021212163846939802, "learning_rate": 1.8535349537636449e-06, "loss": 0.0, "num_input_tokens_seen": 175106016, "step": 81190 }, { "epoch": 14.900899247568361, "grad_norm": 0.000176552843186073, "learning_rate": 1.8529126715452516e-06, "loss": 0.0, "num_input_tokens_seen": 175116864, "step": 81195 }, { "epoch": 14.901816847127913, "grad_norm": 0.0001339235168416053, "learning_rate": 1.852290470044769e-06, "loss": 0.0002, "num_input_tokens_seen": 175127360, "step": 81200 }, { "epoch": 14.902734446687466, "grad_norm": 0.002882269909605384, "learning_rate": 1.851668349278159e-06, "loss": 0.0, "num_input_tokens_seen": 175137056, "step": 81205 }, { "epoch": 14.903652046247018, "grad_norm": 0.0008832078310661018, "learning_rate": 1.8510463092613767e-06, "loss": 0.0, "num_input_tokens_seen": 175147392, "step": 81210 }, { "epoch": 14.90456964580657, "grad_norm": 0.00733412429690361, "learning_rate": 1.8504243500103742e-06, "loss": 0.0, "num_input_tokens_seen": 175158752, "step": 81215 }, { "epoch": 14.905487245366123, "grad_norm": 0.00026378544862382114, "learning_rate": 1.8498024715411073e-06, "loss": 0.0, "num_input_tokens_seen": 175169984, "step": 81220 }, { "epoch": 14.906404844925675, "grad_norm": 0.00014713733980897814, "learning_rate": 1.8491806738695245e-06, "loss": 0.0, "num_input_tokens_seen": 175180640, "step": 81225 }, { "epoch": 14.907322444485226, "grad_norm": 0.001622962299734354, "learning_rate": 1.8485589570115748e-06, "loss": 0.0, "num_input_tokens_seen": 175192032, "step": 81230 }, { "epoch": 14.90824004404478, "grad_norm": 0.0014666402712464333, "learning_rate": 1.8479373209832013e-06, "loss": 0.0002, "num_input_tokens_seen": 175201792, "step": 81235 }, { "epoch": 14.909157643604331, "grad_norm": 0.0003708883305080235, "learning_rate": 1.847315765800352e-06, "loss": 0.0, "num_input_tokens_seen": 175211328, "step": 81240 }, { "epoch": 14.910075243163883, "grad_norm": 0.00010520643263589591, "learning_rate": 1.846694291478967e-06, "loss": 0.0, "num_input_tokens_seen": 175222176, "step": 81245 }, { "epoch": 14.910992842723436, "grad_norm": 0.00037806635373272, "learning_rate": 1.8460728980349845e-06, "loss": 0.0, "num_input_tokens_seen": 175233632, "step": 81250 }, { "epoch": 14.911910442282988, "grad_norm": 0.0005374090978875756, "learning_rate": 1.8454515854843463e-06, "loss": 0.0, "num_input_tokens_seen": 175244224, "step": 81255 }, { "epoch": 14.91282804184254, "grad_norm": 8.341153443325311e-05, "learning_rate": 1.8448303538429852e-06, "loss": 0.0005, "num_input_tokens_seen": 175254176, "step": 81260 }, { "epoch": 14.913745641402093, "grad_norm": 0.0022555002942681313, "learning_rate": 1.8442092031268344e-06, "loss": 0.0, "num_input_tokens_seen": 175265088, "step": 81265 }, { "epoch": 14.914663240961644, "grad_norm": 0.0002961273421533406, "learning_rate": 1.8435881333518275e-06, "loss": 0.0, "num_input_tokens_seen": 175275872, "step": 81270 }, { "epoch": 14.915580840521196, "grad_norm": 0.0009391071507707238, "learning_rate": 1.8429671445338938e-06, "loss": 0.002, "num_input_tokens_seen": 175287552, "step": 81275 }, { "epoch": 14.91649844008075, "grad_norm": 0.00014163742889650166, "learning_rate": 1.8423462366889587e-06, "loss": 0.0, "num_input_tokens_seen": 175298176, "step": 81280 }, { "epoch": 14.917416039640301, "grad_norm": 9.292213508160785e-05, "learning_rate": 1.8417254098329479e-06, "loss": 0.0, "num_input_tokens_seen": 175309088, "step": 81285 }, { "epoch": 14.918333639199853, "grad_norm": 7.976403867360204e-05, "learning_rate": 1.841104663981787e-06, "loss": 0.0674, "num_input_tokens_seen": 175320160, "step": 81290 }, { "epoch": 14.919251238759406, "grad_norm": 0.002201998373493552, "learning_rate": 1.8404839991513956e-06, "loss": 0.0, "num_input_tokens_seen": 175328896, "step": 81295 }, { "epoch": 14.920168838318958, "grad_norm": 0.000177689827978611, "learning_rate": 1.8398634153576911e-06, "loss": 0.0, "num_input_tokens_seen": 175339488, "step": 81300 }, { "epoch": 14.92108643787851, "grad_norm": 0.006280485540628433, "learning_rate": 1.839242912616594e-06, "loss": 0.0, "num_input_tokens_seen": 175350080, "step": 81305 }, { "epoch": 14.922004037438063, "grad_norm": 0.0018803548300638795, "learning_rate": 1.8386224909440175e-06, "loss": 0.0, "num_input_tokens_seen": 175360832, "step": 81310 }, { "epoch": 14.922921636997614, "grad_norm": 0.0009056160342879593, "learning_rate": 1.8380021503558726e-06, "loss": 0.0, "num_input_tokens_seen": 175370688, "step": 81315 }, { "epoch": 14.923839236557166, "grad_norm": 0.003305234247818589, "learning_rate": 1.8373818908680736e-06, "loss": 0.0, "num_input_tokens_seen": 175381120, "step": 81320 }, { "epoch": 14.92475683611672, "grad_norm": 0.0007189453463070095, "learning_rate": 1.836761712496527e-06, "loss": 0.0405, "num_input_tokens_seen": 175390880, "step": 81325 }, { "epoch": 14.925674435676271, "grad_norm": 0.00020437694911379367, "learning_rate": 1.8361416152571403e-06, "loss": 0.0, "num_input_tokens_seen": 175402016, "step": 81330 }, { "epoch": 14.926592035235823, "grad_norm": 9.401245188200846e-05, "learning_rate": 1.8355215991658183e-06, "loss": 0.0, "num_input_tokens_seen": 175413120, "step": 81335 }, { "epoch": 14.927509634795376, "grad_norm": 0.0021720374934375286, "learning_rate": 1.8349016642384604e-06, "loss": 0.0, "num_input_tokens_seen": 175424672, "step": 81340 }, { "epoch": 14.928427234354928, "grad_norm": 9.098877490032464e-05, "learning_rate": 1.834281810490971e-06, "loss": 0.0, "num_input_tokens_seen": 175436192, "step": 81345 }, { "epoch": 14.92934483391448, "grad_norm": 0.0001567057188367471, "learning_rate": 1.8336620379392466e-06, "loss": 0.0, "num_input_tokens_seen": 175447232, "step": 81350 }, { "epoch": 14.930262433474033, "grad_norm": 0.00012001833238173276, "learning_rate": 1.8330423465991843e-06, "loss": 0.0, "num_input_tokens_seen": 175458944, "step": 81355 }, { "epoch": 14.931180033033584, "grad_norm": 0.0008418565266765654, "learning_rate": 1.832422736486677e-06, "loss": 0.0, "num_input_tokens_seen": 175469888, "step": 81360 }, { "epoch": 14.932097632593136, "grad_norm": 9.32382681639865e-05, "learning_rate": 1.8318032076176167e-06, "loss": 0.0, "num_input_tokens_seen": 175480640, "step": 81365 }, { "epoch": 14.93301523215269, "grad_norm": 0.023613594472408295, "learning_rate": 1.831183760007893e-06, "loss": 0.0, "num_input_tokens_seen": 175490240, "step": 81370 }, { "epoch": 14.93393283171224, "grad_norm": 7.04470876371488e-05, "learning_rate": 1.8305643936733959e-06, "loss": 0.0, "num_input_tokens_seen": 175503040, "step": 81375 }, { "epoch": 14.934850431271792, "grad_norm": 0.000511893245857209, "learning_rate": 1.8299451086300102e-06, "loss": 0.0, "num_input_tokens_seen": 175513824, "step": 81380 }, { "epoch": 14.935768030831346, "grad_norm": 0.00047031205031089485, "learning_rate": 1.8293259048936174e-06, "loss": 0.0, "num_input_tokens_seen": 175525600, "step": 81385 }, { "epoch": 14.936685630390897, "grad_norm": 0.000133784647914581, "learning_rate": 1.828706782480103e-06, "loss": 0.0, "num_input_tokens_seen": 175536640, "step": 81390 }, { "epoch": 14.937603229950449, "grad_norm": 0.0019719861447811127, "learning_rate": 1.8280877414053445e-06, "loss": 0.0, "num_input_tokens_seen": 175548096, "step": 81395 }, { "epoch": 14.938520829510002, "grad_norm": 0.012640812434256077, "learning_rate": 1.827468781685217e-06, "loss": 0.0, "num_input_tokens_seen": 175559424, "step": 81400 }, { "epoch": 14.939438429069554, "grad_norm": 0.0012059630826115608, "learning_rate": 1.8268499033356007e-06, "loss": 0.0, "num_input_tokens_seen": 175570432, "step": 81405 }, { "epoch": 14.940356028629106, "grad_norm": 0.00047243459266610444, "learning_rate": 1.8262311063723664e-06, "loss": 0.0, "num_input_tokens_seen": 175580928, "step": 81410 }, { "epoch": 14.94127362818866, "grad_norm": 0.00951520074158907, "learning_rate": 1.8256123908113853e-06, "loss": 0.0, "num_input_tokens_seen": 175591584, "step": 81415 }, { "epoch": 14.94219122774821, "grad_norm": 0.020350713282823563, "learning_rate": 1.8249937566685245e-06, "loss": 0.0, "num_input_tokens_seen": 175601568, "step": 81420 }, { "epoch": 14.943108827307762, "grad_norm": 0.00019962343503721058, "learning_rate": 1.824375203959655e-06, "loss": 0.0, "num_input_tokens_seen": 175612768, "step": 81425 }, { "epoch": 14.944026426867316, "grad_norm": 0.0001739023282425478, "learning_rate": 1.82375673270064e-06, "loss": 0.0, "num_input_tokens_seen": 175623104, "step": 81430 }, { "epoch": 14.944944026426867, "grad_norm": 0.00011255267600063235, "learning_rate": 1.8231383429073401e-06, "loss": 0.0, "num_input_tokens_seen": 175633312, "step": 81435 }, { "epoch": 14.945861625986419, "grad_norm": 0.00012750182941090316, "learning_rate": 1.8225200345956195e-06, "loss": 0.0, "num_input_tokens_seen": 175645248, "step": 81440 }, { "epoch": 14.946779225545972, "grad_norm": 0.0007591643370687962, "learning_rate": 1.8219018077813356e-06, "loss": 0.0, "num_input_tokens_seen": 175655744, "step": 81445 }, { "epoch": 14.947696825105524, "grad_norm": 0.0006195637979544699, "learning_rate": 1.8212836624803431e-06, "loss": 0.0, "num_input_tokens_seen": 175665920, "step": 81450 }, { "epoch": 14.948614424665076, "grad_norm": 9.803577268030494e-05, "learning_rate": 1.8206655987084998e-06, "loss": 0.0, "num_input_tokens_seen": 175676352, "step": 81455 }, { "epoch": 14.949532024224629, "grad_norm": 0.0005883356789126992, "learning_rate": 1.820047616481656e-06, "loss": 0.0, "num_input_tokens_seen": 175687232, "step": 81460 }, { "epoch": 14.95044962378418, "grad_norm": 0.0009339696262031794, "learning_rate": 1.8194297158156627e-06, "loss": 0.225, "num_input_tokens_seen": 175698240, "step": 81465 }, { "epoch": 14.951367223343732, "grad_norm": 0.00011080850526923314, "learning_rate": 1.8188118967263657e-06, "loss": 0.0004, "num_input_tokens_seen": 175708832, "step": 81470 }, { "epoch": 14.952284822903286, "grad_norm": 0.00018239814380649477, "learning_rate": 1.8181941592296155e-06, "loss": 0.0, "num_input_tokens_seen": 175719616, "step": 81475 }, { "epoch": 14.953202422462837, "grad_norm": 0.009513968601822853, "learning_rate": 1.8175765033412534e-06, "loss": 0.0, "num_input_tokens_seen": 175731008, "step": 81480 }, { "epoch": 14.954120022022389, "grad_norm": 0.0029145791195333004, "learning_rate": 1.81695892907712e-06, "loss": 0.0, "num_input_tokens_seen": 175741120, "step": 81485 }, { "epoch": 14.955037621581942, "grad_norm": 0.00520558375865221, "learning_rate": 1.8163414364530585e-06, "loss": 0.0, "num_input_tokens_seen": 175752192, "step": 81490 }, { "epoch": 14.955955221141494, "grad_norm": 0.00021107331849634647, "learning_rate": 1.8157240254849046e-06, "loss": 0.0, "num_input_tokens_seen": 175763456, "step": 81495 }, { "epoch": 14.956872820701046, "grad_norm": 2.6862611770629883, "learning_rate": 1.8151066961884927e-06, "loss": 0.0011, "num_input_tokens_seen": 175772864, "step": 81500 }, { "epoch": 14.957790420260599, "grad_norm": 0.0001369444071315229, "learning_rate": 1.81448944857966e-06, "loss": 0.0001, "num_input_tokens_seen": 175784640, "step": 81505 }, { "epoch": 14.95870801982015, "grad_norm": 0.003228982910513878, "learning_rate": 1.8138722826742356e-06, "loss": 0.0131, "num_input_tokens_seen": 175795552, "step": 81510 }, { "epoch": 14.959625619379702, "grad_norm": 0.0010287475306540728, "learning_rate": 1.8132551984880491e-06, "loss": 0.0, "num_input_tokens_seen": 175805728, "step": 81515 }, { "epoch": 14.960543218939256, "grad_norm": 0.00018112607358489186, "learning_rate": 1.812638196036926e-06, "loss": 0.0, "num_input_tokens_seen": 175816960, "step": 81520 }, { "epoch": 14.961460818498807, "grad_norm": 0.00015951845853123814, "learning_rate": 1.812021275336695e-06, "loss": 0.0, "num_input_tokens_seen": 175827712, "step": 81525 }, { "epoch": 14.962378418058359, "grad_norm": 0.00024923268938437104, "learning_rate": 1.8114044364031774e-06, "loss": 0.0018, "num_input_tokens_seen": 175838752, "step": 81530 }, { "epoch": 14.963296017617912, "grad_norm": 0.00012297369539737701, "learning_rate": 1.8107876792521928e-06, "loss": 0.0, "num_input_tokens_seen": 175849056, "step": 81535 }, { "epoch": 14.964213617177464, "grad_norm": 0.0005078449612483382, "learning_rate": 1.8101710038995623e-06, "loss": 0.0, "num_input_tokens_seen": 175860864, "step": 81540 }, { "epoch": 14.965131216737015, "grad_norm": 0.0009655822068452835, "learning_rate": 1.8095544103611024e-06, "loss": 0.0, "num_input_tokens_seen": 175870528, "step": 81545 }, { "epoch": 14.966048816296569, "grad_norm": 0.0001494809694122523, "learning_rate": 1.8089378986526268e-06, "loss": 0.0, "num_input_tokens_seen": 175879680, "step": 81550 }, { "epoch": 14.96696641585612, "grad_norm": 0.00010761979501694441, "learning_rate": 1.8083214687899487e-06, "loss": 0.0087, "num_input_tokens_seen": 175888960, "step": 81555 }, { "epoch": 14.967884015415672, "grad_norm": 0.00010989337897626683, "learning_rate": 1.807705120788878e-06, "loss": 0.0, "num_input_tokens_seen": 175899744, "step": 81560 }, { "epoch": 14.968801614975225, "grad_norm": 0.0047287787310779095, "learning_rate": 1.8070888546652216e-06, "loss": 0.0, "num_input_tokens_seen": 175912096, "step": 81565 }, { "epoch": 14.969719214534777, "grad_norm": 7.964118412928656e-05, "learning_rate": 1.806472670434789e-06, "loss": 0.0, "num_input_tokens_seen": 175923648, "step": 81570 }, { "epoch": 14.970636814094329, "grad_norm": 0.0001406632363796234, "learning_rate": 1.8058565681133833e-06, "loss": 0.0, "num_input_tokens_seen": 175936192, "step": 81575 }, { "epoch": 14.971554413653882, "grad_norm": 0.00011570911010494456, "learning_rate": 1.8052405477168062e-06, "loss": 0.0, "num_input_tokens_seen": 175945888, "step": 81580 }, { "epoch": 14.972472013213434, "grad_norm": 7.916890899650753e-05, "learning_rate": 1.8046246092608555e-06, "loss": 0.0, "num_input_tokens_seen": 175956320, "step": 81585 }, { "epoch": 14.973389612772985, "grad_norm": 0.00012270294246263802, "learning_rate": 1.8040087527613331e-06, "loss": 0.0, "num_input_tokens_seen": 175967712, "step": 81590 }, { "epoch": 14.974307212332539, "grad_norm": 0.0002702058991417289, "learning_rate": 1.8033929782340332e-06, "loss": 0.0, "num_input_tokens_seen": 175976640, "step": 81595 }, { "epoch": 14.97522481189209, "grad_norm": 0.0024761760141700506, "learning_rate": 1.802777285694749e-06, "loss": 0.0, "num_input_tokens_seen": 175987328, "step": 81600 }, { "epoch": 14.976142411451642, "grad_norm": 0.00022547069238498807, "learning_rate": 1.8021616751592702e-06, "loss": 0.0, "num_input_tokens_seen": 175997792, "step": 81605 }, { "epoch": 14.977060011011195, "grad_norm": 0.00015668675769120455, "learning_rate": 1.8015461466433898e-06, "loss": 0.0002, "num_input_tokens_seen": 176008384, "step": 81610 }, { "epoch": 14.977977610570747, "grad_norm": 0.00043617113260552287, "learning_rate": 1.800930700162894e-06, "loss": 0.0, "num_input_tokens_seen": 176019712, "step": 81615 }, { "epoch": 14.978895210130299, "grad_norm": 0.000577792408876121, "learning_rate": 1.8003153357335657e-06, "loss": 0.0, "num_input_tokens_seen": 176030272, "step": 81620 }, { "epoch": 14.979812809689852, "grad_norm": 0.0023164479061961174, "learning_rate": 1.7997000533711916e-06, "loss": 0.0, "num_input_tokens_seen": 176040576, "step": 81625 }, { "epoch": 14.980730409249404, "grad_norm": 0.00013464619405567646, "learning_rate": 1.7990848530915512e-06, "loss": 0.0, "num_input_tokens_seen": 176051584, "step": 81630 }, { "epoch": 14.981648008808955, "grad_norm": 2.072549343109131, "learning_rate": 1.7984697349104218e-06, "loss": 0.0002, "num_input_tokens_seen": 176061024, "step": 81635 }, { "epoch": 14.982565608368509, "grad_norm": 0.0008985610911622643, "learning_rate": 1.7978546988435836e-06, "loss": 0.0, "num_input_tokens_seen": 176071872, "step": 81640 }, { "epoch": 14.98348320792806, "grad_norm": 0.0004449980624485761, "learning_rate": 1.797239744906809e-06, "loss": 0.0, "num_input_tokens_seen": 176083232, "step": 81645 }, { "epoch": 14.984400807487612, "grad_norm": 0.003350819693878293, "learning_rate": 1.7966248731158714e-06, "loss": 0.0, "num_input_tokens_seen": 176092992, "step": 81650 }, { "epoch": 14.985318407047165, "grad_norm": 0.0006011284422129393, "learning_rate": 1.7960100834865396e-06, "loss": 0.0002, "num_input_tokens_seen": 176103712, "step": 81655 }, { "epoch": 14.986236006606717, "grad_norm": 0.00012237350165378302, "learning_rate": 1.795395376034585e-06, "loss": 0.0, "num_input_tokens_seen": 176113984, "step": 81660 }, { "epoch": 14.987153606166268, "grad_norm": 0.0011621835874393582, "learning_rate": 1.794780750775772e-06, "loss": 0.0, "num_input_tokens_seen": 176125120, "step": 81665 }, { "epoch": 14.988071205725822, "grad_norm": 0.00017966888844966888, "learning_rate": 1.7941662077258632e-06, "loss": 0.0, "num_input_tokens_seen": 176137088, "step": 81670 }, { "epoch": 14.988988805285373, "grad_norm": 0.0021987061481922865, "learning_rate": 1.7935517469006247e-06, "loss": 0.0, "num_input_tokens_seen": 176148480, "step": 81675 }, { "epoch": 14.989906404844925, "grad_norm": 0.00025699264369904995, "learning_rate": 1.7929373683158142e-06, "loss": 0.0, "num_input_tokens_seen": 176158656, "step": 81680 }, { "epoch": 14.990824004404478, "grad_norm": 0.0001568020525155589, "learning_rate": 1.7923230719871897e-06, "loss": 0.0, "num_input_tokens_seen": 176169184, "step": 81685 }, { "epoch": 14.99174160396403, "grad_norm": 0.00010258637485094368, "learning_rate": 1.791708857930506e-06, "loss": 0.0, "num_input_tokens_seen": 176179584, "step": 81690 }, { "epoch": 14.992659203523582, "grad_norm": 0.0008718704921193421, "learning_rate": 1.7910947261615186e-06, "loss": 0.0, "num_input_tokens_seen": 176190208, "step": 81695 }, { "epoch": 14.993576803083135, "grad_norm": 499.62982177734375, "learning_rate": 1.7904806766959782e-06, "loss": 0.1438, "num_input_tokens_seen": 176199712, "step": 81700 }, { "epoch": 14.994494402642687, "grad_norm": 0.0001991796016227454, "learning_rate": 1.7898667095496325e-06, "loss": 0.0, "num_input_tokens_seen": 176210976, "step": 81705 }, { "epoch": 14.995412002202238, "grad_norm": 0.0003735317150130868, "learning_rate": 1.7892528247382317e-06, "loss": 0.0, "num_input_tokens_seen": 176221600, "step": 81710 }, { "epoch": 14.996329601761792, "grad_norm": 0.019519513472914696, "learning_rate": 1.7886390222775202e-06, "loss": 0.0, "num_input_tokens_seen": 176232832, "step": 81715 }, { "epoch": 14.997247201321343, "grad_norm": 0.0009319443488493562, "learning_rate": 1.7880253021832388e-06, "loss": 0.0, "num_input_tokens_seen": 176243424, "step": 81720 }, { "epoch": 14.998164800880895, "grad_norm": 0.009031282737851143, "learning_rate": 1.7874116644711326e-06, "loss": 0.0, "num_input_tokens_seen": 176254240, "step": 81725 }, { "epoch": 14.999082400440448, "grad_norm": 4.093213081359863, "learning_rate": 1.7867981091569374e-06, "loss": 0.0004, "num_input_tokens_seen": 176264864, "step": 81730 }, { "epoch": 15.0, "grad_norm": 0.00021665178064722568, "learning_rate": 1.786184636256391e-06, "loss": 0.0, "num_input_tokens_seen": 176274784, "step": 81735 }, { "epoch": 15.000917599559552, "grad_norm": 0.00014163261221256107, "learning_rate": 1.7855712457852259e-06, "loss": 0.0132, "num_input_tokens_seen": 176286720, "step": 81740 }, { "epoch": 15.001835199119105, "grad_norm": 0.0032443064264953136, "learning_rate": 1.784957937759178e-06, "loss": 0.1657, "num_input_tokens_seen": 176298176, "step": 81745 }, { "epoch": 15.002752798678657, "grad_norm": 0.00024459263659082353, "learning_rate": 1.7843447121939767e-06, "loss": 0.0, "num_input_tokens_seen": 176307744, "step": 81750 }, { "epoch": 15.003670398238208, "grad_norm": 0.0001690311182755977, "learning_rate": 1.7837315691053474e-06, "loss": 0.0, "num_input_tokens_seen": 176319392, "step": 81755 }, { "epoch": 15.004587997797762, "grad_norm": 0.026556940749287605, "learning_rate": 1.7831185085090201e-06, "loss": 0.0, "num_input_tokens_seen": 176329952, "step": 81760 }, { "epoch": 15.005505597357313, "grad_norm": 0.0001389457902405411, "learning_rate": 1.7825055304207183e-06, "loss": 0.0, "num_input_tokens_seen": 176339424, "step": 81765 }, { "epoch": 15.006423196916865, "grad_norm": 0.0001316908310400322, "learning_rate": 1.781892634856162e-06, "loss": 0.0, "num_input_tokens_seen": 176351328, "step": 81770 }, { "epoch": 15.007340796476418, "grad_norm": 0.0005676131113432348, "learning_rate": 1.781279821831073e-06, "loss": 0.0003, "num_input_tokens_seen": 176361824, "step": 81775 }, { "epoch": 15.00825839603597, "grad_norm": 7.844402716727927e-05, "learning_rate": 1.7806670913611673e-06, "loss": 0.0, "num_input_tokens_seen": 176371328, "step": 81780 }, { "epoch": 15.009175995595522, "grad_norm": 0.00016869149112608284, "learning_rate": 1.7800544434621597e-06, "loss": 0.0, "num_input_tokens_seen": 176382336, "step": 81785 }, { "epoch": 15.010093595155075, "grad_norm": 0.00020564075384754688, "learning_rate": 1.7794418781497668e-06, "loss": 0.0, "num_input_tokens_seen": 176393760, "step": 81790 }, { "epoch": 15.011011194714627, "grad_norm": 0.0013559159124270082, "learning_rate": 1.778829395439698e-06, "loss": 0.0, "num_input_tokens_seen": 176405056, "step": 81795 }, { "epoch": 15.011928794274178, "grad_norm": 0.0001727776980260387, "learning_rate": 1.778216995347663e-06, "loss": 0.0, "num_input_tokens_seen": 176414336, "step": 81800 }, { "epoch": 15.012846393833732, "grad_norm": 0.008091761730611324, "learning_rate": 1.7776046778893675e-06, "loss": 0.0, "num_input_tokens_seen": 176425984, "step": 81805 }, { "epoch": 15.013763993393283, "grad_norm": 0.052212879061698914, "learning_rate": 1.776992443080519e-06, "loss": 0.0, "num_input_tokens_seen": 176436896, "step": 81810 }, { "epoch": 15.014681592952835, "grad_norm": 0.0002257524465676397, "learning_rate": 1.7763802909368194e-06, "loss": 0.0, "num_input_tokens_seen": 176449376, "step": 81815 }, { "epoch": 15.015599192512388, "grad_norm": 0.0035108213778585196, "learning_rate": 1.7757682214739692e-06, "loss": 0.0, "num_input_tokens_seen": 176459616, "step": 81820 }, { "epoch": 15.01651679207194, "grad_norm": 0.0004915785393677652, "learning_rate": 1.7751562347076651e-06, "loss": 0.0, "num_input_tokens_seen": 176469792, "step": 81825 }, { "epoch": 15.017434391631491, "grad_norm": 0.00018480805738363415, "learning_rate": 1.7745443306536075e-06, "loss": 0.0, "num_input_tokens_seen": 176479872, "step": 81830 }, { "epoch": 15.018351991191045, "grad_norm": 0.00010351527453167364, "learning_rate": 1.7739325093274883e-06, "loss": 0.0, "num_input_tokens_seen": 176490432, "step": 81835 }, { "epoch": 15.019269590750596, "grad_norm": 0.00015533801342826337, "learning_rate": 1.773320770744999e-06, "loss": 0.0, "num_input_tokens_seen": 176501056, "step": 81840 }, { "epoch": 15.020187190310148, "grad_norm": 0.00029066152637824416, "learning_rate": 1.7727091149218327e-06, "loss": 0.0, "num_input_tokens_seen": 176512032, "step": 81845 }, { "epoch": 15.021104789869701, "grad_norm": 0.00016187432629521936, "learning_rate": 1.7720975418736758e-06, "loss": 0.0, "num_input_tokens_seen": 176522656, "step": 81850 }, { "epoch": 15.022022389429253, "grad_norm": 0.00017851740994956344, "learning_rate": 1.7714860516162125e-06, "loss": 0.0, "num_input_tokens_seen": 176535072, "step": 81855 }, { "epoch": 15.022939988988805, "grad_norm": 0.0001134051926783286, "learning_rate": 1.7708746441651293e-06, "loss": 0.0, "num_input_tokens_seen": 176544896, "step": 81860 }, { "epoch": 15.023857588548358, "grad_norm": 0.002108618849888444, "learning_rate": 1.7702633195361073e-06, "loss": 0.0, "num_input_tokens_seen": 176554784, "step": 81865 }, { "epoch": 15.02477518810791, "grad_norm": 0.00018368275777902454, "learning_rate": 1.7696520777448256e-06, "loss": 0.0, "num_input_tokens_seen": 176563872, "step": 81870 }, { "epoch": 15.025692787667461, "grad_norm": 0.0001393705460941419, "learning_rate": 1.7690409188069595e-06, "loss": 0.0, "num_input_tokens_seen": 176574336, "step": 81875 }, { "epoch": 15.026610387227015, "grad_norm": 0.00020886976562906057, "learning_rate": 1.7684298427381885e-06, "loss": 0.0, "num_input_tokens_seen": 176586112, "step": 81880 }, { "epoch": 15.027527986786566, "grad_norm": 0.0001822231279220432, "learning_rate": 1.767818849554183e-06, "loss": 0.0, "num_input_tokens_seen": 176596960, "step": 81885 }, { "epoch": 15.028445586346118, "grad_norm": 0.00039614932029508054, "learning_rate": 1.7672079392706132e-06, "loss": 0.0, "num_input_tokens_seen": 176607936, "step": 81890 }, { "epoch": 15.029363185905671, "grad_norm": 7.829356036381796e-05, "learning_rate": 1.7665971119031512e-06, "loss": 0.0, "num_input_tokens_seen": 176618400, "step": 81895 }, { "epoch": 15.030280785465223, "grad_norm": 0.00043939577881246805, "learning_rate": 1.7659863674674615e-06, "loss": 0.0, "num_input_tokens_seen": 176629760, "step": 81900 }, { "epoch": 15.031198385024775, "grad_norm": 0.00020396486797835678, "learning_rate": 1.7653757059792081e-06, "loss": 0.0, "num_input_tokens_seen": 176641056, "step": 81905 }, { "epoch": 15.032115984584328, "grad_norm": 0.0011298590106889606, "learning_rate": 1.764765127454056e-06, "loss": 0.0, "num_input_tokens_seen": 176652640, "step": 81910 }, { "epoch": 15.03303358414388, "grad_norm": 0.00011952417116845027, "learning_rate": 1.764154631907664e-06, "loss": 0.0, "num_input_tokens_seen": 176664192, "step": 81915 }, { "epoch": 15.033951183703431, "grad_norm": 0.00013832314289174974, "learning_rate": 1.7635442193556913e-06, "loss": 0.0, "num_input_tokens_seen": 176675520, "step": 81920 }, { "epoch": 15.034868783262985, "grad_norm": 0.00019672582857310772, "learning_rate": 1.762933889813791e-06, "loss": 0.0, "num_input_tokens_seen": 176686720, "step": 81925 }, { "epoch": 15.035786382822536, "grad_norm": 0.0006372910575009882, "learning_rate": 1.7623236432976209e-06, "loss": 0.0, "num_input_tokens_seen": 176697344, "step": 81930 }, { "epoch": 15.036703982382088, "grad_norm": 0.0003639625501818955, "learning_rate": 1.7617134798228318e-06, "loss": 0.0, "num_input_tokens_seen": 176707712, "step": 81935 }, { "epoch": 15.037621581941641, "grad_norm": 0.0006591685814782977, "learning_rate": 1.7611033994050714e-06, "loss": 0.0, "num_input_tokens_seen": 176718240, "step": 81940 }, { "epoch": 15.038539181501193, "grad_norm": 7.230754272313789e-05, "learning_rate": 1.7604934020599906e-06, "loss": 0.0, "num_input_tokens_seen": 176729696, "step": 81945 }, { "epoch": 15.039456781060744, "grad_norm": 0.0006680066580884159, "learning_rate": 1.7598834878032333e-06, "loss": 0.0, "num_input_tokens_seen": 176739264, "step": 81950 }, { "epoch": 15.040374380620298, "grad_norm": 0.0001457357284380123, "learning_rate": 1.7592736566504414e-06, "loss": 0.0, "num_input_tokens_seen": 176749824, "step": 81955 }, { "epoch": 15.04129198017985, "grad_norm": 6.462732562795281e-05, "learning_rate": 1.7586639086172585e-06, "loss": 0.0001, "num_input_tokens_seen": 176760800, "step": 81960 }, { "epoch": 15.042209579739401, "grad_norm": 0.00016998623323161155, "learning_rate": 1.7580542437193231e-06, "loss": 0.0, "num_input_tokens_seen": 176771712, "step": 81965 }, { "epoch": 15.043127179298954, "grad_norm": 9.99741823761724e-05, "learning_rate": 1.7574446619722723e-06, "loss": 0.002, "num_input_tokens_seen": 176783040, "step": 81970 }, { "epoch": 15.044044778858506, "grad_norm": 0.00010766168998088688, "learning_rate": 1.7568351633917396e-06, "loss": 0.0, "num_input_tokens_seen": 176793920, "step": 81975 }, { "epoch": 15.044962378418058, "grad_norm": 0.0017217600252479315, "learning_rate": 1.7562257479933576e-06, "loss": 0.0, "num_input_tokens_seen": 176805088, "step": 81980 }, { "epoch": 15.045879977977611, "grad_norm": 0.003742011496797204, "learning_rate": 1.7556164157927586e-06, "loss": 0.0, "num_input_tokens_seen": 176816064, "step": 81985 }, { "epoch": 15.046797577537163, "grad_norm": 8.723318751435727e-05, "learning_rate": 1.7550071668055708e-06, "loss": 0.0, "num_input_tokens_seen": 176827776, "step": 81990 }, { "epoch": 15.047715177096714, "grad_norm": 0.0002224125637440011, "learning_rate": 1.7543980010474198e-06, "loss": 0.0, "num_input_tokens_seen": 176838144, "step": 81995 }, { "epoch": 15.048632776656268, "grad_norm": 0.00017686716455500573, "learning_rate": 1.7537889185339296e-06, "loss": 0.0, "num_input_tokens_seen": 176849216, "step": 82000 }, { "epoch": 15.04955037621582, "grad_norm": 0.0005288569373078644, "learning_rate": 1.7531799192807208e-06, "loss": 0.0, "num_input_tokens_seen": 176860032, "step": 82005 }, { "epoch": 15.050467975775371, "grad_norm": 0.00035958673106506467, "learning_rate": 1.752571003303417e-06, "loss": 0.0, "num_input_tokens_seen": 176870432, "step": 82010 }, { "epoch": 15.051385575334924, "grad_norm": 0.006763229612261057, "learning_rate": 1.7519621706176337e-06, "loss": 0.0, "num_input_tokens_seen": 176881056, "step": 82015 }, { "epoch": 15.052303174894476, "grad_norm": 9.963879710994661e-05, "learning_rate": 1.7513534212389865e-06, "loss": 0.0, "num_input_tokens_seen": 176890368, "step": 82020 }, { "epoch": 15.053220774454028, "grad_norm": 0.0038824023213237524, "learning_rate": 1.7507447551830875e-06, "loss": 0.0, "num_input_tokens_seen": 176900160, "step": 82025 }, { "epoch": 15.054138374013581, "grad_norm": 0.00019417796283960342, "learning_rate": 1.7501361724655519e-06, "loss": 0.0, "num_input_tokens_seen": 176911840, "step": 82030 }, { "epoch": 15.055055973573133, "grad_norm": 0.00010172335169045255, "learning_rate": 1.7495276731019862e-06, "loss": 0.0001, "num_input_tokens_seen": 176921888, "step": 82035 }, { "epoch": 15.055973573132684, "grad_norm": 0.0004083447565790266, "learning_rate": 1.748919257107996e-06, "loss": 0.0, "num_input_tokens_seen": 176932768, "step": 82040 }, { "epoch": 15.056891172692238, "grad_norm": 0.00012252986198291183, "learning_rate": 1.7483109244991896e-06, "loss": 0.0001, "num_input_tokens_seen": 176942784, "step": 82045 }, { "epoch": 15.05780877225179, "grad_norm": 0.00022276076197158545, "learning_rate": 1.7477026752911691e-06, "loss": 0.0, "num_input_tokens_seen": 176953568, "step": 82050 }, { "epoch": 15.05872637181134, "grad_norm": 0.00021312132594175637, "learning_rate": 1.747094509499534e-06, "loss": 0.0, "num_input_tokens_seen": 176964544, "step": 82055 }, { "epoch": 15.059643971370894, "grad_norm": 0.0009763712296262383, "learning_rate": 1.746486427139882e-06, "loss": 0.0, "num_input_tokens_seen": 176975872, "step": 82060 }, { "epoch": 15.060561570930446, "grad_norm": 0.00016209868772421032, "learning_rate": 1.7458784282278112e-06, "loss": 0.0, "num_input_tokens_seen": 176986400, "step": 82065 }, { "epoch": 15.061479170489998, "grad_norm": 0.0001610004692338407, "learning_rate": 1.745270512778916e-06, "loss": 0.0, "num_input_tokens_seen": 176997216, "step": 82070 }, { "epoch": 15.062396770049551, "grad_norm": 0.0017181859584525228, "learning_rate": 1.7446626808087864e-06, "loss": 0.0, "num_input_tokens_seen": 177008384, "step": 82075 }, { "epoch": 15.063314369609103, "grad_norm": 0.010847779922187328, "learning_rate": 1.7440549323330148e-06, "loss": 0.0, "num_input_tokens_seen": 177019872, "step": 82080 }, { "epoch": 15.064231969168654, "grad_norm": 0.00011398249625926837, "learning_rate": 1.743447267367188e-06, "loss": 0.0, "num_input_tokens_seen": 177029696, "step": 82085 }, { "epoch": 15.065149568728208, "grad_norm": 0.00029230196378193796, "learning_rate": 1.7428396859268903e-06, "loss": 0.0703, "num_input_tokens_seen": 177040576, "step": 82090 }, { "epoch": 15.06606716828776, "grad_norm": 0.00027911615325137973, "learning_rate": 1.7422321880277082e-06, "loss": 0.0, "num_input_tokens_seen": 177051488, "step": 82095 }, { "epoch": 15.06698476784731, "grad_norm": 8.159859862644225e-05, "learning_rate": 1.741624773685221e-06, "loss": 0.0, "num_input_tokens_seen": 177062048, "step": 82100 }, { "epoch": 15.067902367406864, "grad_norm": 0.0002836707280948758, "learning_rate": 1.7410174429150085e-06, "loss": 0.0, "num_input_tokens_seen": 177073088, "step": 82105 }, { "epoch": 15.068819966966416, "grad_norm": 0.0017476481152698398, "learning_rate": 1.7404101957326457e-06, "loss": 0.1438, "num_input_tokens_seen": 177083008, "step": 82110 }, { "epoch": 15.069737566525967, "grad_norm": 0.0002750611456576735, "learning_rate": 1.7398030321537117e-06, "loss": 0.0, "num_input_tokens_seen": 177093664, "step": 82115 }, { "epoch": 15.07065516608552, "grad_norm": 0.0027775513008236885, "learning_rate": 1.7391959521937767e-06, "loss": 0.0, "num_input_tokens_seen": 177104928, "step": 82120 }, { "epoch": 15.071572765645072, "grad_norm": 0.00019063486251980066, "learning_rate": 1.73858895586841e-06, "loss": 0.0, "num_input_tokens_seen": 177115968, "step": 82125 }, { "epoch": 15.072490365204624, "grad_norm": 0.00017451417807023972, "learning_rate": 1.7379820431931838e-06, "loss": 0.0, "num_input_tokens_seen": 177126464, "step": 82130 }, { "epoch": 15.073407964764177, "grad_norm": 0.0008099928381852806, "learning_rate": 1.7373752141836625e-06, "loss": 0.0, "num_input_tokens_seen": 177137536, "step": 82135 }, { "epoch": 15.074325564323729, "grad_norm": 0.0001867997634690255, "learning_rate": 1.7367684688554103e-06, "loss": 0.0, "num_input_tokens_seen": 177148352, "step": 82140 }, { "epoch": 15.07524316388328, "grad_norm": 9.762747504282743e-05, "learning_rate": 1.7361618072239877e-06, "loss": 0.0, "num_input_tokens_seen": 177157536, "step": 82145 }, { "epoch": 15.076160763442834, "grad_norm": 0.00019599880033638328, "learning_rate": 1.7355552293049578e-06, "loss": 0.0, "num_input_tokens_seen": 177168352, "step": 82150 }, { "epoch": 15.077078363002386, "grad_norm": 9.667594713391736e-05, "learning_rate": 1.7349487351138766e-06, "loss": 0.0, "num_input_tokens_seen": 177178688, "step": 82155 }, { "epoch": 15.077995962561937, "grad_norm": 0.0001933237654156983, "learning_rate": 1.7343423246662988e-06, "loss": 0.0, "num_input_tokens_seen": 177189632, "step": 82160 }, { "epoch": 15.07891356212149, "grad_norm": 0.010368192568421364, "learning_rate": 1.7337359979777802e-06, "loss": 0.0, "num_input_tokens_seen": 177201472, "step": 82165 }, { "epoch": 15.079831161681042, "grad_norm": 0.00010209789616055787, "learning_rate": 1.7331297550638714e-06, "loss": 0.0, "num_input_tokens_seen": 177211808, "step": 82170 }, { "epoch": 15.080748761240594, "grad_norm": 0.00010288221528753638, "learning_rate": 1.7325235959401194e-06, "loss": 0.0, "num_input_tokens_seen": 177221280, "step": 82175 }, { "epoch": 15.081666360800147, "grad_norm": 0.00021836714586243033, "learning_rate": 1.7319175206220745e-06, "loss": 0.0, "num_input_tokens_seen": 177231200, "step": 82180 }, { "epoch": 15.082583960359699, "grad_norm": 0.002716916147619486, "learning_rate": 1.7313115291252809e-06, "loss": 0.0, "num_input_tokens_seen": 177242240, "step": 82185 }, { "epoch": 15.08350155991925, "grad_norm": 0.00014794727030675858, "learning_rate": 1.7307056214652796e-06, "loss": 0.0, "num_input_tokens_seen": 177253600, "step": 82190 }, { "epoch": 15.084419159478804, "grad_norm": 0.0023422944359481335, "learning_rate": 1.7300997976576128e-06, "loss": 0.0, "num_input_tokens_seen": 177264864, "step": 82195 }, { "epoch": 15.085336759038356, "grad_norm": 0.0001788811496226117, "learning_rate": 1.7294940577178164e-06, "loss": 0.0, "num_input_tokens_seen": 177276864, "step": 82200 }, { "epoch": 15.086254358597907, "grad_norm": 0.001221562852151692, "learning_rate": 1.7288884016614305e-06, "loss": 0.0, "num_input_tokens_seen": 177288704, "step": 82205 }, { "epoch": 15.08717195815746, "grad_norm": 0.00023675545526202768, "learning_rate": 1.7282828295039866e-06, "loss": 0.2406, "num_input_tokens_seen": 177299840, "step": 82210 }, { "epoch": 15.088089557717012, "grad_norm": 0.00017513963393867016, "learning_rate": 1.7276773412610181e-06, "loss": 0.0, "num_input_tokens_seen": 177310528, "step": 82215 }, { "epoch": 15.089007157276564, "grad_norm": 0.0006665470427833498, "learning_rate": 1.7270719369480543e-06, "loss": 0.0, "num_input_tokens_seen": 177321600, "step": 82220 }, { "epoch": 15.089924756836117, "grad_norm": 0.0073247128166258335, "learning_rate": 1.7264666165806205e-06, "loss": 0.0, "num_input_tokens_seen": 177332768, "step": 82225 }, { "epoch": 15.090842356395669, "grad_norm": 0.0002675494470167905, "learning_rate": 1.7258613801742463e-06, "loss": 0.2, "num_input_tokens_seen": 177343424, "step": 82230 }, { "epoch": 15.09175995595522, "grad_norm": 0.00020824863167945296, "learning_rate": 1.7252562277444534e-06, "loss": 0.0, "num_input_tokens_seen": 177354112, "step": 82235 }, { "epoch": 15.092677555514774, "grad_norm": 0.00010623126581776887, "learning_rate": 1.7246511593067627e-06, "loss": 0.0, "num_input_tokens_seen": 177364256, "step": 82240 }, { "epoch": 15.093595155074325, "grad_norm": 0.00031549049890600145, "learning_rate": 1.7240461748766917e-06, "loss": 0.0, "num_input_tokens_seen": 177373568, "step": 82245 }, { "epoch": 15.094512754633877, "grad_norm": 0.008416473865509033, "learning_rate": 1.723441274469761e-06, "loss": 0.0, "num_input_tokens_seen": 177385792, "step": 82250 }, { "epoch": 15.09543035419343, "grad_norm": 0.00012561252515297383, "learning_rate": 1.7228364581014834e-06, "loss": 0.0, "num_input_tokens_seen": 177395008, "step": 82255 }, { "epoch": 15.096347953752982, "grad_norm": 0.0001715684193186462, "learning_rate": 1.72223172578737e-06, "loss": 0.0, "num_input_tokens_seen": 177405696, "step": 82260 }, { "epoch": 15.097265553312534, "grad_norm": 0.0001392389094689861, "learning_rate": 1.721627077542934e-06, "loss": 0.0, "num_input_tokens_seen": 177416672, "step": 82265 }, { "epoch": 15.098183152872087, "grad_norm": 36.72614669799805, "learning_rate": 1.7210225133836828e-06, "loss": 0.001, "num_input_tokens_seen": 177428192, "step": 82270 }, { "epoch": 15.099100752431639, "grad_norm": 6.655295874224976e-05, "learning_rate": 1.720418033325122e-06, "loss": 0.0, "num_input_tokens_seen": 177439296, "step": 82275 }, { "epoch": 15.10001835199119, "grad_norm": 0.007546324282884598, "learning_rate": 1.719813637382754e-06, "loss": 0.0, "num_input_tokens_seen": 177450176, "step": 82280 }, { "epoch": 15.100935951550744, "grad_norm": 0.00010838269372470677, "learning_rate": 1.7192093255720838e-06, "loss": 0.0, "num_input_tokens_seen": 177460576, "step": 82285 }, { "epoch": 15.101853551110295, "grad_norm": 0.00015066850755829364, "learning_rate": 1.71860509790861e-06, "loss": 0.0, "num_input_tokens_seen": 177471040, "step": 82290 }, { "epoch": 15.102771150669847, "grad_norm": 0.0018762932159006596, "learning_rate": 1.718000954407828e-06, "loss": 0.0, "num_input_tokens_seen": 177481824, "step": 82295 }, { "epoch": 15.1036887502294, "grad_norm": 0.00015482361777685583, "learning_rate": 1.7173968950852366e-06, "loss": 0.0, "num_input_tokens_seen": 177492544, "step": 82300 }, { "epoch": 15.104606349788952, "grad_norm": 0.0003683005052153021, "learning_rate": 1.7167929199563272e-06, "loss": 0.0, "num_input_tokens_seen": 177502432, "step": 82305 }, { "epoch": 15.105523949348504, "grad_norm": 0.0007318633142858744, "learning_rate": 1.716189029036589e-06, "loss": 0.0, "num_input_tokens_seen": 177512448, "step": 82310 }, { "epoch": 15.106441548908057, "grad_norm": 7.559249934274703e-05, "learning_rate": 1.715585222341515e-06, "loss": 0.0, "num_input_tokens_seen": 177523072, "step": 82315 }, { "epoch": 15.107359148467609, "grad_norm": 0.0001491199800511822, "learning_rate": 1.7149814998865894e-06, "loss": 0.0, "num_input_tokens_seen": 177534752, "step": 82320 }, { "epoch": 15.10827674802716, "grad_norm": 0.00010032556019723415, "learning_rate": 1.7143778616872968e-06, "loss": 0.0, "num_input_tokens_seen": 177544512, "step": 82325 }, { "epoch": 15.109194347586714, "grad_norm": 0.00033859856193885207, "learning_rate": 1.7137743077591184e-06, "loss": 0.3375, "num_input_tokens_seen": 177556416, "step": 82330 }, { "epoch": 15.110111947146265, "grad_norm": 0.0025547866243869066, "learning_rate": 1.713170838117537e-06, "loss": 0.0, "num_input_tokens_seen": 177567616, "step": 82335 }, { "epoch": 15.111029546705817, "grad_norm": 0.0003640448267105967, "learning_rate": 1.71256745277803e-06, "loss": 0.0, "num_input_tokens_seen": 177579040, "step": 82340 }, { "epoch": 15.11194714626537, "grad_norm": 0.003894251538440585, "learning_rate": 1.7119641517560709e-06, "loss": 0.0, "num_input_tokens_seen": 177590848, "step": 82345 }, { "epoch": 15.112864745824922, "grad_norm": 0.0005094525404274464, "learning_rate": 1.7113609350671372e-06, "loss": 0.0, "num_input_tokens_seen": 177601056, "step": 82350 }, { "epoch": 15.113782345384474, "grad_norm": 0.0001143787449109368, "learning_rate": 1.710757802726698e-06, "loss": 0.0, "num_input_tokens_seen": 177610688, "step": 82355 }, { "epoch": 15.114699944944027, "grad_norm": 0.0003966282238252461, "learning_rate": 1.7101547547502223e-06, "loss": 0.0, "num_input_tokens_seen": 177622688, "step": 82360 }, { "epoch": 15.115617544503579, "grad_norm": 0.00021206373639870435, "learning_rate": 1.70955179115318e-06, "loss": 0.0, "num_input_tokens_seen": 177634656, "step": 82365 }, { "epoch": 15.11653514406313, "grad_norm": 8.844827971188352e-05, "learning_rate": 1.708948911951034e-06, "loss": 0.0, "num_input_tokens_seen": 177646304, "step": 82370 }, { "epoch": 15.117452743622684, "grad_norm": 0.0010234818328171968, "learning_rate": 1.708346117159248e-06, "loss": 0.0, "num_input_tokens_seen": 177655744, "step": 82375 }, { "epoch": 15.118370343182235, "grad_norm": 0.09019813686609268, "learning_rate": 1.7077434067932808e-06, "loss": 0.0, "num_input_tokens_seen": 177667456, "step": 82380 }, { "epoch": 15.119287942741787, "grad_norm": 0.00010598028165986761, "learning_rate": 1.7071407808685946e-06, "loss": 0.0, "num_input_tokens_seen": 177679200, "step": 82385 }, { "epoch": 15.12020554230134, "grad_norm": 0.00014131912030279636, "learning_rate": 1.7065382394006436e-06, "loss": 0.0, "num_input_tokens_seen": 177690528, "step": 82390 }, { "epoch": 15.121123141860892, "grad_norm": 0.0025226117577403784, "learning_rate": 1.7059357824048805e-06, "loss": 0.0, "num_input_tokens_seen": 177702112, "step": 82395 }, { "epoch": 15.122040741420443, "grad_norm": 0.002091701840981841, "learning_rate": 1.7053334098967616e-06, "loss": 0.0, "num_input_tokens_seen": 177712704, "step": 82400 }, { "epoch": 15.122958340979997, "grad_norm": 0.0004459745832718909, "learning_rate": 1.704731121891734e-06, "loss": 0.0, "num_input_tokens_seen": 177724128, "step": 82405 }, { "epoch": 15.123875940539548, "grad_norm": 0.015349172987043858, "learning_rate": 1.7041289184052462e-06, "loss": 0.0, "num_input_tokens_seen": 177735104, "step": 82410 }, { "epoch": 15.1247935400991, "grad_norm": 0.0009872743394225836, "learning_rate": 1.7035267994527433e-06, "loss": 0.0, "num_input_tokens_seen": 177744160, "step": 82415 }, { "epoch": 15.125711139658653, "grad_norm": 0.0007376351859420538, "learning_rate": 1.7029247650496672e-06, "loss": 0.0, "num_input_tokens_seen": 177755360, "step": 82420 }, { "epoch": 15.126628739218205, "grad_norm": 0.00016175859491340816, "learning_rate": 1.7023228152114625e-06, "loss": 0.0284, "num_input_tokens_seen": 177765248, "step": 82425 }, { "epoch": 15.127546338777757, "grad_norm": 9.564743231749162e-05, "learning_rate": 1.7017209499535664e-06, "loss": 0.0, "num_input_tokens_seen": 177777152, "step": 82430 }, { "epoch": 15.12846393833731, "grad_norm": 0.0020882387179881334, "learning_rate": 1.7011191692914165e-06, "loss": 0.0029, "num_input_tokens_seen": 177789824, "step": 82435 }, { "epoch": 15.129381537896862, "grad_norm": 0.0004293479141779244, "learning_rate": 1.7005174732404473e-06, "loss": 0.0, "num_input_tokens_seen": 177800288, "step": 82440 }, { "epoch": 15.130299137456413, "grad_norm": 0.00014641611778642982, "learning_rate": 1.6999158618160888e-06, "loss": 0.0, "num_input_tokens_seen": 177811424, "step": 82445 }, { "epoch": 15.131216737015967, "grad_norm": 0.0002448732848279178, "learning_rate": 1.699314335033776e-06, "loss": 0.0, "num_input_tokens_seen": 177822016, "step": 82450 }, { "epoch": 15.132134336575518, "grad_norm": 0.005856410600244999, "learning_rate": 1.6987128929089346e-06, "loss": 0.0, "num_input_tokens_seen": 177831744, "step": 82455 }, { "epoch": 15.13305193613507, "grad_norm": 0.0004960199585184455, "learning_rate": 1.6981115354569915e-06, "loss": 0.0002, "num_input_tokens_seen": 177842816, "step": 82460 }, { "epoch": 15.133969535694623, "grad_norm": 0.00029700540471822023, "learning_rate": 1.6975102626933683e-06, "loss": 0.0001, "num_input_tokens_seen": 177853152, "step": 82465 }, { "epoch": 15.134887135254175, "grad_norm": 5.6335942645091563e-05, "learning_rate": 1.6969090746334893e-06, "loss": 0.0, "num_input_tokens_seen": 177862592, "step": 82470 }, { "epoch": 15.135804734813727, "grad_norm": 8.488605817547068e-05, "learning_rate": 1.6963079712927737e-06, "loss": 0.0, "num_input_tokens_seen": 177874656, "step": 82475 }, { "epoch": 15.13672233437328, "grad_norm": 9.12318573682569e-05, "learning_rate": 1.695706952686637e-06, "loss": 0.0, "num_input_tokens_seen": 177886240, "step": 82480 }, { "epoch": 15.137639933932832, "grad_norm": 6.355358345899731e-05, "learning_rate": 1.6951060188304975e-06, "loss": 0.0, "num_input_tokens_seen": 177897248, "step": 82485 }, { "epoch": 15.138557533492383, "grad_norm": 0.0019692431669682264, "learning_rate": 1.6945051697397658e-06, "loss": 0.0, "num_input_tokens_seen": 177906528, "step": 82490 }, { "epoch": 15.139475133051937, "grad_norm": 0.00025638207443989813, "learning_rate": 1.693904405429852e-06, "loss": 0.0, "num_input_tokens_seen": 177917184, "step": 82495 }, { "epoch": 15.140392732611488, "grad_norm": 0.0010546734556555748, "learning_rate": 1.6933037259161682e-06, "loss": 0.0, "num_input_tokens_seen": 177926752, "step": 82500 }, { "epoch": 15.14131033217104, "grad_norm": 972.4760131835938, "learning_rate": 1.692703131214119e-06, "loss": 0.0943, "num_input_tokens_seen": 177936960, "step": 82505 }, { "epoch": 15.142227931730593, "grad_norm": 0.004706973675638437, "learning_rate": 1.6921026213391083e-06, "loss": 0.0, "num_input_tokens_seen": 177946944, "step": 82510 }, { "epoch": 15.143145531290145, "grad_norm": 0.00012523483019322157, "learning_rate": 1.691502196306537e-06, "loss": 0.0, "num_input_tokens_seen": 177958304, "step": 82515 }, { "epoch": 15.144063130849696, "grad_norm": 0.00011379345232853666, "learning_rate": 1.6909018561318086e-06, "loss": 0.0, "num_input_tokens_seen": 177968800, "step": 82520 }, { "epoch": 15.14498073040925, "grad_norm": 0.00029940661625005305, "learning_rate": 1.6903016008303187e-06, "loss": 0.0, "num_input_tokens_seen": 177979360, "step": 82525 }, { "epoch": 15.145898329968801, "grad_norm": 7.668224861845374e-05, "learning_rate": 1.6897014304174615e-06, "loss": 0.0, "num_input_tokens_seen": 177989376, "step": 82530 }, { "epoch": 15.146815929528353, "grad_norm": 0.00017835857579484582, "learning_rate": 1.6891013449086335e-06, "loss": 0.0, "num_input_tokens_seen": 178000704, "step": 82535 }, { "epoch": 15.147733529087906, "grad_norm": 0.00025942560750991106, "learning_rate": 1.688501344319225e-06, "loss": 0.0, "num_input_tokens_seen": 178012256, "step": 82540 }, { "epoch": 15.148651128647458, "grad_norm": 0.017185579985380173, "learning_rate": 1.6879014286646228e-06, "loss": 0.0, "num_input_tokens_seen": 178022688, "step": 82545 }, { "epoch": 15.14956872820701, "grad_norm": 0.00014055061910767108, "learning_rate": 1.6873015979602176e-06, "loss": 0.0, "num_input_tokens_seen": 178031872, "step": 82550 }, { "epoch": 15.150486327766563, "grad_norm": 0.00014298177848104388, "learning_rate": 1.6867018522213918e-06, "loss": 0.0, "num_input_tokens_seen": 178043456, "step": 82555 }, { "epoch": 15.151403927326115, "grad_norm": 0.00011043006816180423, "learning_rate": 1.6861021914635289e-06, "loss": 0.0, "num_input_tokens_seen": 178054624, "step": 82560 }, { "epoch": 15.152321526885666, "grad_norm": 186.2545166015625, "learning_rate": 1.6855026157020066e-06, "loss": 0.0674, "num_input_tokens_seen": 178066144, "step": 82565 }, { "epoch": 15.15323912644522, "grad_norm": 0.0002627021458465606, "learning_rate": 1.684903124952207e-06, "loss": 0.0, "num_input_tokens_seen": 178076512, "step": 82570 }, { "epoch": 15.154156726004771, "grad_norm": 0.00044791059917770326, "learning_rate": 1.6843037192295042e-06, "loss": 0.0, "num_input_tokens_seen": 178088032, "step": 82575 }, { "epoch": 15.155074325564323, "grad_norm": 0.021019577980041504, "learning_rate": 1.6837043985492707e-06, "loss": 0.0, "num_input_tokens_seen": 178099008, "step": 82580 }, { "epoch": 15.155991925123876, "grad_norm": 0.0008964581647887826, "learning_rate": 1.6831051629268807e-06, "loss": 0.0, "num_input_tokens_seen": 178109952, "step": 82585 }, { "epoch": 15.156909524683428, "grad_norm": 0.00015688077837694436, "learning_rate": 1.6825060123777032e-06, "loss": 0.0, "num_input_tokens_seen": 178121568, "step": 82590 }, { "epoch": 15.15782712424298, "grad_norm": 0.001056983950547874, "learning_rate": 1.6819069469171045e-06, "loss": 0.0001, "num_input_tokens_seen": 178132320, "step": 82595 }, { "epoch": 15.158744723802533, "grad_norm": 0.000234407969401218, "learning_rate": 1.6813079665604487e-06, "loss": 0.0, "num_input_tokens_seen": 178141792, "step": 82600 }, { "epoch": 15.159662323362085, "grad_norm": 0.0002028344024438411, "learning_rate": 1.6807090713231012e-06, "loss": 0.0, "num_input_tokens_seen": 178153184, "step": 82605 }, { "epoch": 15.160579922921636, "grad_norm": 7.370010280283168e-05, "learning_rate": 1.6801102612204218e-06, "loss": 0.0, "num_input_tokens_seen": 178164512, "step": 82610 }, { "epoch": 15.16149752248119, "grad_norm": 0.00016827146464493126, "learning_rate": 1.6795115362677671e-06, "loss": 0.0, "num_input_tokens_seen": 178175712, "step": 82615 }, { "epoch": 15.162415122040741, "grad_norm": 0.0008823417592793703, "learning_rate": 1.6789128964804973e-06, "loss": 0.0, "num_input_tokens_seen": 178187360, "step": 82620 }, { "epoch": 15.163332721600293, "grad_norm": 0.003826951840892434, "learning_rate": 1.6783143418739639e-06, "loss": 0.0, "num_input_tokens_seen": 178196736, "step": 82625 }, { "epoch": 15.164250321159846, "grad_norm": 0.0011209155200049281, "learning_rate": 1.6777158724635202e-06, "loss": 0.0001, "num_input_tokens_seen": 178207136, "step": 82630 }, { "epoch": 15.165167920719398, "grad_norm": 0.004369609989225864, "learning_rate": 1.6771174882645147e-06, "loss": 0.0, "num_input_tokens_seen": 178217408, "step": 82635 }, { "epoch": 15.16608552027895, "grad_norm": 0.00023253289691638201, "learning_rate": 1.6765191892922956e-06, "loss": 0.0003, "num_input_tokens_seen": 178227552, "step": 82640 }, { "epoch": 15.167003119838503, "grad_norm": 97.69878387451172, "learning_rate": 1.675920975562207e-06, "loss": 0.0883, "num_input_tokens_seen": 178238816, "step": 82645 }, { "epoch": 15.167920719398055, "grad_norm": 0.0018266639672219753, "learning_rate": 1.675322847089595e-06, "loss": 0.0, "num_input_tokens_seen": 178250368, "step": 82650 }, { "epoch": 15.168838318957606, "grad_norm": 0.00028267671586945653, "learning_rate": 1.674724803889799e-06, "loss": 0.0, "num_input_tokens_seen": 178261152, "step": 82655 }, { "epoch": 15.16975591851716, "grad_norm": 8.440038800472394e-05, "learning_rate": 1.6741268459781584e-06, "loss": 0.0, "num_input_tokens_seen": 178273184, "step": 82660 }, { "epoch": 15.170673518076711, "grad_norm": 6.765510624973103e-05, "learning_rate": 1.6735289733700078e-06, "loss": 0.0028, "num_input_tokens_seen": 178283776, "step": 82665 }, { "epoch": 15.171591117636263, "grad_norm": 0.00015595419972669333, "learning_rate": 1.6729311860806851e-06, "loss": 0.0, "num_input_tokens_seen": 178294368, "step": 82670 }, { "epoch": 15.172508717195816, "grad_norm": 0.031582143157720566, "learning_rate": 1.6723334841255212e-06, "loss": 0.0505, "num_input_tokens_seen": 178305728, "step": 82675 }, { "epoch": 15.173426316755368, "grad_norm": 0.00019069173140451312, "learning_rate": 1.6717358675198442e-06, "loss": 0.0028, "num_input_tokens_seen": 178315648, "step": 82680 }, { "epoch": 15.17434391631492, "grad_norm": 0.00010101220686919987, "learning_rate": 1.6711383362789857e-06, "loss": 0.0, "num_input_tokens_seen": 178325376, "step": 82685 }, { "epoch": 15.175261515874473, "grad_norm": 9.917681018123403e-05, "learning_rate": 1.6705408904182696e-06, "loss": 0.0011, "num_input_tokens_seen": 178336064, "step": 82690 }, { "epoch": 15.176179115434024, "grad_norm": 0.0003401250869501382, "learning_rate": 1.6699435299530191e-06, "loss": 0.0, "num_input_tokens_seen": 178346848, "step": 82695 }, { "epoch": 15.177096714993576, "grad_norm": 0.00018194250878877938, "learning_rate": 1.6693462548985545e-06, "loss": 0.0, "num_input_tokens_seen": 178358400, "step": 82700 }, { "epoch": 15.17801431455313, "grad_norm": 0.015072409063577652, "learning_rate": 1.6687490652701982e-06, "loss": 0.0, "num_input_tokens_seen": 178369024, "step": 82705 }, { "epoch": 15.178931914112681, "grad_norm": 0.00015577051090076566, "learning_rate": 1.6681519610832653e-06, "loss": 0.0, "num_input_tokens_seen": 178380224, "step": 82710 }, { "epoch": 15.179849513672233, "grad_norm": 0.00018388405442237854, "learning_rate": 1.6675549423530685e-06, "loss": 0.0, "num_input_tokens_seen": 178391232, "step": 82715 }, { "epoch": 15.180767113231786, "grad_norm": 0.0018534325063228607, "learning_rate": 1.666958009094925e-06, "loss": 0.0, "num_input_tokens_seen": 178401440, "step": 82720 }, { "epoch": 15.181684712791338, "grad_norm": 0.00014020653907209635, "learning_rate": 1.6663611613241427e-06, "loss": 0.0, "num_input_tokens_seen": 178410624, "step": 82725 }, { "epoch": 15.18260231235089, "grad_norm": 0.00022214655473362654, "learning_rate": 1.665764399056028e-06, "loss": 0.0, "num_input_tokens_seen": 178421024, "step": 82730 }, { "epoch": 15.183519911910443, "grad_norm": 9.318414959125221e-05, "learning_rate": 1.6651677223058909e-06, "loss": 0.0, "num_input_tokens_seen": 178432448, "step": 82735 }, { "epoch": 15.184437511469994, "grad_norm": 0.003026131307706237, "learning_rate": 1.6645711310890328e-06, "loss": 0.0, "num_input_tokens_seen": 178443616, "step": 82740 }, { "epoch": 15.185355111029546, "grad_norm": 203.70339965820312, "learning_rate": 1.6639746254207562e-06, "loss": 0.1376, "num_input_tokens_seen": 178454464, "step": 82745 }, { "epoch": 15.1862727105891, "grad_norm": 7.851122791180387e-05, "learning_rate": 1.6633782053163578e-06, "loss": 0.0, "num_input_tokens_seen": 178465152, "step": 82750 }, { "epoch": 15.187190310148651, "grad_norm": 8.454331691609696e-05, "learning_rate": 1.6627818707911392e-06, "loss": 0.0, "num_input_tokens_seen": 178474912, "step": 82755 }, { "epoch": 15.188107909708203, "grad_norm": 0.005401620641350746, "learning_rate": 1.6621856218603932e-06, "loss": 0.0, "num_input_tokens_seen": 178484352, "step": 82760 }, { "epoch": 15.189025509267756, "grad_norm": 0.0006084452616050839, "learning_rate": 1.6615894585394115e-06, "loss": 0.0, "num_input_tokens_seen": 178494368, "step": 82765 }, { "epoch": 15.189943108827308, "grad_norm": 0.007193111348897219, "learning_rate": 1.6609933808434875e-06, "loss": 0.2063, "num_input_tokens_seen": 178505280, "step": 82770 }, { "epoch": 15.19086070838686, "grad_norm": 0.010330293327569962, "learning_rate": 1.6603973887879088e-06, "loss": 0.0, "num_input_tokens_seen": 178516768, "step": 82775 }, { "epoch": 15.191778307946413, "grad_norm": 0.0011509173782542348, "learning_rate": 1.6598014823879604e-06, "loss": 0.0, "num_input_tokens_seen": 178526848, "step": 82780 }, { "epoch": 15.192695907505964, "grad_norm": 0.7933951616287231, "learning_rate": 1.6592056616589258e-06, "loss": 0.0005, "num_input_tokens_seen": 178537248, "step": 82785 }, { "epoch": 15.193613507065516, "grad_norm": 0.000382964062737301, "learning_rate": 1.6586099266160904e-06, "loss": 0.0, "num_input_tokens_seen": 178547968, "step": 82790 }, { "epoch": 15.19453110662507, "grad_norm": 0.0008840053342282772, "learning_rate": 1.658014277274731e-06, "loss": 0.0, "num_input_tokens_seen": 178557760, "step": 82795 }, { "epoch": 15.19544870618462, "grad_norm": 0.00012638689076993614, "learning_rate": 1.6574187136501247e-06, "loss": 0.0, "num_input_tokens_seen": 178567584, "step": 82800 }, { "epoch": 15.196366305744172, "grad_norm": 9.614793088985607e-05, "learning_rate": 1.6568232357575486e-06, "loss": 0.0, "num_input_tokens_seen": 178578560, "step": 82805 }, { "epoch": 15.197283905303726, "grad_norm": 0.0022415274288505316, "learning_rate": 1.6562278436122759e-06, "loss": 0.0, "num_input_tokens_seen": 178588576, "step": 82810 }, { "epoch": 15.198201504863277, "grad_norm": 0.010694537311792374, "learning_rate": 1.6556325372295746e-06, "loss": 0.0, "num_input_tokens_seen": 178599072, "step": 82815 }, { "epoch": 15.199119104422829, "grad_norm": 0.0005100809503346682, "learning_rate": 1.6550373166247174e-06, "loss": 0.0, "num_input_tokens_seen": 178609952, "step": 82820 }, { "epoch": 15.200036703982382, "grad_norm": 7.126914715627208e-05, "learning_rate": 1.6544421818129685e-06, "loss": 0.0, "num_input_tokens_seen": 178619776, "step": 82825 }, { "epoch": 15.200954303541934, "grad_norm": 0.0007272089133039117, "learning_rate": 1.6538471328095922e-06, "loss": 0.0, "num_input_tokens_seen": 178630880, "step": 82830 }, { "epoch": 15.201871903101486, "grad_norm": 0.006841037422418594, "learning_rate": 1.6532521696298515e-06, "loss": 0.0, "num_input_tokens_seen": 178641088, "step": 82835 }, { "epoch": 15.20278950266104, "grad_norm": 0.0006557313026860356, "learning_rate": 1.6526572922890038e-06, "loss": 0.0, "num_input_tokens_seen": 178653216, "step": 82840 }, { "epoch": 15.20370710222059, "grad_norm": 0.2461574524641037, "learning_rate": 1.6520625008023106e-06, "loss": 0.0, "num_input_tokens_seen": 178664128, "step": 82845 }, { "epoch": 15.204624701780142, "grad_norm": 0.0017031572060659528, "learning_rate": 1.651467795185025e-06, "loss": 0.0, "num_input_tokens_seen": 178674592, "step": 82850 }, { "epoch": 15.205542301339696, "grad_norm": 0.00023738440359011292, "learning_rate": 1.6508731754524004e-06, "loss": 0.0, "num_input_tokens_seen": 178684928, "step": 82855 }, { "epoch": 15.206459900899247, "grad_norm": 0.0003355014487169683, "learning_rate": 1.6502786416196887e-06, "loss": 0.0, "num_input_tokens_seen": 178695232, "step": 82860 }, { "epoch": 15.207377500458799, "grad_norm": 0.14185680449008942, "learning_rate": 1.6496841937021363e-06, "loss": 0.0, "num_input_tokens_seen": 178706912, "step": 82865 }, { "epoch": 15.208295100018352, "grad_norm": 9.518433216726407e-05, "learning_rate": 1.649089831714994e-06, "loss": 0.0, "num_input_tokens_seen": 178717504, "step": 82870 }, { "epoch": 15.209212699577904, "grad_norm": 0.00017126962484326214, "learning_rate": 1.6484955556735033e-06, "loss": 0.0, "num_input_tokens_seen": 178727232, "step": 82875 }, { "epoch": 15.210130299137456, "grad_norm": 0.0003388495824765414, "learning_rate": 1.6479013655929077e-06, "loss": 0.0, "num_input_tokens_seen": 178738592, "step": 82880 }, { "epoch": 15.211047898697009, "grad_norm": 0.13599200546741486, "learning_rate": 1.647307261488445e-06, "loss": 0.0001, "num_input_tokens_seen": 178749472, "step": 82885 }, { "epoch": 15.21196549825656, "grad_norm": 0.00042995656258426607, "learning_rate": 1.6467132433753568e-06, "loss": 0.0, "num_input_tokens_seen": 178760000, "step": 82890 }, { "epoch": 15.212883097816112, "grad_norm": 0.0001331311505055055, "learning_rate": 1.646119311268876e-06, "loss": 0.0, "num_input_tokens_seen": 178771136, "step": 82895 }, { "epoch": 15.213800697375666, "grad_norm": 0.00027642634813673794, "learning_rate": 1.6455254651842361e-06, "loss": 0.0, "num_input_tokens_seen": 178782528, "step": 82900 }, { "epoch": 15.214718296935217, "grad_norm": 9.629041596781462e-05, "learning_rate": 1.6449317051366703e-06, "loss": 0.0, "num_input_tokens_seen": 178793600, "step": 82905 }, { "epoch": 15.215635896494769, "grad_norm": 0.001009552739560604, "learning_rate": 1.6443380311414065e-06, "loss": 0.0, "num_input_tokens_seen": 178806208, "step": 82910 }, { "epoch": 15.216553496054322, "grad_norm": 6.18318808847107e-05, "learning_rate": 1.6437444432136713e-06, "loss": 0.0, "num_input_tokens_seen": 178816544, "step": 82915 }, { "epoch": 15.217471095613874, "grad_norm": 0.00010337206185795367, "learning_rate": 1.6431509413686874e-06, "loss": 0.0, "num_input_tokens_seen": 178826624, "step": 82920 }, { "epoch": 15.218388695173426, "grad_norm": 0.01230715960264206, "learning_rate": 1.6425575256216815e-06, "loss": 0.0, "num_input_tokens_seen": 178837184, "step": 82925 }, { "epoch": 15.219306294732979, "grad_norm": 0.0002158429560950026, "learning_rate": 1.6419641959878712e-06, "loss": 0.0, "num_input_tokens_seen": 178847872, "step": 82930 }, { "epoch": 15.22022389429253, "grad_norm": 7.644232391612604e-05, "learning_rate": 1.6413709524824729e-06, "loss": 0.0, "num_input_tokens_seen": 178858368, "step": 82935 }, { "epoch": 15.221141493852082, "grad_norm": 0.0004470418789424002, "learning_rate": 1.6407777951207065e-06, "loss": 0.0, "num_input_tokens_seen": 178869472, "step": 82940 }, { "epoch": 15.222059093411636, "grad_norm": 0.00046816436224617064, "learning_rate": 1.6401847239177826e-06, "loss": 0.0, "num_input_tokens_seen": 178880064, "step": 82945 }, { "epoch": 15.222976692971187, "grad_norm": 0.0013644087594002485, "learning_rate": 1.6395917388889122e-06, "loss": 0.0, "num_input_tokens_seen": 178888352, "step": 82950 }, { "epoch": 15.223894292530739, "grad_norm": 0.00016316463006660342, "learning_rate": 1.6389988400493068e-06, "loss": 0.0, "num_input_tokens_seen": 178898880, "step": 82955 }, { "epoch": 15.224811892090292, "grad_norm": 0.0001464979286538437, "learning_rate": 1.6384060274141729e-06, "loss": 0.0, "num_input_tokens_seen": 178909824, "step": 82960 }, { "epoch": 15.225729491649844, "grad_norm": 0.00010810183448484167, "learning_rate": 1.6378133009987134e-06, "loss": 0.0, "num_input_tokens_seen": 178921600, "step": 82965 }, { "epoch": 15.226647091209395, "grad_norm": 0.0005917948437854648, "learning_rate": 1.6372206608181307e-06, "loss": 0.0, "num_input_tokens_seen": 178932704, "step": 82970 }, { "epoch": 15.227564690768949, "grad_norm": 0.00042761320946738124, "learning_rate": 1.6366281068876277e-06, "loss": 0.0, "num_input_tokens_seen": 178943392, "step": 82975 }, { "epoch": 15.2284822903285, "grad_norm": 0.0006221352959983051, "learning_rate": 1.6360356392224009e-06, "loss": 0.0, "num_input_tokens_seen": 178953472, "step": 82980 }, { "epoch": 15.229399889888052, "grad_norm": 0.004370047245174646, "learning_rate": 1.635443257837645e-06, "loss": 0.0, "num_input_tokens_seen": 178964928, "step": 82985 }, { "epoch": 15.230317489447605, "grad_norm": 0.0008765275706537068, "learning_rate": 1.6348509627485558e-06, "loss": 0.0, "num_input_tokens_seen": 178976000, "step": 82990 }, { "epoch": 15.231235089007157, "grad_norm": 0.004569144919514656, "learning_rate": 1.6342587539703247e-06, "loss": 0.0, "num_input_tokens_seen": 178986496, "step": 82995 }, { "epoch": 15.232152688566709, "grad_norm": 9.630562271922827e-05, "learning_rate": 1.6336666315181382e-06, "loss": 0.0, "num_input_tokens_seen": 178997984, "step": 83000 }, { "epoch": 15.233070288126262, "grad_norm": 0.0008712153066881001, "learning_rate": 1.6330745954071869e-06, "loss": 0.0, "num_input_tokens_seen": 179008320, "step": 83005 }, { "epoch": 15.233987887685814, "grad_norm": 0.00011124931916128844, "learning_rate": 1.6324826456526544e-06, "loss": 0.0, "num_input_tokens_seen": 179017792, "step": 83010 }, { "epoch": 15.234905487245365, "grad_norm": 0.00010444044164614752, "learning_rate": 1.6318907822697222e-06, "loss": 0.0, "num_input_tokens_seen": 179028416, "step": 83015 }, { "epoch": 15.235823086804919, "grad_norm": 9.17441793717444e-05, "learning_rate": 1.63129900527357e-06, "loss": 0.0, "num_input_tokens_seen": 179039008, "step": 83020 }, { "epoch": 15.23674068636447, "grad_norm": 0.0001139554733526893, "learning_rate": 1.6307073146793788e-06, "loss": 0.0, "num_input_tokens_seen": 179049792, "step": 83025 }, { "epoch": 15.237658285924022, "grad_norm": 0.00013266982568893582, "learning_rate": 1.630115710502323e-06, "loss": 0.0, "num_input_tokens_seen": 179060032, "step": 83030 }, { "epoch": 15.238575885483575, "grad_norm": 0.0006032960372976959, "learning_rate": 1.629524192757575e-06, "loss": 0.0, "num_input_tokens_seen": 179070816, "step": 83035 }, { "epoch": 15.239493485043127, "grad_norm": 0.010203620418906212, "learning_rate": 1.6289327614603096e-06, "loss": 0.0, "num_input_tokens_seen": 179081728, "step": 83040 }, { "epoch": 15.240411084602679, "grad_norm": 0.00013813750410918146, "learning_rate": 1.6283414166256933e-06, "loss": 0.0, "num_input_tokens_seen": 179090464, "step": 83045 }, { "epoch": 15.241328684162232, "grad_norm": 0.0016515494789928198, "learning_rate": 1.6277501582688948e-06, "loss": 0.0, "num_input_tokens_seen": 179101184, "step": 83050 }, { "epoch": 15.242246283721784, "grad_norm": 0.2029426246881485, "learning_rate": 1.627158986405078e-06, "loss": 0.0001, "num_input_tokens_seen": 179111264, "step": 83055 }, { "epoch": 15.243163883281335, "grad_norm": 0.0005619202274829149, "learning_rate": 1.626567901049404e-06, "loss": 0.0, "num_input_tokens_seen": 179122240, "step": 83060 }, { "epoch": 15.244081482840889, "grad_norm": 0.00016076088650152087, "learning_rate": 1.6259769022170368e-06, "loss": 0.0, "num_input_tokens_seen": 179132672, "step": 83065 }, { "epoch": 15.24499908240044, "grad_norm": 9.379328548675403e-05, "learning_rate": 1.6253859899231327e-06, "loss": 0.0097, "num_input_tokens_seen": 179144608, "step": 83070 }, { "epoch": 15.245916681959992, "grad_norm": 0.0010419485624879599, "learning_rate": 1.624795164182848e-06, "loss": 0.0, "num_input_tokens_seen": 179154208, "step": 83075 }, { "epoch": 15.246834281519545, "grad_norm": 0.0011083714198321104, "learning_rate": 1.624204425011336e-06, "loss": 0.0, "num_input_tokens_seen": 179165088, "step": 83080 }, { "epoch": 15.247751881079097, "grad_norm": 0.0049758777022361755, "learning_rate": 1.6236137724237473e-06, "loss": 0.0, "num_input_tokens_seen": 179174624, "step": 83085 }, { "epoch": 15.248669480638648, "grad_norm": 0.00011426717537688091, "learning_rate": 1.6230232064352336e-06, "loss": 0.0, "num_input_tokens_seen": 179184960, "step": 83090 }, { "epoch": 15.249587080198202, "grad_norm": 0.0001252345828106627, "learning_rate": 1.6224327270609408e-06, "loss": 0.0, "num_input_tokens_seen": 179196224, "step": 83095 }, { "epoch": 15.250504679757753, "grad_norm": 0.00042276878957636654, "learning_rate": 1.621842334316014e-06, "loss": 0.0, "num_input_tokens_seen": 179207456, "step": 83100 }, { "epoch": 15.251422279317305, "grad_norm": 0.004153572954237461, "learning_rate": 1.6212520282155935e-06, "loss": 0.0, "num_input_tokens_seen": 179218880, "step": 83105 }, { "epoch": 15.252339878876858, "grad_norm": 0.00011279655154794455, "learning_rate": 1.6206618087748238e-06, "loss": 0.0, "num_input_tokens_seen": 179230624, "step": 83110 }, { "epoch": 15.25325747843641, "grad_norm": 0.0001354334526695311, "learning_rate": 1.6200716760088415e-06, "loss": 0.0001, "num_input_tokens_seen": 179242048, "step": 83115 }, { "epoch": 15.254175077995962, "grad_norm": 0.00031130274874158204, "learning_rate": 1.619481629932781e-06, "loss": 0.0, "num_input_tokens_seen": 179253344, "step": 83120 }, { "epoch": 15.255092677555515, "grad_norm": 0.0002139762946171686, "learning_rate": 1.618891670561778e-06, "loss": 0.0, "num_input_tokens_seen": 179264320, "step": 83125 }, { "epoch": 15.256010277115067, "grad_norm": 0.00045249791583046317, "learning_rate": 1.618301797910964e-06, "loss": 0.0, "num_input_tokens_seen": 179274752, "step": 83130 }, { "epoch": 15.256927876674618, "grad_norm": 0.00027791591128334403, "learning_rate": 1.617712011995466e-06, "loss": 0.0036, "num_input_tokens_seen": 179285024, "step": 83135 }, { "epoch": 15.257845476234172, "grad_norm": 0.00011679372255457565, "learning_rate": 1.6171223128304148e-06, "loss": 0.0, "num_input_tokens_seen": 179295904, "step": 83140 }, { "epoch": 15.258763075793723, "grad_norm": 0.00012447063636500388, "learning_rate": 1.6165327004309328e-06, "loss": 0.0, "num_input_tokens_seen": 179305920, "step": 83145 }, { "epoch": 15.259680675353275, "grad_norm": 0.00010603982809698209, "learning_rate": 1.615943174812143e-06, "loss": 0.0, "num_input_tokens_seen": 179317344, "step": 83150 }, { "epoch": 15.260598274912828, "grad_norm": 0.0003908135986421257, "learning_rate": 1.6153537359891647e-06, "loss": 0.0, "num_input_tokens_seen": 179328864, "step": 83155 }, { "epoch": 15.26151587447238, "grad_norm": 2.1131699085235596, "learning_rate": 1.6147643839771188e-06, "loss": 0.001, "num_input_tokens_seen": 179340032, "step": 83160 }, { "epoch": 15.262433474031932, "grad_norm": 0.00015453649393748492, "learning_rate": 1.6141751187911198e-06, "loss": 0.0, "num_input_tokens_seen": 179351168, "step": 83165 }, { "epoch": 15.263351073591485, "grad_norm": 8.303357026306912e-05, "learning_rate": 1.61358594044628e-06, "loss": 0.0, "num_input_tokens_seen": 179361504, "step": 83170 }, { "epoch": 15.264268673151037, "grad_norm": 0.00012111181422369555, "learning_rate": 1.6129968489577142e-06, "loss": 0.0, "num_input_tokens_seen": 179371104, "step": 83175 }, { "epoch": 15.265186272710588, "grad_norm": 0.00015191920101642609, "learning_rate": 1.6124078443405294e-06, "loss": 0.04, "num_input_tokens_seen": 179378880, "step": 83180 }, { "epoch": 15.266103872270142, "grad_norm": 0.00010260360431857407, "learning_rate": 1.6118189266098315e-06, "loss": 0.0, "num_input_tokens_seen": 179389856, "step": 83185 }, { "epoch": 15.267021471829693, "grad_norm": 0.0006936907884664834, "learning_rate": 1.6112300957807286e-06, "loss": 0.0, "num_input_tokens_seen": 179399136, "step": 83190 }, { "epoch": 15.267939071389245, "grad_norm": 0.0001299103460041806, "learning_rate": 1.6106413518683217e-06, "loss": 0.0, "num_input_tokens_seen": 179408992, "step": 83195 }, { "epoch": 15.268856670948798, "grad_norm": 0.003338555572554469, "learning_rate": 1.6100526948877115e-06, "loss": 0.0, "num_input_tokens_seen": 179421696, "step": 83200 }, { "epoch": 15.26977427050835, "grad_norm": 0.00010874019062612206, "learning_rate": 1.6094641248539933e-06, "loss": 0.0, "num_input_tokens_seen": 179433472, "step": 83205 }, { "epoch": 15.270691870067902, "grad_norm": 0.003722158959135413, "learning_rate": 1.6088756417822675e-06, "loss": 0.0, "num_input_tokens_seen": 179444960, "step": 83210 }, { "epoch": 15.271609469627455, "grad_norm": 0.0013427316443994641, "learning_rate": 1.608287245687626e-06, "loss": 0.0, "num_input_tokens_seen": 179455808, "step": 83215 }, { "epoch": 15.272527069187007, "grad_norm": 0.0001625212753424421, "learning_rate": 1.6076989365851581e-06, "loss": 0.0, "num_input_tokens_seen": 179465632, "step": 83220 }, { "epoch": 15.273444668746558, "grad_norm": 0.0001299794384976849, "learning_rate": 1.6071107144899562e-06, "loss": 0.0, "num_input_tokens_seen": 179476576, "step": 83225 }, { "epoch": 15.274362268306112, "grad_norm": 0.0006294010090641677, "learning_rate": 1.6065225794171064e-06, "loss": 0.0, "num_input_tokens_seen": 179486912, "step": 83230 }, { "epoch": 15.275279867865663, "grad_norm": 9.960011811926961e-05, "learning_rate": 1.6059345313816927e-06, "loss": 0.0, "num_input_tokens_seen": 179498144, "step": 83235 }, { "epoch": 15.276197467425215, "grad_norm": 0.00011212965182494372, "learning_rate": 1.6053465703987963e-06, "loss": 0.0, "num_input_tokens_seen": 179507392, "step": 83240 }, { "epoch": 15.277115066984768, "grad_norm": 0.0098844263702631, "learning_rate": 1.604758696483501e-06, "loss": 0.0, "num_input_tokens_seen": 179517728, "step": 83245 }, { "epoch": 15.27803266654432, "grad_norm": 0.17318564653396606, "learning_rate": 1.6041709096508828e-06, "loss": 0.0001, "num_input_tokens_seen": 179528928, "step": 83250 }, { "epoch": 15.278950266103871, "grad_norm": 9.371629857923836e-05, "learning_rate": 1.6035832099160165e-06, "loss": 0.0, "num_input_tokens_seen": 179540608, "step": 83255 }, { "epoch": 15.279867865663425, "grad_norm": 7.192014891188592e-05, "learning_rate": 1.6029955972939782e-06, "loss": 0.0, "num_input_tokens_seen": 179550496, "step": 83260 }, { "epoch": 15.280785465222976, "grad_norm": 0.0001336645509582013, "learning_rate": 1.602408071799838e-06, "loss": 0.0, "num_input_tokens_seen": 179561120, "step": 83265 }, { "epoch": 15.281703064782528, "grad_norm": 0.00010166957508772612, "learning_rate": 1.6018206334486647e-06, "loss": 0.0, "num_input_tokens_seen": 179571648, "step": 83270 }, { "epoch": 15.282620664342081, "grad_norm": 0.00014793852460570633, "learning_rate": 1.601233282255526e-06, "loss": 0.0, "num_input_tokens_seen": 179581888, "step": 83275 }, { "epoch": 15.283538263901633, "grad_norm": 0.022081132978200912, "learning_rate": 1.6006460182354839e-06, "loss": 0.0, "num_input_tokens_seen": 179592384, "step": 83280 }, { "epoch": 15.284455863461186, "grad_norm": 0.00019087453256361187, "learning_rate": 1.600058841403605e-06, "loss": 0.0, "num_input_tokens_seen": 179603680, "step": 83285 }, { "epoch": 15.285373463020738, "grad_norm": 0.0038026024121791124, "learning_rate": 1.5994717517749469e-06, "loss": 0.0, "num_input_tokens_seen": 179613184, "step": 83290 }, { "epoch": 15.28629106258029, "grad_norm": 9.222347580362111e-05, "learning_rate": 1.5988847493645682e-06, "loss": 0.0, "num_input_tokens_seen": 179623552, "step": 83295 }, { "epoch": 15.287208662139843, "grad_norm": 7.243709114845842e-05, "learning_rate": 1.5982978341875244e-06, "loss": 0.0, "num_input_tokens_seen": 179634336, "step": 83300 }, { "epoch": 15.288126261699395, "grad_norm": 0.0015711103333160281, "learning_rate": 1.5977110062588675e-06, "loss": 0.0, "num_input_tokens_seen": 179645408, "step": 83305 }, { "epoch": 15.289043861258946, "grad_norm": 0.00017374652088619769, "learning_rate": 1.5971242655936519e-06, "loss": 0.0, "num_input_tokens_seen": 179656896, "step": 83310 }, { "epoch": 15.2899614608185, "grad_norm": 0.00012367291492410004, "learning_rate": 1.5965376122069248e-06, "loss": 0.0, "num_input_tokens_seen": 179667040, "step": 83315 }, { "epoch": 15.290879060378051, "grad_norm": 0.0003718295774888247, "learning_rate": 1.5959510461137312e-06, "loss": 0.0, "num_input_tokens_seen": 179677344, "step": 83320 }, { "epoch": 15.291796659937603, "grad_norm": 9.321350808022544e-05, "learning_rate": 1.595364567329119e-06, "loss": 0.0, "num_input_tokens_seen": 179687648, "step": 83325 }, { "epoch": 15.292714259497156, "grad_norm": 0.00012775007053278387, "learning_rate": 1.5947781758681297e-06, "loss": 0.0, "num_input_tokens_seen": 179698816, "step": 83330 }, { "epoch": 15.293631859056708, "grad_norm": 0.0015231342986226082, "learning_rate": 1.594191871745802e-06, "loss": 0.0002, "num_input_tokens_seen": 179709920, "step": 83335 }, { "epoch": 15.29454945861626, "grad_norm": 0.00011639206786639988, "learning_rate": 1.5936056549771728e-06, "loss": 0.0008, "num_input_tokens_seen": 179722016, "step": 83340 }, { "epoch": 15.295467058175813, "grad_norm": 8.711515692993999e-05, "learning_rate": 1.5930195255772807e-06, "loss": 0.0, "num_input_tokens_seen": 179731776, "step": 83345 }, { "epoch": 15.296384657735365, "grad_norm": 0.00016313944070134312, "learning_rate": 1.5924334835611572e-06, "loss": 0.0, "num_input_tokens_seen": 179743232, "step": 83350 }, { "epoch": 15.297302257294916, "grad_norm": 0.0009389728656969965, "learning_rate": 1.5918475289438323e-06, "loss": 0.0, "num_input_tokens_seen": 179753856, "step": 83355 }, { "epoch": 15.29821985685447, "grad_norm": 0.001575704663991928, "learning_rate": 1.5912616617403376e-06, "loss": 0.0001, "num_input_tokens_seen": 179765440, "step": 83360 }, { "epoch": 15.299137456414021, "grad_norm": 0.0006697179051116109, "learning_rate": 1.5906758819656982e-06, "loss": 0.0, "num_input_tokens_seen": 179775520, "step": 83365 }, { "epoch": 15.300055055973573, "grad_norm": 0.0001882267533801496, "learning_rate": 1.5900901896349386e-06, "loss": 0.0, "num_input_tokens_seen": 179786336, "step": 83370 }, { "epoch": 15.300972655533126, "grad_norm": 0.00010603744885884225, "learning_rate": 1.5895045847630792e-06, "loss": 0.0, "num_input_tokens_seen": 179797568, "step": 83375 }, { "epoch": 15.301890255092678, "grad_norm": 0.00011703836207743734, "learning_rate": 1.5889190673651427e-06, "loss": 0.0, "num_input_tokens_seen": 179807744, "step": 83380 }, { "epoch": 15.30280785465223, "grad_norm": 0.00023351052368525416, "learning_rate": 1.5883336374561453e-06, "loss": 0.0, "num_input_tokens_seen": 179819328, "step": 83385 }, { "epoch": 15.303725454211783, "grad_norm": 0.0011146237375214696, "learning_rate": 1.5877482950511013e-06, "loss": 0.0, "num_input_tokens_seen": 179830624, "step": 83390 }, { "epoch": 15.304643053771334, "grad_norm": 0.00018586953228805214, "learning_rate": 1.5871630401650268e-06, "loss": 0.0, "num_input_tokens_seen": 179841504, "step": 83395 }, { "epoch": 15.305560653330886, "grad_norm": 0.0009837030665948987, "learning_rate": 1.5865778728129305e-06, "loss": 0.0, "num_input_tokens_seen": 179852800, "step": 83400 }, { "epoch": 15.30647825289044, "grad_norm": 0.00989289116114378, "learning_rate": 1.58599279300982e-06, "loss": 0.0, "num_input_tokens_seen": 179863392, "step": 83405 }, { "epoch": 15.307395852449991, "grad_norm": 0.00014523108256980777, "learning_rate": 1.5854078007707047e-06, "loss": 0.0, "num_input_tokens_seen": 179873760, "step": 83410 }, { "epoch": 15.308313452009543, "grad_norm": 0.00020865943224634975, "learning_rate": 1.5848228961105872e-06, "loss": 0.0, "num_input_tokens_seen": 179885280, "step": 83415 }, { "epoch": 15.309231051569096, "grad_norm": 0.0001257381372852251, "learning_rate": 1.584238079044469e-06, "loss": 0.0, "num_input_tokens_seen": 179896864, "step": 83420 }, { "epoch": 15.310148651128648, "grad_norm": 0.006197707261890173, "learning_rate": 1.583653349587349e-06, "loss": 0.0, "num_input_tokens_seen": 179908064, "step": 83425 }, { "epoch": 15.3110662506882, "grad_norm": 9.915123519022018e-05, "learning_rate": 1.5830687077542272e-06, "loss": 0.0, "num_input_tokens_seen": 179917600, "step": 83430 }, { "epoch": 15.311983850247753, "grad_norm": 0.00011039926903322339, "learning_rate": 1.582484153560097e-06, "loss": 0.0, "num_input_tokens_seen": 179928672, "step": 83435 }, { "epoch": 15.312901449807304, "grad_norm": 0.00012177666212664917, "learning_rate": 1.5818996870199505e-06, "loss": 0.0, "num_input_tokens_seen": 179940256, "step": 83440 }, { "epoch": 15.313819049366856, "grad_norm": 7.706289034103975e-05, "learning_rate": 1.581315308148781e-06, "loss": 0.0, "num_input_tokens_seen": 179951104, "step": 83445 }, { "epoch": 15.31473664892641, "grad_norm": 0.0006939743761904538, "learning_rate": 1.5807310169615747e-06, "loss": 0.0, "num_input_tokens_seen": 179962592, "step": 83450 }, { "epoch": 15.315654248485961, "grad_norm": 0.0010316043626517057, "learning_rate": 1.5801468134733171e-06, "loss": 0.0, "num_input_tokens_seen": 179973408, "step": 83455 }, { "epoch": 15.316571848045513, "grad_norm": 0.00019771237566601485, "learning_rate": 1.5795626976989953e-06, "loss": 0.0, "num_input_tokens_seen": 179984608, "step": 83460 }, { "epoch": 15.317489447605066, "grad_norm": 0.00010440347978146747, "learning_rate": 1.5789786696535891e-06, "loss": 0.0, "num_input_tokens_seen": 179995040, "step": 83465 }, { "epoch": 15.318407047164618, "grad_norm": 0.00013243699504528195, "learning_rate": 1.578394729352078e-06, "loss": 0.0, "num_input_tokens_seen": 180005984, "step": 83470 }, { "epoch": 15.31932464672417, "grad_norm": 7.709957571933046e-05, "learning_rate": 1.5778108768094374e-06, "loss": 0.0, "num_input_tokens_seen": 180016768, "step": 83475 }, { "epoch": 15.320242246283723, "grad_norm": 0.00080696283839643, "learning_rate": 1.577227112040645e-06, "loss": 0.0, "num_input_tokens_seen": 180028192, "step": 83480 }, { "epoch": 15.321159845843274, "grad_norm": 0.00010398236190667376, "learning_rate": 1.576643435060673e-06, "loss": 0.0, "num_input_tokens_seen": 180038048, "step": 83485 }, { "epoch": 15.322077445402826, "grad_norm": 9.816357487579808e-05, "learning_rate": 1.576059845884491e-06, "loss": 0.0, "num_input_tokens_seen": 180048736, "step": 83490 }, { "epoch": 15.32299504496238, "grad_norm": 0.0065566725097596645, "learning_rate": 1.5754763445270677e-06, "loss": 0.0, "num_input_tokens_seen": 180059488, "step": 83495 }, { "epoch": 15.323912644521931, "grad_norm": 0.00025369974900968373, "learning_rate": 1.5748929310033661e-06, "loss": 0.0, "num_input_tokens_seen": 180069792, "step": 83500 }, { "epoch": 15.324830244081483, "grad_norm": 7.269566413015127e-05, "learning_rate": 1.5743096053283546e-06, "loss": 0.0, "num_input_tokens_seen": 180081088, "step": 83505 }, { "epoch": 15.325747843641036, "grad_norm": 9.30754977161996e-05, "learning_rate": 1.5737263675169922e-06, "loss": 0.0001, "num_input_tokens_seen": 180091648, "step": 83510 }, { "epoch": 15.326665443200588, "grad_norm": 0.0005550348432734609, "learning_rate": 1.5731432175842386e-06, "loss": 0.0, "num_input_tokens_seen": 180103392, "step": 83515 }, { "epoch": 15.32758304276014, "grad_norm": 0.00016066469834186137, "learning_rate": 1.5725601555450498e-06, "loss": 0.0, "num_input_tokens_seen": 180114368, "step": 83520 }, { "epoch": 15.328500642319693, "grad_norm": 8.121019345708191e-05, "learning_rate": 1.5719771814143798e-06, "loss": 0.0, "num_input_tokens_seen": 180126528, "step": 83525 }, { "epoch": 15.329418241879244, "grad_norm": 0.00018942431779578328, "learning_rate": 1.5713942952071837e-06, "loss": 0.0, "num_input_tokens_seen": 180136768, "step": 83530 }, { "epoch": 15.330335841438796, "grad_norm": 7.497252954635769e-05, "learning_rate": 1.5708114969384096e-06, "loss": 0.0, "num_input_tokens_seen": 180147008, "step": 83535 }, { "epoch": 15.33125344099835, "grad_norm": 0.00011289621033938602, "learning_rate": 1.5702287866230048e-06, "loss": 0.0, "num_input_tokens_seen": 180158304, "step": 83540 }, { "epoch": 15.3321710405579, "grad_norm": 0.00015412071661558002, "learning_rate": 1.5696461642759169e-06, "loss": 0.0, "num_input_tokens_seen": 180169408, "step": 83545 }, { "epoch": 15.333088640117452, "grad_norm": 0.00012060043081874028, "learning_rate": 1.5690636299120893e-06, "loss": 0.0, "num_input_tokens_seen": 180179712, "step": 83550 }, { "epoch": 15.334006239677006, "grad_norm": 0.0005119280540384352, "learning_rate": 1.5684811835464613e-06, "loss": 0.0, "num_input_tokens_seen": 180190624, "step": 83555 }, { "epoch": 15.334923839236557, "grad_norm": 6.666808621957898e-05, "learning_rate": 1.5678988251939713e-06, "loss": 0.0, "num_input_tokens_seen": 180201920, "step": 83560 }, { "epoch": 15.335841438796109, "grad_norm": 0.00019054129370488226, "learning_rate": 1.5673165548695584e-06, "loss": 0.0, "num_input_tokens_seen": 180211776, "step": 83565 }, { "epoch": 15.336759038355662, "grad_norm": 9.38659577514045e-05, "learning_rate": 1.566734372588156e-06, "loss": 0.0, "num_input_tokens_seen": 180223360, "step": 83570 }, { "epoch": 15.337676637915214, "grad_norm": 0.00016860845789778978, "learning_rate": 1.5661522783646943e-06, "loss": 0.0, "num_input_tokens_seen": 180234400, "step": 83575 }, { "epoch": 15.338594237474766, "grad_norm": 9.9102922831662e-05, "learning_rate": 1.5655702722141065e-06, "loss": 0.0, "num_input_tokens_seen": 180243936, "step": 83580 }, { "epoch": 15.339511837034319, "grad_norm": 0.0001747687056194991, "learning_rate": 1.5649883541513177e-06, "loss": 0.0, "num_input_tokens_seen": 180254176, "step": 83585 }, { "epoch": 15.34042943659387, "grad_norm": 0.0004257922410033643, "learning_rate": 1.5644065241912526e-06, "loss": 0.0007, "num_input_tokens_seen": 180265504, "step": 83590 }, { "epoch": 15.341347036153422, "grad_norm": 9.381621202919632e-05, "learning_rate": 1.5638247823488373e-06, "loss": 0.0, "num_input_tokens_seen": 180275552, "step": 83595 }, { "epoch": 15.342264635712976, "grad_norm": 0.0017470401944592595, "learning_rate": 1.5632431286389905e-06, "loss": 0.0, "num_input_tokens_seen": 180286848, "step": 83600 }, { "epoch": 15.343182235272527, "grad_norm": 9.783604764379561e-05, "learning_rate": 1.5626615630766312e-06, "loss": 0.0, "num_input_tokens_seen": 180298048, "step": 83605 }, { "epoch": 15.344099834832079, "grad_norm": 0.00010238222603220493, "learning_rate": 1.5620800856766731e-06, "loss": 0.0, "num_input_tokens_seen": 180309152, "step": 83610 }, { "epoch": 15.345017434391632, "grad_norm": 7.549825386377051e-05, "learning_rate": 1.5614986964540346e-06, "loss": 0.0, "num_input_tokens_seen": 180321120, "step": 83615 }, { "epoch": 15.345935033951184, "grad_norm": 0.00010978222417179495, "learning_rate": 1.5609173954236256e-06, "loss": 0.0, "num_input_tokens_seen": 180332032, "step": 83620 }, { "epoch": 15.346852633510736, "grad_norm": 0.00010310488869436085, "learning_rate": 1.5603361826003533e-06, "loss": 0.0, "num_input_tokens_seen": 180343648, "step": 83625 }, { "epoch": 15.347770233070289, "grad_norm": 0.00043872007518075407, "learning_rate": 1.559755057999129e-06, "loss": 0.0, "num_input_tokens_seen": 180354624, "step": 83630 }, { "epoch": 15.34868783262984, "grad_norm": 0.0002668691740836948, "learning_rate": 1.559174021634855e-06, "loss": 0.0, "num_input_tokens_seen": 180365632, "step": 83635 }, { "epoch": 15.349605432189392, "grad_norm": 0.00018229999113827944, "learning_rate": 1.5585930735224332e-06, "loss": 0.0, "num_input_tokens_seen": 180376704, "step": 83640 }, { "epoch": 15.350523031748946, "grad_norm": 0.00010461173224030063, "learning_rate": 1.5580122136767667e-06, "loss": 0.0, "num_input_tokens_seen": 180387808, "step": 83645 }, { "epoch": 15.351440631308497, "grad_norm": 7.22532786312513e-05, "learning_rate": 1.5574314421127528e-06, "loss": 0.0, "num_input_tokens_seen": 180397344, "step": 83650 }, { "epoch": 15.352358230868049, "grad_norm": 7.41788899176754e-05, "learning_rate": 1.5568507588452863e-06, "loss": 0.0, "num_input_tokens_seen": 180407648, "step": 83655 }, { "epoch": 15.353275830427602, "grad_norm": 0.00011830051516881213, "learning_rate": 1.5562701638892608e-06, "loss": 0.0, "num_input_tokens_seen": 180418080, "step": 83660 }, { "epoch": 15.354193429987154, "grad_norm": 0.00012245327525306493, "learning_rate": 1.5556896572595693e-06, "loss": 0.0, "num_input_tokens_seen": 180428352, "step": 83665 }, { "epoch": 15.355111029546705, "grad_norm": 0.00010358133295085281, "learning_rate": 1.5551092389710998e-06, "loss": 0.0, "num_input_tokens_seen": 180441088, "step": 83670 }, { "epoch": 15.356028629106259, "grad_norm": 6.753111665602773e-05, "learning_rate": 1.5545289090387378e-06, "loss": 0.0001, "num_input_tokens_seen": 180453184, "step": 83675 }, { "epoch": 15.35694622866581, "grad_norm": 9.043722093338147e-05, "learning_rate": 1.5539486674773707e-06, "loss": 0.0, "num_input_tokens_seen": 180464896, "step": 83680 }, { "epoch": 15.357863828225362, "grad_norm": 0.0005666384822688997, "learning_rate": 1.5533685143018795e-06, "loss": 0.0, "num_input_tokens_seen": 180474464, "step": 83685 }, { "epoch": 15.358781427784916, "grad_norm": 0.0003137846360914409, "learning_rate": 1.5527884495271439e-06, "loss": 0.0, "num_input_tokens_seen": 180484032, "step": 83690 }, { "epoch": 15.359699027344467, "grad_norm": 0.0013269903138279915, "learning_rate": 1.5522084731680404e-06, "loss": 0.0, "num_input_tokens_seen": 180494272, "step": 83695 }, { "epoch": 15.360616626904019, "grad_norm": 0.00010522675438551232, "learning_rate": 1.551628585239448e-06, "loss": 0.0, "num_input_tokens_seen": 180505216, "step": 83700 }, { "epoch": 15.361534226463572, "grad_norm": 0.00010299598216079175, "learning_rate": 1.5510487857562373e-06, "loss": 0.0, "num_input_tokens_seen": 180515200, "step": 83705 }, { "epoch": 15.362451826023124, "grad_norm": 0.00017944784485735, "learning_rate": 1.55046907473328e-06, "loss": 0.0, "num_input_tokens_seen": 180526144, "step": 83710 }, { "epoch": 15.363369425582675, "grad_norm": 6.647376721957698e-05, "learning_rate": 1.5498894521854452e-06, "loss": 0.0, "num_input_tokens_seen": 180536832, "step": 83715 }, { "epoch": 15.364287025142229, "grad_norm": 0.00014447123976424336, "learning_rate": 1.5493099181275978e-06, "loss": 0.0001, "num_input_tokens_seen": 180546944, "step": 83720 }, { "epoch": 15.36520462470178, "grad_norm": 0.0007117025670595467, "learning_rate": 1.5487304725746023e-06, "loss": 0.0001, "num_input_tokens_seen": 180557376, "step": 83725 }, { "epoch": 15.366122224261332, "grad_norm": 0.0002200965682277456, "learning_rate": 1.548151115541322e-06, "loss": 0.0, "num_input_tokens_seen": 180568544, "step": 83730 }, { "epoch": 15.367039823820885, "grad_norm": 0.00015011186769697815, "learning_rate": 1.547571847042616e-06, "loss": 0.0, "num_input_tokens_seen": 180577664, "step": 83735 }, { "epoch": 15.367957423380437, "grad_norm": 0.0027125650085508823, "learning_rate": 1.5469926670933417e-06, "loss": 0.0, "num_input_tokens_seen": 180588000, "step": 83740 }, { "epoch": 15.368875022939989, "grad_norm": 0.00020747519738506526, "learning_rate": 1.5464135757083516e-06, "loss": 0.0, "num_input_tokens_seen": 180598528, "step": 83745 }, { "epoch": 15.369792622499542, "grad_norm": 0.00010215811926173046, "learning_rate": 1.5458345729025025e-06, "loss": 0.0, "num_input_tokens_seen": 180607968, "step": 83750 }, { "epoch": 15.370710222059094, "grad_norm": 0.0003759480605367571, "learning_rate": 1.5452556586906437e-06, "loss": 0.0, "num_input_tokens_seen": 180618208, "step": 83755 }, { "epoch": 15.371627821618645, "grad_norm": 8.795042231213301e-05, "learning_rate": 1.5446768330876204e-06, "loss": 0.0001, "num_input_tokens_seen": 180630048, "step": 83760 }, { "epoch": 15.372545421178199, "grad_norm": 6.650968862231821e-05, "learning_rate": 1.5440980961082835e-06, "loss": 0.0, "num_input_tokens_seen": 180641440, "step": 83765 }, { "epoch": 15.37346302073775, "grad_norm": 0.00013791534001939, "learning_rate": 1.5435194477674737e-06, "loss": 0.0, "num_input_tokens_seen": 180652128, "step": 83770 }, { "epoch": 15.374380620297302, "grad_norm": 0.006420623045414686, "learning_rate": 1.5429408880800317e-06, "loss": 0.0, "num_input_tokens_seen": 180662720, "step": 83775 }, { "epoch": 15.375298219856855, "grad_norm": 8.285429066745564e-05, "learning_rate": 1.5423624170607992e-06, "loss": 0.0, "num_input_tokens_seen": 180673184, "step": 83780 }, { "epoch": 15.376215819416407, "grad_norm": 0.00041820845217444, "learning_rate": 1.5417840347246122e-06, "loss": 0.0, "num_input_tokens_seen": 180683840, "step": 83785 }, { "epoch": 15.377133418975959, "grad_norm": 9.271211456507444e-05, "learning_rate": 1.5412057410863045e-06, "loss": 0.0, "num_input_tokens_seen": 180694624, "step": 83790 }, { "epoch": 15.378051018535512, "grad_norm": 0.00011613177048275247, "learning_rate": 1.540627536160708e-06, "loss": 0.0, "num_input_tokens_seen": 180705312, "step": 83795 }, { "epoch": 15.378968618095064, "grad_norm": 0.00015724515833426267, "learning_rate": 1.5400494199626547e-06, "loss": 0.0, "num_input_tokens_seen": 180715904, "step": 83800 }, { "epoch": 15.379886217654615, "grad_norm": 8.792685548542067e-05, "learning_rate": 1.5394713925069715e-06, "loss": 0.0, "num_input_tokens_seen": 180727968, "step": 83805 }, { "epoch": 15.380803817214169, "grad_norm": 0.0006572980200871825, "learning_rate": 1.538893453808482e-06, "loss": 0.0, "num_input_tokens_seen": 180738400, "step": 83810 }, { "epoch": 15.38172141677372, "grad_norm": 5.979775232844986e-05, "learning_rate": 1.5383156038820134e-06, "loss": 0.0, "num_input_tokens_seen": 180748896, "step": 83815 }, { "epoch": 15.382639016333272, "grad_norm": 0.00027278103516437113, "learning_rate": 1.5377378427423839e-06, "loss": 0.0, "num_input_tokens_seen": 180760320, "step": 83820 }, { "epoch": 15.383556615892825, "grad_norm": 8.925815927796066e-05, "learning_rate": 1.5371601704044125e-06, "loss": 0.0, "num_input_tokens_seen": 180769920, "step": 83825 }, { "epoch": 15.384474215452377, "grad_norm": 0.0001780132733983919, "learning_rate": 1.536582586882915e-06, "loss": 0.0, "num_input_tokens_seen": 180780928, "step": 83830 }, { "epoch": 15.385391815011928, "grad_norm": 0.00017126491002272815, "learning_rate": 1.5360050921927072e-06, "loss": 0.0, "num_input_tokens_seen": 180792096, "step": 83835 }, { "epoch": 15.386309414571482, "grad_norm": 9.224805398844182e-05, "learning_rate": 1.5354276863486006e-06, "loss": 0.0, "num_input_tokens_seen": 180803840, "step": 83840 }, { "epoch": 15.387227014131033, "grad_norm": 0.00014680810272693634, "learning_rate": 1.5348503693654021e-06, "loss": 0.0, "num_input_tokens_seen": 180814912, "step": 83845 }, { "epoch": 15.388144613690585, "grad_norm": 7.210196054074913e-05, "learning_rate": 1.5342731412579232e-06, "loss": 0.0, "num_input_tokens_seen": 180827136, "step": 83850 }, { "epoch": 15.389062213250138, "grad_norm": 0.0014026141725480556, "learning_rate": 1.5336960020409665e-06, "loss": 0.0, "num_input_tokens_seen": 180839456, "step": 83855 }, { "epoch": 15.38997981280969, "grad_norm": 5.0998070946661755e-05, "learning_rate": 1.5331189517293337e-06, "loss": 0.0, "num_input_tokens_seen": 180850688, "step": 83860 }, { "epoch": 15.390897412369242, "grad_norm": 9.041285375133157e-05, "learning_rate": 1.532541990337828e-06, "loss": 0.0, "num_input_tokens_seen": 180861312, "step": 83865 }, { "epoch": 15.391815011928795, "grad_norm": 0.00011292075942037627, "learning_rate": 1.5319651178812462e-06, "loss": 0.0001, "num_input_tokens_seen": 180872384, "step": 83870 }, { "epoch": 15.392732611488347, "grad_norm": 0.0032409096602350473, "learning_rate": 1.5313883343743846e-06, "loss": 0.0, "num_input_tokens_seen": 180882560, "step": 83875 }, { "epoch": 15.393650211047898, "grad_norm": 9.67539381235838e-05, "learning_rate": 1.5308116398320343e-06, "loss": 0.0, "num_input_tokens_seen": 180892736, "step": 83880 }, { "epoch": 15.394567810607452, "grad_norm": 0.0001718745188554749, "learning_rate": 1.5302350342689904e-06, "loss": 0.0, "num_input_tokens_seen": 180904320, "step": 83885 }, { "epoch": 15.395485410167003, "grad_norm": 0.0001587914739502594, "learning_rate": 1.52965851770004e-06, "loss": 0.0, "num_input_tokens_seen": 180914720, "step": 83890 }, { "epoch": 15.396403009726555, "grad_norm": 0.00019619971862994134, "learning_rate": 1.529082090139969e-06, "loss": 0.0, "num_input_tokens_seen": 180926400, "step": 83895 }, { "epoch": 15.397320609286108, "grad_norm": 0.0012857280671596527, "learning_rate": 1.528505751603564e-06, "loss": 0.0, "num_input_tokens_seen": 180937792, "step": 83900 }, { "epoch": 15.39823820884566, "grad_norm": 0.00017772859428077936, "learning_rate": 1.5279295021056067e-06, "loss": 0.0, "num_input_tokens_seen": 180948896, "step": 83905 }, { "epoch": 15.399155808405212, "grad_norm": 9.366431186208501e-05, "learning_rate": 1.527353341660876e-06, "loss": 0.0, "num_input_tokens_seen": 180958432, "step": 83910 }, { "epoch": 15.400073407964765, "grad_norm": 0.00019469198014121503, "learning_rate": 1.52677727028415e-06, "loss": 0.0, "num_input_tokens_seen": 180968672, "step": 83915 }, { "epoch": 15.400991007524317, "grad_norm": 0.0008977903635241091, "learning_rate": 1.5262012879902027e-06, "loss": 0.0, "num_input_tokens_seen": 180980000, "step": 83920 }, { "epoch": 15.401908607083868, "grad_norm": 0.00014392459706868976, "learning_rate": 1.52562539479381e-06, "loss": 0.0, "num_input_tokens_seen": 180990944, "step": 83925 }, { "epoch": 15.402826206643422, "grad_norm": 0.00016363567556254566, "learning_rate": 1.5250495907097407e-06, "loss": 0.0, "num_input_tokens_seen": 181001632, "step": 83930 }, { "epoch": 15.403743806202973, "grad_norm": 0.0003467621572781354, "learning_rate": 1.5244738757527645e-06, "loss": 0.0, "num_input_tokens_seen": 181012800, "step": 83935 }, { "epoch": 15.404661405762525, "grad_norm": 7.914158049970865e-05, "learning_rate": 1.5238982499376458e-06, "loss": 0.0, "num_input_tokens_seen": 181023488, "step": 83940 }, { "epoch": 15.405579005322078, "grad_norm": 8.914653881220147e-05, "learning_rate": 1.523322713279149e-06, "loss": 0.0, "num_input_tokens_seen": 181034656, "step": 83945 }, { "epoch": 15.40649660488163, "grad_norm": 0.00017070448666345328, "learning_rate": 1.5227472657920373e-06, "loss": 0.0, "num_input_tokens_seen": 181045664, "step": 83950 }, { "epoch": 15.407414204441181, "grad_norm": 0.00015339204401243478, "learning_rate": 1.5221719074910691e-06, "loss": 0.0, "num_input_tokens_seen": 181055104, "step": 83955 }, { "epoch": 15.408331804000735, "grad_norm": 0.000171509018400684, "learning_rate": 1.5215966383910008e-06, "loss": 0.0, "num_input_tokens_seen": 181065152, "step": 83960 }, { "epoch": 15.409249403560286, "grad_norm": 0.00010591529280645773, "learning_rate": 1.521021458506587e-06, "loss": 0.0, "num_input_tokens_seen": 181075776, "step": 83965 }, { "epoch": 15.410167003119838, "grad_norm": 7.619654206791893e-05, "learning_rate": 1.5204463678525817e-06, "loss": 0.0, "num_input_tokens_seen": 181087936, "step": 83970 }, { "epoch": 15.411084602679392, "grad_norm": 0.0007313074311241508, "learning_rate": 1.5198713664437342e-06, "loss": 0.0, "num_input_tokens_seen": 181100800, "step": 83975 }, { "epoch": 15.412002202238943, "grad_norm": 0.0006365276640281081, "learning_rate": 1.5192964542947912e-06, "loss": 0.0, "num_input_tokens_seen": 181111200, "step": 83980 }, { "epoch": 15.412919801798495, "grad_norm": 0.0001419558102497831, "learning_rate": 1.518721631420501e-06, "loss": 0.0, "num_input_tokens_seen": 181120320, "step": 83985 }, { "epoch": 15.413837401358048, "grad_norm": 0.024037368595600128, "learning_rate": 1.5181468978356057e-06, "loss": 0.0, "num_input_tokens_seen": 181131104, "step": 83990 }, { "epoch": 15.4147550009176, "grad_norm": 0.00014113263750914484, "learning_rate": 1.5175722535548442e-06, "loss": 0.0, "num_input_tokens_seen": 181141888, "step": 83995 }, { "epoch": 15.415672600477151, "grad_norm": 0.030951527878642082, "learning_rate": 1.516997698592959e-06, "loss": 0.0, "num_input_tokens_seen": 181152608, "step": 84000 }, { "epoch": 15.416590200036705, "grad_norm": 8.21602443465963e-05, "learning_rate": 1.516423232964685e-06, "loss": 0.0003, "num_input_tokens_seen": 181162816, "step": 84005 }, { "epoch": 15.417507799596256, "grad_norm": 0.025622639805078506, "learning_rate": 1.5158488566847551e-06, "loss": 0.0, "num_input_tokens_seen": 181173152, "step": 84010 }, { "epoch": 15.418425399155808, "grad_norm": 5.767256880062632e-05, "learning_rate": 1.5152745697679011e-06, "loss": 0.0, "num_input_tokens_seen": 181184160, "step": 84015 }, { "epoch": 15.419342998715361, "grad_norm": 8.275567961391062e-05, "learning_rate": 1.5147003722288551e-06, "loss": 0.0, "num_input_tokens_seen": 181194944, "step": 84020 }, { "epoch": 15.420260598274913, "grad_norm": 0.00010272716463077813, "learning_rate": 1.5141262640823428e-06, "loss": 0.0, "num_input_tokens_seen": 181204864, "step": 84025 }, { "epoch": 15.421178197834465, "grad_norm": 6.932879477972165e-05, "learning_rate": 1.5135522453430874e-06, "loss": 0.0007, "num_input_tokens_seen": 181214912, "step": 84030 }, { "epoch": 15.422095797394018, "grad_norm": 0.00019340003200341016, "learning_rate": 1.5129783160258149e-06, "loss": 0.0, "num_input_tokens_seen": 181225632, "step": 84035 }, { "epoch": 15.42301339695357, "grad_norm": 0.00014033389743417501, "learning_rate": 1.5124044761452444e-06, "loss": 0.0, "num_input_tokens_seen": 181235456, "step": 84040 }, { "epoch": 15.423930996513121, "grad_norm": 6.668224523309618e-05, "learning_rate": 1.5118307257160925e-06, "loss": 0.0, "num_input_tokens_seen": 181246688, "step": 84045 }, { "epoch": 15.424848596072675, "grad_norm": 0.00011388105485821143, "learning_rate": 1.5112570647530779e-06, "loss": 0.0, "num_input_tokens_seen": 181257152, "step": 84050 }, { "epoch": 15.425766195632226, "grad_norm": 0.00037874054396525025, "learning_rate": 1.510683493270912e-06, "loss": 0.0, "num_input_tokens_seen": 181267776, "step": 84055 }, { "epoch": 15.426683795191778, "grad_norm": 0.00010939018102362752, "learning_rate": 1.510110011284307e-06, "loss": 0.0, "num_input_tokens_seen": 181278688, "step": 84060 }, { "epoch": 15.427601394751331, "grad_norm": 0.0001793185219867155, "learning_rate": 1.5095366188079697e-06, "loss": 0.0001, "num_input_tokens_seen": 181289120, "step": 84065 }, { "epoch": 15.428518994310883, "grad_norm": 9.547477384330705e-05, "learning_rate": 1.5089633158566098e-06, "loss": 0.0, "num_input_tokens_seen": 181299168, "step": 84070 }, { "epoch": 15.429436593870435, "grad_norm": 6.225502147572115e-05, "learning_rate": 1.5083901024449298e-06, "loss": 0.002, "num_input_tokens_seen": 181308864, "step": 84075 }, { "epoch": 15.430354193429988, "grad_norm": 0.00013136494089849293, "learning_rate": 1.5078169785876312e-06, "loss": 0.0, "num_input_tokens_seen": 181319648, "step": 84080 }, { "epoch": 15.43127179298954, "grad_norm": 0.00031366580515168607, "learning_rate": 1.5072439442994163e-06, "loss": 0.0, "num_input_tokens_seen": 181330592, "step": 84085 }, { "epoch": 15.432189392549091, "grad_norm": 6.0158745327498764e-05, "learning_rate": 1.5066709995949808e-06, "loss": 0.0, "num_input_tokens_seen": 181340256, "step": 84090 }, { "epoch": 15.433106992108645, "grad_norm": 0.00012602969945874065, "learning_rate": 1.5060981444890187e-06, "loss": 0.0, "num_input_tokens_seen": 181349504, "step": 84095 }, { "epoch": 15.434024591668196, "grad_norm": 0.00014591991202905774, "learning_rate": 1.5055253789962255e-06, "loss": 0.0, "num_input_tokens_seen": 181359712, "step": 84100 }, { "epoch": 15.434942191227748, "grad_norm": 9.725216659717262e-05, "learning_rate": 1.5049527031312906e-06, "loss": 0.0792, "num_input_tokens_seen": 181369440, "step": 84105 }, { "epoch": 15.435859790787301, "grad_norm": 0.00018749955052044243, "learning_rate": 1.5043801169089017e-06, "loss": 0.0, "num_input_tokens_seen": 181380192, "step": 84110 }, { "epoch": 15.436777390346853, "grad_norm": 0.0013073354493826628, "learning_rate": 1.5038076203437436e-06, "loss": 0.0, "num_input_tokens_seen": 181389888, "step": 84115 }, { "epoch": 15.437694989906404, "grad_norm": 8.784062083577737e-05, "learning_rate": 1.503235213450503e-06, "loss": 0.0, "num_input_tokens_seen": 181400544, "step": 84120 }, { "epoch": 15.438612589465958, "grad_norm": 0.00013558405044022948, "learning_rate": 1.5026628962438594e-06, "loss": 0.0, "num_input_tokens_seen": 181409248, "step": 84125 }, { "epoch": 15.43953018902551, "grad_norm": 0.0009897764539346099, "learning_rate": 1.5020906687384923e-06, "loss": 0.0, "num_input_tokens_seen": 181419616, "step": 84130 }, { "epoch": 15.440447788585061, "grad_norm": 0.0003967154480051249, "learning_rate": 1.5015185309490786e-06, "loss": 0.0, "num_input_tokens_seen": 181430208, "step": 84135 }, { "epoch": 15.441365388144614, "grad_norm": 0.009840310551226139, "learning_rate": 1.5009464828902902e-06, "loss": 0.0, "num_input_tokens_seen": 181440576, "step": 84140 }, { "epoch": 15.442282987704166, "grad_norm": 0.0001747346977936104, "learning_rate": 1.5003745245768036e-06, "loss": 0.0, "num_input_tokens_seen": 181451488, "step": 84145 }, { "epoch": 15.443200587263718, "grad_norm": 0.00010306573676643893, "learning_rate": 1.4998026560232865e-06, "loss": 0.0, "num_input_tokens_seen": 181463136, "step": 84150 }, { "epoch": 15.444118186823271, "grad_norm": 6.913227116456255e-05, "learning_rate": 1.4992308772444063e-06, "loss": 0.0, "num_input_tokens_seen": 181472864, "step": 84155 }, { "epoch": 15.445035786382823, "grad_norm": 6.116065924288705e-05, "learning_rate": 1.4986591882548285e-06, "loss": 0.0, "num_input_tokens_seen": 181484000, "step": 84160 }, { "epoch": 15.445953385942374, "grad_norm": 8.34832972032018e-05, "learning_rate": 1.4980875890692143e-06, "loss": 0.0, "num_input_tokens_seen": 181495776, "step": 84165 }, { "epoch": 15.446870985501928, "grad_norm": 0.000536032544914633, "learning_rate": 1.4975160797022276e-06, "loss": 0.2125, "num_input_tokens_seen": 181506272, "step": 84170 }, { "epoch": 15.44778858506148, "grad_norm": 0.00028274740907363594, "learning_rate": 1.496944660168525e-06, "loss": 0.0, "num_input_tokens_seen": 181516960, "step": 84175 }, { "epoch": 15.448706184621031, "grad_norm": 0.00206316658295691, "learning_rate": 1.4963733304827616e-06, "loss": 0.0, "num_input_tokens_seen": 181528160, "step": 84180 }, { "epoch": 15.449623784180584, "grad_norm": 0.0009899601573124528, "learning_rate": 1.4958020906595933e-06, "loss": 0.0, "num_input_tokens_seen": 181538400, "step": 84185 }, { "epoch": 15.450541383740136, "grad_norm": 0.017540866509079933, "learning_rate": 1.49523094071367e-06, "loss": 0.0, "num_input_tokens_seen": 181548128, "step": 84190 }, { "epoch": 15.451458983299688, "grad_norm": 0.00022064796939957887, "learning_rate": 1.4946598806596413e-06, "loss": 0.0, "num_input_tokens_seen": 181559936, "step": 84195 }, { "epoch": 15.452376582859241, "grad_norm": 0.00020951313490513712, "learning_rate": 1.4940889105121526e-06, "loss": 0.0, "num_input_tokens_seen": 181570272, "step": 84200 }, { "epoch": 15.453294182418793, "grad_norm": 0.013645526021718979, "learning_rate": 1.493518030285851e-06, "loss": 0.0, "num_input_tokens_seen": 181581152, "step": 84205 }, { "epoch": 15.454211781978344, "grad_norm": 7.892485155025497e-05, "learning_rate": 1.4929472399953775e-06, "loss": 0.0, "num_input_tokens_seen": 181592608, "step": 84210 }, { "epoch": 15.455129381537898, "grad_norm": 0.00027713621966540813, "learning_rate": 1.4923765396553702e-06, "loss": 0.0, "num_input_tokens_seen": 181601152, "step": 84215 }, { "epoch": 15.45604698109745, "grad_norm": 0.0003743656852748245, "learning_rate": 1.4918059292804698e-06, "loss": 0.0, "num_input_tokens_seen": 181611072, "step": 84220 }, { "epoch": 15.456964580657, "grad_norm": 0.0010962262749671936, "learning_rate": 1.4912354088853103e-06, "loss": 0.0, "num_input_tokens_seen": 181621440, "step": 84225 }, { "epoch": 15.457882180216554, "grad_norm": 0.00016331909864675254, "learning_rate": 1.4906649784845234e-06, "loss": 0.0, "num_input_tokens_seen": 181631808, "step": 84230 }, { "epoch": 15.458799779776106, "grad_norm": 0.0023515953216701746, "learning_rate": 1.4900946380927416e-06, "loss": 0.005, "num_input_tokens_seen": 181643168, "step": 84235 }, { "epoch": 15.459717379335657, "grad_norm": 0.0002800026850309223, "learning_rate": 1.489524387724593e-06, "loss": 0.1238, "num_input_tokens_seen": 181653248, "step": 84240 }, { "epoch": 15.46063497889521, "grad_norm": 0.0015832420904189348, "learning_rate": 1.4889542273947027e-06, "loss": 0.0, "num_input_tokens_seen": 181664288, "step": 84245 }, { "epoch": 15.461552578454762, "grad_norm": 0.00012364660506136715, "learning_rate": 1.4883841571176931e-06, "loss": 0.0, "num_input_tokens_seen": 181674592, "step": 84250 }, { "epoch": 15.462470178014314, "grad_norm": 0.00026063431869260967, "learning_rate": 1.4878141769081895e-06, "loss": 0.0, "num_input_tokens_seen": 181685664, "step": 84255 }, { "epoch": 15.463387777573868, "grad_norm": 0.2641501724720001, "learning_rate": 1.4872442867808084e-06, "loss": 0.0, "num_input_tokens_seen": 181694976, "step": 84260 }, { "epoch": 15.46430537713342, "grad_norm": 0.00015742237155791372, "learning_rate": 1.486674486750166e-06, "loss": 0.0, "num_input_tokens_seen": 181706336, "step": 84265 }, { "epoch": 15.46522297669297, "grad_norm": 7.220506086014211e-05, "learning_rate": 1.486104776830879e-06, "loss": 0.0, "num_input_tokens_seen": 181716512, "step": 84270 }, { "epoch": 15.466140576252524, "grad_norm": 0.029356546700000763, "learning_rate": 1.4855351570375587e-06, "loss": 0.0, "num_input_tokens_seen": 181726592, "step": 84275 }, { "epoch": 15.467058175812076, "grad_norm": 0.0003255379560869187, "learning_rate": 1.4849656273848146e-06, "loss": 0.0, "num_input_tokens_seen": 181738144, "step": 84280 }, { "epoch": 15.467975775371627, "grad_norm": 0.00021990708773955703, "learning_rate": 1.4843961878872526e-06, "loss": 0.0, "num_input_tokens_seen": 181748608, "step": 84285 }, { "epoch": 15.46889337493118, "grad_norm": 0.00012527754006441683, "learning_rate": 1.4838268385594812e-06, "loss": 0.0, "num_input_tokens_seen": 181759232, "step": 84290 }, { "epoch": 15.469810974490732, "grad_norm": 0.00032220312277786434, "learning_rate": 1.4832575794161024e-06, "loss": 0.0, "num_input_tokens_seen": 181769344, "step": 84295 }, { "epoch": 15.470728574050284, "grad_norm": 0.0007256303215399384, "learning_rate": 1.4826884104717142e-06, "loss": 0.0, "num_input_tokens_seen": 181779744, "step": 84300 }, { "epoch": 15.471646173609837, "grad_norm": 0.0033484857995063066, "learning_rate": 1.4821193317409182e-06, "loss": 0.0, "num_input_tokens_seen": 181791936, "step": 84305 }, { "epoch": 15.472563773169389, "grad_norm": 0.026974771171808243, "learning_rate": 1.4815503432383099e-06, "loss": 0.0, "num_input_tokens_seen": 181803552, "step": 84310 }, { "epoch": 15.47348137272894, "grad_norm": 0.00022199300292413682, "learning_rate": 1.4809814449784803e-06, "loss": 0.0, "num_input_tokens_seen": 181813440, "step": 84315 }, { "epoch": 15.474398972288494, "grad_norm": 0.0002679301251191646, "learning_rate": 1.4804126369760241e-06, "loss": 0.0, "num_input_tokens_seen": 181823904, "step": 84320 }, { "epoch": 15.475316571848046, "grad_norm": 0.0005213047843426466, "learning_rate": 1.4798439192455288e-06, "loss": 0.0, "num_input_tokens_seen": 181833824, "step": 84325 }, { "epoch": 15.476234171407597, "grad_norm": 0.0005396651104092598, "learning_rate": 1.4792752918015812e-06, "loss": 0.0, "num_input_tokens_seen": 181844768, "step": 84330 }, { "epoch": 15.47715177096715, "grad_norm": 0.00016119005158543587, "learning_rate": 1.4787067546587647e-06, "loss": 0.0, "num_input_tokens_seen": 181854624, "step": 84335 }, { "epoch": 15.478069370526702, "grad_norm": 0.0001190754264825955, "learning_rate": 1.4781383078316636e-06, "loss": 0.0, "num_input_tokens_seen": 181865216, "step": 84340 }, { "epoch": 15.478986970086254, "grad_norm": 0.0007315269904211164, "learning_rate": 1.4775699513348563e-06, "loss": 0.0, "num_input_tokens_seen": 181876768, "step": 84345 }, { "epoch": 15.479904569645807, "grad_norm": 0.0002582152374088764, "learning_rate": 1.477001685182921e-06, "loss": 0.0, "num_input_tokens_seen": 181888576, "step": 84350 }, { "epoch": 15.480822169205359, "grad_norm": 0.00011337766045471653, "learning_rate": 1.4764335093904319e-06, "loss": 0.0, "num_input_tokens_seen": 181898752, "step": 84355 }, { "epoch": 15.48173976876491, "grad_norm": 30.665620803833008, "learning_rate": 1.4758654239719612e-06, "loss": 0.1625, "num_input_tokens_seen": 181909728, "step": 84360 }, { "epoch": 15.482657368324464, "grad_norm": 14.092488288879395, "learning_rate": 1.4752974289420813e-06, "loss": 0.0284, "num_input_tokens_seen": 181920992, "step": 84365 }, { "epoch": 15.483574967884016, "grad_norm": 0.0006449755164794624, "learning_rate": 1.4747295243153597e-06, "loss": 0.0, "num_input_tokens_seen": 181932160, "step": 84370 }, { "epoch": 15.484492567443567, "grad_norm": 0.0002464938734192401, "learning_rate": 1.4741617101063626e-06, "loss": 0.0, "num_input_tokens_seen": 181942400, "step": 84375 }, { "epoch": 15.48541016700312, "grad_norm": 0.0019433620618656278, "learning_rate": 1.4735939863296527e-06, "loss": 0.0, "num_input_tokens_seen": 181953120, "step": 84380 }, { "epoch": 15.486327766562672, "grad_norm": 0.0002820042136590928, "learning_rate": 1.47302635299979e-06, "loss": 0.0, "num_input_tokens_seen": 181964768, "step": 84385 }, { "epoch": 15.487245366122224, "grad_norm": 0.00012801127741113305, "learning_rate": 1.4724588101313369e-06, "loss": 0.0001, "num_input_tokens_seen": 181976640, "step": 84390 }, { "epoch": 15.488162965681777, "grad_norm": 0.00044632222852669656, "learning_rate": 1.4718913577388483e-06, "loss": 0.0, "num_input_tokens_seen": 181988384, "step": 84395 }, { "epoch": 15.489080565241329, "grad_norm": 0.001782241859473288, "learning_rate": 1.4713239958368763e-06, "loss": 0.0, "num_input_tokens_seen": 181999872, "step": 84400 }, { "epoch": 15.48999816480088, "grad_norm": 0.0003291811444796622, "learning_rate": 1.4707567244399761e-06, "loss": 0.0, "num_input_tokens_seen": 182010432, "step": 84405 }, { "epoch": 15.490915764360434, "grad_norm": 0.00012817462265957147, "learning_rate": 1.4701895435626967e-06, "loss": 0.0, "num_input_tokens_seen": 182021024, "step": 84410 }, { "epoch": 15.491833363919985, "grad_norm": 0.0014526661252602935, "learning_rate": 1.4696224532195847e-06, "loss": 0.0, "num_input_tokens_seen": 182032544, "step": 84415 }, { "epoch": 15.492750963479537, "grad_norm": 0.00034091834095306695, "learning_rate": 1.4690554534251838e-06, "loss": 0.0, "num_input_tokens_seen": 182043360, "step": 84420 }, { "epoch": 15.49366856303909, "grad_norm": 0.0008756418246775866, "learning_rate": 1.4684885441940393e-06, "loss": 0.0, "num_input_tokens_seen": 182053536, "step": 84425 }, { "epoch": 15.494586162598642, "grad_norm": 0.00014784435916226357, "learning_rate": 1.4679217255406902e-06, "loss": 0.0002, "num_input_tokens_seen": 182063872, "step": 84430 }, { "epoch": 15.495503762158194, "grad_norm": 0.00019823033653665334, "learning_rate": 1.4673549974796735e-06, "loss": 0.0, "num_input_tokens_seen": 182075104, "step": 84435 }, { "epoch": 15.496421361717747, "grad_norm": 0.0001730974909150973, "learning_rate": 1.4667883600255272e-06, "loss": 0.0, "num_input_tokens_seen": 182087168, "step": 84440 }, { "epoch": 15.497338961277299, "grad_norm": 9.304335253546014e-05, "learning_rate": 1.4662218131927835e-06, "loss": 0.0, "num_input_tokens_seen": 182098848, "step": 84445 }, { "epoch": 15.49825656083685, "grad_norm": 0.00018810454639606178, "learning_rate": 1.4656553569959719e-06, "loss": 0.0, "num_input_tokens_seen": 182110208, "step": 84450 }, { "epoch": 15.499174160396404, "grad_norm": 0.00013036126620136201, "learning_rate": 1.465088991449624e-06, "loss": 0.0001, "num_input_tokens_seen": 182120704, "step": 84455 }, { "epoch": 15.500091759955955, "grad_norm": 0.00022092656581662595, "learning_rate": 1.4645227165682652e-06, "loss": 0.0, "num_input_tokens_seen": 182131712, "step": 84460 }, { "epoch": 15.501009359515507, "grad_norm": 0.00010793599358294159, "learning_rate": 1.4639565323664196e-06, "loss": 0.0, "num_input_tokens_seen": 182142976, "step": 84465 }, { "epoch": 15.50192695907506, "grad_norm": 0.00013388886873144656, "learning_rate": 1.4633904388586063e-06, "loss": 0.0, "num_input_tokens_seen": 182153600, "step": 84470 }, { "epoch": 15.502844558634612, "grad_norm": 0.00046424538595601916, "learning_rate": 1.4628244360593492e-06, "loss": 0.0, "num_input_tokens_seen": 182164288, "step": 84475 }, { "epoch": 15.503762158194164, "grad_norm": 0.00023705918283667415, "learning_rate": 1.4622585239831627e-06, "loss": 0.0, "num_input_tokens_seen": 182176064, "step": 84480 }, { "epoch": 15.504679757753717, "grad_norm": 0.00025747716426849365, "learning_rate": 1.4616927026445604e-06, "loss": 0.0, "num_input_tokens_seen": 182186080, "step": 84485 }, { "epoch": 15.505597357313269, "grad_norm": 0.000728664337657392, "learning_rate": 1.4611269720580578e-06, "loss": 0.0, "num_input_tokens_seen": 182197888, "step": 84490 }, { "epoch": 15.50651495687282, "grad_norm": 0.0009750569588504732, "learning_rate": 1.4605613322381644e-06, "loss": 0.0, "num_input_tokens_seen": 182209088, "step": 84495 }, { "epoch": 15.507432556432374, "grad_norm": 0.006687323562800884, "learning_rate": 1.459995783199385e-06, "loss": 0.0, "num_input_tokens_seen": 182220672, "step": 84500 }, { "epoch": 15.508350155991925, "grad_norm": 0.0003832852526102215, "learning_rate": 1.459430324956229e-06, "loss": 0.0, "num_input_tokens_seen": 182232288, "step": 84505 }, { "epoch": 15.509267755551477, "grad_norm": 0.0006045612972229719, "learning_rate": 1.4588649575231978e-06, "loss": 0.0, "num_input_tokens_seen": 182242720, "step": 84510 }, { "epoch": 15.51018535511103, "grad_norm": 0.00028970115818083286, "learning_rate": 1.458299680914792e-06, "loss": 0.0, "num_input_tokens_seen": 182253664, "step": 84515 }, { "epoch": 15.511102954670582, "grad_norm": 0.0005706904921680689, "learning_rate": 1.4577344951455092e-06, "loss": 0.0, "num_input_tokens_seen": 182264640, "step": 84520 }, { "epoch": 15.512020554230133, "grad_norm": 0.00011238301522098482, "learning_rate": 1.4571694002298476e-06, "loss": 0.0, "num_input_tokens_seen": 182275360, "step": 84525 }, { "epoch": 15.512938153789687, "grad_norm": 0.0014548193430528045, "learning_rate": 1.4566043961823001e-06, "loss": 0.0, "num_input_tokens_seen": 182286304, "step": 84530 }, { "epoch": 15.513855753349238, "grad_norm": 0.009943991899490356, "learning_rate": 1.4560394830173569e-06, "loss": 0.0, "num_input_tokens_seen": 182296544, "step": 84535 }, { "epoch": 15.51477335290879, "grad_norm": 0.0005300975171849132, "learning_rate": 1.4554746607495097e-06, "loss": 0.0, "num_input_tokens_seen": 182307872, "step": 84540 }, { "epoch": 15.515690952468344, "grad_norm": 0.0005579007556661963, "learning_rate": 1.4549099293932439e-06, "loss": 0.0, "num_input_tokens_seen": 182318720, "step": 84545 }, { "epoch": 15.516608552027895, "grad_norm": 0.00015120682655833662, "learning_rate": 1.4543452889630438e-06, "loss": 0.0, "num_input_tokens_seen": 182329728, "step": 84550 }, { "epoch": 15.517526151587447, "grad_norm": 9.019447315949947e-05, "learning_rate": 1.4537807394733905e-06, "loss": 0.0, "num_input_tokens_seen": 182340256, "step": 84555 }, { "epoch": 15.518443751147, "grad_norm": 7.48331003705971e-05, "learning_rate": 1.4532162809387663e-06, "loss": 0.0, "num_input_tokens_seen": 182350048, "step": 84560 }, { "epoch": 15.519361350706552, "grad_norm": 0.07378020137548447, "learning_rate": 1.4526519133736477e-06, "loss": 0.0001, "num_input_tokens_seen": 182358976, "step": 84565 }, { "epoch": 15.520278950266103, "grad_norm": 0.001305874902755022, "learning_rate": 1.4520876367925097e-06, "loss": 0.0, "num_input_tokens_seen": 182370656, "step": 84570 }, { "epoch": 15.521196549825657, "grad_norm": 0.0003245566913392395, "learning_rate": 1.4515234512098247e-06, "loss": 0.0, "num_input_tokens_seen": 182380832, "step": 84575 }, { "epoch": 15.522114149385208, "grad_norm": 0.000934479059651494, "learning_rate": 1.4509593566400627e-06, "loss": 0.0, "num_input_tokens_seen": 182391936, "step": 84580 }, { "epoch": 15.52303174894476, "grad_norm": 0.0002915232034865767, "learning_rate": 1.4503953530976933e-06, "loss": 0.0, "num_input_tokens_seen": 182402816, "step": 84585 }, { "epoch": 15.523949348504313, "grad_norm": 7.291847578017041e-05, "learning_rate": 1.4498314405971826e-06, "loss": 0.0, "num_input_tokens_seen": 182414816, "step": 84590 }, { "epoch": 15.524866948063865, "grad_norm": 0.056564148515462875, "learning_rate": 1.4492676191529926e-06, "loss": 0.0, "num_input_tokens_seen": 182426656, "step": 84595 }, { "epoch": 15.525784547623417, "grad_norm": 0.012338702566921711, "learning_rate": 1.4487038887795851e-06, "loss": 0.0, "num_input_tokens_seen": 182437856, "step": 84600 }, { "epoch": 15.52670214718297, "grad_norm": 0.00014134799130260944, "learning_rate": 1.4481402494914175e-06, "loss": 0.0, "num_input_tokens_seen": 182448992, "step": 84605 }, { "epoch": 15.527619746742522, "grad_norm": 0.0007581879035569727, "learning_rate": 1.447576701302949e-06, "loss": 0.0, "num_input_tokens_seen": 182460544, "step": 84610 }, { "epoch": 15.528537346302073, "grad_norm": 0.009225565940141678, "learning_rate": 1.4470132442286322e-06, "loss": 0.0001, "num_input_tokens_seen": 182471072, "step": 84615 }, { "epoch": 15.529454945861627, "grad_norm": 0.012059086002409458, "learning_rate": 1.4464498782829178e-06, "loss": 0.0, "num_input_tokens_seen": 182481280, "step": 84620 }, { "epoch": 15.530372545421178, "grad_norm": 0.0002608868817333132, "learning_rate": 1.4458866034802581e-06, "loss": 0.0, "num_input_tokens_seen": 182491840, "step": 84625 }, { "epoch": 15.53129014498073, "grad_norm": 0.0027974897529929876, "learning_rate": 1.4453234198350986e-06, "loss": 0.0, "num_input_tokens_seen": 182502048, "step": 84630 }, { "epoch": 15.532207744540283, "grad_norm": 0.00014880194794386625, "learning_rate": 1.4447603273618826e-06, "loss": 0.0, "num_input_tokens_seen": 182512192, "step": 84635 }, { "epoch": 15.533125344099835, "grad_norm": 0.0004700798890553415, "learning_rate": 1.4441973260750553e-06, "loss": 0.0, "num_input_tokens_seen": 182521792, "step": 84640 }, { "epoch": 15.534042943659387, "grad_norm": 0.000552878831513226, "learning_rate": 1.4436344159890559e-06, "loss": 0.0, "num_input_tokens_seen": 182532576, "step": 84645 }, { "epoch": 15.53496054321894, "grad_norm": 0.014683895744383335, "learning_rate": 1.4430715971183218e-06, "loss": 0.0, "num_input_tokens_seen": 182545344, "step": 84650 }, { "epoch": 15.535878142778492, "grad_norm": 0.00163019890896976, "learning_rate": 1.442508869477287e-06, "loss": 0.0, "num_input_tokens_seen": 182557280, "step": 84655 }, { "epoch": 15.536795742338043, "grad_norm": 0.0038126036524772644, "learning_rate": 1.4419462330803879e-06, "loss": 0.0, "num_input_tokens_seen": 182568256, "step": 84660 }, { "epoch": 15.537713341897597, "grad_norm": 0.00030547435744665563, "learning_rate": 1.4413836879420528e-06, "loss": 0.0, "num_input_tokens_seen": 182579104, "step": 84665 }, { "epoch": 15.538630941457148, "grad_norm": 0.0035222931765019894, "learning_rate": 1.4408212340767096e-06, "loss": 0.0, "num_input_tokens_seen": 182589856, "step": 84670 }, { "epoch": 15.5395485410167, "grad_norm": 0.00011044121492886916, "learning_rate": 1.440258871498787e-06, "loss": 0.0, "num_input_tokens_seen": 182599808, "step": 84675 }, { "epoch": 15.540466140576253, "grad_norm": 0.00018094241386279464, "learning_rate": 1.4396966002227075e-06, "loss": 0.0, "num_input_tokens_seen": 182611520, "step": 84680 }, { "epoch": 15.541383740135805, "grad_norm": 9.5979223260656e-05, "learning_rate": 1.4391344202628905e-06, "loss": 0.0, "num_input_tokens_seen": 182622080, "step": 84685 }, { "epoch": 15.542301339695356, "grad_norm": 0.00017528953321743757, "learning_rate": 1.438572331633758e-06, "loss": 0.0, "num_input_tokens_seen": 182633152, "step": 84690 }, { "epoch": 15.54321893925491, "grad_norm": 0.0001467200490878895, "learning_rate": 1.438010334349726e-06, "loss": 0.0, "num_input_tokens_seen": 182643936, "step": 84695 }, { "epoch": 15.544136538814461, "grad_norm": 0.005734507460147142, "learning_rate": 1.4374484284252077e-06, "loss": 0.0, "num_input_tokens_seen": 182655104, "step": 84700 }, { "epoch": 15.545054138374013, "grad_norm": 0.00010716418182710186, "learning_rate": 1.4368866138746147e-06, "loss": 0.0, "num_input_tokens_seen": 182665600, "step": 84705 }, { "epoch": 15.545971737933566, "grad_norm": 0.004279549699276686, "learning_rate": 1.436324890712359e-06, "loss": 0.0, "num_input_tokens_seen": 182675808, "step": 84710 }, { "epoch": 15.546889337493118, "grad_norm": 0.0001414763682987541, "learning_rate": 1.435763258952847e-06, "loss": 0.0, "num_input_tokens_seen": 182687072, "step": 84715 }, { "epoch": 15.54780693705267, "grad_norm": 0.00011690222891047597, "learning_rate": 1.4352017186104816e-06, "loss": 0.0, "num_input_tokens_seen": 182697728, "step": 84720 }, { "epoch": 15.548724536612223, "grad_norm": 0.00140655180439353, "learning_rate": 1.4346402696996685e-06, "loss": 0.0, "num_input_tokens_seen": 182708192, "step": 84725 }, { "epoch": 15.549642136171775, "grad_norm": 0.00034445407800376415, "learning_rate": 1.434078912234807e-06, "loss": 0.0001, "num_input_tokens_seen": 182719616, "step": 84730 }, { "epoch": 15.550559735731326, "grad_norm": 0.0003212891169823706, "learning_rate": 1.4335176462302947e-06, "loss": 0.0, "num_input_tokens_seen": 182730848, "step": 84735 }, { "epoch": 15.55147733529088, "grad_norm": 8.57881605043076e-05, "learning_rate": 1.4329564717005257e-06, "loss": 0.0, "num_input_tokens_seen": 182741760, "step": 84740 }, { "epoch": 15.552394934850431, "grad_norm": 7.841783371986821e-05, "learning_rate": 1.4323953886598963e-06, "loss": 0.0, "num_input_tokens_seen": 182752416, "step": 84745 }, { "epoch": 15.553312534409983, "grad_norm": 604.8499145507812, "learning_rate": 1.431834397122796e-06, "loss": 0.0703, "num_input_tokens_seen": 182763680, "step": 84750 }, { "epoch": 15.554230133969536, "grad_norm": 0.0003801110724452883, "learning_rate": 1.4312734971036113e-06, "loss": 0.0001, "num_input_tokens_seen": 182774144, "step": 84755 }, { "epoch": 15.555147733529088, "grad_norm": 0.000928712310269475, "learning_rate": 1.430712688616732e-06, "loss": 0.0, "num_input_tokens_seen": 182785824, "step": 84760 }, { "epoch": 15.55606533308864, "grad_norm": 0.00010101066436618567, "learning_rate": 1.4301519716765405e-06, "loss": 0.0, "num_input_tokens_seen": 182797504, "step": 84765 }, { "epoch": 15.556982932648193, "grad_norm": 0.00106595188844949, "learning_rate": 1.4295913462974187e-06, "loss": 0.0, "num_input_tokens_seen": 182807872, "step": 84770 }, { "epoch": 15.557900532207745, "grad_norm": 0.0001820629695430398, "learning_rate": 1.4290308124937429e-06, "loss": 0.0, "num_input_tokens_seen": 182818880, "step": 84775 }, { "epoch": 15.558818131767296, "grad_norm": 0.00011252659896854311, "learning_rate": 1.428470370279894e-06, "loss": 0.0, "num_input_tokens_seen": 182829920, "step": 84780 }, { "epoch": 15.55973573132685, "grad_norm": 0.0006642877124249935, "learning_rate": 1.427910019670245e-06, "loss": 0.0, "num_input_tokens_seen": 182840800, "step": 84785 }, { "epoch": 15.560653330886401, "grad_norm": 0.00012189658446004614, "learning_rate": 1.4273497606791675e-06, "loss": 0.0, "num_input_tokens_seen": 182851904, "step": 84790 }, { "epoch": 15.561570930445953, "grad_norm": 7.983671821421012e-05, "learning_rate": 1.426789593321032e-06, "loss": 0.0005, "num_input_tokens_seen": 182863712, "step": 84795 }, { "epoch": 15.562488530005506, "grad_norm": 0.0033004831057041883, "learning_rate": 1.4262295176102047e-06, "loss": 0.0, "num_input_tokens_seen": 182875104, "step": 84800 }, { "epoch": 15.563406129565058, "grad_norm": 0.0001947767595993355, "learning_rate": 1.4256695335610504e-06, "loss": 0.0, "num_input_tokens_seen": 182885952, "step": 84805 }, { "epoch": 15.56432372912461, "grad_norm": 0.00014406682748813182, "learning_rate": 1.425109641187934e-06, "loss": 0.0, "num_input_tokens_seen": 182895328, "step": 84810 }, { "epoch": 15.565241328684163, "grad_norm": 0.00010613967606332153, "learning_rate": 1.424549840505215e-06, "loss": 0.0, "num_input_tokens_seen": 182906464, "step": 84815 }, { "epoch": 15.566158928243714, "grad_norm": 0.004331488162279129, "learning_rate": 1.4239901315272498e-06, "loss": 0.1804, "num_input_tokens_seen": 182916672, "step": 84820 }, { "epoch": 15.567076527803266, "grad_norm": 0.004522764589637518, "learning_rate": 1.4234305142683963e-06, "loss": 0.0, "num_input_tokens_seen": 182928352, "step": 84825 }, { "epoch": 15.56799412736282, "grad_norm": 0.0002763635420706123, "learning_rate": 1.4228709887430075e-06, "loss": 0.0, "num_input_tokens_seen": 182939808, "step": 84830 }, { "epoch": 15.568911726922371, "grad_norm": 0.00020617333939298987, "learning_rate": 1.4223115549654337e-06, "loss": 0.0, "num_input_tokens_seen": 182949440, "step": 84835 }, { "epoch": 15.569829326481923, "grad_norm": 0.002693560440093279, "learning_rate": 1.4217522129500222e-06, "loss": 0.0, "num_input_tokens_seen": 182961440, "step": 84840 }, { "epoch": 15.570746926041476, "grad_norm": 0.00017410672444384545, "learning_rate": 1.421192962711122e-06, "loss": 0.0001, "num_input_tokens_seen": 182972512, "step": 84845 }, { "epoch": 15.571664525601028, "grad_norm": 0.0006536440923810005, "learning_rate": 1.4206338042630757e-06, "loss": 0.0, "num_input_tokens_seen": 182982624, "step": 84850 }, { "epoch": 15.57258212516058, "grad_norm": 0.022842179983854294, "learning_rate": 1.4200747376202228e-06, "loss": 0.0, "num_input_tokens_seen": 182993088, "step": 84855 }, { "epoch": 15.573499724720133, "grad_norm": 9.175093146041036e-05, "learning_rate": 1.419515762796907e-06, "loss": 0.0, "num_input_tokens_seen": 183003456, "step": 84860 }, { "epoch": 15.574417324279684, "grad_norm": 114.14559936523438, "learning_rate": 1.4189568798074615e-06, "loss": 0.0284, "num_input_tokens_seen": 183014784, "step": 84865 }, { "epoch": 15.575334923839236, "grad_norm": 0.00019714712107088417, "learning_rate": 1.4183980886662214e-06, "loss": 0.0, "num_input_tokens_seen": 183026176, "step": 84870 }, { "epoch": 15.57625252339879, "grad_norm": 0.0060972548089921474, "learning_rate": 1.4178393893875204e-06, "loss": 0.0, "num_input_tokens_seen": 183036128, "step": 84875 }, { "epoch": 15.577170122958341, "grad_norm": 9.81894409051165e-05, "learning_rate": 1.4172807819856872e-06, "loss": 0.0, "num_input_tokens_seen": 183047232, "step": 84880 }, { "epoch": 15.578087722517893, "grad_norm": 0.00017439975636079907, "learning_rate": 1.4167222664750495e-06, "loss": 0.0001, "num_input_tokens_seen": 183056192, "step": 84885 }, { "epoch": 15.579005322077446, "grad_norm": 6.526811193907633e-05, "learning_rate": 1.4161638428699304e-06, "loss": 0.0001, "num_input_tokens_seen": 183067072, "step": 84890 }, { "epoch": 15.579922921636998, "grad_norm": 0.0001278842828469351, "learning_rate": 1.4156055111846555e-06, "loss": 0.0, "num_input_tokens_seen": 183078528, "step": 84895 }, { "epoch": 15.58084052119655, "grad_norm": 8.785335376160219e-05, "learning_rate": 1.4150472714335446e-06, "loss": 0.0, "num_input_tokens_seen": 183089824, "step": 84900 }, { "epoch": 15.581758120756103, "grad_norm": 3.9696204662323, "learning_rate": 1.4144891236309127e-06, "loss": 0.0025, "num_input_tokens_seen": 183100960, "step": 84905 }, { "epoch": 15.582675720315654, "grad_norm": 0.00011893613555002958, "learning_rate": 1.41393106779108e-06, "loss": 0.0, "num_input_tokens_seen": 183112224, "step": 84910 }, { "epoch": 15.583593319875206, "grad_norm": 0.0002596481645014137, "learning_rate": 1.413373103928357e-06, "loss": 0.0, "num_input_tokens_seen": 183123904, "step": 84915 }, { "epoch": 15.58451091943476, "grad_norm": 0.00013048049004282802, "learning_rate": 1.4128152320570553e-06, "loss": 0.0, "num_input_tokens_seen": 183135392, "step": 84920 }, { "epoch": 15.585428518994311, "grad_norm": 8.53288802318275e-05, "learning_rate": 1.4122574521914818e-06, "loss": 0.0, "num_input_tokens_seen": 183145696, "step": 84925 }, { "epoch": 15.586346118553863, "grad_norm": 0.0004772621323354542, "learning_rate": 1.4116997643459458e-06, "loss": 0.0007, "num_input_tokens_seen": 183156768, "step": 84930 }, { "epoch": 15.587263718113416, "grad_norm": 0.00010963248496409506, "learning_rate": 1.4111421685347493e-06, "loss": 0.0, "num_input_tokens_seen": 183165568, "step": 84935 }, { "epoch": 15.588181317672968, "grad_norm": 7.338661816902459e-05, "learning_rate": 1.4105846647721922e-06, "loss": 0.0, "num_input_tokens_seen": 183177024, "step": 84940 }, { "epoch": 15.58909891723252, "grad_norm": 0.01287072617560625, "learning_rate": 1.4100272530725773e-06, "loss": 0.0, "num_input_tokens_seen": 183186592, "step": 84945 }, { "epoch": 15.590016516792073, "grad_norm": 0.00899010431021452, "learning_rate": 1.4094699334501988e-06, "loss": 0.0, "num_input_tokens_seen": 183197344, "step": 84950 }, { "epoch": 15.590934116351624, "grad_norm": 5.754578160122037e-05, "learning_rate": 1.4089127059193507e-06, "loss": 0.0, "num_input_tokens_seen": 183208448, "step": 84955 }, { "epoch": 15.591851715911176, "grad_norm": 0.00016721463180147111, "learning_rate": 1.4083555704943275e-06, "loss": 0.0, "num_input_tokens_seen": 183219328, "step": 84960 }, { "epoch": 15.59276931547073, "grad_norm": 0.00029131618794053793, "learning_rate": 1.4077985271894173e-06, "loss": 0.0, "num_input_tokens_seen": 183231872, "step": 84965 }, { "epoch": 15.59368691503028, "grad_norm": 0.00029148635803721845, "learning_rate": 1.4072415760189074e-06, "loss": 0.0, "num_input_tokens_seen": 183243680, "step": 84970 }, { "epoch": 15.594604514589832, "grad_norm": 0.00012430615606717765, "learning_rate": 1.4066847169970815e-06, "loss": 0.0, "num_input_tokens_seen": 183255872, "step": 84975 }, { "epoch": 15.595522114149386, "grad_norm": 0.0003271112509537488, "learning_rate": 1.4061279501382247e-06, "loss": 0.0, "num_input_tokens_seen": 183266688, "step": 84980 }, { "epoch": 15.596439713708937, "grad_norm": 0.0006647874251939356, "learning_rate": 1.4055712754566164e-06, "loss": 0.0, "num_input_tokens_seen": 183278080, "step": 84985 }, { "epoch": 15.597357313268489, "grad_norm": 9.351534390589222e-05, "learning_rate": 1.4050146929665337e-06, "loss": 0.0, "num_input_tokens_seen": 183290784, "step": 84990 }, { "epoch": 15.598274912828042, "grad_norm": 71.80829620361328, "learning_rate": 1.4044582026822523e-06, "loss": 0.0144, "num_input_tokens_seen": 183302624, "step": 84995 }, { "epoch": 15.599192512387594, "grad_norm": 0.00011097321112174541, "learning_rate": 1.4039018046180442e-06, "loss": 0.0, "num_input_tokens_seen": 183314272, "step": 85000 }, { "epoch": 15.600110111947146, "grad_norm": 0.0026221026200801134, "learning_rate": 1.4033454987881828e-06, "loss": 0.0, "num_input_tokens_seen": 183325312, "step": 85005 }, { "epoch": 15.601027711506699, "grad_norm": 0.00010904213559115306, "learning_rate": 1.4027892852069347e-06, "loss": 0.0, "num_input_tokens_seen": 183335200, "step": 85010 }, { "epoch": 15.60194531106625, "grad_norm": 0.001138315419666469, "learning_rate": 1.4022331638885666e-06, "loss": 0.0, "num_input_tokens_seen": 183345056, "step": 85015 }, { "epoch": 15.602862910625802, "grad_norm": 0.0001465635868953541, "learning_rate": 1.4016771348473418e-06, "loss": 0.0, "num_input_tokens_seen": 183355072, "step": 85020 }, { "epoch": 15.603780510185356, "grad_norm": 0.0005239314050413668, "learning_rate": 1.4011211980975198e-06, "loss": 0.0, "num_input_tokens_seen": 183366592, "step": 85025 }, { "epoch": 15.604698109744907, "grad_norm": 0.008529062382876873, "learning_rate": 1.400565353653363e-06, "loss": 0.0, "num_input_tokens_seen": 183378240, "step": 85030 }, { "epoch": 15.605615709304459, "grad_norm": 8.588687342125922e-05, "learning_rate": 1.4000096015291264e-06, "loss": 0.0, "num_input_tokens_seen": 183390528, "step": 85035 }, { "epoch": 15.606533308864012, "grad_norm": 0.00025983527302742004, "learning_rate": 1.3994539417390623e-06, "loss": 0.0, "num_input_tokens_seen": 183402144, "step": 85040 }, { "epoch": 15.607450908423564, "grad_norm": 0.0019055693410336971, "learning_rate": 1.398898374297426e-06, "loss": 0.0, "num_input_tokens_seen": 183413088, "step": 85045 }, { "epoch": 15.608368507983116, "grad_norm": 0.001529224100522697, "learning_rate": 1.3983428992184656e-06, "loss": 0.0, "num_input_tokens_seen": 183422688, "step": 85050 }, { "epoch": 15.609286107542669, "grad_norm": 0.00020965187286492437, "learning_rate": 1.3977875165164273e-06, "loss": 0.0, "num_input_tokens_seen": 183433984, "step": 85055 }, { "epoch": 15.61020370710222, "grad_norm": 0.013363627716898918, "learning_rate": 1.3972322262055543e-06, "loss": 0.0, "num_input_tokens_seen": 183446016, "step": 85060 }, { "epoch": 15.611121306661772, "grad_norm": 0.0001333783147856593, "learning_rate": 1.396677028300093e-06, "loss": 0.0, "num_input_tokens_seen": 183456832, "step": 85065 }, { "epoch": 15.612038906221326, "grad_norm": 0.00014508665481116623, "learning_rate": 1.3961219228142813e-06, "loss": 0.0, "num_input_tokens_seen": 183466304, "step": 85070 }, { "epoch": 15.612956505780877, "grad_norm": 0.0001072126324288547, "learning_rate": 1.3955669097623548e-06, "loss": 0.0, "num_input_tokens_seen": 183476736, "step": 85075 }, { "epoch": 15.613874105340429, "grad_norm": 0.00010372648830525577, "learning_rate": 1.3950119891585529e-06, "loss": 0.0, "num_input_tokens_seen": 183487424, "step": 85080 }, { "epoch": 15.614791704899982, "grad_norm": 0.0006608740077354014, "learning_rate": 1.394457161017106e-06, "loss": 0.0, "num_input_tokens_seen": 183496960, "step": 85085 }, { "epoch": 15.615709304459534, "grad_norm": 7.686047320021316e-05, "learning_rate": 1.3939024253522432e-06, "loss": 0.0, "num_input_tokens_seen": 183507584, "step": 85090 }, { "epoch": 15.616626904019085, "grad_norm": 0.00028864105115644634, "learning_rate": 1.3933477821781954e-06, "loss": 0.0001, "num_input_tokens_seen": 183518336, "step": 85095 }, { "epoch": 15.617544503578639, "grad_norm": 0.00011031675239792094, "learning_rate": 1.3927932315091874e-06, "loss": 0.0, "num_input_tokens_seen": 183530304, "step": 85100 }, { "epoch": 15.61846210313819, "grad_norm": 0.00019479586626403034, "learning_rate": 1.3922387733594428e-06, "loss": 0.0, "num_input_tokens_seen": 183541440, "step": 85105 }, { "epoch": 15.619379702697742, "grad_norm": 0.001268718158826232, "learning_rate": 1.3916844077431802e-06, "loss": 0.0, "num_input_tokens_seen": 183552672, "step": 85110 }, { "epoch": 15.620297302257296, "grad_norm": 0.00026949934544973075, "learning_rate": 1.391130134674622e-06, "loss": 0.0001, "num_input_tokens_seen": 183564544, "step": 85115 }, { "epoch": 15.621214901816847, "grad_norm": 0.0017182715237140656, "learning_rate": 1.3905759541679826e-06, "loss": 0.0, "num_input_tokens_seen": 183574464, "step": 85120 }, { "epoch": 15.622132501376399, "grad_norm": 0.00011147961777169257, "learning_rate": 1.3900218662374737e-06, "loss": 0.0, "num_input_tokens_seen": 183586176, "step": 85125 }, { "epoch": 15.623050100935952, "grad_norm": 8.0738594988361e-05, "learning_rate": 1.3894678708973108e-06, "loss": 0.0, "num_input_tokens_seen": 183596768, "step": 85130 }, { "epoch": 15.623967700495504, "grad_norm": 7.362432370427996e-05, "learning_rate": 1.3889139681617014e-06, "loss": 0.0, "num_input_tokens_seen": 183607712, "step": 85135 }, { "epoch": 15.624885300055055, "grad_norm": 0.0007567528518848121, "learning_rate": 1.3883601580448508e-06, "loss": 0.0, "num_input_tokens_seen": 183619104, "step": 85140 }, { "epoch": 15.625802899614609, "grad_norm": 0.00024960533482953906, "learning_rate": 1.387806440560966e-06, "loss": 0.0, "num_input_tokens_seen": 183629824, "step": 85145 }, { "epoch": 15.62672049917416, "grad_norm": 0.00021170324180275202, "learning_rate": 1.3872528157242471e-06, "loss": 0.0, "num_input_tokens_seen": 183639392, "step": 85150 }, { "epoch": 15.627638098733712, "grad_norm": 0.0001793618139345199, "learning_rate": 1.3866992835488945e-06, "loss": 0.0, "num_input_tokens_seen": 183650624, "step": 85155 }, { "epoch": 15.628555698293265, "grad_norm": 5.696272637578659e-05, "learning_rate": 1.3861458440491038e-06, "loss": 0.0, "num_input_tokens_seen": 183661792, "step": 85160 }, { "epoch": 15.629473297852817, "grad_norm": 0.0012276082998141646, "learning_rate": 1.3855924972390728e-06, "loss": 0.0001, "num_input_tokens_seen": 183673728, "step": 85165 }, { "epoch": 15.630390897412369, "grad_norm": 0.015055222436785698, "learning_rate": 1.3850392431329918e-06, "loss": 0.0001, "num_input_tokens_seen": 183683936, "step": 85170 }, { "epoch": 15.631308496971922, "grad_norm": 9.286567365052179e-05, "learning_rate": 1.3844860817450507e-06, "loss": 0.0, "num_input_tokens_seen": 183693984, "step": 85175 }, { "epoch": 15.632226096531474, "grad_norm": 0.000733103312086314, "learning_rate": 1.383933013089439e-06, "loss": 0.0, "num_input_tokens_seen": 183703872, "step": 85180 }, { "epoch": 15.633143696091025, "grad_norm": 0.003057253547012806, "learning_rate": 1.3833800371803419e-06, "loss": 0.0, "num_input_tokens_seen": 183713888, "step": 85185 }, { "epoch": 15.634061295650579, "grad_norm": 0.0010007289238274097, "learning_rate": 1.382827154031941e-06, "loss": 0.0, "num_input_tokens_seen": 183724448, "step": 85190 }, { "epoch": 15.63497889521013, "grad_norm": 0.00020396897161845118, "learning_rate": 1.3822743636584162e-06, "loss": 0.0, "num_input_tokens_seen": 183735616, "step": 85195 }, { "epoch": 15.635896494769682, "grad_norm": 0.0006434296374209225, "learning_rate": 1.3817216660739486e-06, "loss": 0.0, "num_input_tokens_seen": 183747488, "step": 85200 }, { "epoch": 15.636814094329235, "grad_norm": 0.00022306725441012532, "learning_rate": 1.381169061292712e-06, "loss": 0.0, "num_input_tokens_seen": 183758560, "step": 85205 }, { "epoch": 15.637731693888787, "grad_norm": 0.0007888399413786829, "learning_rate": 1.3806165493288808e-06, "loss": 0.0, "num_input_tokens_seen": 183769536, "step": 85210 }, { "epoch": 15.638649293448339, "grad_norm": 0.0001974381011677906, "learning_rate": 1.380064130196625e-06, "loss": 0.0, "num_input_tokens_seen": 183779456, "step": 85215 }, { "epoch": 15.639566893007892, "grad_norm": 0.00016238245007116348, "learning_rate": 1.379511803910113e-06, "loss": 0.3906, "num_input_tokens_seen": 183790496, "step": 85220 }, { "epoch": 15.640484492567444, "grad_norm": 0.00029621776775456965, "learning_rate": 1.378959570483513e-06, "loss": 0.0, "num_input_tokens_seen": 183802272, "step": 85225 }, { "epoch": 15.641402092126995, "grad_norm": 0.0005271760164760053, "learning_rate": 1.3784074299309885e-06, "loss": 0.0, "num_input_tokens_seen": 183813920, "step": 85230 }, { "epoch": 15.642319691686549, "grad_norm": 9.997330926125869e-05, "learning_rate": 1.3778553822667e-06, "loss": 0.0, "num_input_tokens_seen": 183823488, "step": 85235 }, { "epoch": 15.6432372912461, "grad_norm": 0.002737989416345954, "learning_rate": 1.377303427504807e-06, "loss": 0.0, "num_input_tokens_seen": 183833664, "step": 85240 }, { "epoch": 15.644154890805652, "grad_norm": 0.0005447635776363313, "learning_rate": 1.3767515656594654e-06, "loss": 0.0, "num_input_tokens_seen": 183844608, "step": 85245 }, { "epoch": 15.645072490365205, "grad_norm": 0.00038333673728629947, "learning_rate": 1.3761997967448316e-06, "loss": 0.0, "num_input_tokens_seen": 183855392, "step": 85250 }, { "epoch": 15.645990089924757, "grad_norm": 0.00025573562015779316, "learning_rate": 1.3756481207750572e-06, "loss": 0.0, "num_input_tokens_seen": 183867136, "step": 85255 }, { "epoch": 15.646907689484308, "grad_norm": 7.148388249333948e-05, "learning_rate": 1.3750965377642895e-06, "loss": 0.0, "num_input_tokens_seen": 183876992, "step": 85260 }, { "epoch": 15.647825289043862, "grad_norm": 0.008269027806818485, "learning_rate": 1.3745450477266786e-06, "loss": 0.0, "num_input_tokens_seen": 183887872, "step": 85265 }, { "epoch": 15.648742888603413, "grad_norm": 0.00016688340110704303, "learning_rate": 1.3739936506763685e-06, "loss": 0.0, "num_input_tokens_seen": 183900128, "step": 85270 }, { "epoch": 15.649660488162965, "grad_norm": 0.001033857697620988, "learning_rate": 1.3734423466275004e-06, "loss": 0.0, "num_input_tokens_seen": 183911968, "step": 85275 }, { "epoch": 15.650578087722518, "grad_norm": 0.0014038372319191694, "learning_rate": 1.3728911355942164e-06, "loss": 0.0, "num_input_tokens_seen": 183920736, "step": 85280 }, { "epoch": 15.65149568728207, "grad_norm": 0.0071687400341033936, "learning_rate": 1.3723400175906536e-06, "loss": 0.0, "num_input_tokens_seen": 183932480, "step": 85285 }, { "epoch": 15.652413286841622, "grad_norm": 0.00013330220826901495, "learning_rate": 1.3717889926309468e-06, "loss": 0.0, "num_input_tokens_seen": 183944224, "step": 85290 }, { "epoch": 15.653330886401175, "grad_norm": 0.00013525468239095062, "learning_rate": 1.3712380607292281e-06, "loss": 0.0, "num_input_tokens_seen": 183954912, "step": 85295 }, { "epoch": 15.654248485960727, "grad_norm": 23.62887954711914, "learning_rate": 1.3706872218996299e-06, "loss": 0.0505, "num_input_tokens_seen": 183966240, "step": 85300 }, { "epoch": 15.655166085520278, "grad_norm": 0.00011722454655682668, "learning_rate": 1.3701364761562801e-06, "loss": 0.0, "num_input_tokens_seen": 183977632, "step": 85305 }, { "epoch": 15.656083685079832, "grad_norm": 0.00044004101073369384, "learning_rate": 1.3695858235133019e-06, "loss": 0.0, "num_input_tokens_seen": 183989856, "step": 85310 }, { "epoch": 15.657001284639383, "grad_norm": 0.008580038323998451, "learning_rate": 1.3690352639848226e-06, "loss": 0.0001, "num_input_tokens_seen": 183999744, "step": 85315 }, { "epoch": 15.657918884198935, "grad_norm": 0.00012433555093593895, "learning_rate": 1.3684847975849609e-06, "loss": 0.0, "num_input_tokens_seen": 184010464, "step": 85320 }, { "epoch": 15.658836483758488, "grad_norm": 0.0006004688329994678, "learning_rate": 1.3679344243278348e-06, "loss": 0.0, "num_input_tokens_seen": 184021408, "step": 85325 }, { "epoch": 15.65975408331804, "grad_norm": 0.0008547918405383825, "learning_rate": 1.3673841442275625e-06, "loss": 0.0, "num_input_tokens_seen": 184031424, "step": 85330 }, { "epoch": 15.660671682877592, "grad_norm": 0.0002040999970631674, "learning_rate": 1.366833957298257e-06, "loss": 0.0, "num_input_tokens_seen": 184042016, "step": 85335 }, { "epoch": 15.661589282437145, "grad_norm": 0.0008647774811834097, "learning_rate": 1.3662838635540298e-06, "loss": 0.0, "num_input_tokens_seen": 184052416, "step": 85340 }, { "epoch": 15.662506881996697, "grad_norm": 0.00030118422000668943, "learning_rate": 1.3657338630089883e-06, "loss": 0.007, "num_input_tokens_seen": 184061664, "step": 85345 }, { "epoch": 15.663424481556248, "grad_norm": 0.00013985317491460592, "learning_rate": 1.3651839556772418e-06, "loss": 0.0, "num_input_tokens_seen": 184071808, "step": 85350 }, { "epoch": 15.664342081115802, "grad_norm": 9.609533299226314e-05, "learning_rate": 1.3646341415728936e-06, "loss": 0.0, "num_input_tokens_seen": 184082496, "step": 85355 }, { "epoch": 15.665259680675353, "grad_norm": 5.203509863349609e-05, "learning_rate": 1.364084420710044e-06, "loss": 0.0001, "num_input_tokens_seen": 184093344, "step": 85360 }, { "epoch": 15.666177280234905, "grad_norm": 0.00013852753909304738, "learning_rate": 1.3635347931027947e-06, "loss": 0.0, "num_input_tokens_seen": 184105088, "step": 85365 }, { "epoch": 15.667094879794458, "grad_norm": 0.0017977176466956735, "learning_rate": 1.3629852587652426e-06, "loss": 0.0, "num_input_tokens_seen": 184116160, "step": 85370 }, { "epoch": 15.66801247935401, "grad_norm": 0.00013317455886863172, "learning_rate": 1.3624358177114815e-06, "loss": 0.0, "num_input_tokens_seen": 184127808, "step": 85375 }, { "epoch": 15.668930078913561, "grad_norm": 0.00047940187505446374, "learning_rate": 1.3618864699556029e-06, "loss": 0.0, "num_input_tokens_seen": 184137440, "step": 85380 }, { "epoch": 15.669847678473115, "grad_norm": 8.837821224005893e-05, "learning_rate": 1.3613372155116987e-06, "loss": 0.0016, "num_input_tokens_seen": 184148128, "step": 85385 }, { "epoch": 15.670765278032667, "grad_norm": 0.00010488332191016525, "learning_rate": 1.3607880543938557e-06, "loss": 0.0, "num_input_tokens_seen": 184158048, "step": 85390 }, { "epoch": 15.671682877592218, "grad_norm": 0.000323633779771626, "learning_rate": 1.3602389866161575e-06, "loss": 0.0, "num_input_tokens_seen": 184168704, "step": 85395 }, { "epoch": 15.672600477151772, "grad_norm": 6.531013059429824e-05, "learning_rate": 1.3596900121926893e-06, "loss": 0.0, "num_input_tokens_seen": 184179552, "step": 85400 }, { "epoch": 15.673518076711323, "grad_norm": 0.22361750900745392, "learning_rate": 1.3591411311375307e-06, "loss": 0.0, "num_input_tokens_seen": 184190496, "step": 85405 }, { "epoch": 15.674435676270875, "grad_norm": 0.00024012825451791286, "learning_rate": 1.358592343464759e-06, "loss": 0.0, "num_input_tokens_seen": 184202144, "step": 85410 }, { "epoch": 15.675353275830428, "grad_norm": 0.00023253056860994548, "learning_rate": 1.3580436491884485e-06, "loss": 0.0, "num_input_tokens_seen": 184211840, "step": 85415 }, { "epoch": 15.67627087538998, "grad_norm": 0.05040529742836952, "learning_rate": 1.3574950483226757e-06, "loss": 0.0, "num_input_tokens_seen": 184222336, "step": 85420 }, { "epoch": 15.677188474949531, "grad_norm": 0.00020478676015045494, "learning_rate": 1.356946540881509e-06, "loss": 0.0, "num_input_tokens_seen": 184234016, "step": 85425 }, { "epoch": 15.678106074509085, "grad_norm": 0.007441167254000902, "learning_rate": 1.3563981268790182e-06, "loss": 0.0, "num_input_tokens_seen": 184243648, "step": 85430 }, { "epoch": 15.679023674068636, "grad_norm": 0.0002936551463790238, "learning_rate": 1.3558498063292674e-06, "loss": 0.0, "num_input_tokens_seen": 184255456, "step": 85435 }, { "epoch": 15.679941273628188, "grad_norm": 0.00014559680130332708, "learning_rate": 1.3553015792463202e-06, "loss": 0.0, "num_input_tokens_seen": 184266560, "step": 85440 }, { "epoch": 15.680858873187741, "grad_norm": 0.0029755241703242064, "learning_rate": 1.35475344564424e-06, "loss": 0.0, "num_input_tokens_seen": 184276960, "step": 85445 }, { "epoch": 15.681776472747293, "grad_norm": 0.00015942785830702633, "learning_rate": 1.3542054055370846e-06, "loss": 0.0, "num_input_tokens_seen": 184289152, "step": 85450 }, { "epoch": 15.682694072306845, "grad_norm": 0.0019615625496953726, "learning_rate": 1.35365745893891e-06, "loss": 0.0, "num_input_tokens_seen": 184298016, "step": 85455 }, { "epoch": 15.683611671866398, "grad_norm": 0.0001787388900993392, "learning_rate": 1.3531096058637682e-06, "loss": 0.0, "num_input_tokens_seen": 184308096, "step": 85460 }, { "epoch": 15.68452927142595, "grad_norm": 0.0006014166865497828, "learning_rate": 1.3525618463257151e-06, "loss": 0.0, "num_input_tokens_seen": 184318176, "step": 85465 }, { "epoch": 15.685446870985501, "grad_norm": 6.670199945801869e-05, "learning_rate": 1.352014180338797e-06, "loss": 0.0, "num_input_tokens_seen": 184329312, "step": 85470 }, { "epoch": 15.686364470545055, "grad_norm": 0.00013478854089044034, "learning_rate": 1.3514666079170618e-06, "loss": 0.0, "num_input_tokens_seen": 184338784, "step": 85475 }, { "epoch": 15.687282070104606, "grad_norm": 0.0009891930967569351, "learning_rate": 1.3509191290745515e-06, "loss": 0.0, "num_input_tokens_seen": 184349920, "step": 85480 }, { "epoch": 15.688199669664158, "grad_norm": 6.289032171480358e-05, "learning_rate": 1.3503717438253118e-06, "loss": 0.0, "num_input_tokens_seen": 184360224, "step": 85485 }, { "epoch": 15.689117269223711, "grad_norm": 9.44171697483398e-05, "learning_rate": 1.3498244521833803e-06, "loss": 0.0, "num_input_tokens_seen": 184371616, "step": 85490 }, { "epoch": 15.690034868783263, "grad_norm": 0.0003803895669989288, "learning_rate": 1.349277254162793e-06, "loss": 0.0, "num_input_tokens_seen": 184382400, "step": 85495 }, { "epoch": 15.690952468342815, "grad_norm": 0.005304589867591858, "learning_rate": 1.3487301497775874e-06, "loss": 0.0, "num_input_tokens_seen": 184393888, "step": 85500 }, { "epoch": 15.691870067902368, "grad_norm": 0.0004987255670130253, "learning_rate": 1.3481831390417943e-06, "loss": 0.0, "num_input_tokens_seen": 184405440, "step": 85505 }, { "epoch": 15.69278766746192, "grad_norm": 0.00016203790437430143, "learning_rate": 1.3476362219694445e-06, "loss": 0.0, "num_input_tokens_seen": 184415168, "step": 85510 }, { "epoch": 15.693705267021471, "grad_norm": 0.21149852871894836, "learning_rate": 1.347089398574563e-06, "loss": 0.0001, "num_input_tokens_seen": 184425664, "step": 85515 }, { "epoch": 15.694622866581025, "grad_norm": 0.00011574099335120991, "learning_rate": 1.3465426688711786e-06, "loss": 0.0, "num_input_tokens_seen": 184436800, "step": 85520 }, { "epoch": 15.695540466140576, "grad_norm": 0.00025476468726992607, "learning_rate": 1.3459960328733118e-06, "loss": 0.0, "num_input_tokens_seen": 184446496, "step": 85525 }, { "epoch": 15.696458065700128, "grad_norm": 0.00014424520486500114, "learning_rate": 1.3454494905949829e-06, "loss": 0.0, "num_input_tokens_seen": 184456640, "step": 85530 }, { "epoch": 15.697375665259681, "grad_norm": 0.0008232873515225947, "learning_rate": 1.3449030420502113e-06, "loss": 0.0, "num_input_tokens_seen": 184467104, "step": 85535 }, { "epoch": 15.698293264819233, "grad_norm": 8.74199322424829e-05, "learning_rate": 1.3443566872530122e-06, "loss": 0.0, "num_input_tokens_seen": 184477696, "step": 85540 }, { "epoch": 15.699210864378784, "grad_norm": 0.00014725020446348935, "learning_rate": 1.3438104262173968e-06, "loss": 0.0, "num_input_tokens_seen": 184489120, "step": 85545 }, { "epoch": 15.700128463938338, "grad_norm": 0.00036467413883656263, "learning_rate": 1.3432642589573792e-06, "loss": 0.0, "num_input_tokens_seen": 184500032, "step": 85550 }, { "epoch": 15.70104606349789, "grad_norm": 0.0001116149069275707, "learning_rate": 1.3427181854869653e-06, "loss": 0.0, "num_input_tokens_seen": 184509888, "step": 85555 }, { "epoch": 15.701963663057441, "grad_norm": 0.00017745784134604037, "learning_rate": 1.342172205820162e-06, "loss": 0.0883, "num_input_tokens_seen": 184521280, "step": 85560 }, { "epoch": 15.702881262616994, "grad_norm": 0.0005580254946835339, "learning_rate": 1.341626319970971e-06, "loss": 0.0, "num_input_tokens_seen": 184531168, "step": 85565 }, { "epoch": 15.703798862176546, "grad_norm": 0.0023490760941058397, "learning_rate": 1.341080527953396e-06, "loss": 0.0, "num_input_tokens_seen": 184542688, "step": 85570 }, { "epoch": 15.704716461736098, "grad_norm": 0.00014028180157765746, "learning_rate": 1.3405348297814353e-06, "loss": 0.0, "num_input_tokens_seen": 184553408, "step": 85575 }, { "epoch": 15.705634061295651, "grad_norm": 0.020060963928699493, "learning_rate": 1.3399892254690827e-06, "loss": 0.0, "num_input_tokens_seen": 184564992, "step": 85580 }, { "epoch": 15.706551660855203, "grad_norm": 0.0003744390851352364, "learning_rate": 1.3394437150303358e-06, "loss": 0.0, "num_input_tokens_seen": 184576320, "step": 85585 }, { "epoch": 15.707469260414754, "grad_norm": 0.00010326479969080538, "learning_rate": 1.3388982984791837e-06, "loss": 0.0, "num_input_tokens_seen": 184586592, "step": 85590 }, { "epoch": 15.708386859974308, "grad_norm": 0.0002110339846694842, "learning_rate": 1.3383529758296154e-06, "loss": 0.0, "num_input_tokens_seen": 184597024, "step": 85595 }, { "epoch": 15.70930445953386, "grad_norm": 0.00010246732563246042, "learning_rate": 1.3378077470956192e-06, "loss": 0.0, "num_input_tokens_seen": 184608032, "step": 85600 }, { "epoch": 15.710222059093411, "grad_norm": 8.05163144832477e-05, "learning_rate": 1.337262612291178e-06, "loss": 0.0002, "num_input_tokens_seen": 184618624, "step": 85605 }, { "epoch": 15.711139658652964, "grad_norm": 0.0010554007021710277, "learning_rate": 1.336717571430275e-06, "loss": 0.0, "num_input_tokens_seen": 184628800, "step": 85610 }, { "epoch": 15.712057258212516, "grad_norm": 0.00016315838729497045, "learning_rate": 1.3361726245268863e-06, "loss": 0.0, "num_input_tokens_seen": 184639488, "step": 85615 }, { "epoch": 15.712974857772068, "grad_norm": 0.0001791619579307735, "learning_rate": 1.3356277715949934e-06, "loss": 0.0, "num_input_tokens_seen": 184651104, "step": 85620 }, { "epoch": 15.713892457331621, "grad_norm": 0.00010183959238929674, "learning_rate": 1.3350830126485686e-06, "loss": 0.0, "num_input_tokens_seen": 184662272, "step": 85625 }, { "epoch": 15.714810056891173, "grad_norm": 0.0002902355627156794, "learning_rate": 1.3345383477015844e-06, "loss": 0.0, "num_input_tokens_seen": 184673504, "step": 85630 }, { "epoch": 15.715727656450724, "grad_norm": 6.843732990091667e-05, "learning_rate": 1.3339937767680094e-06, "loss": 0.0, "num_input_tokens_seen": 184684160, "step": 85635 }, { "epoch": 15.716645256010278, "grad_norm": 7.766928320052102e-05, "learning_rate": 1.3334492998618137e-06, "loss": 0.0, "num_input_tokens_seen": 184694112, "step": 85640 }, { "epoch": 15.71756285556983, "grad_norm": 0.0002806026895996183, "learning_rate": 1.332904916996961e-06, "loss": 0.0, "num_input_tokens_seen": 184704736, "step": 85645 }, { "epoch": 15.71848045512938, "grad_norm": 8.438366057816893e-05, "learning_rate": 1.3323606281874135e-06, "loss": 0.0, "num_input_tokens_seen": 184716672, "step": 85650 }, { "epoch": 15.719398054688934, "grad_norm": 0.0010459936456754804, "learning_rate": 1.3318164334471312e-06, "loss": 0.0, "num_input_tokens_seen": 184728000, "step": 85655 }, { "epoch": 15.720315654248486, "grad_norm": 0.010158409364521503, "learning_rate": 1.3312723327900711e-06, "loss": 0.0, "num_input_tokens_seen": 184738368, "step": 85660 }, { "epoch": 15.721233253808037, "grad_norm": 0.013245908543467522, "learning_rate": 1.3307283262301912e-06, "loss": 0.0, "num_input_tokens_seen": 184749216, "step": 85665 }, { "epoch": 15.72215085336759, "grad_norm": 0.00011124477168777958, "learning_rate": 1.3301844137814428e-06, "loss": 0.0, "num_input_tokens_seen": 184760256, "step": 85670 }, { "epoch": 15.723068452927143, "grad_norm": 8.539680857211351e-05, "learning_rate": 1.3296405954577763e-06, "loss": 0.0, "num_input_tokens_seen": 184770784, "step": 85675 }, { "epoch": 15.723986052486694, "grad_norm": 0.0034160767681896687, "learning_rate": 1.3290968712731384e-06, "loss": 0.0, "num_input_tokens_seen": 184781760, "step": 85680 }, { "epoch": 15.724903652046248, "grad_norm": 9.792511264095083e-05, "learning_rate": 1.328553241241478e-06, "loss": 0.0, "num_input_tokens_seen": 184792672, "step": 85685 }, { "epoch": 15.7258212516058, "grad_norm": 0.00025681726401671767, "learning_rate": 1.3280097053767372e-06, "loss": 0.0, "num_input_tokens_seen": 184803520, "step": 85690 }, { "epoch": 15.72673885116535, "grad_norm": 0.0001396374573232606, "learning_rate": 1.327466263692856e-06, "loss": 0.0, "num_input_tokens_seen": 184815808, "step": 85695 }, { "epoch": 15.727656450724904, "grad_norm": 0.0001048503108904697, "learning_rate": 1.3269229162037716e-06, "loss": 0.0, "num_input_tokens_seen": 184827360, "step": 85700 }, { "epoch": 15.728574050284456, "grad_norm": 0.0001674234663369134, "learning_rate": 1.3263796629234233e-06, "loss": 0.0, "num_input_tokens_seen": 184839840, "step": 85705 }, { "epoch": 15.729491649844007, "grad_norm": 0.00010449133696965873, "learning_rate": 1.3258365038657433e-06, "loss": 0.0, "num_input_tokens_seen": 184849408, "step": 85710 }, { "epoch": 15.73040924940356, "grad_norm": 0.00011629660002654418, "learning_rate": 1.3252934390446603e-06, "loss": 0.0, "num_input_tokens_seen": 184860736, "step": 85715 }, { "epoch": 15.731326848963112, "grad_norm": 0.00022185129637364298, "learning_rate": 1.3247504684741075e-06, "loss": 0.0, "num_input_tokens_seen": 184871456, "step": 85720 }, { "epoch": 15.732244448522664, "grad_norm": 0.00011461057147243991, "learning_rate": 1.3242075921680092e-06, "loss": 0.0, "num_input_tokens_seen": 184880864, "step": 85725 }, { "epoch": 15.733162048082217, "grad_norm": 0.00012626398529391736, "learning_rate": 1.323664810140287e-06, "loss": 0.0, "num_input_tokens_seen": 184891040, "step": 85730 }, { "epoch": 15.734079647641769, "grad_norm": 0.00011700570030370727, "learning_rate": 1.3231221224048668e-06, "loss": 0.0, "num_input_tokens_seen": 184901216, "step": 85735 }, { "epoch": 15.73499724720132, "grad_norm": 7.068607374094427e-05, "learning_rate": 1.322579528975665e-06, "loss": 0.0, "num_input_tokens_seen": 184912288, "step": 85740 }, { "epoch": 15.735914846760874, "grad_norm": 0.00012992419942747802, "learning_rate": 1.3220370298665992e-06, "loss": 0.0, "num_input_tokens_seen": 184923808, "step": 85745 }, { "epoch": 15.736832446320426, "grad_norm": 0.00012307034921832383, "learning_rate": 1.321494625091581e-06, "loss": 0.0, "num_input_tokens_seen": 184934688, "step": 85750 }, { "epoch": 15.737750045879977, "grad_norm": 0.0008564505842514336, "learning_rate": 1.320952314664527e-06, "loss": 0.0, "num_input_tokens_seen": 184944288, "step": 85755 }, { "epoch": 15.73866764543953, "grad_norm": 0.00040699681267142296, "learning_rate": 1.3204100985993435e-06, "loss": 0.0, "num_input_tokens_seen": 184954848, "step": 85760 }, { "epoch": 15.739585244999082, "grad_norm": 0.003345905104652047, "learning_rate": 1.3198679769099365e-06, "loss": 0.0, "num_input_tokens_seen": 184964800, "step": 85765 }, { "epoch": 15.740502844558634, "grad_norm": 0.0015791401965543628, "learning_rate": 1.319325949610214e-06, "loss": 0.0, "num_input_tokens_seen": 184974688, "step": 85770 }, { "epoch": 15.741420444118187, "grad_norm": 0.00033080618595704436, "learning_rate": 1.3187840167140763e-06, "loss": 0.0, "num_input_tokens_seen": 184986432, "step": 85775 }, { "epoch": 15.742338043677739, "grad_norm": 0.0001127323048422113, "learning_rate": 1.318242178235422e-06, "loss": 0.001, "num_input_tokens_seen": 184997120, "step": 85780 }, { "epoch": 15.74325564323729, "grad_norm": 0.0018849943298846483, "learning_rate": 1.317700434188151e-06, "loss": 0.0, "num_input_tokens_seen": 185007040, "step": 85785 }, { "epoch": 15.744173242796844, "grad_norm": 0.002619158010929823, "learning_rate": 1.3171587845861566e-06, "loss": 0.0, "num_input_tokens_seen": 185018304, "step": 85790 }, { "epoch": 15.745090842356396, "grad_norm": 0.0001108414217014797, "learning_rate": 1.3166172294433315e-06, "loss": 0.0, "num_input_tokens_seen": 185028448, "step": 85795 }, { "epoch": 15.746008441915947, "grad_norm": 0.0008382722153328359, "learning_rate": 1.3160757687735642e-06, "loss": 0.0, "num_input_tokens_seen": 185037856, "step": 85800 }, { "epoch": 15.7469260414755, "grad_norm": 0.00040493992855772376, "learning_rate": 1.3155344025907458e-06, "loss": 0.0, "num_input_tokens_seen": 185046752, "step": 85805 }, { "epoch": 15.747843641035052, "grad_norm": 0.0056908647529780865, "learning_rate": 1.3149931309087594e-06, "loss": 0.0, "num_input_tokens_seen": 185058304, "step": 85810 }, { "epoch": 15.748761240594604, "grad_norm": 0.0001789511152310297, "learning_rate": 1.3144519537414862e-06, "loss": 0.0, "num_input_tokens_seen": 185069824, "step": 85815 }, { "epoch": 15.749678840154157, "grad_norm": 0.00012590714322868735, "learning_rate": 1.3139108711028099e-06, "loss": 0.0, "num_input_tokens_seen": 185081760, "step": 85820 }, { "epoch": 15.750596439713709, "grad_norm": 9.848801710177213e-05, "learning_rate": 1.3133698830066066e-06, "loss": 0.0, "num_input_tokens_seen": 185090272, "step": 85825 }, { "epoch": 15.75151403927326, "grad_norm": 0.0002212027902714908, "learning_rate": 1.3128289894667524e-06, "loss": 0.0, "num_input_tokens_seen": 185101632, "step": 85830 }, { "epoch": 15.752431638832814, "grad_norm": 0.0002269458054797724, "learning_rate": 1.3122881904971186e-06, "loss": 0.0001, "num_input_tokens_seen": 185111008, "step": 85835 }, { "epoch": 15.753349238392365, "grad_norm": 8.17264590295963e-05, "learning_rate": 1.3117474861115786e-06, "loss": 0.0002, "num_input_tokens_seen": 185122560, "step": 85840 }, { "epoch": 15.754266837951917, "grad_norm": 0.0002652911643963307, "learning_rate": 1.3112068763239994e-06, "loss": 0.0, "num_input_tokens_seen": 185134432, "step": 85845 }, { "epoch": 15.75518443751147, "grad_norm": 0.00021486845798790455, "learning_rate": 1.3106663611482463e-06, "loss": 0.0, "num_input_tokens_seen": 185145728, "step": 85850 }, { "epoch": 15.756102037071022, "grad_norm": 7.745947368675843e-05, "learning_rate": 1.310125940598182e-06, "loss": 0.0, "num_input_tokens_seen": 185157120, "step": 85855 }, { "epoch": 15.757019636630574, "grad_norm": 0.0003262624959461391, "learning_rate": 1.3095856146876695e-06, "loss": 0.0, "num_input_tokens_seen": 185169152, "step": 85860 }, { "epoch": 15.757937236190127, "grad_norm": 0.00020411521836649626, "learning_rate": 1.3090453834305672e-06, "loss": 0.0, "num_input_tokens_seen": 185181792, "step": 85865 }, { "epoch": 15.758854835749679, "grad_norm": 0.0016797998687252402, "learning_rate": 1.30850524684073e-06, "loss": 0.0, "num_input_tokens_seen": 185192192, "step": 85870 }, { "epoch": 15.75977243530923, "grad_norm": 0.00036631352850236, "learning_rate": 1.307965204932012e-06, "loss": 0.0, "num_input_tokens_seen": 185204288, "step": 85875 }, { "epoch": 15.760690034868784, "grad_norm": 8.939990220824257e-05, "learning_rate": 1.3074252577182638e-06, "loss": 0.0, "num_input_tokens_seen": 185214784, "step": 85880 }, { "epoch": 15.761607634428335, "grad_norm": 7.842476770747453e-05, "learning_rate": 1.306885405213334e-06, "loss": 0.0, "num_input_tokens_seen": 185225280, "step": 85885 }, { "epoch": 15.762525233987887, "grad_norm": 5.2697083447128534e-05, "learning_rate": 1.306345647431071e-06, "loss": 0.0, "num_input_tokens_seen": 185235200, "step": 85890 }, { "epoch": 15.76344283354744, "grad_norm": 0.00010585693962639198, "learning_rate": 1.3058059843853171e-06, "loss": 0.0, "num_input_tokens_seen": 185245376, "step": 85895 }, { "epoch": 15.764360433106992, "grad_norm": 8.408076246269047e-05, "learning_rate": 1.3052664160899131e-06, "loss": 0.0, "num_input_tokens_seen": 185256320, "step": 85900 }, { "epoch": 15.765278032666544, "grad_norm": 0.00021437475515995175, "learning_rate": 1.3047269425587005e-06, "loss": 0.0, "num_input_tokens_seen": 185266816, "step": 85905 }, { "epoch": 15.766195632226097, "grad_norm": 0.0002112173242494464, "learning_rate": 1.3041875638055152e-06, "loss": 0.0, "num_input_tokens_seen": 185278560, "step": 85910 }, { "epoch": 15.767113231785649, "grad_norm": 0.0002960615383926779, "learning_rate": 1.303648279844189e-06, "loss": 0.0, "num_input_tokens_seen": 185288384, "step": 85915 }, { "epoch": 15.7680308313452, "grad_norm": 0.0001840488548623398, "learning_rate": 1.303109090688557e-06, "loss": 0.0, "num_input_tokens_seen": 185298048, "step": 85920 }, { "epoch": 15.768948430904754, "grad_norm": 0.0003561839403118938, "learning_rate": 1.3025699963524475e-06, "loss": 0.0, "num_input_tokens_seen": 185308768, "step": 85925 }, { "epoch": 15.769866030464305, "grad_norm": 0.00014637969434261322, "learning_rate": 1.3020309968496869e-06, "loss": 0.0, "num_input_tokens_seen": 185320256, "step": 85930 }, { "epoch": 15.770783630023857, "grad_norm": 0.0001382702321279794, "learning_rate": 1.3014920921940983e-06, "loss": 0.0, "num_input_tokens_seen": 185331296, "step": 85935 }, { "epoch": 15.77170122958341, "grad_norm": 0.0001060207505361177, "learning_rate": 1.300953282399507e-06, "loss": 0.0, "num_input_tokens_seen": 185342592, "step": 85940 }, { "epoch": 15.772618829142962, "grad_norm": 0.03223444148898125, "learning_rate": 1.3004145674797307e-06, "loss": 0.0, "num_input_tokens_seen": 185352960, "step": 85945 }, { "epoch": 15.773536428702513, "grad_norm": 9.349273750558496e-05, "learning_rate": 1.2998759474485856e-06, "loss": 0.0, "num_input_tokens_seen": 185363456, "step": 85950 }, { "epoch": 15.774454028262067, "grad_norm": 9.808806498767808e-05, "learning_rate": 1.2993374223198896e-06, "loss": 0.0, "num_input_tokens_seen": 185374688, "step": 85955 }, { "epoch": 15.775371627821619, "grad_norm": 0.0017485583666712046, "learning_rate": 1.2987989921074528e-06, "loss": 0.0, "num_input_tokens_seen": 185385504, "step": 85960 }, { "epoch": 15.77628922738117, "grad_norm": 0.0013362758327275515, "learning_rate": 1.2982606568250856e-06, "loss": 0.0, "num_input_tokens_seen": 185396096, "step": 85965 }, { "epoch": 15.777206826940724, "grad_norm": 0.000262622517766431, "learning_rate": 1.2977224164865943e-06, "loss": 0.0, "num_input_tokens_seen": 185407904, "step": 85970 }, { "epoch": 15.778124426500275, "grad_norm": 0.0001326005585724488, "learning_rate": 1.2971842711057858e-06, "loss": 0.0, "num_input_tokens_seen": 185419328, "step": 85975 }, { "epoch": 15.779042026059827, "grad_norm": 0.00022406851348932832, "learning_rate": 1.2966462206964624e-06, "loss": 0.0, "num_input_tokens_seen": 185430496, "step": 85980 }, { "epoch": 15.77995962561938, "grad_norm": 0.00011474838538561016, "learning_rate": 1.296108265272422e-06, "loss": 0.0, "num_input_tokens_seen": 185440768, "step": 85985 }, { "epoch": 15.780877225178932, "grad_norm": 0.00017878241487778723, "learning_rate": 1.2955704048474655e-06, "loss": 0.0, "num_input_tokens_seen": 185452128, "step": 85990 }, { "epoch": 15.781794824738483, "grad_norm": 0.0001666909083724022, "learning_rate": 1.2950326394353874e-06, "loss": 0.0, "num_input_tokens_seen": 185463648, "step": 85995 }, { "epoch": 15.782712424298037, "grad_norm": 8.75846526469104e-05, "learning_rate": 1.2944949690499776e-06, "loss": 0.0, "num_input_tokens_seen": 185474240, "step": 86000 }, { "epoch": 15.783630023857588, "grad_norm": 9.243081149179488e-05, "learning_rate": 1.293957393705031e-06, "loss": 0.0, "num_input_tokens_seen": 185485248, "step": 86005 }, { "epoch": 15.78454762341714, "grad_norm": 0.00048357207560911775, "learning_rate": 1.2934199134143326e-06, "loss": 0.0, "num_input_tokens_seen": 185496032, "step": 86010 }, { "epoch": 15.785465222976693, "grad_norm": 0.0007874297443777323, "learning_rate": 1.2928825281916697e-06, "loss": 0.0, "num_input_tokens_seen": 185507296, "step": 86015 }, { "epoch": 15.786382822536245, "grad_norm": 7.393453415716067e-05, "learning_rate": 1.2923452380508223e-06, "loss": 0.0, "num_input_tokens_seen": 185518112, "step": 86020 }, { "epoch": 15.787300422095797, "grad_norm": 0.00010996215132763609, "learning_rate": 1.291808043005575e-06, "loss": 0.0, "num_input_tokens_seen": 185530368, "step": 86025 }, { "epoch": 15.78821802165535, "grad_norm": 0.00022746033209841698, "learning_rate": 1.291270943069704e-06, "loss": 0.0, "num_input_tokens_seen": 185541760, "step": 86030 }, { "epoch": 15.789135621214902, "grad_norm": 9.962468902813271e-05, "learning_rate": 1.290733938256984e-06, "loss": 0.0, "num_input_tokens_seen": 185552992, "step": 86035 }, { "epoch": 15.790053220774453, "grad_norm": 0.00011780011118389666, "learning_rate": 1.290197028581191e-06, "loss": 0.0, "num_input_tokens_seen": 185563712, "step": 86040 }, { "epoch": 15.790970820334007, "grad_norm": 0.0003475966805126518, "learning_rate": 1.289660214056095e-06, "loss": 0.0, "num_input_tokens_seen": 185575776, "step": 86045 }, { "epoch": 15.791888419893558, "grad_norm": 0.00429270276799798, "learning_rate": 1.2891234946954617e-06, "loss": 0.0006, "num_input_tokens_seen": 185586432, "step": 86050 }, { "epoch": 15.79280601945311, "grad_norm": 0.008784985169768333, "learning_rate": 1.2885868705130617e-06, "loss": 0.0, "num_input_tokens_seen": 185596288, "step": 86055 }, { "epoch": 15.793723619012663, "grad_norm": 0.0001178133679786697, "learning_rate": 1.288050341522656e-06, "loss": 0.0, "num_input_tokens_seen": 185608064, "step": 86060 }, { "epoch": 15.794641218572215, "grad_norm": 0.00016308831982314587, "learning_rate": 1.2875139077380055e-06, "loss": 0.0, "num_input_tokens_seen": 185618496, "step": 86065 }, { "epoch": 15.795558818131767, "grad_norm": 0.00013080061762593687, "learning_rate": 1.2869775691728703e-06, "loss": 0.0, "num_input_tokens_seen": 185629536, "step": 86070 }, { "epoch": 15.79647641769132, "grad_norm": 0.0002167108323192224, "learning_rate": 1.2864413258410052e-06, "loss": 0.0, "num_input_tokens_seen": 185638400, "step": 86075 }, { "epoch": 15.797394017250872, "grad_norm": 0.0001273451780434698, "learning_rate": 1.2859051777561631e-06, "loss": 0.0, "num_input_tokens_seen": 185649344, "step": 86080 }, { "epoch": 15.798311616810423, "grad_norm": 0.00011525011359481141, "learning_rate": 1.2853691249320988e-06, "loss": 0.0, "num_input_tokens_seen": 185659776, "step": 86085 }, { "epoch": 15.799229216369977, "grad_norm": 7.946549158077687e-05, "learning_rate": 1.2848331673825587e-06, "loss": 0.0822, "num_input_tokens_seen": 185669600, "step": 86090 }, { "epoch": 15.800146815929528, "grad_norm": 0.00012651371071115136, "learning_rate": 1.2842973051212905e-06, "loss": 0.0, "num_input_tokens_seen": 185680160, "step": 86095 }, { "epoch": 15.80106441548908, "grad_norm": 8.064808935159817e-05, "learning_rate": 1.2837615381620371e-06, "loss": 0.0, "num_input_tokens_seen": 185691264, "step": 86100 }, { "epoch": 15.801982015048633, "grad_norm": 0.00011054803326260298, "learning_rate": 1.2832258665185392e-06, "loss": 0.0, "num_input_tokens_seen": 185700992, "step": 86105 }, { "epoch": 15.802899614608185, "grad_norm": 5.87476315558888e-05, "learning_rate": 1.2826902902045391e-06, "loss": 0.0, "num_input_tokens_seen": 185710848, "step": 86110 }, { "epoch": 15.803817214167736, "grad_norm": 0.0004121051460970193, "learning_rate": 1.2821548092337716e-06, "loss": 0.0, "num_input_tokens_seen": 185721760, "step": 86115 }, { "epoch": 15.80473481372729, "grad_norm": 0.0001287520135520026, "learning_rate": 1.2816194236199697e-06, "loss": 0.0, "num_input_tokens_seen": 185731136, "step": 86120 }, { "epoch": 15.805652413286841, "grad_norm": 0.00017642966122366488, "learning_rate": 1.281084133376868e-06, "loss": 0.0, "num_input_tokens_seen": 185740672, "step": 86125 }, { "epoch": 15.806570012846393, "grad_norm": 0.00013486036914400756, "learning_rate": 1.2805489385181946e-06, "loss": 0.0, "num_input_tokens_seen": 185751744, "step": 86130 }, { "epoch": 15.807487612405946, "grad_norm": 0.0038632138166576624, "learning_rate": 1.280013839057675e-06, "loss": 0.0, "num_input_tokens_seen": 185761728, "step": 86135 }, { "epoch": 15.808405211965498, "grad_norm": 0.31243446469306946, "learning_rate": 1.279478835009036e-06, "loss": 0.0001, "num_input_tokens_seen": 185771712, "step": 86140 }, { "epoch": 15.80932281152505, "grad_norm": 8.288133540190756e-05, "learning_rate": 1.2789439263859987e-06, "loss": 0.0, "num_input_tokens_seen": 185783104, "step": 86145 }, { "epoch": 15.810240411084603, "grad_norm": 5.791685907752253e-05, "learning_rate": 1.278409113202283e-06, "loss": 0.0, "num_input_tokens_seen": 185794272, "step": 86150 }, { "epoch": 15.811158010644155, "grad_norm": 0.00024268795095849782, "learning_rate": 1.2778743954716032e-06, "loss": 0.0, "num_input_tokens_seen": 185805152, "step": 86155 }, { "epoch": 15.812075610203706, "grad_norm": 9.414846863364801e-05, "learning_rate": 1.2773397732076787e-06, "loss": 0.0, "num_input_tokens_seen": 185815072, "step": 86160 }, { "epoch": 15.81299320976326, "grad_norm": 0.0005425005219876766, "learning_rate": 1.2768052464242193e-06, "loss": 0.0, "num_input_tokens_seen": 185826208, "step": 86165 }, { "epoch": 15.813910809322811, "grad_norm": 0.00014820045907981694, "learning_rate": 1.2762708151349335e-06, "loss": 0.0, "num_input_tokens_seen": 185836672, "step": 86170 }, { "epoch": 15.814828408882363, "grad_norm": 0.00016215928189922124, "learning_rate": 1.275736479353531e-06, "loss": 0.0, "num_input_tokens_seen": 185846016, "step": 86175 }, { "epoch": 15.815746008441916, "grad_norm": 0.004018701147288084, "learning_rate": 1.2752022390937158e-06, "loss": 0.0, "num_input_tokens_seen": 185856608, "step": 86180 }, { "epoch": 15.816663608001468, "grad_norm": 7.007485692156479e-05, "learning_rate": 1.2746680943691892e-06, "loss": 0.0, "num_input_tokens_seen": 185866176, "step": 86185 }, { "epoch": 15.81758120756102, "grad_norm": 0.0002534916566219181, "learning_rate": 1.2741340451936535e-06, "loss": 0.0, "num_input_tokens_seen": 185876640, "step": 86190 }, { "epoch": 15.818498807120573, "grad_norm": 0.0002571841177996248, "learning_rate": 1.273600091580805e-06, "loss": 0.0, "num_input_tokens_seen": 185887232, "step": 86195 }, { "epoch": 15.819416406680125, "grad_norm": 6.063331602490507e-05, "learning_rate": 1.2730662335443389e-06, "loss": 0.0, "num_input_tokens_seen": 185899040, "step": 86200 }, { "epoch": 15.820334006239676, "grad_norm": 5.5609416449442506e-05, "learning_rate": 1.2725324710979459e-06, "loss": 0.0, "num_input_tokens_seen": 185909984, "step": 86205 }, { "epoch": 15.82125160579923, "grad_norm": 8.763982772827148, "learning_rate": 1.2719988042553194e-06, "loss": 0.0042, "num_input_tokens_seen": 185921792, "step": 86210 }, { "epoch": 15.822169205358781, "grad_norm": 8.706439984962344e-05, "learning_rate": 1.2714652330301457e-06, "loss": 0.0, "num_input_tokens_seen": 185931904, "step": 86215 }, { "epoch": 15.823086804918333, "grad_norm": 0.00013709628547076136, "learning_rate": 1.2709317574361092e-06, "loss": 0.0, "num_input_tokens_seen": 185941600, "step": 86220 }, { "epoch": 15.824004404477886, "grad_norm": 0.0019465894438326359, "learning_rate": 1.2703983774868945e-06, "loss": 0.0, "num_input_tokens_seen": 185952672, "step": 86225 }, { "epoch": 15.824922004037438, "grad_norm": 0.00768860662356019, "learning_rate": 1.2698650931961815e-06, "loss": 0.0, "num_input_tokens_seen": 185963072, "step": 86230 }, { "epoch": 15.82583960359699, "grad_norm": 0.016429128125309944, "learning_rate": 1.269331904577646e-06, "loss": 0.0, "num_input_tokens_seen": 185973792, "step": 86235 }, { "epoch": 15.826757203156543, "grad_norm": 5.406019408837892e-05, "learning_rate": 1.2687988116449663e-06, "loss": 0.0, "num_input_tokens_seen": 185984288, "step": 86240 }, { "epoch": 15.827674802716095, "grad_norm": 0.0002355387550778687, "learning_rate": 1.2682658144118144e-06, "loss": 0.0, "num_input_tokens_seen": 185994592, "step": 86245 }, { "epoch": 15.828592402275646, "grad_norm": 0.001840526470914483, "learning_rate": 1.2677329128918608e-06, "loss": 0.0, "num_input_tokens_seen": 186005856, "step": 86250 }, { "epoch": 15.8295100018352, "grad_norm": 0.0002630430390127003, "learning_rate": 1.2672001070987716e-06, "loss": 0.0, "num_input_tokens_seen": 186016192, "step": 86255 }, { "epoch": 15.830427601394751, "grad_norm": 0.012320230714976788, "learning_rate": 1.2666673970462163e-06, "loss": 0.0, "num_input_tokens_seen": 186027456, "step": 86260 }, { "epoch": 15.831345200954303, "grad_norm": 0.00012980848259758204, "learning_rate": 1.266134782747856e-06, "loss": 0.0, "num_input_tokens_seen": 186037504, "step": 86265 }, { "epoch": 15.832262800513856, "grad_norm": 8.620841254014522e-05, "learning_rate": 1.2656022642173516e-06, "loss": 0.0, "num_input_tokens_seen": 186047392, "step": 86270 }, { "epoch": 15.833180400073408, "grad_norm": 0.00046815979294478893, "learning_rate": 1.2650698414683598e-06, "loss": 0.0079, "num_input_tokens_seen": 186057952, "step": 86275 }, { "epoch": 15.83409799963296, "grad_norm": 0.00012281318777240813, "learning_rate": 1.2645375145145395e-06, "loss": 0.0328, "num_input_tokens_seen": 186068672, "step": 86280 }, { "epoch": 15.835015599192513, "grad_norm": 0.00019195544882677495, "learning_rate": 1.2640052833695426e-06, "loss": 0.0, "num_input_tokens_seen": 186080224, "step": 86285 }, { "epoch": 15.835933198752064, "grad_norm": 0.00016019781469367445, "learning_rate": 1.2634731480470197e-06, "loss": 0.0, "num_input_tokens_seen": 186091456, "step": 86290 }, { "epoch": 15.836850798311616, "grad_norm": 0.04023533686995506, "learning_rate": 1.2629411085606196e-06, "loss": 0.0, "num_input_tokens_seen": 186102240, "step": 86295 }, { "epoch": 15.83776839787117, "grad_norm": 0.00010013412975240499, "learning_rate": 1.2624091649239867e-06, "loss": 0.0, "num_input_tokens_seen": 186113504, "step": 86300 }, { "epoch": 15.838685997430721, "grad_norm": 0.0003090373647864908, "learning_rate": 1.261877317150767e-06, "loss": 0.0, "num_input_tokens_seen": 186124864, "step": 86305 }, { "epoch": 15.839603596990273, "grad_norm": 0.00011648439976852387, "learning_rate": 1.2613455652546009e-06, "loss": 0.0, "num_input_tokens_seen": 186137088, "step": 86310 }, { "epoch": 15.840521196549826, "grad_norm": 0.00012652526493184268, "learning_rate": 1.2608139092491268e-06, "loss": 0.0, "num_input_tokens_seen": 186148480, "step": 86315 }, { "epoch": 15.841438796109378, "grad_norm": 7.469207776011899e-05, "learning_rate": 1.2602823491479793e-06, "loss": 0.0, "num_input_tokens_seen": 186159360, "step": 86320 }, { "epoch": 15.84235639566893, "grad_norm": 0.15425705909729004, "learning_rate": 1.2597508849647944e-06, "loss": 0.0001, "num_input_tokens_seen": 186169888, "step": 86325 }, { "epoch": 15.843273995228483, "grad_norm": 0.0044199880212545395, "learning_rate": 1.2592195167132032e-06, "loss": 0.0, "num_input_tokens_seen": 186180640, "step": 86330 }, { "epoch": 15.844191594788034, "grad_norm": 0.002994564129039645, "learning_rate": 1.2586882444068331e-06, "loss": 0.0, "num_input_tokens_seen": 186190400, "step": 86335 }, { "epoch": 15.845109194347586, "grad_norm": 0.0018487329361960292, "learning_rate": 1.2581570680593097e-06, "loss": 0.0, "num_input_tokens_seen": 186201856, "step": 86340 }, { "epoch": 15.84602679390714, "grad_norm": 0.00012489352957345545, "learning_rate": 1.25762598768426e-06, "loss": 0.0, "num_input_tokens_seen": 186213472, "step": 86345 }, { "epoch": 15.846944393466691, "grad_norm": 0.00021336802456062287, "learning_rate": 1.2570950032953034e-06, "loss": 0.0001, "num_input_tokens_seen": 186224064, "step": 86350 }, { "epoch": 15.847861993026243, "grad_norm": 0.00013437036250252277, "learning_rate": 1.256564114906057e-06, "loss": 0.0, "num_input_tokens_seen": 186232704, "step": 86355 }, { "epoch": 15.848779592585796, "grad_norm": 0.04707679897546768, "learning_rate": 1.2560333225301413e-06, "loss": 0.0, "num_input_tokens_seen": 186244000, "step": 86360 }, { "epoch": 15.849697192145348, "grad_norm": 0.00019713288929779083, "learning_rate": 1.2555026261811682e-06, "loss": 0.0, "num_input_tokens_seen": 186254528, "step": 86365 }, { "epoch": 15.8506147917049, "grad_norm": 0.00017691515677142888, "learning_rate": 1.2549720258727477e-06, "loss": 0.004, "num_input_tokens_seen": 186265216, "step": 86370 }, { "epoch": 15.851532391264453, "grad_norm": 0.00011861966049764305, "learning_rate": 1.2544415216184918e-06, "loss": 0.0, "num_input_tokens_seen": 186275424, "step": 86375 }, { "epoch": 15.852449990824004, "grad_norm": 0.0014399047940969467, "learning_rate": 1.2539111134320058e-06, "loss": 0.0, "num_input_tokens_seen": 186287072, "step": 86380 }, { "epoch": 15.853367590383556, "grad_norm": 0.00014920298417564481, "learning_rate": 1.2533808013268938e-06, "loss": 0.0, "num_input_tokens_seen": 186297664, "step": 86385 }, { "epoch": 15.85428518994311, "grad_norm": 0.0006123765488155186, "learning_rate": 1.2528505853167566e-06, "loss": 0.0, "num_input_tokens_seen": 186309696, "step": 86390 }, { "epoch": 15.85520278950266, "grad_norm": 8.357061597052962e-05, "learning_rate": 1.2523204654151955e-06, "loss": 0.0, "num_input_tokens_seen": 186321280, "step": 86395 }, { "epoch": 15.856120389062212, "grad_norm": 0.00012471224181354046, "learning_rate": 1.2517904416358056e-06, "loss": 0.0, "num_input_tokens_seen": 186332512, "step": 86400 }, { "epoch": 15.857037988621766, "grad_norm": 6.294394552242011e-05, "learning_rate": 1.2512605139921807e-06, "loss": 0.0, "num_input_tokens_seen": 186342304, "step": 86405 }, { "epoch": 15.857955588181317, "grad_norm": 0.00017666570784058422, "learning_rate": 1.2507306824979148e-06, "loss": 0.0, "num_input_tokens_seen": 186352000, "step": 86410 }, { "epoch": 15.858873187740869, "grad_norm": 0.000162647120305337, "learning_rate": 1.2502009471665966e-06, "loss": 0.0, "num_input_tokens_seen": 186363200, "step": 86415 }, { "epoch": 15.859790787300422, "grad_norm": 7.831749826436862e-05, "learning_rate": 1.2496713080118116e-06, "loss": 0.0, "num_input_tokens_seen": 186373536, "step": 86420 }, { "epoch": 15.860708386859974, "grad_norm": 0.000121179451525677, "learning_rate": 1.2491417650471444e-06, "loss": 0.0, "num_input_tokens_seen": 186385760, "step": 86425 }, { "epoch": 15.861625986419526, "grad_norm": 0.00012671198055613786, "learning_rate": 1.2486123182861788e-06, "loss": 0.0, "num_input_tokens_seen": 186397472, "step": 86430 }, { "epoch": 15.862543585979079, "grad_norm": 0.0007606149301864207, "learning_rate": 1.2480829677424933e-06, "loss": 0.0, "num_input_tokens_seen": 186409248, "step": 86435 }, { "epoch": 15.86346118553863, "grad_norm": 7.255312812048942e-05, "learning_rate": 1.2475537134296628e-06, "loss": 0.0, "num_input_tokens_seen": 186419040, "step": 86440 }, { "epoch": 15.864378785098182, "grad_norm": 0.00015403522411361337, "learning_rate": 1.2470245553612654e-06, "loss": 0.0, "num_input_tokens_seen": 186430304, "step": 86445 }, { "epoch": 15.865296384657736, "grad_norm": 0.00015230740245897323, "learning_rate": 1.2464954935508716e-06, "loss": 0.0, "num_input_tokens_seen": 186440512, "step": 86450 }, { "epoch": 15.866213984217287, "grad_norm": 0.0001010042178677395, "learning_rate": 1.2459665280120498e-06, "loss": 0.0, "num_input_tokens_seen": 186451744, "step": 86455 }, { "epoch": 15.867131583776839, "grad_norm": 0.00046757759992033243, "learning_rate": 1.2454376587583693e-06, "loss": 0.0, "num_input_tokens_seen": 186462496, "step": 86460 }, { "epoch": 15.868049183336392, "grad_norm": 0.0001864306250354275, "learning_rate": 1.244908885803394e-06, "loss": 0.0, "num_input_tokens_seen": 186473888, "step": 86465 }, { "epoch": 15.868966782895944, "grad_norm": 0.00030530180083587766, "learning_rate": 1.244380209160686e-06, "loss": 0.0, "num_input_tokens_seen": 186485888, "step": 86470 }, { "epoch": 15.869884382455496, "grad_norm": 0.0006832677754573524, "learning_rate": 1.2438516288438036e-06, "loss": 0.0, "num_input_tokens_seen": 186496768, "step": 86475 }, { "epoch": 15.870801982015049, "grad_norm": 0.0024301414377987385, "learning_rate": 1.2433231448663069e-06, "loss": 0.0, "num_input_tokens_seen": 186507840, "step": 86480 }, { "epoch": 15.8717195815746, "grad_norm": 0.00038201286224648356, "learning_rate": 1.2427947572417493e-06, "loss": 0.0, "num_input_tokens_seen": 186518976, "step": 86485 }, { "epoch": 15.872637181134152, "grad_norm": 9.502127068117261e-05, "learning_rate": 1.2422664659836824e-06, "loss": 0.0, "num_input_tokens_seen": 186528832, "step": 86490 }, { "epoch": 15.873554780693706, "grad_norm": 0.0027224430814385414, "learning_rate": 1.2417382711056558e-06, "loss": 0.0, "num_input_tokens_seen": 186539936, "step": 86495 }, { "epoch": 15.874472380253257, "grad_norm": 0.000523260619957, "learning_rate": 1.241210172621219e-06, "loss": 0.0, "num_input_tokens_seen": 186551008, "step": 86500 }, { "epoch": 15.875389979812809, "grad_norm": 7.784623448969796e-05, "learning_rate": 1.240682170543916e-06, "loss": 0.0, "num_input_tokens_seen": 186561376, "step": 86505 }, { "epoch": 15.876307579372362, "grad_norm": 0.00011200174776604399, "learning_rate": 1.2401542648872883e-06, "loss": 0.0, "num_input_tokens_seen": 186572736, "step": 86510 }, { "epoch": 15.877225178931914, "grad_norm": 0.00927029736340046, "learning_rate": 1.239626455664877e-06, "loss": 0.0063, "num_input_tokens_seen": 186581440, "step": 86515 }, { "epoch": 15.878142778491465, "grad_norm": 0.0008769484120421112, "learning_rate": 1.2390987428902175e-06, "loss": 0.0, "num_input_tokens_seen": 186592288, "step": 86520 }, { "epoch": 15.879060378051019, "grad_norm": 0.0001436338061466813, "learning_rate": 1.2385711265768474e-06, "loss": 0.0, "num_input_tokens_seen": 186602816, "step": 86525 }, { "epoch": 15.87997797761057, "grad_norm": 0.00012442110164556652, "learning_rate": 1.238043606738299e-06, "loss": 0.0, "num_input_tokens_seen": 186612288, "step": 86530 }, { "epoch": 15.880895577170122, "grad_norm": 0.0005077123641967773, "learning_rate": 1.2375161833881011e-06, "loss": 0.0, "num_input_tokens_seen": 186622848, "step": 86535 }, { "epoch": 15.881813176729676, "grad_norm": 0.0003193774027749896, "learning_rate": 1.2369888565397802e-06, "loss": 0.0, "num_input_tokens_seen": 186633056, "step": 86540 }, { "epoch": 15.882730776289227, "grad_norm": 0.16715988516807556, "learning_rate": 1.2364616262068646e-06, "loss": 0.0, "num_input_tokens_seen": 186644448, "step": 86545 }, { "epoch": 15.883648375848779, "grad_norm": 0.00016971326840575784, "learning_rate": 1.2359344924028754e-06, "loss": 0.0, "num_input_tokens_seen": 186655456, "step": 86550 }, { "epoch": 15.884565975408332, "grad_norm": 0.0008534127846360207, "learning_rate": 1.2354074551413314e-06, "loss": 0.0, "num_input_tokens_seen": 186665056, "step": 86555 }, { "epoch": 15.885483574967884, "grad_norm": 0.0001781680912245065, "learning_rate": 1.2348805144357528e-06, "loss": 0.0, "num_input_tokens_seen": 186676448, "step": 86560 }, { "epoch": 15.886401174527435, "grad_norm": 0.0014748319517821074, "learning_rate": 1.2343536702996534e-06, "loss": 0.0, "num_input_tokens_seen": 186687136, "step": 86565 }, { "epoch": 15.887318774086989, "grad_norm": 0.00019784964388236403, "learning_rate": 1.2338269227465467e-06, "loss": 0.0, "num_input_tokens_seen": 186697888, "step": 86570 }, { "epoch": 15.88823637364654, "grad_norm": 0.00012977977166883647, "learning_rate": 1.2333002717899405e-06, "loss": 0.0, "num_input_tokens_seen": 186709152, "step": 86575 }, { "epoch": 15.889153973206092, "grad_norm": 9.721134847495705e-05, "learning_rate": 1.2327737174433457e-06, "loss": 0.0, "num_input_tokens_seen": 186720448, "step": 86580 }, { "epoch": 15.890071572765645, "grad_norm": 8.380297367693856e-05, "learning_rate": 1.2322472597202667e-06, "loss": 0.0, "num_input_tokens_seen": 186732000, "step": 86585 }, { "epoch": 15.890989172325197, "grad_norm": 0.0029937089420855045, "learning_rate": 1.231720898634205e-06, "loss": 0.0, "num_input_tokens_seen": 186743648, "step": 86590 }, { "epoch": 15.891906771884749, "grad_norm": 0.0005130372010171413, "learning_rate": 1.2311946341986624e-06, "loss": 0.0, "num_input_tokens_seen": 186753600, "step": 86595 }, { "epoch": 15.892824371444302, "grad_norm": 0.005743419285863638, "learning_rate": 1.2306684664271374e-06, "loss": 0.0, "num_input_tokens_seen": 186766272, "step": 86600 }, { "epoch": 15.893741971003854, "grad_norm": 0.00014491192996501923, "learning_rate": 1.2301423953331237e-06, "loss": 0.0, "num_input_tokens_seen": 186778144, "step": 86605 }, { "epoch": 15.894659570563405, "grad_norm": 0.0010752786183729768, "learning_rate": 1.2296164209301132e-06, "loss": 0.0, "num_input_tokens_seen": 186790848, "step": 86610 }, { "epoch": 15.895577170122959, "grad_norm": 0.002260393463075161, "learning_rate": 1.2290905432315997e-06, "loss": 0.0, "num_input_tokens_seen": 186801344, "step": 86615 }, { "epoch": 15.89649476968251, "grad_norm": 9.886108455248177e-05, "learning_rate": 1.2285647622510693e-06, "loss": 0.0, "num_input_tokens_seen": 186812224, "step": 86620 }, { "epoch": 15.897412369242062, "grad_norm": 8.429033914580941e-05, "learning_rate": 1.2280390780020062e-06, "loss": 0.0, "num_input_tokens_seen": 186823392, "step": 86625 }, { "epoch": 15.898329968801615, "grad_norm": 8.697757584741339e-05, "learning_rate": 1.2275134904978958e-06, "loss": 0.0, "num_input_tokens_seen": 186833984, "step": 86630 }, { "epoch": 15.899247568361167, "grad_norm": 0.00018944732437375933, "learning_rate": 1.2269879997522182e-06, "loss": 0.0, "num_input_tokens_seen": 186844512, "step": 86635 }, { "epoch": 15.900165167920719, "grad_norm": 0.0016282773576676846, "learning_rate": 1.226462605778449e-06, "loss": 0.0, "num_input_tokens_seen": 186854624, "step": 86640 }, { "epoch": 15.901082767480272, "grad_norm": 0.0003467029018793255, "learning_rate": 1.2259373085900667e-06, "loss": 0.0, "num_input_tokens_seen": 186865824, "step": 86645 }, { "epoch": 15.902000367039824, "grad_norm": 7.380056194961071e-05, "learning_rate": 1.2254121082005431e-06, "loss": 0.0, "num_input_tokens_seen": 186876512, "step": 86650 }, { "epoch": 15.902917966599375, "grad_norm": 0.0005749135743826628, "learning_rate": 1.2248870046233495e-06, "loss": 0.0, "num_input_tokens_seen": 186887136, "step": 86655 }, { "epoch": 15.903835566158929, "grad_norm": 9.027004125528038e-05, "learning_rate": 1.2243619978719518e-06, "loss": 0.0, "num_input_tokens_seen": 186898816, "step": 86660 }, { "epoch": 15.90475316571848, "grad_norm": 6.590384873561561e-05, "learning_rate": 1.2238370879598183e-06, "loss": 0.0, "num_input_tokens_seen": 186909568, "step": 86665 }, { "epoch": 15.905670765278032, "grad_norm": 0.00015137797163333744, "learning_rate": 1.2233122749004107e-06, "loss": 0.0, "num_input_tokens_seen": 186920224, "step": 86670 }, { "epoch": 15.906588364837585, "grad_norm": 0.00020722851331811398, "learning_rate": 1.2227875587071886e-06, "loss": 0.0, "num_input_tokens_seen": 186931008, "step": 86675 }, { "epoch": 15.907505964397137, "grad_norm": 0.0009906857740134, "learning_rate": 1.222262939393613e-06, "loss": 0.0, "num_input_tokens_seen": 186941152, "step": 86680 }, { "epoch": 15.908423563956688, "grad_norm": 0.00023940743994899094, "learning_rate": 1.2217384169731383e-06, "loss": 0.0, "num_input_tokens_seen": 186950400, "step": 86685 }, { "epoch": 15.909341163516242, "grad_norm": 0.00048333522863686085, "learning_rate": 1.2212139914592158e-06, "loss": 0.0, "num_input_tokens_seen": 186961184, "step": 86690 }, { "epoch": 15.910258763075793, "grad_norm": 9.923574543790892e-05, "learning_rate": 1.2206896628652992e-06, "loss": 0.0, "num_input_tokens_seen": 186971648, "step": 86695 }, { "epoch": 15.911176362635345, "grad_norm": 0.0009569224203005433, "learning_rate": 1.2201654312048355e-06, "loss": 0.0, "num_input_tokens_seen": 186981696, "step": 86700 }, { "epoch": 15.912093962194898, "grad_norm": 4.4796430302085355e-05, "learning_rate": 1.2196412964912702e-06, "loss": 0.0, "num_input_tokens_seen": 186992992, "step": 86705 }, { "epoch": 15.91301156175445, "grad_norm": 0.006186894606798887, "learning_rate": 1.2191172587380467e-06, "loss": 0.0, "num_input_tokens_seen": 187004736, "step": 86710 }, { "epoch": 15.913929161314002, "grad_norm": 0.00070714030880481, "learning_rate": 1.218593317958604e-06, "loss": 0.0, "num_input_tokens_seen": 187016192, "step": 86715 }, { "epoch": 15.914846760873555, "grad_norm": 0.00016864917415659875, "learning_rate": 1.218069474166384e-06, "loss": 0.0, "num_input_tokens_seen": 187027104, "step": 86720 }, { "epoch": 15.915764360433107, "grad_norm": 0.00010069197742268443, "learning_rate": 1.2175457273748199e-06, "loss": 0.0, "num_input_tokens_seen": 187038336, "step": 86725 }, { "epoch": 15.916681959992658, "grad_norm": 0.0001058485358953476, "learning_rate": 1.2170220775973462e-06, "loss": 0.0, "num_input_tokens_seen": 187048768, "step": 86730 }, { "epoch": 15.917599559552212, "grad_norm": 0.0033019797410815954, "learning_rate": 1.2164985248473926e-06, "loss": 0.0, "num_input_tokens_seen": 187059008, "step": 86735 }, { "epoch": 15.918517159111763, "grad_norm": 6.213946471689269e-05, "learning_rate": 1.2159750691383865e-06, "loss": 0.0, "num_input_tokens_seen": 187070208, "step": 86740 }, { "epoch": 15.919434758671315, "grad_norm": 0.00012949531082995236, "learning_rate": 1.215451710483757e-06, "loss": 0.0, "num_input_tokens_seen": 187080736, "step": 86745 }, { "epoch": 15.920352358230868, "grad_norm": 0.0002104908344335854, "learning_rate": 1.2149284488969254e-06, "loss": 0.0, "num_input_tokens_seen": 187091296, "step": 86750 }, { "epoch": 15.92126995779042, "grad_norm": 0.0003101176698692143, "learning_rate": 1.214405284391313e-06, "loss": 0.0, "num_input_tokens_seen": 187101728, "step": 86755 }, { "epoch": 15.922187557349972, "grad_norm": 0.0006065274938009679, "learning_rate": 1.213882216980336e-06, "loss": 0.0, "num_input_tokens_seen": 187112064, "step": 86760 }, { "epoch": 15.923105156909525, "grad_norm": 0.00013834233686793596, "learning_rate": 1.213359246677414e-06, "loss": 0.0, "num_input_tokens_seen": 187122848, "step": 86765 }, { "epoch": 15.924022756469077, "grad_norm": 7.147258293116465e-05, "learning_rate": 1.2128363734959585e-06, "loss": 0.0, "num_input_tokens_seen": 187132928, "step": 86770 }, { "epoch": 15.924940356028628, "grad_norm": 0.00014211342204362154, "learning_rate": 1.2123135974493788e-06, "loss": 0.0, "num_input_tokens_seen": 187142560, "step": 86775 }, { "epoch": 15.925857955588182, "grad_norm": 0.00010619419481372461, "learning_rate": 1.2117909185510867e-06, "loss": 0.0, "num_input_tokens_seen": 187153600, "step": 86780 }, { "epoch": 15.926775555147733, "grad_norm": 5.679142122971825e-05, "learning_rate": 1.2112683368144862e-06, "loss": 0.0, "num_input_tokens_seen": 187164224, "step": 86785 }, { "epoch": 15.927693154707285, "grad_norm": 5.897205483051948e-05, "learning_rate": 1.2107458522529808e-06, "loss": 0.0, "num_input_tokens_seen": 187177600, "step": 86790 }, { "epoch": 15.928610754266838, "grad_norm": 0.0002464316785335541, "learning_rate": 1.2102234648799699e-06, "loss": 0.0, "num_input_tokens_seen": 187188512, "step": 86795 }, { "epoch": 15.92952835382639, "grad_norm": 0.00017155709792859852, "learning_rate": 1.2097011747088555e-06, "loss": 0.0, "num_input_tokens_seen": 187199520, "step": 86800 }, { "epoch": 15.930445953385941, "grad_norm": 0.0009912604000419378, "learning_rate": 1.2091789817530308e-06, "loss": 0.0, "num_input_tokens_seen": 187210304, "step": 86805 }, { "epoch": 15.931363552945495, "grad_norm": 0.00036849643220193684, "learning_rate": 1.208656886025889e-06, "loss": 0.0, "num_input_tokens_seen": 187222528, "step": 86810 }, { "epoch": 15.932281152505047, "grad_norm": 0.0011714862193912268, "learning_rate": 1.2081348875408233e-06, "loss": 0.0, "num_input_tokens_seen": 187232704, "step": 86815 }, { "epoch": 15.933198752064598, "grad_norm": 22.625564575195312, "learning_rate": 1.2076129863112213e-06, "loss": 0.04, "num_input_tokens_seen": 187243424, "step": 86820 }, { "epoch": 15.934116351624152, "grad_norm": 0.0003050784289371222, "learning_rate": 1.2070911823504667e-06, "loss": 0.0, "num_input_tokens_seen": 187254528, "step": 86825 }, { "epoch": 15.935033951183703, "grad_norm": 6.597052561119199e-05, "learning_rate": 1.2065694756719459e-06, "loss": 0.0, "num_input_tokens_seen": 187265504, "step": 86830 }, { "epoch": 15.935951550743255, "grad_norm": 5.719671753467992e-05, "learning_rate": 1.2060478662890396e-06, "loss": 0.0, "num_input_tokens_seen": 187275616, "step": 86835 }, { "epoch": 15.936869150302808, "grad_norm": 0.0002474674547556788, "learning_rate": 1.2055263542151246e-06, "loss": 0.0, "num_input_tokens_seen": 187285984, "step": 86840 }, { "epoch": 15.93778674986236, "grad_norm": 7.765717600705102e-05, "learning_rate": 1.2050049394635766e-06, "loss": 0.0, "num_input_tokens_seen": 187297536, "step": 86845 }, { "epoch": 15.938704349421911, "grad_norm": 0.00023334365687333047, "learning_rate": 1.2044836220477718e-06, "loss": 0.0, "num_input_tokens_seen": 187307616, "step": 86850 }, { "epoch": 15.939621948981465, "grad_norm": 0.0007753486279398203, "learning_rate": 1.2039624019810796e-06, "loss": 0.0, "num_input_tokens_seen": 187318784, "step": 86855 }, { "epoch": 15.940539548541016, "grad_norm": 0.035168442875146866, "learning_rate": 1.2034412792768668e-06, "loss": 0.0, "num_input_tokens_seen": 187330144, "step": 86860 }, { "epoch": 15.941457148100568, "grad_norm": 0.00013108048005960882, "learning_rate": 1.2029202539485025e-06, "loss": 0.0, "num_input_tokens_seen": 187340224, "step": 86865 }, { "epoch": 15.942374747660121, "grad_norm": 0.003847911022603512, "learning_rate": 1.2023993260093491e-06, "loss": 0.0, "num_input_tokens_seen": 187350560, "step": 86870 }, { "epoch": 15.943292347219673, "grad_norm": 7.611281762365252e-05, "learning_rate": 1.2018784954727669e-06, "loss": 0.0, "num_input_tokens_seen": 187360640, "step": 86875 }, { "epoch": 15.944209946779225, "grad_norm": 0.002730805892497301, "learning_rate": 1.2013577623521132e-06, "loss": 0.0, "num_input_tokens_seen": 187371136, "step": 86880 }, { "epoch": 15.945127546338778, "grad_norm": 6.457691051764414e-05, "learning_rate": 1.2008371266607471e-06, "loss": 0.0, "num_input_tokens_seen": 187382208, "step": 86885 }, { "epoch": 15.94604514589833, "grad_norm": 0.0004293598176445812, "learning_rate": 1.2003165884120205e-06, "loss": 0.0, "num_input_tokens_seen": 187392480, "step": 86890 }, { "epoch": 15.946962745457881, "grad_norm": 0.000432279339293018, "learning_rate": 1.1997961476192832e-06, "loss": 0.0, "num_input_tokens_seen": 187404800, "step": 86895 }, { "epoch": 15.947880345017435, "grad_norm": 7.106990233296528e-05, "learning_rate": 1.1992758042958864e-06, "loss": 0.0, "num_input_tokens_seen": 187415872, "step": 86900 }, { "epoch": 15.948797944576986, "grad_norm": 0.000154189343447797, "learning_rate": 1.1987555584551741e-06, "loss": 0.0, "num_input_tokens_seen": 187427424, "step": 86905 }, { "epoch": 15.949715544136538, "grad_norm": 0.00033919388079084456, "learning_rate": 1.1982354101104892e-06, "loss": 0.0, "num_input_tokens_seen": 187438400, "step": 86910 }, { "epoch": 15.950633143696091, "grad_norm": 6.420376303140074e-05, "learning_rate": 1.1977153592751755e-06, "loss": 0.0, "num_input_tokens_seen": 187448960, "step": 86915 }, { "epoch": 15.951550743255643, "grad_norm": 0.001429822063073516, "learning_rate": 1.1971954059625696e-06, "loss": 0.0, "num_input_tokens_seen": 187460416, "step": 86920 }, { "epoch": 15.952468342815195, "grad_norm": 0.0008571448270231485, "learning_rate": 1.1966755501860077e-06, "loss": 0.0, "num_input_tokens_seen": 187470528, "step": 86925 }, { "epoch": 15.953385942374748, "grad_norm": 6.447751366067678e-05, "learning_rate": 1.1961557919588234e-06, "loss": 0.0, "num_input_tokens_seen": 187481504, "step": 86930 }, { "epoch": 15.9543035419343, "grad_norm": 7.786681817378849e-05, "learning_rate": 1.1956361312943466e-06, "loss": 0.0, "num_input_tokens_seen": 187492384, "step": 86935 }, { "epoch": 15.955221141493851, "grad_norm": 0.00021586798538919538, "learning_rate": 1.1951165682059073e-06, "loss": 0.1128, "num_input_tokens_seen": 187503424, "step": 86940 }, { "epoch": 15.956138741053405, "grad_norm": 0.008404356427490711, "learning_rate": 1.194597102706832e-06, "loss": 0.0792, "num_input_tokens_seen": 187515456, "step": 86945 }, { "epoch": 15.957056340612956, "grad_norm": 0.006199910771101713, "learning_rate": 1.1940777348104427e-06, "loss": 0.0, "num_input_tokens_seen": 187526304, "step": 86950 }, { "epoch": 15.957973940172508, "grad_norm": 5.4113916121423244e-05, "learning_rate": 1.1935584645300607e-06, "loss": 0.0, "num_input_tokens_seen": 187537504, "step": 86955 }, { "epoch": 15.958891539732061, "grad_norm": 0.0029182955622673035, "learning_rate": 1.1930392918790035e-06, "loss": 0.0, "num_input_tokens_seen": 187548512, "step": 86960 }, { "epoch": 15.959809139291613, "grad_norm": 0.0006152979331091046, "learning_rate": 1.192520216870589e-06, "loss": 0.0, "num_input_tokens_seen": 187559584, "step": 86965 }, { "epoch": 15.960726738851164, "grad_norm": 0.0024371794424951077, "learning_rate": 1.1920012395181308e-06, "loss": 0.0, "num_input_tokens_seen": 187570560, "step": 86970 }, { "epoch": 15.961644338410718, "grad_norm": 0.0012809946201741695, "learning_rate": 1.1914823598349384e-06, "loss": 0.0, "num_input_tokens_seen": 187581344, "step": 86975 }, { "epoch": 15.96256193797027, "grad_norm": 7.323568570427597e-05, "learning_rate": 1.1909635778343192e-06, "loss": 0.0, "num_input_tokens_seen": 187591712, "step": 86980 }, { "epoch": 15.963479537529821, "grad_norm": 0.0001396942971041426, "learning_rate": 1.1904448935295825e-06, "loss": 0.0, "num_input_tokens_seen": 187602176, "step": 86985 }, { "epoch": 15.964397137089374, "grad_norm": 3.858965646941215e-05, "learning_rate": 1.18992630693403e-06, "loss": 0.0, "num_input_tokens_seen": 187611360, "step": 86990 }, { "epoch": 15.965314736648926, "grad_norm": 0.0010278784902766347, "learning_rate": 1.1894078180609614e-06, "loss": 0.0, "num_input_tokens_seen": 187622304, "step": 86995 }, { "epoch": 15.966232336208478, "grad_norm": 0.00018127102521248162, "learning_rate": 1.1888894269236773e-06, "loss": 0.0, "num_input_tokens_seen": 187633312, "step": 87000 }, { "epoch": 15.967149935768031, "grad_norm": 9.632483852328733e-05, "learning_rate": 1.188371133535473e-06, "loss": 0.0, "num_input_tokens_seen": 187644288, "step": 87005 }, { "epoch": 15.968067535327583, "grad_norm": 9.364331344841048e-05, "learning_rate": 1.1878529379096405e-06, "loss": 0.0, "num_input_tokens_seen": 187655712, "step": 87010 }, { "epoch": 15.968985134887134, "grad_norm": 8.972229261416942e-05, "learning_rate": 1.1873348400594725e-06, "loss": 0.0, "num_input_tokens_seen": 187665696, "step": 87015 }, { "epoch": 15.969902734446688, "grad_norm": 5.660258830175735e-05, "learning_rate": 1.1868168399982578e-06, "loss": 0.0, "num_input_tokens_seen": 187675168, "step": 87020 }, { "epoch": 15.97082033400624, "grad_norm": 0.00015971421089489013, "learning_rate": 1.1862989377392802e-06, "loss": 0.0, "num_input_tokens_seen": 187685280, "step": 87025 }, { "epoch": 15.971737933565791, "grad_norm": 0.000877601676620543, "learning_rate": 1.1857811332958235e-06, "loss": 0.0, "num_input_tokens_seen": 187696256, "step": 87030 }, { "epoch": 15.972655533125344, "grad_norm": 0.00010574102634564042, "learning_rate": 1.1852634266811701e-06, "loss": 0.0, "num_input_tokens_seen": 187706848, "step": 87035 }, { "epoch": 15.973573132684896, "grad_norm": 0.00016522839723620564, "learning_rate": 1.184745817908598e-06, "loss": 0.0, "num_input_tokens_seen": 187718144, "step": 87040 }, { "epoch": 15.974490732244448, "grad_norm": 0.2800178825855255, "learning_rate": 1.1842283069913807e-06, "loss": 0.0003, "num_input_tokens_seen": 187728832, "step": 87045 }, { "epoch": 15.975408331804001, "grad_norm": 0.000181742143468, "learning_rate": 1.1837108939427955e-06, "loss": 0.0, "num_input_tokens_seen": 187740192, "step": 87050 }, { "epoch": 15.976325931363553, "grad_norm": 0.00020241712627466768, "learning_rate": 1.1831935787761106e-06, "loss": 0.0, "num_input_tokens_seen": 187751168, "step": 87055 }, { "epoch": 15.977243530923104, "grad_norm": 4.8096630052896217e-05, "learning_rate": 1.182676361504595e-06, "loss": 0.0, "num_input_tokens_seen": 187760960, "step": 87060 }, { "epoch": 15.978161130482658, "grad_norm": 9.248802962247282e-05, "learning_rate": 1.182159242141513e-06, "loss": 0.0, "num_input_tokens_seen": 187772288, "step": 87065 }, { "epoch": 15.97907873004221, "grad_norm": 0.0002459941024426371, "learning_rate": 1.1816422207001304e-06, "loss": 0.0, "num_input_tokens_seen": 187783424, "step": 87070 }, { "epoch": 15.97999632960176, "grad_norm": 0.00039101173751987517, "learning_rate": 1.1811252971937075e-06, "loss": 0.0, "num_input_tokens_seen": 187793504, "step": 87075 }, { "epoch": 15.980913929161314, "grad_norm": 0.0016053408617153764, "learning_rate": 1.1806084716355003e-06, "loss": 0.0, "num_input_tokens_seen": 187804832, "step": 87080 }, { "epoch": 15.981831528720866, "grad_norm": 8.655244891997427e-05, "learning_rate": 1.1800917440387677e-06, "loss": 0.0, "num_input_tokens_seen": 187815360, "step": 87085 }, { "epoch": 15.982749128280417, "grad_norm": 9.293630864704028e-05, "learning_rate": 1.1795751144167616e-06, "loss": 0.0, "num_input_tokens_seen": 187826976, "step": 87090 }, { "epoch": 15.983666727839971, "grad_norm": 6.149493856355548e-05, "learning_rate": 1.179058582782731e-06, "loss": 0.0, "num_input_tokens_seen": 187837088, "step": 87095 }, { "epoch": 15.984584327399523, "grad_norm": 0.0001508995919721201, "learning_rate": 1.1785421491499277e-06, "loss": 0.0, "num_input_tokens_seen": 187848000, "step": 87100 }, { "epoch": 15.985501926959074, "grad_norm": 4.603939305525273e-05, "learning_rate": 1.1780258135315953e-06, "loss": 0.0, "num_input_tokens_seen": 187858624, "step": 87105 }, { "epoch": 15.986419526518628, "grad_norm": 0.00010038738400908187, "learning_rate": 1.1775095759409776e-06, "loss": 0.0189, "num_input_tokens_seen": 187869344, "step": 87110 }, { "epoch": 15.98733712607818, "grad_norm": 0.00010752987145679072, "learning_rate": 1.1769934363913132e-06, "loss": 0.0, "num_input_tokens_seen": 187880576, "step": 87115 }, { "epoch": 15.98825472563773, "grad_norm": 0.00014700970496051013, "learning_rate": 1.1764773948958435e-06, "loss": 0.0, "num_input_tokens_seen": 187890176, "step": 87120 }, { "epoch": 15.989172325197284, "grad_norm": 0.00016183470143005252, "learning_rate": 1.1759614514678024e-06, "loss": 0.0, "num_input_tokens_seen": 187900960, "step": 87125 }, { "epoch": 15.990089924756836, "grad_norm": 0.000397953117499128, "learning_rate": 1.1754456061204227e-06, "loss": 0.0, "num_input_tokens_seen": 187911104, "step": 87130 }, { "epoch": 15.991007524316387, "grad_norm": 6.90772503730841e-05, "learning_rate": 1.1749298588669366e-06, "loss": 0.0, "num_input_tokens_seen": 187921248, "step": 87135 }, { "epoch": 15.99192512387594, "grad_norm": 6.645567191299051e-05, "learning_rate": 1.1744142097205713e-06, "loss": 0.0, "num_input_tokens_seen": 187931456, "step": 87140 }, { "epoch": 15.992842723435492, "grad_norm": 0.0034232172183692455, "learning_rate": 1.1738986586945523e-06, "loss": 0.0005, "num_input_tokens_seen": 187941440, "step": 87145 }, { "epoch": 15.993760322995044, "grad_norm": 6.789032340748236e-05, "learning_rate": 1.1733832058021027e-06, "loss": 0.0, "num_input_tokens_seen": 187951232, "step": 87150 }, { "epoch": 15.994677922554597, "grad_norm": 9.711473103379831e-05, "learning_rate": 1.1728678510564435e-06, "loss": 0.0, "num_input_tokens_seen": 187961408, "step": 87155 }, { "epoch": 15.995595522114149, "grad_norm": 0.00047916683251969516, "learning_rate": 1.1723525944707908e-06, "loss": 0.0, "num_input_tokens_seen": 187972832, "step": 87160 }, { "epoch": 15.996513121673702, "grad_norm": 0.0011242778273299336, "learning_rate": 1.1718374360583633e-06, "loss": 0.0, "num_input_tokens_seen": 187983136, "step": 87165 }, { "epoch": 15.997430721233254, "grad_norm": 9.371640771860257e-05, "learning_rate": 1.1713223758323728e-06, "loss": 0.0, "num_input_tokens_seen": 187993984, "step": 87170 }, { "epoch": 15.998348320792806, "grad_norm": 9.421882714377716e-05, "learning_rate": 1.170807413806029e-06, "loss": 0.0, "num_input_tokens_seen": 188005728, "step": 87175 }, { "epoch": 15.999265920352359, "grad_norm": 0.00020194183161947876, "learning_rate": 1.1702925499925388e-06, "loss": 0.0, "num_input_tokens_seen": 188016224, "step": 87180 }, { "epoch": 16.0, "eval_loss": 0.6247561573982239, "eval_runtime": 179.1686, "eval_samples_per_second": 30.413, "eval_steps_per_second": 7.607, "num_input_tokens_seen": 188023968, "step": 87184 }, { "epoch": 16.00018351991191, "grad_norm": 0.0016017130110412836, "learning_rate": 1.1697777844051105e-06, "loss": 0.0, "num_input_tokens_seen": 188026336, "step": 87185 }, { "epoch": 16.001101119471464, "grad_norm": 0.00011801378423115239, "learning_rate": 1.1692631170569457e-06, "loss": 0.0, "num_input_tokens_seen": 188037216, "step": 87190 }, { "epoch": 16.002018719031014, "grad_norm": 0.00025532630388624966, "learning_rate": 1.1687485479612453e-06, "loss": 0.0, "num_input_tokens_seen": 188047200, "step": 87195 }, { "epoch": 16.002936318590567, "grad_norm": 0.00010821357136592269, "learning_rate": 1.1682340771312051e-06, "loss": 0.0, "num_input_tokens_seen": 188058880, "step": 87200 }, { "epoch": 16.00385391815012, "grad_norm": 0.00018047999765258282, "learning_rate": 1.1677197045800238e-06, "loss": 0.0, "num_input_tokens_seen": 188070016, "step": 87205 }, { "epoch": 16.00477151770967, "grad_norm": 0.00017711696273181587, "learning_rate": 1.1672054303208923e-06, "loss": 0.0, "num_input_tokens_seen": 188079776, "step": 87210 }, { "epoch": 16.005689117269224, "grad_norm": 0.0001538211217848584, "learning_rate": 1.1666912543669995e-06, "loss": 0.0, "num_input_tokens_seen": 188091008, "step": 87215 }, { "epoch": 16.006606716828777, "grad_norm": 5.541689461097121e-05, "learning_rate": 1.1661771767315366e-06, "loss": 0.0003, "num_input_tokens_seen": 188101632, "step": 87220 }, { "epoch": 16.007524316388327, "grad_norm": 0.0001473910961067304, "learning_rate": 1.1656631974276878e-06, "loss": 0.0, "num_input_tokens_seen": 188113376, "step": 87225 }, { "epoch": 16.00844191594788, "grad_norm": 0.000474364758701995, "learning_rate": 1.1651493164686333e-06, "loss": 0.0, "num_input_tokens_seen": 188123552, "step": 87230 }, { "epoch": 16.009359515507434, "grad_norm": 0.0005206516943871975, "learning_rate": 1.1646355338675568e-06, "loss": 0.0, "num_input_tokens_seen": 188134720, "step": 87235 }, { "epoch": 16.010277115066984, "grad_norm": 0.0003810721100308001, "learning_rate": 1.1641218496376345e-06, "loss": 0.0, "num_input_tokens_seen": 188146080, "step": 87240 }, { "epoch": 16.011194714626537, "grad_norm": 0.00022394412371795624, "learning_rate": 1.163608263792042e-06, "loss": 0.0, "num_input_tokens_seen": 188156736, "step": 87245 }, { "epoch": 16.01211231418609, "grad_norm": 7.259070844156668e-05, "learning_rate": 1.1630947763439498e-06, "loss": 0.0, "num_input_tokens_seen": 188167872, "step": 87250 }, { "epoch": 16.01302991374564, "grad_norm": 0.00017127644969150424, "learning_rate": 1.1625813873065317e-06, "loss": 0.0, "num_input_tokens_seen": 188180000, "step": 87255 }, { "epoch": 16.013947513305194, "grad_norm": 9.793984645511955e-05, "learning_rate": 1.1620680966929538e-06, "loss": 0.0, "num_input_tokens_seen": 188190656, "step": 87260 }, { "epoch": 16.014865112864747, "grad_norm": 7.304822793230414e-05, "learning_rate": 1.1615549045163794e-06, "loss": 0.0, "num_input_tokens_seen": 188202176, "step": 87265 }, { "epoch": 16.015782712424297, "grad_norm": 0.00010611040488583967, "learning_rate": 1.1610418107899734e-06, "loss": 0.0376, "num_input_tokens_seen": 188214304, "step": 87270 }, { "epoch": 16.01670031198385, "grad_norm": 0.00025615558843128383, "learning_rate": 1.1605288155268958e-06, "loss": 0.0, "num_input_tokens_seen": 188225344, "step": 87275 }, { "epoch": 16.017617911543404, "grad_norm": 0.000659674231428653, "learning_rate": 1.160015918740302e-06, "loss": 0.0, "num_input_tokens_seen": 188237120, "step": 87280 }, { "epoch": 16.018535511102954, "grad_norm": 7.256459502968937e-05, "learning_rate": 1.1595031204433493e-06, "loss": 0.0, "num_input_tokens_seen": 188248288, "step": 87285 }, { "epoch": 16.019453110662507, "grad_norm": 0.00012290595623198897, "learning_rate": 1.1589904206491898e-06, "loss": 0.0, "num_input_tokens_seen": 188257248, "step": 87290 }, { "epoch": 16.02037071022206, "grad_norm": 0.00031278401729650795, "learning_rate": 1.1584778193709728e-06, "loss": 0.0, "num_input_tokens_seen": 188268096, "step": 87295 }, { "epoch": 16.02128830978161, "grad_norm": 0.000150455191032961, "learning_rate": 1.1579653166218447e-06, "loss": 0.0, "num_input_tokens_seen": 188279648, "step": 87300 }, { "epoch": 16.022205909341164, "grad_norm": 0.0001547016727272421, "learning_rate": 1.157452912414953e-06, "loss": 0.0, "num_input_tokens_seen": 188290816, "step": 87305 }, { "epoch": 16.023123508900717, "grad_norm": 0.004989389795809984, "learning_rate": 1.1569406067634386e-06, "loss": 0.0, "num_input_tokens_seen": 188302624, "step": 87310 }, { "epoch": 16.024041108460267, "grad_norm": 0.0015044756000861526, "learning_rate": 1.1564283996804405e-06, "loss": 0.0, "num_input_tokens_seen": 188313184, "step": 87315 }, { "epoch": 16.02495870801982, "grad_norm": 0.009415244683623314, "learning_rate": 1.1559162911790978e-06, "loss": 0.0, "num_input_tokens_seen": 188323136, "step": 87320 }, { "epoch": 16.025876307579374, "grad_norm": 0.00019063020590692759, "learning_rate": 1.155404281272544e-06, "loss": 0.0, "num_input_tokens_seen": 188334496, "step": 87325 }, { "epoch": 16.026793907138924, "grad_norm": 0.00021690009452868253, "learning_rate": 1.1548923699739129e-06, "loss": 0.0, "num_input_tokens_seen": 188345632, "step": 87330 }, { "epoch": 16.027711506698477, "grad_norm": 0.00011877578072017059, "learning_rate": 1.1543805572963307e-06, "loss": 0.0, "num_input_tokens_seen": 188355584, "step": 87335 }, { "epoch": 16.02862910625803, "grad_norm": 0.00026568869361653924, "learning_rate": 1.1538688432529294e-06, "loss": 0.0, "num_input_tokens_seen": 188368096, "step": 87340 }, { "epoch": 16.02954670581758, "grad_norm": 0.00021743298566434532, "learning_rate": 1.1533572278568306e-06, "loss": 0.0, "num_input_tokens_seen": 188379072, "step": 87345 }, { "epoch": 16.030464305377134, "grad_norm": 0.00025570267462171614, "learning_rate": 1.1528457111211572e-06, "loss": 0.0, "num_input_tokens_seen": 188390592, "step": 87350 }, { "epoch": 16.031381904936687, "grad_norm": 0.0026057111099362373, "learning_rate": 1.1523342930590276e-06, "loss": 0.0, "num_input_tokens_seen": 188400640, "step": 87355 }, { "epoch": 16.032299504496237, "grad_norm": 0.0003457840357441455, "learning_rate": 1.1518229736835612e-06, "loss": 0.0, "num_input_tokens_seen": 188412480, "step": 87360 }, { "epoch": 16.03321710405579, "grad_norm": 0.00048652710393071175, "learning_rate": 1.1513117530078715e-06, "loss": 0.0, "num_input_tokens_seen": 188422816, "step": 87365 }, { "epoch": 16.034134703615344, "grad_norm": 0.0001811595429899171, "learning_rate": 1.150800631045071e-06, "loss": 0.0, "num_input_tokens_seen": 188435008, "step": 87370 }, { "epoch": 16.035052303174893, "grad_norm": 0.000180468283360824, "learning_rate": 1.1502896078082682e-06, "loss": 0.0, "num_input_tokens_seen": 188445888, "step": 87375 }, { "epoch": 16.035969902734447, "grad_norm": 0.00043286997242830694, "learning_rate": 1.1497786833105685e-06, "loss": 0.0, "num_input_tokens_seen": 188456992, "step": 87380 }, { "epoch": 16.036887502294, "grad_norm": 0.0010567201534286141, "learning_rate": 1.1492678575650802e-06, "loss": 0.0, "num_input_tokens_seen": 188467392, "step": 87385 }, { "epoch": 16.03780510185355, "grad_norm": 0.00021551440295297652, "learning_rate": 1.1487571305849032e-06, "loss": 0.0, "num_input_tokens_seen": 188478688, "step": 87390 }, { "epoch": 16.038722701413104, "grad_norm": 0.00014126961468718946, "learning_rate": 1.148246502383137e-06, "loss": 0.0, "num_input_tokens_seen": 188489280, "step": 87395 }, { "epoch": 16.039640300972657, "grad_norm": 0.0002641559112817049, "learning_rate": 1.1477359729728765e-06, "loss": 0.0, "num_input_tokens_seen": 188500416, "step": 87400 }, { "epoch": 16.040557900532207, "grad_norm": 0.000658757402561605, "learning_rate": 1.1472255423672196e-06, "loss": 0.0, "num_input_tokens_seen": 188509504, "step": 87405 }, { "epoch": 16.04147550009176, "grad_norm": 0.0012223043013364077, "learning_rate": 1.1467152105792563e-06, "loss": 0.0, "num_input_tokens_seen": 188519776, "step": 87410 }, { "epoch": 16.042393099651314, "grad_norm": 5.1505063311196864e-05, "learning_rate": 1.146204977622074e-06, "loss": 0.0, "num_input_tokens_seen": 188530592, "step": 87415 }, { "epoch": 16.043310699210863, "grad_norm": 0.0002887434675358236, "learning_rate": 1.1456948435087633e-06, "loss": 0.0, "num_input_tokens_seen": 188541600, "step": 87420 }, { "epoch": 16.044228298770417, "grad_norm": 0.0001398090535076335, "learning_rate": 1.1451848082524059e-06, "loss": 0.0, "num_input_tokens_seen": 188553152, "step": 87425 }, { "epoch": 16.04514589832997, "grad_norm": 0.0030741780065000057, "learning_rate": 1.1446748718660834e-06, "loss": 0.0, "num_input_tokens_seen": 188562944, "step": 87430 }, { "epoch": 16.04606349788952, "grad_norm": 8.78864957485348e-05, "learning_rate": 1.144165034362874e-06, "loss": 0.0, "num_input_tokens_seen": 188573728, "step": 87435 }, { "epoch": 16.046981097449073, "grad_norm": 0.0006986747612245381, "learning_rate": 1.1436552957558571e-06, "loss": 0.0, "num_input_tokens_seen": 188583968, "step": 87440 }, { "epoch": 16.047898697008627, "grad_norm": 0.00017691661196295172, "learning_rate": 1.1431456560581051e-06, "loss": 0.0, "num_input_tokens_seen": 188595456, "step": 87445 }, { "epoch": 16.048816296568177, "grad_norm": 0.00011275168071733788, "learning_rate": 1.1426361152826876e-06, "loss": 0.0, "num_input_tokens_seen": 188604928, "step": 87450 }, { "epoch": 16.04973389612773, "grad_norm": 0.00021391415793914348, "learning_rate": 1.1421266734426773e-06, "loss": 0.0, "num_input_tokens_seen": 188616288, "step": 87455 }, { "epoch": 16.050651495687283, "grad_norm": 8.550508937332779e-05, "learning_rate": 1.141617330551138e-06, "loss": 0.0, "num_input_tokens_seen": 188627328, "step": 87460 }, { "epoch": 16.051569095246833, "grad_norm": 0.000404529448132962, "learning_rate": 1.1411080866211334e-06, "loss": 0.0, "num_input_tokens_seen": 188639168, "step": 87465 }, { "epoch": 16.052486694806387, "grad_norm": 0.0001632224884815514, "learning_rate": 1.140598941665727e-06, "loss": 0.0, "num_input_tokens_seen": 188649248, "step": 87470 }, { "epoch": 16.05340429436594, "grad_norm": 0.000357170298229903, "learning_rate": 1.140089895697976e-06, "loss": 0.0, "num_input_tokens_seen": 188660192, "step": 87475 }, { "epoch": 16.05432189392549, "grad_norm": 6.199507333803922e-05, "learning_rate": 1.1395809487309367e-06, "loss": 0.0, "num_input_tokens_seen": 188672000, "step": 87480 }, { "epoch": 16.055239493485043, "grad_norm": 0.00023164953745435923, "learning_rate": 1.1390721007776616e-06, "loss": 0.0, "num_input_tokens_seen": 188682336, "step": 87485 }, { "epoch": 16.056157093044597, "grad_norm": 5.435643106466159e-05, "learning_rate": 1.1385633518512051e-06, "loss": 0.0, "num_input_tokens_seen": 188692640, "step": 87490 }, { "epoch": 16.057074692604147, "grad_norm": 5.0761162128765136e-05, "learning_rate": 1.1380547019646137e-06, "loss": 0.0, "num_input_tokens_seen": 188703392, "step": 87495 }, { "epoch": 16.0579922921637, "grad_norm": 8.940640691434965e-05, "learning_rate": 1.1375461511309322e-06, "loss": 0.0, "num_input_tokens_seen": 188714784, "step": 87500 }, { "epoch": 16.058909891723253, "grad_norm": 0.0004905745154246688, "learning_rate": 1.137037699363207e-06, "loss": 0.0, "num_input_tokens_seen": 188724896, "step": 87505 }, { "epoch": 16.059827491282803, "grad_norm": 0.00010462552745593712, "learning_rate": 1.1365293466744781e-06, "loss": 0.0, "num_input_tokens_seen": 188736320, "step": 87510 }, { "epoch": 16.060745090842357, "grad_norm": 6.75193005008623e-05, "learning_rate": 1.1360210930777836e-06, "loss": 0.0, "num_input_tokens_seen": 188746016, "step": 87515 }, { "epoch": 16.06166269040191, "grad_norm": 0.0003422211157158017, "learning_rate": 1.135512938586158e-06, "loss": 0.0, "num_input_tokens_seen": 188757504, "step": 87520 }, { "epoch": 16.06258028996146, "grad_norm": 0.00011734283179976046, "learning_rate": 1.135004883212637e-06, "loss": 0.0, "num_input_tokens_seen": 188768864, "step": 87525 }, { "epoch": 16.063497889521013, "grad_norm": 5.8233330491930246e-05, "learning_rate": 1.134496926970251e-06, "loss": 0.0, "num_input_tokens_seen": 188779488, "step": 87530 }, { "epoch": 16.064415489080567, "grad_norm": 0.0009264732943847775, "learning_rate": 1.1339890698720263e-06, "loss": 0.0, "num_input_tokens_seen": 188789952, "step": 87535 }, { "epoch": 16.065333088640116, "grad_norm": 7.371533865807578e-05, "learning_rate": 1.1334813119309918e-06, "loss": 0.0244, "num_input_tokens_seen": 188800032, "step": 87540 }, { "epoch": 16.06625068819967, "grad_norm": 0.00014626688789576292, "learning_rate": 1.1329736531601687e-06, "loss": 0.0, "num_input_tokens_seen": 188810144, "step": 87545 }, { "epoch": 16.067168287759223, "grad_norm": 8.872972830431536e-05, "learning_rate": 1.1324660935725772e-06, "loss": 0.0, "num_input_tokens_seen": 188821120, "step": 87550 }, { "epoch": 16.068085887318773, "grad_norm": 0.00016722048167139292, "learning_rate": 1.1319586331812372e-06, "loss": 0.0, "num_input_tokens_seen": 188831616, "step": 87555 }, { "epoch": 16.069003486878326, "grad_norm": 0.0004933173186145723, "learning_rate": 1.1314512719991633e-06, "loss": 0.0, "num_input_tokens_seen": 188841952, "step": 87560 }, { "epoch": 16.06992108643788, "grad_norm": 0.0008180577424354851, "learning_rate": 1.1309440100393686e-06, "loss": 0.0, "num_input_tokens_seen": 188852384, "step": 87565 }, { "epoch": 16.07083868599743, "grad_norm": 0.0006203232915140688, "learning_rate": 1.1304368473148641e-06, "loss": 0.0, "num_input_tokens_seen": 188863584, "step": 87570 }, { "epoch": 16.071756285556983, "grad_norm": 0.0005428956937976182, "learning_rate": 1.1299297838386553e-06, "loss": 0.0, "num_input_tokens_seen": 188873760, "step": 87575 }, { "epoch": 16.072673885116536, "grad_norm": 0.00016314134700223804, "learning_rate": 1.129422819623751e-06, "loss": 0.0, "num_input_tokens_seen": 188884896, "step": 87580 }, { "epoch": 16.073591484676086, "grad_norm": 0.0025446531362831593, "learning_rate": 1.1289159546831524e-06, "loss": 0.0, "num_input_tokens_seen": 188896832, "step": 87585 }, { "epoch": 16.07450908423564, "grad_norm": 0.0001404913200531155, "learning_rate": 1.1284091890298599e-06, "loss": 0.0, "num_input_tokens_seen": 188907936, "step": 87590 }, { "epoch": 16.075426683795193, "grad_norm": 5.1272669224999845e-05, "learning_rate": 1.1279025226768713e-06, "loss": 0.0, "num_input_tokens_seen": 188919392, "step": 87595 }, { "epoch": 16.076344283354743, "grad_norm": 48.41067123413086, "learning_rate": 1.1273959556371806e-06, "loss": 0.0943, "num_input_tokens_seen": 188930848, "step": 87600 }, { "epoch": 16.077261882914296, "grad_norm": 0.0009140092297457159, "learning_rate": 1.1268894879237829e-06, "loss": 0.0, "num_input_tokens_seen": 188942528, "step": 87605 }, { "epoch": 16.07817948247385, "grad_norm": 0.0011337068863213062, "learning_rate": 1.1263831195496672e-06, "loss": 0.0, "num_input_tokens_seen": 188952832, "step": 87610 }, { "epoch": 16.0790970820334, "grad_norm": 7.231926429085433e-05, "learning_rate": 1.1258768505278205e-06, "loss": 0.0, "num_input_tokens_seen": 188964416, "step": 87615 }, { "epoch": 16.080014681592953, "grad_norm": 0.006373478099703789, "learning_rate": 1.1253706808712272e-06, "loss": 0.0, "num_input_tokens_seen": 188975520, "step": 87620 }, { "epoch": 16.080932281152506, "grad_norm": 0.00019700628763530403, "learning_rate": 1.1248646105928724e-06, "loss": 0.0, "num_input_tokens_seen": 188986784, "step": 87625 }, { "epoch": 16.081849880712056, "grad_norm": 0.0033150638919323683, "learning_rate": 1.1243586397057343e-06, "loss": 0.0, "num_input_tokens_seen": 188997408, "step": 87630 }, { "epoch": 16.08276748027161, "grad_norm": 0.0002480621333234012, "learning_rate": 1.123852768222789e-06, "loss": 0.0, "num_input_tokens_seen": 189007488, "step": 87635 }, { "epoch": 16.083685079831163, "grad_norm": 0.00010949633724521846, "learning_rate": 1.1233469961570138e-06, "loss": 0.0, "num_input_tokens_seen": 189017440, "step": 87640 }, { "epoch": 16.084602679390713, "grad_norm": 0.014227321371436119, "learning_rate": 1.1228413235213799e-06, "loss": 0.0, "num_input_tokens_seen": 189026816, "step": 87645 }, { "epoch": 16.085520278950266, "grad_norm": 0.0003870957880280912, "learning_rate": 1.1223357503288573e-06, "loss": 0.0, "num_input_tokens_seen": 189037376, "step": 87650 }, { "epoch": 16.08643787850982, "grad_norm": 7.92506180005148e-05, "learning_rate": 1.121830276592411e-06, "loss": 0.0, "num_input_tokens_seen": 189047936, "step": 87655 }, { "epoch": 16.08735547806937, "grad_norm": 7.149586599553004e-05, "learning_rate": 1.1213249023250094e-06, "loss": 0.0, "num_input_tokens_seen": 189058400, "step": 87660 }, { "epoch": 16.088273077628923, "grad_norm": 0.00011713823914760724, "learning_rate": 1.1208196275396128e-06, "loss": 0.0, "num_input_tokens_seen": 189069152, "step": 87665 }, { "epoch": 16.089190677188476, "grad_norm": 0.0030241108033806086, "learning_rate": 1.1203144522491789e-06, "loss": 0.0, "num_input_tokens_seen": 189080992, "step": 87670 }, { "epoch": 16.090108276748026, "grad_norm": 0.00011436276690801606, "learning_rate": 1.1198093764666673e-06, "loss": 0.0, "num_input_tokens_seen": 189092832, "step": 87675 }, { "epoch": 16.09102587630758, "grad_norm": 7.966969133121893e-05, "learning_rate": 1.1193044002050318e-06, "loss": 0.0, "num_input_tokens_seen": 189102752, "step": 87680 }, { "epoch": 16.091943475867133, "grad_norm": 0.00015266861009877175, "learning_rate": 1.1187995234772224e-06, "loss": 0.0, "num_input_tokens_seen": 189112608, "step": 87685 }, { "epoch": 16.092861075426683, "grad_norm": 0.00018170253315474838, "learning_rate": 1.1182947462961913e-06, "loss": 0.0, "num_input_tokens_seen": 189122656, "step": 87690 }, { "epoch": 16.093778674986236, "grad_norm": 9.092144318856299e-05, "learning_rate": 1.1177900686748844e-06, "loss": 0.0, "num_input_tokens_seen": 189133760, "step": 87695 }, { "epoch": 16.09469627454579, "grad_norm": 0.0001480091013945639, "learning_rate": 1.1172854906262449e-06, "loss": 0.0, "num_input_tokens_seen": 189145760, "step": 87700 }, { "epoch": 16.09561387410534, "grad_norm": 6.459202995756641e-05, "learning_rate": 1.1167810121632133e-06, "loss": 0.0, "num_input_tokens_seen": 189157184, "step": 87705 }, { "epoch": 16.096531473664893, "grad_norm": 0.00013770299847237766, "learning_rate": 1.1162766332987318e-06, "loss": 0.0, "num_input_tokens_seen": 189168032, "step": 87710 }, { "epoch": 16.097449073224446, "grad_norm": 0.0008877475629560649, "learning_rate": 1.1157723540457354e-06, "loss": 0.0, "num_input_tokens_seen": 189178304, "step": 87715 }, { "epoch": 16.098366672783996, "grad_norm": 0.0001890620042104274, "learning_rate": 1.1152681744171573e-06, "loss": 0.0, "num_input_tokens_seen": 189189504, "step": 87720 }, { "epoch": 16.09928427234355, "grad_norm": 0.001981025794520974, "learning_rate": 1.1147640944259308e-06, "loss": 0.0, "num_input_tokens_seen": 189201184, "step": 87725 }, { "epoch": 16.100201871903103, "grad_norm": 6.483509787358344e-05, "learning_rate": 1.1142601140849835e-06, "loss": 0.0, "num_input_tokens_seen": 189212512, "step": 87730 }, { "epoch": 16.101119471462653, "grad_norm": 0.007191121112555265, "learning_rate": 1.1137562334072405e-06, "loss": 0.0, "num_input_tokens_seen": 189224224, "step": 87735 }, { "epoch": 16.102037071022206, "grad_norm": 0.21204549074172974, "learning_rate": 1.1132524524056287e-06, "loss": 0.0, "num_input_tokens_seen": 189234496, "step": 87740 }, { "epoch": 16.10295467058176, "grad_norm": 0.00018768044537864625, "learning_rate": 1.1127487710930673e-06, "loss": 0.0, "num_input_tokens_seen": 189245824, "step": 87745 }, { "epoch": 16.10387227014131, "grad_norm": 0.0003892001404892653, "learning_rate": 1.1122451894824753e-06, "loss": 0.0, "num_input_tokens_seen": 189256768, "step": 87750 }, { "epoch": 16.104789869700863, "grad_norm": 0.00010723053856054321, "learning_rate": 1.1117417075867675e-06, "loss": 0.0, "num_input_tokens_seen": 189267712, "step": 87755 }, { "epoch": 16.105707469260416, "grad_norm": 0.007099728100001812, "learning_rate": 1.1112383254188598e-06, "loss": 0.0, "num_input_tokens_seen": 189278880, "step": 87760 }, { "epoch": 16.106625068819966, "grad_norm": 6.442819721996784e-05, "learning_rate": 1.110735042991662e-06, "loss": 0.0, "num_input_tokens_seen": 189289888, "step": 87765 }, { "epoch": 16.10754266837952, "grad_norm": 6.26988330623135e-05, "learning_rate": 1.1102318603180811e-06, "loss": 0.0, "num_input_tokens_seen": 189301824, "step": 87770 }, { "epoch": 16.108460267939073, "grad_norm": 0.0005257687880657613, "learning_rate": 1.109728777411026e-06, "loss": 0.0, "num_input_tokens_seen": 189313568, "step": 87775 }, { "epoch": 16.109377867498623, "grad_norm": 9.034344111569226e-05, "learning_rate": 1.1092257942833985e-06, "loss": 0.0, "num_input_tokens_seen": 189325248, "step": 87780 }, { "epoch": 16.110295467058176, "grad_norm": 0.00048008302110247314, "learning_rate": 1.108722910948099e-06, "loss": 0.0, "num_input_tokens_seen": 189335008, "step": 87785 }, { "epoch": 16.11121306661773, "grad_norm": 7.032512075966224e-05, "learning_rate": 1.1082201274180259e-06, "loss": 0.0, "num_input_tokens_seen": 189346304, "step": 87790 }, { "epoch": 16.11213066617728, "grad_norm": 0.00011674792767735198, "learning_rate": 1.1077174437060734e-06, "loss": 0.0, "num_input_tokens_seen": 189355872, "step": 87795 }, { "epoch": 16.113048265736833, "grad_norm": 0.00011708549573086202, "learning_rate": 1.1072148598251375e-06, "loss": 0.0, "num_input_tokens_seen": 189366496, "step": 87800 }, { "epoch": 16.113965865296386, "grad_norm": 0.0005315226153470576, "learning_rate": 1.106712375788107e-06, "loss": 0.0, "num_input_tokens_seen": 189376704, "step": 87805 }, { "epoch": 16.114883464855936, "grad_norm": 8.436646749032661e-05, "learning_rate": 1.1062099916078705e-06, "loss": 0.0, "num_input_tokens_seen": 189386816, "step": 87810 }, { "epoch": 16.11580106441549, "grad_norm": 0.00016020082694012672, "learning_rate": 1.1057077072973121e-06, "loss": 0.0, "num_input_tokens_seen": 189397536, "step": 87815 }, { "epoch": 16.116718663975043, "grad_norm": 0.00012595528096426278, "learning_rate": 1.1052055228693147e-06, "loss": 0.0, "num_input_tokens_seen": 189408480, "step": 87820 }, { "epoch": 16.117636263534592, "grad_norm": 0.0006945864297449589, "learning_rate": 1.1047034383367606e-06, "loss": 0.0, "num_input_tokens_seen": 189420704, "step": 87825 }, { "epoch": 16.118553863094146, "grad_norm": 5.749936826759949e-05, "learning_rate": 1.1042014537125256e-06, "loss": 0.0, "num_input_tokens_seen": 189430080, "step": 87830 }, { "epoch": 16.1194714626537, "grad_norm": 0.00029479764634743333, "learning_rate": 1.1036995690094859e-06, "loss": 0.1035, "num_input_tokens_seen": 189440672, "step": 87835 }, { "epoch": 16.12038906221325, "grad_norm": 0.001310161780565977, "learning_rate": 1.1031977842405117e-06, "loss": 0.0, "num_input_tokens_seen": 189451296, "step": 87840 }, { "epoch": 16.121306661772802, "grad_norm": 0.0016894618747755885, "learning_rate": 1.1026960994184766e-06, "loss": 0.0, "num_input_tokens_seen": 189462336, "step": 87845 }, { "epoch": 16.122224261332356, "grad_norm": 0.00025140627985820174, "learning_rate": 1.1021945145562463e-06, "loss": 0.0, "num_input_tokens_seen": 189472064, "step": 87850 }, { "epoch": 16.123141860891906, "grad_norm": 3.722305336850695e-05, "learning_rate": 1.101693029666684e-06, "loss": 0.0, "num_input_tokens_seen": 189483104, "step": 87855 }, { "epoch": 16.12405946045146, "grad_norm": 0.0002335274184588343, "learning_rate": 1.1011916447626548e-06, "loss": 0.0, "num_input_tokens_seen": 189493920, "step": 87860 }, { "epoch": 16.124977060011012, "grad_norm": 0.00030607680673711, "learning_rate": 1.100690359857018e-06, "loss": 0.0, "num_input_tokens_seen": 189505216, "step": 87865 }, { "epoch": 16.125894659570562, "grad_norm": 0.00016392962425015867, "learning_rate": 1.1001891749626281e-06, "loss": 0.0, "num_input_tokens_seen": 189516736, "step": 87870 }, { "epoch": 16.126812259130116, "grad_norm": 0.00040924144559539855, "learning_rate": 1.0996880900923433e-06, "loss": 0.0, "num_input_tokens_seen": 189527168, "step": 87875 }, { "epoch": 16.12772985868967, "grad_norm": 0.0004333060351200402, "learning_rate": 1.0991871052590141e-06, "loss": 0.0, "num_input_tokens_seen": 189538048, "step": 87880 }, { "epoch": 16.12864745824922, "grad_norm": 0.00011319067561998963, "learning_rate": 1.09868622047549e-06, "loss": 0.0, "num_input_tokens_seen": 189549248, "step": 87885 }, { "epoch": 16.129565057808772, "grad_norm": 0.10016641020774841, "learning_rate": 1.0981854357546163e-06, "loss": 0.0, "num_input_tokens_seen": 189560832, "step": 87890 }, { "epoch": 16.130482657368326, "grad_norm": 0.002814068691805005, "learning_rate": 1.0976847511092403e-06, "loss": 0.0, "num_input_tokens_seen": 189571520, "step": 87895 }, { "epoch": 16.131400256927876, "grad_norm": 0.00029741550679318607, "learning_rate": 1.0971841665522026e-06, "loss": 0.0, "num_input_tokens_seen": 189582432, "step": 87900 }, { "epoch": 16.13231785648743, "grad_norm": 6.466338527388871e-05, "learning_rate": 1.0966836820963412e-06, "loss": 0.0, "num_input_tokens_seen": 189593984, "step": 87905 }, { "epoch": 16.133235456046982, "grad_norm": 6.201132055139169e-05, "learning_rate": 1.0961832977544944e-06, "loss": 0.0005, "num_input_tokens_seen": 189605152, "step": 87910 }, { "epoch": 16.134153055606532, "grad_norm": 0.00037274163332767785, "learning_rate": 1.0956830135394959e-06, "loss": 0.0, "num_input_tokens_seen": 189616928, "step": 87915 }, { "epoch": 16.135070655166086, "grad_norm": 9.38929442781955e-05, "learning_rate": 1.0951828294641753e-06, "loss": 0.0, "num_input_tokens_seen": 189627456, "step": 87920 }, { "epoch": 16.13598825472564, "grad_norm": 0.00012111309479223564, "learning_rate": 1.094682745541365e-06, "loss": 0.0, "num_input_tokens_seen": 189636736, "step": 87925 }, { "epoch": 16.13690585428519, "grad_norm": 0.0007172366604208946, "learning_rate": 1.0941827617838897e-06, "loss": 0.0, "num_input_tokens_seen": 189646880, "step": 87930 }, { "epoch": 16.137823453844742, "grad_norm": 0.00022580919903703034, "learning_rate": 1.0936828782045728e-06, "loss": 0.0, "num_input_tokens_seen": 189659040, "step": 87935 }, { "epoch": 16.138741053404296, "grad_norm": 0.00010231021587969735, "learning_rate": 1.0931830948162342e-06, "loss": 0.0, "num_input_tokens_seen": 189670144, "step": 87940 }, { "epoch": 16.139658652963845, "grad_norm": 7.245424058055505e-05, "learning_rate": 1.0926834116316958e-06, "loss": 0.0, "num_input_tokens_seen": 189680096, "step": 87945 }, { "epoch": 16.1405762525234, "grad_norm": 0.009653945453464985, "learning_rate": 1.0921838286637726e-06, "loss": 0.0, "num_input_tokens_seen": 189691136, "step": 87950 }, { "epoch": 16.141493852082952, "grad_norm": 9.269200381822884e-05, "learning_rate": 1.0916843459252756e-06, "loss": 0.0, "num_input_tokens_seen": 189701248, "step": 87955 }, { "epoch": 16.142411451642502, "grad_norm": 0.0001731348893372342, "learning_rate": 1.0911849634290194e-06, "loss": 0.0, "num_input_tokens_seen": 189713568, "step": 87960 }, { "epoch": 16.143329051202056, "grad_norm": 0.00022992401500232518, "learning_rate": 1.0906856811878107e-06, "loss": 0.0, "num_input_tokens_seen": 189724320, "step": 87965 }, { "epoch": 16.14424665076161, "grad_norm": 0.00045473454520106316, "learning_rate": 1.0901864992144556e-06, "loss": 0.0, "num_input_tokens_seen": 189735520, "step": 87970 }, { "epoch": 16.14516425032116, "grad_norm": 0.0007471648859791458, "learning_rate": 1.089687417521756e-06, "loss": 0.0, "num_input_tokens_seen": 189746432, "step": 87975 }, { "epoch": 16.146081849880712, "grad_norm": 0.0019683511927723885, "learning_rate": 1.0891884361225147e-06, "loss": 0.0, "num_input_tokens_seen": 189758080, "step": 87980 }, { "epoch": 16.146999449440266, "grad_norm": 0.00012305575364734977, "learning_rate": 1.0886895550295284e-06, "loss": 0.0, "num_input_tokens_seen": 189768416, "step": 87985 }, { "epoch": 16.147917048999815, "grad_norm": 0.00018282000382896513, "learning_rate": 1.088190774255592e-06, "loss": 0.0, "num_input_tokens_seen": 189779488, "step": 87990 }, { "epoch": 16.14883464855937, "grad_norm": 0.012710602022707462, "learning_rate": 1.087692093813501e-06, "loss": 0.0, "num_input_tokens_seen": 189790464, "step": 87995 }, { "epoch": 16.149752248118922, "grad_norm": 9.48052474996075e-05, "learning_rate": 1.0871935137160444e-06, "loss": 0.0, "num_input_tokens_seen": 189802880, "step": 88000 }, { "epoch": 16.150669847678472, "grad_norm": 0.00023199895804282278, "learning_rate": 1.0866950339760096e-06, "loss": 0.0, "num_input_tokens_seen": 189813344, "step": 88005 }, { "epoch": 16.151587447238025, "grad_norm": 0.000463741336716339, "learning_rate": 1.0861966546061819e-06, "loss": 0.0, "num_input_tokens_seen": 189824896, "step": 88010 }, { "epoch": 16.15250504679758, "grad_norm": 0.0003485378110781312, "learning_rate": 1.0856983756193435e-06, "loss": 0.0, "num_input_tokens_seen": 189836192, "step": 88015 }, { "epoch": 16.15342264635713, "grad_norm": 8.938341488828883e-05, "learning_rate": 1.085200197028276e-06, "loss": 0.0, "num_input_tokens_seen": 189847072, "step": 88020 }, { "epoch": 16.154340245916682, "grad_norm": 0.0012944024056196213, "learning_rate": 1.0847021188457563e-06, "loss": 0.0, "num_input_tokens_seen": 189856640, "step": 88025 }, { "epoch": 16.155257845476235, "grad_norm": 0.0006655732868239284, "learning_rate": 1.084204141084559e-06, "loss": 0.0, "num_input_tokens_seen": 189868480, "step": 88030 }, { "epoch": 16.156175445035785, "grad_norm": 0.0001762019091984257, "learning_rate": 1.0837062637574563e-06, "loss": 0.0011, "num_input_tokens_seen": 189879808, "step": 88035 }, { "epoch": 16.15709304459534, "grad_norm": 0.00024167315859813243, "learning_rate": 1.083208486877217e-06, "loss": 0.0, "num_input_tokens_seen": 189891264, "step": 88040 }, { "epoch": 16.158010644154892, "grad_norm": 0.00040670891758054495, "learning_rate": 1.082710810456611e-06, "loss": 0.0, "num_input_tokens_seen": 189901152, "step": 88045 }, { "epoch": 16.158928243714442, "grad_norm": 6.256847700569779e-05, "learning_rate": 1.0822132345084014e-06, "loss": 0.0, "num_input_tokens_seen": 189912736, "step": 88050 }, { "epoch": 16.159845843273995, "grad_norm": 4.945489126839675e-05, "learning_rate": 1.0817157590453487e-06, "loss": 0.0, "num_input_tokens_seen": 189923776, "step": 88055 }, { "epoch": 16.16076344283355, "grad_norm": 8.467773295706138e-05, "learning_rate": 1.0812183840802154e-06, "loss": 0.0, "num_input_tokens_seen": 189934560, "step": 88060 }, { "epoch": 16.1616810423931, "grad_norm": 0.00011137239926028997, "learning_rate": 1.0807211096257576e-06, "loss": 0.0, "num_input_tokens_seen": 189946944, "step": 88065 }, { "epoch": 16.162598641952652, "grad_norm": 0.00024284613027703017, "learning_rate": 1.0802239356947285e-06, "loss": 0.0, "num_input_tokens_seen": 189958240, "step": 88070 }, { "epoch": 16.163516241512205, "grad_norm": 0.0002984062011819333, "learning_rate": 1.0797268622998791e-06, "loss": 0.0, "num_input_tokens_seen": 189968096, "step": 88075 }, { "epoch": 16.164433841071755, "grad_norm": 0.0006394424708560109, "learning_rate": 1.079229889453961e-06, "loss": 0.0, "num_input_tokens_seen": 189979488, "step": 88080 }, { "epoch": 16.16535144063131, "grad_norm": 8.997275290312245e-05, "learning_rate": 1.0787330171697197e-06, "loss": 0.0, "num_input_tokens_seen": 189991168, "step": 88085 }, { "epoch": 16.166269040190862, "grad_norm": 0.0001034025990520604, "learning_rate": 1.0782362454598978e-06, "loss": 0.0, "num_input_tokens_seen": 190000672, "step": 88090 }, { "epoch": 16.167186639750412, "grad_norm": 9.407856123289093e-05, "learning_rate": 1.0777395743372392e-06, "loss": 0.0, "num_input_tokens_seen": 190013056, "step": 88095 }, { "epoch": 16.168104239309965, "grad_norm": 8.395363693125546e-05, "learning_rate": 1.0772430038144822e-06, "loss": 0.0, "num_input_tokens_seen": 190024416, "step": 88100 }, { "epoch": 16.16902183886952, "grad_norm": 0.0001765379129210487, "learning_rate": 1.0767465339043615e-06, "loss": 0.0, "num_input_tokens_seen": 190034336, "step": 88105 }, { "epoch": 16.16993943842907, "grad_norm": 0.00026888155844062567, "learning_rate": 1.076250164619611e-06, "loss": 0.0, "num_input_tokens_seen": 190045504, "step": 88110 }, { "epoch": 16.170857037988622, "grad_norm": 0.0001683735754340887, "learning_rate": 1.0757538959729635e-06, "loss": 0.0, "num_input_tokens_seen": 190056832, "step": 88115 }, { "epoch": 16.171774637548175, "grad_norm": 0.0001526318083051592, "learning_rate": 1.075257727977147e-06, "loss": 0.0, "num_input_tokens_seen": 190067488, "step": 88120 }, { "epoch": 16.172692237107725, "grad_norm": 7.908482803031802e-05, "learning_rate": 1.0747616606448853e-06, "loss": 0.0, "num_input_tokens_seen": 190077184, "step": 88125 }, { "epoch": 16.17360983666728, "grad_norm": 0.0003293266345281154, "learning_rate": 1.0742656939889046e-06, "loss": 0.0, "num_input_tokens_seen": 190088832, "step": 88130 }, { "epoch": 16.174527436226832, "grad_norm": 0.00011813483433797956, "learning_rate": 1.073769828021925e-06, "loss": 0.0, "num_input_tokens_seen": 190098560, "step": 88135 }, { "epoch": 16.17544503578638, "grad_norm": 0.003958697430789471, "learning_rate": 1.0732740627566623e-06, "loss": 0.0, "num_input_tokens_seen": 190109728, "step": 88140 }, { "epoch": 16.176362635345935, "grad_norm": 9.796674567041919e-05, "learning_rate": 1.072778398205836e-06, "loss": 0.0, "num_input_tokens_seen": 190120288, "step": 88145 }, { "epoch": 16.17728023490549, "grad_norm": 0.00023441045777872205, "learning_rate": 1.0722828343821568e-06, "loss": 0.0, "num_input_tokens_seen": 190130880, "step": 88150 }, { "epoch": 16.17819783446504, "grad_norm": 0.00013606090215034783, "learning_rate": 1.0717873712983357e-06, "loss": 0.0, "num_input_tokens_seen": 190142304, "step": 88155 }, { "epoch": 16.17911543402459, "grad_norm": 0.0010409797541797161, "learning_rate": 1.0712920089670787e-06, "loss": 0.0, "num_input_tokens_seen": 190152448, "step": 88160 }, { "epoch": 16.180033033584145, "grad_norm": 0.00012152252020314336, "learning_rate": 1.0707967474010937e-06, "loss": 0.0, "num_input_tokens_seen": 190162656, "step": 88165 }, { "epoch": 16.180950633143695, "grad_norm": 0.00010866260709008202, "learning_rate": 1.0703015866130833e-06, "loss": 0.0, "num_input_tokens_seen": 190174144, "step": 88170 }, { "epoch": 16.18186823270325, "grad_norm": 0.004298864398151636, "learning_rate": 1.0698065266157447e-06, "loss": 0.0, "num_input_tokens_seen": 190184544, "step": 88175 }, { "epoch": 16.1827858322628, "grad_norm": 8.424698899034411e-05, "learning_rate": 1.069311567421779e-06, "loss": 0.0, "num_input_tokens_seen": 190195264, "step": 88180 }, { "epoch": 16.18370343182235, "grad_norm": 0.00010197629308095202, "learning_rate": 1.06881670904388e-06, "loss": 0.0, "num_input_tokens_seen": 190207584, "step": 88185 }, { "epoch": 16.184621031381905, "grad_norm": 7.694983651163056e-05, "learning_rate": 1.0683219514947379e-06, "loss": 0.0, "num_input_tokens_seen": 190218304, "step": 88190 }, { "epoch": 16.18553863094146, "grad_norm": 0.0015940496232360601, "learning_rate": 1.0678272947870455e-06, "loss": 0.0, "num_input_tokens_seen": 190230176, "step": 88195 }, { "epoch": 16.18645623050101, "grad_norm": 9.613981819711626e-05, "learning_rate": 1.0673327389334886e-06, "loss": 0.0, "num_input_tokens_seen": 190241120, "step": 88200 }, { "epoch": 16.18737383006056, "grad_norm": 9.619029151508585e-05, "learning_rate": 1.0668382839467522e-06, "loss": 0.0, "num_input_tokens_seen": 190251264, "step": 88205 }, { "epoch": 16.188291429620115, "grad_norm": 6.108161323936656e-05, "learning_rate": 1.0663439298395162e-06, "loss": 0.0, "num_input_tokens_seen": 190262400, "step": 88210 }, { "epoch": 16.189209029179665, "grad_norm": 0.0007730303332209587, "learning_rate": 1.0658496766244636e-06, "loss": 0.0, "num_input_tokens_seen": 190273184, "step": 88215 }, { "epoch": 16.19012662873922, "grad_norm": 0.00033859864925034344, "learning_rate": 1.0653555243142694e-06, "loss": 0.0, "num_input_tokens_seen": 190283776, "step": 88220 }, { "epoch": 16.19104422829877, "grad_norm": 0.000222183924051933, "learning_rate": 1.0648614729216072e-06, "loss": 0.0, "num_input_tokens_seen": 190294048, "step": 88225 }, { "epoch": 16.19196182785832, "grad_norm": 0.0001053379601216875, "learning_rate": 1.06436752245915e-06, "loss": 0.0, "num_input_tokens_seen": 190304736, "step": 88230 }, { "epoch": 16.192879427417875, "grad_norm": 0.0024547246284782887, "learning_rate": 1.063873672939566e-06, "loss": 0.0, "num_input_tokens_seen": 190315360, "step": 88235 }, { "epoch": 16.19379702697743, "grad_norm": 8.679778693476692e-05, "learning_rate": 1.0633799243755199e-06, "loss": 0.0, "num_input_tokens_seen": 190326432, "step": 88240 }, { "epoch": 16.194714626536978, "grad_norm": 7.339607691392303e-05, "learning_rate": 1.0628862767796799e-06, "loss": 0.0, "num_input_tokens_seen": 190337312, "step": 88245 }, { "epoch": 16.19563222609653, "grad_norm": 7.555680349469185e-05, "learning_rate": 1.0623927301647042e-06, "loss": 0.0, "num_input_tokens_seen": 190347104, "step": 88250 }, { "epoch": 16.196549825656085, "grad_norm": 4.275199171388522e-05, "learning_rate": 1.0618992845432525e-06, "loss": 0.0, "num_input_tokens_seen": 190358336, "step": 88255 }, { "epoch": 16.197467425215635, "grad_norm": 0.00033019043621607125, "learning_rate": 1.0614059399279792e-06, "loss": 0.0, "num_input_tokens_seen": 190369184, "step": 88260 }, { "epoch": 16.198385024775188, "grad_norm": 0.00025188896688632667, "learning_rate": 1.0609126963315407e-06, "loss": 0.0002, "num_input_tokens_seen": 190380032, "step": 88265 }, { "epoch": 16.19930262433474, "grad_norm": 9.725341806188226e-05, "learning_rate": 1.0604195537665861e-06, "loss": 0.0, "num_input_tokens_seen": 190391200, "step": 88270 }, { "epoch": 16.20022022389429, "grad_norm": 8.32348523545079e-05, "learning_rate": 1.0599265122457637e-06, "loss": 0.0, "num_input_tokens_seen": 190402272, "step": 88275 }, { "epoch": 16.201137823453845, "grad_norm": 0.005373919848352671, "learning_rate": 1.0594335717817207e-06, "loss": 0.0, "num_input_tokens_seen": 190414016, "step": 88280 }, { "epoch": 16.202055423013398, "grad_norm": 0.0024286708794534206, "learning_rate": 1.0589407323870988e-06, "loss": 0.0, "num_input_tokens_seen": 190423520, "step": 88285 }, { "epoch": 16.202973022572948, "grad_norm": 0.0004633502976503223, "learning_rate": 1.058447994074539e-06, "loss": 0.0, "num_input_tokens_seen": 190434976, "step": 88290 }, { "epoch": 16.2038906221325, "grad_norm": 0.0007017249008640647, "learning_rate": 1.0579553568566787e-06, "loss": 0.0, "num_input_tokens_seen": 190445248, "step": 88295 }, { "epoch": 16.204808221692055, "grad_norm": 5.694953870261088e-05, "learning_rate": 1.0574628207461546e-06, "loss": 0.0, "num_input_tokens_seen": 190455552, "step": 88300 }, { "epoch": 16.205725821251605, "grad_norm": 0.0057179806753993034, "learning_rate": 1.0569703857555992e-06, "loss": 0.0, "num_input_tokens_seen": 190467968, "step": 88305 }, { "epoch": 16.206643420811158, "grad_norm": 0.013253790326416492, "learning_rate": 1.0564780518976403e-06, "loss": 0.0, "num_input_tokens_seen": 190477888, "step": 88310 }, { "epoch": 16.20756102037071, "grad_norm": 0.00011849980364786461, "learning_rate": 1.0559858191849092e-06, "loss": 0.0003, "num_input_tokens_seen": 190488896, "step": 88315 }, { "epoch": 16.20847861993026, "grad_norm": 0.00014217307034414262, "learning_rate": 1.0554936876300292e-06, "loss": 0.0, "num_input_tokens_seen": 190499328, "step": 88320 }, { "epoch": 16.209396219489815, "grad_norm": 0.00025939205079339445, "learning_rate": 1.0550016572456212e-06, "loss": 0.0, "num_input_tokens_seen": 190509920, "step": 88325 }, { "epoch": 16.210313819049368, "grad_norm": 0.001140063744969666, "learning_rate": 1.0545097280443078e-06, "loss": 0.0, "num_input_tokens_seen": 190521472, "step": 88330 }, { "epoch": 16.211231418608918, "grad_norm": 10.78144645690918, "learning_rate": 1.0540179000387053e-06, "loss": 0.001, "num_input_tokens_seen": 190532096, "step": 88335 }, { "epoch": 16.21214901816847, "grad_norm": 0.0009281915263272822, "learning_rate": 1.0535261732414276e-06, "loss": 0.0, "num_input_tokens_seen": 190541920, "step": 88340 }, { "epoch": 16.213066617728025, "grad_norm": 7.94104125816375e-05, "learning_rate": 1.053034547665086e-06, "loss": 0.0, "num_input_tokens_seen": 190552992, "step": 88345 }, { "epoch": 16.213984217287575, "grad_norm": 8.866374992066994e-05, "learning_rate": 1.0525430233222922e-06, "loss": 0.0, "num_input_tokens_seen": 190563616, "step": 88350 }, { "epoch": 16.214901816847128, "grad_norm": 0.0004284561728127301, "learning_rate": 1.052051600225652e-06, "loss": 0.0, "num_input_tokens_seen": 190575328, "step": 88355 }, { "epoch": 16.21581941640668, "grad_norm": 0.00013887969544157386, "learning_rate": 1.0515602783877676e-06, "loss": 0.0, "num_input_tokens_seen": 190586240, "step": 88360 }, { "epoch": 16.21673701596623, "grad_norm": 6.755255162715912e-05, "learning_rate": 1.0510690578212447e-06, "loss": 0.0, "num_input_tokens_seen": 190595488, "step": 88365 }, { "epoch": 16.217654615525785, "grad_norm": 0.00011020624515367672, "learning_rate": 1.0505779385386795e-06, "loss": 0.0, "num_input_tokens_seen": 190606752, "step": 88370 }, { "epoch": 16.218572215085338, "grad_norm": 8.059287210926414e-05, "learning_rate": 1.0500869205526681e-06, "loss": 0.0, "num_input_tokens_seen": 190617344, "step": 88375 }, { "epoch": 16.219489814644888, "grad_norm": 0.00017617411504033953, "learning_rate": 1.0495960038758063e-06, "loss": 0.0, "num_input_tokens_seen": 190626048, "step": 88380 }, { "epoch": 16.22040741420444, "grad_norm": 9.687989950180054e-05, "learning_rate": 1.049105188520685e-06, "loss": 0.0, "num_input_tokens_seen": 190637472, "step": 88385 }, { "epoch": 16.221325013763995, "grad_norm": 0.0002151843364117667, "learning_rate": 1.0486144744998922e-06, "loss": 0.0, "num_input_tokens_seen": 190648448, "step": 88390 }, { "epoch": 16.222242613323544, "grad_norm": 0.00010762140300357714, "learning_rate": 1.0481238618260126e-06, "loss": 0.0, "num_input_tokens_seen": 190659136, "step": 88395 }, { "epoch": 16.223160212883098, "grad_norm": 0.0001187239759019576, "learning_rate": 1.047633350511632e-06, "loss": 0.0, "num_input_tokens_seen": 190671200, "step": 88400 }, { "epoch": 16.22407781244265, "grad_norm": 0.00015489337965846062, "learning_rate": 1.0471429405693307e-06, "loss": 0.0, "num_input_tokens_seen": 190681792, "step": 88405 }, { "epoch": 16.2249954120022, "grad_norm": 0.00023249711375683546, "learning_rate": 1.0466526320116854e-06, "loss": 0.0, "num_input_tokens_seen": 190692992, "step": 88410 }, { "epoch": 16.225913011561754, "grad_norm": 0.0005579875432886183, "learning_rate": 1.0461624248512741e-06, "loss": 0.0, "num_input_tokens_seen": 190701952, "step": 88415 }, { "epoch": 16.226830611121308, "grad_norm": 5.8406669268151745e-05, "learning_rate": 1.045672319100669e-06, "loss": 0.0, "num_input_tokens_seen": 190711840, "step": 88420 }, { "epoch": 16.227748210680858, "grad_norm": 0.0001311576197622344, "learning_rate": 1.04518231477244e-06, "loss": 0.0, "num_input_tokens_seen": 190722080, "step": 88425 }, { "epoch": 16.22866581024041, "grad_norm": 0.0001496077748015523, "learning_rate": 1.0446924118791552e-06, "loss": 0.0, "num_input_tokens_seen": 190733088, "step": 88430 }, { "epoch": 16.229583409799964, "grad_norm": 7.960259972605854e-05, "learning_rate": 1.0442026104333785e-06, "loss": 0.0, "num_input_tokens_seen": 190742752, "step": 88435 }, { "epoch": 16.230501009359514, "grad_norm": 7.174881466198713e-05, "learning_rate": 1.0437129104476756e-06, "loss": 0.0, "num_input_tokens_seen": 190752736, "step": 88440 }, { "epoch": 16.231418608919068, "grad_norm": 0.0004100358346477151, "learning_rate": 1.0432233119346047e-06, "loss": 0.0, "num_input_tokens_seen": 190763392, "step": 88445 }, { "epoch": 16.23233620847862, "grad_norm": 8.051044278545305e-05, "learning_rate": 1.042733814906723e-06, "loss": 0.0, "num_input_tokens_seen": 190774528, "step": 88450 }, { "epoch": 16.23325380803817, "grad_norm": 0.00017748924437910318, "learning_rate": 1.0422444193765862e-06, "loss": 0.0, "num_input_tokens_seen": 190785312, "step": 88455 }, { "epoch": 16.234171407597724, "grad_norm": 0.00017194717656821012, "learning_rate": 1.0417551253567447e-06, "loss": 0.0, "num_input_tokens_seen": 190797152, "step": 88460 }, { "epoch": 16.235089007157278, "grad_norm": 0.0004917133483104408, "learning_rate": 1.0412659328597507e-06, "loss": 0.0, "num_input_tokens_seen": 190806688, "step": 88465 }, { "epoch": 16.236006606716828, "grad_norm": 0.0007362633477896452, "learning_rate": 1.0407768418981501e-06, "loss": 0.0, "num_input_tokens_seen": 190817152, "step": 88470 }, { "epoch": 16.23692420627638, "grad_norm": 8.444949344266206e-05, "learning_rate": 1.0402878524844872e-06, "loss": 0.0, "num_input_tokens_seen": 190827776, "step": 88475 }, { "epoch": 16.237841805835934, "grad_norm": 0.00036503863520920277, "learning_rate": 1.0397989646313022e-06, "loss": 0.0, "num_input_tokens_seen": 190839392, "step": 88480 }, { "epoch": 16.238759405395484, "grad_norm": 5.98218830418773e-05, "learning_rate": 1.0393101783511377e-06, "loss": 0.0, "num_input_tokens_seen": 190850080, "step": 88485 }, { "epoch": 16.239677004955038, "grad_norm": 0.00023290832177735865, "learning_rate": 1.038821493656529e-06, "loss": 0.0, "num_input_tokens_seen": 190859872, "step": 88490 }, { "epoch": 16.24059460451459, "grad_norm": 0.00024637329624965787, "learning_rate": 1.0383329105600082e-06, "loss": 0.0, "num_input_tokens_seen": 190870784, "step": 88495 }, { "epoch": 16.24151220407414, "grad_norm": 7.566460408270359e-05, "learning_rate": 1.0378444290741092e-06, "loss": 0.0, "num_input_tokens_seen": 190882400, "step": 88500 }, { "epoch": 16.242429803633694, "grad_norm": 6.956215656828135e-05, "learning_rate": 1.0373560492113598e-06, "loss": 0.0, "num_input_tokens_seen": 190893184, "step": 88505 }, { "epoch": 16.243347403193248, "grad_norm": 0.0005604529869742692, "learning_rate": 1.036867770984285e-06, "loss": 0.0, "num_input_tokens_seen": 190902464, "step": 88510 }, { "epoch": 16.244265002752797, "grad_norm": 0.00011620402801781893, "learning_rate": 1.0363795944054112e-06, "loss": 0.0, "num_input_tokens_seen": 190912704, "step": 88515 }, { "epoch": 16.24518260231235, "grad_norm": 0.0002519233385100961, "learning_rate": 1.0358915194872576e-06, "loss": 0.0, "num_input_tokens_seen": 190922688, "step": 88520 }, { "epoch": 16.246100201871904, "grad_norm": 0.00011796661419793963, "learning_rate": 1.0354035462423423e-06, "loss": 0.0, "num_input_tokens_seen": 190933952, "step": 88525 }, { "epoch": 16.247017801431454, "grad_norm": 0.0006193601875565946, "learning_rate": 1.0349156746831807e-06, "loss": 0.0, "num_input_tokens_seen": 190945216, "step": 88530 }, { "epoch": 16.247935400991008, "grad_norm": 0.002141515724360943, "learning_rate": 1.0344279048222877e-06, "loss": 0.0, "num_input_tokens_seen": 190956992, "step": 88535 }, { "epoch": 16.24885300055056, "grad_norm": 6.236438639461994e-05, "learning_rate": 1.033940236672173e-06, "loss": 0.0, "num_input_tokens_seen": 190966944, "step": 88540 }, { "epoch": 16.24977060011011, "grad_norm": 0.00020586425671353936, "learning_rate": 1.0334526702453429e-06, "loss": 0.0, "num_input_tokens_seen": 190975488, "step": 88545 }, { "epoch": 16.250688199669664, "grad_norm": 4.9132351705338806e-05, "learning_rate": 1.032965205554306e-06, "loss": 0.0, "num_input_tokens_seen": 190986080, "step": 88550 }, { "epoch": 16.251605799229218, "grad_norm": 9.763244452187791e-05, "learning_rate": 1.0324778426115628e-06, "loss": 0.0, "num_input_tokens_seen": 190996896, "step": 88555 }, { "epoch": 16.252523398788767, "grad_norm": 0.00012153121497249231, "learning_rate": 1.031990581429614e-06, "loss": 0.0, "num_input_tokens_seen": 191009472, "step": 88560 }, { "epoch": 16.25344099834832, "grad_norm": 0.0006604111986234784, "learning_rate": 1.0315034220209553e-06, "loss": 0.0, "num_input_tokens_seen": 191020000, "step": 88565 }, { "epoch": 16.254358597907874, "grad_norm": 0.0009005329920910299, "learning_rate": 1.0310163643980848e-06, "loss": 0.0, "num_input_tokens_seen": 191031136, "step": 88570 }, { "epoch": 16.255276197467424, "grad_norm": 0.000461395742604509, "learning_rate": 1.0305294085734935e-06, "loss": 0.0, "num_input_tokens_seen": 191041952, "step": 88575 }, { "epoch": 16.256193797026977, "grad_norm": 0.00011310025001876056, "learning_rate": 1.0300425545596686e-06, "loss": 0.0, "num_input_tokens_seen": 191051936, "step": 88580 }, { "epoch": 16.25711139658653, "grad_norm": 0.0013474992010742426, "learning_rate": 1.0295558023691016e-06, "loss": 0.0, "num_input_tokens_seen": 191063040, "step": 88585 }, { "epoch": 16.25802899614608, "grad_norm": 3.531725815264508e-05, "learning_rate": 1.0290691520142737e-06, "loss": 0.0, "num_input_tokens_seen": 191073472, "step": 88590 }, { "epoch": 16.258946595705634, "grad_norm": 7.976518099894747e-05, "learning_rate": 1.0285826035076667e-06, "loss": 0.0, "num_input_tokens_seen": 191084832, "step": 88595 }, { "epoch": 16.259864195265187, "grad_norm": 0.00012468175555113703, "learning_rate": 1.0280961568617626e-06, "loss": 0.0, "num_input_tokens_seen": 191094688, "step": 88600 }, { "epoch": 16.260781794824737, "grad_norm": 0.00013068846601527184, "learning_rate": 1.027609812089036e-06, "loss": 0.0, "num_input_tokens_seen": 191106080, "step": 88605 }, { "epoch": 16.26169939438429, "grad_norm": 0.0011312479618936777, "learning_rate": 1.0271235692019605e-06, "loss": 0.0, "num_input_tokens_seen": 191116384, "step": 88610 }, { "epoch": 16.262616993943844, "grad_norm": 9.66400039033033e-05, "learning_rate": 1.026637428213007e-06, "loss": 0.0, "num_input_tokens_seen": 191126400, "step": 88615 }, { "epoch": 16.263534593503394, "grad_norm": 8.772163710091263e-05, "learning_rate": 1.0261513891346469e-06, "loss": 0.0, "num_input_tokens_seen": 191138432, "step": 88620 }, { "epoch": 16.264452193062947, "grad_norm": 0.08544459193944931, "learning_rate": 1.0256654519793447e-06, "loss": 0.0, "num_input_tokens_seen": 191148960, "step": 88625 }, { "epoch": 16.2653697926225, "grad_norm": 0.00015436497051268816, "learning_rate": 1.0251796167595623e-06, "loss": 0.0, "num_input_tokens_seen": 191159648, "step": 88630 }, { "epoch": 16.26628739218205, "grad_norm": 0.0001944220857694745, "learning_rate": 1.024693883487764e-06, "loss": 0.0, "num_input_tokens_seen": 191170528, "step": 88635 }, { "epoch": 16.267204991741604, "grad_norm": 6.807039608247578e-05, "learning_rate": 1.0242082521764062e-06, "loss": 0.0, "num_input_tokens_seen": 191181056, "step": 88640 }, { "epoch": 16.268122591301157, "grad_norm": 0.002504191128537059, "learning_rate": 1.0237227228379448e-06, "loss": 0.0, "num_input_tokens_seen": 191190976, "step": 88645 }, { "epoch": 16.269040190860707, "grad_norm": 9.660288924351335e-05, "learning_rate": 1.0232372954848335e-06, "loss": 0.0, "num_input_tokens_seen": 191202432, "step": 88650 }, { "epoch": 16.26995779042026, "grad_norm": 7.397420995403081e-05, "learning_rate": 1.0227519701295203e-06, "loss": 0.0, "num_input_tokens_seen": 191212864, "step": 88655 }, { "epoch": 16.270875389979814, "grad_norm": 0.0009075879352167249, "learning_rate": 1.022266746784456e-06, "loss": 0.0, "num_input_tokens_seen": 191223968, "step": 88660 }, { "epoch": 16.271792989539364, "grad_norm": 9.036387200467288e-05, "learning_rate": 1.021781625462085e-06, "loss": 0.0, "num_input_tokens_seen": 191234336, "step": 88665 }, { "epoch": 16.272710589098917, "grad_norm": 0.00017219939036294818, "learning_rate": 1.0212966061748497e-06, "loss": 0.0, "num_input_tokens_seen": 191245120, "step": 88670 }, { "epoch": 16.27362818865847, "grad_norm": 0.002364229876548052, "learning_rate": 1.0208116889351899e-06, "loss": 0.0, "num_input_tokens_seen": 191255616, "step": 88675 }, { "epoch": 16.27454578821802, "grad_norm": 0.012001111172139645, "learning_rate": 1.0203268737555417e-06, "loss": 0.0, "num_input_tokens_seen": 191266624, "step": 88680 }, { "epoch": 16.275463387777574, "grad_norm": 0.00015862785221543163, "learning_rate": 1.0198421606483427e-06, "loss": 0.0, "num_input_tokens_seen": 191277984, "step": 88685 }, { "epoch": 16.276380987337127, "grad_norm": 7.812443072907627e-05, "learning_rate": 1.0193575496260238e-06, "loss": 0.0, "num_input_tokens_seen": 191289184, "step": 88690 }, { "epoch": 16.277298586896677, "grad_norm": 7.554287003586069e-05, "learning_rate": 1.0188730407010129e-06, "loss": 0.0, "num_input_tokens_seen": 191298528, "step": 88695 }, { "epoch": 16.27821618645623, "grad_norm": 0.000492845952976495, "learning_rate": 1.018388633885739e-06, "loss": 0.0, "num_input_tokens_seen": 191307808, "step": 88700 }, { "epoch": 16.279133786015784, "grad_norm": 9.43164704949595e-05, "learning_rate": 1.0179043291926267e-06, "loss": 0.0, "num_input_tokens_seen": 191318688, "step": 88705 }, { "epoch": 16.280051385575334, "grad_norm": 9.581461199559271e-05, "learning_rate": 1.017420126634096e-06, "loss": 0.0, "num_input_tokens_seen": 191329472, "step": 88710 }, { "epoch": 16.280968985134887, "grad_norm": 0.0003693012986332178, "learning_rate": 1.0169360262225653e-06, "loss": 0.0, "num_input_tokens_seen": 191340640, "step": 88715 }, { "epoch": 16.28188658469444, "grad_norm": 0.00013977635535411537, "learning_rate": 1.0164520279704538e-06, "loss": 0.0, "num_input_tokens_seen": 191352320, "step": 88720 }, { "epoch": 16.28280418425399, "grad_norm": 0.0001251089561264962, "learning_rate": 1.0159681318901738e-06, "loss": 0.0, "num_input_tokens_seen": 191363008, "step": 88725 }, { "epoch": 16.283721783813544, "grad_norm": 8.083422289928421e-05, "learning_rate": 1.0154843379941354e-06, "loss": 0.0, "num_input_tokens_seen": 191373024, "step": 88730 }, { "epoch": 16.284639383373097, "grad_norm": 0.0001310253283008933, "learning_rate": 1.0150006462947493e-06, "loss": 0.0001, "num_input_tokens_seen": 191384512, "step": 88735 }, { "epoch": 16.285556982932647, "grad_norm": 0.0004108519933652133, "learning_rate": 1.014517056804421e-06, "loss": 0.0, "num_input_tokens_seen": 191394976, "step": 88740 }, { "epoch": 16.2864745824922, "grad_norm": 0.0016607868019491434, "learning_rate": 1.0140335695355525e-06, "loss": 0.0, "num_input_tokens_seen": 191406240, "step": 88745 }, { "epoch": 16.287392182051754, "grad_norm": 9.59045355557464e-05, "learning_rate": 1.0135501845005446e-06, "loss": 0.0, "num_input_tokens_seen": 191417312, "step": 88750 }, { "epoch": 16.288309781611304, "grad_norm": 0.00011045943392673507, "learning_rate": 1.0130669017117967e-06, "loss": 0.0, "num_input_tokens_seen": 191426272, "step": 88755 }, { "epoch": 16.289227381170857, "grad_norm": 0.010995537973940372, "learning_rate": 1.0125837211817042e-06, "loss": 0.0, "num_input_tokens_seen": 191437920, "step": 88760 }, { "epoch": 16.29014498073041, "grad_norm": 0.0012074115220457315, "learning_rate": 1.0121006429226575e-06, "loss": 0.0012, "num_input_tokens_seen": 191448480, "step": 88765 }, { "epoch": 16.29106258028996, "grad_norm": 0.00027462615980766714, "learning_rate": 1.01161766694705e-06, "loss": 0.0, "num_input_tokens_seen": 191458816, "step": 88770 }, { "epoch": 16.291980179849514, "grad_norm": 6.463935278588906e-05, "learning_rate": 1.0111347932672682e-06, "loss": 0.0, "num_input_tokens_seen": 191470624, "step": 88775 }, { "epoch": 16.292897779409067, "grad_norm": 0.00021804490825161338, "learning_rate": 1.0106520218956955e-06, "loss": 0.0, "num_input_tokens_seen": 191480768, "step": 88780 }, { "epoch": 16.293815378968617, "grad_norm": 0.00025931952404789627, "learning_rate": 1.0101693528447166e-06, "loss": 0.0, "num_input_tokens_seen": 191491040, "step": 88785 }, { "epoch": 16.29473297852817, "grad_norm": 0.0010984578402712941, "learning_rate": 1.0096867861267102e-06, "loss": 0.0, "num_input_tokens_seen": 191502880, "step": 88790 }, { "epoch": 16.295650578087724, "grad_norm": 0.00027528218924999237, "learning_rate": 1.0092043217540536e-06, "loss": 0.0, "num_input_tokens_seen": 191513440, "step": 88795 }, { "epoch": 16.296568177647274, "grad_norm": 0.00016255042282864451, "learning_rate": 1.008721959739119e-06, "loss": 0.0, "num_input_tokens_seen": 191523456, "step": 88800 }, { "epoch": 16.297485777206827, "grad_norm": 9.044300531968474e-05, "learning_rate": 1.0082397000942823e-06, "loss": 0.0, "num_input_tokens_seen": 191534848, "step": 88805 }, { "epoch": 16.29840337676638, "grad_norm": 8.696052827872336e-05, "learning_rate": 1.0077575428319096e-06, "loss": 0.0, "num_input_tokens_seen": 191545664, "step": 88810 }, { "epoch": 16.29932097632593, "grad_norm": 0.00022045955120120198, "learning_rate": 1.0072754879643682e-06, "loss": 0.0, "num_input_tokens_seen": 191556960, "step": 88815 }, { "epoch": 16.300238575885484, "grad_norm": 0.0005148694035597146, "learning_rate": 1.0067935355040231e-06, "loss": 0.0, "num_input_tokens_seen": 191567104, "step": 88820 }, { "epoch": 16.301156175445037, "grad_norm": 0.004562122747302055, "learning_rate": 1.0063116854632355e-06, "loss": 0.0, "num_input_tokens_seen": 191577440, "step": 88825 }, { "epoch": 16.302073775004587, "grad_norm": 0.002284718444570899, "learning_rate": 1.0058299378543617e-06, "loss": 0.0, "num_input_tokens_seen": 191587904, "step": 88830 }, { "epoch": 16.30299137456414, "grad_norm": 0.0005558542325161397, "learning_rate": 1.0053482926897607e-06, "loss": 0.0, "num_input_tokens_seen": 191598336, "step": 88835 }, { "epoch": 16.303908974123694, "grad_norm": 0.0009185700910165906, "learning_rate": 1.0048667499817854e-06, "loss": 0.0, "num_input_tokens_seen": 191608256, "step": 88840 }, { "epoch": 16.304826573683243, "grad_norm": 0.00026224361499771476, "learning_rate": 1.0043853097427859e-06, "loss": 0.0, "num_input_tokens_seen": 191619840, "step": 88845 }, { "epoch": 16.305744173242797, "grad_norm": 0.0002722146746236831, "learning_rate": 1.003903971985109e-06, "loss": 0.0007, "num_input_tokens_seen": 191630976, "step": 88850 }, { "epoch": 16.30666177280235, "grad_norm": 0.002980683697387576, "learning_rate": 1.0034227367211036e-06, "loss": 0.0, "num_input_tokens_seen": 191641632, "step": 88855 }, { "epoch": 16.3075793723619, "grad_norm": 0.0005278682801872492, "learning_rate": 1.0029416039631101e-06, "loss": 0.0, "num_input_tokens_seen": 191651808, "step": 88860 }, { "epoch": 16.308496971921453, "grad_norm": 0.00010685018787626177, "learning_rate": 1.0024605737234705e-06, "loss": 0.0, "num_input_tokens_seen": 191661632, "step": 88865 }, { "epoch": 16.309414571481007, "grad_norm": 0.00040710013126954436, "learning_rate": 1.0019796460145209e-06, "loss": 0.0, "num_input_tokens_seen": 191672480, "step": 88870 }, { "epoch": 16.310332171040557, "grad_norm": 6.173145084176213e-05, "learning_rate": 1.001498820848596e-06, "loss": 0.0, "num_input_tokens_seen": 191681984, "step": 88875 }, { "epoch": 16.31124977060011, "grad_norm": 0.24610868096351624, "learning_rate": 1.0010180982380303e-06, "loss": 0.0002, "num_input_tokens_seen": 191694272, "step": 88880 }, { "epoch": 16.312167370159663, "grad_norm": 0.00322686112485826, "learning_rate": 1.0005374781951526e-06, "loss": 0.0, "num_input_tokens_seen": 191704832, "step": 88885 }, { "epoch": 16.313084969719213, "grad_norm": 7.108243153197691e-05, "learning_rate": 1.0000569607322902e-06, "loss": 0.0, "num_input_tokens_seen": 191716384, "step": 88890 }, { "epoch": 16.314002569278767, "grad_norm": 0.0004829942772630602, "learning_rate": 9.995765458617674e-07, "loss": 0.0, "num_input_tokens_seen": 191728224, "step": 88895 }, { "epoch": 16.31492016883832, "grad_norm": 0.00017310109979007393, "learning_rate": 9.990962335959047e-07, "loss": 0.0, "num_input_tokens_seen": 191737536, "step": 88900 }, { "epoch": 16.31583776839787, "grad_norm": 9.963303455151618e-05, "learning_rate": 9.986160239470238e-07, "loss": 0.0, "num_input_tokens_seen": 191748000, "step": 88905 }, { "epoch": 16.316755367957423, "grad_norm": 6.181725620990619e-05, "learning_rate": 9.981359169274408e-07, "loss": 0.0, "num_input_tokens_seen": 191758752, "step": 88910 }, { "epoch": 16.317672967516977, "grad_norm": 0.0033449872862547636, "learning_rate": 9.976559125494673e-07, "loss": 0.0, "num_input_tokens_seen": 191770336, "step": 88915 }, { "epoch": 16.318590567076527, "grad_norm": 7.192097837105393e-05, "learning_rate": 9.971760108254185e-07, "loss": 0.0, "num_input_tokens_seen": 191781216, "step": 88920 }, { "epoch": 16.31950816663608, "grad_norm": 0.0005162289016880095, "learning_rate": 9.96696211767601e-07, "loss": 0.0, "num_input_tokens_seen": 191792608, "step": 88925 }, { "epoch": 16.320425766195633, "grad_norm": 0.00033189583336934447, "learning_rate": 9.962165153883207e-07, "loss": 0.0, "num_input_tokens_seen": 191804512, "step": 88930 }, { "epoch": 16.321343365755183, "grad_norm": 0.00011274940334260464, "learning_rate": 9.957369216998807e-07, "loss": 0.0, "num_input_tokens_seen": 191815104, "step": 88935 }, { "epoch": 16.322260965314737, "grad_norm": 9.165448136627674e-05, "learning_rate": 9.952574307145834e-07, "loss": 0.0, "num_input_tokens_seen": 191825312, "step": 88940 }, { "epoch": 16.32317856487429, "grad_norm": 0.00013080464850645512, "learning_rate": 9.947780424447268e-07, "loss": 0.0, "num_input_tokens_seen": 191838144, "step": 88945 }, { "epoch": 16.32409616443384, "grad_norm": 9.656963084125891e-05, "learning_rate": 9.942987569026041e-07, "loss": 0.0, "num_input_tokens_seen": 191848576, "step": 88950 }, { "epoch": 16.325013763993393, "grad_norm": 9.267507266486064e-05, "learning_rate": 9.938195741005119e-07, "loss": 0.0, "num_input_tokens_seen": 191858752, "step": 88955 }, { "epoch": 16.325931363552947, "grad_norm": 0.00010013653809437528, "learning_rate": 9.93340494050738e-07, "loss": 0.0, "num_input_tokens_seen": 191869088, "step": 88960 }, { "epoch": 16.326848963112496, "grad_norm": 0.0003302257973700762, "learning_rate": 9.928615167655698e-07, "loss": 0.0, "num_input_tokens_seen": 191879008, "step": 88965 }, { "epoch": 16.32776656267205, "grad_norm": 0.00011435666965553537, "learning_rate": 9.92382642257294e-07, "loss": 0.0, "num_input_tokens_seen": 191889632, "step": 88970 }, { "epoch": 16.328684162231603, "grad_norm": 0.00018485909095034003, "learning_rate": 9.91903870538193e-07, "loss": 0.0, "num_input_tokens_seen": 191899840, "step": 88975 }, { "epoch": 16.329601761791153, "grad_norm": 9.640963980928063e-05, "learning_rate": 9.91425201620545e-07, "loss": 0.0, "num_input_tokens_seen": 191910400, "step": 88980 }, { "epoch": 16.330519361350706, "grad_norm": 0.00028072347049601376, "learning_rate": 9.909466355166263e-07, "loss": 0.0, "num_input_tokens_seen": 191921056, "step": 88985 }, { "epoch": 16.33143696091026, "grad_norm": 9.37751610763371e-05, "learning_rate": 9.904681722387149e-07, "loss": 0.0, "num_input_tokens_seen": 191932096, "step": 88990 }, { "epoch": 16.33235456046981, "grad_norm": 0.0018252324080094695, "learning_rate": 9.899898117990808e-07, "loss": 0.0, "num_input_tokens_seen": 191944544, "step": 88995 }, { "epoch": 16.333272160029363, "grad_norm": 0.0001989987795241177, "learning_rate": 9.895115542099915e-07, "loss": 0.0, "num_input_tokens_seen": 191956704, "step": 89000 }, { "epoch": 16.334189759588917, "grad_norm": 0.00012085222988389432, "learning_rate": 9.890333994837159e-07, "loss": 0.0, "num_input_tokens_seen": 191967904, "step": 89005 }, { "epoch": 16.335107359148466, "grad_norm": 7.32952103135176e-05, "learning_rate": 9.885553476325177e-07, "loss": 0.0, "num_input_tokens_seen": 191978176, "step": 89010 }, { "epoch": 16.33602495870802, "grad_norm": 0.0003654637548606843, "learning_rate": 9.880773986686576e-07, "loss": 0.0, "num_input_tokens_seen": 191989216, "step": 89015 }, { "epoch": 16.336942558267573, "grad_norm": 0.0002244034258183092, "learning_rate": 9.87599552604393e-07, "loss": 0.0, "num_input_tokens_seen": 192000960, "step": 89020 }, { "epoch": 16.337860157827123, "grad_norm": 0.00011806574184447527, "learning_rate": 9.871218094519824e-07, "loss": 0.0, "num_input_tokens_seen": 192012960, "step": 89025 }, { "epoch": 16.338777757386676, "grad_norm": 7.745306356810033e-05, "learning_rate": 9.866441692236784e-07, "loss": 0.0, "num_input_tokens_seen": 192024704, "step": 89030 }, { "epoch": 16.33969535694623, "grad_norm": 9.821578714763746e-05, "learning_rate": 9.861666319317298e-07, "loss": 0.0, "num_input_tokens_seen": 192034528, "step": 89035 }, { "epoch": 16.34061295650578, "grad_norm": 0.022167205810546875, "learning_rate": 9.856891975883874e-07, "loss": 0.0, "num_input_tokens_seen": 192045984, "step": 89040 }, { "epoch": 16.341530556065333, "grad_norm": 0.00013133777247276157, "learning_rate": 9.852118662058957e-07, "loss": 0.0, "num_input_tokens_seen": 192058848, "step": 89045 }, { "epoch": 16.342448155624886, "grad_norm": 0.004969909321516752, "learning_rate": 9.847346377964956e-07, "loss": 0.0, "num_input_tokens_seen": 192069312, "step": 89050 }, { "epoch": 16.343365755184436, "grad_norm": 0.0003397140244487673, "learning_rate": 9.8425751237243e-07, "loss": 0.0, "num_input_tokens_seen": 192080896, "step": 89055 }, { "epoch": 16.34428335474399, "grad_norm": 0.0006463091704063118, "learning_rate": 9.837804899459364e-07, "loss": 0.0, "num_input_tokens_seen": 192092416, "step": 89060 }, { "epoch": 16.345200954303543, "grad_norm": 7.742339948890731e-05, "learning_rate": 9.833035705292482e-07, "loss": 0.0, "num_input_tokens_seen": 192103040, "step": 89065 }, { "epoch": 16.346118553863093, "grad_norm": 9.687371493782848e-05, "learning_rate": 9.828267541345965e-07, "loss": 0.0, "num_input_tokens_seen": 192113792, "step": 89070 }, { "epoch": 16.347036153422646, "grad_norm": 0.001713801291771233, "learning_rate": 9.823500407742137e-07, "loss": 0.0, "num_input_tokens_seen": 192124896, "step": 89075 }, { "epoch": 16.3479537529822, "grad_norm": 8.938428800320253e-05, "learning_rate": 9.81873430460326e-07, "loss": 0.0, "num_input_tokens_seen": 192136224, "step": 89080 }, { "epoch": 16.34887135254175, "grad_norm": 0.00010621709952829406, "learning_rate": 9.813969232051573e-07, "loss": 0.0, "num_input_tokens_seen": 192147680, "step": 89085 }, { "epoch": 16.349788952101303, "grad_norm": 0.0002834982005879283, "learning_rate": 9.809205190209287e-07, "loss": 0.0, "num_input_tokens_seen": 192158624, "step": 89090 }, { "epoch": 16.350706551660856, "grad_norm": 6.479241710621864e-05, "learning_rate": 9.804442179198593e-07, "loss": 0.0, "num_input_tokens_seen": 192169216, "step": 89095 }, { "epoch": 16.351624151220406, "grad_norm": 3.986651063314639e-05, "learning_rate": 9.799680199141665e-07, "loss": 0.0, "num_input_tokens_seen": 192179776, "step": 89100 }, { "epoch": 16.35254175077996, "grad_norm": 0.00018388242460787296, "learning_rate": 9.79491925016064e-07, "loss": 0.0, "num_input_tokens_seen": 192191744, "step": 89105 }, { "epoch": 16.353459350339513, "grad_norm": 0.00011326077219564468, "learning_rate": 9.790159332377619e-07, "loss": 0.0, "num_input_tokens_seen": 192202848, "step": 89110 }, { "epoch": 16.354376949899063, "grad_norm": 0.0008337375475093722, "learning_rate": 9.785400445914694e-07, "loss": 0.0, "num_input_tokens_seen": 192213952, "step": 89115 }, { "epoch": 16.355294549458616, "grad_norm": 0.0006739050149917603, "learning_rate": 9.780642590893908e-07, "loss": 0.0, "num_input_tokens_seen": 192224800, "step": 89120 }, { "epoch": 16.35621214901817, "grad_norm": 0.003116437466815114, "learning_rate": 9.77588576743732e-07, "loss": 0.0, "num_input_tokens_seen": 192234464, "step": 89125 }, { "epoch": 16.35712974857772, "grad_norm": 0.00010780551383504644, "learning_rate": 9.77112997566692e-07, "loss": 0.0, "num_input_tokens_seen": 192245056, "step": 89130 }, { "epoch": 16.358047348137273, "grad_norm": 0.000997678260318935, "learning_rate": 9.76637521570467e-07, "loss": 0.0, "num_input_tokens_seen": 192255936, "step": 89135 }, { "epoch": 16.358964947696826, "grad_norm": 0.00023706041974946856, "learning_rate": 9.761621487672558e-07, "loss": 0.0, "num_input_tokens_seen": 192267040, "step": 89140 }, { "epoch": 16.359882547256376, "grad_norm": 9.292169124819338e-05, "learning_rate": 9.756868791692486e-07, "loss": 0.0, "num_input_tokens_seen": 192277312, "step": 89145 }, { "epoch": 16.36080014681593, "grad_norm": 0.00195147970225662, "learning_rate": 9.752117127886346e-07, "loss": 0.0, "num_input_tokens_seen": 192288256, "step": 89150 }, { "epoch": 16.361717746375483, "grad_norm": 9.273126488551497e-05, "learning_rate": 9.747366496376037e-07, "loss": 0.0, "num_input_tokens_seen": 192298048, "step": 89155 }, { "epoch": 16.362635345935033, "grad_norm": 0.0014279953902587295, "learning_rate": 9.74261689728339e-07, "loss": 0.0, "num_input_tokens_seen": 192308544, "step": 89160 }, { "epoch": 16.363552945494586, "grad_norm": 0.0025151094887405634, "learning_rate": 9.737868330730232e-07, "loss": 0.0, "num_input_tokens_seen": 192319616, "step": 89165 }, { "epoch": 16.36447054505414, "grad_norm": 6.410667265299708e-05, "learning_rate": 9.73312079683833e-07, "loss": 0.0, "num_input_tokens_seen": 192330880, "step": 89170 }, { "epoch": 16.36538814461369, "grad_norm": 9.427069744560868e-05, "learning_rate": 9.728374295729491e-07, "loss": 0.0, "num_input_tokens_seen": 192341760, "step": 89175 }, { "epoch": 16.366305744173243, "grad_norm": 0.0002866916765924543, "learning_rate": 9.723628827525433e-07, "loss": 0.0, "num_input_tokens_seen": 192352736, "step": 89180 }, { "epoch": 16.367223343732796, "grad_norm": 6.0067628510296345e-05, "learning_rate": 9.71888439234786e-07, "loss": 0.0, "num_input_tokens_seen": 192363008, "step": 89185 }, { "epoch": 16.368140943292346, "grad_norm": 0.00016195795615203679, "learning_rate": 9.714140990318488e-07, "loss": 0.0, "num_input_tokens_seen": 192374688, "step": 89190 }, { "epoch": 16.3690585428519, "grad_norm": 0.00011169978097314015, "learning_rate": 9.709398621558958e-07, "loss": 0.0, "num_input_tokens_seen": 192384000, "step": 89195 }, { "epoch": 16.369976142411453, "grad_norm": 6.06536450504791e-05, "learning_rate": 9.704657286190911e-07, "loss": 0.0, "num_input_tokens_seen": 192395072, "step": 89200 }, { "epoch": 16.370893741971003, "grad_norm": 0.0001476134784752503, "learning_rate": 9.699916984335938e-07, "loss": 0.0, "num_input_tokens_seen": 192406304, "step": 89205 }, { "epoch": 16.371811341530556, "grad_norm": 0.00014729445683769882, "learning_rate": 9.695177716115645e-07, "loss": 0.0, "num_input_tokens_seen": 192417408, "step": 89210 }, { "epoch": 16.37272894109011, "grad_norm": 3.618855043896474e-05, "learning_rate": 9.690439481651582e-07, "loss": 0.0, "num_input_tokens_seen": 192428192, "step": 89215 }, { "epoch": 16.37364654064966, "grad_norm": 6.425594619940966e-05, "learning_rate": 9.685702281065258e-07, "loss": 0.0, "num_input_tokens_seen": 192438048, "step": 89220 }, { "epoch": 16.374564140209213, "grad_norm": 0.00022120990615803748, "learning_rate": 9.680966114478202e-07, "loss": 0.0, "num_input_tokens_seen": 192447488, "step": 89225 }, { "epoch": 16.375481739768766, "grad_norm": 0.017191756516695023, "learning_rate": 9.676230982011875e-07, "loss": 0.0, "num_input_tokens_seen": 192458144, "step": 89230 }, { "epoch": 16.376399339328316, "grad_norm": 0.0008610581862740219, "learning_rate": 9.671496883787712e-07, "loss": 0.0, "num_input_tokens_seen": 192468640, "step": 89235 }, { "epoch": 16.37731693888787, "grad_norm": 6.336181832011789e-05, "learning_rate": 9.66676381992717e-07, "loss": 0.0, "num_input_tokens_seen": 192480032, "step": 89240 }, { "epoch": 16.378234538447423, "grad_norm": 9.347101877210662e-05, "learning_rate": 9.66203179055162e-07, "loss": 0.0, "num_input_tokens_seen": 192490464, "step": 89245 }, { "epoch": 16.379152138006972, "grad_norm": 0.001029958832077682, "learning_rate": 9.657300795782436e-07, "loss": 0.0, "num_input_tokens_seen": 192501440, "step": 89250 }, { "epoch": 16.380069737566526, "grad_norm": 0.0002654392446856946, "learning_rate": 9.652570835740954e-07, "loss": 0.0, "num_input_tokens_seen": 192513088, "step": 89255 }, { "epoch": 16.38098733712608, "grad_norm": 7.958192145451903e-05, "learning_rate": 9.647841910548505e-07, "loss": 0.0, "num_input_tokens_seen": 192524096, "step": 89260 }, { "epoch": 16.38190493668563, "grad_norm": 0.00014385662507265806, "learning_rate": 9.643114020326371e-07, "loss": 0.0, "num_input_tokens_seen": 192534912, "step": 89265 }, { "epoch": 16.382822536245182, "grad_norm": 0.0001378659944748506, "learning_rate": 9.6383871651958e-07, "loss": 0.0, "num_input_tokens_seen": 192545408, "step": 89270 }, { "epoch": 16.383740135804736, "grad_norm": 7.436356827383861e-05, "learning_rate": 9.63366134527806e-07, "loss": 0.0, "num_input_tokens_seen": 192556256, "step": 89275 }, { "epoch": 16.384657735364286, "grad_norm": 0.0001059370260918513, "learning_rate": 9.628936560694347e-07, "loss": 0.0, "num_input_tokens_seen": 192567040, "step": 89280 }, { "epoch": 16.38557533492384, "grad_norm": 0.00018082487804349512, "learning_rate": 9.624212811565837e-07, "loss": 0.0, "num_input_tokens_seen": 192577760, "step": 89285 }, { "epoch": 16.386492934483393, "grad_norm": 0.0001734666439006105, "learning_rate": 9.619490098013678e-07, "loss": 0.0, "num_input_tokens_seen": 192589248, "step": 89290 }, { "epoch": 16.387410534042942, "grad_norm": 0.00010538112837821245, "learning_rate": 9.614768420159031e-07, "loss": 0.0, "num_input_tokens_seen": 192600384, "step": 89295 }, { "epoch": 16.388328133602496, "grad_norm": 6.108899833634496e-05, "learning_rate": 9.610047778122977e-07, "loss": 0.0, "num_input_tokens_seen": 192611584, "step": 89300 }, { "epoch": 16.38924573316205, "grad_norm": 0.00016145245172083378, "learning_rate": 9.605328172026608e-07, "loss": 0.0, "num_input_tokens_seen": 192622560, "step": 89305 }, { "epoch": 16.3901633327216, "grad_norm": 5.225088898441754e-05, "learning_rate": 9.600609601990957e-07, "loss": 0.0, "num_input_tokens_seen": 192633600, "step": 89310 }, { "epoch": 16.391080932281152, "grad_norm": 0.00010798903531394899, "learning_rate": 9.59589206813706e-07, "loss": 0.0, "num_input_tokens_seen": 192643808, "step": 89315 }, { "epoch": 16.391998531840706, "grad_norm": 5.629070437862538e-05, "learning_rate": 9.591175570585892e-07, "loss": 0.0, "num_input_tokens_seen": 192654912, "step": 89320 }, { "epoch": 16.392916131400256, "grad_norm": 7.186588481999934e-05, "learning_rate": 9.586460109458462e-07, "loss": 0.0, "num_input_tokens_seen": 192666176, "step": 89325 }, { "epoch": 16.39383373095981, "grad_norm": 0.00022482564963866025, "learning_rate": 9.581745684875694e-07, "loss": 0.0, "num_input_tokens_seen": 192676832, "step": 89330 }, { "epoch": 16.394751330519362, "grad_norm": 5.4283751524053514e-05, "learning_rate": 9.577032296958504e-07, "loss": 0.0, "num_input_tokens_seen": 192688512, "step": 89335 }, { "epoch": 16.395668930078912, "grad_norm": 0.0001860844495240599, "learning_rate": 9.572319945827774e-07, "loss": 0.0, "num_input_tokens_seen": 192700416, "step": 89340 }, { "epoch": 16.396586529638466, "grad_norm": 66.4942855834961, "learning_rate": 9.567608631604398e-07, "loss": 0.0016, "num_input_tokens_seen": 192711648, "step": 89345 }, { "epoch": 16.39750412919802, "grad_norm": 8.9551460405346e-05, "learning_rate": 9.56289835440919e-07, "loss": 0.0, "num_input_tokens_seen": 192722496, "step": 89350 }, { "epoch": 16.39842172875757, "grad_norm": 7.970083242980763e-05, "learning_rate": 9.558189114362954e-07, "loss": 0.0, "num_input_tokens_seen": 192733344, "step": 89355 }, { "epoch": 16.399339328317122, "grad_norm": 7.129530422389507e-05, "learning_rate": 9.553480911586504e-07, "loss": 0.0, "num_input_tokens_seen": 192743040, "step": 89360 }, { "epoch": 16.400256927876676, "grad_norm": 0.00016291970678139478, "learning_rate": 9.54877374620058e-07, "loss": 0.0, "num_input_tokens_seen": 192754304, "step": 89365 }, { "epoch": 16.401174527436226, "grad_norm": 0.0004976088530384004, "learning_rate": 9.544067618325904e-07, "loss": 0.0, "num_input_tokens_seen": 192765056, "step": 89370 }, { "epoch": 16.40209212699578, "grad_norm": 0.005021759308874607, "learning_rate": 9.539362528083207e-07, "loss": 0.0, "num_input_tokens_seen": 192777280, "step": 89375 }, { "epoch": 16.403009726555332, "grad_norm": 0.00012375111691653728, "learning_rate": 9.534658475593151e-07, "loss": 0.0, "num_input_tokens_seen": 192788576, "step": 89380 }, { "epoch": 16.403927326114882, "grad_norm": 0.003059017937630415, "learning_rate": 9.529955460976387e-07, "loss": 0.0, "num_input_tokens_seen": 192799232, "step": 89385 }, { "epoch": 16.404844925674436, "grad_norm": 0.0001228287728736177, "learning_rate": 9.52525348435353e-07, "loss": 0.0, "num_input_tokens_seen": 192808640, "step": 89390 }, { "epoch": 16.40576252523399, "grad_norm": 4.981114398106001e-05, "learning_rate": 9.520552545845208e-07, "loss": 0.0, "num_input_tokens_seen": 192819744, "step": 89395 }, { "epoch": 16.40668012479354, "grad_norm": 0.0002926176821347326, "learning_rate": 9.515852645571977e-07, "loss": 0.0, "num_input_tokens_seen": 192831040, "step": 89400 }, { "epoch": 16.407597724353092, "grad_norm": 0.00011947723396588117, "learning_rate": 9.511153783654365e-07, "loss": 0.0, "num_input_tokens_seen": 192842400, "step": 89405 }, { "epoch": 16.408515323912646, "grad_norm": 0.0007053593872115016, "learning_rate": 9.506455960212918e-07, "loss": 0.0, "num_input_tokens_seen": 192854368, "step": 89410 }, { "epoch": 16.409432923472195, "grad_norm": 8.196626731660217e-05, "learning_rate": 9.501759175368114e-07, "loss": 0.0, "num_input_tokens_seen": 192864992, "step": 89415 }, { "epoch": 16.41035052303175, "grad_norm": 8.345612877747044e-05, "learning_rate": 9.497063429240411e-07, "loss": 0.0, "num_input_tokens_seen": 192875776, "step": 89420 }, { "epoch": 16.411268122591302, "grad_norm": 0.0005738092586398125, "learning_rate": 9.492368721950274e-07, "loss": 0.0, "num_input_tokens_seen": 192884832, "step": 89425 }, { "epoch": 16.412185722150852, "grad_norm": 7.717057451372966e-05, "learning_rate": 9.487675053618095e-07, "loss": 0.0, "num_input_tokens_seen": 192894880, "step": 89430 }, { "epoch": 16.413103321710405, "grad_norm": 0.0001595976937096566, "learning_rate": 9.482982424364262e-07, "loss": 0.0, "num_input_tokens_seen": 192904448, "step": 89435 }, { "epoch": 16.41402092126996, "grad_norm": 0.0009656304609961808, "learning_rate": 9.478290834309117e-07, "loss": 0.0, "num_input_tokens_seen": 192913408, "step": 89440 }, { "epoch": 16.41493852082951, "grad_norm": 9.047336789080873e-05, "learning_rate": 9.473600283573026e-07, "loss": 0.0, "num_input_tokens_seen": 192924128, "step": 89445 }, { "epoch": 16.415856120389062, "grad_norm": 5.3187173762125894e-05, "learning_rate": 9.46891077227628e-07, "loss": 0.0, "num_input_tokens_seen": 192934496, "step": 89450 }, { "epoch": 16.416773719948615, "grad_norm": 0.0001385032810503617, "learning_rate": 9.464222300539139e-07, "loss": 0.0, "num_input_tokens_seen": 192945088, "step": 89455 }, { "epoch": 16.417691319508165, "grad_norm": 5.707745731342584e-05, "learning_rate": 9.459534868481885e-07, "loss": 0.0, "num_input_tokens_seen": 192955424, "step": 89460 }, { "epoch": 16.41860891906772, "grad_norm": 0.00012125793000450358, "learning_rate": 9.454848476224732e-07, "loss": 0.0, "num_input_tokens_seen": 192967392, "step": 89465 }, { "epoch": 16.419526518627272, "grad_norm": 6.557173765031621e-05, "learning_rate": 9.450163123887873e-07, "loss": 0.0, "num_input_tokens_seen": 192979456, "step": 89470 }, { "epoch": 16.420444118186822, "grad_norm": 0.0028821630403399467, "learning_rate": 9.445478811591469e-07, "loss": 0.0, "num_input_tokens_seen": 192990144, "step": 89475 }, { "epoch": 16.421361717746375, "grad_norm": 7.101200026227161e-05, "learning_rate": 9.440795539455699e-07, "loss": 0.0, "num_input_tokens_seen": 193000608, "step": 89480 }, { "epoch": 16.42227931730593, "grad_norm": 0.00012012602383038029, "learning_rate": 9.436113307600658e-07, "loss": 0.0, "num_input_tokens_seen": 193010400, "step": 89485 }, { "epoch": 16.42319691686548, "grad_norm": 0.00017939881945494562, "learning_rate": 9.431432116146427e-07, "loss": 0.0002, "num_input_tokens_seen": 193021984, "step": 89490 }, { "epoch": 16.424114516425032, "grad_norm": 0.0005046525620855391, "learning_rate": 9.426751965213105e-07, "loss": 0.0, "num_input_tokens_seen": 193033248, "step": 89495 }, { "epoch": 16.425032115984585, "grad_norm": 4.9381680582882836e-05, "learning_rate": 9.422072854920706e-07, "loss": 0.0, "num_input_tokens_seen": 193042816, "step": 89500 }, { "epoch": 16.425949715544135, "grad_norm": 0.00045402106479741633, "learning_rate": 9.417394785389255e-07, "loss": 0.0, "num_input_tokens_seen": 193054400, "step": 89505 }, { "epoch": 16.42686731510369, "grad_norm": 0.0004424735379870981, "learning_rate": 9.412717756738726e-07, "loss": 0.0, "num_input_tokens_seen": 193064000, "step": 89510 }, { "epoch": 16.427784914663242, "grad_norm": 0.0006497726426459849, "learning_rate": 9.408041769089071e-07, "loss": 0.0, "num_input_tokens_seen": 193075232, "step": 89515 }, { "epoch": 16.428702514222792, "grad_norm": 0.002566205570474267, "learning_rate": 9.403366822560245e-07, "loss": 0.0, "num_input_tokens_seen": 193086080, "step": 89520 }, { "epoch": 16.429620113782345, "grad_norm": 0.0001914446329465136, "learning_rate": 9.39869291727214e-07, "loss": 0.0, "num_input_tokens_seen": 193096192, "step": 89525 }, { "epoch": 16.4305377133419, "grad_norm": 0.0001370108366245404, "learning_rate": 9.394020053344638e-07, "loss": 0.0, "num_input_tokens_seen": 193107200, "step": 89530 }, { "epoch": 16.43145531290145, "grad_norm": 0.0002482305280864239, "learning_rate": 9.389348230897582e-07, "loss": 0.0, "num_input_tokens_seen": 193116992, "step": 89535 }, { "epoch": 16.432372912461002, "grad_norm": 8.091058407444507e-05, "learning_rate": 9.384677450050794e-07, "loss": 0.0, "num_input_tokens_seen": 193127744, "step": 89540 }, { "epoch": 16.433290512020555, "grad_norm": 0.00014079833636060357, "learning_rate": 9.380007710924099e-07, "loss": 0.0, "num_input_tokens_seen": 193139840, "step": 89545 }, { "epoch": 16.434208111580105, "grad_norm": 0.0002400255179964006, "learning_rate": 9.375339013637247e-07, "loss": 0.0, "num_input_tokens_seen": 193151008, "step": 89550 }, { "epoch": 16.43512571113966, "grad_norm": 8.105381130008027e-05, "learning_rate": 9.370671358309968e-07, "loss": 0.0, "num_input_tokens_seen": 193163072, "step": 89555 }, { "epoch": 16.436043310699212, "grad_norm": 0.0009946627542376518, "learning_rate": 9.366004745062018e-07, "loss": 0.0, "num_input_tokens_seen": 193173024, "step": 89560 }, { "epoch": 16.43696091025876, "grad_norm": 0.00011201231245649979, "learning_rate": 9.361339174013073e-07, "loss": 0.0, "num_input_tokens_seen": 193183904, "step": 89565 }, { "epoch": 16.437878509818315, "grad_norm": 0.00010724196908995509, "learning_rate": 9.356674645282787e-07, "loss": 0.0, "num_input_tokens_seen": 193193984, "step": 89570 }, { "epoch": 16.43879610937787, "grad_norm": 0.0006376287783496082, "learning_rate": 9.352011158990793e-07, "loss": 0.0, "num_input_tokens_seen": 193204736, "step": 89575 }, { "epoch": 16.43971370893742, "grad_norm": 7.032865687506273e-05, "learning_rate": 9.347348715256732e-07, "loss": 0.0, "num_input_tokens_seen": 193215936, "step": 89580 }, { "epoch": 16.44063130849697, "grad_norm": 0.0003364538715686649, "learning_rate": 9.342687314200166e-07, "loss": 0.0004, "num_input_tokens_seen": 193227136, "step": 89585 }, { "epoch": 16.441548908056525, "grad_norm": 3.955570355174132e-05, "learning_rate": 9.338026955940643e-07, "loss": 0.0, "num_input_tokens_seen": 193238368, "step": 89590 }, { "epoch": 16.442466507616075, "grad_norm": 0.0005087070749141276, "learning_rate": 9.333367640597723e-07, "loss": 0.0, "num_input_tokens_seen": 193249568, "step": 89595 }, { "epoch": 16.44338410717563, "grad_norm": 9.659965144237503e-05, "learning_rate": 9.328709368290901e-07, "loss": 0.0, "num_input_tokens_seen": 193261600, "step": 89600 }, { "epoch": 16.44430170673518, "grad_norm": 0.0009415359236299992, "learning_rate": 9.324052139139628e-07, "loss": 0.0, "num_input_tokens_seen": 193272160, "step": 89605 }, { "epoch": 16.44521930629473, "grad_norm": 0.013645931147038937, "learning_rate": 9.319395953263393e-07, "loss": 0.0, "num_input_tokens_seen": 193283200, "step": 89610 }, { "epoch": 16.446136905854285, "grad_norm": 0.00010811099491547793, "learning_rate": 9.314740810781603e-07, "loss": 0.0, "num_input_tokens_seen": 193294528, "step": 89615 }, { "epoch": 16.44705450541384, "grad_norm": 6.178293551784009e-05, "learning_rate": 9.310086711813649e-07, "loss": 0.0, "num_input_tokens_seen": 193305696, "step": 89620 }, { "epoch": 16.44797210497339, "grad_norm": 0.0004027296381536871, "learning_rate": 9.305433656478902e-07, "loss": 0.0, "num_input_tokens_seen": 193316448, "step": 89625 }, { "epoch": 16.44888970453294, "grad_norm": 0.0005176044651307166, "learning_rate": 9.300781644896717e-07, "loss": 0.0, "num_input_tokens_seen": 193326592, "step": 89630 }, { "epoch": 16.449807304092495, "grad_norm": 0.0003938491572625935, "learning_rate": 9.29613067718641e-07, "loss": 0.0, "num_input_tokens_seen": 193337344, "step": 89635 }, { "epoch": 16.450724903652045, "grad_norm": 0.0002891257463488728, "learning_rate": 9.291480753467247e-07, "loss": 0.0, "num_input_tokens_seen": 193348448, "step": 89640 }, { "epoch": 16.4516425032116, "grad_norm": 0.00012872157094534487, "learning_rate": 9.286831873858531e-07, "loss": 0.0, "num_input_tokens_seen": 193358880, "step": 89645 }, { "epoch": 16.45256010277115, "grad_norm": 0.0002324835950275883, "learning_rate": 9.282184038479469e-07, "loss": 0.0, "num_input_tokens_seen": 193370912, "step": 89650 }, { "epoch": 16.4534777023307, "grad_norm": 6.911067612236366e-05, "learning_rate": 9.277537247449286e-07, "loss": 0.0, "num_input_tokens_seen": 193382080, "step": 89655 }, { "epoch": 16.454395301890255, "grad_norm": 5.131458965479396e-05, "learning_rate": 9.272891500887143e-07, "loss": 0.0, "num_input_tokens_seen": 193394368, "step": 89660 }, { "epoch": 16.45531290144981, "grad_norm": 0.0006811387720517814, "learning_rate": 9.268246798912228e-07, "loss": 0.0, "num_input_tokens_seen": 193405312, "step": 89665 }, { "epoch": 16.456230501009358, "grad_norm": 0.0002904442953877151, "learning_rate": 9.263603141643651e-07, "loss": 0.0, "num_input_tokens_seen": 193416512, "step": 89670 }, { "epoch": 16.45714810056891, "grad_norm": 0.0002586210612207651, "learning_rate": 9.258960529200505e-07, "loss": 0.0, "num_input_tokens_seen": 193427584, "step": 89675 }, { "epoch": 16.458065700128465, "grad_norm": 5.9872963902307674e-05, "learning_rate": 9.254318961701892e-07, "loss": 0.0, "num_input_tokens_seen": 193438304, "step": 89680 }, { "epoch": 16.458983299688015, "grad_norm": 7.398756133625284e-05, "learning_rate": 9.249678439266846e-07, "loss": 0.0, "num_input_tokens_seen": 193448544, "step": 89685 }, { "epoch": 16.459900899247568, "grad_norm": 5.7036380894714966e-05, "learning_rate": 9.24503896201438e-07, "loss": 0.0, "num_input_tokens_seen": 193460000, "step": 89690 }, { "epoch": 16.46081849880712, "grad_norm": 6.21739964117296e-05, "learning_rate": 9.240400530063509e-07, "loss": 0.0, "num_input_tokens_seen": 193471040, "step": 89695 }, { "epoch": 16.46173609836667, "grad_norm": 7.373043627012521e-05, "learning_rate": 9.2357631435332e-07, "loss": 0.0, "num_input_tokens_seen": 193481344, "step": 89700 }, { "epoch": 16.462653697926225, "grad_norm": 0.0006920790183357894, "learning_rate": 9.231126802542378e-07, "loss": 0.0, "num_input_tokens_seen": 193491904, "step": 89705 }, { "epoch": 16.463571297485778, "grad_norm": 4.6140274207573384e-05, "learning_rate": 9.226491507209961e-07, "loss": 0.0, "num_input_tokens_seen": 193504032, "step": 89710 }, { "epoch": 16.464488897045328, "grad_norm": 0.001983506139367819, "learning_rate": 9.221857257654859e-07, "loss": 0.0, "num_input_tokens_seen": 193514080, "step": 89715 }, { "epoch": 16.46540649660488, "grad_norm": 0.00015179517504293472, "learning_rate": 9.21722405399591e-07, "loss": 0.0, "num_input_tokens_seen": 193524480, "step": 89720 }, { "epoch": 16.466324096164435, "grad_norm": 0.000188679521670565, "learning_rate": 9.212591896351963e-07, "loss": 0.0, "num_input_tokens_seen": 193534848, "step": 89725 }, { "epoch": 16.467241695723985, "grad_norm": 5.277631134958938e-05, "learning_rate": 9.207960784841818e-07, "loss": 0.0, "num_input_tokens_seen": 193544928, "step": 89730 }, { "epoch": 16.468159295283538, "grad_norm": 0.00011565387831069529, "learning_rate": 9.203330719584241e-07, "loss": 0.0, "num_input_tokens_seen": 193557376, "step": 89735 }, { "epoch": 16.46907689484309, "grad_norm": 8.792562584858388e-05, "learning_rate": 9.19870170069802e-07, "loss": 0.0, "num_input_tokens_seen": 193567808, "step": 89740 }, { "epoch": 16.46999449440264, "grad_norm": 0.0002535879029892385, "learning_rate": 9.194073728301861e-07, "loss": 0.0, "num_input_tokens_seen": 193579520, "step": 89745 }, { "epoch": 16.470912093962195, "grad_norm": 0.0014626872725784779, "learning_rate": 9.189446802514468e-07, "loss": 0.0, "num_input_tokens_seen": 193590304, "step": 89750 }, { "epoch": 16.471829693521748, "grad_norm": 0.0002039218961726874, "learning_rate": 9.184820923454513e-07, "loss": 0.0, "num_input_tokens_seen": 193601248, "step": 89755 }, { "epoch": 16.472747293081298, "grad_norm": 6.415274401661009e-05, "learning_rate": 9.180196091240628e-07, "loss": 0.0, "num_input_tokens_seen": 193612096, "step": 89760 }, { "epoch": 16.47366489264085, "grad_norm": 9.735807543620467e-05, "learning_rate": 9.175572305991465e-07, "loss": 0.0, "num_input_tokens_seen": 193624096, "step": 89765 }, { "epoch": 16.474582492200405, "grad_norm": 7.604502025060356e-05, "learning_rate": 9.170949567825599e-07, "loss": 0.0, "num_input_tokens_seen": 193634752, "step": 89770 }, { "epoch": 16.475500091759955, "grad_norm": 0.00013947617844678462, "learning_rate": 9.166327876861586e-07, "loss": 0.0, "num_input_tokens_seen": 193645504, "step": 89775 }, { "epoch": 16.476417691319508, "grad_norm": 8.951413474278525e-05, "learning_rate": 9.161707233217987e-07, "loss": 0.0, "num_input_tokens_seen": 193656128, "step": 89780 }, { "epoch": 16.47733529087906, "grad_norm": 4.520410537719727, "learning_rate": 9.157087637013307e-07, "loss": 0.0005, "num_input_tokens_seen": 193667008, "step": 89785 }, { "epoch": 16.47825289043861, "grad_norm": 0.0003555901348590851, "learning_rate": 9.152469088366023e-07, "loss": 0.0, "num_input_tokens_seen": 193678176, "step": 89790 }, { "epoch": 16.479170489998165, "grad_norm": 0.00019696632807608694, "learning_rate": 9.147851587394591e-07, "loss": 0.0, "num_input_tokens_seen": 193686784, "step": 89795 }, { "epoch": 16.480088089557718, "grad_norm": 0.0005259359022602439, "learning_rate": 9.14323513421746e-07, "loss": 0.0, "num_input_tokens_seen": 193696896, "step": 89800 }, { "epoch": 16.481005689117268, "grad_norm": 0.0003279732191003859, "learning_rate": 9.13861972895303e-07, "loss": 0.0, "num_input_tokens_seen": 193707712, "step": 89805 }, { "epoch": 16.48192328867682, "grad_norm": 0.0019604568369686604, "learning_rate": 9.134005371719656e-07, "loss": 0.0, "num_input_tokens_seen": 193718400, "step": 89810 }, { "epoch": 16.482840888236375, "grad_norm": 5.555678944801912e-05, "learning_rate": 9.129392062635728e-07, "loss": 0.0, "num_input_tokens_seen": 193729024, "step": 89815 }, { "epoch": 16.483758487795924, "grad_norm": 0.00029954660567454994, "learning_rate": 9.124779801819544e-07, "loss": 0.0, "num_input_tokens_seen": 193740448, "step": 89820 }, { "epoch": 16.484676087355478, "grad_norm": 0.00018394652579445392, "learning_rate": 9.120168589389395e-07, "loss": 0.0, "num_input_tokens_seen": 193751552, "step": 89825 }, { "epoch": 16.48559368691503, "grad_norm": 0.00010906290845014155, "learning_rate": 9.115558425463577e-07, "loss": 0.0, "num_input_tokens_seen": 193763552, "step": 89830 }, { "epoch": 16.48651128647458, "grad_norm": 0.0005546578322537243, "learning_rate": 9.110949310160322e-07, "loss": 0.0, "num_input_tokens_seen": 193775424, "step": 89835 }, { "epoch": 16.487428886034134, "grad_norm": 8.962985157268122e-05, "learning_rate": 9.106341243597844e-07, "loss": 0.0, "num_input_tokens_seen": 193786560, "step": 89840 }, { "epoch": 16.488346485593688, "grad_norm": 0.0003433410602156073, "learning_rate": 9.10173422589432e-07, "loss": 0.0001, "num_input_tokens_seen": 193797504, "step": 89845 }, { "epoch": 16.489264085153238, "grad_norm": 0.0023743489291518927, "learning_rate": 9.097128257167937e-07, "loss": 0.056, "num_input_tokens_seen": 193807904, "step": 89850 }, { "epoch": 16.49018168471279, "grad_norm": 5.988882185192779e-05, "learning_rate": 9.092523337536824e-07, "loss": 0.0, "num_input_tokens_seen": 193819200, "step": 89855 }, { "epoch": 16.491099284272345, "grad_norm": 8.279105531983078e-05, "learning_rate": 9.087919467119071e-07, "loss": 0.0, "num_input_tokens_seen": 193830400, "step": 89860 }, { "epoch": 16.492016883831894, "grad_norm": 0.03861036151647568, "learning_rate": 9.083316646032791e-07, "loss": 0.0, "num_input_tokens_seen": 193841248, "step": 89865 }, { "epoch": 16.492934483391448, "grad_norm": 0.00014515257498715073, "learning_rate": 9.078714874396027e-07, "loss": 0.0, "num_input_tokens_seen": 193852160, "step": 89870 }, { "epoch": 16.493852082951, "grad_norm": 0.0003343808639328927, "learning_rate": 9.074114152326785e-07, "loss": 0.0, "num_input_tokens_seen": 193861792, "step": 89875 }, { "epoch": 16.49476968251055, "grad_norm": 0.00021758036746177822, "learning_rate": 9.069514479943104e-07, "loss": 0.0, "num_input_tokens_seen": 193871360, "step": 89880 }, { "epoch": 16.495687282070104, "grad_norm": 0.00039113356615416706, "learning_rate": 9.064915857362939e-07, "loss": 0.0, "num_input_tokens_seen": 193881184, "step": 89885 }, { "epoch": 16.496604881629658, "grad_norm": 0.00011808628914877772, "learning_rate": 9.060318284704234e-07, "loss": 0.0, "num_input_tokens_seen": 193891968, "step": 89890 }, { "epoch": 16.497522481189208, "grad_norm": 0.0013978026108816266, "learning_rate": 9.055721762084907e-07, "loss": 0.0, "num_input_tokens_seen": 193901664, "step": 89895 }, { "epoch": 16.49844008074876, "grad_norm": 0.00017172952357213944, "learning_rate": 9.051126289622869e-07, "loss": 0.0, "num_input_tokens_seen": 193912928, "step": 89900 }, { "epoch": 16.499357680308314, "grad_norm": 9.658941417001188e-05, "learning_rate": 9.046531867435976e-07, "loss": 0.0, "num_input_tokens_seen": 193923200, "step": 89905 }, { "epoch": 16.500275279867864, "grad_norm": 0.00037436801358126104, "learning_rate": 9.041938495642056e-07, "loss": 0.0, "num_input_tokens_seen": 193934112, "step": 89910 }, { "epoch": 16.501192879427418, "grad_norm": 0.0006782998680137098, "learning_rate": 9.037346174358946e-07, "loss": 0.0, "num_input_tokens_seen": 193944960, "step": 89915 }, { "epoch": 16.50211047898697, "grad_norm": 9.85905498964712e-05, "learning_rate": 9.032754903704422e-07, "loss": 0.0, "num_input_tokens_seen": 193955360, "step": 89920 }, { "epoch": 16.50302807854652, "grad_norm": 7.859636389184743e-05, "learning_rate": 9.028164683796243e-07, "loss": 0.0, "num_input_tokens_seen": 193965632, "step": 89925 }, { "epoch": 16.503945678106074, "grad_norm": 0.00023288313241209835, "learning_rate": 9.023575514752126e-07, "loss": 0.0, "num_input_tokens_seen": 193977600, "step": 89930 }, { "epoch": 16.504863277665628, "grad_norm": 0.0011936970986425877, "learning_rate": 9.018987396689799e-07, "loss": 0.0, "num_input_tokens_seen": 193988288, "step": 89935 }, { "epoch": 16.505780877225178, "grad_norm": 0.00013665464939549565, "learning_rate": 9.01440032972693e-07, "loss": 0.0, "num_input_tokens_seen": 193998432, "step": 89940 }, { "epoch": 16.50669847678473, "grad_norm": 0.0017145833699032664, "learning_rate": 9.009814313981175e-07, "loss": 0.0, "num_input_tokens_seen": 194009184, "step": 89945 }, { "epoch": 16.507616076344284, "grad_norm": 0.0006594757433049381, "learning_rate": 9.005229349570155e-07, "loss": 0.0, "num_input_tokens_seen": 194019424, "step": 89950 }, { "epoch": 16.508533675903834, "grad_norm": 0.001405207091011107, "learning_rate": 9.000645436611449e-07, "loss": 0.0, "num_input_tokens_seen": 194030560, "step": 89955 }, { "epoch": 16.509451275463388, "grad_norm": 0.024504737928509712, "learning_rate": 8.996062575222659e-07, "loss": 0.0, "num_input_tokens_seen": 194041568, "step": 89960 }, { "epoch": 16.51036887502294, "grad_norm": 0.00011412736785132438, "learning_rate": 8.99148076552131e-07, "loss": 0.0, "num_input_tokens_seen": 194052096, "step": 89965 }, { "epoch": 16.51128647458249, "grad_norm": 6.077714351704344e-05, "learning_rate": 8.986900007624927e-07, "loss": 0.0, "num_input_tokens_seen": 194062720, "step": 89970 }, { "epoch": 16.512204074142044, "grad_norm": 0.00015367957530543208, "learning_rate": 8.982320301650988e-07, "loss": 0.0, "num_input_tokens_seen": 194074592, "step": 89975 }, { "epoch": 16.513121673701598, "grad_norm": 0.00010162369289901108, "learning_rate": 8.977741647716953e-07, "loss": 0.0, "num_input_tokens_seen": 194086400, "step": 89980 }, { "epoch": 16.514039273261147, "grad_norm": 0.00035474871401675045, "learning_rate": 8.97316404594028e-07, "loss": 0.0, "num_input_tokens_seen": 194096608, "step": 89985 }, { "epoch": 16.5149568728207, "grad_norm": 8.686522050993517e-05, "learning_rate": 8.968587496438363e-07, "loss": 0.007, "num_input_tokens_seen": 194108448, "step": 89990 }, { "epoch": 16.515874472380254, "grad_norm": 7.773146353429183e-05, "learning_rate": 8.96401199932857e-07, "loss": 0.0, "num_input_tokens_seen": 194119840, "step": 89995 }, { "epoch": 16.516792071939804, "grad_norm": 7.935435132822022e-05, "learning_rate": 8.959437554728279e-07, "loss": 0.0, "num_input_tokens_seen": 194131616, "step": 90000 }, { "epoch": 16.517709671499357, "grad_norm": 0.00021400781406555325, "learning_rate": 8.954864162754812e-07, "loss": 0.0, "num_input_tokens_seen": 194142336, "step": 90005 }, { "epoch": 16.51862727105891, "grad_norm": 0.000647285021841526, "learning_rate": 8.950291823525447e-07, "loss": 0.0, "num_input_tokens_seen": 194152416, "step": 90010 }, { "epoch": 16.51954487061846, "grad_norm": 0.00014259690942708403, "learning_rate": 8.945720537157493e-07, "loss": 0.0, "num_input_tokens_seen": 194163328, "step": 90015 }, { "epoch": 16.520462470178014, "grad_norm": 0.000833941507153213, "learning_rate": 8.941150303768181e-07, "loss": 0.0, "num_input_tokens_seen": 194174368, "step": 90020 }, { "epoch": 16.521380069737567, "grad_norm": 0.00023909483570605516, "learning_rate": 8.936581123474725e-07, "loss": 0.0, "num_input_tokens_seen": 194186144, "step": 90025 }, { "epoch": 16.522297669297117, "grad_norm": 0.00045101746218279004, "learning_rate": 8.932012996394307e-07, "loss": 0.0, "num_input_tokens_seen": 194197376, "step": 90030 }, { "epoch": 16.52321526885667, "grad_norm": 0.0001264360180357471, "learning_rate": 8.927445922644118e-07, "loss": 0.0, "num_input_tokens_seen": 194208512, "step": 90035 }, { "epoch": 16.524132868416224, "grad_norm": 7.652439671801403e-05, "learning_rate": 8.922879902341286e-07, "loss": 0.0, "num_input_tokens_seen": 194218784, "step": 90040 }, { "epoch": 16.525050467975774, "grad_norm": 0.00014640943845734, "learning_rate": 8.918314935602912e-07, "loss": 0.0, "num_input_tokens_seen": 194228800, "step": 90045 }, { "epoch": 16.525968067535327, "grad_norm": 0.010099882259964943, "learning_rate": 8.913751022546097e-07, "loss": 0.0, "num_input_tokens_seen": 194239904, "step": 90050 }, { "epoch": 16.52688566709488, "grad_norm": 143.00709533691406, "learning_rate": 8.909188163287891e-07, "loss": 0.0936, "num_input_tokens_seen": 194250688, "step": 90055 }, { "epoch": 16.52780326665443, "grad_norm": 0.0010182110127061605, "learning_rate": 8.904626357945312e-07, "loss": 0.0, "num_input_tokens_seen": 194261824, "step": 90060 }, { "epoch": 16.528720866213984, "grad_norm": 0.0017890495946630836, "learning_rate": 8.900065606635383e-07, "loss": 0.0, "num_input_tokens_seen": 194272320, "step": 90065 }, { "epoch": 16.529638465773537, "grad_norm": 0.0001182595660793595, "learning_rate": 8.895505909475077e-07, "loss": 0.0, "num_input_tokens_seen": 194283264, "step": 90070 }, { "epoch": 16.530556065333087, "grad_norm": 0.0015663346275687218, "learning_rate": 8.89094726658134e-07, "loss": 0.0, "num_input_tokens_seen": 194293184, "step": 90075 }, { "epoch": 16.53147366489264, "grad_norm": 0.00011982091382378712, "learning_rate": 8.886389678071073e-07, "loss": 0.0, "num_input_tokens_seen": 194303872, "step": 90080 }, { "epoch": 16.532391264452194, "grad_norm": 0.000278771884040907, "learning_rate": 8.881833144061208e-07, "loss": 0.0, "num_input_tokens_seen": 194314496, "step": 90085 }, { "epoch": 16.533308864011744, "grad_norm": 0.0007392700063064694, "learning_rate": 8.877277664668593e-07, "loss": 0.0001, "num_input_tokens_seen": 194325888, "step": 90090 }, { "epoch": 16.534226463571297, "grad_norm": 0.0001701874571153894, "learning_rate": 8.872723240010061e-07, "loss": 0.0, "num_input_tokens_seen": 194338112, "step": 90095 }, { "epoch": 16.53514406313085, "grad_norm": 7.167899457272142e-05, "learning_rate": 8.868169870202447e-07, "loss": 0.0, "num_input_tokens_seen": 194349024, "step": 90100 }, { "epoch": 16.5360616626904, "grad_norm": 0.009257261641323566, "learning_rate": 8.86361755536253e-07, "loss": 0.0, "num_input_tokens_seen": 194358688, "step": 90105 }, { "epoch": 16.536979262249954, "grad_norm": 0.0012668768176808953, "learning_rate": 8.859066295607066e-07, "loss": 0.0, "num_input_tokens_seen": 194369696, "step": 90110 }, { "epoch": 16.537896861809507, "grad_norm": 0.00031446616048924625, "learning_rate": 8.854516091052772e-07, "loss": 0.0, "num_input_tokens_seen": 194380480, "step": 90115 }, { "epoch": 16.538814461369057, "grad_norm": 0.00015117890143301338, "learning_rate": 8.84996694181639e-07, "loss": 0.0, "num_input_tokens_seen": 194391872, "step": 90120 }, { "epoch": 16.53973206092861, "grad_norm": 0.0009932247921824455, "learning_rate": 8.845418848014576e-07, "loss": 0.0, "num_input_tokens_seen": 194403936, "step": 90125 }, { "epoch": 16.540649660488164, "grad_norm": 9.368270548293367e-05, "learning_rate": 8.840871809763973e-07, "loss": 0.0, "num_input_tokens_seen": 194415328, "step": 90130 }, { "epoch": 16.541567260047714, "grad_norm": 6.727808795403689e-05, "learning_rate": 8.83632582718123e-07, "loss": 0.0, "num_input_tokens_seen": 194427232, "step": 90135 }, { "epoch": 16.542484859607267, "grad_norm": 0.007373083382844925, "learning_rate": 8.83178090038293e-07, "loss": 0.0, "num_input_tokens_seen": 194436704, "step": 90140 }, { "epoch": 16.54340245916682, "grad_norm": 0.0001455184246879071, "learning_rate": 8.827237029485647e-07, "loss": 0.0, "num_input_tokens_seen": 194447872, "step": 90145 }, { "epoch": 16.54432005872637, "grad_norm": 0.00035854775342158973, "learning_rate": 8.822694214605904e-07, "loss": 0.0, "num_input_tokens_seen": 194458560, "step": 90150 }, { "epoch": 16.545237658285924, "grad_norm": 0.004448212217539549, "learning_rate": 8.818152455860251e-07, "loss": 0.0, "num_input_tokens_seen": 194469664, "step": 90155 }, { "epoch": 16.546155257845477, "grad_norm": 0.0005352203734219074, "learning_rate": 8.813611753365165e-07, "loss": 0.0, "num_input_tokens_seen": 194480192, "step": 90160 }, { "epoch": 16.547072857405027, "grad_norm": 0.0020920548122376204, "learning_rate": 8.809072107237105e-07, "loss": 0.0, "num_input_tokens_seen": 194491488, "step": 90165 }, { "epoch": 16.54799045696458, "grad_norm": 0.00011100751726189628, "learning_rate": 8.804533517592501e-07, "loss": 0.0, "num_input_tokens_seen": 194502784, "step": 90170 }, { "epoch": 16.548908056524134, "grad_norm": 0.00891432911157608, "learning_rate": 8.799995984547754e-07, "loss": 0.0, "num_input_tokens_seen": 194513216, "step": 90175 }, { "epoch": 16.549825656083684, "grad_norm": 0.00042733430746011436, "learning_rate": 8.795459508219267e-07, "loss": 0.0, "num_input_tokens_seen": 194524128, "step": 90180 }, { "epoch": 16.550743255643237, "grad_norm": 0.0005502367275767028, "learning_rate": 8.790924088723384e-07, "loss": 0.0, "num_input_tokens_seen": 194534656, "step": 90185 }, { "epoch": 16.55166085520279, "grad_norm": 0.0004691948997788131, "learning_rate": 8.78638972617643e-07, "loss": 0.0, "num_input_tokens_seen": 194544992, "step": 90190 }, { "epoch": 16.55257845476234, "grad_norm": 0.00010451247362652794, "learning_rate": 8.78185642069469e-07, "loss": 0.0, "num_input_tokens_seen": 194555936, "step": 90195 }, { "epoch": 16.553496054321894, "grad_norm": 0.0002081191196339205, "learning_rate": 8.777324172394463e-07, "loss": 0.0, "num_input_tokens_seen": 194566272, "step": 90200 }, { "epoch": 16.554413653881447, "grad_norm": 0.00011097503738710657, "learning_rate": 8.772792981391981e-07, "loss": 0.0, "num_input_tokens_seen": 194578208, "step": 90205 }, { "epoch": 16.555331253440997, "grad_norm": 0.004021449945867062, "learning_rate": 8.768262847803466e-07, "loss": 0.0, "num_input_tokens_seen": 194589536, "step": 90210 }, { "epoch": 16.55624885300055, "grad_norm": 0.00014764300431124866, "learning_rate": 8.763733771745092e-07, "loss": 0.0, "num_input_tokens_seen": 194601216, "step": 90215 }, { "epoch": 16.557166452560104, "grad_norm": 0.00023958354722708464, "learning_rate": 8.75920575333305e-07, "loss": 0.0, "num_input_tokens_seen": 194611712, "step": 90220 }, { "epoch": 16.558084052119654, "grad_norm": 5.3564446716336533e-05, "learning_rate": 8.754678792683457e-07, "loss": 0.0, "num_input_tokens_seen": 194623712, "step": 90225 }, { "epoch": 16.559001651679207, "grad_norm": 9.908584615914151e-05, "learning_rate": 8.750152889912422e-07, "loss": 0.0002, "num_input_tokens_seen": 194635200, "step": 90230 }, { "epoch": 16.55991925123876, "grad_norm": 0.00010087090049637482, "learning_rate": 8.745628045136045e-07, "loss": 0.0, "num_input_tokens_seen": 194645568, "step": 90235 }, { "epoch": 16.56083685079831, "grad_norm": 0.0003208060807082802, "learning_rate": 8.741104258470368e-07, "loss": 0.0, "num_input_tokens_seen": 194655744, "step": 90240 }, { "epoch": 16.561754450357864, "grad_norm": 0.0005949990591034293, "learning_rate": 8.736581530031424e-07, "loss": 0.0, "num_input_tokens_seen": 194666688, "step": 90245 }, { "epoch": 16.562672049917417, "grad_norm": 0.00013827340444549918, "learning_rate": 8.7320598599352e-07, "loss": 0.0, "num_input_tokens_seen": 194677824, "step": 90250 }, { "epoch": 16.563589649476967, "grad_norm": 0.0005688119563274086, "learning_rate": 8.727539248297689e-07, "loss": 0.0, "num_input_tokens_seen": 194689344, "step": 90255 }, { "epoch": 16.56450724903652, "grad_norm": 4.8766152758616954e-05, "learning_rate": 8.72301969523483e-07, "loss": 0.0, "num_input_tokens_seen": 194700864, "step": 90260 }, { "epoch": 16.565424848596074, "grad_norm": 6.467460480052978e-05, "learning_rate": 8.718501200862533e-07, "loss": 0.0, "num_input_tokens_seen": 194711776, "step": 90265 }, { "epoch": 16.566342448155623, "grad_norm": 0.00011943361459998414, "learning_rate": 8.713983765296713e-07, "loss": 0.0, "num_input_tokens_seen": 194723328, "step": 90270 }, { "epoch": 16.567260047715177, "grad_norm": 6.842335278633982e-05, "learning_rate": 8.70946738865322e-07, "loss": 0.0, "num_input_tokens_seen": 194733984, "step": 90275 }, { "epoch": 16.56817764727473, "grad_norm": 0.000737285939976573, "learning_rate": 8.704952071047879e-07, "loss": 0.0, "num_input_tokens_seen": 194746112, "step": 90280 }, { "epoch": 16.56909524683428, "grad_norm": 0.00011832659220090136, "learning_rate": 8.700437812596535e-07, "loss": 0.0, "num_input_tokens_seen": 194756320, "step": 90285 }, { "epoch": 16.570012846393833, "grad_norm": 8.323427027789876e-05, "learning_rate": 8.695924613414946e-07, "loss": 0.0, "num_input_tokens_seen": 194766784, "step": 90290 }, { "epoch": 16.570930445953387, "grad_norm": 6.83900507283397e-05, "learning_rate": 8.691412473618876e-07, "loss": 0.0, "num_input_tokens_seen": 194777568, "step": 90295 }, { "epoch": 16.571848045512937, "grad_norm": 8.579556015320122e-05, "learning_rate": 8.686901393324043e-07, "loss": 0.0, "num_input_tokens_seen": 194789632, "step": 90300 }, { "epoch": 16.57276564507249, "grad_norm": 0.00013190344907343388, "learning_rate": 8.682391372646171e-07, "loss": 0.0, "num_input_tokens_seen": 194800832, "step": 90305 }, { "epoch": 16.573683244632043, "grad_norm": 0.00046092490083537996, "learning_rate": 8.677882411700928e-07, "loss": 0.0001, "num_input_tokens_seen": 194811360, "step": 90310 }, { "epoch": 16.574600844191593, "grad_norm": 5.864443664904684e-05, "learning_rate": 8.673374510603938e-07, "loss": 0.0, "num_input_tokens_seen": 194821792, "step": 90315 }, { "epoch": 16.575518443751147, "grad_norm": 0.00011782125511672348, "learning_rate": 8.668867669470859e-07, "loss": 0.0, "num_input_tokens_seen": 194832000, "step": 90320 }, { "epoch": 16.5764360433107, "grad_norm": 0.0019104386446997523, "learning_rate": 8.664361888417267e-07, "loss": 0.0, "num_input_tokens_seen": 194843264, "step": 90325 }, { "epoch": 16.57735364287025, "grad_norm": 0.02795998752117157, "learning_rate": 8.65985716755871e-07, "loss": 0.0, "num_input_tokens_seen": 194854656, "step": 90330 }, { "epoch": 16.578271242429803, "grad_norm": 6.374983058776706e-05, "learning_rate": 8.655353507010766e-07, "loss": 0.0, "num_input_tokens_seen": 194865216, "step": 90335 }, { "epoch": 16.579188841989357, "grad_norm": 0.00179290643427521, "learning_rate": 8.650850906888919e-07, "loss": 0.0, "num_input_tokens_seen": 194876960, "step": 90340 }, { "epoch": 16.580106441548907, "grad_norm": 0.00017016398487612605, "learning_rate": 8.646349367308666e-07, "loss": 0.0, "num_input_tokens_seen": 194887328, "step": 90345 }, { "epoch": 16.58102404110846, "grad_norm": 0.00013030147238168865, "learning_rate": 8.641848888385446e-07, "loss": 0.0, "num_input_tokens_seen": 194897696, "step": 90350 }, { "epoch": 16.581941640668013, "grad_norm": 0.00022137117048259825, "learning_rate": 8.637349470234713e-07, "loss": 0.0, "num_input_tokens_seen": 194907904, "step": 90355 }, { "epoch": 16.582859240227563, "grad_norm": 0.00017435762856621295, "learning_rate": 8.632851112971857e-07, "loss": 0.0, "num_input_tokens_seen": 194918624, "step": 90360 }, { "epoch": 16.583776839787117, "grad_norm": 0.013549626804888248, "learning_rate": 8.628353816712265e-07, "loss": 0.0, "num_input_tokens_seen": 194931200, "step": 90365 }, { "epoch": 16.58469443934667, "grad_norm": 0.0008272617706097662, "learning_rate": 8.62385758157126e-07, "loss": 0.0, "num_input_tokens_seen": 194943424, "step": 90370 }, { "epoch": 16.58561203890622, "grad_norm": 0.00022863459889777005, "learning_rate": 8.619362407664195e-07, "loss": 0.0, "num_input_tokens_seen": 194953440, "step": 90375 }, { "epoch": 16.586529638465773, "grad_norm": 0.0001554072805447504, "learning_rate": 8.614868295106343e-07, "loss": 0.0, "num_input_tokens_seen": 194963648, "step": 90380 }, { "epoch": 16.587447238025327, "grad_norm": 7.488929259125143e-05, "learning_rate": 8.610375244012986e-07, "loss": 0.0, "num_input_tokens_seen": 194973376, "step": 90385 }, { "epoch": 16.588364837584876, "grad_norm": 8.932268974604085e-05, "learning_rate": 8.605883254499353e-07, "loss": 0.0, "num_input_tokens_seen": 194984864, "step": 90390 }, { "epoch": 16.58928243714443, "grad_norm": 0.0008422394166700542, "learning_rate": 8.601392326680664e-07, "loss": 0.0, "num_input_tokens_seen": 194996352, "step": 90395 }, { "epoch": 16.590200036703983, "grad_norm": 0.00012612628052011132, "learning_rate": 8.596902460672079e-07, "loss": 0.0, "num_input_tokens_seen": 195005248, "step": 90400 }, { "epoch": 16.591117636263533, "grad_norm": 0.00010783546167658642, "learning_rate": 8.592413656588794e-07, "loss": 0.0, "num_input_tokens_seen": 195016320, "step": 90405 }, { "epoch": 16.592035235823086, "grad_norm": 0.00016921886708587408, "learning_rate": 8.587925914545925e-07, "loss": 0.0, "num_input_tokens_seen": 195027936, "step": 90410 }, { "epoch": 16.59295283538264, "grad_norm": 6.137731543276459e-05, "learning_rate": 8.583439234658558e-07, "loss": 0.0, "num_input_tokens_seen": 195039328, "step": 90415 }, { "epoch": 16.59387043494219, "grad_norm": 6.5981199441012e-05, "learning_rate": 8.578953617041797e-07, "loss": 0.0, "num_input_tokens_seen": 195049952, "step": 90420 }, { "epoch": 16.594788034501743, "grad_norm": 0.00010612851474434137, "learning_rate": 8.574469061810681e-07, "loss": 0.0, "num_input_tokens_seen": 195061152, "step": 90425 }, { "epoch": 16.595705634061297, "grad_norm": 0.00033391310716979206, "learning_rate": 8.569985569080225e-07, "loss": 0.0, "num_input_tokens_seen": 195072448, "step": 90430 }, { "epoch": 16.596623233620846, "grad_norm": 0.00043758878018707037, "learning_rate": 8.56550313896542e-07, "loss": 0.0, "num_input_tokens_seen": 195082144, "step": 90435 }, { "epoch": 16.5975408331804, "grad_norm": 5.942301868344657e-05, "learning_rate": 8.561021771581257e-07, "loss": 0.0, "num_input_tokens_seen": 195092608, "step": 90440 }, { "epoch": 16.598458432739953, "grad_norm": 0.0012253514723852277, "learning_rate": 8.556541467042656e-07, "loss": 0.0, "num_input_tokens_seen": 195104000, "step": 90445 }, { "epoch": 16.599376032299503, "grad_norm": 0.000749401340726763, "learning_rate": 8.552062225464525e-07, "loss": 0.0, "num_input_tokens_seen": 195115040, "step": 90450 }, { "epoch": 16.600293631859056, "grad_norm": 0.0005191314849071205, "learning_rate": 8.547584046961771e-07, "loss": 0.0, "num_input_tokens_seen": 195126848, "step": 90455 }, { "epoch": 16.60121123141861, "grad_norm": 0.00010429740359541029, "learning_rate": 8.543106931649236e-07, "loss": 0.0, "num_input_tokens_seen": 195139264, "step": 90460 }, { "epoch": 16.60212883097816, "grad_norm": 8.137373515637591e-05, "learning_rate": 8.538630879641752e-07, "loss": 0.0, "num_input_tokens_seen": 195150528, "step": 90465 }, { "epoch": 16.603046430537713, "grad_norm": 0.00010228085739072412, "learning_rate": 8.534155891054135e-07, "loss": 0.0, "num_input_tokens_seen": 195161856, "step": 90470 }, { "epoch": 16.603964030097266, "grad_norm": 0.00012925152259413153, "learning_rate": 8.529681966001152e-07, "loss": 0.0, "num_input_tokens_seen": 195172256, "step": 90475 }, { "epoch": 16.604881629656816, "grad_norm": 0.0001608406164450571, "learning_rate": 8.525209104597553e-07, "loss": 0.0, "num_input_tokens_seen": 195181984, "step": 90480 }, { "epoch": 16.60579922921637, "grad_norm": 0.0008748920517973602, "learning_rate": 8.520737306958049e-07, "loss": 0.0, "num_input_tokens_seen": 195192608, "step": 90485 }, { "epoch": 16.606716828775923, "grad_norm": 6.192195723997429e-05, "learning_rate": 8.516266573197363e-07, "loss": 0.0, "num_input_tokens_seen": 195203232, "step": 90490 }, { "epoch": 16.607634428335473, "grad_norm": 0.0001229754270752892, "learning_rate": 8.511796903430142e-07, "loss": 0.0, "num_input_tokens_seen": 195215168, "step": 90495 }, { "epoch": 16.608552027895026, "grad_norm": 0.00023508841695729643, "learning_rate": 8.507328297771017e-07, "loss": 0.0, "num_input_tokens_seen": 195226240, "step": 90500 }, { "epoch": 16.60946962745458, "grad_norm": 9.514106932329014e-05, "learning_rate": 8.502860756334624e-07, "loss": 0.0, "num_input_tokens_seen": 195237504, "step": 90505 }, { "epoch": 16.61038722701413, "grad_norm": 0.0005558007978834212, "learning_rate": 8.498394279235539e-07, "loss": 0.0, "num_input_tokens_seen": 195247680, "step": 90510 }, { "epoch": 16.611304826573683, "grad_norm": 0.0001683339214650914, "learning_rate": 8.493928866588308e-07, "loss": 0.0, "num_input_tokens_seen": 195257920, "step": 90515 }, { "epoch": 16.612222426133236, "grad_norm": 0.00011298267054371536, "learning_rate": 8.489464518507484e-07, "loss": 0.0, "num_input_tokens_seen": 195269216, "step": 90520 }, { "epoch": 16.613140025692786, "grad_norm": 0.00015749080921523273, "learning_rate": 8.485001235107559e-07, "loss": 0.0, "num_input_tokens_seen": 195279648, "step": 90525 }, { "epoch": 16.61405762525234, "grad_norm": 6.301549001364037e-05, "learning_rate": 8.480539016503009e-07, "loss": 0.0, "num_input_tokens_seen": 195291776, "step": 90530 }, { "epoch": 16.614975224811893, "grad_norm": 7.294974057003856e-05, "learning_rate": 8.476077862808274e-07, "loss": 0.0, "num_input_tokens_seen": 195301792, "step": 90535 }, { "epoch": 16.615892824371443, "grad_norm": 0.001286909799091518, "learning_rate": 8.471617774137797e-07, "loss": 0.0, "num_input_tokens_seen": 195312384, "step": 90540 }, { "epoch": 16.616810423930996, "grad_norm": 0.0008413416799157858, "learning_rate": 8.467158750605964e-07, "loss": 0.0, "num_input_tokens_seen": 195323296, "step": 90545 }, { "epoch": 16.61772802349055, "grad_norm": 0.00012201185018057004, "learning_rate": 8.462700792327122e-07, "loss": 0.0, "num_input_tokens_seen": 195334240, "step": 90550 }, { "epoch": 16.6186456230501, "grad_norm": 5.709183096769266e-05, "learning_rate": 8.458243899415641e-07, "loss": 0.0, "num_input_tokens_seen": 195345120, "step": 90555 }, { "epoch": 16.619563222609653, "grad_norm": 0.00015734393673483282, "learning_rate": 8.453788071985824e-07, "loss": 0.0, "num_input_tokens_seen": 195356256, "step": 90560 }, { "epoch": 16.620480822169206, "grad_norm": 0.00154875370208174, "learning_rate": 8.449333310151947e-07, "loss": 0.0, "num_input_tokens_seen": 195367872, "step": 90565 }, { "epoch": 16.621398421728756, "grad_norm": 0.0015961163444444537, "learning_rate": 8.444879614028262e-07, "loss": 0.0, "num_input_tokens_seen": 195378560, "step": 90570 }, { "epoch": 16.62231602128831, "grad_norm": 7.117255154298618e-05, "learning_rate": 8.440426983729027e-07, "loss": 0.0, "num_input_tokens_seen": 195389568, "step": 90575 }, { "epoch": 16.623233620847863, "grad_norm": 6.1011382058495656e-05, "learning_rate": 8.435975419368425e-07, "loss": 0.0, "num_input_tokens_seen": 195399136, "step": 90580 }, { "epoch": 16.624151220407413, "grad_norm": 0.0001368810044368729, "learning_rate": 8.431524921060635e-07, "loss": 0.0, "num_input_tokens_seen": 195409088, "step": 90585 }, { "epoch": 16.625068819966966, "grad_norm": 0.00022430440003518015, "learning_rate": 8.427075488919801e-07, "loss": 0.0, "num_input_tokens_seen": 195419392, "step": 90590 }, { "epoch": 16.62598641952652, "grad_norm": 5.430516830529086e-05, "learning_rate": 8.42262712306004e-07, "loss": 0.0, "num_input_tokens_seen": 195429344, "step": 90595 }, { "epoch": 16.62690401908607, "grad_norm": 0.00032915276824496686, "learning_rate": 8.418179823595468e-07, "loss": 0.0, "num_input_tokens_seen": 195440096, "step": 90600 }, { "epoch": 16.627821618645623, "grad_norm": 0.00046535374713130295, "learning_rate": 8.413733590640138e-07, "loss": 0.0, "num_input_tokens_seen": 195451328, "step": 90605 }, { "epoch": 16.628739218205176, "grad_norm": 0.00010749584180302918, "learning_rate": 8.409288424308088e-07, "loss": 0.0, "num_input_tokens_seen": 195461120, "step": 90610 }, { "epoch": 16.629656817764726, "grad_norm": 0.0011870564194396138, "learning_rate": 8.40484432471333e-07, "loss": 0.0, "num_input_tokens_seen": 195473120, "step": 90615 }, { "epoch": 16.63057441732428, "grad_norm": 0.00010715855023590848, "learning_rate": 8.400401291969834e-07, "loss": 0.0, "num_input_tokens_seen": 195483968, "step": 90620 }, { "epoch": 16.631492016883833, "grad_norm": 8.181585144484416e-05, "learning_rate": 8.395959326191583e-07, "loss": 0.0, "num_input_tokens_seen": 195494400, "step": 90625 }, { "epoch": 16.632409616443383, "grad_norm": 9.931498061632738e-05, "learning_rate": 8.391518427492501e-07, "loss": 0.0, "num_input_tokens_seen": 195504832, "step": 90630 }, { "epoch": 16.633327216002936, "grad_norm": 0.0001943547831615433, "learning_rate": 8.387078595986464e-07, "loss": 0.0, "num_input_tokens_seen": 195515072, "step": 90635 }, { "epoch": 16.63424481556249, "grad_norm": 6.0403133829822764e-05, "learning_rate": 8.382639831787387e-07, "loss": 0.0, "num_input_tokens_seen": 195526400, "step": 90640 }, { "epoch": 16.63516241512204, "grad_norm": 9.246853733202443e-05, "learning_rate": 8.378202135009089e-07, "loss": 0.0, "num_input_tokens_seen": 195537888, "step": 90645 }, { "epoch": 16.636080014681593, "grad_norm": 0.00011955334775848314, "learning_rate": 8.373765505765391e-07, "loss": 0.0, "num_input_tokens_seen": 195547744, "step": 90650 }, { "epoch": 16.636997614241146, "grad_norm": 0.0002414273185422644, "learning_rate": 8.369329944170107e-07, "loss": 0.0, "num_input_tokens_seen": 195559392, "step": 90655 }, { "epoch": 16.637915213800696, "grad_norm": 5.5307300499407575e-05, "learning_rate": 8.364895450336985e-07, "loss": 0.0001, "num_input_tokens_seen": 195568576, "step": 90660 }, { "epoch": 16.63883281336025, "grad_norm": 6.170492270030081e-05, "learning_rate": 8.360462024379762e-07, "loss": 0.0, "num_input_tokens_seen": 195579296, "step": 90665 }, { "epoch": 16.639750412919803, "grad_norm": 0.0001085604089894332, "learning_rate": 8.356029666412147e-07, "loss": 0.0, "num_input_tokens_seen": 195589312, "step": 90670 }, { "epoch": 16.640668012479352, "grad_norm": 0.00014862293028272688, "learning_rate": 8.351598376547837e-07, "loss": 0.0, "num_input_tokens_seen": 195599680, "step": 90675 }, { "epoch": 16.641585612038906, "grad_norm": 9.18138466659002e-05, "learning_rate": 8.347168154900481e-07, "loss": 0.0, "num_input_tokens_seen": 195610048, "step": 90680 }, { "epoch": 16.64250321159846, "grad_norm": 8.127262117341161e-05, "learning_rate": 8.342739001583699e-07, "loss": 0.0, "num_input_tokens_seen": 195621312, "step": 90685 }, { "epoch": 16.64342081115801, "grad_norm": 0.00016858309390954673, "learning_rate": 8.338310916711106e-07, "loss": 0.0, "num_input_tokens_seen": 195632608, "step": 90690 }, { "epoch": 16.644338410717562, "grad_norm": 0.00019489112310111523, "learning_rate": 8.333883900396267e-07, "loss": 0.0, "num_input_tokens_seen": 195643296, "step": 90695 }, { "epoch": 16.645256010277116, "grad_norm": 6.119376485003158e-05, "learning_rate": 8.329457952752729e-07, "loss": 0.0, "num_input_tokens_seen": 195653152, "step": 90700 }, { "epoch": 16.646173609836666, "grad_norm": 0.00022301978606265038, "learning_rate": 8.325033073894001e-07, "loss": 0.0, "num_input_tokens_seen": 195662848, "step": 90705 }, { "epoch": 16.64709120939622, "grad_norm": 5.168074130779132e-05, "learning_rate": 8.320609263933593e-07, "loss": 0.0, "num_input_tokens_seen": 195673504, "step": 90710 }, { "epoch": 16.648008808955773, "grad_norm": 0.01101804617792368, "learning_rate": 8.316186522984964e-07, "loss": 0.0, "num_input_tokens_seen": 195683296, "step": 90715 }, { "epoch": 16.648926408515322, "grad_norm": 5.978470653644763e-05, "learning_rate": 8.311764851161535e-07, "loss": 0.0, "num_input_tokens_seen": 195692192, "step": 90720 }, { "epoch": 16.649844008074876, "grad_norm": 8.514225919498131e-05, "learning_rate": 8.307344248576738e-07, "loss": 0.0, "num_input_tokens_seen": 195703328, "step": 90725 }, { "epoch": 16.65076160763443, "grad_norm": 0.0025658151134848595, "learning_rate": 8.302924715343941e-07, "loss": 0.0, "num_input_tokens_seen": 195715936, "step": 90730 }, { "epoch": 16.65167920719398, "grad_norm": 6.35138712823391e-05, "learning_rate": 8.298506251576494e-07, "loss": 0.0, "num_input_tokens_seen": 195727616, "step": 90735 }, { "epoch": 16.652596806753532, "grad_norm": 0.0001683756272541359, "learning_rate": 8.294088857387733e-07, "loss": 0.0, "num_input_tokens_seen": 195738432, "step": 90740 }, { "epoch": 16.653514406313086, "grad_norm": 7.380080205621198e-05, "learning_rate": 8.289672532890963e-07, "loss": 0.0, "num_input_tokens_seen": 195749952, "step": 90745 }, { "epoch": 16.654432005872636, "grad_norm": 0.0024876927491277456, "learning_rate": 8.285257278199443e-07, "loss": 0.0, "num_input_tokens_seen": 195760192, "step": 90750 }, { "epoch": 16.65534960543219, "grad_norm": 0.0001052895822795108, "learning_rate": 8.28084309342641e-07, "loss": 0.0, "num_input_tokens_seen": 195771232, "step": 90755 }, { "epoch": 16.656267204991742, "grad_norm": 0.00012190069537609816, "learning_rate": 8.276429978685108e-07, "loss": 0.0, "num_input_tokens_seen": 195781248, "step": 90760 }, { "epoch": 16.657184804551292, "grad_norm": 0.00011724849900929257, "learning_rate": 8.272017934088706e-07, "loss": 0.0, "num_input_tokens_seen": 195792896, "step": 90765 }, { "epoch": 16.658102404110846, "grad_norm": 0.0003234573523513973, "learning_rate": 8.267606959750363e-07, "loss": 0.0, "num_input_tokens_seen": 195804416, "step": 90770 }, { "epoch": 16.6590200036704, "grad_norm": 0.0004107340064365417, "learning_rate": 8.263197055783234e-07, "loss": 0.0, "num_input_tokens_seen": 195816000, "step": 90775 }, { "epoch": 16.65993760322995, "grad_norm": 0.0001708322815829888, "learning_rate": 8.258788222300413e-07, "loss": 0.0, "num_input_tokens_seen": 195827936, "step": 90780 }, { "epoch": 16.660855202789502, "grad_norm": 0.00013994451728649437, "learning_rate": 8.254380459414984e-07, "loss": 0.0, "num_input_tokens_seen": 195837888, "step": 90785 }, { "epoch": 16.661772802349056, "grad_norm": 0.00035664235474541783, "learning_rate": 8.249973767239983e-07, "loss": 0.0, "num_input_tokens_seen": 195847136, "step": 90790 }, { "epoch": 16.662690401908606, "grad_norm": 6.635047611780465e-05, "learning_rate": 8.24556814588846e-07, "loss": 0.0, "num_input_tokens_seen": 195857664, "step": 90795 }, { "epoch": 16.66360800146816, "grad_norm": 6.108960951678455e-05, "learning_rate": 8.2411635954734e-07, "loss": 0.0, "num_input_tokens_seen": 195868288, "step": 90800 }, { "epoch": 16.664525601027712, "grad_norm": 5.874259659321979e-05, "learning_rate": 8.236760116107773e-07, "loss": 0.0, "num_input_tokens_seen": 195880352, "step": 90805 }, { "epoch": 16.665443200587262, "grad_norm": 0.00012845243327319622, "learning_rate": 8.232357707904521e-07, "loss": 0.0, "num_input_tokens_seen": 195890048, "step": 90810 }, { "epoch": 16.666360800146816, "grad_norm": 0.00047976235509850085, "learning_rate": 8.227956370976553e-07, "loss": 0.0, "num_input_tokens_seen": 195901024, "step": 90815 }, { "epoch": 16.66727839970637, "grad_norm": 0.00014901709801051766, "learning_rate": 8.22355610543677e-07, "loss": 0.0, "num_input_tokens_seen": 195912480, "step": 90820 }, { "epoch": 16.66819599926592, "grad_norm": 0.0001484697131672874, "learning_rate": 8.219156911398024e-07, "loss": 0.0, "num_input_tokens_seen": 195923392, "step": 90825 }, { "epoch": 16.669113598825472, "grad_norm": 5.2932202379452065e-05, "learning_rate": 8.214758788973154e-07, "loss": 0.0, "num_input_tokens_seen": 195934432, "step": 90830 }, { "epoch": 16.670031198385026, "grad_norm": 0.00010013318387791514, "learning_rate": 8.210361738274946e-07, "loss": 0.0, "num_input_tokens_seen": 195945472, "step": 90835 }, { "epoch": 16.670948797944575, "grad_norm": 0.0004200079129077494, "learning_rate": 8.205965759416202e-07, "loss": 0.0207, "num_input_tokens_seen": 195955936, "step": 90840 }, { "epoch": 16.67186639750413, "grad_norm": 7.587834261357784e-05, "learning_rate": 8.201570852509661e-07, "loss": 0.0, "num_input_tokens_seen": 195966720, "step": 90845 }, { "epoch": 16.672783997063682, "grad_norm": 8.126715692924336e-05, "learning_rate": 8.197177017668051e-07, "loss": 0.0, "num_input_tokens_seen": 195976160, "step": 90850 }, { "epoch": 16.673701596623232, "grad_norm": 5.896767834201455e-05, "learning_rate": 8.192784255004043e-07, "loss": 0.0, "num_input_tokens_seen": 195985824, "step": 90855 }, { "epoch": 16.674619196182785, "grad_norm": 0.00011643422476481646, "learning_rate": 8.188392564630337e-07, "loss": 0.0, "num_input_tokens_seen": 195997984, "step": 90860 }, { "epoch": 16.67553679574234, "grad_norm": 7.574623305117711e-05, "learning_rate": 8.184001946659564e-07, "loss": 0.0, "num_input_tokens_seen": 196008096, "step": 90865 }, { "epoch": 16.67645439530189, "grad_norm": 0.00015843503933865577, "learning_rate": 8.179612401204317e-07, "loss": 0.0, "num_input_tokens_seen": 196017696, "step": 90870 }, { "epoch": 16.677371994861442, "grad_norm": 0.0002245591313112527, "learning_rate": 8.175223928377207e-07, "loss": 0.0, "num_input_tokens_seen": 196028640, "step": 90875 }, { "epoch": 16.678289594420995, "grad_norm": 9.442909504286945e-05, "learning_rate": 8.170836528290783e-07, "loss": 0.0, "num_input_tokens_seen": 196039232, "step": 90880 }, { "epoch": 16.679207193980545, "grad_norm": 7.351599924732e-05, "learning_rate": 8.166450201057574e-07, "loss": 0.0, "num_input_tokens_seen": 196050240, "step": 90885 }, { "epoch": 16.6801247935401, "grad_norm": 7.944256503833458e-05, "learning_rate": 8.162064946790066e-07, "loss": 0.0, "num_input_tokens_seen": 196060992, "step": 90890 }, { "epoch": 16.681042393099652, "grad_norm": 0.00013164192205294967, "learning_rate": 8.157680765600762e-07, "loss": 0.0, "num_input_tokens_seen": 196071328, "step": 90895 }, { "epoch": 16.681959992659202, "grad_norm": 0.0003839043201878667, "learning_rate": 8.1532976576021e-07, "loss": 0.0, "num_input_tokens_seen": 196082048, "step": 90900 }, { "epoch": 16.682877592218755, "grad_norm": 5.083884025225416e-05, "learning_rate": 8.148915622906478e-07, "loss": 0.0, "num_input_tokens_seen": 196091968, "step": 90905 }, { "epoch": 16.68379519177831, "grad_norm": 9.209764539264143e-05, "learning_rate": 8.144534661626324e-07, "loss": 0.0, "num_input_tokens_seen": 196103616, "step": 90910 }, { "epoch": 16.68471279133786, "grad_norm": 0.00015850619820412248, "learning_rate": 8.140154773873988e-07, "loss": 0.0, "num_input_tokens_seen": 196114176, "step": 90915 }, { "epoch": 16.685630390897412, "grad_norm": 7.531677692895755e-05, "learning_rate": 8.135775959761788e-07, "loss": 0.0, "num_input_tokens_seen": 196125120, "step": 90920 }, { "epoch": 16.686547990456965, "grad_norm": 8.065438305493444e-05, "learning_rate": 8.131398219402065e-07, "loss": 0.0, "num_input_tokens_seen": 196135808, "step": 90925 }, { "epoch": 16.687465590016515, "grad_norm": 0.00015749795420560986, "learning_rate": 8.127021552907083e-07, "loss": 0.0, "num_input_tokens_seen": 196147520, "step": 90930 }, { "epoch": 16.68838318957607, "grad_norm": 0.0795678123831749, "learning_rate": 8.122645960389108e-07, "loss": 0.0001, "num_input_tokens_seen": 196159008, "step": 90935 }, { "epoch": 16.689300789135622, "grad_norm": 0.00022052765416447073, "learning_rate": 8.118271441960346e-07, "loss": 0.0, "num_input_tokens_seen": 196170496, "step": 90940 }, { "epoch": 16.690218388695172, "grad_norm": 0.00019103105296380818, "learning_rate": 8.113897997733017e-07, "loss": 0.0, "num_input_tokens_seen": 196181440, "step": 90945 }, { "epoch": 16.691135988254725, "grad_norm": 0.0006448394269682467, "learning_rate": 8.109525627819293e-07, "loss": 0.0, "num_input_tokens_seen": 196191200, "step": 90950 }, { "epoch": 16.69205358781428, "grad_norm": 7.550124428234994e-05, "learning_rate": 8.1051543323313e-07, "loss": 0.0, "num_input_tokens_seen": 196202624, "step": 90955 }, { "epoch": 16.69297118737383, "grad_norm": 9.51608017203398e-05, "learning_rate": 8.100784111381177e-07, "loss": 0.0, "num_input_tokens_seen": 196213664, "step": 90960 }, { "epoch": 16.693888786933382, "grad_norm": 0.00010236418893327937, "learning_rate": 8.096414965081007e-07, "loss": 0.0, "num_input_tokens_seen": 196224288, "step": 90965 }, { "epoch": 16.694806386492935, "grad_norm": 0.0001258716656593606, "learning_rate": 8.092046893542832e-07, "loss": 0.0, "num_input_tokens_seen": 196234656, "step": 90970 }, { "epoch": 16.695723986052485, "grad_norm": 7.443256617989391e-05, "learning_rate": 8.087679896878715e-07, "loss": 0.0, "num_input_tokens_seen": 196245568, "step": 90975 }, { "epoch": 16.69664158561204, "grad_norm": 0.00015455020184163004, "learning_rate": 8.083313975200651e-07, "loss": 0.0, "num_input_tokens_seen": 196255808, "step": 90980 }, { "epoch": 16.697559185171592, "grad_norm": 0.0002016449871007353, "learning_rate": 8.078949128620623e-07, "loss": 0.0, "num_input_tokens_seen": 196266208, "step": 90985 }, { "epoch": 16.69847678473114, "grad_norm": 5.4535343224415556e-05, "learning_rate": 8.074585357250564e-07, "loss": 0.0, "num_input_tokens_seen": 196276800, "step": 90990 }, { "epoch": 16.699394384290695, "grad_norm": 7.04462145222351e-05, "learning_rate": 8.07022266120242e-07, "loss": 0.0, "num_input_tokens_seen": 196288832, "step": 90995 }, { "epoch": 16.70031198385025, "grad_norm": 0.025790903717279434, "learning_rate": 8.065861040588086e-07, "loss": 0.0, "num_input_tokens_seen": 196299776, "step": 91000 }, { "epoch": 16.7012295834098, "grad_norm": 0.0001286137121496722, "learning_rate": 8.06150049551942e-07, "loss": 0.0, "num_input_tokens_seen": 196311200, "step": 91005 }, { "epoch": 16.70214718296935, "grad_norm": 0.00010163179831579328, "learning_rate": 8.057141026108256e-07, "loss": 0.0, "num_input_tokens_seen": 196322624, "step": 91010 }, { "epoch": 16.703064782528905, "grad_norm": 0.001039201975800097, "learning_rate": 8.052782632466427e-07, "loss": 0.0, "num_input_tokens_seen": 196332448, "step": 91015 }, { "epoch": 16.703982382088455, "grad_norm": 0.00014065142022445798, "learning_rate": 8.048425314705716e-07, "loss": 0.0, "num_input_tokens_seen": 196342976, "step": 91020 }, { "epoch": 16.70489998164801, "grad_norm": 0.0003249221190344542, "learning_rate": 8.044069072937877e-07, "loss": 0.0, "num_input_tokens_seen": 196354848, "step": 91025 }, { "epoch": 16.70581758120756, "grad_norm": 0.00032932349131442606, "learning_rate": 8.039713907274643e-07, "loss": 0.0, "num_input_tokens_seen": 196366336, "step": 91030 }, { "epoch": 16.70673518076711, "grad_norm": 0.0001407304807798937, "learning_rate": 8.035359817827698e-07, "loss": 0.0, "num_input_tokens_seen": 196377120, "step": 91035 }, { "epoch": 16.707652780326665, "grad_norm": 0.00029659419669769704, "learning_rate": 8.031006804708746e-07, "loss": 0.0, "num_input_tokens_seen": 196387296, "step": 91040 }, { "epoch": 16.70857037988622, "grad_norm": 0.00016423373017460108, "learning_rate": 8.026654868029427e-07, "loss": 0.0, "num_input_tokens_seen": 196397376, "step": 91045 }, { "epoch": 16.70948797944577, "grad_norm": 9.301817772211507e-05, "learning_rate": 8.022304007901355e-07, "loss": 0.0, "num_input_tokens_seen": 196408448, "step": 91050 }, { "epoch": 16.71040557900532, "grad_norm": 6.840098649263382e-05, "learning_rate": 8.017954224436114e-07, "loss": 0.0, "num_input_tokens_seen": 196419872, "step": 91055 }, { "epoch": 16.711323178564875, "grad_norm": 5.813653478980996e-05, "learning_rate": 8.013605517745293e-07, "loss": 0.0, "num_input_tokens_seen": 196430112, "step": 91060 }, { "epoch": 16.712240778124425, "grad_norm": 0.0002009825548157096, "learning_rate": 8.009257887940419e-07, "loss": 0.0, "num_input_tokens_seen": 196439584, "step": 91065 }, { "epoch": 16.71315837768398, "grad_norm": 0.020928123965859413, "learning_rate": 8.004911335132998e-07, "loss": 0.0, "num_input_tokens_seen": 196451200, "step": 91070 }, { "epoch": 16.71407597724353, "grad_norm": 0.0007109930738806725, "learning_rate": 8.000565859434506e-07, "loss": 0.0, "num_input_tokens_seen": 196462496, "step": 91075 }, { "epoch": 16.71499357680308, "grad_norm": 0.0005217110156081617, "learning_rate": 7.996221460956416e-07, "loss": 0.0, "num_input_tokens_seen": 196472448, "step": 91080 }, { "epoch": 16.715911176362635, "grad_norm": 0.0006487361970357597, "learning_rate": 7.99187813981015e-07, "loss": 0.0, "num_input_tokens_seen": 196482336, "step": 91085 }, { "epoch": 16.71682877592219, "grad_norm": 0.000507988384924829, "learning_rate": 7.987535896107085e-07, "loss": 0.0, "num_input_tokens_seen": 196493632, "step": 91090 }, { "epoch": 16.717746375481738, "grad_norm": 5.232670810073614e-05, "learning_rate": 7.983194729958626e-07, "loss": 0.0, "num_input_tokens_seen": 196503648, "step": 91095 }, { "epoch": 16.71866397504129, "grad_norm": 8.363326924154535e-05, "learning_rate": 7.978854641476102e-07, "loss": 0.0, "num_input_tokens_seen": 196514208, "step": 91100 }, { "epoch": 16.719581574600845, "grad_norm": 0.00038709529326297343, "learning_rate": 7.974515630770813e-07, "loss": 0.0, "num_input_tokens_seen": 196525152, "step": 91105 }, { "epoch": 16.720499174160395, "grad_norm": 0.00011247454676777124, "learning_rate": 7.970177697954084e-07, "loss": 0.0, "num_input_tokens_seen": 196535072, "step": 91110 }, { "epoch": 16.721416773719948, "grad_norm": 0.0001665625168243423, "learning_rate": 7.965840843137152e-07, "loss": 0.0, "num_input_tokens_seen": 196545856, "step": 91115 }, { "epoch": 16.7223343732795, "grad_norm": 8.13209408079274e-05, "learning_rate": 7.961505066431258e-07, "loss": 0.0, "num_input_tokens_seen": 196556032, "step": 91120 }, { "epoch": 16.72325197283905, "grad_norm": 6.406130705727264e-05, "learning_rate": 7.957170367947587e-07, "loss": 0.0, "num_input_tokens_seen": 196566112, "step": 91125 }, { "epoch": 16.724169572398605, "grad_norm": 7.773973629809916e-05, "learning_rate": 7.952836747797354e-07, "loss": 0.0, "num_input_tokens_seen": 196576736, "step": 91130 }, { "epoch": 16.725087171958158, "grad_norm": 0.0009144210489466786, "learning_rate": 7.94850420609169e-07, "loss": 0.0, "num_input_tokens_seen": 196587392, "step": 91135 }, { "epoch": 16.726004771517708, "grad_norm": 0.00041139486711472273, "learning_rate": 7.944172742941708e-07, "loss": 0.0, "num_input_tokens_seen": 196597792, "step": 91140 }, { "epoch": 16.72692237107726, "grad_norm": 0.00010843466588994488, "learning_rate": 7.939842358458521e-07, "loss": 0.0, "num_input_tokens_seen": 196609024, "step": 91145 }, { "epoch": 16.727839970636815, "grad_norm": 8.422024257015437e-05, "learning_rate": 7.935513052753197e-07, "loss": 0.0, "num_input_tokens_seen": 196619680, "step": 91150 }, { "epoch": 16.728757570196365, "grad_norm": 9.86239465419203e-05, "learning_rate": 7.931184825936766e-07, "loss": 0.0, "num_input_tokens_seen": 196629632, "step": 91155 }, { "epoch": 16.729675169755918, "grad_norm": 0.00019571016309782863, "learning_rate": 7.926857678120232e-07, "loss": 0.0, "num_input_tokens_seen": 196639264, "step": 91160 }, { "epoch": 16.73059276931547, "grad_norm": 0.0004857470339629799, "learning_rate": 7.922531609414602e-07, "loss": 0.0, "num_input_tokens_seen": 196650976, "step": 91165 }, { "epoch": 16.73151036887502, "grad_norm": 0.0003914129047188908, "learning_rate": 7.918206619930824e-07, "loss": 0.0, "num_input_tokens_seen": 196662528, "step": 91170 }, { "epoch": 16.732427968434575, "grad_norm": 5.669841993949376e-05, "learning_rate": 7.913882709779813e-07, "loss": 0.0, "num_input_tokens_seen": 196674176, "step": 91175 }, { "epoch": 16.733345567994128, "grad_norm": 0.0020580976270139217, "learning_rate": 7.909559879072493e-07, "loss": 0.0, "num_input_tokens_seen": 196686016, "step": 91180 }, { "epoch": 16.734263167553678, "grad_norm": 0.0001485436368966475, "learning_rate": 7.905238127919729e-07, "loss": 0.0, "num_input_tokens_seen": 196696480, "step": 91185 }, { "epoch": 16.73518076711323, "grad_norm": 5.2335260988911614e-05, "learning_rate": 7.900917456432355e-07, "loss": 0.0, "num_input_tokens_seen": 196707104, "step": 91190 }, { "epoch": 16.736098366672785, "grad_norm": 0.00011603259918047115, "learning_rate": 7.896597864721212e-07, "loss": 0.0, "num_input_tokens_seen": 196716864, "step": 91195 }, { "epoch": 16.737015966232335, "grad_norm": 0.00027868899633176625, "learning_rate": 7.892279352897075e-07, "loss": 0.0, "num_input_tokens_seen": 196727104, "step": 91200 }, { "epoch": 16.737933565791888, "grad_norm": 0.00010979470243910328, "learning_rate": 7.887961921070719e-07, "loss": 0.0, "num_input_tokens_seen": 196738176, "step": 91205 }, { "epoch": 16.73885116535144, "grad_norm": 0.00023730896646156907, "learning_rate": 7.883645569352854e-07, "loss": 0.0, "num_input_tokens_seen": 196749920, "step": 91210 }, { "epoch": 16.73976876491099, "grad_norm": 0.0001060120266629383, "learning_rate": 7.879330297854221e-07, "loss": 0.0, "num_input_tokens_seen": 196761120, "step": 91215 }, { "epoch": 16.740686364470545, "grad_norm": 0.00018162679043598473, "learning_rate": 7.875016106685485e-07, "loss": 0.0, "num_input_tokens_seen": 196772448, "step": 91220 }, { "epoch": 16.741603964030098, "grad_norm": 8.280518522951752e-05, "learning_rate": 7.870702995957297e-07, "loss": 0.0, "num_input_tokens_seen": 196782208, "step": 91225 }, { "epoch": 16.742521563589648, "grad_norm": 9.057879651663825e-05, "learning_rate": 7.866390965780274e-07, "loss": 0.0, "num_input_tokens_seen": 196792736, "step": 91230 }, { "epoch": 16.7434391631492, "grad_norm": 0.0002703209756873548, "learning_rate": 7.862080016265028e-07, "loss": 0.0, "num_input_tokens_seen": 196802528, "step": 91235 }, { "epoch": 16.744356762708755, "grad_norm": 0.00018296623602509499, "learning_rate": 7.857770147522126e-07, "loss": 0.0, "num_input_tokens_seen": 196813856, "step": 91240 }, { "epoch": 16.745274362268304, "grad_norm": 0.00010743597522377968, "learning_rate": 7.853461359662101e-07, "loss": 0.0, "num_input_tokens_seen": 196824288, "step": 91245 }, { "epoch": 16.746191961827858, "grad_norm": 6.281101377680898e-05, "learning_rate": 7.849153652795472e-07, "loss": 0.0, "num_input_tokens_seen": 196834816, "step": 91250 }, { "epoch": 16.74710956138741, "grad_norm": 0.0003270533634349704, "learning_rate": 7.844847027032715e-07, "loss": 0.0, "num_input_tokens_seen": 196845728, "step": 91255 }, { "epoch": 16.74802716094696, "grad_norm": 5.227861038292758e-05, "learning_rate": 7.84054148248431e-07, "loss": 0.0, "num_input_tokens_seen": 196856576, "step": 91260 }, { "epoch": 16.748944760506514, "grad_norm": 4.620970139512792e-05, "learning_rate": 7.836237019260667e-07, "loss": 0.0, "num_input_tokens_seen": 196867776, "step": 91265 }, { "epoch": 16.749862360066068, "grad_norm": 9.554273856338114e-05, "learning_rate": 7.831933637472205e-07, "loss": 0.0, "num_input_tokens_seen": 196878144, "step": 91270 }, { "epoch": 16.750779959625618, "grad_norm": 9.441763540962711e-05, "learning_rate": 7.827631337229274e-07, "loss": 0.0, "num_input_tokens_seen": 196888160, "step": 91275 }, { "epoch": 16.75169755918517, "grad_norm": 0.0004958758945576847, "learning_rate": 7.823330118642253e-07, "loss": 0.0, "num_input_tokens_seen": 196897920, "step": 91280 }, { "epoch": 16.752615158744725, "grad_norm": 0.0001203159408760257, "learning_rate": 7.819029981821441e-07, "loss": 0.0, "num_input_tokens_seen": 196909440, "step": 91285 }, { "epoch": 16.753532758304274, "grad_norm": 0.00040786847239360213, "learning_rate": 7.814730926877129e-07, "loss": 0.0, "num_input_tokens_seen": 196918816, "step": 91290 }, { "epoch": 16.754450357863828, "grad_norm": 0.000588743481785059, "learning_rate": 7.8104329539196e-07, "loss": 0.0, "num_input_tokens_seen": 196930464, "step": 91295 }, { "epoch": 16.75536795742338, "grad_norm": 0.00030864443397149444, "learning_rate": 7.806136063059072e-07, "loss": 0.0, "num_input_tokens_seen": 196940864, "step": 91300 }, { "epoch": 16.75628555698293, "grad_norm": 8.446600986644626e-05, "learning_rate": 7.801840254405763e-07, "loss": 0.0, "num_input_tokens_seen": 196952576, "step": 91305 }, { "epoch": 16.757203156542484, "grad_norm": 0.0021818680688738823, "learning_rate": 7.797545528069839e-07, "loss": 0.0, "num_input_tokens_seen": 196963808, "step": 91310 }, { "epoch": 16.758120756102038, "grad_norm": 0.00014818314230069518, "learning_rate": 7.793251884161474e-07, "loss": 0.0, "num_input_tokens_seen": 196975648, "step": 91315 }, { "epoch": 16.759038355661588, "grad_norm": 0.012444031424820423, "learning_rate": 7.788959322790784e-07, "loss": 0.0, "num_input_tokens_seen": 196986272, "step": 91320 }, { "epoch": 16.75995595522114, "grad_norm": 8.351372525794432e-05, "learning_rate": 7.784667844067856e-07, "loss": 0.0, "num_input_tokens_seen": 196996992, "step": 91325 }, { "epoch": 16.760873554780694, "grad_norm": 0.00010092041338793933, "learning_rate": 7.780377448102783e-07, "loss": 0.0, "num_input_tokens_seen": 197008672, "step": 91330 }, { "epoch": 16.761791154340244, "grad_norm": 0.00019785597396548837, "learning_rate": 7.776088135005594e-07, "loss": 0.0, "num_input_tokens_seen": 197020800, "step": 91335 }, { "epoch": 16.762708753899798, "grad_norm": 0.021335704252123833, "learning_rate": 7.771799904886301e-07, "loss": 0.0, "num_input_tokens_seen": 197031552, "step": 91340 }, { "epoch": 16.76362635345935, "grad_norm": 0.00010655736696207896, "learning_rate": 7.767512757854878e-07, "loss": 0.0, "num_input_tokens_seen": 197042240, "step": 91345 }, { "epoch": 16.7645439530189, "grad_norm": 6.145427323644981e-05, "learning_rate": 7.763226694021314e-07, "loss": 0.0, "num_input_tokens_seen": 197051872, "step": 91350 }, { "epoch": 16.765461552578454, "grad_norm": 0.00025204921257682145, "learning_rate": 7.758941713495522e-07, "loss": 0.0, "num_input_tokens_seen": 197062336, "step": 91355 }, { "epoch": 16.766379152138008, "grad_norm": 5.634620538330637e-05, "learning_rate": 7.754657816387401e-07, "loss": 0.0, "num_input_tokens_seen": 197072608, "step": 91360 }, { "epoch": 16.767296751697558, "grad_norm": 0.00010145373380510136, "learning_rate": 7.750375002806837e-07, "loss": 0.0225, "num_input_tokens_seen": 197082560, "step": 91365 }, { "epoch": 16.76821435125711, "grad_norm": 6.0387108533177525e-05, "learning_rate": 7.746093272863681e-07, "loss": 0.0, "num_input_tokens_seen": 197093088, "step": 91370 }, { "epoch": 16.769131950816664, "grad_norm": 0.0001956909109139815, "learning_rate": 7.741812626667727e-07, "loss": 0.0, "num_input_tokens_seen": 197103648, "step": 91375 }, { "epoch": 16.770049550376214, "grad_norm": 0.00016745293396525085, "learning_rate": 7.737533064328795e-07, "loss": 0.0, "num_input_tokens_seen": 197114400, "step": 91380 }, { "epoch": 16.770967149935768, "grad_norm": 7.072640437399969e-05, "learning_rate": 7.733254585956646e-07, "loss": 0.0, "num_input_tokens_seen": 197125504, "step": 91385 }, { "epoch": 16.77188474949532, "grad_norm": 0.00016527416300959885, "learning_rate": 7.728977191661002e-07, "loss": 0.0, "num_input_tokens_seen": 197137152, "step": 91390 }, { "epoch": 16.77280234905487, "grad_norm": 6.9806570536457e-05, "learning_rate": 7.724700881551572e-07, "loss": 0.0, "num_input_tokens_seen": 197147616, "step": 91395 }, { "epoch": 16.773719948614424, "grad_norm": 7.678212568862364e-05, "learning_rate": 7.720425655738056e-07, "loss": 0.0, "num_input_tokens_seen": 197158176, "step": 91400 }, { "epoch": 16.774637548173978, "grad_norm": 0.0004114538023713976, "learning_rate": 7.716151514330094e-07, "loss": 0.0, "num_input_tokens_seen": 197168864, "step": 91405 }, { "epoch": 16.775555147733527, "grad_norm": 8.070135663729161e-05, "learning_rate": 7.7118784574373e-07, "loss": 0.0, "num_input_tokens_seen": 197179168, "step": 91410 }, { "epoch": 16.77647274729308, "grad_norm": 0.0012821080163121223, "learning_rate": 7.707606485169289e-07, "loss": 0.0, "num_input_tokens_seen": 197190048, "step": 91415 }, { "epoch": 16.777390346852634, "grad_norm": 7.495138561353087e-05, "learning_rate": 7.703335597635631e-07, "loss": 0.0, "num_input_tokens_seen": 197200736, "step": 91420 }, { "epoch": 16.778307946412184, "grad_norm": 6.286654388532043e-05, "learning_rate": 7.699065794945848e-07, "loss": 0.0, "num_input_tokens_seen": 197211840, "step": 91425 }, { "epoch": 16.779225545971737, "grad_norm": 0.0013613664777949452, "learning_rate": 7.694797077209476e-07, "loss": 0.0, "num_input_tokens_seen": 197222688, "step": 91430 }, { "epoch": 16.78014314553129, "grad_norm": 4.9704103730618954e-05, "learning_rate": 7.690529444535993e-07, "loss": 0.0, "num_input_tokens_seen": 197233632, "step": 91435 }, { "epoch": 16.78106074509084, "grad_norm": 0.00014379002095665783, "learning_rate": 7.686262897034858e-07, "loss": 0.0, "num_input_tokens_seen": 197245088, "step": 91440 }, { "epoch": 16.781978344650394, "grad_norm": 0.00027000068803317845, "learning_rate": 7.681997434815497e-07, "loss": 0.0, "num_input_tokens_seen": 197255104, "step": 91445 }, { "epoch": 16.782895944209947, "grad_norm": 0.00024706419208087027, "learning_rate": 7.677733057987308e-07, "loss": 0.0, "num_input_tokens_seen": 197266080, "step": 91450 }, { "epoch": 16.783813543769497, "grad_norm": 9.366596350446343e-05, "learning_rate": 7.67346976665968e-07, "loss": 0.0, "num_input_tokens_seen": 197277280, "step": 91455 }, { "epoch": 16.78473114332905, "grad_norm": 0.0016456634039059281, "learning_rate": 7.66920756094195e-07, "loss": 0.0, "num_input_tokens_seen": 197287648, "step": 91460 }, { "epoch": 16.785648742888604, "grad_norm": 8.185511251213029e-05, "learning_rate": 7.664946440943444e-07, "loss": 0.0, "num_input_tokens_seen": 197297984, "step": 91465 }, { "epoch": 16.786566342448154, "grad_norm": 0.00036522350274026394, "learning_rate": 7.660686406773443e-07, "loss": 0.0, "num_input_tokens_seen": 197308896, "step": 91470 }, { "epoch": 16.787483942007707, "grad_norm": 0.00014810546417720616, "learning_rate": 7.656427458541222e-07, "loss": 0.0, "num_input_tokens_seen": 197320352, "step": 91475 }, { "epoch": 16.78840154156726, "grad_norm": 0.0005524729494936764, "learning_rate": 7.652169596355997e-07, "loss": 0.0, "num_input_tokens_seen": 197330976, "step": 91480 }, { "epoch": 16.78931914112681, "grad_norm": 0.00015630756388418376, "learning_rate": 7.647912820326997e-07, "loss": 0.0, "num_input_tokens_seen": 197341888, "step": 91485 }, { "epoch": 16.790236740686364, "grad_norm": 9.099581075133756e-05, "learning_rate": 7.643657130563392e-07, "loss": 0.0, "num_input_tokens_seen": 197353376, "step": 91490 }, { "epoch": 16.791154340245917, "grad_norm": 0.000505677133332938, "learning_rate": 7.639402527174328e-07, "loss": 0.0, "num_input_tokens_seen": 197363904, "step": 91495 }, { "epoch": 16.792071939805467, "grad_norm": 0.0004936640616506338, "learning_rate": 7.635149010268944e-07, "loss": 0.0, "num_input_tokens_seen": 197375136, "step": 91500 }, { "epoch": 16.79298953936502, "grad_norm": 0.00021842215210199356, "learning_rate": 7.630896579956331e-07, "loss": 0.0, "num_input_tokens_seen": 197386368, "step": 91505 }, { "epoch": 16.793907138924574, "grad_norm": 0.0005400452064350247, "learning_rate": 7.626645236345543e-07, "loss": 0.0, "num_input_tokens_seen": 197396608, "step": 91510 }, { "epoch": 16.794824738484124, "grad_norm": 0.0005332044675014913, "learning_rate": 7.622394979545644e-07, "loss": 0.0, "num_input_tokens_seen": 197406976, "step": 91515 }, { "epoch": 16.795742338043677, "grad_norm": 0.00031126130488701165, "learning_rate": 7.618145809665634e-07, "loss": 0.0, "num_input_tokens_seen": 197417312, "step": 91520 }, { "epoch": 16.79665993760323, "grad_norm": 0.000771863735280931, "learning_rate": 7.613897726814501e-07, "loss": 0.0, "num_input_tokens_seen": 197428160, "step": 91525 }, { "epoch": 16.79757753716278, "grad_norm": 0.00010441836639074609, "learning_rate": 7.609650731101181e-07, "loss": 0.0, "num_input_tokens_seen": 197437824, "step": 91530 }, { "epoch": 16.798495136722334, "grad_norm": 0.004282307345420122, "learning_rate": 7.605404822634637e-07, "loss": 0.0, "num_input_tokens_seen": 197448992, "step": 91535 }, { "epoch": 16.799412736281887, "grad_norm": 0.00011132062354590744, "learning_rate": 7.601160001523749e-07, "loss": 0.0, "num_input_tokens_seen": 197459968, "step": 91540 }, { "epoch": 16.800330335841437, "grad_norm": 8.35280807223171e-05, "learning_rate": 7.596916267877385e-07, "loss": 0.0, "num_input_tokens_seen": 197471936, "step": 91545 }, { "epoch": 16.80124793540099, "grad_norm": 6.75513074384071e-05, "learning_rate": 7.592673621804414e-07, "loss": 0.0, "num_input_tokens_seen": 197482880, "step": 91550 }, { "epoch": 16.802165534960544, "grad_norm": 0.0002000302920350805, "learning_rate": 7.588432063413637e-07, "loss": 0.0, "num_input_tokens_seen": 197493088, "step": 91555 }, { "epoch": 16.803083134520094, "grad_norm": 0.006154308095574379, "learning_rate": 7.584191592813839e-07, "loss": 0.0, "num_input_tokens_seen": 197504096, "step": 91560 }, { "epoch": 16.804000734079647, "grad_norm": 7.999724766705185e-05, "learning_rate": 7.579952210113795e-07, "loss": 0.0822, "num_input_tokens_seen": 197515392, "step": 91565 }, { "epoch": 16.8049183336392, "grad_norm": 0.00025085508241318166, "learning_rate": 7.575713915422228e-07, "loss": 0.0, "num_input_tokens_seen": 197526048, "step": 91570 }, { "epoch": 16.80583593319875, "grad_norm": 0.000104630904388614, "learning_rate": 7.571476708847853e-07, "loss": 0.0, "num_input_tokens_seen": 197537088, "step": 91575 }, { "epoch": 16.806753532758304, "grad_norm": 0.00013639645476359874, "learning_rate": 7.56724059049933e-07, "loss": 0.0, "num_input_tokens_seen": 197547744, "step": 91580 }, { "epoch": 16.807671132317857, "grad_norm": 6.868504715384915e-05, "learning_rate": 7.563005560485332e-07, "loss": 0.0, "num_input_tokens_seen": 197557760, "step": 91585 }, { "epoch": 16.808588731877407, "grad_norm": 1.3140168190002441, "learning_rate": 7.558771618914468e-07, "loss": 0.0002, "num_input_tokens_seen": 197568320, "step": 91590 }, { "epoch": 16.80950633143696, "grad_norm": 0.00010043278598459437, "learning_rate": 7.554538765895325e-07, "loss": 0.0, "num_input_tokens_seen": 197579456, "step": 91595 }, { "epoch": 16.810423930996514, "grad_norm": 0.0005802545929327607, "learning_rate": 7.550307001536489e-07, "loss": 0.0, "num_input_tokens_seen": 197590944, "step": 91600 }, { "epoch": 16.811341530556064, "grad_norm": 0.14809811115264893, "learning_rate": 7.546076325946489e-07, "loss": 0.0, "num_input_tokens_seen": 197601760, "step": 91605 }, { "epoch": 16.812259130115617, "grad_norm": 0.0025894076097756624, "learning_rate": 7.541846739233832e-07, "loss": 0.0, "num_input_tokens_seen": 197612096, "step": 91610 }, { "epoch": 16.81317672967517, "grad_norm": 0.0003477109712548554, "learning_rate": 7.537618241506989e-07, "loss": 0.0, "num_input_tokens_seen": 197622528, "step": 91615 }, { "epoch": 16.81409432923472, "grad_norm": 0.00047046015970408916, "learning_rate": 7.533390832874438e-07, "loss": 0.0, "num_input_tokens_seen": 197633728, "step": 91620 }, { "epoch": 16.815011928794274, "grad_norm": 7.630208710907027e-05, "learning_rate": 7.529164513444598e-07, "loss": 0.0, "num_input_tokens_seen": 197644672, "step": 91625 }, { "epoch": 16.815929528353827, "grad_norm": 0.0001254268572665751, "learning_rate": 7.524939283325849e-07, "loss": 0.0, "num_input_tokens_seen": 197654432, "step": 91630 }, { "epoch": 16.816847127913377, "grad_norm": 5.454113124869764e-05, "learning_rate": 7.520715142626595e-07, "loss": 0.0, "num_input_tokens_seen": 197665152, "step": 91635 }, { "epoch": 16.81776472747293, "grad_norm": 0.00010974722681567073, "learning_rate": 7.516492091455157e-07, "loss": 0.0, "num_input_tokens_seen": 197676896, "step": 91640 }, { "epoch": 16.818682327032484, "grad_norm": 0.00038343321648426354, "learning_rate": 7.51227012991984e-07, "loss": 0.0, "num_input_tokens_seen": 197686880, "step": 91645 }, { "epoch": 16.819599926592034, "grad_norm": 0.0015363974962383509, "learning_rate": 7.508049258128958e-07, "loss": 0.0, "num_input_tokens_seen": 197698560, "step": 91650 }, { "epoch": 16.820517526151587, "grad_norm": 9.13102412596345e-05, "learning_rate": 7.503829476190754e-07, "loss": 0.0, "num_input_tokens_seen": 197709120, "step": 91655 }, { "epoch": 16.82143512571114, "grad_norm": 0.0002598285209387541, "learning_rate": 7.499610784213468e-07, "loss": 0.0, "num_input_tokens_seen": 197721152, "step": 91660 }, { "epoch": 16.82235272527069, "grad_norm": 0.03344467282295227, "learning_rate": 7.495393182305288e-07, "loss": 0.0, "num_input_tokens_seen": 197731840, "step": 91665 }, { "epoch": 16.823270324830244, "grad_norm": 0.0004894431331194937, "learning_rate": 7.491176670574396e-07, "loss": 0.0, "num_input_tokens_seen": 197742784, "step": 91670 }, { "epoch": 16.824187924389797, "grad_norm": 5.934228465775959e-05, "learning_rate": 7.486961249128932e-07, "loss": 0.0, "num_input_tokens_seen": 197753664, "step": 91675 }, { "epoch": 16.825105523949347, "grad_norm": 7.171764445956796e-05, "learning_rate": 7.482746918077033e-07, "loss": 0.0, "num_input_tokens_seen": 197764128, "step": 91680 }, { "epoch": 16.8260231235089, "grad_norm": 5.787689224234782e-05, "learning_rate": 7.478533677526783e-07, "loss": 0.0, "num_input_tokens_seen": 197774336, "step": 91685 }, { "epoch": 16.826940723068454, "grad_norm": 6.092052717576735e-05, "learning_rate": 7.474321527586237e-07, "loss": 0.0, "num_input_tokens_seen": 197784416, "step": 91690 }, { "epoch": 16.827858322628003, "grad_norm": 0.00024236351600848138, "learning_rate": 7.470110468363428e-07, "loss": 0.0, "num_input_tokens_seen": 197795936, "step": 91695 }, { "epoch": 16.828775922187557, "grad_norm": 8.954329678090289e-05, "learning_rate": 7.465900499966378e-07, "loss": 0.0, "num_input_tokens_seen": 197806080, "step": 91700 }, { "epoch": 16.82969352174711, "grad_norm": 6.820010457886383e-05, "learning_rate": 7.461691622503059e-07, "loss": 0.0, "num_input_tokens_seen": 197816896, "step": 91705 }, { "epoch": 16.83061112130666, "grad_norm": 8.951972267823294e-05, "learning_rate": 7.45748383608142e-07, "loss": 0.0, "num_input_tokens_seen": 197829056, "step": 91710 }, { "epoch": 16.831528720866213, "grad_norm": 0.00015449254715349525, "learning_rate": 7.453277140809378e-07, "loss": 0.0, "num_input_tokens_seen": 197838592, "step": 91715 }, { "epoch": 16.832446320425767, "grad_norm": 6.619398482143879e-05, "learning_rate": 7.449071536794844e-07, "loss": 0.0, "num_input_tokens_seen": 197849920, "step": 91720 }, { "epoch": 16.833363919985317, "grad_norm": 7.745433686068282e-05, "learning_rate": 7.44486702414568e-07, "loss": 0.0, "num_input_tokens_seen": 197861280, "step": 91725 }, { "epoch": 16.83428151954487, "grad_norm": 0.00012617265747394413, "learning_rate": 7.440663602969711e-07, "loss": 0.0, "num_input_tokens_seen": 197872704, "step": 91730 }, { "epoch": 16.835199119104423, "grad_norm": 6.489783118013293e-05, "learning_rate": 7.436461273374768e-07, "loss": 0.0, "num_input_tokens_seen": 197884256, "step": 91735 }, { "epoch": 16.836116718663973, "grad_norm": 0.0001077794877346605, "learning_rate": 7.432260035468625e-07, "loss": 0.0, "num_input_tokens_seen": 197894272, "step": 91740 }, { "epoch": 16.837034318223527, "grad_norm": 4.274371167412028e-05, "learning_rate": 7.428059889359029e-07, "loss": 0.0, "num_input_tokens_seen": 197904224, "step": 91745 }, { "epoch": 16.83795191778308, "grad_norm": 0.00010499334166524932, "learning_rate": 7.423860835153729e-07, "loss": 0.0, "num_input_tokens_seen": 197914560, "step": 91750 }, { "epoch": 16.83886951734263, "grad_norm": 0.0007783431210555136, "learning_rate": 7.419662872960409e-07, "loss": 0.0, "num_input_tokens_seen": 197925856, "step": 91755 }, { "epoch": 16.839787116902183, "grad_norm": 0.00017295745783485472, "learning_rate": 7.415466002886745e-07, "loss": 0.0, "num_input_tokens_seen": 197936288, "step": 91760 }, { "epoch": 16.840704716461737, "grad_norm": 6.838935951236635e-05, "learning_rate": 7.411270225040368e-07, "loss": 0.0, "num_input_tokens_seen": 197947264, "step": 91765 }, { "epoch": 16.841622316021287, "grad_norm": 0.0004660508711822331, "learning_rate": 7.407075539528907e-07, "loss": 0.0, "num_input_tokens_seen": 197957280, "step": 91770 }, { "epoch": 16.84253991558084, "grad_norm": 6.353859498631209e-05, "learning_rate": 7.402881946459956e-07, "loss": 0.0, "num_input_tokens_seen": 197969120, "step": 91775 }, { "epoch": 16.843457515140393, "grad_norm": 0.0003102310874965042, "learning_rate": 7.398689445941043e-07, "loss": 0.0, "num_input_tokens_seen": 197979200, "step": 91780 }, { "epoch": 16.844375114699943, "grad_norm": 0.00020790804410353303, "learning_rate": 7.394498038079734e-07, "loss": 0.0, "num_input_tokens_seen": 197989472, "step": 91785 }, { "epoch": 16.845292714259497, "grad_norm": 0.0004099187208339572, "learning_rate": 7.39030772298352e-07, "loss": 0.0, "num_input_tokens_seen": 198000640, "step": 91790 }, { "epoch": 16.84621031381905, "grad_norm": 8.346899994648993e-05, "learning_rate": 7.38611850075987e-07, "loss": 0.0, "num_input_tokens_seen": 198010432, "step": 91795 }, { "epoch": 16.8471279133786, "grad_norm": 9.691779996501282e-05, "learning_rate": 7.381930371516227e-07, "loss": 0.0, "num_input_tokens_seen": 198021216, "step": 91800 }, { "epoch": 16.848045512938153, "grad_norm": 0.000419330463046208, "learning_rate": 7.377743335360027e-07, "loss": 0.0, "num_input_tokens_seen": 198032000, "step": 91805 }, { "epoch": 16.848963112497707, "grad_norm": 8.539290865883231e-05, "learning_rate": 7.373557392398656e-07, "loss": 0.0, "num_input_tokens_seen": 198041600, "step": 91810 }, { "epoch": 16.849880712057256, "grad_norm": 9.486734779784456e-05, "learning_rate": 7.369372542739456e-07, "loss": 0.0, "num_input_tokens_seen": 198052800, "step": 91815 }, { "epoch": 16.85079831161681, "grad_norm": 0.00011949992040172219, "learning_rate": 7.365188786489796e-07, "loss": 0.0, "num_input_tokens_seen": 198062752, "step": 91820 }, { "epoch": 16.851715911176363, "grad_norm": 0.00024417912936769426, "learning_rate": 7.361006123756964e-07, "loss": 0.0, "num_input_tokens_seen": 198073376, "step": 91825 }, { "epoch": 16.852633510735913, "grad_norm": 5.5845586757641286e-05, "learning_rate": 7.356824554648223e-07, "loss": 0.0, "num_input_tokens_seen": 198085408, "step": 91830 }, { "epoch": 16.853551110295466, "grad_norm": 0.0001350110542261973, "learning_rate": 7.35264407927086e-07, "loss": 0.0, "num_input_tokens_seen": 198096736, "step": 91835 }, { "epoch": 16.85446870985502, "grad_norm": 0.0001339131995337084, "learning_rate": 7.348464697732077e-07, "loss": 0.0, "num_input_tokens_seen": 198107136, "step": 91840 }, { "epoch": 16.85538630941457, "grad_norm": 7.693399675190449e-05, "learning_rate": 7.344286410139067e-07, "loss": 0.0, "num_input_tokens_seen": 198116896, "step": 91845 }, { "epoch": 16.856303908974123, "grad_norm": 9.875604882836342e-05, "learning_rate": 7.340109216598995e-07, "loss": 0.0, "num_input_tokens_seen": 198127232, "step": 91850 }, { "epoch": 16.857221508533677, "grad_norm": 7.18059454811737e-05, "learning_rate": 7.335933117219013e-07, "loss": 0.0, "num_input_tokens_seen": 198138336, "step": 91855 }, { "epoch": 16.858139108093226, "grad_norm": 0.0032037298660725355, "learning_rate": 7.331758112106219e-07, "loss": 0.0, "num_input_tokens_seen": 198150848, "step": 91860 }, { "epoch": 16.85905670765278, "grad_norm": 7.707360782660544e-05, "learning_rate": 7.327584201367705e-07, "loss": 0.0, "num_input_tokens_seen": 198161184, "step": 91865 }, { "epoch": 16.859974307212333, "grad_norm": 0.0004145744605921209, "learning_rate": 7.323411385110507e-07, "loss": 0.0, "num_input_tokens_seen": 198171968, "step": 91870 }, { "epoch": 16.860891906771883, "grad_norm": 7.649339386262e-05, "learning_rate": 7.319239663441674e-07, "loss": 0.0, "num_input_tokens_seen": 198183552, "step": 91875 }, { "epoch": 16.861809506331436, "grad_norm": 0.0001897549955174327, "learning_rate": 7.315069036468197e-07, "loss": 0.0, "num_input_tokens_seen": 198194176, "step": 91880 }, { "epoch": 16.86272710589099, "grad_norm": 8.10078126960434e-05, "learning_rate": 7.310899504297042e-07, "loss": 0.0, "num_input_tokens_seen": 198205088, "step": 91885 }, { "epoch": 16.86364470545054, "grad_norm": 7.947091216919944e-05, "learning_rate": 7.306731067035155e-07, "loss": 0.0, "num_input_tokens_seen": 198215712, "step": 91890 }, { "epoch": 16.864562305010093, "grad_norm": 4.692985748988576e-05, "learning_rate": 7.302563724789435e-07, "loss": 0.0002, "num_input_tokens_seen": 198226592, "step": 91895 }, { "epoch": 16.865479904569646, "grad_norm": 0.00533836567774415, "learning_rate": 7.298397477666791e-07, "loss": 0.0, "num_input_tokens_seen": 198238112, "step": 91900 }, { "epoch": 16.866397504129196, "grad_norm": 0.00012179031182313338, "learning_rate": 7.29423232577407e-07, "loss": 0.0, "num_input_tokens_seen": 198248864, "step": 91905 }, { "epoch": 16.86731510368875, "grad_norm": 0.00017717269656714052, "learning_rate": 7.290068269218103e-07, "loss": 0.0, "num_input_tokens_seen": 198260704, "step": 91910 }, { "epoch": 16.868232703248303, "grad_norm": 0.00014624919276684523, "learning_rate": 7.285905308105678e-07, "loss": 0.0, "num_input_tokens_seen": 198272224, "step": 91915 }, { "epoch": 16.869150302807853, "grad_norm": 0.00011039232049370185, "learning_rate": 7.281743442543593e-07, "loss": 0.0001, "num_input_tokens_seen": 198282080, "step": 91920 }, { "epoch": 16.870067902367406, "grad_norm": 7.672821811866015e-05, "learning_rate": 7.277582672638583e-07, "loss": 0.0, "num_input_tokens_seen": 198293088, "step": 91925 }, { "epoch": 16.87098550192696, "grad_norm": 8.515970694134012e-05, "learning_rate": 7.273422998497365e-07, "loss": 0.0, "num_input_tokens_seen": 198304192, "step": 91930 }, { "epoch": 16.87190310148651, "grad_norm": 0.00023346322996076196, "learning_rate": 7.269264420226613e-07, "loss": 0.0, "num_input_tokens_seen": 198313952, "step": 91935 }, { "epoch": 16.872820701046063, "grad_norm": 6.855617539258674e-05, "learning_rate": 7.265106937933009e-07, "loss": 0.0, "num_input_tokens_seen": 198325824, "step": 91940 }, { "epoch": 16.873738300605616, "grad_norm": 5.611087181023322e-05, "learning_rate": 7.260950551723184e-07, "loss": 0.0, "num_input_tokens_seen": 198336928, "step": 91945 }, { "epoch": 16.874655900165166, "grad_norm": 9.111547842621803e-05, "learning_rate": 7.256795261703725e-07, "loss": 0.0, "num_input_tokens_seen": 198347776, "step": 91950 }, { "epoch": 16.87557349972472, "grad_norm": 7.034841109998524e-05, "learning_rate": 7.252641067981237e-07, "loss": 0.0, "num_input_tokens_seen": 198357440, "step": 91955 }, { "epoch": 16.876491099284273, "grad_norm": 9.71169865806587e-05, "learning_rate": 7.248487970662249e-07, "loss": 0.0, "num_input_tokens_seen": 198368736, "step": 91960 }, { "epoch": 16.877408698843823, "grad_norm": 0.00017266445502173156, "learning_rate": 7.244335969853272e-07, "loss": 0.0, "num_input_tokens_seen": 198379744, "step": 91965 }, { "epoch": 16.878326298403376, "grad_norm": 0.00014251048560254276, "learning_rate": 7.240185065660827e-07, "loss": 0.0, "num_input_tokens_seen": 198390560, "step": 91970 }, { "epoch": 16.87924389796293, "grad_norm": 7.30944739188999e-05, "learning_rate": 7.236035258191365e-07, "loss": 0.0, "num_input_tokens_seen": 198400864, "step": 91975 }, { "epoch": 16.88016149752248, "grad_norm": 8.079737017396837e-05, "learning_rate": 7.231886547551314e-07, "loss": 0.0, "num_input_tokens_seen": 198411200, "step": 91980 }, { "epoch": 16.881079097082033, "grad_norm": 0.0003108017554040998, "learning_rate": 7.227738933847083e-07, "loss": 0.0, "num_input_tokens_seen": 198422720, "step": 91985 }, { "epoch": 16.881996696641586, "grad_norm": 0.0002247098891530186, "learning_rate": 7.223592417185066e-07, "loss": 0.0, "num_input_tokens_seen": 198433696, "step": 91990 }, { "epoch": 16.882914296201136, "grad_norm": 7.302980520762503e-05, "learning_rate": 7.219446997671609e-07, "loss": 0.0, "num_input_tokens_seen": 198445184, "step": 91995 }, { "epoch": 16.88383189576069, "grad_norm": 0.001175791840068996, "learning_rate": 7.215302675413022e-07, "loss": 0.0, "num_input_tokens_seen": 198455168, "step": 92000 }, { "epoch": 16.884749495320243, "grad_norm": 7.380406896118075e-05, "learning_rate": 7.211159450515621e-07, "loss": 0.0, "num_input_tokens_seen": 198466688, "step": 92005 }, { "epoch": 16.885667094879793, "grad_norm": 0.0005868307780474424, "learning_rate": 7.207017323085658e-07, "loss": 0.0, "num_input_tokens_seen": 198477344, "step": 92010 }, { "epoch": 16.886584694439346, "grad_norm": 8.34841703181155e-05, "learning_rate": 7.202876293229372e-07, "loss": 0.0, "num_input_tokens_seen": 198487968, "step": 92015 }, { "epoch": 16.8875022939989, "grad_norm": 7.071831350913271e-05, "learning_rate": 7.198736361052989e-07, "loss": 0.0, "num_input_tokens_seen": 198499360, "step": 92020 }, { "epoch": 16.88841989355845, "grad_norm": 0.0003971236292272806, "learning_rate": 7.194597526662683e-07, "loss": 0.0, "num_input_tokens_seen": 198509600, "step": 92025 }, { "epoch": 16.889337493118003, "grad_norm": 0.0001559141674079001, "learning_rate": 7.190459790164605e-07, "loss": 0.0, "num_input_tokens_seen": 198518560, "step": 92030 }, { "epoch": 16.890255092677556, "grad_norm": 9.532841795589775e-05, "learning_rate": 7.186323151664881e-07, "loss": 0.0, "num_input_tokens_seen": 198529728, "step": 92035 }, { "epoch": 16.891172692237106, "grad_norm": 0.0003053235705010593, "learning_rate": 7.18218761126962e-07, "loss": 0.0, "num_input_tokens_seen": 198540288, "step": 92040 }, { "epoch": 16.89209029179666, "grad_norm": 0.00011740908666979522, "learning_rate": 7.178053169084881e-07, "loss": 0.0, "num_input_tokens_seen": 198551680, "step": 92045 }, { "epoch": 16.893007891356213, "grad_norm": 6.822915020165965e-05, "learning_rate": 7.173919825216702e-07, "loss": 0.0, "num_input_tokens_seen": 198562528, "step": 92050 }, { "epoch": 16.893925490915763, "grad_norm": 0.0006910533411428332, "learning_rate": 7.16978757977112e-07, "loss": 0.0, "num_input_tokens_seen": 198573376, "step": 92055 }, { "epoch": 16.894843090475316, "grad_norm": 0.0002626344212330878, "learning_rate": 7.165656432854101e-07, "loss": 0.0, "num_input_tokens_seen": 198584000, "step": 92060 }, { "epoch": 16.89576069003487, "grad_norm": 5.641885582008399e-05, "learning_rate": 7.16152638457161e-07, "loss": 0.0, "num_input_tokens_seen": 198595008, "step": 92065 }, { "epoch": 16.89667828959442, "grad_norm": 9.769118332769722e-05, "learning_rate": 7.157397435029561e-07, "loss": 0.0, "num_input_tokens_seen": 198605632, "step": 92070 }, { "epoch": 16.897595889153973, "grad_norm": 9.108472295338288e-05, "learning_rate": 7.153269584333877e-07, "loss": 0.0, "num_input_tokens_seen": 198615840, "step": 92075 }, { "epoch": 16.898513488713526, "grad_norm": 6.201706128194928e-05, "learning_rate": 7.149142832590428e-07, "loss": 0.0, "num_input_tokens_seen": 198627040, "step": 92080 }, { "epoch": 16.899431088273076, "grad_norm": 0.00018495955737307668, "learning_rate": 7.145017179905045e-07, "loss": 0.0, "num_input_tokens_seen": 198638688, "step": 92085 }, { "epoch": 16.90034868783263, "grad_norm": 4.7401266783708706e-05, "learning_rate": 7.140892626383544e-07, "loss": 0.0, "num_input_tokens_seen": 198649504, "step": 92090 }, { "epoch": 16.901266287392183, "grad_norm": 8.46856928546913e-05, "learning_rate": 7.136769172131736e-07, "loss": 0.0, "num_input_tokens_seen": 198659200, "step": 92095 }, { "epoch": 16.902183886951732, "grad_norm": 0.00012065890769008547, "learning_rate": 7.132646817255362e-07, "loss": 0.0, "num_input_tokens_seen": 198669568, "step": 92100 }, { "epoch": 16.903101486511286, "grad_norm": 5.451369725051336e-05, "learning_rate": 7.128525561860161e-07, "loss": 0.0, "num_input_tokens_seen": 198680544, "step": 92105 }, { "epoch": 16.90401908607084, "grad_norm": 0.00010176259820582345, "learning_rate": 7.124405406051837e-07, "loss": 0.0, "num_input_tokens_seen": 198690848, "step": 92110 }, { "epoch": 16.90493668563039, "grad_norm": 3.9354199543595314e-05, "learning_rate": 7.12028634993605e-07, "loss": 0.0, "num_input_tokens_seen": 198701280, "step": 92115 }, { "epoch": 16.905854285189942, "grad_norm": 0.0010110285365954041, "learning_rate": 7.116168393618473e-07, "loss": 0.0, "num_input_tokens_seen": 198712224, "step": 92120 }, { "epoch": 16.906771884749496, "grad_norm": 8.068173337960616e-05, "learning_rate": 7.112051537204706e-07, "loss": 0.0, "num_input_tokens_seen": 198723232, "step": 92125 }, { "epoch": 16.907689484309046, "grad_norm": 0.0024462235160171986, "learning_rate": 7.107935780800351e-07, "loss": 0.0, "num_input_tokens_seen": 198733664, "step": 92130 }, { "epoch": 16.9086070838686, "grad_norm": 0.0006161404307931662, "learning_rate": 7.103821124510957e-07, "loss": 0.0, "num_input_tokens_seen": 198744384, "step": 92135 }, { "epoch": 16.909524683428153, "grad_norm": 7.750685472274199e-05, "learning_rate": 7.099707568442083e-07, "loss": 0.0, "num_input_tokens_seen": 198755040, "step": 92140 }, { "epoch": 16.910442282987702, "grad_norm": 0.0001912096340674907, "learning_rate": 7.095595112699211e-07, "loss": 0.0, "num_input_tokens_seen": 198765312, "step": 92145 }, { "epoch": 16.911359882547256, "grad_norm": 8.781892393017188e-05, "learning_rate": 7.091483757387824e-07, "loss": 0.0, "num_input_tokens_seen": 198776864, "step": 92150 }, { "epoch": 16.91227748210681, "grad_norm": 5.75394878978841e-05, "learning_rate": 7.087373502613387e-07, "loss": 0.0, "num_input_tokens_seen": 198787456, "step": 92155 }, { "epoch": 16.91319508166636, "grad_norm": 0.009172740392386913, "learning_rate": 7.083264348481312e-07, "loss": 0.0, "num_input_tokens_seen": 198798272, "step": 92160 }, { "epoch": 16.914112681225912, "grad_norm": 9.312059410149232e-05, "learning_rate": 7.079156295096983e-07, "loss": 0.0, "num_input_tokens_seen": 198808672, "step": 92165 }, { "epoch": 16.915030280785466, "grad_norm": 0.00013812909310217947, "learning_rate": 7.075049342565771e-07, "loss": 0.0, "num_input_tokens_seen": 198819360, "step": 92170 }, { "epoch": 16.915947880345016, "grad_norm": 0.046595461666584015, "learning_rate": 7.070943490993027e-07, "loss": 0.0, "num_input_tokens_seen": 198829856, "step": 92175 }, { "epoch": 16.91686547990457, "grad_norm": 8.629658259451389e-05, "learning_rate": 7.066838740484044e-07, "loss": 0.0, "num_input_tokens_seen": 198840992, "step": 92180 }, { "epoch": 16.917783079464122, "grad_norm": 8.036070357775316e-05, "learning_rate": 7.062735091144102e-07, "loss": 0.0, "num_input_tokens_seen": 198850496, "step": 92185 }, { "epoch": 16.918700679023672, "grad_norm": 0.00013207945448812097, "learning_rate": 7.058632543078464e-07, "loss": 0.0, "num_input_tokens_seen": 198861696, "step": 92190 }, { "epoch": 16.919618278583226, "grad_norm": 0.00012202072684885934, "learning_rate": 7.054531096392347e-07, "loss": 0.0, "num_input_tokens_seen": 198872512, "step": 92195 }, { "epoch": 16.92053587814278, "grad_norm": 0.00011085028381785378, "learning_rate": 7.05043075119094e-07, "loss": 0.0, "num_input_tokens_seen": 198882624, "step": 92200 }, { "epoch": 16.92145347770233, "grad_norm": 7.318056304939091e-05, "learning_rate": 7.046331507579429e-07, "loss": 0.0, "num_input_tokens_seen": 198894016, "step": 92205 }, { "epoch": 16.922371077261882, "grad_norm": 0.000132736808154732, "learning_rate": 7.042233365662943e-07, "loss": 0.0, "num_input_tokens_seen": 198904896, "step": 92210 }, { "epoch": 16.923288676821436, "grad_norm": 0.00013018205936532468, "learning_rate": 7.038136325546597e-07, "loss": 0.0, "num_input_tokens_seen": 198914496, "step": 92215 }, { "epoch": 16.924206276380986, "grad_norm": 4.4239255657885224e-05, "learning_rate": 7.03404038733545e-07, "loss": 0.0, "num_input_tokens_seen": 198924448, "step": 92220 }, { "epoch": 16.92512387594054, "grad_norm": 0.00015889195492491126, "learning_rate": 7.029945551134592e-07, "loss": 0.0, "num_input_tokens_seen": 198934912, "step": 92225 }, { "epoch": 16.926041475500092, "grad_norm": 9.6218689577654e-05, "learning_rate": 7.025851817049028e-07, "loss": 0.0, "num_input_tokens_seen": 198945408, "step": 92230 }, { "epoch": 16.926959075059642, "grad_norm": 0.00012782897101715207, "learning_rate": 7.021759185183757e-07, "loss": 0.0, "num_input_tokens_seen": 198957056, "step": 92235 }, { "epoch": 16.927876674619196, "grad_norm": 6.565394141944125e-05, "learning_rate": 7.017667655643762e-07, "loss": 0.0, "num_input_tokens_seen": 198966752, "step": 92240 }, { "epoch": 16.92879427417875, "grad_norm": 0.0002523023867979646, "learning_rate": 7.013577228533975e-07, "loss": 0.0, "num_input_tokens_seen": 198977472, "step": 92245 }, { "epoch": 16.9297118737383, "grad_norm": 0.003046313300728798, "learning_rate": 7.009487903959305e-07, "loss": 0.0, "num_input_tokens_seen": 198988096, "step": 92250 }, { "epoch": 16.930629473297852, "grad_norm": 7.543236279161647e-05, "learning_rate": 7.005399682024633e-07, "loss": 0.0, "num_input_tokens_seen": 198999648, "step": 92255 }, { "epoch": 16.931547072857406, "grad_norm": 0.00026330642867833376, "learning_rate": 7.001312562834834e-07, "loss": 0.0, "num_input_tokens_seen": 199008896, "step": 92260 }, { "epoch": 16.932464672416955, "grad_norm": 0.00014498899690806866, "learning_rate": 6.997226546494723e-07, "loss": 0.0, "num_input_tokens_seen": 199020384, "step": 92265 }, { "epoch": 16.93338227197651, "grad_norm": 9.958700684364885e-05, "learning_rate": 6.993141633109096e-07, "loss": 0.0, "num_input_tokens_seen": 199030784, "step": 92270 }, { "epoch": 16.934299871536062, "grad_norm": 6.326845323201269e-05, "learning_rate": 6.989057822782741e-07, "loss": 0.0, "num_input_tokens_seen": 199041440, "step": 92275 }, { "epoch": 16.935217471095616, "grad_norm": 7.627923332620412e-05, "learning_rate": 6.984975115620396e-07, "loss": 0.0, "num_input_tokens_seen": 199052768, "step": 92280 }, { "epoch": 16.936135070655165, "grad_norm": 8.317539322888479e-05, "learning_rate": 6.980893511726756e-07, "loss": 0.0, "num_input_tokens_seen": 199063040, "step": 92285 }, { "epoch": 16.93705267021472, "grad_norm": 0.028089415282011032, "learning_rate": 6.976813011206534e-07, "loss": 0.0, "num_input_tokens_seen": 199074176, "step": 92290 }, { "epoch": 16.937970269774272, "grad_norm": 4.913696102448739e-05, "learning_rate": 6.972733614164378e-07, "loss": 0.0, "num_input_tokens_seen": 199086048, "step": 92295 }, { "epoch": 16.938887869333822, "grad_norm": 0.00024215404118876904, "learning_rate": 6.968655320704926e-07, "loss": 0.0, "num_input_tokens_seen": 199095744, "step": 92300 }, { "epoch": 16.939805468893375, "grad_norm": 0.00031676687649451196, "learning_rate": 6.964578130932764e-07, "loss": 0.0, "num_input_tokens_seen": 199106784, "step": 92305 }, { "epoch": 16.94072306845293, "grad_norm": 0.0004320633306633681, "learning_rate": 6.960502044952466e-07, "loss": 0.0, "num_input_tokens_seen": 199117632, "step": 92310 }, { "epoch": 16.94164066801248, "grad_norm": 0.0001719146966934204, "learning_rate": 6.956427062868599e-07, "loss": 0.0, "num_input_tokens_seen": 199128160, "step": 92315 }, { "epoch": 16.942558267572032, "grad_norm": 5.501198029378429e-05, "learning_rate": 6.952353184785666e-07, "loss": 0.0, "num_input_tokens_seen": 199139616, "step": 92320 }, { "epoch": 16.943475867131585, "grad_norm": 0.0005506631569005549, "learning_rate": 6.948280410808156e-07, "loss": 0.0, "num_input_tokens_seen": 199149376, "step": 92325 }, { "epoch": 16.944393466691135, "grad_norm": 8.411804446950555e-05, "learning_rate": 6.944208741040526e-07, "loss": 0.0, "num_input_tokens_seen": 199158912, "step": 92330 }, { "epoch": 16.94531106625069, "grad_norm": 0.0001719333668006584, "learning_rate": 6.940138175587202e-07, "loss": 0.0, "num_input_tokens_seen": 199171008, "step": 92335 }, { "epoch": 16.946228665810242, "grad_norm": 7.721487054368481e-05, "learning_rate": 6.936068714552607e-07, "loss": 0.0, "num_input_tokens_seen": 199182848, "step": 92340 }, { "epoch": 16.947146265369792, "grad_norm": 480.87005615234375, "learning_rate": 6.932000358041107e-07, "loss": 0.2063, "num_input_tokens_seen": 199193504, "step": 92345 }, { "epoch": 16.948063864929345, "grad_norm": 0.00020064412092324346, "learning_rate": 6.927933106157053e-07, "loss": 0.0, "num_input_tokens_seen": 199205024, "step": 92350 }, { "epoch": 16.9489814644889, "grad_norm": 6.131251575425267e-05, "learning_rate": 6.923866959004743e-07, "loss": 0.0, "num_input_tokens_seen": 199215552, "step": 92355 }, { "epoch": 16.94989906404845, "grad_norm": 0.00010195406503044069, "learning_rate": 6.919801916688495e-07, "loss": 0.0, "num_input_tokens_seen": 199226272, "step": 92360 }, { "epoch": 16.950816663608002, "grad_norm": 0.0001459157938370481, "learning_rate": 6.915737979312559e-07, "loss": 0.0, "num_input_tokens_seen": 199237088, "step": 92365 }, { "epoch": 16.951734263167555, "grad_norm": 3.93738409911748e-05, "learning_rate": 6.911675146981161e-07, "loss": 0.0, "num_input_tokens_seen": 199246432, "step": 92370 }, { "epoch": 16.952651862727105, "grad_norm": 7.120222289813682e-05, "learning_rate": 6.907613419798526e-07, "loss": 0.0, "num_input_tokens_seen": 199258176, "step": 92375 }, { "epoch": 16.95356946228666, "grad_norm": 0.0013878792524337769, "learning_rate": 6.903552797868817e-07, "loss": 0.0, "num_input_tokens_seen": 199267872, "step": 92380 }, { "epoch": 16.954487061846212, "grad_norm": 0.00011443189578130841, "learning_rate": 6.899493281296182e-07, "loss": 0.0, "num_input_tokens_seen": 199278592, "step": 92385 }, { "epoch": 16.955404661405762, "grad_norm": 6.250484148040414e-05, "learning_rate": 6.895434870184742e-07, "loss": 0.0, "num_input_tokens_seen": 199289120, "step": 92390 }, { "epoch": 16.956322260965315, "grad_norm": 6.993340502958745e-05, "learning_rate": 6.891377564638596e-07, "loss": 0.0, "num_input_tokens_seen": 199299648, "step": 92395 }, { "epoch": 16.95723986052487, "grad_norm": 7.772183744236827e-05, "learning_rate": 6.887321364761806e-07, "loss": 0.0, "num_input_tokens_seen": 199310272, "step": 92400 }, { "epoch": 16.95815746008442, "grad_norm": 0.059379395097494125, "learning_rate": 6.883266270658395e-07, "loss": 0.0, "num_input_tokens_seen": 199321824, "step": 92405 }, { "epoch": 16.959075059643972, "grad_norm": 5.217201396590099e-05, "learning_rate": 6.879212282432385e-07, "loss": 0.0, "num_input_tokens_seen": 199332192, "step": 92410 }, { "epoch": 16.959992659203525, "grad_norm": 0.0002611865056678653, "learning_rate": 6.875159400187753e-07, "loss": 0.0, "num_input_tokens_seen": 199341984, "step": 92415 }, { "epoch": 16.960910258763075, "grad_norm": 5.162155139259994e-05, "learning_rate": 6.871107624028434e-07, "loss": 0.0, "num_input_tokens_seen": 199352672, "step": 92420 }, { "epoch": 16.96182785832263, "grad_norm": 9.270283771911636e-05, "learning_rate": 6.867056954058371e-07, "loss": 0.0, "num_input_tokens_seen": 199363872, "step": 92425 }, { "epoch": 16.962745457882182, "grad_norm": 0.0004262455040588975, "learning_rate": 6.863007390381449e-07, "loss": 0.0, "num_input_tokens_seen": 199373280, "step": 92430 }, { "epoch": 16.96366305744173, "grad_norm": 0.00014132435899227858, "learning_rate": 6.858958933101529e-07, "loss": 0.0, "num_input_tokens_seen": 199382784, "step": 92435 }, { "epoch": 16.964580657001285, "grad_norm": 5.738630352425389e-05, "learning_rate": 6.854911582322438e-07, "loss": 0.0, "num_input_tokens_seen": 199394880, "step": 92440 }, { "epoch": 16.96549825656084, "grad_norm": 8.987126057036221e-05, "learning_rate": 6.850865338148005e-07, "loss": 0.0, "num_input_tokens_seen": 199404672, "step": 92445 }, { "epoch": 16.96641585612039, "grad_norm": 7.492922304663807e-05, "learning_rate": 6.846820200682003e-07, "loss": 0.0, "num_input_tokens_seen": 199414688, "step": 92450 }, { "epoch": 16.967333455679942, "grad_norm": 5.9640209656208754e-05, "learning_rate": 6.84277617002817e-07, "loss": 0.0005, "num_input_tokens_seen": 199424608, "step": 92455 }, { "epoch": 16.968251055239495, "grad_norm": 5.441711255116388e-05, "learning_rate": 6.838733246290258e-07, "loss": 0.0, "num_input_tokens_seen": 199435264, "step": 92460 }, { "epoch": 16.969168654799045, "grad_norm": 4.236367385601625e-05, "learning_rate": 6.834691429571938e-07, "loss": 0.2143, "num_input_tokens_seen": 199446240, "step": 92465 }, { "epoch": 16.9700862543586, "grad_norm": 7.51228944864124e-05, "learning_rate": 6.830650719976872e-07, "loss": 0.0, "num_input_tokens_seen": 199457024, "step": 92470 }, { "epoch": 16.971003853918152, "grad_norm": 6.784640572732314e-05, "learning_rate": 6.826611117608722e-07, "loss": 0.0, "num_input_tokens_seen": 199468832, "step": 92475 }, { "epoch": 16.9719214534777, "grad_norm": 0.0004764558980241418, "learning_rate": 6.822572622571083e-07, "loss": 0.0, "num_input_tokens_seen": 199479872, "step": 92480 }, { "epoch": 16.972839053037255, "grad_norm": 0.0006467005587182939, "learning_rate": 6.818535234967532e-07, "loss": 0.0, "num_input_tokens_seen": 199490848, "step": 92485 }, { "epoch": 16.97375665259681, "grad_norm": 0.000148349572555162, "learning_rate": 6.814498954901622e-07, "loss": 0.0, "num_input_tokens_seen": 199502016, "step": 92490 }, { "epoch": 16.97467425215636, "grad_norm": 0.00015388858446385711, "learning_rate": 6.810463782476895e-07, "loss": 0.0, "num_input_tokens_seen": 199510848, "step": 92495 }, { "epoch": 16.97559185171591, "grad_norm": 9.269585279980674e-05, "learning_rate": 6.80642971779683e-07, "loss": 0.0588, "num_input_tokens_seen": 199521696, "step": 92500 }, { "epoch": 16.976509451275465, "grad_norm": 0.0004527397104538977, "learning_rate": 6.802396760964891e-07, "loss": 0.0, "num_input_tokens_seen": 199531776, "step": 92505 }, { "epoch": 16.977427050835015, "grad_norm": 0.00038907676935195923, "learning_rate": 6.798364912084532e-07, "loss": 0.0, "num_input_tokens_seen": 199542208, "step": 92510 }, { "epoch": 16.97834465039457, "grad_norm": 0.0001740523148328066, "learning_rate": 6.794334171259159e-07, "loss": 0.0, "num_input_tokens_seen": 199552288, "step": 92515 }, { "epoch": 16.97926224995412, "grad_norm": 0.0009735549101606011, "learning_rate": 6.790304538592152e-07, "loss": 0.0, "num_input_tokens_seen": 199562560, "step": 92520 }, { "epoch": 16.98017984951367, "grad_norm": 8.332941797561944e-05, "learning_rate": 6.786276014186866e-07, "loss": 0.0, "num_input_tokens_seen": 199573888, "step": 92525 }, { "epoch": 16.981097449073225, "grad_norm": 0.0001113990947487764, "learning_rate": 6.782248598146612e-07, "loss": 0.0, "num_input_tokens_seen": 199584000, "step": 92530 }, { "epoch": 16.98201504863278, "grad_norm": 8.42157969600521e-05, "learning_rate": 6.778222290574709e-07, "loss": 0.0, "num_input_tokens_seen": 199593344, "step": 92535 }, { "epoch": 16.982932648192328, "grad_norm": 7.913303124951199e-05, "learning_rate": 6.774197091574419e-07, "loss": 0.0, "num_input_tokens_seen": 199604928, "step": 92540 }, { "epoch": 16.98385024775188, "grad_norm": 0.0005505562294274569, "learning_rate": 6.770173001248981e-07, "loss": 0.0, "num_input_tokens_seen": 199616352, "step": 92545 }, { "epoch": 16.984767847311435, "grad_norm": 0.00028743763687089086, "learning_rate": 6.766150019701601e-07, "loss": 0.0, "num_input_tokens_seen": 199627360, "step": 92550 }, { "epoch": 16.985685446870985, "grad_norm": 0.00018475238175597042, "learning_rate": 6.762128147035463e-07, "loss": 0.0, "num_input_tokens_seen": 199637888, "step": 92555 }, { "epoch": 16.986603046430538, "grad_norm": 0.0001201904087793082, "learning_rate": 6.758107383353729e-07, "loss": 0.0, "num_input_tokens_seen": 199648160, "step": 92560 }, { "epoch": 16.98752064599009, "grad_norm": 0.00018849177286028862, "learning_rate": 6.754087728759523e-07, "loss": 0.0, "num_input_tokens_seen": 199658464, "step": 92565 }, { "epoch": 16.98843824554964, "grad_norm": 8.457346120849252e-05, "learning_rate": 6.750069183355946e-07, "loss": 0.0, "num_input_tokens_seen": 199668608, "step": 92570 }, { "epoch": 16.989355845109195, "grad_norm": 8.176581468433142e-05, "learning_rate": 6.746051747246046e-07, "loss": 0.0, "num_input_tokens_seen": 199679168, "step": 92575 }, { "epoch": 16.99027344466875, "grad_norm": 0.0019001052714884281, "learning_rate": 6.7420354205329e-07, "loss": 0.1969, "num_input_tokens_seen": 199690240, "step": 92580 }, { "epoch": 16.991191044228298, "grad_norm": 0.00010719670535763726, "learning_rate": 6.738020203319495e-07, "loss": 0.0, "num_input_tokens_seen": 199701152, "step": 92585 }, { "epoch": 16.99210864378785, "grad_norm": 0.014557729475200176, "learning_rate": 6.734006095708811e-07, "loss": 0.0, "num_input_tokens_seen": 199712544, "step": 92590 }, { "epoch": 16.993026243347405, "grad_norm": 5.99296108703129e-05, "learning_rate": 6.729993097803828e-07, "loss": 0.0, "num_input_tokens_seen": 199723904, "step": 92595 }, { "epoch": 16.993943842906955, "grad_norm": 4.579361848300323e-05, "learning_rate": 6.72598120970746e-07, "loss": 0.0, "num_input_tokens_seen": 199733888, "step": 92600 }, { "epoch": 16.994861442466508, "grad_norm": 0.00024378109083045274, "learning_rate": 6.721970431522595e-07, "loss": 0.0, "num_input_tokens_seen": 199745888, "step": 92605 }, { "epoch": 16.99577904202606, "grad_norm": 0.0012493086978793144, "learning_rate": 6.717960763352122e-07, "loss": 0.0, "num_input_tokens_seen": 199755104, "step": 92610 }, { "epoch": 16.99669664158561, "grad_norm": 0.0002614639524836093, "learning_rate": 6.713952205298874e-07, "loss": 0.0, "num_input_tokens_seen": 199764896, "step": 92615 }, { "epoch": 16.997614241145165, "grad_norm": 7.439267210429534e-05, "learning_rate": 6.709944757465664e-07, "loss": 0.0792, "num_input_tokens_seen": 199775840, "step": 92620 }, { "epoch": 16.998531840704718, "grad_norm": 0.00015814394282642752, "learning_rate": 6.705938419955271e-07, "loss": 0.0, "num_input_tokens_seen": 199786816, "step": 92625 }, { "epoch": 16.999449440264268, "grad_norm": 6.48010682198219e-05, "learning_rate": 6.701933192870463e-07, "loss": 0.0, "num_input_tokens_seen": 199797760, "step": 92630 }, { "epoch": 17.00036703982382, "grad_norm": 7.408631063299254e-05, "learning_rate": 6.697929076313969e-07, "loss": 0.0, "num_input_tokens_seen": 199807136, "step": 92635 }, { "epoch": 17.001284639383375, "grad_norm": 0.0003957004810217768, "learning_rate": 6.693926070388468e-07, "loss": 0.0, "num_input_tokens_seen": 199817728, "step": 92640 }, { "epoch": 17.002202238942925, "grad_norm": 0.0017624535830691457, "learning_rate": 6.689924175196655e-07, "loss": 0.0, "num_input_tokens_seen": 199828096, "step": 92645 }, { "epoch": 17.003119838502478, "grad_norm": 0.00010424560605315492, "learning_rate": 6.685923390841165e-07, "loss": 0.0, "num_input_tokens_seen": 199839456, "step": 92650 }, { "epoch": 17.00403743806203, "grad_norm": 0.0001561957033118233, "learning_rate": 6.681923717424593e-07, "loss": 0.0, "num_input_tokens_seen": 199849536, "step": 92655 }, { "epoch": 17.00495503762158, "grad_norm": 0.0010602809488773346, "learning_rate": 6.677925155049559e-07, "loss": 0.0, "num_input_tokens_seen": 199860704, "step": 92660 }, { "epoch": 17.005872637181135, "grad_norm": 0.0002185863268096, "learning_rate": 6.673927703818595e-07, "loss": 0.0, "num_input_tokens_seen": 199870528, "step": 92665 }, { "epoch": 17.006790236740688, "grad_norm": 0.00010907181422226131, "learning_rate": 6.669931363834242e-07, "loss": 0.0, "num_input_tokens_seen": 199881920, "step": 92670 }, { "epoch": 17.007707836300238, "grad_norm": 0.0033673825673758984, "learning_rate": 6.66593613519898e-07, "loss": 0.0, "num_input_tokens_seen": 199892064, "step": 92675 }, { "epoch": 17.00862543585979, "grad_norm": 6.965552893234417e-05, "learning_rate": 6.661942018015304e-07, "loss": 0.0, "num_input_tokens_seen": 199902432, "step": 92680 }, { "epoch": 17.009543035419345, "grad_norm": 6.020851287757978e-05, "learning_rate": 6.65794901238565e-07, "loss": 0.0, "num_input_tokens_seen": 199912544, "step": 92685 }, { "epoch": 17.010460634978894, "grad_norm": 0.00034784566378220916, "learning_rate": 6.653957118412418e-07, "loss": 0.0, "num_input_tokens_seen": 199923456, "step": 92690 }, { "epoch": 17.011378234538448, "grad_norm": 7.879197801230475e-05, "learning_rate": 6.649966336198016e-07, "loss": 0.0, "num_input_tokens_seen": 199933824, "step": 92695 }, { "epoch": 17.012295834098, "grad_norm": 8.247806545114145e-05, "learning_rate": 6.645976665844788e-07, "loss": 0.0, "num_input_tokens_seen": 199943936, "step": 92700 }, { "epoch": 17.01321343365755, "grad_norm": 0.00023387372493743896, "learning_rate": 6.641988107455072e-07, "loss": 0.0, "num_input_tokens_seen": 199954304, "step": 92705 }, { "epoch": 17.014131033217105, "grad_norm": 5.3207000746624544e-05, "learning_rate": 6.638000661131144e-07, "loss": 0.0, "num_input_tokens_seen": 199964800, "step": 92710 }, { "epoch": 17.015048632776658, "grad_norm": 0.0019094827584922314, "learning_rate": 6.634014326975313e-07, "loss": 0.0, "num_input_tokens_seen": 199975584, "step": 92715 }, { "epoch": 17.015966232336208, "grad_norm": 0.00018761746468953788, "learning_rate": 6.630029105089797e-07, "loss": 0.0, "num_input_tokens_seen": 199985856, "step": 92720 }, { "epoch": 17.01688383189576, "grad_norm": 0.00010849134559975937, "learning_rate": 6.626044995576808e-07, "loss": 0.0, "num_input_tokens_seen": 199995488, "step": 92725 }, { "epoch": 17.017801431455315, "grad_norm": 4.0575065213488415e-05, "learning_rate": 6.622061998538554e-07, "loss": 0.0, "num_input_tokens_seen": 200005856, "step": 92730 }, { "epoch": 17.018719031014864, "grad_norm": 0.00016368240176234394, "learning_rate": 6.61808011407718e-07, "loss": 0.0, "num_input_tokens_seen": 200016832, "step": 92735 }, { "epoch": 17.019636630574418, "grad_norm": 0.00011879618978127837, "learning_rate": 6.614099342294816e-07, "loss": 0.0, "num_input_tokens_seen": 200027104, "step": 92740 }, { "epoch": 17.02055423013397, "grad_norm": 7.349159568548203e-05, "learning_rate": 6.610119683293559e-07, "loss": 0.0, "num_input_tokens_seen": 200039008, "step": 92745 }, { "epoch": 17.02147182969352, "grad_norm": 0.0001253632945008576, "learning_rate": 6.606141137175481e-07, "loss": 0.0, "num_input_tokens_seen": 200050720, "step": 92750 }, { "epoch": 17.022389429253074, "grad_norm": 8.380942017538473e-05, "learning_rate": 6.602163704042625e-07, "loss": 0.0, "num_input_tokens_seen": 200062080, "step": 92755 }, { "epoch": 17.023307028812628, "grad_norm": 0.00011435097258072346, "learning_rate": 6.598187383997017e-07, "loss": 0.0, "num_input_tokens_seen": 200074400, "step": 92760 }, { "epoch": 17.024224628372178, "grad_norm": 4.412117777974345e-05, "learning_rate": 6.594212177140636e-07, "loss": 0.0, "num_input_tokens_seen": 200083616, "step": 92765 }, { "epoch": 17.02514222793173, "grad_norm": 4.798153895535506e-05, "learning_rate": 6.59023808357544e-07, "loss": 0.0, "num_input_tokens_seen": 200093920, "step": 92770 }, { "epoch": 17.026059827491284, "grad_norm": 9.368453174829483e-05, "learning_rate": 6.586265103403344e-07, "loss": 0.0, "num_input_tokens_seen": 200104544, "step": 92775 }, { "epoch": 17.026977427050834, "grad_norm": 0.00036060033016838133, "learning_rate": 6.582293236726278e-07, "loss": 0.0, "num_input_tokens_seen": 200115488, "step": 92780 }, { "epoch": 17.027895026610388, "grad_norm": 0.0001299587602261454, "learning_rate": 6.5783224836461e-07, "loss": 0.0, "num_input_tokens_seen": 200126944, "step": 92785 }, { "epoch": 17.02881262616994, "grad_norm": 0.002404926111921668, "learning_rate": 6.574352844264637e-07, "loss": 0.0, "num_input_tokens_seen": 200138304, "step": 92790 }, { "epoch": 17.02973022572949, "grad_norm": 7.187804294517264e-05, "learning_rate": 6.570384318683731e-07, "loss": 0.0, "num_input_tokens_seen": 200148480, "step": 92795 }, { "epoch": 17.030647825289044, "grad_norm": 6.26279361313209e-05, "learning_rate": 6.566416907005163e-07, "loss": 0.0, "num_input_tokens_seen": 200158016, "step": 92800 }, { "epoch": 17.031565424848598, "grad_norm": 0.000757985922973603, "learning_rate": 6.562450609330678e-07, "loss": 0.0, "num_input_tokens_seen": 200168704, "step": 92805 }, { "epoch": 17.032483024408148, "grad_norm": 5.3861094784224406e-05, "learning_rate": 6.558485425762007e-07, "loss": 0.0, "num_input_tokens_seen": 200179456, "step": 92810 }, { "epoch": 17.0334006239677, "grad_norm": 9.279585356125608e-05, "learning_rate": 6.554521356400867e-07, "loss": 0.0, "num_input_tokens_seen": 200191264, "step": 92815 }, { "epoch": 17.034318223527254, "grad_norm": 0.0005384908872656524, "learning_rate": 6.550558401348922e-07, "loss": 0.0, "num_input_tokens_seen": 200203424, "step": 92820 }, { "epoch": 17.035235823086804, "grad_norm": 6.393357762135565e-05, "learning_rate": 6.546596560707796e-07, "loss": 0.0, "num_input_tokens_seen": 200214688, "step": 92825 }, { "epoch": 17.036153422646358, "grad_norm": 0.00016107092960737646, "learning_rate": 6.542635834579136e-07, "loss": 0.0, "num_input_tokens_seen": 200226016, "step": 92830 }, { "epoch": 17.03707102220591, "grad_norm": 0.00013516173930838704, "learning_rate": 6.538676223064516e-07, "loss": 0.0, "num_input_tokens_seen": 200236960, "step": 92835 }, { "epoch": 17.03798862176546, "grad_norm": 7.918893243186176e-05, "learning_rate": 6.534717726265489e-07, "loss": 0.0, "num_input_tokens_seen": 200248128, "step": 92840 }, { "epoch": 17.038906221325014, "grad_norm": 7.566346903331578e-05, "learning_rate": 6.530760344283583e-07, "loss": 0.0, "num_input_tokens_seen": 200258496, "step": 92845 }, { "epoch": 17.039823820884568, "grad_norm": 0.0035683594178408384, "learning_rate": 6.526804077220306e-07, "loss": 0.0, "num_input_tokens_seen": 200268832, "step": 92850 }, { "epoch": 17.040741420444117, "grad_norm": 6.725532875861973e-05, "learning_rate": 6.522848925177128e-07, "loss": 0.0, "num_input_tokens_seen": 200279104, "step": 92855 }, { "epoch": 17.04165902000367, "grad_norm": 0.0006279576336964965, "learning_rate": 6.518894888255483e-07, "loss": 0.0001, "num_input_tokens_seen": 200290752, "step": 92860 }, { "epoch": 17.042576619563224, "grad_norm": 8.469529711874202e-05, "learning_rate": 6.514941966556804e-07, "loss": 0.0, "num_input_tokens_seen": 200302560, "step": 92865 }, { "epoch": 17.043494219122774, "grad_norm": 3.576336894184351e-05, "learning_rate": 6.510990160182468e-07, "loss": 0.0, "num_input_tokens_seen": 200312928, "step": 92870 }, { "epoch": 17.044411818682327, "grad_norm": 0.00016777771816123277, "learning_rate": 6.507039469233823e-07, "loss": 0.0, "num_input_tokens_seen": 200322752, "step": 92875 }, { "epoch": 17.04532941824188, "grad_norm": 0.01687377318739891, "learning_rate": 6.50308989381222e-07, "loss": 0.0, "num_input_tokens_seen": 200332448, "step": 92880 }, { "epoch": 17.04624701780143, "grad_norm": 0.0001856741582741961, "learning_rate": 6.49914143401894e-07, "loss": 0.0, "num_input_tokens_seen": 200343680, "step": 92885 }, { "epoch": 17.047164617360984, "grad_norm": 0.008731639012694359, "learning_rate": 6.49519408995527e-07, "loss": 0.0, "num_input_tokens_seen": 200355168, "step": 92890 }, { "epoch": 17.048082216920537, "grad_norm": 0.0159370806068182, "learning_rate": 6.491247861722427e-07, "loss": 0.0, "num_input_tokens_seen": 200364192, "step": 92895 }, { "epoch": 17.048999816480087, "grad_norm": 5.687971133738756e-05, "learning_rate": 6.487302749421664e-07, "loss": 0.0, "num_input_tokens_seen": 200375168, "step": 92900 }, { "epoch": 17.04991741603964, "grad_norm": 5.668155426974408e-05, "learning_rate": 6.48335875315414e-07, "loss": 0.0, "num_input_tokens_seen": 200386144, "step": 92905 }, { "epoch": 17.050835015599194, "grad_norm": 7.497271872125566e-05, "learning_rate": 6.479415873021011e-07, "loss": 0.0, "num_input_tokens_seen": 200396608, "step": 92910 }, { "epoch": 17.051752615158744, "grad_norm": 0.00015023227024357766, "learning_rate": 6.475474109123425e-07, "loss": 0.0, "num_input_tokens_seen": 200406720, "step": 92915 }, { "epoch": 17.052670214718297, "grad_norm": 0.0002724985242821276, "learning_rate": 6.471533461562469e-07, "loss": 0.0, "num_input_tokens_seen": 200417312, "step": 92920 }, { "epoch": 17.05358781427785, "grad_norm": 4.822594200959429e-05, "learning_rate": 6.467593930439209e-07, "loss": 0.0, "num_input_tokens_seen": 200428064, "step": 92925 }, { "epoch": 17.0545054138374, "grad_norm": 0.00023005554976407439, "learning_rate": 6.4636555158547e-07, "loss": 0.0, "num_input_tokens_seen": 200439168, "step": 92930 }, { "epoch": 17.055423013396954, "grad_norm": 0.00012619535846170038, "learning_rate": 6.45971821790996e-07, "loss": 0.0, "num_input_tokens_seen": 200449568, "step": 92935 }, { "epoch": 17.056340612956507, "grad_norm": 0.00015555477875750512, "learning_rate": 6.45578203670596e-07, "loss": 0.0, "num_input_tokens_seen": 200460544, "step": 92940 }, { "epoch": 17.057258212516057, "grad_norm": 0.00010623007256072015, "learning_rate": 6.451846972343668e-07, "loss": 0.0, "num_input_tokens_seen": 200470304, "step": 92945 }, { "epoch": 17.05817581207561, "grad_norm": 0.00015562005864921957, "learning_rate": 6.447913024923996e-07, "loss": 0.0, "num_input_tokens_seen": 200480704, "step": 92950 }, { "epoch": 17.059093411635164, "grad_norm": 9.515694546280429e-05, "learning_rate": 6.443980194547861e-07, "loss": 0.0, "num_input_tokens_seen": 200490080, "step": 92955 }, { "epoch": 17.060011011194714, "grad_norm": 4.786140561918728e-05, "learning_rate": 6.440048481316136e-07, "loss": 0.0, "num_input_tokens_seen": 200500832, "step": 92960 }, { "epoch": 17.060928610754267, "grad_norm": 7.20657262718305e-05, "learning_rate": 6.436117885329652e-07, "loss": 0.0, "num_input_tokens_seen": 200513120, "step": 92965 }, { "epoch": 17.06184621031382, "grad_norm": 0.0008592303493060172, "learning_rate": 6.432188406689227e-07, "loss": 0.0001, "num_input_tokens_seen": 200523680, "step": 92970 }, { "epoch": 17.06276380987337, "grad_norm": 9.325446444563568e-05, "learning_rate": 6.428260045495632e-07, "loss": 0.0, "num_input_tokens_seen": 200533632, "step": 92975 }, { "epoch": 17.063681409432924, "grad_norm": 0.0043930611573159695, "learning_rate": 6.424332801849648e-07, "loss": 0.0, "num_input_tokens_seen": 200544608, "step": 92980 }, { "epoch": 17.064599008992477, "grad_norm": 6.16977849858813e-05, "learning_rate": 6.420406675851993e-07, "loss": 0.0, "num_input_tokens_seen": 200555808, "step": 92985 }, { "epoch": 17.065516608552027, "grad_norm": 5.842815153300762e-05, "learning_rate": 6.416481667603363e-07, "loss": 0.0, "num_input_tokens_seen": 200565792, "step": 92990 }, { "epoch": 17.06643420811158, "grad_norm": 0.0005672199768014252, "learning_rate": 6.412557777204426e-07, "loss": 0.0, "num_input_tokens_seen": 200574848, "step": 92995 }, { "epoch": 17.067351807671134, "grad_norm": 9.136873268289492e-05, "learning_rate": 6.408635004755831e-07, "loss": 0.0, "num_input_tokens_seen": 200586336, "step": 93000 }, { "epoch": 17.068269407230684, "grad_norm": 0.023470045998692513, "learning_rate": 6.404713350358188e-07, "loss": 0.0, "num_input_tokens_seen": 200596032, "step": 93005 }, { "epoch": 17.069187006790237, "grad_norm": 6.940957246115431e-05, "learning_rate": 6.400792814112072e-07, "loss": 0.0, "num_input_tokens_seen": 200606784, "step": 93010 }, { "epoch": 17.07010460634979, "grad_norm": 0.008892140351235867, "learning_rate": 6.396873396118059e-07, "loss": 0.0, "num_input_tokens_seen": 200617312, "step": 93015 }, { "epoch": 17.07102220590934, "grad_norm": 0.0001167042792076245, "learning_rate": 6.392955096476667e-07, "loss": 0.0, "num_input_tokens_seen": 200627680, "step": 93020 }, { "epoch": 17.071939805468894, "grad_norm": 6.179352931212634e-05, "learning_rate": 6.389037915288388e-07, "loss": 0.0, "num_input_tokens_seen": 200638880, "step": 93025 }, { "epoch": 17.072857405028447, "grad_norm": 0.00013775320257991552, "learning_rate": 6.385121852653686e-07, "loss": 0.0, "num_input_tokens_seen": 200650016, "step": 93030 }, { "epoch": 17.073775004587997, "grad_norm": 0.00014077496598474681, "learning_rate": 6.381206908673021e-07, "loss": 0.0, "num_input_tokens_seen": 200661344, "step": 93035 }, { "epoch": 17.07469260414755, "grad_norm": 9.328421583632007e-05, "learning_rate": 6.377293083446795e-07, "loss": 0.0, "num_input_tokens_seen": 200670784, "step": 93040 }, { "epoch": 17.075610203707104, "grad_norm": 0.00083166389958933, "learning_rate": 6.373380377075383e-07, "loss": 0.0, "num_input_tokens_seen": 200681792, "step": 93045 }, { "epoch": 17.076527803266654, "grad_norm": 0.0001650757185416296, "learning_rate": 6.369468789659161e-07, "loss": 0.0, "num_input_tokens_seen": 200693472, "step": 93050 }, { "epoch": 17.077445402826207, "grad_norm": 0.00022722432913724333, "learning_rate": 6.365558321298443e-07, "loss": 0.0, "num_input_tokens_seen": 200704672, "step": 93055 }, { "epoch": 17.07836300238576, "grad_norm": 3.691643360070884e-05, "learning_rate": 6.361648972093515e-07, "loss": 0.0, "num_input_tokens_seen": 200714848, "step": 93060 }, { "epoch": 17.07928060194531, "grad_norm": 7.893584552221e-05, "learning_rate": 6.357740742144669e-07, "loss": 0.0, "num_input_tokens_seen": 200726912, "step": 93065 }, { "epoch": 17.080198201504864, "grad_norm": 0.00023522476840298623, "learning_rate": 6.353833631552137e-07, "loss": 0.0, "num_input_tokens_seen": 200739072, "step": 93070 }, { "epoch": 17.081115801064417, "grad_norm": 0.000303744018310681, "learning_rate": 6.34992764041612e-07, "loss": 0.0, "num_input_tokens_seen": 200749248, "step": 93075 }, { "epoch": 17.082033400623967, "grad_norm": 0.0023548437748104334, "learning_rate": 6.346022768836802e-07, "loss": 0.0, "num_input_tokens_seen": 200759680, "step": 93080 }, { "epoch": 17.08295100018352, "grad_norm": 0.0032338097225874662, "learning_rate": 6.34211901691435e-07, "loss": 0.0, "num_input_tokens_seen": 200770560, "step": 93085 }, { "epoch": 17.083868599743074, "grad_norm": 5.4568354244111106e-05, "learning_rate": 6.338216384748885e-07, "loss": 0.0, "num_input_tokens_seen": 200781824, "step": 93090 }, { "epoch": 17.084786199302624, "grad_norm": 5.377592970035039e-05, "learning_rate": 6.33431487244049e-07, "loss": 0.0, "num_input_tokens_seen": 200791776, "step": 93095 }, { "epoch": 17.085703798862177, "grad_norm": 8.660403545945883e-05, "learning_rate": 6.330414480089248e-07, "loss": 0.0, "num_input_tokens_seen": 200801856, "step": 93100 }, { "epoch": 17.08662139842173, "grad_norm": 0.001513854367658496, "learning_rate": 6.326515207795198e-07, "loss": 0.0, "num_input_tokens_seen": 200813376, "step": 93105 }, { "epoch": 17.08753899798128, "grad_norm": 0.00028422218747437, "learning_rate": 6.322617055658331e-07, "loss": 0.0, "num_input_tokens_seen": 200823904, "step": 93110 }, { "epoch": 17.088456597540834, "grad_norm": 0.00010579587979009375, "learning_rate": 6.318720023778651e-07, "loss": 0.0, "num_input_tokens_seen": 200834592, "step": 93115 }, { "epoch": 17.089374197100387, "grad_norm": 0.000339787220582366, "learning_rate": 6.314824112256107e-07, "loss": 0.0, "num_input_tokens_seen": 200846144, "step": 93120 }, { "epoch": 17.090291796659937, "grad_norm": 0.0001250063069164753, "learning_rate": 6.310929321190623e-07, "loss": 0.0, "num_input_tokens_seen": 200856000, "step": 93125 }, { "epoch": 17.09120939621949, "grad_norm": 7.749701035209e-05, "learning_rate": 6.30703565068207e-07, "loss": 0.0, "num_input_tokens_seen": 200865888, "step": 93130 }, { "epoch": 17.092126995779044, "grad_norm": 0.000570325122680515, "learning_rate": 6.30314310083035e-07, "loss": 0.0, "num_input_tokens_seen": 200876704, "step": 93135 }, { "epoch": 17.093044595338593, "grad_norm": 0.0002234153071185574, "learning_rate": 6.299251671735285e-07, "loss": 0.0, "num_input_tokens_seen": 200886976, "step": 93140 }, { "epoch": 17.093962194898147, "grad_norm": 9.981852053897455e-05, "learning_rate": 6.295361363496677e-07, "loss": 0.0, "num_input_tokens_seen": 200898560, "step": 93145 }, { "epoch": 17.0948797944577, "grad_norm": 0.00010574558837106451, "learning_rate": 6.29147217621432e-07, "loss": 0.0, "num_input_tokens_seen": 200909472, "step": 93150 }, { "epoch": 17.09579739401725, "grad_norm": 0.0014728365931659937, "learning_rate": 6.28758410998796e-07, "loss": 0.0, "num_input_tokens_seen": 200919488, "step": 93155 }, { "epoch": 17.096714993576803, "grad_norm": 0.00014630865189246833, "learning_rate": 6.283697164917324e-07, "loss": 0.0, "num_input_tokens_seen": 200929824, "step": 93160 }, { "epoch": 17.097632593136357, "grad_norm": 7.931482832645997e-05, "learning_rate": 6.279811341102099e-07, "loss": 0.0, "num_input_tokens_seen": 200940928, "step": 93165 }, { "epoch": 17.098550192695907, "grad_norm": 0.0007564884726889431, "learning_rate": 6.275926638641938e-07, "loss": 0.0, "num_input_tokens_seen": 200952896, "step": 93170 }, { "epoch": 17.09946779225546, "grad_norm": 7.629296305822209e-05, "learning_rate": 6.272043057636507e-07, "loss": 0.0, "num_input_tokens_seen": 200963488, "step": 93175 }, { "epoch": 17.100385391815013, "grad_norm": 0.00014018331421539187, "learning_rate": 6.268160598185402e-07, "loss": 0.0, "num_input_tokens_seen": 200973952, "step": 93180 }, { "epoch": 17.101302991374563, "grad_norm": 0.00010676380770746619, "learning_rate": 6.264279260388195e-07, "loss": 0.0, "num_input_tokens_seen": 200985024, "step": 93185 }, { "epoch": 17.102220590934117, "grad_norm": 0.005975369364023209, "learning_rate": 6.260399044344445e-07, "loss": 0.0, "num_input_tokens_seen": 200995744, "step": 93190 }, { "epoch": 17.10313819049367, "grad_norm": 8.37112675071694e-05, "learning_rate": 6.256519950153655e-07, "loss": 0.0, "num_input_tokens_seen": 201007168, "step": 93195 }, { "epoch": 17.10405579005322, "grad_norm": 0.001833554357290268, "learning_rate": 6.252641977915341e-07, "loss": 0.0, "num_input_tokens_seen": 201018176, "step": 93200 }, { "epoch": 17.104973389612773, "grad_norm": 0.0002547960029914975, "learning_rate": 6.248765127728961e-07, "loss": 0.0, "num_input_tokens_seen": 201029696, "step": 93205 }, { "epoch": 17.105890989172327, "grad_norm": 0.0006659576902166009, "learning_rate": 6.244889399693948e-07, "loss": 0.0, "num_input_tokens_seen": 201039808, "step": 93210 }, { "epoch": 17.106808588731877, "grad_norm": 5.2572391723515466e-05, "learning_rate": 6.241014793909694e-07, "loss": 0.0, "num_input_tokens_seen": 201051040, "step": 93215 }, { "epoch": 17.10772618829143, "grad_norm": 0.00012737767247017473, "learning_rate": 6.237141310475603e-07, "loss": 0.0, "num_input_tokens_seen": 201061216, "step": 93220 }, { "epoch": 17.108643787850983, "grad_norm": 0.00011386865662643686, "learning_rate": 6.233268949491011e-07, "loss": 0.0, "num_input_tokens_seen": 201072672, "step": 93225 }, { "epoch": 17.109561387410533, "grad_norm": 8.811830775812268e-05, "learning_rate": 6.22939771105523e-07, "loss": 0.0, "num_input_tokens_seen": 201082592, "step": 93230 }, { "epoch": 17.110478986970087, "grad_norm": 0.00011746472591767088, "learning_rate": 6.225527595267567e-07, "loss": 0.0, "num_input_tokens_seen": 201093376, "step": 93235 }, { "epoch": 17.11139658652964, "grad_norm": 6.89238659106195e-05, "learning_rate": 6.221658602227276e-07, "loss": 0.0, "num_input_tokens_seen": 201103456, "step": 93240 }, { "epoch": 17.11231418608919, "grad_norm": 6.180685159051791e-05, "learning_rate": 6.217790732033586e-07, "loss": 0.0, "num_input_tokens_seen": 201113280, "step": 93245 }, { "epoch": 17.113231785648743, "grad_norm": 0.00010839771130122244, "learning_rate": 6.213923984785713e-07, "loss": 0.0, "num_input_tokens_seen": 201123264, "step": 93250 }, { "epoch": 17.114149385208297, "grad_norm": 4.5513985241996124e-05, "learning_rate": 6.210058360582827e-07, "loss": 0.0, "num_input_tokens_seen": 201134176, "step": 93255 }, { "epoch": 17.115066984767846, "grad_norm": 5.475608122651465e-05, "learning_rate": 6.206193859524079e-07, "loss": 0.0, "num_input_tokens_seen": 201144704, "step": 93260 }, { "epoch": 17.1159845843274, "grad_norm": 0.003048412734642625, "learning_rate": 6.202330481708574e-07, "loss": 0.0, "num_input_tokens_seen": 201156000, "step": 93265 }, { "epoch": 17.116902183886953, "grad_norm": 0.00014458521036431193, "learning_rate": 6.198468227235421e-07, "loss": 0.0, "num_input_tokens_seen": 201167424, "step": 93270 }, { "epoch": 17.117819783446503, "grad_norm": 0.00013396776921581477, "learning_rate": 6.19460709620367e-07, "loss": 0.0, "num_input_tokens_seen": 201178112, "step": 93275 }, { "epoch": 17.118737383006057, "grad_norm": 4.89423364342656e-05, "learning_rate": 6.190747088712346e-07, "loss": 0.0, "num_input_tokens_seen": 201190432, "step": 93280 }, { "epoch": 17.11965498256561, "grad_norm": 0.00019006467482540756, "learning_rate": 6.18688820486047e-07, "loss": 0.0, "num_input_tokens_seen": 201202528, "step": 93285 }, { "epoch": 17.12057258212516, "grad_norm": 0.0003227683191653341, "learning_rate": 6.183030444747007e-07, "loss": 0.0, "num_input_tokens_seen": 201213472, "step": 93290 }, { "epoch": 17.121490181684713, "grad_norm": 7.943853415781632e-05, "learning_rate": 6.179173808470906e-07, "loss": 0.0, "num_input_tokens_seen": 201223680, "step": 93295 }, { "epoch": 17.122407781244267, "grad_norm": 0.0006791901541873813, "learning_rate": 6.175318296131072e-07, "loss": 0.0, "num_input_tokens_seen": 201233952, "step": 93300 }, { "epoch": 17.123325380803816, "grad_norm": 7.585978892166167e-05, "learning_rate": 6.171463907826408e-07, "loss": 0.0001, "num_input_tokens_seen": 201244672, "step": 93305 }, { "epoch": 17.12424298036337, "grad_norm": 0.0001519482902949676, "learning_rate": 6.16761064365577e-07, "loss": 0.0, "num_input_tokens_seen": 201255712, "step": 93310 }, { "epoch": 17.125160579922923, "grad_norm": 0.0002569090574979782, "learning_rate": 6.163758503717971e-07, "loss": 0.0, "num_input_tokens_seen": 201266720, "step": 93315 }, { "epoch": 17.126078179482473, "grad_norm": 4.715080285677686e-05, "learning_rate": 6.159907488111838e-07, "loss": 0.0, "num_input_tokens_seen": 201275744, "step": 93320 }, { "epoch": 17.126995779042026, "grad_norm": 0.0007801667088642716, "learning_rate": 6.156057596936133e-07, "loss": 0.0, "num_input_tokens_seen": 201285376, "step": 93325 }, { "epoch": 17.12791337860158, "grad_norm": 0.0001297428971156478, "learning_rate": 6.152208830289586e-07, "loss": 0.0, "num_input_tokens_seen": 201296448, "step": 93330 }, { "epoch": 17.12883097816113, "grad_norm": 0.0018092511454597116, "learning_rate": 6.148361188270934e-07, "loss": 0.0, "num_input_tokens_seen": 201306816, "step": 93335 }, { "epoch": 17.129748577720683, "grad_norm": 5.311516724759713e-05, "learning_rate": 6.144514670978857e-07, "loss": 0.0, "num_input_tokens_seen": 201318336, "step": 93340 }, { "epoch": 17.130666177280236, "grad_norm": 8.728513057576492e-05, "learning_rate": 6.140669278512007e-07, "loss": 0.0, "num_input_tokens_seen": 201329056, "step": 93345 }, { "epoch": 17.131583776839786, "grad_norm": 0.00010232798376819119, "learning_rate": 6.136825010969006e-07, "loss": 0.0, "num_input_tokens_seen": 201340224, "step": 93350 }, { "epoch": 17.13250137639934, "grad_norm": 0.0004579808737616986, "learning_rate": 6.132981868448468e-07, "loss": 0.0, "num_input_tokens_seen": 201351392, "step": 93355 }, { "epoch": 17.133418975958893, "grad_norm": 6.963622581679374e-05, "learning_rate": 6.129139851048959e-07, "loss": 0.0, "num_input_tokens_seen": 201362752, "step": 93360 }, { "epoch": 17.134336575518443, "grad_norm": 0.00017394126916769892, "learning_rate": 6.125298958869009e-07, "loss": 0.0, "num_input_tokens_seen": 201373408, "step": 93365 }, { "epoch": 17.135254175077996, "grad_norm": 6.476230919361115e-05, "learning_rate": 6.121459192007156e-07, "loss": 0.0, "num_input_tokens_seen": 201384224, "step": 93370 }, { "epoch": 17.13617177463755, "grad_norm": 0.0007500960491597652, "learning_rate": 6.117620550561865e-07, "loss": 0.0, "num_input_tokens_seen": 201394912, "step": 93375 }, { "epoch": 17.1370893741971, "grad_norm": 0.0014866681303828955, "learning_rate": 6.113783034631593e-07, "loss": 0.0, "num_input_tokens_seen": 201406336, "step": 93380 }, { "epoch": 17.138006973756653, "grad_norm": 5.077875903225504e-05, "learning_rate": 6.109946644314774e-07, "loss": 0.0, "num_input_tokens_seen": 201416256, "step": 93385 }, { "epoch": 17.138924573316206, "grad_norm": 6.18299818597734e-05, "learning_rate": 6.106111379709784e-07, "loss": 0.0, "num_input_tokens_seen": 201426048, "step": 93390 }, { "epoch": 17.139842172875756, "grad_norm": 6.924604531377554e-05, "learning_rate": 6.102277240915022e-07, "loss": 0.0, "num_input_tokens_seen": 201436224, "step": 93395 }, { "epoch": 17.14075977243531, "grad_norm": 0.0019398800795897841, "learning_rate": 6.098444228028816e-07, "loss": 0.0, "num_input_tokens_seen": 201447840, "step": 93400 }, { "epoch": 17.141677371994863, "grad_norm": 0.14933449029922485, "learning_rate": 6.09461234114947e-07, "loss": 0.0001, "num_input_tokens_seen": 201459264, "step": 93405 }, { "epoch": 17.142594971554413, "grad_norm": 6.631523865507916e-05, "learning_rate": 6.090781580375271e-07, "loss": 0.0, "num_input_tokens_seen": 201469632, "step": 93410 }, { "epoch": 17.143512571113966, "grad_norm": 5.910680556553416e-05, "learning_rate": 6.086951945804459e-07, "loss": 0.0, "num_input_tokens_seen": 201482592, "step": 93415 }, { "epoch": 17.14443017067352, "grad_norm": 5.5729604355292395e-05, "learning_rate": 6.083123437535282e-07, "loss": 0.0, "num_input_tokens_seen": 201493472, "step": 93420 }, { "epoch": 17.14534777023307, "grad_norm": 3.9912236388772726e-05, "learning_rate": 6.079296055665929e-07, "loss": 0.0, "num_input_tokens_seen": 201504384, "step": 93425 }, { "epoch": 17.146265369792623, "grad_norm": 0.0003797528625000268, "learning_rate": 6.075469800294548e-07, "loss": 0.0, "num_input_tokens_seen": 201514208, "step": 93430 }, { "epoch": 17.147182969352176, "grad_norm": 3.514007039484568e-05, "learning_rate": 6.071644671519295e-07, "loss": 0.0, "num_input_tokens_seen": 201524224, "step": 93435 }, { "epoch": 17.148100568911726, "grad_norm": 0.00011900732351932675, "learning_rate": 6.067820669438279e-07, "loss": 0.0, "num_input_tokens_seen": 201536384, "step": 93440 }, { "epoch": 17.14901816847128, "grad_norm": 6.025557377142832e-05, "learning_rate": 6.063997794149573e-07, "loss": 0.0, "num_input_tokens_seen": 201546976, "step": 93445 }, { "epoch": 17.149935768030833, "grad_norm": 6.43847743049264e-05, "learning_rate": 6.060176045751215e-07, "loss": 0.0, "num_input_tokens_seen": 201557952, "step": 93450 }, { "epoch": 17.150853367590383, "grad_norm": 0.00010712196672102436, "learning_rate": 6.056355424341259e-07, "loss": 0.0001, "num_input_tokens_seen": 201569120, "step": 93455 }, { "epoch": 17.151770967149936, "grad_norm": 0.0001154242709162645, "learning_rate": 6.052535930017672e-07, "loss": 0.0001, "num_input_tokens_seen": 201579488, "step": 93460 }, { "epoch": 17.15268856670949, "grad_norm": 0.00010605666466290131, "learning_rate": 6.04871756287842e-07, "loss": 0.0, "num_input_tokens_seen": 201591328, "step": 93465 }, { "epoch": 17.15360616626904, "grad_norm": 9.302563557866961e-05, "learning_rate": 6.044900323021452e-07, "loss": 0.0, "num_input_tokens_seen": 201601792, "step": 93470 }, { "epoch": 17.154523765828593, "grad_norm": 0.0008089859038591385, "learning_rate": 6.041084210544668e-07, "loss": 0.0, "num_input_tokens_seen": 201613216, "step": 93475 }, { "epoch": 17.155441365388146, "grad_norm": 0.00017347137327305973, "learning_rate": 6.037269225545944e-07, "loss": 0.0, "num_input_tokens_seen": 201624192, "step": 93480 }, { "epoch": 17.156358964947696, "grad_norm": 0.000672972877509892, "learning_rate": 6.03345536812312e-07, "loss": 0.0, "num_input_tokens_seen": 201635136, "step": 93485 }, { "epoch": 17.15727656450725, "grad_norm": 7.812565308995545e-05, "learning_rate": 6.029642638374028e-07, "loss": 0.0, "num_input_tokens_seen": 201647008, "step": 93490 }, { "epoch": 17.158194164066803, "grad_norm": 0.00013635415234602988, "learning_rate": 6.025831036396462e-07, "loss": 0.0, "num_input_tokens_seen": 201656992, "step": 93495 }, { "epoch": 17.159111763626353, "grad_norm": 0.0010684487642720342, "learning_rate": 6.022020562288161e-07, "loss": 0.0, "num_input_tokens_seen": 201666944, "step": 93500 }, { "epoch": 17.160029363185906, "grad_norm": 0.01021520234644413, "learning_rate": 6.018211216146885e-07, "loss": 0.0, "num_input_tokens_seen": 201677984, "step": 93505 }, { "epoch": 17.16094696274546, "grad_norm": 4.389992682263255e-05, "learning_rate": 6.014402998070323e-07, "loss": 0.0, "num_input_tokens_seen": 201688352, "step": 93510 }, { "epoch": 17.16186456230501, "grad_norm": 5.9120735386386514e-05, "learning_rate": 6.010595908156147e-07, "loss": 0.0, "num_input_tokens_seen": 201699328, "step": 93515 }, { "epoch": 17.162782161864563, "grad_norm": 9.151768608717248e-05, "learning_rate": 6.006789946502017e-07, "loss": 0.0, "num_input_tokens_seen": 201710656, "step": 93520 }, { "epoch": 17.163699761424116, "grad_norm": 0.004899337887763977, "learning_rate": 6.002985113205539e-07, "loss": 0.0, "num_input_tokens_seen": 201721440, "step": 93525 }, { "epoch": 17.164617360983666, "grad_norm": 0.01177180651575327, "learning_rate": 5.999181408364308e-07, "loss": 0.0, "num_input_tokens_seen": 201733024, "step": 93530 }, { "epoch": 17.16553496054322, "grad_norm": 0.00038532118196599185, "learning_rate": 5.995378832075865e-07, "loss": 0.0, "num_input_tokens_seen": 201742720, "step": 93535 }, { "epoch": 17.166452560102773, "grad_norm": 7.024758087936789e-05, "learning_rate": 5.991577384437764e-07, "loss": 0.0173, "num_input_tokens_seen": 201753376, "step": 93540 }, { "epoch": 17.167370159662322, "grad_norm": 0.00019685461302287877, "learning_rate": 5.987777065547495e-07, "loss": 0.0, "num_input_tokens_seen": 201765120, "step": 93545 }, { "epoch": 17.168287759221876, "grad_norm": 0.0011666681384667754, "learning_rate": 5.983977875502528e-07, "loss": 0.0, "num_input_tokens_seen": 201776864, "step": 93550 }, { "epoch": 17.16920535878143, "grad_norm": 0.0004543009854387492, "learning_rate": 5.980179814400311e-07, "loss": 0.0, "num_input_tokens_seen": 201787968, "step": 93555 }, { "epoch": 17.17012295834098, "grad_norm": 0.00023524092102888972, "learning_rate": 5.976382882338266e-07, "loss": 0.0, "num_input_tokens_seen": 201799776, "step": 93560 }, { "epoch": 17.171040557900533, "grad_norm": 5.914051507716067e-05, "learning_rate": 5.972587079413755e-07, "loss": 0.0, "num_input_tokens_seen": 201810720, "step": 93565 }, { "epoch": 17.171958157460086, "grad_norm": 8.580424764659256e-05, "learning_rate": 5.968792405724161e-07, "loss": 0.0, "num_input_tokens_seen": 201821696, "step": 93570 }, { "epoch": 17.172875757019636, "grad_norm": 9.17658835533075e-05, "learning_rate": 5.964998861366794e-07, "loss": 0.0001, "num_input_tokens_seen": 201832640, "step": 93575 }, { "epoch": 17.17379335657919, "grad_norm": 0.00017432485765311867, "learning_rate": 5.961206446438966e-07, "loss": 0.0, "num_input_tokens_seen": 201844128, "step": 93580 }, { "epoch": 17.174710956138743, "grad_norm": 0.005922090262174606, "learning_rate": 5.957415161037921e-07, "loss": 0.0, "num_input_tokens_seen": 201854368, "step": 93585 }, { "epoch": 17.175628555698292, "grad_norm": 0.00022141881345305592, "learning_rate": 5.953625005260932e-07, "loss": 0.0, "num_input_tokens_seen": 201865248, "step": 93590 }, { "epoch": 17.176546155257846, "grad_norm": 0.0014138300903141499, "learning_rate": 5.949835979205199e-07, "loss": 0.0, "num_input_tokens_seen": 201877536, "step": 93595 }, { "epoch": 17.1774637548174, "grad_norm": 7.099885260686278e-05, "learning_rate": 5.946048082967898e-07, "loss": 0.0, "num_input_tokens_seen": 201888000, "step": 93600 }, { "epoch": 17.17838135437695, "grad_norm": 0.0012472396483644843, "learning_rate": 5.942261316646187e-07, "loss": 0.0, "num_input_tokens_seen": 201899104, "step": 93605 }, { "epoch": 17.179298953936502, "grad_norm": 0.00022367008205037564, "learning_rate": 5.938475680337174e-07, "loss": 0.0, "num_input_tokens_seen": 201909824, "step": 93610 }, { "epoch": 17.180216553496056, "grad_norm": 0.00015521104796789587, "learning_rate": 5.934691174137991e-07, "loss": 0.0, "num_input_tokens_seen": 201919616, "step": 93615 }, { "epoch": 17.181134153055606, "grad_norm": 7.99998888396658e-05, "learning_rate": 5.930907798145674e-07, "loss": 0.0, "num_input_tokens_seen": 201929696, "step": 93620 }, { "epoch": 17.18205175261516, "grad_norm": 0.00027812947519123554, "learning_rate": 5.92712555245728e-07, "loss": 0.0, "num_input_tokens_seen": 201940448, "step": 93625 }, { "epoch": 17.182969352174712, "grad_norm": 0.000327699730405584, "learning_rate": 5.923344437169804e-07, "loss": 0.0, "num_input_tokens_seen": 201951488, "step": 93630 }, { "epoch": 17.183886951734262, "grad_norm": 0.00014156197721604258, "learning_rate": 5.919564452380222e-07, "loss": 0.0, "num_input_tokens_seen": 201962592, "step": 93635 }, { "epoch": 17.184804551293816, "grad_norm": 6.490062514785677e-05, "learning_rate": 5.915785598185503e-07, "loss": 0.0, "num_input_tokens_seen": 201973664, "step": 93640 }, { "epoch": 17.18572215085337, "grad_norm": 0.00010890547127928585, "learning_rate": 5.912007874682557e-07, "loss": 0.0, "num_input_tokens_seen": 201984384, "step": 93645 }, { "epoch": 17.18663975041292, "grad_norm": 0.00011144532618345693, "learning_rate": 5.908231281968274e-07, "loss": 0.0, "num_input_tokens_seen": 201994240, "step": 93650 }, { "epoch": 17.187557349972472, "grad_norm": 0.00013271420903038234, "learning_rate": 5.904455820139526e-07, "loss": 0.0, "num_input_tokens_seen": 202004576, "step": 93655 }, { "epoch": 17.188474949532026, "grad_norm": 9.013947419589385e-05, "learning_rate": 5.900681489293147e-07, "loss": 0.0, "num_input_tokens_seen": 202014464, "step": 93660 }, { "epoch": 17.189392549091576, "grad_norm": 0.00041300657903775573, "learning_rate": 5.896908289525943e-07, "loss": 0.0, "num_input_tokens_seen": 202025376, "step": 93665 }, { "epoch": 17.19031014865113, "grad_norm": 0.00028377355192787945, "learning_rate": 5.893136220934675e-07, "loss": 0.0, "num_input_tokens_seen": 202037376, "step": 93670 }, { "epoch": 17.191227748210682, "grad_norm": 0.000125738704809919, "learning_rate": 5.889365283616111e-07, "loss": 0.0, "num_input_tokens_seen": 202046336, "step": 93675 }, { "epoch": 17.192145347770232, "grad_norm": 8.62392334965989e-05, "learning_rate": 5.885595477666967e-07, "loss": 0.0, "num_input_tokens_seen": 202058048, "step": 93680 }, { "epoch": 17.193062947329786, "grad_norm": 0.002110650297254324, "learning_rate": 5.881826803183915e-07, "loss": 0.0, "num_input_tokens_seen": 202068320, "step": 93685 }, { "epoch": 17.19398054688934, "grad_norm": 5.835439878865145e-05, "learning_rate": 5.878059260263641e-07, "loss": 0.0, "num_input_tokens_seen": 202078272, "step": 93690 }, { "epoch": 17.19489814644889, "grad_norm": 7.53537897253409e-05, "learning_rate": 5.874292849002761e-07, "loss": 0.0, "num_input_tokens_seen": 202089088, "step": 93695 }, { "epoch": 17.195815746008442, "grad_norm": 0.0010772810783237219, "learning_rate": 5.870527569497875e-07, "loss": 0.0, "num_input_tokens_seen": 202100800, "step": 93700 }, { "epoch": 17.196733345567996, "grad_norm": 0.0014020702801644802, "learning_rate": 5.866763421845567e-07, "loss": 0.0, "num_input_tokens_seen": 202112448, "step": 93705 }, { "epoch": 17.197650945127545, "grad_norm": 0.00013273967488203198, "learning_rate": 5.863000406142383e-07, "loss": 0.0, "num_input_tokens_seen": 202123712, "step": 93710 }, { "epoch": 17.1985685446871, "grad_norm": 0.001012794440612197, "learning_rate": 5.859238522484828e-07, "loss": 0.0, "num_input_tokens_seen": 202135584, "step": 93715 }, { "epoch": 17.199486144246652, "grad_norm": 8.782643271842971e-05, "learning_rate": 5.855477770969381e-07, "loss": 0.0, "num_input_tokens_seen": 202145760, "step": 93720 }, { "epoch": 17.200403743806202, "grad_norm": 6.966800719965249e-05, "learning_rate": 5.851718151692526e-07, "loss": 0.0, "num_input_tokens_seen": 202156320, "step": 93725 }, { "epoch": 17.201321343365755, "grad_norm": 0.00012780922406818718, "learning_rate": 5.847959664750674e-07, "loss": 0.0, "num_input_tokens_seen": 202168544, "step": 93730 }, { "epoch": 17.20223894292531, "grad_norm": 6.776316149625927e-05, "learning_rate": 5.844202310240222e-07, "loss": 0.0, "num_input_tokens_seen": 202180288, "step": 93735 }, { "epoch": 17.20315654248486, "grad_norm": 0.0001286281767534092, "learning_rate": 5.840446088257551e-07, "loss": 0.0, "num_input_tokens_seen": 202191776, "step": 93740 }, { "epoch": 17.204074142044412, "grad_norm": 0.00011804032692452893, "learning_rate": 5.836690998898997e-07, "loss": 0.0, "num_input_tokens_seen": 202202944, "step": 93745 }, { "epoch": 17.204991741603965, "grad_norm": 0.00019029696704819798, "learning_rate": 5.832937042260872e-07, "loss": 0.0, "num_input_tokens_seen": 202214528, "step": 93750 }, { "epoch": 17.205909341163515, "grad_norm": 0.0002633330295793712, "learning_rate": 5.829184218439448e-07, "loss": 0.0, "num_input_tokens_seen": 202225216, "step": 93755 }, { "epoch": 17.20682694072307, "grad_norm": 0.00042245694203302264, "learning_rate": 5.825432527531005e-07, "loss": 0.0, "num_input_tokens_seen": 202237024, "step": 93760 }, { "epoch": 17.207744540282622, "grad_norm": 0.0001075506879715249, "learning_rate": 5.821681969631749e-07, "loss": 0.0, "num_input_tokens_seen": 202247232, "step": 93765 }, { "epoch": 17.208662139842172, "grad_norm": 0.0003516164142638445, "learning_rate": 5.817932544837873e-07, "loss": 0.0, "num_input_tokens_seen": 202256448, "step": 93770 }, { "epoch": 17.209579739401725, "grad_norm": 0.000118135372758843, "learning_rate": 5.814184253245558e-07, "loss": 0.0, "num_input_tokens_seen": 202268352, "step": 93775 }, { "epoch": 17.21049733896128, "grad_norm": 0.00031347398180514574, "learning_rate": 5.810437094950938e-07, "loss": 0.0, "num_input_tokens_seen": 202279168, "step": 93780 }, { "epoch": 17.21141493852083, "grad_norm": 5.489531031344086e-05, "learning_rate": 5.806691070050108e-07, "loss": 0.0, "num_input_tokens_seen": 202290496, "step": 93785 }, { "epoch": 17.212332538080382, "grad_norm": 0.0001319233706453815, "learning_rate": 5.802946178639168e-07, "loss": 0.0, "num_input_tokens_seen": 202301568, "step": 93790 }, { "epoch": 17.213250137639935, "grad_norm": 7.009908586042002e-05, "learning_rate": 5.79920242081416e-07, "loss": 0.0, "num_input_tokens_seen": 202312672, "step": 93795 }, { "epoch": 17.214167737199485, "grad_norm": 0.0006872377707622945, "learning_rate": 5.795459796671105e-07, "loss": 0.0, "num_input_tokens_seen": 202324000, "step": 93800 }, { "epoch": 17.21508533675904, "grad_norm": 5.579319622484036e-05, "learning_rate": 5.791718306305982e-07, "loss": 0.0, "num_input_tokens_seen": 202334752, "step": 93805 }, { "epoch": 17.216002936318592, "grad_norm": 0.0001063568633981049, "learning_rate": 5.787977949814783e-07, "loss": 0.0, "num_input_tokens_seen": 202345760, "step": 93810 }, { "epoch": 17.216920535878142, "grad_norm": 0.00012992155097890645, "learning_rate": 5.784238727293423e-07, "loss": 0.0, "num_input_tokens_seen": 202356736, "step": 93815 }, { "epoch": 17.217838135437695, "grad_norm": 0.007882085628807545, "learning_rate": 5.780500638837811e-07, "loss": 0.0, "num_input_tokens_seen": 202368000, "step": 93820 }, { "epoch": 17.21875573499725, "grad_norm": 0.00013822189066559076, "learning_rate": 5.776763684543829e-07, "loss": 0.0, "num_input_tokens_seen": 202379168, "step": 93825 }, { "epoch": 17.2196733345568, "grad_norm": 0.0002739095943979919, "learning_rate": 5.773027864507313e-07, "loss": 0.0, "num_input_tokens_seen": 202389216, "step": 93830 }, { "epoch": 17.220590934116352, "grad_norm": 0.00025684229331091046, "learning_rate": 5.769293178824081e-07, "loss": 0.0, "num_input_tokens_seen": 202399584, "step": 93835 }, { "epoch": 17.221508533675905, "grad_norm": 0.0059362188912928104, "learning_rate": 5.765559627589934e-07, "loss": 0.0, "num_input_tokens_seen": 202410816, "step": 93840 }, { "epoch": 17.222426133235455, "grad_norm": 0.0010031935526058078, "learning_rate": 5.761827210900628e-07, "loss": 0.0, "num_input_tokens_seen": 202422496, "step": 93845 }, { "epoch": 17.22334373279501, "grad_norm": 0.0001763141917763278, "learning_rate": 5.758095928851893e-07, "loss": 0.0, "num_input_tokens_seen": 202433056, "step": 93850 }, { "epoch": 17.224261332354562, "grad_norm": 0.001372169004753232, "learning_rate": 5.754365781539412e-07, "loss": 0.0, "num_input_tokens_seen": 202443936, "step": 93855 }, { "epoch": 17.22517893191411, "grad_norm": 0.00018276718037668616, "learning_rate": 5.750636769058893e-07, "loss": 0.0, "num_input_tokens_seen": 202454976, "step": 93860 }, { "epoch": 17.226096531473665, "grad_norm": 0.0008198380237445235, "learning_rate": 5.746908891505953e-07, "loss": 0.0, "num_input_tokens_seen": 202464896, "step": 93865 }, { "epoch": 17.22701413103322, "grad_norm": 0.0019876183941960335, "learning_rate": 5.743182148976207e-07, "loss": 0.0, "num_input_tokens_seen": 202476576, "step": 93870 }, { "epoch": 17.22793173059277, "grad_norm": 7.994506449904293e-05, "learning_rate": 5.739456541565258e-07, "loss": 0.0, "num_input_tokens_seen": 202487520, "step": 93875 }, { "epoch": 17.228849330152322, "grad_norm": 0.00021700984507333487, "learning_rate": 5.735732069368649e-07, "loss": 0.0, "num_input_tokens_seen": 202497440, "step": 93880 }, { "epoch": 17.229766929711875, "grad_norm": 3.8739904994145036e-05, "learning_rate": 5.732008732481897e-07, "loss": 0.002, "num_input_tokens_seen": 202508608, "step": 93885 }, { "epoch": 17.230684529271425, "grad_norm": 6.362283602356911e-05, "learning_rate": 5.728286531000526e-07, "loss": 0.0, "num_input_tokens_seen": 202519520, "step": 93890 }, { "epoch": 17.23160212883098, "grad_norm": 0.004164003301411867, "learning_rate": 5.724565465019988e-07, "loss": 0.0, "num_input_tokens_seen": 202529856, "step": 93895 }, { "epoch": 17.232519728390532, "grad_norm": 0.00010835453576873988, "learning_rate": 5.720845534635727e-07, "loss": 0.0, "num_input_tokens_seen": 202540576, "step": 93900 }, { "epoch": 17.23343732795008, "grad_norm": 5.705952207790688e-05, "learning_rate": 5.717126739943141e-07, "loss": 0.0, "num_input_tokens_seen": 202551360, "step": 93905 }, { "epoch": 17.234354927509635, "grad_norm": 6.461935117840767e-05, "learning_rate": 5.71340908103763e-07, "loss": 0.0, "num_input_tokens_seen": 202562464, "step": 93910 }, { "epoch": 17.23527252706919, "grad_norm": 0.00010988117719534785, "learning_rate": 5.70969255801454e-07, "loss": 0.0, "num_input_tokens_seen": 202573920, "step": 93915 }, { "epoch": 17.23619012662874, "grad_norm": 0.00036778851062990725, "learning_rate": 5.705977170969184e-07, "loss": 0.0, "num_input_tokens_seen": 202584224, "step": 93920 }, { "epoch": 17.23710772618829, "grad_norm": 6.027098424965516e-05, "learning_rate": 5.702262919996871e-07, "loss": 0.0, "num_input_tokens_seen": 202595584, "step": 93925 }, { "epoch": 17.238025325747845, "grad_norm": 0.0001275141694350168, "learning_rate": 5.69854980519286e-07, "loss": 0.0, "num_input_tokens_seen": 202607072, "step": 93930 }, { "epoch": 17.238942925307395, "grad_norm": 6.795596709707752e-05, "learning_rate": 5.694837826652383e-07, "loss": 0.0, "num_input_tokens_seen": 202617248, "step": 93935 }, { "epoch": 17.23986052486695, "grad_norm": 0.0001498147175880149, "learning_rate": 5.691126984470641e-07, "loss": 0.0, "num_input_tokens_seen": 202628000, "step": 93940 }, { "epoch": 17.2407781244265, "grad_norm": 5.604801117442548e-05, "learning_rate": 5.68741727874283e-07, "loss": 0.0, "num_input_tokens_seen": 202637728, "step": 93945 }, { "epoch": 17.24169572398605, "grad_norm": 0.0002736349415499717, "learning_rate": 5.68370870956409e-07, "loss": 0.0, "num_input_tokens_seen": 202647904, "step": 93950 }, { "epoch": 17.242613323545605, "grad_norm": 5.850653178640641e-05, "learning_rate": 5.680001277029524e-07, "loss": 0.0, "num_input_tokens_seen": 202659712, "step": 93955 }, { "epoch": 17.24353092310516, "grad_norm": 0.00010018290049629286, "learning_rate": 5.676294981234243e-07, "loss": 0.0, "num_input_tokens_seen": 202669376, "step": 93960 }, { "epoch": 17.244448522664708, "grad_norm": 0.0006088852533139288, "learning_rate": 5.672589822273305e-07, "loss": 0.0, "num_input_tokens_seen": 202680928, "step": 93965 }, { "epoch": 17.24536612222426, "grad_norm": 0.00034099083859473467, "learning_rate": 5.668885800241724e-07, "loss": 0.0, "num_input_tokens_seen": 202691520, "step": 93970 }, { "epoch": 17.246283721783815, "grad_norm": 0.11462999880313873, "learning_rate": 5.66518291523453e-07, "loss": 0.0001, "num_input_tokens_seen": 202702496, "step": 93975 }, { "epoch": 17.247201321343365, "grad_norm": 6.051280797692016e-05, "learning_rate": 5.661481167346677e-07, "loss": 0.0, "num_input_tokens_seen": 202713824, "step": 93980 }, { "epoch": 17.248118920902918, "grad_norm": 0.00017104986181948334, "learning_rate": 5.657780556673115e-07, "loss": 0.0, "num_input_tokens_seen": 202724160, "step": 93985 }, { "epoch": 17.24903652046247, "grad_norm": 0.00014754282892681658, "learning_rate": 5.654081083308744e-07, "loss": 0.0, "num_input_tokens_seen": 202735136, "step": 93990 }, { "epoch": 17.24995412002202, "grad_norm": 4.7639052354497835e-05, "learning_rate": 5.650382747348476e-07, "loss": 0.0, "num_input_tokens_seen": 202744832, "step": 93995 }, { "epoch": 17.250871719581575, "grad_norm": 7.73435240262188e-05, "learning_rate": 5.646685548887154e-07, "loss": 0.0, "num_input_tokens_seen": 202754464, "step": 94000 }, { "epoch": 17.25178931914113, "grad_norm": 0.00013859970204066485, "learning_rate": 5.642989488019601e-07, "loss": 0.0, "num_input_tokens_seen": 202763648, "step": 94005 }, { "epoch": 17.252706918700678, "grad_norm": 6.44098618067801e-05, "learning_rate": 5.639294564840625e-07, "loss": 0.0, "num_input_tokens_seen": 202775072, "step": 94010 }, { "epoch": 17.25362451826023, "grad_norm": 5.1401362725300714e-05, "learning_rate": 5.635600779444995e-07, "loss": 0.0, "num_input_tokens_seen": 202786176, "step": 94015 }, { "epoch": 17.254542117819785, "grad_norm": 4.777077992912382e-05, "learning_rate": 5.631908131927438e-07, "loss": 0.0, "num_input_tokens_seen": 202797568, "step": 94020 }, { "epoch": 17.255459717379335, "grad_norm": 0.00033536500995978713, "learning_rate": 5.628216622382682e-07, "loss": 0.0, "num_input_tokens_seen": 202808832, "step": 94025 }, { "epoch": 17.256377316938888, "grad_norm": 0.001317864516749978, "learning_rate": 5.624526250905388e-07, "loss": 0.0, "num_input_tokens_seen": 202819456, "step": 94030 }, { "epoch": 17.25729491649844, "grad_norm": 0.0003533480048645288, "learning_rate": 5.620837017590225e-07, "loss": 0.0, "num_input_tokens_seen": 202831360, "step": 94035 }, { "epoch": 17.25821251605799, "grad_norm": 5.761391730629839e-05, "learning_rate": 5.617148922531817e-07, "loss": 0.0478, "num_input_tokens_seen": 202841792, "step": 94040 }, { "epoch": 17.259130115617545, "grad_norm": 0.00021801820548716933, "learning_rate": 5.613461965824746e-07, "loss": 0.0, "num_input_tokens_seen": 202853472, "step": 94045 }, { "epoch": 17.260047715177098, "grad_norm": 0.00016069289995357394, "learning_rate": 5.609776147563589e-07, "loss": 0.0, "num_input_tokens_seen": 202865216, "step": 94050 }, { "epoch": 17.260965314736648, "grad_norm": 0.0001695582177489996, "learning_rate": 5.606091467842861e-07, "loss": 0.0, "num_input_tokens_seen": 202874176, "step": 94055 }, { "epoch": 17.2618829142962, "grad_norm": 6.79032236803323e-05, "learning_rate": 5.602407926757092e-07, "loss": 0.0451, "num_input_tokens_seen": 202883936, "step": 94060 }, { "epoch": 17.262800513855755, "grad_norm": 7.025124796200544e-05, "learning_rate": 5.598725524400755e-07, "loss": 0.0001, "num_input_tokens_seen": 202895328, "step": 94065 }, { "epoch": 17.263718113415305, "grad_norm": 4.207215897622518e-05, "learning_rate": 5.595044260868288e-07, "loss": 0.0, "num_input_tokens_seen": 202906752, "step": 94070 }, { "epoch": 17.264635712974858, "grad_norm": 0.0005410159938037395, "learning_rate": 5.591364136254107e-07, "loss": 0.0, "num_input_tokens_seen": 202917920, "step": 94075 }, { "epoch": 17.26555331253441, "grad_norm": 9.285192209063098e-05, "learning_rate": 5.587685150652616e-07, "loss": 0.0, "num_input_tokens_seen": 202929056, "step": 94080 }, { "epoch": 17.26647091209396, "grad_norm": 0.0003016696427948773, "learning_rate": 5.584007304158168e-07, "loss": 0.0, "num_input_tokens_seen": 202940576, "step": 94085 }, { "epoch": 17.267388511653515, "grad_norm": 6.303670670604333e-05, "learning_rate": 5.580330596865085e-07, "loss": 0.0, "num_input_tokens_seen": 202949696, "step": 94090 }, { "epoch": 17.268306111213068, "grad_norm": 0.029739171266555786, "learning_rate": 5.576655028867689e-07, "loss": 0.0, "num_input_tokens_seen": 202960928, "step": 94095 }, { "epoch": 17.269223710772618, "grad_norm": 0.0004184636636637151, "learning_rate": 5.572980600260241e-07, "loss": 0.0, "num_input_tokens_seen": 202972544, "step": 94100 }, { "epoch": 17.27014131033217, "grad_norm": 8.344967500306666e-05, "learning_rate": 5.569307311136973e-07, "loss": 0.0, "num_input_tokens_seen": 202983104, "step": 94105 }, { "epoch": 17.271058909891725, "grad_norm": 0.00029410546994768083, "learning_rate": 5.56563516159212e-07, "loss": 0.0, "num_input_tokens_seen": 202993696, "step": 94110 }, { "epoch": 17.271976509451274, "grad_norm": 0.0022980424109846354, "learning_rate": 5.561964151719862e-07, "loss": 0.0, "num_input_tokens_seen": 203004704, "step": 94115 }, { "epoch": 17.272894109010828, "grad_norm": 0.00012117059668526053, "learning_rate": 5.558294281614351e-07, "loss": 0.0, "num_input_tokens_seen": 203015520, "step": 94120 }, { "epoch": 17.27381170857038, "grad_norm": 0.00011722318595275283, "learning_rate": 5.554625551369702e-07, "loss": 0.0, "num_input_tokens_seen": 203025888, "step": 94125 }, { "epoch": 17.27472930812993, "grad_norm": 0.00013216055231168866, "learning_rate": 5.550957961080034e-07, "loss": 0.0, "num_input_tokens_seen": 203036544, "step": 94130 }, { "epoch": 17.275646907689485, "grad_norm": 0.0008014454506337643, "learning_rate": 5.547291510839404e-07, "loss": 0.0, "num_input_tokens_seen": 203047808, "step": 94135 }, { "epoch": 17.276564507249038, "grad_norm": 0.00016485418018419296, "learning_rate": 5.543626200741842e-07, "loss": 0.0, "num_input_tokens_seen": 203059744, "step": 94140 }, { "epoch": 17.277482106808588, "grad_norm": 0.012420506216585636, "learning_rate": 5.539962030881374e-07, "loss": 0.0, "num_input_tokens_seen": 203071200, "step": 94145 }, { "epoch": 17.27839970636814, "grad_norm": 0.0003351535415276885, "learning_rate": 5.536299001351975e-07, "loss": 0.0, "num_input_tokens_seen": 203081824, "step": 94150 }, { "epoch": 17.279317305927695, "grad_norm": 8.762534707784653e-05, "learning_rate": 5.532637112247585e-07, "loss": 0.0, "num_input_tokens_seen": 203092512, "step": 94155 }, { "epoch": 17.280234905487244, "grad_norm": 6.136002775747329e-05, "learning_rate": 5.528976363662142e-07, "loss": 0.0, "num_input_tokens_seen": 203102656, "step": 94160 }, { "epoch": 17.281152505046798, "grad_norm": 0.00113580166362226, "learning_rate": 5.525316755689536e-07, "loss": 0.0, "num_input_tokens_seen": 203114560, "step": 94165 }, { "epoch": 17.28207010460635, "grad_norm": 5.80454507144168e-05, "learning_rate": 5.52165828842362e-07, "loss": 0.0, "num_input_tokens_seen": 203123808, "step": 94170 }, { "epoch": 17.2829877041659, "grad_norm": 8.036317740334198e-05, "learning_rate": 5.518000961958231e-07, "loss": 0.0, "num_input_tokens_seen": 203135008, "step": 94175 }, { "epoch": 17.283905303725454, "grad_norm": 0.2204051911830902, "learning_rate": 5.514344776387182e-07, "loss": 0.0001, "num_input_tokens_seen": 203146496, "step": 94180 }, { "epoch": 17.284822903285008, "grad_norm": 0.0002169387007597834, "learning_rate": 5.510689731804242e-07, "loss": 0.0, "num_input_tokens_seen": 203157920, "step": 94185 }, { "epoch": 17.285740502844558, "grad_norm": 0.0005470648757182062, "learning_rate": 5.507035828303148e-07, "loss": 0.0, "num_input_tokens_seen": 203168800, "step": 94190 }, { "epoch": 17.28665810240411, "grad_norm": 0.005346239078789949, "learning_rate": 5.503383065977641e-07, "loss": 0.0, "num_input_tokens_seen": 203179968, "step": 94195 }, { "epoch": 17.287575701963664, "grad_norm": 0.00019919188343919814, "learning_rate": 5.499731444921391e-07, "loss": 0.0, "num_input_tokens_seen": 203190816, "step": 94200 }, { "epoch": 17.288493301523214, "grad_norm": 0.00010921332432189956, "learning_rate": 5.496080965228062e-07, "loss": 0.0, "num_input_tokens_seen": 203201120, "step": 94205 }, { "epoch": 17.289410901082768, "grad_norm": 5.543178122024983e-05, "learning_rate": 5.492431626991274e-07, "loss": 0.0, "num_input_tokens_seen": 203211168, "step": 94210 }, { "epoch": 17.29032850064232, "grad_norm": 0.0017610825598239899, "learning_rate": 5.488783430304639e-07, "loss": 0.0, "num_input_tokens_seen": 203221824, "step": 94215 }, { "epoch": 17.29124610020187, "grad_norm": 0.0010500297648832202, "learning_rate": 5.485136375261729e-07, "loss": 0.0, "num_input_tokens_seen": 203232032, "step": 94220 }, { "epoch": 17.292163699761424, "grad_norm": 0.00015668456035200506, "learning_rate": 5.481490461956063e-07, "loss": 0.0, "num_input_tokens_seen": 203242272, "step": 94225 }, { "epoch": 17.293081299320978, "grad_norm": 0.0016195905627682805, "learning_rate": 5.477845690481181e-07, "loss": 0.0, "num_input_tokens_seen": 203252480, "step": 94230 }, { "epoch": 17.293998898880528, "grad_norm": 0.00017141962598543614, "learning_rate": 5.474202060930555e-07, "loss": 0.0, "num_input_tokens_seen": 203263872, "step": 94235 }, { "epoch": 17.29491649844008, "grad_norm": 7.377781730610877e-05, "learning_rate": 5.470559573397638e-07, "loss": 0.0, "num_input_tokens_seen": 203275584, "step": 94240 }, { "epoch": 17.295834097999634, "grad_norm": 0.000247839285293594, "learning_rate": 5.466918227975854e-07, "loss": 0.0, "num_input_tokens_seen": 203287072, "step": 94245 }, { "epoch": 17.296751697559184, "grad_norm": 6.322206900222227e-05, "learning_rate": 5.463278024758584e-07, "loss": 0.0, "num_input_tokens_seen": 203297696, "step": 94250 }, { "epoch": 17.297669297118738, "grad_norm": 6.986653170315549e-05, "learning_rate": 5.45963896383922e-07, "loss": 0.0, "num_input_tokens_seen": 203308384, "step": 94255 }, { "epoch": 17.29858689667829, "grad_norm": 0.00016174983466044068, "learning_rate": 5.456001045311088e-07, "loss": 0.0, "num_input_tokens_seen": 203319520, "step": 94260 }, { "epoch": 17.29950449623784, "grad_norm": 5.979697380098514e-05, "learning_rate": 5.452364269267485e-07, "loss": 0.0, "num_input_tokens_seen": 203330112, "step": 94265 }, { "epoch": 17.300422095797394, "grad_norm": 6.106838554842398e-05, "learning_rate": 5.448728635801703e-07, "loss": 0.0, "num_input_tokens_seen": 203340960, "step": 94270 }, { "epoch": 17.301339695356948, "grad_norm": 0.0008349371491931379, "learning_rate": 5.445094145006968e-07, "loss": 0.0, "num_input_tokens_seen": 203351744, "step": 94275 }, { "epoch": 17.302257294916497, "grad_norm": 6.215323810465634e-05, "learning_rate": 5.441460796976527e-07, "loss": 0.0, "num_input_tokens_seen": 203362560, "step": 94280 }, { "epoch": 17.30317489447605, "grad_norm": 5.023262929171324e-05, "learning_rate": 5.437828591803557e-07, "loss": 0.0, "num_input_tokens_seen": 203372544, "step": 94285 }, { "epoch": 17.304092494035604, "grad_norm": 5.090589911560528e-05, "learning_rate": 5.434197529581209e-07, "loss": 0.0, "num_input_tokens_seen": 203383744, "step": 94290 }, { "epoch": 17.305010093595154, "grad_norm": 0.00016708260227460414, "learning_rate": 5.430567610402632e-07, "loss": 0.0, "num_input_tokens_seen": 203393824, "step": 94295 }, { "epoch": 17.305927693154707, "grad_norm": 9.916679846355692e-05, "learning_rate": 5.426938834360918e-07, "loss": 0.0, "num_input_tokens_seen": 203404224, "step": 94300 }, { "epoch": 17.30684529271426, "grad_norm": 0.0001310436346102506, "learning_rate": 5.423311201549142e-07, "loss": 0.0, "num_input_tokens_seen": 203415968, "step": 94305 }, { "epoch": 17.30776289227381, "grad_norm": 0.00013466246309690177, "learning_rate": 5.419684712060336e-07, "loss": 0.0032, "num_input_tokens_seen": 203426624, "step": 94310 }, { "epoch": 17.308680491833364, "grad_norm": 0.00012341802357695997, "learning_rate": 5.416059365987536e-07, "loss": 0.0, "num_input_tokens_seen": 203436384, "step": 94315 }, { "epoch": 17.309598091392917, "grad_norm": 0.00015389469626825303, "learning_rate": 5.412435163423712e-07, "loss": 0.0, "num_input_tokens_seen": 203447360, "step": 94320 }, { "epoch": 17.310515690952467, "grad_norm": 8.574601815780625e-05, "learning_rate": 5.408812104461814e-07, "loss": 0.0, "num_input_tokens_seen": 203457920, "step": 94325 }, { "epoch": 17.31143329051202, "grad_norm": 0.00024612832930870354, "learning_rate": 5.405190189194786e-07, "loss": 0.0, "num_input_tokens_seen": 203468448, "step": 94330 }, { "epoch": 17.312350890071574, "grad_norm": 6.212109292391688e-05, "learning_rate": 5.401569417715513e-07, "loss": 0.0, "num_input_tokens_seen": 203480192, "step": 94335 }, { "epoch": 17.313268489631124, "grad_norm": 0.00011209092190256342, "learning_rate": 5.397949790116852e-07, "loss": 0.0, "num_input_tokens_seen": 203490624, "step": 94340 }, { "epoch": 17.314186089190677, "grad_norm": 0.0004984744009561837, "learning_rate": 5.394331306491662e-07, "loss": 0.0, "num_input_tokens_seen": 203502656, "step": 94345 }, { "epoch": 17.31510368875023, "grad_norm": 7.255710079334676e-05, "learning_rate": 5.390713966932743e-07, "loss": 0.0, "num_input_tokens_seen": 203513312, "step": 94350 }, { "epoch": 17.31602128830978, "grad_norm": 0.0008208069484680891, "learning_rate": 5.387097771532867e-07, "loss": 0.0, "num_input_tokens_seen": 203524512, "step": 94355 }, { "epoch": 17.316938887869334, "grad_norm": 0.0058644358068704605, "learning_rate": 5.383482720384786e-07, "loss": 0.0, "num_input_tokens_seen": 203535680, "step": 94360 }, { "epoch": 17.317856487428887, "grad_norm": 5.423489346867427e-05, "learning_rate": 5.379868813581234e-07, "loss": 0.0, "num_input_tokens_seen": 203545216, "step": 94365 }, { "epoch": 17.318774086988437, "grad_norm": 8.467531733913347e-05, "learning_rate": 5.37625605121489e-07, "loss": 0.0, "num_input_tokens_seen": 203556288, "step": 94370 }, { "epoch": 17.31969168654799, "grad_norm": 0.0012192316353321075, "learning_rate": 5.372644433378405e-07, "loss": 0.0, "num_input_tokens_seen": 203566720, "step": 94375 }, { "epoch": 17.320609286107544, "grad_norm": 8.081358100753278e-05, "learning_rate": 5.369033960164438e-07, "loss": 0.0, "num_input_tokens_seen": 203577216, "step": 94380 }, { "epoch": 17.321526885667094, "grad_norm": 0.0028729960322380066, "learning_rate": 5.365424631665578e-07, "loss": 0.0, "num_input_tokens_seen": 203588736, "step": 94385 }, { "epoch": 17.322444485226647, "grad_norm": 0.00045274198055267334, "learning_rate": 5.361816447974394e-07, "loss": 0.0, "num_input_tokens_seen": 203599680, "step": 94390 }, { "epoch": 17.3233620847862, "grad_norm": 7.09350933902897e-05, "learning_rate": 5.358209409183429e-07, "loss": 0.0, "num_input_tokens_seen": 203610432, "step": 94395 }, { "epoch": 17.32427968434575, "grad_norm": 0.00012113085540477186, "learning_rate": 5.354603515385215e-07, "loss": 0.0, "num_input_tokens_seen": 203621248, "step": 94400 }, { "epoch": 17.325197283905304, "grad_norm": 0.00030008264002390206, "learning_rate": 5.350998766672227e-07, "loss": 0.0, "num_input_tokens_seen": 203632352, "step": 94405 }, { "epoch": 17.326114883464857, "grad_norm": 0.0063231629319489, "learning_rate": 5.34739516313691e-07, "loss": 0.0, "num_input_tokens_seen": 203643072, "step": 94410 }, { "epoch": 17.327032483024407, "grad_norm": 0.0006860314751975238, "learning_rate": 5.343792704871714e-07, "loss": 0.0, "num_input_tokens_seen": 203653760, "step": 94415 }, { "epoch": 17.32795008258396, "grad_norm": 8.399852231377736e-05, "learning_rate": 5.340191391969019e-07, "loss": 0.0, "num_input_tokens_seen": 203664864, "step": 94420 }, { "epoch": 17.328867682143514, "grad_norm": 0.0002064776053884998, "learning_rate": 5.336591224521192e-07, "loss": 0.0, "num_input_tokens_seen": 203676096, "step": 94425 }, { "epoch": 17.329785281703064, "grad_norm": 5.648353180731647e-05, "learning_rate": 5.332992202620585e-07, "loss": 0.0, "num_input_tokens_seen": 203687168, "step": 94430 }, { "epoch": 17.330702881262617, "grad_norm": 0.000925961066968739, "learning_rate": 5.329394326359504e-07, "loss": 0.0, "num_input_tokens_seen": 203697568, "step": 94435 }, { "epoch": 17.33162048082217, "grad_norm": 0.0020620471332222223, "learning_rate": 5.325797595830224e-07, "loss": 0.0, "num_input_tokens_seen": 203707680, "step": 94440 }, { "epoch": 17.33253808038172, "grad_norm": 0.0006321117980405688, "learning_rate": 5.322202011124989e-07, "loss": 0.0, "num_input_tokens_seen": 203718304, "step": 94445 }, { "epoch": 17.333455679941274, "grad_norm": 0.00041269115172326565, "learning_rate": 5.318607572336037e-07, "loss": 0.0, "num_input_tokens_seen": 203729280, "step": 94450 }, { "epoch": 17.334373279500827, "grad_norm": 0.00022690060723107308, "learning_rate": 5.315014279555547e-07, "loss": 0.0, "num_input_tokens_seen": 203738880, "step": 94455 }, { "epoch": 17.335290879060377, "grad_norm": 5.6041917559923604e-05, "learning_rate": 5.311422132875688e-07, "loss": 0.0, "num_input_tokens_seen": 203749728, "step": 94460 }, { "epoch": 17.33620847861993, "grad_norm": 0.00012402076390571892, "learning_rate": 5.307831132388591e-07, "loss": 0.0, "num_input_tokens_seen": 203761024, "step": 94465 }, { "epoch": 17.337126078179484, "grad_norm": 0.000124919562949799, "learning_rate": 5.304241278186351e-07, "loss": 0.0, "num_input_tokens_seen": 203772832, "step": 94470 }, { "epoch": 17.338043677739034, "grad_norm": 0.0002054812794085592, "learning_rate": 5.300652570361053e-07, "loss": 0.0, "num_input_tokens_seen": 203783840, "step": 94475 }, { "epoch": 17.338961277298587, "grad_norm": 0.00012399270781315863, "learning_rate": 5.297065009004749e-07, "loss": 0.0, "num_input_tokens_seen": 203793888, "step": 94480 }, { "epoch": 17.33987887685814, "grad_norm": 8.203973266063258e-05, "learning_rate": 5.293478594209433e-07, "loss": 0.0, "num_input_tokens_seen": 203805376, "step": 94485 }, { "epoch": 17.34079647641769, "grad_norm": 0.00016443651111330837, "learning_rate": 5.289893326067108e-07, "loss": 0.0, "num_input_tokens_seen": 203816000, "step": 94490 }, { "epoch": 17.341714075977244, "grad_norm": 0.0012646825052797794, "learning_rate": 5.286309204669715e-07, "loss": 0.0, "num_input_tokens_seen": 203827040, "step": 94495 }, { "epoch": 17.342631675536797, "grad_norm": 0.0010163943516090512, "learning_rate": 5.282726230109203e-07, "loss": 0.0, "num_input_tokens_seen": 203837952, "step": 94500 }, { "epoch": 17.343549275096347, "grad_norm": 0.00849595945328474, "learning_rate": 5.279144402477454e-07, "loss": 0.0, "num_input_tokens_seen": 203848896, "step": 94505 }, { "epoch": 17.3444668746559, "grad_norm": 0.0003073245461564511, "learning_rate": 5.275563721866334e-07, "loss": 0.0, "num_input_tokens_seen": 203858784, "step": 94510 }, { "epoch": 17.345384474215454, "grad_norm": 7.263737643370405e-05, "learning_rate": 5.271984188367695e-07, "loss": 0.0, "num_input_tokens_seen": 203870080, "step": 94515 }, { "epoch": 17.346302073775004, "grad_norm": 0.0019310351926833391, "learning_rate": 5.26840580207334e-07, "loss": 0.0, "num_input_tokens_seen": 203882080, "step": 94520 }, { "epoch": 17.347219673334557, "grad_norm": 6.834246596554294e-05, "learning_rate": 5.264828563075047e-07, "loss": 0.0, "num_input_tokens_seen": 203892704, "step": 94525 }, { "epoch": 17.34813727289411, "grad_norm": 9.949773084372282e-05, "learning_rate": 5.261252471464562e-07, "loss": 0.0, "num_input_tokens_seen": 203902528, "step": 94530 }, { "epoch": 17.34905487245366, "grad_norm": 7.683546573389322e-05, "learning_rate": 5.257677527333616e-07, "loss": 0.0, "num_input_tokens_seen": 203912608, "step": 94535 }, { "epoch": 17.349972472013214, "grad_norm": 0.0015951665118336678, "learning_rate": 5.254103730773901e-07, "loss": 0.0, "num_input_tokens_seen": 203923200, "step": 94540 }, { "epoch": 17.350890071572767, "grad_norm": 0.0001130191158154048, "learning_rate": 5.250531081877064e-07, "loss": 0.0, "num_input_tokens_seen": 203934848, "step": 94545 }, { "epoch": 17.351807671132317, "grad_norm": 0.00010091563308378682, "learning_rate": 5.246959580734762e-07, "loss": 0.0, "num_input_tokens_seen": 203946304, "step": 94550 }, { "epoch": 17.35272527069187, "grad_norm": 0.0001305726618738845, "learning_rate": 5.243389227438584e-07, "loss": 0.0, "num_input_tokens_seen": 203955968, "step": 94555 }, { "epoch": 17.353642870251424, "grad_norm": 0.0015631397254765034, "learning_rate": 5.2398200220801e-07, "loss": 0.0, "num_input_tokens_seen": 203966048, "step": 94560 }, { "epoch": 17.354560469810973, "grad_norm": 0.000785756972618401, "learning_rate": 5.236251964750866e-07, "loss": 0.0, "num_input_tokens_seen": 203975968, "step": 94565 }, { "epoch": 17.355478069370527, "grad_norm": 0.00014543923316523433, "learning_rate": 5.232685055542391e-07, "loss": 0.0, "num_input_tokens_seen": 203987872, "step": 94570 }, { "epoch": 17.35639566893008, "grad_norm": 8.240692113758996e-05, "learning_rate": 5.229119294546164e-07, "loss": 0.0, "num_input_tokens_seen": 203998464, "step": 94575 }, { "epoch": 17.35731326848963, "grad_norm": 7.926124817458913e-05, "learning_rate": 5.225554681853623e-07, "loss": 0.0, "num_input_tokens_seen": 204010368, "step": 94580 }, { "epoch": 17.358230868049183, "grad_norm": 0.0004157468501944095, "learning_rate": 5.221991217556227e-07, "loss": 0.0, "num_input_tokens_seen": 204021056, "step": 94585 }, { "epoch": 17.359148467608737, "grad_norm": 6.0751608543796465e-05, "learning_rate": 5.218428901745353e-07, "loss": 0.0, "num_input_tokens_seen": 204031072, "step": 94590 }, { "epoch": 17.360066067168287, "grad_norm": 0.00010151252354262397, "learning_rate": 5.214867734512364e-07, "loss": 0.0, "num_input_tokens_seen": 204040576, "step": 94595 }, { "epoch": 17.36098366672784, "grad_norm": 0.0001567812287248671, "learning_rate": 5.211307715948616e-07, "loss": 0.0, "num_input_tokens_seen": 204050816, "step": 94600 }, { "epoch": 17.361901266287393, "grad_norm": 0.0005065565346740186, "learning_rate": 5.20774884614541e-07, "loss": 0.0, "num_input_tokens_seen": 204061856, "step": 94605 }, { "epoch": 17.362818865846943, "grad_norm": 0.00033660835470072925, "learning_rate": 5.204191125194013e-07, "loss": 0.0, "num_input_tokens_seen": 204071136, "step": 94610 }, { "epoch": 17.363736465406497, "grad_norm": 0.001190642942674458, "learning_rate": 5.200634553185696e-07, "loss": 0.0, "num_input_tokens_seen": 204081536, "step": 94615 }, { "epoch": 17.36465406496605, "grad_norm": 0.0004775597481057048, "learning_rate": 5.197079130211674e-07, "loss": 0.0, "num_input_tokens_seen": 204092128, "step": 94620 }, { "epoch": 17.3655716645256, "grad_norm": 0.00029130521579645574, "learning_rate": 5.19352485636313e-07, "loss": 0.0, "num_input_tokens_seen": 204102112, "step": 94625 }, { "epoch": 17.366489264085153, "grad_norm": 0.004213173408061266, "learning_rate": 5.189971731731219e-07, "loss": 0.0, "num_input_tokens_seen": 204112768, "step": 94630 }, { "epoch": 17.367406863644707, "grad_norm": 0.0005616276757791638, "learning_rate": 5.186419756407096e-07, "loss": 0.0, "num_input_tokens_seen": 204121760, "step": 94635 }, { "epoch": 17.368324463204257, "grad_norm": 0.00010734520037658513, "learning_rate": 5.182868930481855e-07, "loss": 0.0, "num_input_tokens_seen": 204132864, "step": 94640 }, { "epoch": 17.36924206276381, "grad_norm": 0.00038498721551150084, "learning_rate": 5.17931925404655e-07, "loss": 0.0, "num_input_tokens_seen": 204143712, "step": 94645 }, { "epoch": 17.370159662323363, "grad_norm": 0.0007448170217685401, "learning_rate": 5.175770727192253e-07, "loss": 0.0, "num_input_tokens_seen": 204154656, "step": 94650 }, { "epoch": 17.371077261882913, "grad_norm": 0.0005493587232194841, "learning_rate": 5.172223350009963e-07, "loss": 0.0, "num_input_tokens_seen": 204166272, "step": 94655 }, { "epoch": 17.371994861442467, "grad_norm": 8.172865636879578e-05, "learning_rate": 5.168677122590671e-07, "loss": 0.0, "num_input_tokens_seen": 204175936, "step": 94660 }, { "epoch": 17.37291246100202, "grad_norm": 6.442673475248739e-05, "learning_rate": 5.165132045025317e-07, "loss": 0.0, "num_input_tokens_seen": 204187872, "step": 94665 }, { "epoch": 17.37383006056157, "grad_norm": 0.012736525386571884, "learning_rate": 5.161588117404848e-07, "loss": 0.0, "num_input_tokens_seen": 204199328, "step": 94670 }, { "epoch": 17.374747660121123, "grad_norm": 0.0001530057197669521, "learning_rate": 5.15804533982015e-07, "loss": 0.0, "num_input_tokens_seen": 204210624, "step": 94675 }, { "epoch": 17.375665259680677, "grad_norm": 0.00018666216055862606, "learning_rate": 5.154503712362092e-07, "loss": 0.0, "num_input_tokens_seen": 204222272, "step": 94680 }, { "epoch": 17.376582859240226, "grad_norm": 0.00019162922399118543, "learning_rate": 5.150963235121509e-07, "loss": 0.0, "num_input_tokens_seen": 204232576, "step": 94685 }, { "epoch": 17.37750045879978, "grad_norm": 5.7673642004374415e-05, "learning_rate": 5.147423908189198e-07, "loss": 0.0, "num_input_tokens_seen": 204243424, "step": 94690 }, { "epoch": 17.378418058359333, "grad_norm": 7.964000542415306e-05, "learning_rate": 5.143885731655962e-07, "loss": 0.0, "num_input_tokens_seen": 204254016, "step": 94695 }, { "epoch": 17.379335657918883, "grad_norm": 0.0002955577801913023, "learning_rate": 5.14034870561253e-07, "loss": 0.0, "num_input_tokens_seen": 204265568, "step": 94700 }, { "epoch": 17.380253257478437, "grad_norm": 0.001640479313209653, "learning_rate": 5.136812830149635e-07, "loss": 0.0, "num_input_tokens_seen": 204277664, "step": 94705 }, { "epoch": 17.38117085703799, "grad_norm": 0.00011131733481306583, "learning_rate": 5.133278105357952e-07, "loss": 0.0, "num_input_tokens_seen": 204288128, "step": 94710 }, { "epoch": 17.38208845659754, "grad_norm": 0.0004907094407826662, "learning_rate": 5.12974453132814e-07, "loss": 0.0, "num_input_tokens_seen": 204299552, "step": 94715 }, { "epoch": 17.383006056157093, "grad_norm": 7.574167830171064e-05, "learning_rate": 5.126212108150852e-07, "loss": 0.0, "num_input_tokens_seen": 204310784, "step": 94720 }, { "epoch": 17.383923655716647, "grad_norm": 0.0002281328634126112, "learning_rate": 5.122680835916677e-07, "loss": 0.0, "num_input_tokens_seen": 204322272, "step": 94725 }, { "epoch": 17.384841255276196, "grad_norm": 0.0003288908046670258, "learning_rate": 5.11915071471617e-07, "loss": 0.0, "num_input_tokens_seen": 204332928, "step": 94730 }, { "epoch": 17.38575885483575, "grad_norm": 0.025455070659518242, "learning_rate": 5.115621744639898e-07, "loss": 0.0, "num_input_tokens_seen": 204344864, "step": 94735 }, { "epoch": 17.386676454395303, "grad_norm": 0.0005042154807597399, "learning_rate": 5.112093925778366e-07, "loss": 0.0, "num_input_tokens_seen": 204356480, "step": 94740 }, { "epoch": 17.387594053954853, "grad_norm": 0.00010128141002496704, "learning_rate": 5.108567258222047e-07, "loss": 0.0, "num_input_tokens_seen": 204367136, "step": 94745 }, { "epoch": 17.388511653514406, "grad_norm": 6.962904444662854e-05, "learning_rate": 5.105041742061406e-07, "loss": 0.0, "num_input_tokens_seen": 204378336, "step": 94750 }, { "epoch": 17.38942925307396, "grad_norm": 0.0009844916639849544, "learning_rate": 5.101517377386867e-07, "loss": 0.0, "num_input_tokens_seen": 204388416, "step": 94755 }, { "epoch": 17.39034685263351, "grad_norm": 7.315384573303163e-05, "learning_rate": 5.097994164288822e-07, "loss": 0.0, "num_input_tokens_seen": 204400256, "step": 94760 }, { "epoch": 17.391264452193063, "grad_norm": 0.0001713170640869066, "learning_rate": 5.094472102857622e-07, "loss": 0.0, "num_input_tokens_seen": 204411232, "step": 94765 }, { "epoch": 17.392182051752616, "grad_norm": 0.0001503092353232205, "learning_rate": 5.090951193183629e-07, "loss": 0.0, "num_input_tokens_seen": 204422304, "step": 94770 }, { "epoch": 17.393099651312166, "grad_norm": 0.0001067839766619727, "learning_rate": 5.087431435357132e-07, "loss": 0.0, "num_input_tokens_seen": 204432416, "step": 94775 }, { "epoch": 17.39401725087172, "grad_norm": 0.00013755212421528995, "learning_rate": 5.083912829468408e-07, "loss": 0.0, "num_input_tokens_seen": 204443616, "step": 94780 }, { "epoch": 17.394934850431273, "grad_norm": 0.0001332946849288419, "learning_rate": 5.080395375607705e-07, "loss": 0.0, "num_input_tokens_seen": 204454048, "step": 94785 }, { "epoch": 17.395852449990823, "grad_norm": 5.677253648173064e-05, "learning_rate": 5.076879073865248e-07, "loss": 0.0, "num_input_tokens_seen": 204463392, "step": 94790 }, { "epoch": 17.396770049550376, "grad_norm": 0.00015134524437598884, "learning_rate": 5.07336392433121e-07, "loss": 0.0, "num_input_tokens_seen": 204474624, "step": 94795 }, { "epoch": 17.39768764910993, "grad_norm": 8.640120358904824e-05, "learning_rate": 5.06984992709576e-07, "loss": 0.0, "num_input_tokens_seen": 204485600, "step": 94800 }, { "epoch": 17.39860524866948, "grad_norm": 0.00013321085134521127, "learning_rate": 5.066337082249028e-07, "loss": 0.0, "num_input_tokens_seen": 204495872, "step": 94805 }, { "epoch": 17.399522848229033, "grad_norm": 5.4565203754464164e-05, "learning_rate": 5.062825389881109e-07, "loss": 0.0, "num_input_tokens_seen": 204506112, "step": 94810 }, { "epoch": 17.400440447788586, "grad_norm": 6.520832539536059e-05, "learning_rate": 5.059314850082064e-07, "loss": 0.0, "num_input_tokens_seen": 204516416, "step": 94815 }, { "epoch": 17.401358047348136, "grad_norm": 0.00012353819329291582, "learning_rate": 5.055805462941954e-07, "loss": 0.0, "num_input_tokens_seen": 204526656, "step": 94820 }, { "epoch": 17.40227564690769, "grad_norm": 0.000341240840498358, "learning_rate": 5.052297228550768e-07, "loss": 0.0, "num_input_tokens_seen": 204537632, "step": 94825 }, { "epoch": 17.403193246467243, "grad_norm": 7.80635018600151e-05, "learning_rate": 5.048790146998495e-07, "loss": 0.0, "num_input_tokens_seen": 204548864, "step": 94830 }, { "epoch": 17.404110846026793, "grad_norm": 0.0001195837976410985, "learning_rate": 5.045284218375091e-07, "loss": 0.0, "num_input_tokens_seen": 204559712, "step": 94835 }, { "epoch": 17.405028445586346, "grad_norm": 0.0001200377955683507, "learning_rate": 5.041779442770472e-07, "loss": 0.0, "num_input_tokens_seen": 204568480, "step": 94840 }, { "epoch": 17.4059460451459, "grad_norm": 7.313305832212791e-05, "learning_rate": 5.038275820274536e-07, "loss": 0.0, "num_input_tokens_seen": 204579904, "step": 94845 }, { "epoch": 17.40686364470545, "grad_norm": 0.0010597644140943885, "learning_rate": 5.03477335097713e-07, "loss": 0.0, "num_input_tokens_seen": 204590752, "step": 94850 }, { "epoch": 17.407781244265003, "grad_norm": 0.0011251801624894142, "learning_rate": 5.031272034968104e-07, "loss": 0.0, "num_input_tokens_seen": 204601024, "step": 94855 }, { "epoch": 17.408698843824556, "grad_norm": 0.00013650192704517394, "learning_rate": 5.027771872337256e-07, "loss": 0.0, "num_input_tokens_seen": 204612576, "step": 94860 }, { "epoch": 17.409616443384106, "grad_norm": 0.00011304856889182702, "learning_rate": 5.024272863174351e-07, "loss": 0.0, "num_input_tokens_seen": 204622624, "step": 94865 }, { "epoch": 17.41053404294366, "grad_norm": 8.535717643098906e-05, "learning_rate": 5.02077500756915e-07, "loss": 0.0, "num_input_tokens_seen": 204632928, "step": 94870 }, { "epoch": 17.411451642503213, "grad_norm": 0.029535284265875816, "learning_rate": 5.017278305611357e-07, "loss": 0.0, "num_input_tokens_seen": 204643136, "step": 94875 }, { "epoch": 17.412369242062763, "grad_norm": 0.0002343640080653131, "learning_rate": 5.013782757390662e-07, "loss": 0.0, "num_input_tokens_seen": 204654048, "step": 94880 }, { "epoch": 17.413286841622316, "grad_norm": 0.0004118765064049512, "learning_rate": 5.010288362996707e-07, "loss": 0.0, "num_input_tokens_seen": 204665312, "step": 94885 }, { "epoch": 17.41420444118187, "grad_norm": 0.00014400134386960417, "learning_rate": 5.006795122519131e-07, "loss": 0.0, "num_input_tokens_seen": 204673536, "step": 94890 }, { "epoch": 17.41512204074142, "grad_norm": 6.498482980532572e-05, "learning_rate": 5.003303036047536e-07, "loss": 0.0, "num_input_tokens_seen": 204683968, "step": 94895 }, { "epoch": 17.416039640300973, "grad_norm": 0.00010440112237120047, "learning_rate": 4.999812103671475e-07, "loss": 0.0005, "num_input_tokens_seen": 204694592, "step": 94900 }, { "epoch": 17.416957239860526, "grad_norm": 5.135128230904229e-05, "learning_rate": 4.99632232548049e-07, "loss": 0.0, "num_input_tokens_seen": 204706208, "step": 94905 }, { "epoch": 17.417874839420076, "grad_norm": 7.057098264340311e-05, "learning_rate": 4.992833701564087e-07, "loss": 0.0, "num_input_tokens_seen": 204716832, "step": 94910 }, { "epoch": 17.41879243897963, "grad_norm": 7.508049020543694e-05, "learning_rate": 4.989346232011738e-07, "loss": 0.0, "num_input_tokens_seen": 204726592, "step": 94915 }, { "epoch": 17.419710038539183, "grad_norm": 4.508923302637413e-05, "learning_rate": 4.985859916912905e-07, "loss": 0.0, "num_input_tokens_seen": 204737280, "step": 94920 }, { "epoch": 17.420627638098733, "grad_norm": 5.077129026176408e-05, "learning_rate": 4.982374756357e-07, "loss": 0.0, "num_input_tokens_seen": 204747552, "step": 94925 }, { "epoch": 17.421545237658286, "grad_norm": 0.00024480262072756886, "learning_rate": 4.978890750433401e-07, "loss": 0.0, "num_input_tokens_seen": 204759008, "step": 94930 }, { "epoch": 17.42246283721784, "grad_norm": 0.00012156827870057896, "learning_rate": 4.975407899231488e-07, "loss": 0.0, "num_input_tokens_seen": 204769344, "step": 94935 }, { "epoch": 17.42338043677739, "grad_norm": 0.00023543111456092447, "learning_rate": 4.971926202840582e-07, "loss": 0.0, "num_input_tokens_seen": 204781152, "step": 94940 }, { "epoch": 17.424298036336943, "grad_norm": 9.302738180849701e-05, "learning_rate": 4.96844566134998e-07, "loss": 0.0, "num_input_tokens_seen": 204791904, "step": 94945 }, { "epoch": 17.425215635896496, "grad_norm": 0.00037259000237099826, "learning_rate": 4.964966274848948e-07, "loss": 0.0, "num_input_tokens_seen": 204802560, "step": 94950 }, { "epoch": 17.426133235456046, "grad_norm": 6.689043220831081e-05, "learning_rate": 4.961488043426738e-07, "loss": 0.0, "num_input_tokens_seen": 204813472, "step": 94955 }, { "epoch": 17.4270508350156, "grad_norm": 5.966456592432223e-05, "learning_rate": 4.958010967172561e-07, "loss": 0.0, "num_input_tokens_seen": 204825056, "step": 94960 }, { "epoch": 17.427968434575153, "grad_norm": 0.0014545421581715345, "learning_rate": 4.954535046175579e-07, "loss": 0.0, "num_input_tokens_seen": 204836128, "step": 94965 }, { "epoch": 17.428886034134702, "grad_norm": 0.0006434588576667011, "learning_rate": 4.951060280524972e-07, "loss": 0.0, "num_input_tokens_seen": 204847552, "step": 94970 }, { "epoch": 17.429803633694256, "grad_norm": 0.0005401116795837879, "learning_rate": 4.947586670309851e-07, "loss": 0.0, "num_input_tokens_seen": 204858944, "step": 94975 }, { "epoch": 17.43072123325381, "grad_norm": 5.299540134728886e-05, "learning_rate": 4.9441142156193e-07, "loss": 0.0, "num_input_tokens_seen": 204869568, "step": 94980 }, { "epoch": 17.43163883281336, "grad_norm": 6.217170448508114e-05, "learning_rate": 4.940642916542387e-07, "loss": 0.0, "num_input_tokens_seen": 204879360, "step": 94985 }, { "epoch": 17.432556432372913, "grad_norm": 4.556219937512651e-05, "learning_rate": 4.937172773168153e-07, "loss": 0.0, "num_input_tokens_seen": 204890080, "step": 94990 }, { "epoch": 17.433474031932466, "grad_norm": 0.0018406786257401109, "learning_rate": 4.933703785585597e-07, "loss": 0.0, "num_input_tokens_seen": 204901472, "step": 94995 }, { "epoch": 17.434391631492016, "grad_norm": 0.001324568293057382, "learning_rate": 4.930235953883683e-07, "loss": 0.0, "num_input_tokens_seen": 204910080, "step": 95000 }, { "epoch": 17.43530923105157, "grad_norm": 0.004785258322954178, "learning_rate": 4.926769278151377e-07, "loss": 0.0, "num_input_tokens_seen": 204920768, "step": 95005 }, { "epoch": 17.436226830611123, "grad_norm": 0.00021644574007950723, "learning_rate": 4.923303758477577e-07, "loss": 0.0, "num_input_tokens_seen": 204930240, "step": 95010 }, { "epoch": 17.437144430170672, "grad_norm": 0.001387906726449728, "learning_rate": 4.91983939495117e-07, "loss": 0.0, "num_input_tokens_seen": 204941024, "step": 95015 }, { "epoch": 17.438062029730226, "grad_norm": 0.004119241144508123, "learning_rate": 4.916376187661021e-07, "loss": 0.0, "num_input_tokens_seen": 204950784, "step": 95020 }, { "epoch": 17.43897962928978, "grad_norm": 7.622564589837566e-05, "learning_rate": 4.912914136695945e-07, "loss": 0.0, "num_input_tokens_seen": 204961792, "step": 95025 }, { "epoch": 17.43989722884933, "grad_norm": 0.0001376243308186531, "learning_rate": 4.909453242144746e-07, "loss": 0.0, "num_input_tokens_seen": 204973888, "step": 95030 }, { "epoch": 17.440814828408882, "grad_norm": 0.0006535229622386396, "learning_rate": 4.905993504096179e-07, "loss": 0.0, "num_input_tokens_seen": 204985120, "step": 95035 }, { "epoch": 17.441732427968436, "grad_norm": 0.0016566854901611805, "learning_rate": 4.902534922639002e-07, "loss": 0.0, "num_input_tokens_seen": 204996352, "step": 95040 }, { "epoch": 17.442650027527986, "grad_norm": 0.003587992861866951, "learning_rate": 4.899077497861904e-07, "loss": 0.0, "num_input_tokens_seen": 205007040, "step": 95045 }, { "epoch": 17.44356762708754, "grad_norm": 9.943509212462232e-05, "learning_rate": 4.895621229853558e-07, "loss": 0.0, "num_input_tokens_seen": 205018144, "step": 95050 }, { "epoch": 17.444485226647092, "grad_norm": 0.00010070468852063641, "learning_rate": 4.892166118702635e-07, "loss": 0.0, "num_input_tokens_seen": 205028960, "step": 95055 }, { "epoch": 17.445402826206642, "grad_norm": 0.00018325381097383797, "learning_rate": 4.888712164497738e-07, "loss": 0.0, "num_input_tokens_seen": 205040256, "step": 95060 }, { "epoch": 17.446320425766196, "grad_norm": 0.013367260806262493, "learning_rate": 4.885259367327449e-07, "loss": 0.0, "num_input_tokens_seen": 205050752, "step": 95065 }, { "epoch": 17.44723802532575, "grad_norm": 0.051437415182590485, "learning_rate": 4.881807727280346e-07, "loss": 0.0002, "num_input_tokens_seen": 205061568, "step": 95070 }, { "epoch": 17.4481556248853, "grad_norm": 0.0017626415938138962, "learning_rate": 4.878357244444947e-07, "loss": 0.0, "num_input_tokens_seen": 205072288, "step": 95075 }, { "epoch": 17.449073224444852, "grad_norm": 0.015202092938125134, "learning_rate": 4.874907918909755e-07, "loss": 0.0, "num_input_tokens_seen": 205083488, "step": 95080 }, { "epoch": 17.449990824004406, "grad_norm": 0.0009822557913139462, "learning_rate": 4.871459750763224e-07, "loss": 0.0, "num_input_tokens_seen": 205093312, "step": 95085 }, { "epoch": 17.450908423563956, "grad_norm": 7.270685455296189e-05, "learning_rate": 4.86801274009382e-07, "loss": 0.0, "num_input_tokens_seen": 205103552, "step": 95090 }, { "epoch": 17.45182602312351, "grad_norm": 0.0033391371835023165, "learning_rate": 4.864566886989941e-07, "loss": 0.0, "num_input_tokens_seen": 205113792, "step": 95095 }, { "epoch": 17.452743622683062, "grad_norm": 5.0979306251974776e-05, "learning_rate": 4.861122191539969e-07, "loss": 0.0, "num_input_tokens_seen": 205124448, "step": 95100 }, { "epoch": 17.453661222242612, "grad_norm": 0.00014493201160803437, "learning_rate": 4.857678653832249e-07, "loss": 0.0, "num_input_tokens_seen": 205135776, "step": 95105 }, { "epoch": 17.454578821802166, "grad_norm": 0.0017960206605494022, "learning_rate": 4.854236273955098e-07, "loss": 0.0, "num_input_tokens_seen": 205146400, "step": 95110 }, { "epoch": 17.45549642136172, "grad_norm": 0.0002243887138320133, "learning_rate": 4.850795051996832e-07, "loss": 0.0, "num_input_tokens_seen": 205157408, "step": 95115 }, { "epoch": 17.45641402092127, "grad_norm": 0.0007657033274881542, "learning_rate": 4.847354988045694e-07, "loss": 0.0, "num_input_tokens_seen": 205168032, "step": 95120 }, { "epoch": 17.457331620480822, "grad_norm": 9.24974083318375e-05, "learning_rate": 4.84391608218992e-07, "loss": 0.0, "num_input_tokens_seen": 205179136, "step": 95125 }, { "epoch": 17.458249220040376, "grad_norm": 5.415771374828182e-05, "learning_rate": 4.840478334517712e-07, "loss": 0.0, "num_input_tokens_seen": 205189664, "step": 95130 }, { "epoch": 17.459166819599925, "grad_norm": 0.0002015544887399301, "learning_rate": 4.837041745117238e-07, "loss": 0.0, "num_input_tokens_seen": 205200800, "step": 95135 }, { "epoch": 17.46008441915948, "grad_norm": 0.0002679524477571249, "learning_rate": 4.833606314076655e-07, "loss": 0.0, "num_input_tokens_seen": 205212096, "step": 95140 }, { "epoch": 17.461002018719032, "grad_norm": 0.00022290131892077625, "learning_rate": 4.830172041484072e-07, "loss": 0.0, "num_input_tokens_seen": 205222912, "step": 95145 }, { "epoch": 17.461919618278582, "grad_norm": 0.0007999094668775797, "learning_rate": 4.826738927427555e-07, "loss": 0.0, "num_input_tokens_seen": 205233984, "step": 95150 }, { "epoch": 17.462837217838135, "grad_norm": 5.6869048421503976e-05, "learning_rate": 4.823306971995179e-07, "loss": 0.0, "num_input_tokens_seen": 205244864, "step": 95155 }, { "epoch": 17.46375481739769, "grad_norm": 0.011522842571139336, "learning_rate": 4.819876175274968e-07, "loss": 0.0, "num_input_tokens_seen": 205254496, "step": 95160 }, { "epoch": 17.46467241695724, "grad_norm": 0.00010458656470291317, "learning_rate": 4.816446537354907e-07, "loss": 0.0, "num_input_tokens_seen": 205264224, "step": 95165 }, { "epoch": 17.465590016516792, "grad_norm": 0.00036881855339743197, "learning_rate": 4.813018058322955e-07, "loss": 0.0, "num_input_tokens_seen": 205275744, "step": 95170 }, { "epoch": 17.466507616076345, "grad_norm": 5.590703949565068e-05, "learning_rate": 4.809590738267067e-07, "loss": 0.0, "num_input_tokens_seen": 205286688, "step": 95175 }, { "epoch": 17.467425215635895, "grad_norm": 0.00012516777496784925, "learning_rate": 4.806164577275135e-07, "loss": 0.0, "num_input_tokens_seen": 205297728, "step": 95180 }, { "epoch": 17.46834281519545, "grad_norm": 0.0003670094592962414, "learning_rate": 4.802739575435028e-07, "loss": 0.0, "num_input_tokens_seen": 205308960, "step": 95185 }, { "epoch": 17.469260414755002, "grad_norm": 0.00010914730228250846, "learning_rate": 4.799315732834614e-07, "loss": 0.0, "num_input_tokens_seen": 205319360, "step": 95190 }, { "epoch": 17.470178014314552, "grad_norm": 0.00014758679026272148, "learning_rate": 4.795893049561695e-07, "loss": 0.0008, "num_input_tokens_seen": 205329344, "step": 95195 }, { "epoch": 17.471095613874105, "grad_norm": 0.001270644017495215, "learning_rate": 4.792471525704051e-07, "loss": 0.0, "num_input_tokens_seen": 205339456, "step": 95200 }, { "epoch": 17.47201321343366, "grad_norm": 8.978041296359152e-05, "learning_rate": 4.789051161349456e-07, "loss": 0.0, "num_input_tokens_seen": 205349568, "step": 95205 }, { "epoch": 17.47293081299321, "grad_norm": 8.217389404308051e-05, "learning_rate": 4.785631956585629e-07, "loss": 0.0, "num_input_tokens_seen": 205359616, "step": 95210 }, { "epoch": 17.473848412552762, "grad_norm": 9.666448022471741e-05, "learning_rate": 4.782213911500266e-07, "loss": 0.0, "num_input_tokens_seen": 205370144, "step": 95215 }, { "epoch": 17.474766012112315, "grad_norm": 9.202919318340719e-05, "learning_rate": 4.778797026181026e-07, "loss": 0.0, "num_input_tokens_seen": 205381248, "step": 95220 }, { "epoch": 17.475683611671865, "grad_norm": 0.0001525961561128497, "learning_rate": 4.775381300715565e-07, "loss": 0.0, "num_input_tokens_seen": 205392256, "step": 95225 }, { "epoch": 17.47660121123142, "grad_norm": 0.00525705749168992, "learning_rate": 4.77196673519148e-07, "loss": 0.0, "num_input_tokens_seen": 205403392, "step": 95230 }, { "epoch": 17.477518810790972, "grad_norm": 0.000104823790024966, "learning_rate": 4.768553329696341e-07, "loss": 0.0, "num_input_tokens_seen": 205412800, "step": 95235 }, { "epoch": 17.478436410350522, "grad_norm": 6.300446693785489e-05, "learning_rate": 4.76514108431772e-07, "loss": 0.0, "num_input_tokens_seen": 205423456, "step": 95240 }, { "epoch": 17.479354009910075, "grad_norm": 0.0005300943157635629, "learning_rate": 4.7617299991431164e-07, "loss": 0.0, "num_input_tokens_seen": 205434080, "step": 95245 }, { "epoch": 17.48027160946963, "grad_norm": 0.0003119367465842515, "learning_rate": 4.75832007426002e-07, "loss": 0.0, "num_input_tokens_seen": 205443840, "step": 95250 }, { "epoch": 17.48118920902918, "grad_norm": 0.00014864349213894457, "learning_rate": 4.7549113097559053e-07, "loss": 0.0, "num_input_tokens_seen": 205455264, "step": 95255 }, { "epoch": 17.482106808588732, "grad_norm": 5.166505798115395e-05, "learning_rate": 4.751503705718191e-07, "loss": 0.0, "num_input_tokens_seen": 205465728, "step": 95260 }, { "epoch": 17.483024408148285, "grad_norm": 0.0001320742885582149, "learning_rate": 4.7480972622342803e-07, "loss": 0.0, "num_input_tokens_seen": 205476992, "step": 95265 }, { "epoch": 17.483942007707835, "grad_norm": 0.00012324390991125256, "learning_rate": 4.74469197939153e-07, "loss": 0.0, "num_input_tokens_seen": 205487296, "step": 95270 }, { "epoch": 17.48485960726739, "grad_norm": 0.0006929652881808579, "learning_rate": 4.741287857277299e-07, "loss": 0.0, "num_input_tokens_seen": 205498464, "step": 95275 }, { "epoch": 17.485777206826942, "grad_norm": 0.00012243317905813456, "learning_rate": 4.7378848959788893e-07, "loss": 0.0, "num_input_tokens_seen": 205508608, "step": 95280 }, { "epoch": 17.48669480638649, "grad_norm": 0.0003506389621179551, "learning_rate": 4.734483095583575e-07, "loss": 0.0, "num_input_tokens_seen": 205519904, "step": 95285 }, { "epoch": 17.487612405946045, "grad_norm": 0.00011469826858956367, "learning_rate": 4.7310824561786206e-07, "loss": 0.0, "num_input_tokens_seen": 205529824, "step": 95290 }, { "epoch": 17.4885300055056, "grad_norm": 5.332286673365161e-05, "learning_rate": 4.7276829778512445e-07, "loss": 0.0, "num_input_tokens_seen": 205539424, "step": 95295 }, { "epoch": 17.48944760506515, "grad_norm": 0.0009972162079066038, "learning_rate": 4.724284660688633e-07, "loss": 0.0, "num_input_tokens_seen": 205550720, "step": 95300 }, { "epoch": 17.490365204624702, "grad_norm": 8.803801028989255e-05, "learning_rate": 4.7208875047779377e-07, "loss": 0.0, "num_input_tokens_seen": 205561152, "step": 95305 }, { "epoch": 17.491282804184255, "grad_norm": 0.0006706160493195057, "learning_rate": 4.7174915102063125e-07, "loss": 0.0, "num_input_tokens_seen": 205571456, "step": 95310 }, { "epoch": 17.492200403743805, "grad_norm": 0.00017334108997602016, "learning_rate": 4.714096677060848e-07, "loss": 0.0, "num_input_tokens_seen": 205582048, "step": 95315 }, { "epoch": 17.49311800330336, "grad_norm": 0.00030791721655987203, "learning_rate": 4.7107030054286185e-07, "loss": 0.0, "num_input_tokens_seen": 205593408, "step": 95320 }, { "epoch": 17.494035602862912, "grad_norm": 0.00013297943223733455, "learning_rate": 4.707310495396661e-07, "loss": 0.0, "num_input_tokens_seen": 205604576, "step": 95325 }, { "epoch": 17.49495320242246, "grad_norm": 0.0002992197114508599, "learning_rate": 4.7039191470519884e-07, "loss": 0.0, "num_input_tokens_seen": 205615104, "step": 95330 }, { "epoch": 17.495870801982015, "grad_norm": 8.312365389429033e-05, "learning_rate": 4.700528960481593e-07, "loss": 0.0, "num_input_tokens_seen": 205625760, "step": 95335 }, { "epoch": 17.49678840154157, "grad_norm": 8.88510694494471e-05, "learning_rate": 4.697139935772421e-07, "loss": 0.0, "num_input_tokens_seen": 205636640, "step": 95340 }, { "epoch": 17.49770600110112, "grad_norm": 0.0002959521661978215, "learning_rate": 4.693752073011398e-07, "loss": 0.0, "num_input_tokens_seen": 205647008, "step": 95345 }, { "epoch": 17.49862360066067, "grad_norm": 4.4129435991635546e-05, "learning_rate": 4.6903653722854157e-07, "loss": 0.0, "num_input_tokens_seen": 205657632, "step": 95350 }, { "epoch": 17.499541200220225, "grad_norm": 0.000789040292147547, "learning_rate": 4.6869798336813264e-07, "loss": 0.0, "num_input_tokens_seen": 205668384, "step": 95355 }, { "epoch": 17.500458799779775, "grad_norm": 0.00010476354509592056, "learning_rate": 4.683595457285989e-07, "loss": 0.0, "num_input_tokens_seen": 205680416, "step": 95360 }, { "epoch": 17.50137639933933, "grad_norm": 8.193904795916751e-05, "learning_rate": 4.680212243186194e-07, "loss": 0.0, "num_input_tokens_seen": 205691328, "step": 95365 }, { "epoch": 17.50229399889888, "grad_norm": 0.00021316239144653082, "learning_rate": 4.6768301914687007e-07, "loss": 0.0, "num_input_tokens_seen": 205701856, "step": 95370 }, { "epoch": 17.50321159845843, "grad_norm": 0.0001453369332011789, "learning_rate": 4.6734493022202845e-07, "loss": 0.0, "num_input_tokens_seen": 205713760, "step": 95375 }, { "epoch": 17.504129198017985, "grad_norm": 7.767165516270325e-05, "learning_rate": 4.6700695755276414e-07, "loss": 0.0, "num_input_tokens_seen": 205723232, "step": 95380 }, { "epoch": 17.50504679757754, "grad_norm": 7.581405225209892e-05, "learning_rate": 4.666691011477448e-07, "loss": 0.0, "num_input_tokens_seen": 205733504, "step": 95385 }, { "epoch": 17.505964397137088, "grad_norm": 6.924098852323368e-05, "learning_rate": 4.663313610156378e-07, "loss": 0.0, "num_input_tokens_seen": 205745120, "step": 95390 }, { "epoch": 17.50688199669664, "grad_norm": 0.0001500906073488295, "learning_rate": 4.659937371651052e-07, "loss": 0.0, "num_input_tokens_seen": 205756256, "step": 95395 }, { "epoch": 17.507799596256195, "grad_norm": 0.0005037439987063408, "learning_rate": 4.656562296048062e-07, "loss": 0.0, "num_input_tokens_seen": 205766848, "step": 95400 }, { "epoch": 17.508717195815745, "grad_norm": 0.0013816216960549355, "learning_rate": 4.6531883834339595e-07, "loss": 0.0, "num_input_tokens_seen": 205778432, "step": 95405 }, { "epoch": 17.509634795375298, "grad_norm": 0.00020196812693029642, "learning_rate": 4.6498156338953047e-07, "loss": 0.0, "num_input_tokens_seen": 205790272, "step": 95410 }, { "epoch": 17.51055239493485, "grad_norm": 8.434243500232697e-05, "learning_rate": 4.646444047518595e-07, "loss": 0.0, "num_input_tokens_seen": 205799808, "step": 95415 }, { "epoch": 17.5114699944944, "grad_norm": 0.0001463747030356899, "learning_rate": 4.643073624390293e-07, "loss": 0.0, "num_input_tokens_seen": 205811776, "step": 95420 }, { "epoch": 17.512387594053955, "grad_norm": 0.0022106384858489037, "learning_rate": 4.639704364596864e-07, "loss": 0.0, "num_input_tokens_seen": 205822304, "step": 95425 }, { "epoch": 17.51330519361351, "grad_norm": 0.0008982167346403003, "learning_rate": 4.636336268224717e-07, "loss": 0.0, "num_input_tokens_seen": 205834816, "step": 95430 }, { "epoch": 17.514222793173058, "grad_norm": 0.0003104872303083539, "learning_rate": 4.632969335360238e-07, "loss": 0.0, "num_input_tokens_seen": 205846080, "step": 95435 }, { "epoch": 17.51514039273261, "grad_norm": 7.12153414497152e-05, "learning_rate": 4.6296035660897744e-07, "loss": 0.0, "num_input_tokens_seen": 205857472, "step": 95440 }, { "epoch": 17.516057992292165, "grad_norm": 0.00012509200314525515, "learning_rate": 4.6262389604996684e-07, "loss": 0.0, "num_input_tokens_seen": 205868800, "step": 95445 }, { "epoch": 17.516975591851715, "grad_norm": 0.003520315047353506, "learning_rate": 4.622875518676212e-07, "loss": 0.0, "num_input_tokens_seen": 205879392, "step": 95450 }, { "epoch": 17.517893191411268, "grad_norm": 0.00011555145465536043, "learning_rate": 4.6195132407056644e-07, "loss": 0.0, "num_input_tokens_seen": 205890240, "step": 95455 }, { "epoch": 17.51881079097082, "grad_norm": 0.0001524361432529986, "learning_rate": 4.6161521266742726e-07, "loss": 0.0, "num_input_tokens_seen": 205901312, "step": 95460 }, { "epoch": 17.51972839053037, "grad_norm": 0.00011311306298011914, "learning_rate": 4.61279217666824e-07, "loss": 0.0, "num_input_tokens_seen": 205912064, "step": 95465 }, { "epoch": 17.520645990089925, "grad_norm": 5.111379505251534e-05, "learning_rate": 4.6094333907737375e-07, "loss": 0.0, "num_input_tokens_seen": 205922560, "step": 95470 }, { "epoch": 17.521563589649478, "grad_norm": 7.7586475526914e-05, "learning_rate": 4.606075769076929e-07, "loss": 0.0, "num_input_tokens_seen": 205933280, "step": 95475 }, { "epoch": 17.522481189209028, "grad_norm": 8.018299558898434e-05, "learning_rate": 4.6027193116639226e-07, "loss": 0.0, "num_input_tokens_seen": 205945056, "step": 95480 }, { "epoch": 17.52339878876858, "grad_norm": 0.000963029742706567, "learning_rate": 4.5993640186208054e-07, "loss": 0.0, "num_input_tokens_seen": 205956096, "step": 95485 }, { "epoch": 17.524316388328135, "grad_norm": 7.596119394293055e-05, "learning_rate": 4.5960098900336256e-07, "loss": 0.0, "num_input_tokens_seen": 205968416, "step": 95490 }, { "epoch": 17.525233987887685, "grad_norm": 9.277708159061149e-05, "learning_rate": 4.5926569259884313e-07, "loss": 0.0, "num_input_tokens_seen": 205979296, "step": 95495 }, { "epoch": 17.526151587447238, "grad_norm": 4.786573117598891e-05, "learning_rate": 4.589305126571214e-07, "loss": 0.0, "num_input_tokens_seen": 205990304, "step": 95500 }, { "epoch": 17.52706918700679, "grad_norm": 0.0003425357863306999, "learning_rate": 4.585954491867933e-07, "loss": 0.0, "num_input_tokens_seen": 206001024, "step": 95505 }, { "epoch": 17.52798678656634, "grad_norm": 0.002283982001245022, "learning_rate": 4.5826050219645425e-07, "loss": 0.0, "num_input_tokens_seen": 206012800, "step": 95510 }, { "epoch": 17.528904386125895, "grad_norm": 5.578143100137822e-05, "learning_rate": 4.579256716946939e-07, "loss": 0.0, "num_input_tokens_seen": 206023456, "step": 95515 }, { "epoch": 17.529821985685448, "grad_norm": 0.00010173207556363195, "learning_rate": 4.5759095769010055e-07, "loss": 0.0, "num_input_tokens_seen": 206032416, "step": 95520 }, { "epoch": 17.530739585244998, "grad_norm": 0.0002776378532871604, "learning_rate": 4.572563601912583e-07, "loss": 0.0, "num_input_tokens_seen": 206043104, "step": 95525 }, { "epoch": 17.53165718480455, "grad_norm": 0.00041794151184149086, "learning_rate": 4.5692187920675093e-07, "loss": 0.0, "num_input_tokens_seen": 206053376, "step": 95530 }, { "epoch": 17.532574784364105, "grad_norm": 0.0005456493236124516, "learning_rate": 4.565875147451559e-07, "loss": 0.0, "num_input_tokens_seen": 206063840, "step": 95535 }, { "epoch": 17.533492383923655, "grad_norm": 0.00019115791656076908, "learning_rate": 4.562532668150493e-07, "loss": 0.0, "num_input_tokens_seen": 206074496, "step": 95540 }, { "epoch": 17.534409983483208, "grad_norm": 0.0006795976078137755, "learning_rate": 4.5591913542500477e-07, "loss": 0.0, "num_input_tokens_seen": 206085728, "step": 95545 }, { "epoch": 17.53532758304276, "grad_norm": 5.874966154806316e-05, "learning_rate": 4.555851205835904e-07, "loss": 0.0, "num_input_tokens_seen": 206095904, "step": 95550 }, { "epoch": 17.53624518260231, "grad_norm": 0.0001510260917712003, "learning_rate": 4.5525122229937547e-07, "loss": 0.0, "num_input_tokens_seen": 206107648, "step": 95555 }, { "epoch": 17.537162782161865, "grad_norm": 7.637956878170371e-05, "learning_rate": 4.549174405809231e-07, "loss": 0.0, "num_input_tokens_seen": 206118848, "step": 95560 }, { "epoch": 17.538080381721418, "grad_norm": 0.00013782299356535077, "learning_rate": 4.545837754367938e-07, "loss": 0.0, "num_input_tokens_seen": 206129184, "step": 95565 }, { "epoch": 17.538997981280968, "grad_norm": 0.00039386001299135387, "learning_rate": 4.5425022687554557e-07, "loss": 0.0, "num_input_tokens_seen": 206140416, "step": 95570 }, { "epoch": 17.53991558084052, "grad_norm": 0.0015660248463973403, "learning_rate": 4.539167949057344e-07, "loss": 0.0, "num_input_tokens_seen": 206149312, "step": 95575 }, { "epoch": 17.540833180400075, "grad_norm": 0.0006944821216166019, "learning_rate": 4.535834795359112e-07, "loss": 0.0, "num_input_tokens_seen": 206159872, "step": 95580 }, { "epoch": 17.541750779959624, "grad_norm": 5.701925329049118e-05, "learning_rate": 4.5325028077462584e-07, "loss": 0.0, "num_input_tokens_seen": 206169664, "step": 95585 }, { "epoch": 17.542668379519178, "grad_norm": 0.00013540763757191598, "learning_rate": 4.529171986304232e-07, "loss": 0.0, "num_input_tokens_seen": 206180576, "step": 95590 }, { "epoch": 17.54358597907873, "grad_norm": 0.000569047115277499, "learning_rate": 4.5258423311184794e-07, "loss": 0.0, "num_input_tokens_seen": 206190624, "step": 95595 }, { "epoch": 17.54450357863828, "grad_norm": 0.012383232824504375, "learning_rate": 4.5225138422743897e-07, "loss": 0.0, "num_input_tokens_seen": 206202240, "step": 95600 }, { "epoch": 17.545421178197834, "grad_norm": 0.00022445987269748002, "learning_rate": 4.519186519857327e-07, "loss": 0.0, "num_input_tokens_seen": 206213024, "step": 95605 }, { "epoch": 17.546338777757388, "grad_norm": 0.00021475565154105425, "learning_rate": 4.5158603639526565e-07, "loss": 0.0, "num_input_tokens_seen": 206222528, "step": 95610 }, { "epoch": 17.547256377316938, "grad_norm": 7.805076893419027e-05, "learning_rate": 4.512535374645666e-07, "loss": 0.0, "num_input_tokens_seen": 206233600, "step": 95615 }, { "epoch": 17.54817397687649, "grad_norm": 0.00018610944971442223, "learning_rate": 4.509211552021647e-07, "loss": 0.0, "num_input_tokens_seen": 206244704, "step": 95620 }, { "epoch": 17.549091576436044, "grad_norm": 0.0003609764971770346, "learning_rate": 4.505888896165839e-07, "loss": 0.0, "num_input_tokens_seen": 206256064, "step": 95625 }, { "epoch": 17.550009175995594, "grad_norm": 0.01646791771054268, "learning_rate": 4.502567407163477e-07, "loss": 0.0, "num_input_tokens_seen": 206267264, "step": 95630 }, { "epoch": 17.550926775555148, "grad_norm": 0.0003970302641391754, "learning_rate": 4.4992470850997506e-07, "loss": 0.0, "num_input_tokens_seen": 206278464, "step": 95635 }, { "epoch": 17.5518443751147, "grad_norm": 6.782275886507705e-05, "learning_rate": 4.495927930059807e-07, "loss": 0.0, "num_input_tokens_seen": 206288896, "step": 95640 }, { "epoch": 17.55276197467425, "grad_norm": 5.146921466803178e-05, "learning_rate": 4.492609942128795e-07, "loss": 0.0, "num_input_tokens_seen": 206299232, "step": 95645 }, { "epoch": 17.553679574233804, "grad_norm": 0.00013441154442261904, "learning_rate": 4.489293121391808e-07, "loss": 0.0, "num_input_tokens_seen": 206310336, "step": 95650 }, { "epoch": 17.554597173793358, "grad_norm": 7.057515904307365e-05, "learning_rate": 4.485977467933911e-07, "loss": 0.0, "num_input_tokens_seen": 206321760, "step": 95655 }, { "epoch": 17.555514773352908, "grad_norm": 4.9909813242265955e-05, "learning_rate": 4.482662981840158e-07, "loss": 0.0, "num_input_tokens_seen": 206332256, "step": 95660 }, { "epoch": 17.55643237291246, "grad_norm": 5.222495019552298e-05, "learning_rate": 4.4793496631955533e-07, "loss": 0.0, "num_input_tokens_seen": 206342720, "step": 95665 }, { "epoch": 17.557349972472014, "grad_norm": 0.0006616803002543747, "learning_rate": 4.4760375120850797e-07, "loss": 0.0, "num_input_tokens_seen": 206354112, "step": 95670 }, { "epoch": 17.558267572031564, "grad_norm": 5.74154983041808e-05, "learning_rate": 4.4727265285936796e-07, "loss": 0.0, "num_input_tokens_seen": 206364224, "step": 95675 }, { "epoch": 17.559185171591118, "grad_norm": 0.00048150724614970386, "learning_rate": 4.4694167128062903e-07, "loss": 0.0, "num_input_tokens_seen": 206375936, "step": 95680 }, { "epoch": 17.56010277115067, "grad_norm": 9.001130820252001e-05, "learning_rate": 4.4661080648078004e-07, "loss": 0.0, "num_input_tokens_seen": 206386656, "step": 95685 }, { "epoch": 17.56102037071022, "grad_norm": 0.000825007155071944, "learning_rate": 4.4628005846830524e-07, "loss": 0.0, "num_input_tokens_seen": 206397408, "step": 95690 }, { "epoch": 17.561937970269774, "grad_norm": 8.632733806734905e-05, "learning_rate": 4.459494272516907e-07, "loss": 0.0, "num_input_tokens_seen": 206407584, "step": 95695 }, { "epoch": 17.562855569829328, "grad_norm": 4.645003718906082e-05, "learning_rate": 4.4561891283941506e-07, "loss": 0.0, "num_input_tokens_seen": 206419168, "step": 95700 }, { "epoch": 17.563773169388877, "grad_norm": 8.583621820434928e-05, "learning_rate": 4.4528851523995496e-07, "loss": 0.0, "num_input_tokens_seen": 206431232, "step": 95705 }, { "epoch": 17.56469076894843, "grad_norm": 0.00019155371410306543, "learning_rate": 4.449582344617859e-07, "loss": 0.0, "num_input_tokens_seen": 206440672, "step": 95710 }, { "epoch": 17.565608368507984, "grad_norm": 0.0006197690381668508, "learning_rate": 4.4462807051337875e-07, "loss": 0.0, "num_input_tokens_seen": 206451072, "step": 95715 }, { "epoch": 17.566525968067534, "grad_norm": 8.170002547558397e-05, "learning_rate": 4.442980234032007e-07, "loss": 0.0, "num_input_tokens_seen": 206462208, "step": 95720 }, { "epoch": 17.567443567627087, "grad_norm": 9.055852569872513e-05, "learning_rate": 4.4396809313971776e-07, "loss": 0.0, "num_input_tokens_seen": 206473120, "step": 95725 }, { "epoch": 17.56836116718664, "grad_norm": 0.0001819075405364856, "learning_rate": 4.4363827973139206e-07, "loss": 0.0, "num_input_tokens_seen": 206484608, "step": 95730 }, { "epoch": 17.56927876674619, "grad_norm": 0.004462835378944874, "learning_rate": 4.433085831866835e-07, "loss": 0.0, "num_input_tokens_seen": 206495008, "step": 95735 }, { "epoch": 17.570196366305744, "grad_norm": 5.6466684327460825e-05, "learning_rate": 4.429790035140469e-07, "loss": 0.0, "num_input_tokens_seen": 206506304, "step": 95740 }, { "epoch": 17.571113965865298, "grad_norm": 5.740531923947856e-05, "learning_rate": 4.4264954072193553e-07, "loss": 0.0, "num_input_tokens_seen": 206516704, "step": 95745 }, { "epoch": 17.572031565424847, "grad_norm": 8.380948565900326e-05, "learning_rate": 4.4232019481880104e-07, "loss": 0.0, "num_input_tokens_seen": 206528320, "step": 95750 }, { "epoch": 17.5729491649844, "grad_norm": 0.00012373745266813785, "learning_rate": 4.4199096581308996e-07, "loss": 0.0, "num_input_tokens_seen": 206539328, "step": 95755 }, { "epoch": 17.573866764543954, "grad_norm": 0.005990367382764816, "learning_rate": 4.416618537132461e-07, "loss": 0.0, "num_input_tokens_seen": 206550560, "step": 95760 }, { "epoch": 17.574784364103504, "grad_norm": 0.00030359462834894657, "learning_rate": 4.4133285852771104e-07, "loss": 0.0, "num_input_tokens_seen": 206561824, "step": 95765 }, { "epoch": 17.575701963663057, "grad_norm": 0.008353213779628277, "learning_rate": 4.4100398026492187e-07, "loss": 0.0, "num_input_tokens_seen": 206572320, "step": 95770 }, { "epoch": 17.57661956322261, "grad_norm": 7.421005284413695e-05, "learning_rate": 4.4067521893331576e-07, "loss": 0.0, "num_input_tokens_seen": 206583008, "step": 95775 }, { "epoch": 17.57753716278216, "grad_norm": 8.102724677883089e-05, "learning_rate": 4.403465745413238e-07, "loss": 0.0, "num_input_tokens_seen": 206593920, "step": 95780 }, { "epoch": 17.578454762341714, "grad_norm": 0.0001466665416955948, "learning_rate": 4.400180470973753e-07, "loss": 0.0, "num_input_tokens_seen": 206605248, "step": 95785 }, { "epoch": 17.579372361901267, "grad_norm": 8.041469845920801e-05, "learning_rate": 4.3968963660989627e-07, "loss": 0.0, "num_input_tokens_seen": 206616480, "step": 95790 }, { "epoch": 17.580289961460817, "grad_norm": 0.0004503158852458, "learning_rate": 4.393613430873106e-07, "loss": 0.0, "num_input_tokens_seen": 206626688, "step": 95795 }, { "epoch": 17.58120756102037, "grad_norm": 0.003142168978229165, "learning_rate": 4.3903316653803816e-07, "loss": 0.0, "num_input_tokens_seen": 206636576, "step": 95800 }, { "epoch": 17.582125160579924, "grad_norm": 0.00013410276733338833, "learning_rate": 4.387051069704962e-07, "loss": 0.0, "num_input_tokens_seen": 206646464, "step": 95805 }, { "epoch": 17.583042760139474, "grad_norm": 8.114692172966897e-05, "learning_rate": 4.3837716439309843e-07, "loss": 0.0, "num_input_tokens_seen": 206656288, "step": 95810 }, { "epoch": 17.583960359699027, "grad_norm": 0.00011707605881383643, "learning_rate": 4.38049338814257e-07, "loss": 0.0, "num_input_tokens_seen": 206667904, "step": 95815 }, { "epoch": 17.58487795925858, "grad_norm": 0.0021199227776378393, "learning_rate": 4.3772163024237923e-07, "loss": 0.0, "num_input_tokens_seen": 206679072, "step": 95820 }, { "epoch": 17.58579555881813, "grad_norm": 7.680620183236897e-05, "learning_rate": 4.373940386858705e-07, "loss": 0.0, "num_input_tokens_seen": 206689600, "step": 95825 }, { "epoch": 17.586713158377684, "grad_norm": 0.00014826511323917657, "learning_rate": 4.370665641531341e-07, "loss": 0.0, "num_input_tokens_seen": 206702432, "step": 95830 }, { "epoch": 17.587630757937237, "grad_norm": 0.0003306121798232198, "learning_rate": 4.3673920665256833e-07, "loss": 0.0, "num_input_tokens_seen": 206711488, "step": 95835 }, { "epoch": 17.588548357496787, "grad_norm": 0.0009439183631911874, "learning_rate": 4.3641196619256867e-07, "loss": 0.0, "num_input_tokens_seen": 206721984, "step": 95840 }, { "epoch": 17.58946595705634, "grad_norm": 7.181840192060918e-05, "learning_rate": 4.360848427815295e-07, "loss": 0.0, "num_input_tokens_seen": 206732032, "step": 95845 }, { "epoch": 17.590383556615894, "grad_norm": 0.00011997637193417177, "learning_rate": 4.3575783642784144e-07, "loss": 0.0, "num_input_tokens_seen": 206744032, "step": 95850 }, { "epoch": 17.591301156175444, "grad_norm": 5.802187661174685e-05, "learning_rate": 4.354309471398904e-07, "loss": 0.0, "num_input_tokens_seen": 206754400, "step": 95855 }, { "epoch": 17.592218755734997, "grad_norm": 6.691445014439523e-05, "learning_rate": 4.351041749260604e-07, "loss": 0.0, "num_input_tokens_seen": 206764992, "step": 95860 }, { "epoch": 17.59313635529455, "grad_norm": 9.071759268408641e-05, "learning_rate": 4.3477751979473457e-07, "loss": 0.0, "num_input_tokens_seen": 206776448, "step": 95865 }, { "epoch": 17.5940539548541, "grad_norm": 8.288471144624054e-05, "learning_rate": 4.3445098175428966e-07, "loss": 0.0, "num_input_tokens_seen": 206788672, "step": 95870 }, { "epoch": 17.594971554413654, "grad_norm": 0.00015337133663706481, "learning_rate": 4.3412456081310006e-07, "loss": 0.0, "num_input_tokens_seen": 206799360, "step": 95875 }, { "epoch": 17.595889153973207, "grad_norm": 0.00024245484382845461, "learning_rate": 4.3379825697954014e-07, "loss": 0.0, "num_input_tokens_seen": 206810464, "step": 95880 }, { "epoch": 17.596806753532757, "grad_norm": 7.682704017497599e-05, "learning_rate": 4.334720702619777e-07, "loss": 0.0, "num_input_tokens_seen": 206821312, "step": 95885 }, { "epoch": 17.59772435309231, "grad_norm": 8.276238804683089e-05, "learning_rate": 4.3314600066877934e-07, "loss": 0.0, "num_input_tokens_seen": 206832160, "step": 95890 }, { "epoch": 17.598641952651864, "grad_norm": 0.00020143692381680012, "learning_rate": 4.3282004820830726e-07, "loss": 0.0, "num_input_tokens_seen": 206842784, "step": 95895 }, { "epoch": 17.599559552211414, "grad_norm": 0.0002508567413315177, "learning_rate": 4.3249421288892313e-07, "loss": 0.0, "num_input_tokens_seen": 206853440, "step": 95900 }, { "epoch": 17.600477151770967, "grad_norm": 7.497220940422267e-05, "learning_rate": 4.3216849471898356e-07, "loss": 0.0, "num_input_tokens_seen": 206865152, "step": 95905 }, { "epoch": 17.60139475133052, "grad_norm": 0.0024778239894658327, "learning_rate": 4.3184289370684195e-07, "loss": 0.0, "num_input_tokens_seen": 206874080, "step": 95910 }, { "epoch": 17.60231235089007, "grad_norm": 0.003220419632270932, "learning_rate": 4.31517409860851e-07, "loss": 0.0, "num_input_tokens_seen": 206885024, "step": 95915 }, { "epoch": 17.603229950449624, "grad_norm": 0.0007291705696843565, "learning_rate": 4.311920431893579e-07, "loss": 0.0, "num_input_tokens_seen": 206897056, "step": 95920 }, { "epoch": 17.604147550009177, "grad_norm": 0.0007988431607373059, "learning_rate": 4.3086679370070664e-07, "loss": 0.0, "num_input_tokens_seen": 206909024, "step": 95925 }, { "epoch": 17.605065149568727, "grad_norm": 0.003355705179274082, "learning_rate": 4.305416614032415e-07, "loss": 0.0, "num_input_tokens_seen": 206920064, "step": 95930 }, { "epoch": 17.60598274912828, "grad_norm": 5.8055928093381226e-05, "learning_rate": 4.302166463053015e-07, "loss": 0.0, "num_input_tokens_seen": 206931296, "step": 95935 }, { "epoch": 17.606900348687834, "grad_norm": 0.00022009035455994308, "learning_rate": 4.29891748415221e-07, "loss": 0.0, "num_input_tokens_seen": 206941568, "step": 95940 }, { "epoch": 17.607817948247384, "grad_norm": 9.160647459793836e-05, "learning_rate": 4.295669677413339e-07, "loss": 0.0, "num_input_tokens_seen": 206951840, "step": 95945 }, { "epoch": 17.608735547806937, "grad_norm": 0.00014533044304698706, "learning_rate": 4.2924230429197135e-07, "loss": 0.0, "num_input_tokens_seen": 206962912, "step": 95950 }, { "epoch": 17.60965314736649, "grad_norm": 6.123750790720806e-05, "learning_rate": 4.2891775807545944e-07, "loss": 0.0, "num_input_tokens_seen": 206974176, "step": 95955 }, { "epoch": 17.61057074692604, "grad_norm": 8.502163109369576e-05, "learning_rate": 4.2859332910012264e-07, "loss": 0.0, "num_input_tokens_seen": 206983872, "step": 95960 }, { "epoch": 17.611488346485594, "grad_norm": 6.43780076643452e-05, "learning_rate": 4.2826901737428093e-07, "loss": 0.0, "num_input_tokens_seen": 206995424, "step": 95965 }, { "epoch": 17.612405946045147, "grad_norm": 0.0001099633882404305, "learning_rate": 4.279448229062544e-07, "loss": 0.0, "num_input_tokens_seen": 207006080, "step": 95970 }, { "epoch": 17.613323545604697, "grad_norm": 0.0008824944961816072, "learning_rate": 4.276207457043569e-07, "loss": 0.0, "num_input_tokens_seen": 207017760, "step": 95975 }, { "epoch": 17.61424114516425, "grad_norm": 0.012425553053617477, "learning_rate": 4.2729678577690117e-07, "loss": 0.0, "num_input_tokens_seen": 207028640, "step": 95980 }, { "epoch": 17.615158744723804, "grad_norm": 0.00024815049255266786, "learning_rate": 4.2697294313219564e-07, "loss": 0.0, "num_input_tokens_seen": 207039456, "step": 95985 }, { "epoch": 17.616076344283353, "grad_norm": 0.00037384944153018296, "learning_rate": 4.266492177785464e-07, "loss": 0.0, "num_input_tokens_seen": 207050176, "step": 95990 }, { "epoch": 17.616993943842907, "grad_norm": 0.00050725182518363, "learning_rate": 4.263256097242557e-07, "loss": 0.0, "num_input_tokens_seen": 207060832, "step": 95995 }, { "epoch": 17.61791154340246, "grad_norm": 5.720621265936643e-05, "learning_rate": 4.260021189776259e-07, "loss": 0.0, "num_input_tokens_seen": 207072544, "step": 96000 }, { "epoch": 17.61882914296201, "grad_norm": 0.00040030598756857216, "learning_rate": 4.2567874554695187e-07, "loss": 0.0, "num_input_tokens_seen": 207083072, "step": 96005 }, { "epoch": 17.619746742521563, "grad_norm": 0.00014905762509442866, "learning_rate": 4.2535548944052816e-07, "loss": 0.0, "num_input_tokens_seen": 207093888, "step": 96010 }, { "epoch": 17.620664342081117, "grad_norm": 7.709173951297998e-05, "learning_rate": 4.2503235066664705e-07, "loss": 0.0, "num_input_tokens_seen": 207104032, "step": 96015 }, { "epoch": 17.621581941640667, "grad_norm": 0.00014547993487212807, "learning_rate": 4.2470932923359575e-07, "loss": 0.0, "num_input_tokens_seen": 207115328, "step": 96020 }, { "epoch": 17.62249954120022, "grad_norm": 0.0001411544653819874, "learning_rate": 4.2438642514965765e-07, "loss": 0.0, "num_input_tokens_seen": 207125248, "step": 96025 }, { "epoch": 17.623417140759774, "grad_norm": 9.498435247223824e-05, "learning_rate": 4.2406363842311727e-07, "loss": 0.0, "num_input_tokens_seen": 207135904, "step": 96030 }, { "epoch": 17.624334740319323, "grad_norm": 6.101809776737355e-05, "learning_rate": 4.2374096906225293e-07, "loss": 0.0, "num_input_tokens_seen": 207146464, "step": 96035 }, { "epoch": 17.625252339878877, "grad_norm": 0.0004986177664250135, "learning_rate": 4.234184170753397e-07, "loss": 0.0, "num_input_tokens_seen": 207157504, "step": 96040 }, { "epoch": 17.62616993943843, "grad_norm": 0.00011808588897110894, "learning_rate": 4.230959824706504e-07, "loss": 0.0, "num_input_tokens_seen": 207168224, "step": 96045 }, { "epoch": 17.62708753899798, "grad_norm": 9.633794252295047e-05, "learning_rate": 4.2277366525645625e-07, "loss": 0.0, "num_input_tokens_seen": 207179840, "step": 96050 }, { "epoch": 17.628005138557533, "grad_norm": 0.00013959367061033845, "learning_rate": 4.224514654410233e-07, "loss": 0.0, "num_input_tokens_seen": 207190848, "step": 96055 }, { "epoch": 17.628922738117087, "grad_norm": 0.0007122121751308441, "learning_rate": 4.221293830326151e-07, "loss": 0.0, "num_input_tokens_seen": 207201696, "step": 96060 }, { "epoch": 17.629840337676637, "grad_norm": 0.0038453389424830675, "learning_rate": 4.2180741803949433e-07, "loss": 0.0025, "num_input_tokens_seen": 207212000, "step": 96065 }, { "epoch": 17.63075793723619, "grad_norm": 6.160176417324692e-05, "learning_rate": 4.214855704699172e-07, "loss": 0.0, "num_input_tokens_seen": 207223104, "step": 96070 }, { "epoch": 17.631675536795743, "grad_norm": 0.0004962426610291004, "learning_rate": 4.211638403321394e-07, "loss": 0.0, "num_input_tokens_seen": 207234368, "step": 96075 }, { "epoch": 17.632593136355293, "grad_norm": 0.00033164897467941046, "learning_rate": 4.208422276344121e-07, "loss": 0.1128, "num_input_tokens_seen": 207244352, "step": 96080 }, { "epoch": 17.633510735914847, "grad_norm": 0.012866643257439137, "learning_rate": 4.205207323849847e-07, "loss": 0.0, "num_input_tokens_seen": 207254880, "step": 96085 }, { "epoch": 17.6344283354744, "grad_norm": 0.00028706315788440406, "learning_rate": 4.2019935459210346e-07, "loss": 0.0, "num_input_tokens_seen": 207265088, "step": 96090 }, { "epoch": 17.63534593503395, "grad_norm": 0.00012077771680196747, "learning_rate": 4.1987809426400963e-07, "loss": 0.0, "num_input_tokens_seen": 207275616, "step": 96095 }, { "epoch": 17.636263534593503, "grad_norm": 0.0010838309535756707, "learning_rate": 4.1955695140894537e-07, "loss": 0.0, "num_input_tokens_seen": 207286080, "step": 96100 }, { "epoch": 17.637181134153057, "grad_norm": 5.577975753112696e-05, "learning_rate": 4.1923592603514586e-07, "loss": 0.0001, "num_input_tokens_seen": 207296768, "step": 96105 }, { "epoch": 17.638098733712607, "grad_norm": 0.00011538283433765173, "learning_rate": 4.1891501815084447e-07, "loss": 0.0, "num_input_tokens_seen": 207307264, "step": 96110 }, { "epoch": 17.63901633327216, "grad_norm": 0.003644290380179882, "learning_rate": 4.1859422776427404e-07, "loss": 0.0, "num_input_tokens_seen": 207317088, "step": 96115 }, { "epoch": 17.639933932831713, "grad_norm": 0.00020510111062321812, "learning_rate": 4.1827355488366085e-07, "loss": 0.0, "num_input_tokens_seen": 207328672, "step": 96120 }, { "epoch": 17.640851532391263, "grad_norm": 0.001069085206836462, "learning_rate": 4.179529995172299e-07, "loss": 0.0056, "num_input_tokens_seen": 207339104, "step": 96125 }, { "epoch": 17.641769131950817, "grad_norm": 7.749360520392656e-05, "learning_rate": 4.1763256167320245e-07, "loss": 0.0, "num_input_tokens_seen": 207349856, "step": 96130 }, { "epoch": 17.64268673151037, "grad_norm": 7.02652323525399e-05, "learning_rate": 4.17312241359798e-07, "loss": 0.0, "num_input_tokens_seen": 207360896, "step": 96135 }, { "epoch": 17.64360433106992, "grad_norm": 0.00040908571099862456, "learning_rate": 4.1699203858523276e-07, "loss": 0.0, "num_input_tokens_seen": 207370528, "step": 96140 }, { "epoch": 17.644521930629473, "grad_norm": 0.000547991250641644, "learning_rate": 4.166719533577174e-07, "loss": 0.0, "num_input_tokens_seen": 207380480, "step": 96145 }, { "epoch": 17.645439530189027, "grad_norm": 8.716424054000527e-05, "learning_rate": 4.1635198568546363e-07, "loss": 0.0, "num_input_tokens_seen": 207391296, "step": 96150 }, { "epoch": 17.646357129748576, "grad_norm": 0.0006370797636918724, "learning_rate": 4.160321355766778e-07, "loss": 0.0, "num_input_tokens_seen": 207401632, "step": 96155 }, { "epoch": 17.64727472930813, "grad_norm": 0.0019591713789850473, "learning_rate": 4.1571240303956204e-07, "loss": 0.0, "num_input_tokens_seen": 207412704, "step": 96160 }, { "epoch": 17.648192328867683, "grad_norm": 0.0001033583321259357, "learning_rate": 4.1539278808231944e-07, "loss": 0.0, "num_input_tokens_seen": 207423520, "step": 96165 }, { "epoch": 17.649109928427233, "grad_norm": 6.940102321095765e-05, "learning_rate": 4.1507329071314604e-07, "loss": 0.0, "num_input_tokens_seen": 207434080, "step": 96170 }, { "epoch": 17.650027527986786, "grad_norm": 0.0004974629264324903, "learning_rate": 4.1475391094023656e-07, "loss": 0.0, "num_input_tokens_seen": 207446016, "step": 96175 }, { "epoch": 17.65094512754634, "grad_norm": 0.00816679373383522, "learning_rate": 4.144346487717832e-07, "loss": 0.0, "num_input_tokens_seen": 207456128, "step": 96180 }, { "epoch": 17.65186272710589, "grad_norm": 0.00011148907651659101, "learning_rate": 4.141155042159739e-07, "loss": 0.0, "num_input_tokens_seen": 207465568, "step": 96185 }, { "epoch": 17.652780326665443, "grad_norm": 6.475589907495305e-05, "learning_rate": 4.137964772809938e-07, "loss": 0.0, "num_input_tokens_seen": 207475808, "step": 96190 }, { "epoch": 17.653697926224996, "grad_norm": 8.857156353769824e-05, "learning_rate": 4.134775679750264e-07, "loss": 0.0001, "num_input_tokens_seen": 207486912, "step": 96195 }, { "epoch": 17.654615525784546, "grad_norm": 0.00018901687872130424, "learning_rate": 4.131587763062511e-07, "loss": 0.0, "num_input_tokens_seen": 207496288, "step": 96200 }, { "epoch": 17.6555331253441, "grad_norm": 0.00015253917081281543, "learning_rate": 4.128401022828449e-07, "loss": 0.0, "num_input_tokens_seen": 207506176, "step": 96205 }, { "epoch": 17.656450724903653, "grad_norm": 9.036046685650945e-05, "learning_rate": 4.1252154591298e-07, "loss": 0.0, "num_input_tokens_seen": 207517216, "step": 96210 }, { "epoch": 17.657368324463203, "grad_norm": 0.00011717575398506597, "learning_rate": 4.122031072048266e-07, "loss": 0.0, "num_input_tokens_seen": 207528256, "step": 96215 }, { "epoch": 17.658285924022756, "grad_norm": 0.002064522122964263, "learning_rate": 4.118847861665537e-07, "loss": 0.0, "num_input_tokens_seen": 207538176, "step": 96220 }, { "epoch": 17.65920352358231, "grad_norm": 0.0004456069727893919, "learning_rate": 4.115665828063259e-07, "loss": 0.0, "num_input_tokens_seen": 207548480, "step": 96225 }, { "epoch": 17.66012112314186, "grad_norm": 6.457029667217284e-05, "learning_rate": 4.112484971323022e-07, "loss": 0.0, "num_input_tokens_seen": 207560032, "step": 96230 }, { "epoch": 17.661038722701413, "grad_norm": 0.00011681932664941996, "learning_rate": 4.1093052915264386e-07, "loss": 0.0, "num_input_tokens_seen": 207570592, "step": 96235 }, { "epoch": 17.661956322260966, "grad_norm": 7.271006325026974e-05, "learning_rate": 4.106126788755049e-07, "loss": 0.0, "num_input_tokens_seen": 207579072, "step": 96240 }, { "epoch": 17.662873921820516, "grad_norm": 9.239281644113362e-05, "learning_rate": 4.102949463090372e-07, "loss": 0.0, "num_input_tokens_seen": 207590208, "step": 96245 }, { "epoch": 17.66379152138007, "grad_norm": 0.00035251001827418804, "learning_rate": 4.099773314613914e-07, "loss": 0.0, "num_input_tokens_seen": 207600768, "step": 96250 }, { "epoch": 17.664709120939623, "grad_norm": 0.00010333785030525178, "learning_rate": 4.096598343407132e-07, "loss": 0.0, "num_input_tokens_seen": 207611392, "step": 96255 }, { "epoch": 17.665626720499173, "grad_norm": 8.635637641418725e-05, "learning_rate": 4.093424549551456e-07, "loss": 0.0, "num_input_tokens_seen": 207623424, "step": 96260 }, { "epoch": 17.666544320058726, "grad_norm": 6.322664557956159e-05, "learning_rate": 4.0902519331282863e-07, "loss": 0.0244, "num_input_tokens_seen": 207634336, "step": 96265 }, { "epoch": 17.66746191961828, "grad_norm": 0.0002850009186659008, "learning_rate": 4.087080494219009e-07, "loss": 0.0, "num_input_tokens_seen": 207645568, "step": 96270 }, { "epoch": 17.66837951917783, "grad_norm": 0.00011721040937118232, "learning_rate": 4.0839102329049585e-07, "loss": 0.0, "num_input_tokens_seen": 207656736, "step": 96275 }, { "epoch": 17.669297118737383, "grad_norm": 7.088686834322289e-05, "learning_rate": 4.0807411492674364e-07, "loss": 0.0, "num_input_tokens_seen": 207667104, "step": 96280 }, { "epoch": 17.670214718296936, "grad_norm": 0.0005486401496455073, "learning_rate": 4.0775732433877504e-07, "loss": 0.0, "num_input_tokens_seen": 207677248, "step": 96285 }, { "epoch": 17.671132317856486, "grad_norm": 0.016967136412858963, "learning_rate": 4.0744065153471293e-07, "loss": 0.0, "num_input_tokens_seen": 207688032, "step": 96290 }, { "epoch": 17.67204991741604, "grad_norm": 4.6536897571058944e-05, "learning_rate": 4.071240965226797e-07, "loss": 0.0, "num_input_tokens_seen": 207697824, "step": 96295 }, { "epoch": 17.672967516975593, "grad_norm": 0.0013953513698652387, "learning_rate": 4.0680765931079614e-07, "loss": 0.0, "num_input_tokens_seen": 207708704, "step": 96300 }, { "epoch": 17.673885116535143, "grad_norm": 8.071955380728468e-05, "learning_rate": 4.064913399071774e-07, "loss": 0.0, "num_input_tokens_seen": 207719712, "step": 96305 }, { "epoch": 17.674802716094696, "grad_norm": 6.831822247477248e-05, "learning_rate": 4.061751383199358e-07, "loss": 0.0, "num_input_tokens_seen": 207730496, "step": 96310 }, { "epoch": 17.67572031565425, "grad_norm": 7.54241700633429e-05, "learning_rate": 4.058590545571817e-07, "loss": 0.0, "num_input_tokens_seen": 207741760, "step": 96315 }, { "epoch": 17.6766379152138, "grad_norm": 0.0005673746927641332, "learning_rate": 4.055430886270234e-07, "loss": 0.0, "num_input_tokens_seen": 207753600, "step": 96320 }, { "epoch": 17.677555514773353, "grad_norm": 0.0013196940999478102, "learning_rate": 4.0522724053756457e-07, "loss": 0.0, "num_input_tokens_seen": 207764256, "step": 96325 }, { "epoch": 17.678473114332906, "grad_norm": 0.427589476108551, "learning_rate": 4.0491151029690423e-07, "loss": 0.0002, "num_input_tokens_seen": 207774944, "step": 96330 }, { "epoch": 17.679390713892456, "grad_norm": 5.72764256503433e-05, "learning_rate": 4.0459589791314313e-07, "loss": 0.0, "num_input_tokens_seen": 207786176, "step": 96335 }, { "epoch": 17.68030831345201, "grad_norm": 0.000695128517691046, "learning_rate": 4.042804033943748e-07, "loss": 0.0, "num_input_tokens_seen": 207796992, "step": 96340 }, { "epoch": 17.681225913011563, "grad_norm": 0.00028373845270834863, "learning_rate": 4.039650267486911e-07, "loss": 0.0, "num_input_tokens_seen": 207807648, "step": 96345 }, { "epoch": 17.682143512571113, "grad_norm": 0.00018104203627444804, "learning_rate": 4.0364976798418166e-07, "loss": 0.0, "num_input_tokens_seen": 207818336, "step": 96350 }, { "epoch": 17.683061112130666, "grad_norm": 6.637486512772739e-05, "learning_rate": 4.0333462710893167e-07, "loss": 0.0, "num_input_tokens_seen": 207829536, "step": 96355 }, { "epoch": 17.68397871169022, "grad_norm": 6.400045094778761e-05, "learning_rate": 4.030196041310247e-07, "loss": 0.0, "num_input_tokens_seen": 207841472, "step": 96360 }, { "epoch": 17.68489631124977, "grad_norm": 0.0002785069227684289, "learning_rate": 4.0270469905853927e-07, "loss": 0.0, "num_input_tokens_seen": 207851584, "step": 96365 }, { "epoch": 17.685813910809323, "grad_norm": 0.00010233138164039701, "learning_rate": 4.0238991189955443e-07, "loss": 0.0, "num_input_tokens_seen": 207862080, "step": 96370 }, { "epoch": 17.686731510368876, "grad_norm": 0.00043499370804056525, "learning_rate": 4.0207524266214213e-07, "loss": 0.0, "num_input_tokens_seen": 207873664, "step": 96375 }, { "epoch": 17.687649109928426, "grad_norm": 7.389272650470957e-05, "learning_rate": 4.017606913543737e-07, "loss": 0.0, "num_input_tokens_seen": 207884352, "step": 96380 }, { "epoch": 17.68856670948798, "grad_norm": 0.00015443810843862593, "learning_rate": 4.0144625798431646e-07, "loss": 0.0, "num_input_tokens_seen": 207896064, "step": 96385 }, { "epoch": 17.689484309047533, "grad_norm": 0.00010048123658634722, "learning_rate": 4.011319425600363e-07, "loss": 0.0, "num_input_tokens_seen": 207907008, "step": 96390 }, { "epoch": 17.690401908607083, "grad_norm": 0.00010889000986935571, "learning_rate": 4.0081774508959394e-07, "loss": 0.0, "num_input_tokens_seen": 207917088, "step": 96395 }, { "epoch": 17.691319508166636, "grad_norm": 0.0013425425859168172, "learning_rate": 4.005036655810485e-07, "loss": 0.0, "num_input_tokens_seen": 207927904, "step": 96400 }, { "epoch": 17.69223710772619, "grad_norm": 7.272383663803339e-05, "learning_rate": 4.001897040424557e-07, "loss": 0.0, "num_input_tokens_seen": 207938592, "step": 96405 }, { "epoch": 17.69315470728574, "grad_norm": 0.0001097676286008209, "learning_rate": 3.99875860481867e-07, "loss": 0.0, "num_input_tokens_seen": 207950112, "step": 96410 }, { "epoch": 17.694072306845293, "grad_norm": 0.0002800262300297618, "learning_rate": 3.995621349073336e-07, "loss": 0.0, "num_input_tokens_seen": 207960192, "step": 96415 }, { "epoch": 17.694989906404846, "grad_norm": 0.0002281627821503207, "learning_rate": 3.9924852732690144e-07, "loss": 0.0, "num_input_tokens_seen": 207969792, "step": 96420 }, { "epoch": 17.695907505964396, "grad_norm": 0.00011876556527568027, "learning_rate": 3.9893503774861396e-07, "loss": 0.0, "num_input_tokens_seen": 207981376, "step": 96425 }, { "epoch": 17.69682510552395, "grad_norm": 0.00047128816368058324, "learning_rate": 3.9862166618051147e-07, "loss": 0.0, "num_input_tokens_seen": 207992384, "step": 96430 }, { "epoch": 17.697742705083503, "grad_norm": 0.005010194610804319, "learning_rate": 3.983084126306319e-07, "loss": 0.0, "num_input_tokens_seen": 208002816, "step": 96435 }, { "epoch": 17.698660304643052, "grad_norm": 0.00016992638120427728, "learning_rate": 3.9799527710701e-07, "loss": 0.0, "num_input_tokens_seen": 208013984, "step": 96440 }, { "epoch": 17.699577904202606, "grad_norm": 0.00012663174129556865, "learning_rate": 3.976822596176766e-07, "loss": 0.0, "num_input_tokens_seen": 208023104, "step": 96445 }, { "epoch": 17.70049550376216, "grad_norm": 8.770192653173581e-05, "learning_rate": 3.9736936017065906e-07, "loss": 0.0, "num_input_tokens_seen": 208035232, "step": 96450 }, { "epoch": 17.70141310332171, "grad_norm": 7.729957724222913e-05, "learning_rate": 3.970565787739855e-07, "loss": 0.0, "num_input_tokens_seen": 208045920, "step": 96455 }, { "epoch": 17.702330702881262, "grad_norm": 0.00014624095638282597, "learning_rate": 3.967439154356767e-07, "loss": 0.0, "num_input_tokens_seen": 208057536, "step": 96460 }, { "epoch": 17.703248302440816, "grad_norm": 0.000689207692630589, "learning_rate": 3.9643137016375064e-07, "loss": 0.0, "num_input_tokens_seen": 208068352, "step": 96465 }, { "epoch": 17.704165902000366, "grad_norm": 0.0011419380316510797, "learning_rate": 3.9611894296622653e-07, "loss": 0.0, "num_input_tokens_seen": 208080320, "step": 96470 }, { "epoch": 17.70508350155992, "grad_norm": 9.139715257333592e-05, "learning_rate": 3.958066338511157e-07, "loss": 0.0, "num_input_tokens_seen": 208091072, "step": 96475 }, { "epoch": 17.706001101119472, "grad_norm": 8.16172978375107e-05, "learning_rate": 3.9549444282642847e-07, "loss": 0.0, "num_input_tokens_seen": 208101472, "step": 96480 }, { "epoch": 17.706918700679022, "grad_norm": 0.00016389196389354765, "learning_rate": 3.9518236990017276e-07, "loss": 0.0, "num_input_tokens_seen": 208112832, "step": 96485 }, { "epoch": 17.707836300238576, "grad_norm": 0.0021873076912015676, "learning_rate": 3.9487041508035284e-07, "loss": 0.0, "num_input_tokens_seen": 208123264, "step": 96490 }, { "epoch": 17.70875389979813, "grad_norm": 0.0002090624038828537, "learning_rate": 3.9455857837496945e-07, "loss": 0.0, "num_input_tokens_seen": 208135008, "step": 96495 }, { "epoch": 17.70967149935768, "grad_norm": 5.220610182732344e-05, "learning_rate": 3.9424685979202013e-07, "loss": 0.0, "num_input_tokens_seen": 208146720, "step": 96500 }, { "epoch": 17.710589098917232, "grad_norm": 9.116564615396783e-05, "learning_rate": 3.939352593395007e-07, "loss": 0.0, "num_input_tokens_seen": 208157056, "step": 96505 }, { "epoch": 17.711506698476786, "grad_norm": 0.0004287625488359481, "learning_rate": 3.9362377702540367e-07, "loss": 0.0, "num_input_tokens_seen": 208169440, "step": 96510 }, { "epoch": 17.712424298036336, "grad_norm": 0.00026303352206014097, "learning_rate": 3.933124128577165e-07, "loss": 0.0, "num_input_tokens_seen": 208180576, "step": 96515 }, { "epoch": 17.71334189759589, "grad_norm": 0.0013393666595220566, "learning_rate": 3.9300116684442724e-07, "loss": 0.0, "num_input_tokens_seen": 208191232, "step": 96520 }, { "epoch": 17.714259497155442, "grad_norm": 0.00012760900426656008, "learning_rate": 3.9269003899351786e-07, "loss": 0.0, "num_input_tokens_seen": 208202624, "step": 96525 }, { "epoch": 17.715177096714992, "grad_norm": 0.0003123026981484145, "learning_rate": 3.9237902931296813e-07, "loss": 0.0, "num_input_tokens_seen": 208213536, "step": 96530 }, { "epoch": 17.716094696274546, "grad_norm": 0.0001359566522296518, "learning_rate": 3.920681378107544e-07, "loss": 0.0, "num_input_tokens_seen": 208224160, "step": 96535 }, { "epoch": 17.7170122958341, "grad_norm": 0.00010266350000165403, "learning_rate": 3.917573644948519e-07, "loss": 0.0, "num_input_tokens_seen": 208235552, "step": 96540 }, { "epoch": 17.71792989539365, "grad_norm": 0.0006841485155746341, "learning_rate": 3.9144670937323104e-07, "loss": 0.0, "num_input_tokens_seen": 208246336, "step": 96545 }, { "epoch": 17.718847494953202, "grad_norm": 8.31499055493623e-05, "learning_rate": 3.911361724538587e-07, "loss": 0.0, "num_input_tokens_seen": 208257152, "step": 96550 }, { "epoch": 17.719765094512756, "grad_norm": 7.621308759553358e-05, "learning_rate": 3.908257537447008e-07, "loss": 0.0, "num_input_tokens_seen": 208267744, "step": 96555 }, { "epoch": 17.720682694072305, "grad_norm": 0.00014854453911539167, "learning_rate": 3.905154532537192e-07, "loss": 0.0, "num_input_tokens_seen": 208277600, "step": 96560 }, { "epoch": 17.72160029363186, "grad_norm": 0.0003632795996963978, "learning_rate": 3.902052709888715e-07, "loss": 0.0, "num_input_tokens_seen": 208288640, "step": 96565 }, { "epoch": 17.722517893191412, "grad_norm": 0.0020617160480469465, "learning_rate": 3.898952069581147e-07, "loss": 0.1501, "num_input_tokens_seen": 208298496, "step": 96570 }, { "epoch": 17.723435492750962, "grad_norm": 7.081147487042472e-05, "learning_rate": 3.8958526116940066e-07, "loss": 0.0, "num_input_tokens_seen": 208308928, "step": 96575 }, { "epoch": 17.724353092310515, "grad_norm": 0.004259288311004639, "learning_rate": 3.892754336306792e-07, "loss": 0.0, "num_input_tokens_seen": 208319520, "step": 96580 }, { "epoch": 17.72527069187007, "grad_norm": 0.017170974984765053, "learning_rate": 3.889657243498962e-07, "loss": 0.0, "num_input_tokens_seen": 208329664, "step": 96585 }, { "epoch": 17.72618829142962, "grad_norm": 0.0011416071793064475, "learning_rate": 3.8865613333499697e-07, "loss": 0.0, "num_input_tokens_seen": 208341056, "step": 96590 }, { "epoch": 17.727105890989172, "grad_norm": 0.0001373245322611183, "learning_rate": 3.8834666059392067e-07, "loss": 0.0, "num_input_tokens_seen": 208351680, "step": 96595 }, { "epoch": 17.728023490548726, "grad_norm": 0.003858291544020176, "learning_rate": 3.8803730613460544e-07, "loss": 0.0, "num_input_tokens_seen": 208362112, "step": 96600 }, { "epoch": 17.728941090108275, "grad_norm": 0.0003499250451568514, "learning_rate": 3.877280699649838e-07, "loss": 0.0, "num_input_tokens_seen": 208372480, "step": 96605 }, { "epoch": 17.72985868966783, "grad_norm": 0.0002853606711141765, "learning_rate": 3.8741895209299053e-07, "loss": 0.0, "num_input_tokens_seen": 208382240, "step": 96610 }, { "epoch": 17.730776289227382, "grad_norm": 0.00019710218475665897, "learning_rate": 3.87109952526552e-07, "loss": 0.0, "num_input_tokens_seen": 208393408, "step": 96615 }, { "epoch": 17.731693888786932, "grad_norm": 6.424835009966046e-05, "learning_rate": 3.8680107127359367e-07, "loss": 0.0, "num_input_tokens_seen": 208403104, "step": 96620 }, { "epoch": 17.732611488346485, "grad_norm": 0.0033094712998718023, "learning_rate": 3.86492308342038e-07, "loss": 0.0, "num_input_tokens_seen": 208413056, "step": 96625 }, { "epoch": 17.73352908790604, "grad_norm": 0.00017546980234328657, "learning_rate": 3.8618366373980364e-07, "loss": 0.0, "num_input_tokens_seen": 208424096, "step": 96630 }, { "epoch": 17.73444668746559, "grad_norm": 6.675221084151417e-05, "learning_rate": 3.858751374748082e-07, "loss": 0.0, "num_input_tokens_seen": 208434528, "step": 96635 }, { "epoch": 17.735364287025142, "grad_norm": 6.341278640320525e-05, "learning_rate": 3.855667295549648e-07, "loss": 0.0, "num_input_tokens_seen": 208445536, "step": 96640 }, { "epoch": 17.736281886584695, "grad_norm": 7.150105375330895e-05, "learning_rate": 3.8525843998818257e-07, "loss": 0.0, "num_input_tokens_seen": 208454912, "step": 96645 }, { "epoch": 17.737199486144245, "grad_norm": 4.657204044633545e-05, "learning_rate": 3.8495026878236805e-07, "loss": 0.0, "num_input_tokens_seen": 208466336, "step": 96650 }, { "epoch": 17.7381170857038, "grad_norm": 6.630723510170355e-05, "learning_rate": 3.846422159454277e-07, "loss": 0.0, "num_input_tokens_seen": 208476704, "step": 96655 }, { "epoch": 17.739034685263352, "grad_norm": 0.00044684464228339493, "learning_rate": 3.8433428148526133e-07, "loss": 0.0, "num_input_tokens_seen": 208487360, "step": 96660 }, { "epoch": 17.739952284822902, "grad_norm": 6.385223241522908e-05, "learning_rate": 3.84026465409767e-07, "loss": 0.0, "num_input_tokens_seen": 208497856, "step": 96665 }, { "epoch": 17.740869884382455, "grad_norm": 0.00032949086744338274, "learning_rate": 3.837187677268389e-07, "loss": 0.0, "num_input_tokens_seen": 208508672, "step": 96670 }, { "epoch": 17.74178748394201, "grad_norm": 6.622568616876379e-05, "learning_rate": 3.834111884443709e-07, "loss": 0.0, "num_input_tokens_seen": 208519296, "step": 96675 }, { "epoch": 17.74270508350156, "grad_norm": 0.00010303466842742637, "learning_rate": 3.831037275702504e-07, "loss": 0.0, "num_input_tokens_seen": 208530464, "step": 96680 }, { "epoch": 17.743622683061112, "grad_norm": 0.0003706391726154834, "learning_rate": 3.827963851123634e-07, "loss": 0.0, "num_input_tokens_seen": 208540480, "step": 96685 }, { "epoch": 17.744540282620665, "grad_norm": 0.00022082505165599287, "learning_rate": 3.824891610785936e-07, "loss": 0.0, "num_input_tokens_seen": 208552160, "step": 96690 }, { "epoch": 17.745457882180215, "grad_norm": 0.00010559307702351362, "learning_rate": 3.821820554768202e-07, "loss": 0.0, "num_input_tokens_seen": 208563296, "step": 96695 }, { "epoch": 17.74637548173977, "grad_norm": 8.318167965626344e-05, "learning_rate": 3.8187506831491973e-07, "loss": 0.0, "num_input_tokens_seen": 208574432, "step": 96700 }, { "epoch": 17.747293081299322, "grad_norm": 0.0001751506351865828, "learning_rate": 3.815681996007664e-07, "loss": 0.0, "num_input_tokens_seen": 208584800, "step": 96705 }, { "epoch": 17.74821068085887, "grad_norm": 5.80147207074333e-05, "learning_rate": 3.812614493422312e-07, "loss": 0.0, "num_input_tokens_seen": 208595904, "step": 96710 }, { "epoch": 17.749128280418425, "grad_norm": 0.00041396592860110104, "learning_rate": 3.809548175471817e-07, "loss": 0.0, "num_input_tokens_seen": 208605792, "step": 96715 }, { "epoch": 17.75004587997798, "grad_norm": 0.0007523149834014475, "learning_rate": 3.8064830422348154e-07, "loss": 0.0, "num_input_tokens_seen": 208617472, "step": 96720 }, { "epoch": 17.75096347953753, "grad_norm": 0.00017239124281331897, "learning_rate": 3.803419093789934e-07, "loss": 0.0, "num_input_tokens_seen": 208628864, "step": 96725 }, { "epoch": 17.751881079097082, "grad_norm": 0.00011420913506299257, "learning_rate": 3.800356330215754e-07, "loss": 0.0, "num_input_tokens_seen": 208639488, "step": 96730 }, { "epoch": 17.752798678656635, "grad_norm": 0.00042218060116283596, "learning_rate": 3.7972947515908245e-07, "loss": 0.0, "num_input_tokens_seen": 208648768, "step": 96735 }, { "epoch": 17.753716278216185, "grad_norm": 0.00017232946993317455, "learning_rate": 3.7942343579936867e-07, "loss": 0.0, "num_input_tokens_seen": 208659072, "step": 96740 }, { "epoch": 17.75463387777574, "grad_norm": 0.00028726120945066214, "learning_rate": 3.7911751495028235e-07, "loss": 0.0032, "num_input_tokens_seen": 208668832, "step": 96745 }, { "epoch": 17.755551477335292, "grad_norm": 0.0003758046659640968, "learning_rate": 3.788117126196694e-07, "loss": 0.0, "num_input_tokens_seen": 208680416, "step": 96750 }, { "epoch": 17.75646907689484, "grad_norm": 0.00010125894914381206, "learning_rate": 3.785060288153747e-07, "loss": 0.0, "num_input_tokens_seen": 208690976, "step": 96755 }, { "epoch": 17.757386676454395, "grad_norm": 0.00012910639634355903, "learning_rate": 3.782004635452374e-07, "loss": 0.0, "num_input_tokens_seen": 208702048, "step": 96760 }, { "epoch": 17.75830427601395, "grad_norm": 35.19396209716797, "learning_rate": 3.778950168170953e-07, "loss": 0.1221, "num_input_tokens_seen": 208712704, "step": 96765 }, { "epoch": 17.7592218755735, "grad_norm": 0.00010141224629478529, "learning_rate": 3.7758968863878144e-07, "loss": 0.0, "num_input_tokens_seen": 208722944, "step": 96770 }, { "epoch": 17.76013947513305, "grad_norm": 0.00010577518696663901, "learning_rate": 3.7728447901812904e-07, "loss": 0.0, "num_input_tokens_seen": 208733056, "step": 96775 }, { "epoch": 17.761057074692605, "grad_norm": 9.6429321274627e-05, "learning_rate": 3.7697938796296516e-07, "loss": 0.0, "num_input_tokens_seen": 208743456, "step": 96780 }, { "epoch": 17.761974674252155, "grad_norm": 0.0002887833397835493, "learning_rate": 3.7667441548111363e-07, "loss": 0.0, "num_input_tokens_seen": 208755008, "step": 96785 }, { "epoch": 17.76289227381171, "grad_norm": 5.47281997569371e-05, "learning_rate": 3.763695615803992e-07, "loss": 0.0, "num_input_tokens_seen": 208764928, "step": 96790 }, { "epoch": 17.76380987337126, "grad_norm": 0.003796589095145464, "learning_rate": 3.760648262686395e-07, "loss": 0.0, "num_input_tokens_seen": 208775648, "step": 96795 }, { "epoch": 17.76472747293081, "grad_norm": 0.00028620968805626035, "learning_rate": 3.7576020955364947e-07, "loss": 0.0, "num_input_tokens_seen": 208785696, "step": 96800 }, { "epoch": 17.765645072490365, "grad_norm": 0.00024505078908987343, "learning_rate": 3.754557114432439e-07, "loss": 0.0, "num_input_tokens_seen": 208794944, "step": 96805 }, { "epoch": 17.76656267204992, "grad_norm": 0.00014772557187825441, "learning_rate": 3.751513319452321e-07, "loss": 0.0, "num_input_tokens_seen": 208805984, "step": 96810 }, { "epoch": 17.767480271609468, "grad_norm": 0.0020570342894643545, "learning_rate": 3.748470710674207e-07, "loss": 0.0, "num_input_tokens_seen": 208817856, "step": 96815 }, { "epoch": 17.76839787116902, "grad_norm": 5.759616760769859e-05, "learning_rate": 3.745429288176139e-07, "loss": 0.0, "num_input_tokens_seen": 208827872, "step": 96820 }, { "epoch": 17.769315470728575, "grad_norm": 0.00010536487388890237, "learning_rate": 3.7423890520361104e-07, "loss": 0.0, "num_input_tokens_seen": 208840000, "step": 96825 }, { "epoch": 17.770233070288125, "grad_norm": 0.0001014285662677139, "learning_rate": 3.7393500023321204e-07, "loss": 0.0, "num_input_tokens_seen": 208852000, "step": 96830 }, { "epoch": 17.771150669847678, "grad_norm": 7.303278107428923e-05, "learning_rate": 3.7363121391421007e-07, "loss": 0.0, "num_input_tokens_seen": 208863392, "step": 96835 }, { "epoch": 17.77206826940723, "grad_norm": 0.0002620881423354149, "learning_rate": 3.7332754625439784e-07, "loss": 0.0, "num_input_tokens_seen": 208874368, "step": 96840 }, { "epoch": 17.77298586896678, "grad_norm": 0.00015955544949974865, "learning_rate": 3.730239972615629e-07, "loss": 0.0, "num_input_tokens_seen": 208886400, "step": 96845 }, { "epoch": 17.773903468526335, "grad_norm": 6.983285857131705e-05, "learning_rate": 3.727205669434908e-07, "loss": 0.0, "num_input_tokens_seen": 208899328, "step": 96850 }, { "epoch": 17.77482106808589, "grad_norm": 0.0007268719491548836, "learning_rate": 3.7241725530796524e-07, "loss": 0.0, "num_input_tokens_seen": 208909120, "step": 96855 }, { "epoch": 17.775738667645438, "grad_norm": 0.00013474920706357807, "learning_rate": 3.7211406236276503e-07, "loss": 0.0, "num_input_tokens_seen": 208919456, "step": 96860 }, { "epoch": 17.77665626720499, "grad_norm": 0.00034214527113363147, "learning_rate": 3.718109881156662e-07, "loss": 0.0, "num_input_tokens_seen": 208931424, "step": 96865 }, { "epoch": 17.777573866764545, "grad_norm": 0.00011637504940154031, "learning_rate": 3.715080325744419e-07, "loss": 0.0, "num_input_tokens_seen": 208941568, "step": 96870 }, { "epoch": 17.778491466324095, "grad_norm": 7.518725760746747e-05, "learning_rate": 3.7120519574686377e-07, "loss": 0.0, "num_input_tokens_seen": 208952256, "step": 96875 }, { "epoch": 17.779409065883648, "grad_norm": 0.0020924995187669992, "learning_rate": 3.709024776406983e-07, "loss": 0.0, "num_input_tokens_seen": 208963040, "step": 96880 }, { "epoch": 17.7803266654432, "grad_norm": 0.00011287706001894549, "learning_rate": 3.7059987826370934e-07, "loss": 0.0, "num_input_tokens_seen": 208973984, "step": 96885 }, { "epoch": 17.78124426500275, "grad_norm": 0.0036101362202316523, "learning_rate": 3.7029739762365904e-07, "loss": 0.0, "num_input_tokens_seen": 208985280, "step": 96890 }, { "epoch": 17.782161864562305, "grad_norm": 0.000184646385605447, "learning_rate": 3.6999503572830555e-07, "loss": 0.0, "num_input_tokens_seen": 208995648, "step": 96895 }, { "epoch": 17.783079464121858, "grad_norm": 9.004588355310261e-05, "learning_rate": 3.6969279258540325e-07, "loss": 0.0, "num_input_tokens_seen": 209005792, "step": 96900 }, { "epoch": 17.783997063681408, "grad_norm": 0.00025085764355026186, "learning_rate": 3.6939066820270376e-07, "loss": 0.0, "num_input_tokens_seen": 209016384, "step": 96905 }, { "epoch": 17.78491466324096, "grad_norm": 0.0012606964446604252, "learning_rate": 3.690886625879575e-07, "loss": 0.0, "num_input_tokens_seen": 209026336, "step": 96910 }, { "epoch": 17.785832262800515, "grad_norm": 0.000305872003082186, "learning_rate": 3.6878677574890996e-07, "loss": 0.0, "num_input_tokens_seen": 209037248, "step": 96915 }, { "epoch": 17.786749862360065, "grad_norm": 0.003613763954490423, "learning_rate": 3.6848500769330275e-07, "loss": 0.0, "num_input_tokens_seen": 209047328, "step": 96920 }, { "epoch": 17.787667461919618, "grad_norm": 0.00017662443860899657, "learning_rate": 3.68183358428878e-07, "loss": 0.0, "num_input_tokens_seen": 209057664, "step": 96925 }, { "epoch": 17.78858506147917, "grad_norm": 0.0004190348263364285, "learning_rate": 3.678818279633717e-07, "loss": 0.0, "num_input_tokens_seen": 209067744, "step": 96930 }, { "epoch": 17.78950266103872, "grad_norm": 0.0014684582129120827, "learning_rate": 3.67580416304516e-07, "loss": 0.0, "num_input_tokens_seen": 209077664, "step": 96935 }, { "epoch": 17.790420260598275, "grad_norm": 4.0425893530482426e-05, "learning_rate": 3.672791234600442e-07, "loss": 0.0, "num_input_tokens_seen": 209088640, "step": 96940 }, { "epoch": 17.791337860157828, "grad_norm": 6.419766577892005e-05, "learning_rate": 3.6697794943768293e-07, "loss": 0.0, "num_input_tokens_seen": 209101120, "step": 96945 }, { "epoch": 17.792255459717378, "grad_norm": 0.0001908565900521353, "learning_rate": 3.666768942451565e-07, "loss": 0.0, "num_input_tokens_seen": 209112608, "step": 96950 }, { "epoch": 17.79317305927693, "grad_norm": 0.0003998707979917526, "learning_rate": 3.6637595789018597e-07, "loss": 0.0, "num_input_tokens_seen": 209123456, "step": 96955 }, { "epoch": 17.794090658836485, "grad_norm": 0.00017483034753240645, "learning_rate": 3.660751403804913e-07, "loss": 0.0, "num_input_tokens_seen": 209133760, "step": 96960 }, { "epoch": 17.795008258396035, "grad_norm": 0.0322592668235302, "learning_rate": 3.657744417237874e-07, "loss": 0.0, "num_input_tokens_seen": 209145536, "step": 96965 }, { "epoch": 17.795925857955588, "grad_norm": 0.0008841616217978299, "learning_rate": 3.6547386192778587e-07, "loss": 0.0, "num_input_tokens_seen": 209155936, "step": 96970 }, { "epoch": 17.79684345751514, "grad_norm": 0.0003522981714922935, "learning_rate": 3.651734010001978e-07, "loss": 0.0, "num_input_tokens_seen": 209166400, "step": 96975 }, { "epoch": 17.79776105707469, "grad_norm": 6.353572098305449e-05, "learning_rate": 3.648730589487287e-07, "loss": 0.0, "num_input_tokens_seen": 209177920, "step": 96980 }, { "epoch": 17.798678656634245, "grad_norm": 0.0011285371147096157, "learning_rate": 3.645728357810818e-07, "loss": 0.0, "num_input_tokens_seen": 209188896, "step": 96985 }, { "epoch": 17.799596256193798, "grad_norm": 0.0041661509312689304, "learning_rate": 3.6427273150495655e-07, "loss": 0.0, "num_input_tokens_seen": 209199456, "step": 96990 }, { "epoch": 17.800513855753348, "grad_norm": 0.00011561903374968097, "learning_rate": 3.639727461280518e-07, "loss": 0.0, "num_input_tokens_seen": 209210816, "step": 96995 }, { "epoch": 17.8014314553129, "grad_norm": 5.391488957684487e-05, "learning_rate": 3.636728796580613e-07, "loss": 0.0, "num_input_tokens_seen": 209221728, "step": 97000 }, { "epoch": 17.802349054872455, "grad_norm": 9.182302164845169e-05, "learning_rate": 3.6337313210267456e-07, "loss": 0.0, "num_input_tokens_seen": 209231648, "step": 97005 }, { "epoch": 17.803266654432004, "grad_norm": 0.002980755642056465, "learning_rate": 3.63073503469582e-07, "loss": 0.0284, "num_input_tokens_seen": 209242880, "step": 97010 }, { "epoch": 17.804184253991558, "grad_norm": 0.0006942884065210819, "learning_rate": 3.6277399376646703e-07, "loss": 0.0, "num_input_tokens_seen": 209253344, "step": 97015 }, { "epoch": 17.80510185355111, "grad_norm": 6.178919284138829e-05, "learning_rate": 3.624746030010112e-07, "loss": 0.0, "num_input_tokens_seen": 209264960, "step": 97020 }, { "epoch": 17.80601945311066, "grad_norm": 9.077785216504708e-05, "learning_rate": 3.6217533118089567e-07, "loss": 0.0, "num_input_tokens_seen": 209275552, "step": 97025 }, { "epoch": 17.806937052670214, "grad_norm": 0.0016510607674717903, "learning_rate": 3.6187617831379475e-07, "loss": 0.0, "num_input_tokens_seen": 209287392, "step": 97030 }, { "epoch": 17.807854652229768, "grad_norm": 0.00011707949306583032, "learning_rate": 3.6157714440738124e-07, "loss": 0.0, "num_input_tokens_seen": 209298432, "step": 97035 }, { "epoch": 17.808772251789318, "grad_norm": 0.0032145301811397076, "learning_rate": 3.612782294693251e-07, "loss": 0.0, "num_input_tokens_seen": 209308192, "step": 97040 }, { "epoch": 17.80968985134887, "grad_norm": 0.00030723438248969615, "learning_rate": 3.609794335072925e-07, "loss": 0.0, "num_input_tokens_seen": 209318880, "step": 97045 }, { "epoch": 17.810607450908424, "grad_norm": 0.00012058814900228754, "learning_rate": 3.6068075652894774e-07, "loss": 0.0, "num_input_tokens_seen": 209328032, "step": 97050 }, { "epoch": 17.811525050467974, "grad_norm": 0.00043215524055995047, "learning_rate": 3.60382198541952e-07, "loss": 0.0, "num_input_tokens_seen": 209337344, "step": 97055 }, { "epoch": 17.812442650027528, "grad_norm": 9.534207492833957e-05, "learning_rate": 3.600837595539619e-07, "loss": 0.0, "num_input_tokens_seen": 209347168, "step": 97060 }, { "epoch": 17.81336024958708, "grad_norm": 4.909822382614948e-05, "learning_rate": 3.597854395726319e-07, "loss": 0.0, "num_input_tokens_seen": 209358560, "step": 97065 }, { "epoch": 17.81427784914663, "grad_norm": 0.0004445904341991991, "learning_rate": 3.594872386056131e-07, "loss": 0.0, "num_input_tokens_seen": 209369312, "step": 97070 }, { "epoch": 17.815195448706184, "grad_norm": 9.148487151833251e-05, "learning_rate": 3.5918915666055487e-07, "loss": 0.0, "num_input_tokens_seen": 209379616, "step": 97075 }, { "epoch": 17.816113048265738, "grad_norm": 0.011565395630896091, "learning_rate": 3.5889119374510285e-07, "loss": 0.0, "num_input_tokens_seen": 209390656, "step": 97080 }, { "epoch": 17.817030647825288, "grad_norm": 0.00011476130021037534, "learning_rate": 3.585933498668981e-07, "loss": 0.0, "num_input_tokens_seen": 209400320, "step": 97085 }, { "epoch": 17.81794824738484, "grad_norm": 0.00019266361778136343, "learning_rate": 3.582956250335801e-07, "loss": 0.0, "num_input_tokens_seen": 209412192, "step": 97090 }, { "epoch": 17.818865846944394, "grad_norm": 0.030935421586036682, "learning_rate": 3.5799801925278546e-07, "loss": 0.0, "num_input_tokens_seen": 209423936, "step": 97095 }, { "epoch": 17.819783446503944, "grad_norm": 0.02027043327689171, "learning_rate": 3.5770053253214755e-07, "loss": 0.0, "num_input_tokens_seen": 209433760, "step": 97100 }, { "epoch": 17.820701046063498, "grad_norm": 0.00017119258700404316, "learning_rate": 3.5740316487929527e-07, "loss": 0.0, "num_input_tokens_seen": 209444160, "step": 97105 }, { "epoch": 17.82161864562305, "grad_norm": 0.000415062066167593, "learning_rate": 3.571059163018575e-07, "loss": 0.0, "num_input_tokens_seen": 209455168, "step": 97110 }, { "epoch": 17.8225362451826, "grad_norm": 6.365477020153776e-05, "learning_rate": 3.5680878680745657e-07, "loss": 0.0, "num_input_tokens_seen": 209464352, "step": 97115 }, { "epoch": 17.823453844742154, "grad_norm": 7.654340151930228e-05, "learning_rate": 3.565117764037146e-07, "loss": 0.0, "num_input_tokens_seen": 209476128, "step": 97120 }, { "epoch": 17.824371444301708, "grad_norm": 7.219702092697844e-05, "learning_rate": 3.5621488509824775e-07, "loss": 0.0, "num_input_tokens_seen": 209487104, "step": 97125 }, { "epoch": 17.825289043861257, "grad_norm": 6.047575152479112e-05, "learning_rate": 3.5591811289867274e-07, "loss": 0.0, "num_input_tokens_seen": 209499104, "step": 97130 }, { "epoch": 17.82620664342081, "grad_norm": 0.00840795785188675, "learning_rate": 3.556214598126001e-07, "loss": 0.0, "num_input_tokens_seen": 209510912, "step": 97135 }, { "epoch": 17.827124242980364, "grad_norm": 0.0008849994046613574, "learning_rate": 3.553249258476382e-07, "loss": 0.0, "num_input_tokens_seen": 209521312, "step": 97140 }, { "epoch": 17.828041842539914, "grad_norm": 0.0004769597144331783, "learning_rate": 3.5502851101139436e-07, "loss": 0.0, "num_input_tokens_seen": 209531968, "step": 97145 }, { "epoch": 17.828959442099467, "grad_norm": 7.626091246493161e-05, "learning_rate": 3.5473221531147015e-07, "loss": 0.0, "num_input_tokens_seen": 209543680, "step": 97150 }, { "epoch": 17.82987704165902, "grad_norm": 4.9098052978515625, "learning_rate": 3.5443603875546404e-07, "loss": 0.019, "num_input_tokens_seen": 209554272, "step": 97155 }, { "epoch": 17.83079464121857, "grad_norm": 8.542931755073369e-05, "learning_rate": 3.5413998135097493e-07, "loss": 0.0, "num_input_tokens_seen": 209565280, "step": 97160 }, { "epoch": 17.831712240778124, "grad_norm": 0.009188145399093628, "learning_rate": 3.538440431055945e-07, "loss": 0.0, "num_input_tokens_seen": 209575104, "step": 97165 }, { "epoch": 17.832629840337678, "grad_norm": 0.009496723301708698, "learning_rate": 3.5354822402691336e-07, "loss": 0.0, "num_input_tokens_seen": 209586464, "step": 97170 }, { "epoch": 17.833547439897227, "grad_norm": 0.00047181019908748567, "learning_rate": 3.532525241225182e-07, "loss": 0.0, "num_input_tokens_seen": 209598336, "step": 97175 }, { "epoch": 17.83446503945678, "grad_norm": 9.869041241472587e-05, "learning_rate": 3.5295694339999467e-07, "loss": 0.0, "num_input_tokens_seen": 209608416, "step": 97180 }, { "epoch": 17.835382639016334, "grad_norm": 0.00015119531599339098, "learning_rate": 3.526614818669233e-07, "loss": 0.0, "num_input_tokens_seen": 209617792, "step": 97185 }, { "epoch": 17.836300238575884, "grad_norm": 0.0004277693515177816, "learning_rate": 3.5236613953088197e-07, "loss": 0.0, "num_input_tokens_seen": 209628992, "step": 97190 }, { "epoch": 17.837217838135437, "grad_norm": 0.002350120572373271, "learning_rate": 3.520709163994462e-07, "loss": 0.0, "num_input_tokens_seen": 209639776, "step": 97195 }, { "epoch": 17.83813543769499, "grad_norm": 4.516172885894775, "learning_rate": 3.517758124801879e-07, "loss": 0.0006, "num_input_tokens_seen": 209650816, "step": 97200 }, { "epoch": 17.83905303725454, "grad_norm": 0.00026469709700904787, "learning_rate": 3.514808277806753e-07, "loss": 0.0, "num_input_tokens_seen": 209662336, "step": 97205 }, { "epoch": 17.839970636814094, "grad_norm": 0.0001640146365389228, "learning_rate": 3.5118596230847513e-07, "loss": 0.0, "num_input_tokens_seen": 209673248, "step": 97210 }, { "epoch": 17.840888236373647, "grad_norm": 0.0003544321225490421, "learning_rate": 3.508912160711508e-07, "loss": 0.0, "num_input_tokens_seen": 209684736, "step": 97215 }, { "epoch": 17.841805835933197, "grad_norm": 0.004737798124551773, "learning_rate": 3.505965890762608e-07, "loss": 0.0, "num_input_tokens_seen": 209695488, "step": 97220 }, { "epoch": 17.84272343549275, "grad_norm": 0.001423993264324963, "learning_rate": 3.5030208133136176e-07, "loss": 0.0, "num_input_tokens_seen": 209706560, "step": 97225 }, { "epoch": 17.843641035052304, "grad_norm": 0.00022025474754627794, "learning_rate": 3.500076928440088e-07, "loss": 0.0, "num_input_tokens_seen": 209718240, "step": 97230 }, { "epoch": 17.844558634611854, "grad_norm": 0.0066152107901871204, "learning_rate": 3.497134236217514e-07, "loss": 0.0, "num_input_tokens_seen": 209729440, "step": 97235 }, { "epoch": 17.845476234171407, "grad_norm": 0.00016124396643135697, "learning_rate": 3.494192736721369e-07, "loss": 0.0, "num_input_tokens_seen": 209740352, "step": 97240 }, { "epoch": 17.84639383373096, "grad_norm": 8.081132546067238e-05, "learning_rate": 3.491252430027109e-07, "loss": 0.0, "num_input_tokens_seen": 209751360, "step": 97245 }, { "epoch": 17.84731143329051, "grad_norm": 7.16097783879377e-05, "learning_rate": 3.488313316210146e-07, "loss": 0.0, "num_input_tokens_seen": 209762784, "step": 97250 }, { "epoch": 17.848229032850064, "grad_norm": 8.144987077685073e-05, "learning_rate": 3.485375395345858e-07, "loss": 0.0, "num_input_tokens_seen": 209774048, "step": 97255 }, { "epoch": 17.849146632409617, "grad_norm": 0.011247582733631134, "learning_rate": 3.4824386675095966e-07, "loss": 0.0, "num_input_tokens_seen": 209784160, "step": 97260 }, { "epoch": 17.850064231969167, "grad_norm": 0.00011071820335928351, "learning_rate": 3.4795031327766906e-07, "loss": 0.0, "num_input_tokens_seen": 209795584, "step": 97265 }, { "epoch": 17.85098183152872, "grad_norm": 0.0006651945877820253, "learning_rate": 3.4765687912224177e-07, "loss": 0.0, "num_input_tokens_seen": 209807136, "step": 97270 }, { "epoch": 17.851899431088274, "grad_norm": 9.21626269700937e-05, "learning_rate": 3.473635642922063e-07, "loss": 0.0, "num_input_tokens_seen": 209819072, "step": 97275 }, { "epoch": 17.852817030647824, "grad_norm": 0.00024978508008643985, "learning_rate": 3.4707036879508437e-07, "loss": 0.0, "num_input_tokens_seen": 209829824, "step": 97280 }, { "epoch": 17.853734630207377, "grad_norm": 0.00010247808677377179, "learning_rate": 3.467772926383961e-07, "loss": 0.0, "num_input_tokens_seen": 209840448, "step": 97285 }, { "epoch": 17.85465222976693, "grad_norm": 7.715321407886222e-05, "learning_rate": 3.4648433582965767e-07, "loss": 0.0, "num_input_tokens_seen": 209849504, "step": 97290 }, { "epoch": 17.85556982932648, "grad_norm": 0.0007159450906328857, "learning_rate": 3.461914983763842e-07, "loss": 0.0, "num_input_tokens_seen": 209859456, "step": 97295 }, { "epoch": 17.856487428886034, "grad_norm": 4.0536837332183495e-05, "learning_rate": 3.4589878028608635e-07, "loss": 0.0, "num_input_tokens_seen": 209869824, "step": 97300 }, { "epoch": 17.857405028445587, "grad_norm": 9.864571620710194e-05, "learning_rate": 3.4560618156627144e-07, "loss": 0.0974, "num_input_tokens_seen": 209881216, "step": 97305 }, { "epoch": 17.858322628005137, "grad_norm": 0.0017022069077938795, "learning_rate": 3.453137022244435e-07, "loss": 0.0, "num_input_tokens_seen": 209891232, "step": 97310 }, { "epoch": 17.85924022756469, "grad_norm": 6.051846867194399e-05, "learning_rate": 3.450213422681059e-07, "loss": 0.0, "num_input_tokens_seen": 209902560, "step": 97315 }, { "epoch": 17.860157827124244, "grad_norm": 0.00023361630155704916, "learning_rate": 3.4472910170475604e-07, "loss": 0.0, "num_input_tokens_seen": 209911456, "step": 97320 }, { "epoch": 17.861075426683794, "grad_norm": 0.00012957416765857488, "learning_rate": 3.444369805418896e-07, "loss": 0.0, "num_input_tokens_seen": 209922080, "step": 97325 }, { "epoch": 17.861993026243347, "grad_norm": 0.0005904855206608772, "learning_rate": 3.441449787869994e-07, "loss": 0.0, "num_input_tokens_seen": 209932064, "step": 97330 }, { "epoch": 17.8629106258029, "grad_norm": 0.00016710563795641065, "learning_rate": 3.438530964475745e-07, "loss": 0.0, "num_input_tokens_seen": 209943008, "step": 97335 }, { "epoch": 17.86382822536245, "grad_norm": 6.556774314958602e-05, "learning_rate": 3.4356133353110057e-07, "loss": 0.0, "num_input_tokens_seen": 209953536, "step": 97340 }, { "epoch": 17.864745824922004, "grad_norm": 6.673853931715712e-05, "learning_rate": 3.432696900450627e-07, "loss": 0.0, "num_input_tokens_seen": 209965984, "step": 97345 }, { "epoch": 17.865663424481557, "grad_norm": 6.259714427869767e-05, "learning_rate": 3.4297816599693944e-07, "loss": 0.0, "num_input_tokens_seen": 209976640, "step": 97350 }, { "epoch": 17.866581024041107, "grad_norm": 0.00031238317023962736, "learning_rate": 3.426867613942092e-07, "loss": 0.0, "num_input_tokens_seen": 209987552, "step": 97355 }, { "epoch": 17.86749862360066, "grad_norm": 7.082899537635967e-05, "learning_rate": 3.4239547624434375e-07, "loss": 0.0, "num_input_tokens_seen": 209997760, "step": 97360 }, { "epoch": 17.868416223160214, "grad_norm": 0.00011887144501088187, "learning_rate": 3.421043105548172e-07, "loss": 0.0, "num_input_tokens_seen": 210008736, "step": 97365 }, { "epoch": 17.869333822719764, "grad_norm": 0.0017369022825732827, "learning_rate": 3.418132643330957e-07, "loss": 0.0, "num_input_tokens_seen": 210019584, "step": 97370 }, { "epoch": 17.870251422279317, "grad_norm": 0.0002554287784732878, "learning_rate": 3.4152233758664386e-07, "loss": 0.0, "num_input_tokens_seen": 210031360, "step": 97375 }, { "epoch": 17.87116902183887, "grad_norm": 0.00040063029155135155, "learning_rate": 3.412315303229247e-07, "loss": 0.0, "num_input_tokens_seen": 210041344, "step": 97380 }, { "epoch": 17.87208662139842, "grad_norm": 0.002212179359048605, "learning_rate": 3.4094084254939596e-07, "loss": 0.0, "num_input_tokens_seen": 210052832, "step": 97385 }, { "epoch": 17.873004220957974, "grad_norm": 9.41184043767862e-05, "learning_rate": 3.406502742735135e-07, "loss": 0.0, "num_input_tokens_seen": 210064128, "step": 97390 }, { "epoch": 17.873921820517527, "grad_norm": 0.00031664205016568303, "learning_rate": 3.4035982550273074e-07, "loss": 0.0, "num_input_tokens_seen": 210075072, "step": 97395 }, { "epoch": 17.874839420077077, "grad_norm": 0.00011976598761975765, "learning_rate": 3.400694962444967e-07, "loss": 0.0, "num_input_tokens_seen": 210085152, "step": 97400 }, { "epoch": 17.87575701963663, "grad_norm": 0.0008835400803945959, "learning_rate": 3.3977928650625766e-07, "loss": 0.0, "num_input_tokens_seen": 210096224, "step": 97405 }, { "epoch": 17.876674619196184, "grad_norm": 5.2011924708494917e-05, "learning_rate": 3.394891962954566e-07, "loss": 0.0, "num_input_tokens_seen": 210106368, "step": 97410 }, { "epoch": 17.877592218755733, "grad_norm": 8.726628584554419e-05, "learning_rate": 3.391992256195353e-07, "loss": 0.0, "num_input_tokens_seen": 210117728, "step": 97415 }, { "epoch": 17.878509818315287, "grad_norm": 0.00020832382142543793, "learning_rate": 3.3890937448593064e-07, "loss": 0.0, "num_input_tokens_seen": 210128480, "step": 97420 }, { "epoch": 17.87942741787484, "grad_norm": 9.555307042319328e-05, "learning_rate": 3.386196429020749e-07, "loss": 0.0, "num_input_tokens_seen": 210138112, "step": 97425 }, { "epoch": 17.88034501743439, "grad_norm": 0.00029930929304100573, "learning_rate": 3.383300308754023e-07, "loss": 0.0, "num_input_tokens_seen": 210149984, "step": 97430 }, { "epoch": 17.881262616993943, "grad_norm": 0.00019410696404520422, "learning_rate": 3.380405384133395e-07, "loss": 0.0, "num_input_tokens_seen": 210160160, "step": 97435 }, { "epoch": 17.882180216553497, "grad_norm": 0.0002591022930573672, "learning_rate": 3.377511655233112e-07, "loss": 0.0, "num_input_tokens_seen": 210171360, "step": 97440 }, { "epoch": 17.883097816113047, "grad_norm": 0.0003020184230990708, "learning_rate": 3.3746191221273874e-07, "loss": 0.0, "num_input_tokens_seen": 210183232, "step": 97445 }, { "epoch": 17.8840154156726, "grad_norm": 0.0017468800069764256, "learning_rate": 3.3717277848904327e-07, "loss": 0.0, "num_input_tokens_seen": 210194784, "step": 97450 }, { "epoch": 17.884933015232154, "grad_norm": 5.435260754893534e-05, "learning_rate": 3.36883764359639e-07, "loss": 0.0, "num_input_tokens_seen": 210205728, "step": 97455 }, { "epoch": 17.885850614791703, "grad_norm": 0.00024483739980496466, "learning_rate": 3.3659486983193877e-07, "loss": 0.0, "num_input_tokens_seen": 210216320, "step": 97460 }, { "epoch": 17.886768214351257, "grad_norm": 0.00010011174890678376, "learning_rate": 3.3630609491335175e-07, "loss": 0.0, "num_input_tokens_seen": 210228320, "step": 97465 }, { "epoch": 17.88768581391081, "grad_norm": 0.00012861638970207423, "learning_rate": 3.360174396112864e-07, "loss": 0.0, "num_input_tokens_seen": 210240000, "step": 97470 }, { "epoch": 17.88860341347036, "grad_norm": 5.356407928047702e-05, "learning_rate": 3.3572890393314517e-07, "loss": 0.0, "num_input_tokens_seen": 210251008, "step": 97475 }, { "epoch": 17.889521013029913, "grad_norm": 0.0010689315386116505, "learning_rate": 3.354404878863288e-07, "loss": 0.0107, "num_input_tokens_seen": 210261120, "step": 97480 }, { "epoch": 17.890438612589467, "grad_norm": 0.0001005111334961839, "learning_rate": 3.351521914782341e-07, "loss": 0.0, "num_input_tokens_seen": 210271936, "step": 97485 }, { "epoch": 17.891356212149017, "grad_norm": 7.981587259564549e-05, "learning_rate": 3.3486401471625516e-07, "loss": 0.0, "num_input_tokens_seen": 210282624, "step": 97490 }, { "epoch": 17.89227381170857, "grad_norm": 0.001209902111440897, "learning_rate": 3.345759576077845e-07, "loss": 0.0, "num_input_tokens_seen": 210294144, "step": 97495 }, { "epoch": 17.893191411268123, "grad_norm": 8.310389966936782e-05, "learning_rate": 3.3428802016021e-07, "loss": 0.0, "num_input_tokens_seen": 210305440, "step": 97500 }, { "epoch": 17.894109010827673, "grad_norm": 0.00033193742274306715, "learning_rate": 3.340002023809169e-07, "loss": 0.0, "num_input_tokens_seen": 210315872, "step": 97505 }, { "epoch": 17.895026610387227, "grad_norm": 4.3167307012481615e-05, "learning_rate": 3.337125042772854e-07, "loss": 0.0, "num_input_tokens_seen": 210327296, "step": 97510 }, { "epoch": 17.89594420994678, "grad_norm": 0.0028090716805309057, "learning_rate": 3.334249258566974e-07, "loss": 0.0, "num_input_tokens_seen": 210336736, "step": 97515 }, { "epoch": 17.89686180950633, "grad_norm": 8.007962605915964e-05, "learning_rate": 3.331374671265275e-07, "loss": 0.0, "num_input_tokens_seen": 210348064, "step": 97520 }, { "epoch": 17.897779409065883, "grad_norm": 7.303513120859861e-05, "learning_rate": 3.328501280941476e-07, "loss": 0.0, "num_input_tokens_seen": 210360288, "step": 97525 }, { "epoch": 17.898697008625437, "grad_norm": 0.0001619085087440908, "learning_rate": 3.3256290876692965e-07, "loss": 0.0, "num_input_tokens_seen": 210369920, "step": 97530 }, { "epoch": 17.899614608184987, "grad_norm": 0.00030681933276355267, "learning_rate": 3.3227580915223877e-07, "loss": 0.0, "num_input_tokens_seen": 210381600, "step": 97535 }, { "epoch": 17.90053220774454, "grad_norm": 8.338909537997097e-05, "learning_rate": 3.319888292574397e-07, "loss": 0.0, "num_input_tokens_seen": 210391904, "step": 97540 }, { "epoch": 17.901449807304093, "grad_norm": 0.0003242350067012012, "learning_rate": 3.3170196908989093e-07, "loss": 0.0, "num_input_tokens_seen": 210402496, "step": 97545 }, { "epoch": 17.902367406863643, "grad_norm": 0.0029998004902154207, "learning_rate": 3.3141522865695276e-07, "loss": 0.0, "num_input_tokens_seen": 210413536, "step": 97550 }, { "epoch": 17.903285006423197, "grad_norm": 0.005365191027522087, "learning_rate": 3.3112860796597813e-07, "loss": 0.0, "num_input_tokens_seen": 210424096, "step": 97555 }, { "epoch": 17.90420260598275, "grad_norm": 0.0005906561273150146, "learning_rate": 3.308421070243173e-07, "loss": 0.0, "num_input_tokens_seen": 210434048, "step": 97560 }, { "epoch": 17.9051202055423, "grad_norm": 0.0002774089516606182, "learning_rate": 3.3055572583932163e-07, "loss": 0.0, "num_input_tokens_seen": 210444832, "step": 97565 }, { "epoch": 17.906037805101853, "grad_norm": 0.00013194530038163066, "learning_rate": 3.302694644183341e-07, "loss": 0.0, "num_input_tokens_seen": 210455360, "step": 97570 }, { "epoch": 17.906955404661407, "grad_norm": 0.00013406990910880268, "learning_rate": 3.2998332276869714e-07, "loss": 0.0, "num_input_tokens_seen": 210465952, "step": 97575 }, { "epoch": 17.907873004220956, "grad_norm": 9.241822408512235e-05, "learning_rate": 3.296973008977494e-07, "loss": 0.0, "num_input_tokens_seen": 210477152, "step": 97580 }, { "epoch": 17.90879060378051, "grad_norm": 0.0008179693832062185, "learning_rate": 3.2941139881282893e-07, "loss": 0.0, "num_input_tokens_seen": 210487104, "step": 97585 }, { "epoch": 17.909708203340063, "grad_norm": 0.000127984065329656, "learning_rate": 3.291256165212664e-07, "loss": 0.0, "num_input_tokens_seen": 210498080, "step": 97590 }, { "epoch": 17.910625802899613, "grad_norm": 0.0001720891013974324, "learning_rate": 3.288399540303927e-07, "loss": 0.0, "num_input_tokens_seen": 210508192, "step": 97595 }, { "epoch": 17.911543402459166, "grad_norm": 3.7774359952891245e-05, "learning_rate": 3.2855441134753473e-07, "loss": 0.0, "num_input_tokens_seen": 210519680, "step": 97600 }, { "epoch": 17.91246100201872, "grad_norm": 6.60303994663991e-05, "learning_rate": 3.2826898848001664e-07, "loss": 0.0, "num_input_tokens_seen": 210530720, "step": 97605 }, { "epoch": 17.91337860157827, "grad_norm": 8.304049697471783e-05, "learning_rate": 3.27983685435157e-07, "loss": 0.0, "num_input_tokens_seen": 210542400, "step": 97610 }, { "epoch": 17.914296201137823, "grad_norm": 0.0024785767309367657, "learning_rate": 3.2769850222027613e-07, "loss": 0.0, "num_input_tokens_seen": 210552448, "step": 97615 }, { "epoch": 17.915213800697376, "grad_norm": 5.364079697756097e-05, "learning_rate": 3.2741343884268695e-07, "loss": 0.0, "num_input_tokens_seen": 210561952, "step": 97620 }, { "epoch": 17.916131400256926, "grad_norm": 0.00013730679347645491, "learning_rate": 3.271284953097015e-07, "loss": 0.0, "num_input_tokens_seen": 210572960, "step": 97625 }, { "epoch": 17.91704899981648, "grad_norm": 0.00020494336786214262, "learning_rate": 3.268436716286266e-07, "loss": 0.0, "num_input_tokens_seen": 210583296, "step": 97630 }, { "epoch": 17.917966599376033, "grad_norm": 9.120073809754103e-05, "learning_rate": 3.2655896780676986e-07, "loss": 0.0, "num_input_tokens_seen": 210594208, "step": 97635 }, { "epoch": 17.918884198935583, "grad_norm": 0.00013927865074947476, "learning_rate": 3.2627438385143264e-07, "loss": 0.0, "num_input_tokens_seen": 210605088, "step": 97640 }, { "epoch": 17.919801798495136, "grad_norm": 0.0001170644536614418, "learning_rate": 3.259899197699129e-07, "loss": 0.0, "num_input_tokens_seen": 210616416, "step": 97645 }, { "epoch": 17.92071939805469, "grad_norm": 0.0004941004444845021, "learning_rate": 3.257055755695082e-07, "loss": 0.0, "num_input_tokens_seen": 210626912, "step": 97650 }, { "epoch": 17.92163699761424, "grad_norm": 0.005892022047191858, "learning_rate": 3.254213512575111e-07, "loss": 0.0, "num_input_tokens_seen": 210638016, "step": 97655 }, { "epoch": 17.922554597173793, "grad_norm": 7.383099728031084e-05, "learning_rate": 3.2513724684121063e-07, "loss": 0.0, "num_input_tokens_seen": 210648480, "step": 97660 }, { "epoch": 17.923472196733346, "grad_norm": 0.003362817456945777, "learning_rate": 3.248532623278955e-07, "loss": 0.0, "num_input_tokens_seen": 210659104, "step": 97665 }, { "epoch": 17.924389796292896, "grad_norm": 0.00017999009287450463, "learning_rate": 3.2456939772484816e-07, "loss": 0.0, "num_input_tokens_seen": 210669280, "step": 97670 }, { "epoch": 17.92530739585245, "grad_norm": 0.0029200592543929815, "learning_rate": 3.2428565303934956e-07, "loss": 0.0, "num_input_tokens_seen": 210679648, "step": 97675 }, { "epoch": 17.926224995412003, "grad_norm": 8.751942368689924e-05, "learning_rate": 3.240020282786771e-07, "loss": 0.0, "num_input_tokens_seen": 210690784, "step": 97680 }, { "epoch": 17.927142594971553, "grad_norm": 0.0001108450087485835, "learning_rate": 3.2371852345010445e-07, "loss": 0.0, "num_input_tokens_seen": 210701056, "step": 97685 }, { "epoch": 17.928060194531106, "grad_norm": 6.600601773243397e-05, "learning_rate": 3.2343513856090525e-07, "loss": 0.0, "num_input_tokens_seen": 210712512, "step": 97690 }, { "epoch": 17.92897779409066, "grad_norm": 7.368977821897715e-05, "learning_rate": 3.2315187361834697e-07, "loss": 0.0, "num_input_tokens_seen": 210723040, "step": 97695 }, { "epoch": 17.92989539365021, "grad_norm": 0.00040274698403663933, "learning_rate": 3.2286872862969443e-07, "loss": 0.0, "num_input_tokens_seen": 210733824, "step": 97700 }, { "epoch": 17.930812993209763, "grad_norm": 0.0002366197732044384, "learning_rate": 3.225857036022101e-07, "loss": 0.0, "num_input_tokens_seen": 210743616, "step": 97705 }, { "epoch": 17.931730592769316, "grad_norm": 0.00033404582063667476, "learning_rate": 3.2230279854315205e-07, "loss": 0.0, "num_input_tokens_seen": 210753888, "step": 97710 }, { "epoch": 17.932648192328866, "grad_norm": 9.549115929985419e-05, "learning_rate": 3.220200134597784e-07, "loss": 0.0, "num_input_tokens_seen": 210764512, "step": 97715 }, { "epoch": 17.93356579188842, "grad_norm": 8.926817827159539e-05, "learning_rate": 3.217373483593417e-07, "loss": 0.0, "num_input_tokens_seen": 210774624, "step": 97720 }, { "epoch": 17.934483391447973, "grad_norm": 6.345845758914948e-05, "learning_rate": 3.214548032490905e-07, "loss": 0.0, "num_input_tokens_seen": 210786048, "step": 97725 }, { "epoch": 17.935400991007523, "grad_norm": 0.0016599382506683469, "learning_rate": 3.2117237813627247e-07, "loss": 0.0, "num_input_tokens_seen": 210796384, "step": 97730 }, { "epoch": 17.936318590567076, "grad_norm": 8.976192475529388e-05, "learning_rate": 3.2089007302813167e-07, "loss": 0.0, "num_input_tokens_seen": 210806752, "step": 97735 }, { "epoch": 17.93723619012663, "grad_norm": 0.00033052609069272876, "learning_rate": 3.206078879319086e-07, "loss": 0.0, "num_input_tokens_seen": 210817728, "step": 97740 }, { "epoch": 17.93815378968618, "grad_norm": 7.577311771456152e-05, "learning_rate": 3.203258228548406e-07, "loss": 0.0, "num_input_tokens_seen": 210828832, "step": 97745 }, { "epoch": 17.939071389245733, "grad_norm": 0.00010920263594016433, "learning_rate": 3.200438778041626e-07, "loss": 0.0, "num_input_tokens_seen": 210840672, "step": 97750 }, { "epoch": 17.939988988805286, "grad_norm": 0.0021127795334905386, "learning_rate": 3.1976205278710593e-07, "loss": 0.0, "num_input_tokens_seen": 210852320, "step": 97755 }, { "epoch": 17.940906588364836, "grad_norm": 0.0001984813716262579, "learning_rate": 3.194803478108993e-07, "loss": 0.0045, "num_input_tokens_seen": 210862912, "step": 97760 }, { "epoch": 17.94182418792439, "grad_norm": 0.00011999288108199835, "learning_rate": 3.191987628827664e-07, "loss": 0.0, "num_input_tokens_seen": 210873856, "step": 97765 }, { "epoch": 17.942741787483943, "grad_norm": 0.4753844439983368, "learning_rate": 3.1891729800993145e-07, "loss": 0.0, "num_input_tokens_seen": 210884608, "step": 97770 }, { "epoch": 17.943659387043493, "grad_norm": 0.00023981752747204155, "learning_rate": 3.1863595319961304e-07, "loss": 0.0, "num_input_tokens_seen": 210894848, "step": 97775 }, { "epoch": 17.944576986603046, "grad_norm": 8.526402962161228e-05, "learning_rate": 3.1835472845902604e-07, "loss": 0.0, "num_input_tokens_seen": 210905408, "step": 97780 }, { "epoch": 17.9454945861626, "grad_norm": 0.00015898654237389565, "learning_rate": 3.1807362379538464e-07, "loss": 0.0, "num_input_tokens_seen": 210916160, "step": 97785 }, { "epoch": 17.94641218572215, "grad_norm": 0.000523409980814904, "learning_rate": 3.177926392158992e-07, "loss": 0.0, "num_input_tokens_seen": 210925920, "step": 97790 }, { "epoch": 17.947329785281703, "grad_norm": 7.208680472103879e-05, "learning_rate": 3.1751177472777507e-07, "loss": 0.0, "num_input_tokens_seen": 210936832, "step": 97795 }, { "epoch": 17.948247384841256, "grad_norm": 7.324499893002212e-05, "learning_rate": 3.17231030338217e-07, "loss": 0.0, "num_input_tokens_seen": 210947328, "step": 97800 }, { "epoch": 17.949164984400806, "grad_norm": 5.387757846619934e-05, "learning_rate": 3.169504060544254e-07, "loss": 0.0, "num_input_tokens_seen": 210957408, "step": 97805 }, { "epoch": 17.95008258396036, "grad_norm": 0.00029397467733360827, "learning_rate": 3.166699018835978e-07, "loss": 0.0, "num_input_tokens_seen": 210968192, "step": 97810 }, { "epoch": 17.951000183519913, "grad_norm": 8.828653517412022e-05, "learning_rate": 3.163895178329285e-07, "loss": 0.0, "num_input_tokens_seen": 210979136, "step": 97815 }, { "epoch": 17.951917783079463, "grad_norm": 0.0001995171478483826, "learning_rate": 3.1610925390960944e-07, "loss": 0.0, "num_input_tokens_seen": 210989248, "step": 97820 }, { "epoch": 17.952835382639016, "grad_norm": 9.138546738540754e-05, "learning_rate": 3.158291101208288e-07, "loss": 0.0, "num_input_tokens_seen": 211000288, "step": 97825 }, { "epoch": 17.95375298219857, "grad_norm": 0.00015631449059583247, "learning_rate": 3.155490864737709e-07, "loss": 0.0, "num_input_tokens_seen": 211010464, "step": 97830 }, { "epoch": 17.95467058175812, "grad_norm": 0.007140613626688719, "learning_rate": 3.1526918297561937e-07, "loss": 0.0, "num_input_tokens_seen": 211020224, "step": 97835 }, { "epoch": 17.955588181317673, "grad_norm": 7.515124889323488e-05, "learning_rate": 3.1498939963355236e-07, "loss": 0.0, "num_input_tokens_seen": 211031616, "step": 97840 }, { "epoch": 17.956505780877226, "grad_norm": 0.00038599493564106524, "learning_rate": 3.1470973645474577e-07, "loss": 0.0, "num_input_tokens_seen": 211041888, "step": 97845 }, { "epoch": 17.957423380436776, "grad_norm": 0.0005140076973475516, "learning_rate": 3.144301934463734e-07, "loss": 0.0, "num_input_tokens_seen": 211052608, "step": 97850 }, { "epoch": 17.95834097999633, "grad_norm": 0.00011232074757572263, "learning_rate": 3.1415077061560494e-07, "loss": 0.0, "num_input_tokens_seen": 211063456, "step": 97855 }, { "epoch": 17.959258579555883, "grad_norm": 0.0003649616555776447, "learning_rate": 3.138714679696064e-07, "loss": 0.0, "num_input_tokens_seen": 211073344, "step": 97860 }, { "epoch": 17.960176179115432, "grad_norm": 0.0001382399641443044, "learning_rate": 3.1359228551554154e-07, "loss": 0.0, "num_input_tokens_seen": 211084128, "step": 97865 }, { "epoch": 17.961093778674986, "grad_norm": 7.018395990598947e-05, "learning_rate": 3.133132232605718e-07, "loss": 0.0, "num_input_tokens_seen": 211094208, "step": 97870 }, { "epoch": 17.96201137823454, "grad_norm": 0.0015252451412379742, "learning_rate": 3.1303428121185417e-07, "loss": 0.0, "num_input_tokens_seen": 211105408, "step": 97875 }, { "epoch": 17.96292897779409, "grad_norm": 0.00012343905109446496, "learning_rate": 3.127554593765425e-07, "loss": 0.0, "num_input_tokens_seen": 211116000, "step": 97880 }, { "epoch": 17.963846577353642, "grad_norm": 0.0003805376763921231, "learning_rate": 3.1247675776178934e-07, "loss": 0.0, "num_input_tokens_seen": 211125856, "step": 97885 }, { "epoch": 17.964764176913196, "grad_norm": 7.815347635187209e-05, "learning_rate": 3.1219817637474226e-07, "loss": 0.0, "num_input_tokens_seen": 211135360, "step": 97890 }, { "epoch": 17.965681776472746, "grad_norm": 0.0002801391528919339, "learning_rate": 3.119197152225467e-07, "loss": 0.0, "num_input_tokens_seen": 211145344, "step": 97895 }, { "epoch": 17.9665993760323, "grad_norm": 0.00026435445761308074, "learning_rate": 3.116413743123442e-07, "loss": 0.0, "num_input_tokens_seen": 211156160, "step": 97900 }, { "epoch": 17.967516975591852, "grad_norm": 5.334732122719288e-05, "learning_rate": 3.11363153651274e-07, "loss": 0.0, "num_input_tokens_seen": 211166080, "step": 97905 }, { "epoch": 17.968434575151402, "grad_norm": 0.00020147596660535783, "learning_rate": 3.1108505324647263e-07, "loss": 0.0, "num_input_tokens_seen": 211176256, "step": 97910 }, { "epoch": 17.969352174710956, "grad_norm": 6.593571015400812e-05, "learning_rate": 3.108070731050722e-07, "loss": 0.0, "num_input_tokens_seen": 211187328, "step": 97915 }, { "epoch": 17.97026977427051, "grad_norm": 8.146498294081539e-05, "learning_rate": 3.1052921323420304e-07, "loss": 0.0, "num_input_tokens_seen": 211197856, "step": 97920 }, { "epoch": 17.97118737383006, "grad_norm": 0.005245721898972988, "learning_rate": 3.102514736409917e-07, "loss": 0.0, "num_input_tokens_seen": 211208608, "step": 97925 }, { "epoch": 17.972104973389612, "grad_norm": 6.551425030920655e-05, "learning_rate": 3.099738543325609e-07, "loss": 0.0, "num_input_tokens_seen": 211219168, "step": 97930 }, { "epoch": 17.973022572949166, "grad_norm": 5.22264999744948e-05, "learning_rate": 3.0969635531603206e-07, "loss": 0.0, "num_input_tokens_seen": 211229152, "step": 97935 }, { "epoch": 17.973940172508716, "grad_norm": 5.101190981804393e-05, "learning_rate": 3.094189765985228e-07, "loss": 0.0, "num_input_tokens_seen": 211239776, "step": 97940 }, { "epoch": 17.97485777206827, "grad_norm": 0.00019571330631151795, "learning_rate": 3.09141718187147e-07, "loss": 0.0, "num_input_tokens_seen": 211251136, "step": 97945 }, { "epoch": 17.975775371627822, "grad_norm": 4.429317050380632e-05, "learning_rate": 3.08864580089015e-07, "loss": 0.0, "num_input_tokens_seen": 211261472, "step": 97950 }, { "epoch": 17.976692971187372, "grad_norm": 9.517880971543491e-05, "learning_rate": 3.085875623112372e-07, "loss": 0.0, "num_input_tokens_seen": 211272608, "step": 97955 }, { "epoch": 17.977610570746926, "grad_norm": 5.6885684898588806e-05, "learning_rate": 3.083106648609169e-07, "loss": 0.0, "num_input_tokens_seen": 211283296, "step": 97960 }, { "epoch": 17.97852817030648, "grad_norm": 0.000276641221717, "learning_rate": 3.0803388774515606e-07, "loss": 0.0, "num_input_tokens_seen": 211293312, "step": 97965 }, { "epoch": 17.97944576986603, "grad_norm": 0.00021909359202254564, "learning_rate": 3.077572309710547e-07, "loss": 0.0, "num_input_tokens_seen": 211304992, "step": 97970 }, { "epoch": 17.980363369425582, "grad_norm": 0.00014584138989448547, "learning_rate": 3.0748069454570815e-07, "loss": 0.0, "num_input_tokens_seen": 211314656, "step": 97975 }, { "epoch": 17.981280968985136, "grad_norm": 5.752099968958646e-05, "learning_rate": 3.07204278476208e-07, "loss": 0.0, "num_input_tokens_seen": 211324448, "step": 97980 }, { "epoch": 17.982198568544685, "grad_norm": 0.0001454496377846226, "learning_rate": 3.0692798276964584e-07, "loss": 0.0, "num_input_tokens_seen": 211335008, "step": 97985 }, { "epoch": 17.98311616810424, "grad_norm": 0.0003756480000447482, "learning_rate": 3.0665180743310764e-07, "loss": 0.0, "num_input_tokens_seen": 211346752, "step": 97990 }, { "epoch": 17.984033767663792, "grad_norm": 0.00012712774332612753, "learning_rate": 3.0637575247367656e-07, "loss": 0.0, "num_input_tokens_seen": 211356832, "step": 97995 }, { "epoch": 17.984951367223342, "grad_norm": 0.001617582282051444, "learning_rate": 3.06099817898432e-07, "loss": 0.0, "num_input_tokens_seen": 211368736, "step": 98000 }, { "epoch": 17.985868966782895, "grad_norm": 0.0006932609830982983, "learning_rate": 3.0582400371445274e-07, "loss": 0.0, "num_input_tokens_seen": 211379584, "step": 98005 }, { "epoch": 17.98678656634245, "grad_norm": 0.0006320911343209445, "learning_rate": 3.055483099288126e-07, "loss": 0.0, "num_input_tokens_seen": 211389024, "step": 98010 }, { "epoch": 17.987704165902, "grad_norm": 0.00013126224803272635, "learning_rate": 3.052727365485819e-07, "loss": 0.0, "num_input_tokens_seen": 211400288, "step": 98015 }, { "epoch": 17.988621765461552, "grad_norm": 6.329689495032653e-05, "learning_rate": 3.049972835808301e-07, "loss": 0.0, "num_input_tokens_seen": 211412800, "step": 98020 }, { "epoch": 17.989539365021106, "grad_norm": 0.011488933116197586, "learning_rate": 3.047219510326216e-07, "loss": 0.0, "num_input_tokens_seen": 211423808, "step": 98025 }, { "epoch": 17.990456964580655, "grad_norm": 6.365147419273853e-05, "learning_rate": 3.0444673891101784e-07, "loss": 0.0, "num_input_tokens_seen": 211434208, "step": 98030 }, { "epoch": 17.99137456414021, "grad_norm": 0.0010680453851819038, "learning_rate": 3.0417164722307713e-07, "loss": 0.0, "num_input_tokens_seen": 211445472, "step": 98035 }, { "epoch": 17.992292163699762, "grad_norm": 5.053569111623801e-05, "learning_rate": 3.0389667597585657e-07, "loss": 0.0, "num_input_tokens_seen": 211456576, "step": 98040 }, { "epoch": 17.993209763259312, "grad_norm": 8.99914521141909e-05, "learning_rate": 3.036218251764078e-07, "loss": 0.0, "num_input_tokens_seen": 211466656, "step": 98045 }, { "epoch": 17.994127362818865, "grad_norm": 4.9056787247536704e-05, "learning_rate": 3.033470948317796e-07, "loss": 0.0, "num_input_tokens_seen": 211477888, "step": 98050 }, { "epoch": 17.99504496237842, "grad_norm": 0.0006377635872922838, "learning_rate": 3.030724849490202e-07, "loss": 0.0, "num_input_tokens_seen": 211488384, "step": 98055 }, { "epoch": 17.99596256193797, "grad_norm": 6.211131403688341e-05, "learning_rate": 3.0279799553517174e-07, "loss": 0.0, "num_input_tokens_seen": 211500096, "step": 98060 }, { "epoch": 17.996880161497522, "grad_norm": 0.00017266705981455743, "learning_rate": 3.025236265972742e-07, "loss": 0.0, "num_input_tokens_seen": 211510496, "step": 98065 }, { "epoch": 17.997797761057075, "grad_norm": 8.76629856065847e-05, "learning_rate": 3.022493781423663e-07, "loss": 0.0, "num_input_tokens_seen": 211520992, "step": 98070 }, { "epoch": 17.998715360616625, "grad_norm": 6.262355600483716e-05, "learning_rate": 3.0197525017748033e-07, "loss": 0.0, "num_input_tokens_seen": 211532704, "step": 98075 }, { "epoch": 17.99963296017618, "grad_norm": 6.07440342719201e-05, "learning_rate": 3.017012427096483e-07, "loss": 0.0, "num_input_tokens_seen": 211543552, "step": 98080 }, { "epoch": 18.0, "eval_loss": 0.6489610075950623, "eval_runtime": 178.8361, "eval_samples_per_second": 30.469, "eval_steps_per_second": 7.622, "num_input_tokens_seen": 211546832, "step": 98082 }, { "epoch": 18.000550559735732, "grad_norm": 0.00126553641166538, "learning_rate": 3.014273557458969e-07, "loss": 0.0, "num_input_tokens_seen": 211553456, "step": 98085 }, { "epoch": 18.001468159295282, "grad_norm": 0.002603900618851185, "learning_rate": 3.0115358929325267e-07, "loss": 0.0, "num_input_tokens_seen": 211563888, "step": 98090 }, { "epoch": 18.002385758854835, "grad_norm": 0.0001347728248219937, "learning_rate": 3.008799433587356e-07, "loss": 0.0008, "num_input_tokens_seen": 211573840, "step": 98095 }, { "epoch": 18.00330335841439, "grad_norm": 4.323354733060114e-05, "learning_rate": 3.006064179493651e-07, "loss": 0.0, "num_input_tokens_seen": 211584176, "step": 98100 }, { "epoch": 18.00422095797394, "grad_norm": 4.547208300209604e-05, "learning_rate": 3.003330130721566e-07, "loss": 0.0, "num_input_tokens_seen": 211595504, "step": 98105 }, { "epoch": 18.005138557533492, "grad_norm": 0.00022112420992925763, "learning_rate": 3.000597287341228e-07, "loss": 0.0, "num_input_tokens_seen": 211607024, "step": 98110 }, { "epoch": 18.006056157093045, "grad_norm": 7.827969966456294e-05, "learning_rate": 2.997865649422732e-07, "loss": 0.0, "num_input_tokens_seen": 211618416, "step": 98115 }, { "epoch": 18.006973756652595, "grad_norm": 0.0012662194203585386, "learning_rate": 2.995135217036127e-07, "loss": 0.0, "num_input_tokens_seen": 211628208, "step": 98120 }, { "epoch": 18.00789135621215, "grad_norm": 0.0002072827483061701, "learning_rate": 2.9924059902514515e-07, "loss": 0.0, "num_input_tokens_seen": 211639440, "step": 98125 }, { "epoch": 18.008808955771702, "grad_norm": 0.00035247221239842474, "learning_rate": 2.9896779691387103e-07, "loss": 0.0, "num_input_tokens_seen": 211650736, "step": 98130 }, { "epoch": 18.00972655533125, "grad_norm": 5.1327748224139214e-05, "learning_rate": 2.9869511537678753e-07, "loss": 0.0, "num_input_tokens_seen": 211662352, "step": 98135 }, { "epoch": 18.010644154890805, "grad_norm": 0.00017782476788852364, "learning_rate": 2.9842255442088744e-07, "loss": 0.0, "num_input_tokens_seen": 211673072, "step": 98140 }, { "epoch": 18.01156175445036, "grad_norm": 8.877661457518116e-05, "learning_rate": 2.9815011405316227e-07, "loss": 0.0, "num_input_tokens_seen": 211685520, "step": 98145 }, { "epoch": 18.01247935400991, "grad_norm": 7.69385660532862e-05, "learning_rate": 2.9787779428059825e-07, "loss": 0.0, "num_input_tokens_seen": 211694800, "step": 98150 }, { "epoch": 18.013396953569462, "grad_norm": 7.316144183278084e-05, "learning_rate": 2.976055951101825e-07, "loss": 0.0, "num_input_tokens_seen": 211705200, "step": 98155 }, { "epoch": 18.014314553129015, "grad_norm": 0.001396234380081296, "learning_rate": 2.9733351654889495e-07, "loss": 0.0, "num_input_tokens_seen": 211714192, "step": 98160 }, { "epoch": 18.015232152688565, "grad_norm": 4.9970858526648954e-05, "learning_rate": 2.9706155860371344e-07, "loss": 0.0, "num_input_tokens_seen": 211724176, "step": 98165 }, { "epoch": 18.01614975224812, "grad_norm": 0.00031958712497726083, "learning_rate": 2.967897212816151e-07, "loss": 0.0, "num_input_tokens_seen": 211734192, "step": 98170 }, { "epoch": 18.017067351807672, "grad_norm": 0.00016178292571567, "learning_rate": 2.96518004589571e-07, "loss": 0.0, "num_input_tokens_seen": 211745488, "step": 98175 }, { "epoch": 18.01798495136722, "grad_norm": 0.000244081427808851, "learning_rate": 2.962464085345501e-07, "loss": 0.0, "num_input_tokens_seen": 211756944, "step": 98180 }, { "epoch": 18.018902550926775, "grad_norm": 0.000622798630502075, "learning_rate": 2.9597493312351844e-07, "loss": 0.0, "num_input_tokens_seen": 211766192, "step": 98185 }, { "epoch": 18.01982015048633, "grad_norm": 0.0015726144192740321, "learning_rate": 2.9570357836343933e-07, "loss": 0.0, "num_input_tokens_seen": 211777072, "step": 98190 }, { "epoch": 18.02073775004588, "grad_norm": 0.00023317558225244284, "learning_rate": 2.9543234426127274e-07, "loss": 0.0, "num_input_tokens_seen": 211788080, "step": 98195 }, { "epoch": 18.02165534960543, "grad_norm": 5.1894690841436386e-05, "learning_rate": 2.951612308239743e-07, "loss": 0.0, "num_input_tokens_seen": 211798000, "step": 98200 }, { "epoch": 18.022572949164985, "grad_norm": 5.204117769608274e-05, "learning_rate": 2.9489023805849893e-07, "loss": 0.0, "num_input_tokens_seen": 211808336, "step": 98205 }, { "epoch": 18.023490548724535, "grad_norm": 0.0005582875455729663, "learning_rate": 2.946193659717972e-07, "loss": 0.0, "num_input_tokens_seen": 211818192, "step": 98210 }, { "epoch": 18.02440814828409, "grad_norm": 7.543485844507813e-05, "learning_rate": 2.9434861457081575e-07, "loss": 0.0, "num_input_tokens_seen": 211829072, "step": 98215 }, { "epoch": 18.02532574784364, "grad_norm": 5.1145001634722576e-05, "learning_rate": 2.9407798386249854e-07, "loss": 0.0, "num_input_tokens_seen": 211838544, "step": 98220 }, { "epoch": 18.02624334740319, "grad_norm": 0.00015546628856100142, "learning_rate": 2.9380747385378825e-07, "loss": 0.0, "num_input_tokens_seen": 211849968, "step": 98225 }, { "epoch": 18.027160946962745, "grad_norm": 0.00065945356618613, "learning_rate": 2.9353708455162224e-07, "loss": 0.0, "num_input_tokens_seen": 211861616, "step": 98230 }, { "epoch": 18.0280785465223, "grad_norm": 0.00022747888579033315, "learning_rate": 2.932668159629348e-07, "loss": 0.0, "num_input_tokens_seen": 211872624, "step": 98235 }, { "epoch": 18.028996146081848, "grad_norm": 0.0001780742168193683, "learning_rate": 2.9299666809466e-07, "loss": 0.0, "num_input_tokens_seen": 211883408, "step": 98240 }, { "epoch": 18.0299137456414, "grad_norm": 0.00035561464028432965, "learning_rate": 2.9272664095372494e-07, "loss": 0.0, "num_input_tokens_seen": 211894096, "step": 98245 }, { "epoch": 18.030831345200955, "grad_norm": 5.012786641600542e-05, "learning_rate": 2.9245673454705525e-07, "loss": 0.0, "num_input_tokens_seen": 211904336, "step": 98250 }, { "epoch": 18.031748944760505, "grad_norm": 5.71776254219003e-05, "learning_rate": 2.921869488815754e-07, "loss": 0.0, "num_input_tokens_seen": 211914576, "step": 98255 }, { "epoch": 18.032666544320058, "grad_norm": 7.231856579892337e-05, "learning_rate": 2.9191728396420373e-07, "loss": 0.0, "num_input_tokens_seen": 211925904, "step": 98260 }, { "epoch": 18.03358414387961, "grad_norm": 0.00010874033614527434, "learning_rate": 2.916477398018569e-07, "loss": 0.0, "num_input_tokens_seen": 211937808, "step": 98265 }, { "epoch": 18.03450174343916, "grad_norm": 9.415065869688988e-05, "learning_rate": 2.9137831640144723e-07, "loss": 0.0, "num_input_tokens_seen": 211949168, "step": 98270 }, { "epoch": 18.035419342998715, "grad_norm": 7.975458720466122e-05, "learning_rate": 2.9110901376988686e-07, "loss": 0.0, "num_input_tokens_seen": 211960176, "step": 98275 }, { "epoch": 18.03633694255827, "grad_norm": 0.00023460821830667555, "learning_rate": 2.9083983191408206e-07, "loss": 0.0, "num_input_tokens_seen": 211972112, "step": 98280 }, { "epoch": 18.037254542117818, "grad_norm": 0.00011694352724589407, "learning_rate": 2.9057077084093667e-07, "loss": 0.0, "num_input_tokens_seen": 211984112, "step": 98285 }, { "epoch": 18.03817214167737, "grad_norm": 5.122000220580958e-05, "learning_rate": 2.903018305573524e-07, "loss": 0.0, "num_input_tokens_seen": 211995248, "step": 98290 }, { "epoch": 18.039089741236925, "grad_norm": 0.00010740043217083439, "learning_rate": 2.9003301107022705e-07, "loss": 0.0, "num_input_tokens_seen": 212006160, "step": 98295 }, { "epoch": 18.040007340796475, "grad_norm": 0.00045574517571367323, "learning_rate": 2.8976431238645465e-07, "loss": 0.0, "num_input_tokens_seen": 212018512, "step": 98300 }, { "epoch": 18.040924940356028, "grad_norm": 9.962146577890962e-05, "learning_rate": 2.8949573451292787e-07, "loss": 0.0, "num_input_tokens_seen": 212029488, "step": 98305 }, { "epoch": 18.04184253991558, "grad_norm": 0.002829235978424549, "learning_rate": 2.892272774565352e-07, "loss": 0.0, "num_input_tokens_seen": 212040432, "step": 98310 }, { "epoch": 18.04276013947513, "grad_norm": 0.00032763334456831217, "learning_rate": 2.889589412241611e-07, "loss": 0.0, "num_input_tokens_seen": 212051216, "step": 98315 }, { "epoch": 18.043677739034685, "grad_norm": 0.0005861016688868403, "learning_rate": 2.8869072582268844e-07, "loss": 0.0, "num_input_tokens_seen": 212061872, "step": 98320 }, { "epoch": 18.044595338594238, "grad_norm": 0.00014369188284035772, "learning_rate": 2.8842263125899774e-07, "loss": 0.0, "num_input_tokens_seen": 212072432, "step": 98325 }, { "epoch": 18.045512938153788, "grad_norm": 0.00023502552357967943, "learning_rate": 2.881546575399641e-07, "loss": 0.0, "num_input_tokens_seen": 212084208, "step": 98330 }, { "epoch": 18.04643053771334, "grad_norm": 0.00017946556909009814, "learning_rate": 2.878868046724609e-07, "loss": 0.0, "num_input_tokens_seen": 212094736, "step": 98335 }, { "epoch": 18.047348137272895, "grad_norm": 0.0002259318280266598, "learning_rate": 2.8761907266335766e-07, "loss": 0.0, "num_input_tokens_seen": 212104976, "step": 98340 }, { "epoch": 18.048265736832445, "grad_norm": 5.9181777032790706e-05, "learning_rate": 2.873514615195222e-07, "loss": 0.0, "num_input_tokens_seen": 212116080, "step": 98345 }, { "epoch": 18.049183336391998, "grad_norm": 0.0007006189553067088, "learning_rate": 2.870839712478163e-07, "loss": 0.0, "num_input_tokens_seen": 212127088, "step": 98350 }, { "epoch": 18.05010093595155, "grad_norm": 0.00010932564327958971, "learning_rate": 2.868166018551038e-07, "loss": 0.0, "num_input_tokens_seen": 212138576, "step": 98355 }, { "epoch": 18.0510185355111, "grad_norm": 0.000340645550750196, "learning_rate": 2.8654935334824e-07, "loss": 0.0, "num_input_tokens_seen": 212149328, "step": 98360 }, { "epoch": 18.051936135070655, "grad_norm": 6.972732080612332e-05, "learning_rate": 2.862822257340803e-07, "loss": 0.0, "num_input_tokens_seen": 212161136, "step": 98365 }, { "epoch": 18.052853734630208, "grad_norm": 0.0003128050593659282, "learning_rate": 2.860152190194754e-07, "loss": 0.04, "num_input_tokens_seen": 212172336, "step": 98370 }, { "epoch": 18.053771334189758, "grad_norm": 0.000108398096926976, "learning_rate": 2.857483332112748e-07, "loss": 0.0, "num_input_tokens_seen": 212183696, "step": 98375 }, { "epoch": 18.05468893374931, "grad_norm": 0.0016771515365689993, "learning_rate": 2.8548156831632314e-07, "loss": 0.0, "num_input_tokens_seen": 212192848, "step": 98380 }, { "epoch": 18.055606533308865, "grad_norm": 0.0002586087503004819, "learning_rate": 2.852149243414615e-07, "loss": 0.0, "num_input_tokens_seen": 212204400, "step": 98385 }, { "epoch": 18.056524132868415, "grad_norm": 0.00013007575762458146, "learning_rate": 2.849484012935305e-07, "loss": 0.0, "num_input_tokens_seen": 212216272, "step": 98390 }, { "epoch": 18.057441732427968, "grad_norm": 0.0013738194247707725, "learning_rate": 2.8468199917936535e-07, "loss": 0.0, "num_input_tokens_seen": 212226192, "step": 98395 }, { "epoch": 18.05835933198752, "grad_norm": 7.217621168820187e-05, "learning_rate": 2.844157180057988e-07, "loss": 0.0, "num_input_tokens_seen": 212237584, "step": 98400 }, { "epoch": 18.05927693154707, "grad_norm": 8.849192090565339e-05, "learning_rate": 2.841495577796599e-07, "loss": 0.0, "num_input_tokens_seen": 212248368, "step": 98405 }, { "epoch": 18.060194531106625, "grad_norm": 4.061711297254078e-05, "learning_rate": 2.8388351850777653e-07, "loss": 0.0, "num_input_tokens_seen": 212259728, "step": 98410 }, { "epoch": 18.061112130666178, "grad_norm": 6.633876182604581e-05, "learning_rate": 2.836176001969715e-07, "loss": 0.0, "num_input_tokens_seen": 212270928, "step": 98415 }, { "epoch": 18.062029730225728, "grad_norm": 8.181543671526015e-05, "learning_rate": 2.8335180285406494e-07, "loss": 0.0, "num_input_tokens_seen": 212281296, "step": 98420 }, { "epoch": 18.06294732978528, "grad_norm": 0.00022231788898352534, "learning_rate": 2.830861264858753e-07, "loss": 0.0, "num_input_tokens_seen": 212292944, "step": 98425 }, { "epoch": 18.063864929344835, "grad_norm": 0.0005542640574276447, "learning_rate": 2.8282057109921545e-07, "loss": 0.0, "num_input_tokens_seen": 212303472, "step": 98430 }, { "epoch": 18.064782528904384, "grad_norm": 0.0002122780861100182, "learning_rate": 2.825551367008966e-07, "loss": 0.0, "num_input_tokens_seen": 212314544, "step": 98435 }, { "epoch": 18.065700128463938, "grad_norm": 0.00018203517538495362, "learning_rate": 2.822898232977278e-07, "loss": 0.0, "num_input_tokens_seen": 212325840, "step": 98440 }, { "epoch": 18.06661772802349, "grad_norm": 0.00018531888781581074, "learning_rate": 2.8202463089651354e-07, "loss": 0.0, "num_input_tokens_seen": 212336880, "step": 98445 }, { "epoch": 18.06753532758304, "grad_norm": 0.00015592768613714725, "learning_rate": 2.8175955950405453e-07, "loss": 0.0, "num_input_tokens_seen": 212349104, "step": 98450 }, { "epoch": 18.068452927142594, "grad_norm": 4.6627937990706414e-05, "learning_rate": 2.8149460912715034e-07, "loss": 0.0, "num_input_tokens_seen": 212359088, "step": 98455 }, { "epoch": 18.069370526702148, "grad_norm": 0.00012945539492648095, "learning_rate": 2.8122977977259657e-07, "loss": 0.0, "num_input_tokens_seen": 212370992, "step": 98460 }, { "epoch": 18.070288126261698, "grad_norm": 5.706342199118808e-05, "learning_rate": 2.8096507144718567e-07, "loss": 0.0, "num_input_tokens_seen": 212381232, "step": 98465 }, { "epoch": 18.07120572582125, "grad_norm": 0.00039106662734411657, "learning_rate": 2.80700484157706e-07, "loss": 0.0, "num_input_tokens_seen": 212391408, "step": 98470 }, { "epoch": 18.072123325380804, "grad_norm": 0.005419272463768721, "learning_rate": 2.8043601791094555e-07, "loss": 0.0, "num_input_tokens_seen": 212401904, "step": 98475 }, { "epoch": 18.073040924940354, "grad_norm": 0.00011660787276923656, "learning_rate": 2.801716727136866e-07, "loss": 0.0, "num_input_tokens_seen": 212412656, "step": 98480 }, { "epoch": 18.073958524499908, "grad_norm": 6.728636799380183e-05, "learning_rate": 2.7990744857270823e-07, "loss": 0.0, "num_input_tokens_seen": 212422160, "step": 98485 }, { "epoch": 18.07487612405946, "grad_norm": 0.0005031159962527454, "learning_rate": 2.796433454947894e-07, "loss": 0.0, "num_input_tokens_seen": 212431472, "step": 98490 }, { "epoch": 18.07579372361901, "grad_norm": 0.00034790055360645056, "learning_rate": 2.7937936348670256e-07, "loss": 0.0, "num_input_tokens_seen": 212442032, "step": 98495 }, { "epoch": 18.076711323178564, "grad_norm": 0.00012574021820910275, "learning_rate": 2.791155025552189e-07, "loss": 0.0, "num_input_tokens_seen": 212451984, "step": 98500 }, { "epoch": 18.077628922738118, "grad_norm": 5.537666584132239e-05, "learning_rate": 2.7885176270710525e-07, "loss": 0.0, "num_input_tokens_seen": 212461680, "step": 98505 }, { "epoch": 18.078546522297668, "grad_norm": 0.002660835860297084, "learning_rate": 2.7858814394912725e-07, "loss": 0.0, "num_input_tokens_seen": 212472752, "step": 98510 }, { "epoch": 18.07946412185722, "grad_norm": 0.00013087857223581523, "learning_rate": 2.7832464628804624e-07, "loss": 0.0, "num_input_tokens_seen": 212484112, "step": 98515 }, { "epoch": 18.080381721416774, "grad_norm": 8.465434075333178e-05, "learning_rate": 2.78061269730619e-07, "loss": 0.0, "num_input_tokens_seen": 212495152, "step": 98520 }, { "epoch": 18.081299320976324, "grad_norm": 0.0038638839032500982, "learning_rate": 2.777980142836029e-07, "loss": 0.0, "num_input_tokens_seen": 212506576, "step": 98525 }, { "epoch": 18.082216920535878, "grad_norm": 0.000965687504503876, "learning_rate": 2.775348799537486e-07, "loss": 0.0, "num_input_tokens_seen": 212517328, "step": 98530 }, { "epoch": 18.08313452009543, "grad_norm": 0.003313190070912242, "learning_rate": 2.7727186674780583e-07, "loss": 0.0, "num_input_tokens_seen": 212527024, "step": 98535 }, { "epoch": 18.08405211965498, "grad_norm": 6.260695226956159e-05, "learning_rate": 2.7700897467251965e-07, "loss": 0.0, "num_input_tokens_seen": 212538512, "step": 98540 }, { "epoch": 18.084969719214534, "grad_norm": 5.10888421558775e-05, "learning_rate": 2.7674620373463303e-07, "loss": 0.0, "num_input_tokens_seen": 212548784, "step": 98545 }, { "epoch": 18.085887318774088, "grad_norm": 0.0001796386350179091, "learning_rate": 2.7648355394088666e-07, "loss": 0.0, "num_input_tokens_seen": 212559728, "step": 98550 }, { "epoch": 18.086804918333637, "grad_norm": 0.000157833012053743, "learning_rate": 2.7622102529801576e-07, "loss": 0.0, "num_input_tokens_seen": 212570224, "step": 98555 }, { "epoch": 18.08772251789319, "grad_norm": 0.00022133535821922123, "learning_rate": 2.759586178127549e-07, "loss": 0.0, "num_input_tokens_seen": 212582192, "step": 98560 }, { "epoch": 18.088640117452744, "grad_norm": 0.00013502599904313684, "learning_rate": 2.7569633149183375e-07, "loss": 0.0, "num_input_tokens_seen": 212592272, "step": 98565 }, { "epoch": 18.089557717012294, "grad_norm": 0.0011588489869609475, "learning_rate": 2.7543416634197907e-07, "loss": 0.0, "num_input_tokens_seen": 212603280, "step": 98570 }, { "epoch": 18.090475316571847, "grad_norm": 6.507386569865048e-05, "learning_rate": 2.751721223699161e-07, "loss": 0.0, "num_input_tokens_seen": 212613872, "step": 98575 }, { "epoch": 18.0913929161314, "grad_norm": 8.939761755755171e-05, "learning_rate": 2.7491019958236554e-07, "loss": 0.0, "num_input_tokens_seen": 212626064, "step": 98580 }, { "epoch": 18.09231051569095, "grad_norm": 0.00011688278755173087, "learning_rate": 2.746483979860448e-07, "loss": 0.0, "num_input_tokens_seen": 212637072, "step": 98585 }, { "epoch": 18.093228115250504, "grad_norm": 0.00014437180652748793, "learning_rate": 2.743867175876691e-07, "loss": 0.0, "num_input_tokens_seen": 212647952, "step": 98590 }, { "epoch": 18.094145714810058, "grad_norm": 0.06634415686130524, "learning_rate": 2.7412515839395025e-07, "loss": 0.0, "num_input_tokens_seen": 212659472, "step": 98595 }, { "epoch": 18.095063314369607, "grad_norm": 0.00010981545347021893, "learning_rate": 2.7386372041159627e-07, "loss": 0.0, "num_input_tokens_seen": 212671120, "step": 98600 }, { "epoch": 18.09598091392916, "grad_norm": 4.486474790610373e-05, "learning_rate": 2.7360240364731285e-07, "loss": 0.0, "num_input_tokens_seen": 212681968, "step": 98605 }, { "epoch": 18.096898513488714, "grad_norm": 4.5910444896435365e-05, "learning_rate": 2.7334120810780297e-07, "loss": 0.0, "num_input_tokens_seen": 212691568, "step": 98610 }, { "epoch": 18.097816113048264, "grad_norm": 9.141593181993812e-05, "learning_rate": 2.730801337997657e-07, "loss": 0.0, "num_input_tokens_seen": 212701296, "step": 98615 }, { "epoch": 18.098733712607817, "grad_norm": 9.287636930821463e-05, "learning_rate": 2.728191807298958e-07, "loss": 0.0, "num_input_tokens_seen": 212712432, "step": 98620 }, { "epoch": 18.09965131216737, "grad_norm": 0.0018024586606770754, "learning_rate": 2.7255834890488883e-07, "loss": 0.0, "num_input_tokens_seen": 212722608, "step": 98625 }, { "epoch": 18.10056891172692, "grad_norm": 4.1112263716058806e-05, "learning_rate": 2.7229763833143296e-07, "loss": 0.0, "num_input_tokens_seen": 212734160, "step": 98630 }, { "epoch": 18.101486511286474, "grad_norm": 6.982434570090845e-05, "learning_rate": 2.7203704901621495e-07, "loss": 0.0, "num_input_tokens_seen": 212745680, "step": 98635 }, { "epoch": 18.102404110846027, "grad_norm": 0.00038214283995330334, "learning_rate": 2.7177658096591894e-07, "loss": 0.0, "num_input_tokens_seen": 212756400, "step": 98640 }, { "epoch": 18.103321710405577, "grad_norm": 5.168649659026414e-05, "learning_rate": 2.715162341872257e-07, "loss": 0.0, "num_input_tokens_seen": 212767312, "step": 98645 }, { "epoch": 18.10423930996513, "grad_norm": 0.00010425784421386197, "learning_rate": 2.7125600868681326e-07, "loss": 0.0, "num_input_tokens_seen": 212778800, "step": 98650 }, { "epoch": 18.105156909524684, "grad_norm": 0.000211261140066199, "learning_rate": 2.709959044713539e-07, "loss": 0.0, "num_input_tokens_seen": 212788944, "step": 98655 }, { "epoch": 18.106074509084234, "grad_norm": 0.0002994351089000702, "learning_rate": 2.707359215475214e-07, "loss": 0.0, "num_input_tokens_seen": 212799472, "step": 98660 }, { "epoch": 18.106992108643787, "grad_norm": 0.00035884242970496416, "learning_rate": 2.70476059921983e-07, "loss": 0.0, "num_input_tokens_seen": 212811088, "step": 98665 }, { "epoch": 18.10790970820334, "grad_norm": 0.00011064960563089699, "learning_rate": 2.7021631960140296e-07, "loss": 0.0, "num_input_tokens_seen": 212821488, "step": 98670 }, { "epoch": 18.10882730776289, "grad_norm": 5.162169327377342e-05, "learning_rate": 2.699567005924436e-07, "loss": 0.0, "num_input_tokens_seen": 212831216, "step": 98675 }, { "epoch": 18.109744907322444, "grad_norm": 0.0005550148198381066, "learning_rate": 2.6969720290176414e-07, "loss": 0.0, "num_input_tokens_seen": 212842704, "step": 98680 }, { "epoch": 18.110662506881997, "grad_norm": 0.00012787528976332396, "learning_rate": 2.694378265360209e-07, "loss": 0.0, "num_input_tokens_seen": 212852976, "step": 98685 }, { "epoch": 18.111580106441547, "grad_norm": 4.7952125896699727e-05, "learning_rate": 2.691785715018647e-07, "loss": 0.0, "num_input_tokens_seen": 212864432, "step": 98690 }, { "epoch": 18.1124977060011, "grad_norm": 0.0001248841144843027, "learning_rate": 2.689194378059462e-07, "loss": 0.0, "num_input_tokens_seen": 212874128, "step": 98695 }, { "epoch": 18.113415305560654, "grad_norm": 6.103597115725279e-05, "learning_rate": 2.6866042545491247e-07, "loss": 0.0, "num_input_tokens_seen": 212884464, "step": 98700 }, { "epoch": 18.114332905120204, "grad_norm": 7.344881305471063e-05, "learning_rate": 2.684015344554047e-07, "loss": 0.0, "num_input_tokens_seen": 212895856, "step": 98705 }, { "epoch": 18.115250504679757, "grad_norm": 0.00018280134827364236, "learning_rate": 2.68142764814065e-07, "loss": 0.0, "num_input_tokens_seen": 212905680, "step": 98710 }, { "epoch": 18.11616810423931, "grad_norm": 5.692173363058828e-05, "learning_rate": 2.6788411653752953e-07, "loss": 0.0, "num_input_tokens_seen": 212916752, "step": 98715 }, { "epoch": 18.11708570379886, "grad_norm": 0.0004077556368429214, "learning_rate": 2.6762558963243255e-07, "loss": 0.0, "num_input_tokens_seen": 212927120, "step": 98720 }, { "epoch": 18.118003303358414, "grad_norm": 0.00013247824972495437, "learning_rate": 2.6736718410540377e-07, "loss": 0.0, "num_input_tokens_seen": 212938928, "step": 98725 }, { "epoch": 18.118920902917967, "grad_norm": 9.643500379752368e-05, "learning_rate": 2.6710889996307275e-07, "loss": 0.0, "num_input_tokens_seen": 212949328, "step": 98730 }, { "epoch": 18.11983850247752, "grad_norm": 0.006868679076433182, "learning_rate": 2.6685073721206323e-07, "loss": 0.0, "num_input_tokens_seen": 212959728, "step": 98735 }, { "epoch": 18.12075610203707, "grad_norm": 4.713465386885218e-05, "learning_rate": 2.6659269585899595e-07, "loss": 0.0, "num_input_tokens_seen": 212970352, "step": 98740 }, { "epoch": 18.121673701596624, "grad_norm": 0.00010668693721527234, "learning_rate": 2.6633477591049005e-07, "loss": 0.0, "num_input_tokens_seen": 212981680, "step": 98745 }, { "epoch": 18.122591301156177, "grad_norm": 0.0001782397011993453, "learning_rate": 2.6607697737316084e-07, "loss": 0.0, "num_input_tokens_seen": 212992464, "step": 98750 }, { "epoch": 18.123508900715727, "grad_norm": 0.00010366384958615527, "learning_rate": 2.658193002536208e-07, "loss": 0.0, "num_input_tokens_seen": 213003216, "step": 98755 }, { "epoch": 18.12442650027528, "grad_norm": 0.00012839987175539136, "learning_rate": 2.655617445584779e-07, "loss": 0.0, "num_input_tokens_seen": 213014000, "step": 98760 }, { "epoch": 18.125344099834834, "grad_norm": 0.0002714429865591228, "learning_rate": 2.653043102943376e-07, "loss": 0.0, "num_input_tokens_seen": 213025616, "step": 98765 }, { "epoch": 18.126261699394384, "grad_norm": 6.485287303803489e-05, "learning_rate": 2.6504699746780493e-07, "loss": 0.0, "num_input_tokens_seen": 213037936, "step": 98770 }, { "epoch": 18.127179298953937, "grad_norm": 7.079624629113823e-05, "learning_rate": 2.6478980608547755e-07, "loss": 0.0, "num_input_tokens_seen": 213049200, "step": 98775 }, { "epoch": 18.12809689851349, "grad_norm": 3.626773104770109e-05, "learning_rate": 2.6453273615395345e-07, "loss": 0.0, "num_input_tokens_seen": 213060720, "step": 98780 }, { "epoch": 18.12901449807304, "grad_norm": 0.000882148218806833, "learning_rate": 2.642757876798252e-07, "loss": 0.0, "num_input_tokens_seen": 213072240, "step": 98785 }, { "epoch": 18.129932097632594, "grad_norm": 0.00034727645106613636, "learning_rate": 2.6401896066968245e-07, "loss": 0.0, "num_input_tokens_seen": 213084528, "step": 98790 }, { "epoch": 18.130849697192147, "grad_norm": 0.0005745819653384387, "learning_rate": 2.637622551301139e-07, "loss": 0.0, "num_input_tokens_seen": 213095088, "step": 98795 }, { "epoch": 18.131767296751697, "grad_norm": 0.0002337637561140582, "learning_rate": 2.635056710677031e-07, "loss": 0.0, "num_input_tokens_seen": 213106288, "step": 98800 }, { "epoch": 18.13268489631125, "grad_norm": 7.520774670410901e-05, "learning_rate": 2.632492084890309e-07, "loss": 0.0, "num_input_tokens_seen": 213116432, "step": 98805 }, { "epoch": 18.133602495870804, "grad_norm": 0.002059575170278549, "learning_rate": 2.629928674006743e-07, "loss": 0.0, "num_input_tokens_seen": 213126896, "step": 98810 }, { "epoch": 18.134520095430354, "grad_norm": 6.187849066918716e-05, "learning_rate": 2.627366478092097e-07, "loss": 0.0, "num_input_tokens_seen": 213137936, "step": 98815 }, { "epoch": 18.135437694989907, "grad_norm": 0.0012062634341418743, "learning_rate": 2.624805497212085e-07, "loss": 0.0, "num_input_tokens_seen": 213148592, "step": 98820 }, { "epoch": 18.13635529454946, "grad_norm": 3.757207377930172e-05, "learning_rate": 2.6222457314323713e-07, "loss": 0.0, "num_input_tokens_seen": 213159312, "step": 98825 }, { "epoch": 18.13727289410901, "grad_norm": 5.920027615502477e-05, "learning_rate": 2.619687180818642e-07, "loss": 0.0, "num_input_tokens_seen": 213170832, "step": 98830 }, { "epoch": 18.138190493668564, "grad_norm": 9.207944094669074e-05, "learning_rate": 2.6171298454365e-07, "loss": 0.0, "num_input_tokens_seen": 213182000, "step": 98835 }, { "epoch": 18.139108093228117, "grad_norm": 0.0008155127870850265, "learning_rate": 2.6145737253515325e-07, "loss": 0.0, "num_input_tokens_seen": 213192976, "step": 98840 }, { "epoch": 18.140025692787667, "grad_norm": 8.92940370249562e-05, "learning_rate": 2.612018820629314e-07, "loss": 0.0, "num_input_tokens_seen": 213203792, "step": 98845 }, { "epoch": 18.14094329234722, "grad_norm": 0.0001887352846097201, "learning_rate": 2.6094651313353704e-07, "loss": 0.0, "num_input_tokens_seen": 213213840, "step": 98850 }, { "epoch": 18.141860891906774, "grad_norm": 9.352659981232136e-05, "learning_rate": 2.606912657535199e-07, "loss": 0.0645, "num_input_tokens_seen": 213223824, "step": 98855 }, { "epoch": 18.142778491466323, "grad_norm": 9.524304914521053e-05, "learning_rate": 2.604361399294253e-07, "loss": 0.0, "num_input_tokens_seen": 213233776, "step": 98860 }, { "epoch": 18.143696091025877, "grad_norm": 7.79164838604629e-05, "learning_rate": 2.601811356677991e-07, "loss": 0.0, "num_input_tokens_seen": 213245168, "step": 98865 }, { "epoch": 18.14461369058543, "grad_norm": 6.414879317162558e-05, "learning_rate": 2.599262529751806e-07, "loss": 0.0, "num_input_tokens_seen": 213256464, "step": 98870 }, { "epoch": 18.14553129014498, "grad_norm": 0.00011390705913072452, "learning_rate": 2.596714918581067e-07, "loss": 0.0, "num_input_tokens_seen": 213265904, "step": 98875 }, { "epoch": 18.146448889704534, "grad_norm": 0.0017097567906603217, "learning_rate": 2.594168523231133e-07, "loss": 0.0, "num_input_tokens_seen": 213277296, "step": 98880 }, { "epoch": 18.147366489264087, "grad_norm": 9.007225162349641e-05, "learning_rate": 2.5916233437672965e-07, "loss": 0.0, "num_input_tokens_seen": 213288592, "step": 98885 }, { "epoch": 18.148284088823637, "grad_norm": 4.951438313582912e-05, "learning_rate": 2.5890793802548443e-07, "loss": 0.0, "num_input_tokens_seen": 213299888, "step": 98890 }, { "epoch": 18.14920168838319, "grad_norm": 0.00032442298834212124, "learning_rate": 2.5865366327590293e-07, "loss": 0.0, "num_input_tokens_seen": 213310096, "step": 98895 }, { "epoch": 18.150119287942744, "grad_norm": 0.00015132242697291076, "learning_rate": 2.583995101345066e-07, "loss": 0.0, "num_input_tokens_seen": 213321040, "step": 98900 }, { "epoch": 18.151036887502293, "grad_norm": 0.0001283494639210403, "learning_rate": 2.5814547860781356e-07, "loss": 0.0, "num_input_tokens_seen": 213332048, "step": 98905 }, { "epoch": 18.151954487061847, "grad_norm": 0.0001597515365574509, "learning_rate": 2.578915687023398e-07, "loss": 0.0, "num_input_tokens_seen": 213343248, "step": 98910 }, { "epoch": 18.1528720866214, "grad_norm": 4.15416834584903e-05, "learning_rate": 2.5763778042459773e-07, "loss": 0.0, "num_input_tokens_seen": 213353712, "step": 98915 }, { "epoch": 18.15378968618095, "grad_norm": 6.621809006901458e-05, "learning_rate": 2.573841137810973e-07, "loss": 0.0, "num_input_tokens_seen": 213365744, "step": 98920 }, { "epoch": 18.154707285740503, "grad_norm": 0.0017629187786951661, "learning_rate": 2.571305687783426e-07, "loss": 0.0, "num_input_tokens_seen": 213377488, "step": 98925 }, { "epoch": 18.155624885300057, "grad_norm": 7.172874757088721e-05, "learning_rate": 2.5687714542283914e-07, "loss": 0.0, "num_input_tokens_seen": 213389072, "step": 98930 }, { "epoch": 18.156542484859607, "grad_norm": 0.0001409739925293252, "learning_rate": 2.566238437210861e-07, "loss": 0.0, "num_input_tokens_seen": 213399920, "step": 98935 }, { "epoch": 18.15746008441916, "grad_norm": 8.989320755004883, "learning_rate": 2.5637066367957817e-07, "loss": 0.04, "num_input_tokens_seen": 213409968, "step": 98940 }, { "epoch": 18.158377683978713, "grad_norm": 0.0002000723616220057, "learning_rate": 2.561176053048126e-07, "loss": 0.0, "num_input_tokens_seen": 213420528, "step": 98945 }, { "epoch": 18.159295283538263, "grad_norm": 0.00013431053957901895, "learning_rate": 2.558646686032773e-07, "loss": 0.0, "num_input_tokens_seen": 213431536, "step": 98950 }, { "epoch": 18.160212883097817, "grad_norm": 0.0020736134611070156, "learning_rate": 2.556118535814606e-07, "loss": 0.0, "num_input_tokens_seen": 213441744, "step": 98955 }, { "epoch": 18.16113048265737, "grad_norm": 0.0002778928610496223, "learning_rate": 2.553591602458461e-07, "loss": 0.0, "num_input_tokens_seen": 213452528, "step": 98960 }, { "epoch": 18.16204808221692, "grad_norm": 0.0012293992331251502, "learning_rate": 2.551065886029164e-07, "loss": 0.0, "num_input_tokens_seen": 213463152, "step": 98965 }, { "epoch": 18.162965681776473, "grad_norm": 0.00017747835954651237, "learning_rate": 2.548541386591491e-07, "loss": 0.0, "num_input_tokens_seen": 213475600, "step": 98970 }, { "epoch": 18.163883281336027, "grad_norm": 0.00034746865276247263, "learning_rate": 2.546018104210185e-07, "loss": 0.0, "num_input_tokens_seen": 213486864, "step": 98975 }, { "epoch": 18.164800880895577, "grad_norm": 6.741109973518178e-05, "learning_rate": 2.5434960389499665e-07, "loss": 0.0, "num_input_tokens_seen": 213498000, "step": 98980 }, { "epoch": 18.16571848045513, "grad_norm": 5.613305984297767e-05, "learning_rate": 2.5409751908755163e-07, "loss": 0.0, "num_input_tokens_seen": 213508880, "step": 98985 }, { "epoch": 18.166636080014683, "grad_norm": 8.270193211501464e-05, "learning_rate": 2.538455560051506e-07, "loss": 0.0, "num_input_tokens_seen": 213519696, "step": 98990 }, { "epoch": 18.167553679574233, "grad_norm": 0.002506978577002883, "learning_rate": 2.535937146542555e-07, "loss": 0.0, "num_input_tokens_seen": 213531696, "step": 98995 }, { "epoch": 18.168471279133787, "grad_norm": 7.402292249025777e-05, "learning_rate": 2.533419950413246e-07, "loss": 0.0, "num_input_tokens_seen": 213543216, "step": 99000 }, { "epoch": 18.16938887869334, "grad_norm": 0.00015976186841726303, "learning_rate": 2.530903971728155e-07, "loss": 0.0, "num_input_tokens_seen": 213554928, "step": 99005 }, { "epoch": 18.17030647825289, "grad_norm": 0.0005340026109479368, "learning_rate": 2.5283892105518016e-07, "loss": 0.0, "num_input_tokens_seen": 213565712, "step": 99010 }, { "epoch": 18.171224077812443, "grad_norm": 0.0001966391719179228, "learning_rate": 2.5258756669486906e-07, "loss": 0.0, "num_input_tokens_seen": 213575408, "step": 99015 }, { "epoch": 18.172141677371997, "grad_norm": 9.477714047534391e-05, "learning_rate": 2.5233633409832923e-07, "loss": 0.0, "num_input_tokens_seen": 213584240, "step": 99020 }, { "epoch": 18.173059276931546, "grad_norm": 0.0006069772061891854, "learning_rate": 2.520852232720039e-07, "loss": 0.0, "num_input_tokens_seen": 213594992, "step": 99025 }, { "epoch": 18.1739768764911, "grad_norm": 0.00037875783164054155, "learning_rate": 2.5183423422233456e-07, "loss": 0.0, "num_input_tokens_seen": 213606480, "step": 99030 }, { "epoch": 18.174894476050653, "grad_norm": 0.00010930559801636264, "learning_rate": 2.515833669557577e-07, "loss": 0.0, "num_input_tokens_seen": 213616880, "step": 99035 }, { "epoch": 18.175812075610203, "grad_norm": 0.0003766132576856762, "learning_rate": 2.5133262147870876e-07, "loss": 0.0, "num_input_tokens_seen": 213627344, "step": 99040 }, { "epoch": 18.176729675169756, "grad_norm": 0.00026961261755786836, "learning_rate": 2.51081997797617e-07, "loss": 0.0, "num_input_tokens_seen": 213638736, "step": 99045 }, { "epoch": 18.17764727472931, "grad_norm": 7.695495878579095e-05, "learning_rate": 2.5083149591891285e-07, "loss": 0.0, "num_input_tokens_seen": 213649584, "step": 99050 }, { "epoch": 18.17856487428886, "grad_norm": 0.0008912608027458191, "learning_rate": 2.5058111584902065e-07, "loss": 0.0, "num_input_tokens_seen": 213660080, "step": 99055 }, { "epoch": 18.179482473848413, "grad_norm": 0.0009177632164210081, "learning_rate": 2.503308575943608e-07, "loss": 0.0, "num_input_tokens_seen": 213671472, "step": 99060 }, { "epoch": 18.180400073407966, "grad_norm": 0.0005112665821798146, "learning_rate": 2.5008072116135374e-07, "loss": 0.0, "num_input_tokens_seen": 213682224, "step": 99065 }, { "epoch": 18.181317672967516, "grad_norm": 0.0003929012455046177, "learning_rate": 2.498307065564143e-07, "loss": 0.0, "num_input_tokens_seen": 213693424, "step": 99070 }, { "epoch": 18.18223527252707, "grad_norm": 7.998257933650166e-05, "learning_rate": 2.495808137859551e-07, "loss": 0.0, "num_input_tokens_seen": 213704464, "step": 99075 }, { "epoch": 18.183152872086623, "grad_norm": 5.461345790536143e-05, "learning_rate": 2.4933104285638556e-07, "loss": 0.0, "num_input_tokens_seen": 213714416, "step": 99080 }, { "epoch": 18.184070471646173, "grad_norm": 0.00011241438915021718, "learning_rate": 2.490813937741121e-07, "loss": 0.0001, "num_input_tokens_seen": 213725488, "step": 99085 }, { "epoch": 18.184988071205726, "grad_norm": 0.00012958364095538855, "learning_rate": 2.4883186654553806e-07, "loss": 0.0, "num_input_tokens_seen": 213735920, "step": 99090 }, { "epoch": 18.18590567076528, "grad_norm": 6.462073361035436e-05, "learning_rate": 2.4858246117706207e-07, "loss": 0.0, "num_input_tokens_seen": 213746672, "step": 99095 }, { "epoch": 18.18682327032483, "grad_norm": 0.00010927878611255437, "learning_rate": 2.483331776750825e-07, "loss": 0.0, "num_input_tokens_seen": 213757008, "step": 99100 }, { "epoch": 18.187740869884383, "grad_norm": 0.00016058165056165308, "learning_rate": 2.480840160459924e-07, "loss": 0.0, "num_input_tokens_seen": 213767632, "step": 99105 }, { "epoch": 18.188658469443936, "grad_norm": 0.0001018857947201468, "learning_rate": 2.478349762961818e-07, "loss": 0.0, "num_input_tokens_seen": 213777744, "step": 99110 }, { "epoch": 18.189576069003486, "grad_norm": 8.196826092898846e-05, "learning_rate": 2.4758605843203996e-07, "loss": 0.0, "num_input_tokens_seen": 213787984, "step": 99115 }, { "epoch": 18.19049366856304, "grad_norm": 0.0007207125308923423, "learning_rate": 2.473372624599496e-07, "loss": 0.0, "num_input_tokens_seen": 213798416, "step": 99120 }, { "epoch": 18.191411268122593, "grad_norm": 0.001801403472200036, "learning_rate": 2.470885883862928e-07, "loss": 0.0, "num_input_tokens_seen": 213808688, "step": 99125 }, { "epoch": 18.192328867682143, "grad_norm": 0.00028972383006475866, "learning_rate": 2.468400362174467e-07, "loss": 0.0, "num_input_tokens_seen": 213818928, "step": 99130 }, { "epoch": 18.193246467241696, "grad_norm": 0.000283918809145689, "learning_rate": 2.4659160595978784e-07, "loss": 0.0, "num_input_tokens_seen": 213829808, "step": 99135 }, { "epoch": 18.19416406680125, "grad_norm": 0.0002960765268653631, "learning_rate": 2.4634329761968667e-07, "loss": 0.0, "num_input_tokens_seen": 213840112, "step": 99140 }, { "epoch": 18.1950816663608, "grad_norm": 0.00023543172574136406, "learning_rate": 2.46095111203512e-07, "loss": 0.0, "num_input_tokens_seen": 213852144, "step": 99145 }, { "epoch": 18.195999265920353, "grad_norm": 0.00020997585670556873, "learning_rate": 2.458470467176305e-07, "loss": 0.0, "num_input_tokens_seen": 213863088, "step": 99150 }, { "epoch": 18.196916865479906, "grad_norm": 0.00025673850905150175, "learning_rate": 2.4559910416840413e-07, "loss": 0.0, "num_input_tokens_seen": 213872624, "step": 99155 }, { "epoch": 18.197834465039456, "grad_norm": 6.547055090777576e-05, "learning_rate": 2.4535128356219075e-07, "loss": 0.0, "num_input_tokens_seen": 213884016, "step": 99160 }, { "epoch": 18.19875206459901, "grad_norm": 0.0008096602978184819, "learning_rate": 2.451035849053496e-07, "loss": 0.0, "num_input_tokens_seen": 213895056, "step": 99165 }, { "epoch": 18.199669664158563, "grad_norm": 0.01315884105861187, "learning_rate": 2.4485600820423114e-07, "loss": 0.0, "num_input_tokens_seen": 213905520, "step": 99170 }, { "epoch": 18.200587263718113, "grad_norm": 8.254711428890005e-05, "learning_rate": 2.4460855346518706e-07, "loss": 0.0, "num_input_tokens_seen": 213915568, "step": 99175 }, { "epoch": 18.201504863277666, "grad_norm": 4.949080175720155e-05, "learning_rate": 2.4436122069456223e-07, "loss": 0.0, "num_input_tokens_seen": 213927088, "step": 99180 }, { "epoch": 18.20242246283722, "grad_norm": 8.677349251229316e-05, "learning_rate": 2.4411400989870213e-07, "loss": 0.0, "num_input_tokens_seen": 213938416, "step": 99185 }, { "epoch": 18.20334006239677, "grad_norm": 7.522029045503587e-05, "learning_rate": 2.438669210839467e-07, "loss": 0.0, "num_input_tokens_seen": 213948944, "step": 99190 }, { "epoch": 18.204257661956323, "grad_norm": 0.00034146764664910734, "learning_rate": 2.4361995425663367e-07, "loss": 0.0, "num_input_tokens_seen": 213960176, "step": 99195 }, { "epoch": 18.205175261515876, "grad_norm": 6.369071343215182e-05, "learning_rate": 2.4337310942309734e-07, "loss": 0.0, "num_input_tokens_seen": 213969840, "step": 99200 }, { "epoch": 18.206092861075426, "grad_norm": 0.005803998094052076, "learning_rate": 2.4312638658966823e-07, "loss": 0.0, "num_input_tokens_seen": 213981776, "step": 99205 }, { "epoch": 18.20701046063498, "grad_norm": 4.652742427424528e-05, "learning_rate": 2.428797857626741e-07, "loss": 0.0, "num_input_tokens_seen": 213991536, "step": 99210 }, { "epoch": 18.207928060194533, "grad_norm": 0.00044602100388146937, "learning_rate": 2.4263330694844156e-07, "loss": 0.0, "num_input_tokens_seen": 214003472, "step": 99215 }, { "epoch": 18.208845659754083, "grad_norm": 0.0018686051480472088, "learning_rate": 2.423869501532916e-07, "loss": 0.0, "num_input_tokens_seen": 214013296, "step": 99220 }, { "epoch": 18.209763259313636, "grad_norm": 5.881855759071186e-05, "learning_rate": 2.42140715383542e-07, "loss": 0.0, "num_input_tokens_seen": 214023664, "step": 99225 }, { "epoch": 18.21068085887319, "grad_norm": 0.0001629723992664367, "learning_rate": 2.418946026455088e-07, "loss": 0.0, "num_input_tokens_seen": 214034544, "step": 99230 }, { "epoch": 18.21159845843274, "grad_norm": 0.0001002897770376876, "learning_rate": 2.416486119455053e-07, "loss": 0.0, "num_input_tokens_seen": 214046320, "step": 99235 }, { "epoch": 18.212516057992293, "grad_norm": 0.00021678497432731092, "learning_rate": 2.4140274328984025e-07, "loss": 0.0, "num_input_tokens_seen": 214057488, "step": 99240 }, { "epoch": 18.213433657551846, "grad_norm": 7.840293255867437e-05, "learning_rate": 2.411569966848193e-07, "loss": 0.0, "num_input_tokens_seen": 214069168, "step": 99245 }, { "epoch": 18.214351257111396, "grad_norm": 0.00015471734513994306, "learning_rate": 2.4091137213674564e-07, "loss": 0.0, "num_input_tokens_seen": 214080336, "step": 99250 }, { "epoch": 18.21526885667095, "grad_norm": 7.347243081312627e-05, "learning_rate": 2.4066586965191985e-07, "loss": 0.0, "num_input_tokens_seen": 214090320, "step": 99255 }, { "epoch": 18.216186456230503, "grad_norm": 4.5278251491254196e-05, "learning_rate": 2.404204892366385e-07, "loss": 0.0, "num_input_tokens_seen": 214101840, "step": 99260 }, { "epoch": 18.217104055790053, "grad_norm": 9.917718853102997e-05, "learning_rate": 2.4017523089719385e-07, "loss": 0.0, "num_input_tokens_seen": 214112336, "step": 99265 }, { "epoch": 18.218021655349606, "grad_norm": 0.0001160481697297655, "learning_rate": 2.399300946398786e-07, "loss": 0.0, "num_input_tokens_seen": 214121904, "step": 99270 }, { "epoch": 18.21893925490916, "grad_norm": 0.000955012917984277, "learning_rate": 2.3968508047097826e-07, "loss": 0.0, "num_input_tokens_seen": 214133136, "step": 99275 }, { "epoch": 18.21985685446871, "grad_norm": 9.542569023324177e-05, "learning_rate": 2.3944018839677784e-07, "loss": 0.0087, "num_input_tokens_seen": 214142704, "step": 99280 }, { "epoch": 18.220774454028263, "grad_norm": 0.01828308030962944, "learning_rate": 2.3919541842355843e-07, "loss": 0.0, "num_input_tokens_seen": 214152752, "step": 99285 }, { "epoch": 18.221692053587816, "grad_norm": 0.00019375588453840464, "learning_rate": 2.389507705575983e-07, "loss": 0.0, "num_input_tokens_seen": 214162576, "step": 99290 }, { "epoch": 18.222609653147366, "grad_norm": 6.565629155375063e-05, "learning_rate": 2.3870624480517134e-07, "loss": 0.0, "num_input_tokens_seen": 214172656, "step": 99295 }, { "epoch": 18.22352725270692, "grad_norm": 6.991664849920198e-05, "learning_rate": 2.3846184117255034e-07, "loss": 0.0, "num_input_tokens_seen": 214184208, "step": 99300 }, { "epoch": 18.224444852266473, "grad_norm": 5.532735667657107e-05, "learning_rate": 2.3821755966600357e-07, "loss": 0.0, "num_input_tokens_seen": 214196080, "step": 99305 }, { "epoch": 18.225362451826022, "grad_norm": 4.824903953704052e-05, "learning_rate": 2.3797340029179605e-07, "loss": 0.0, "num_input_tokens_seen": 214206160, "step": 99310 }, { "epoch": 18.226280051385576, "grad_norm": 0.00023140983830671757, "learning_rate": 2.3772936305618999e-07, "loss": 0.0, "num_input_tokens_seen": 214217776, "step": 99315 }, { "epoch": 18.22719765094513, "grad_norm": 9.137835149886087e-05, "learning_rate": 2.3748544796544537e-07, "loss": 0.0, "num_input_tokens_seen": 214227792, "step": 99320 }, { "epoch": 18.22811525050468, "grad_norm": 0.00029778011958114803, "learning_rate": 2.3724165502581774e-07, "loss": 0.0, "num_input_tokens_seen": 214238320, "step": 99325 }, { "epoch": 18.229032850064232, "grad_norm": 0.0042509823106229305, "learning_rate": 2.3699798424355936e-07, "loss": 0.0, "num_input_tokens_seen": 214248496, "step": 99330 }, { "epoch": 18.229950449623786, "grad_norm": 6.233346357475966e-05, "learning_rate": 2.367544356249213e-07, "loss": 0.0, "num_input_tokens_seen": 214259504, "step": 99335 }, { "epoch": 18.230868049183336, "grad_norm": 0.0005276441224850714, "learning_rate": 2.3651100917615021e-07, "loss": 0.0, "num_input_tokens_seen": 214270736, "step": 99340 }, { "epoch": 18.23178564874289, "grad_norm": 0.00010872221901081502, "learning_rate": 2.3626770490348782e-07, "loss": 0.0, "num_input_tokens_seen": 214281392, "step": 99345 }, { "epoch": 18.232703248302442, "grad_norm": 4.2628329538274556e-05, "learning_rate": 2.3602452281317634e-07, "loss": 0.0, "num_input_tokens_seen": 214292816, "step": 99350 }, { "epoch": 18.233620847861992, "grad_norm": 5.250621325103566e-05, "learning_rate": 2.3578146291145242e-07, "loss": 0.0, "num_input_tokens_seen": 214304528, "step": 99355 }, { "epoch": 18.234538447421546, "grad_norm": 0.0004556705243885517, "learning_rate": 2.3553852520455e-07, "loss": 0.0, "num_input_tokens_seen": 214315728, "step": 99360 }, { "epoch": 18.2354560469811, "grad_norm": 0.00019274026271887124, "learning_rate": 2.3529570969869963e-07, "loss": 0.0, "num_input_tokens_seen": 214326960, "step": 99365 }, { "epoch": 18.23637364654065, "grad_norm": 0.00015394431829918176, "learning_rate": 2.3505301640013022e-07, "loss": 0.0, "num_input_tokens_seen": 214338384, "step": 99370 }, { "epoch": 18.237291246100202, "grad_norm": 6.46934422547929e-05, "learning_rate": 2.3481044531506626e-07, "loss": 0.0, "num_input_tokens_seen": 214349328, "step": 99375 }, { "epoch": 18.238208845659756, "grad_norm": 0.0016218447126448154, "learning_rate": 2.3456799644972828e-07, "loss": 0.0, "num_input_tokens_seen": 214357776, "step": 99380 }, { "epoch": 18.239126445219306, "grad_norm": 0.00013397294969763607, "learning_rate": 2.3432566981033577e-07, "loss": 0.0, "num_input_tokens_seen": 214368368, "step": 99385 }, { "epoch": 18.24004404477886, "grad_norm": 8.776774484431371e-05, "learning_rate": 2.3408346540310379e-07, "loss": 0.0, "num_input_tokens_seen": 214380272, "step": 99390 }, { "epoch": 18.240961644338412, "grad_norm": 0.0001554423215566203, "learning_rate": 2.3384138323424455e-07, "loss": 0.0, "num_input_tokens_seen": 214391472, "step": 99395 }, { "epoch": 18.241879243897962, "grad_norm": 0.0003260847879573703, "learning_rate": 2.3359942330996644e-07, "loss": 0.0, "num_input_tokens_seen": 214401520, "step": 99400 }, { "epoch": 18.242796843457516, "grad_norm": 4.818233719561249e-05, "learning_rate": 2.3335758563647614e-07, "loss": 0.0, "num_input_tokens_seen": 214412304, "step": 99405 }, { "epoch": 18.24371444301707, "grad_norm": 6.757699884474277e-05, "learning_rate": 2.331158702199765e-07, "loss": 0.0, "num_input_tokens_seen": 214422736, "step": 99410 }, { "epoch": 18.24463204257662, "grad_norm": 0.0002393108734395355, "learning_rate": 2.32874277066667e-07, "loss": 0.0, "num_input_tokens_seen": 214433264, "step": 99415 }, { "epoch": 18.245549642136172, "grad_norm": 0.00013102155935484916, "learning_rate": 2.326328061827432e-07, "loss": 0.0, "num_input_tokens_seen": 214445584, "step": 99420 }, { "epoch": 18.246467241695726, "grad_norm": 0.0001247048785444349, "learning_rate": 2.3239145757439961e-07, "loss": 0.0, "num_input_tokens_seen": 214456080, "step": 99425 }, { "epoch": 18.247384841255275, "grad_norm": 6.155391020001844e-05, "learning_rate": 2.321502312478252e-07, "loss": 0.0, "num_input_tokens_seen": 214466288, "step": 99430 }, { "epoch": 18.24830244081483, "grad_norm": 6.0376136389095336e-05, "learning_rate": 2.3190912720920888e-07, "loss": 0.0, "num_input_tokens_seen": 214477712, "step": 99435 }, { "epoch": 18.249220040374382, "grad_norm": 0.00010276949615217745, "learning_rate": 2.316681454647335e-07, "loss": 0.0, "num_input_tokens_seen": 214489584, "step": 99440 }, { "epoch": 18.250137639933932, "grad_norm": 0.0003119056345894933, "learning_rate": 2.3142728602057962e-07, "loss": 0.0, "num_input_tokens_seen": 214499088, "step": 99445 }, { "epoch": 18.251055239493486, "grad_norm": 0.00034850931842811406, "learning_rate": 2.3118654888292458e-07, "loss": 0.0, "num_input_tokens_seen": 214510352, "step": 99450 }, { "epoch": 18.25197283905304, "grad_norm": 8.599906868766993e-05, "learning_rate": 2.3094593405794453e-07, "loss": 0.0, "num_input_tokens_seen": 214521616, "step": 99455 }, { "epoch": 18.25289043861259, "grad_norm": 0.0003588230174500495, "learning_rate": 2.3070544155180952e-07, "loss": 0.0, "num_input_tokens_seen": 214532944, "step": 99460 }, { "epoch": 18.253808038172142, "grad_norm": 0.00010903206566581503, "learning_rate": 2.3046507137068797e-07, "loss": 0.0, "num_input_tokens_seen": 214542512, "step": 99465 }, { "epoch": 18.254725637731696, "grad_norm": 9.217372280545533e-05, "learning_rate": 2.3022482352074548e-07, "loss": 0.0, "num_input_tokens_seen": 214553968, "step": 99470 }, { "epoch": 18.255643237291245, "grad_norm": 0.00026811478892341256, "learning_rate": 2.2998469800814382e-07, "loss": 0.0, "num_input_tokens_seen": 214563184, "step": 99475 }, { "epoch": 18.2565608368508, "grad_norm": 7.193444616859779e-05, "learning_rate": 2.2974469483904138e-07, "loss": 0.0, "num_input_tokens_seen": 214573744, "step": 99480 }, { "epoch": 18.257478436410352, "grad_norm": 5.455561404232867e-05, "learning_rate": 2.295048140195949e-07, "loss": 0.0, "num_input_tokens_seen": 214583472, "step": 99485 }, { "epoch": 18.258396035969902, "grad_norm": 0.00010145332635147497, "learning_rate": 2.292650555559567e-07, "loss": 0.0, "num_input_tokens_seen": 214593776, "step": 99490 }, { "epoch": 18.259313635529455, "grad_norm": 5.930122279096395e-05, "learning_rate": 2.2902541945427514e-07, "loss": 0.0, "num_input_tokens_seen": 214604048, "step": 99495 }, { "epoch": 18.26023123508901, "grad_norm": 6.751646287739277e-05, "learning_rate": 2.2878590572069702e-07, "loss": 0.0, "num_input_tokens_seen": 214614352, "step": 99500 }, { "epoch": 18.26114883464856, "grad_norm": 6.641674553975463e-05, "learning_rate": 2.2854651436136633e-07, "loss": 0.0, "num_input_tokens_seen": 214625968, "step": 99505 }, { "epoch": 18.262066434208112, "grad_norm": 6.456773553509265e-05, "learning_rate": 2.28307245382422e-07, "loss": 0.0, "num_input_tokens_seen": 214637168, "step": 99510 }, { "epoch": 18.262984033767665, "grad_norm": 6.069838855182752e-05, "learning_rate": 2.2806809879000136e-07, "loss": 0.0, "num_input_tokens_seen": 214648304, "step": 99515 }, { "epoch": 18.263901633327215, "grad_norm": 7.418230961775407e-05, "learning_rate": 2.278290745902384e-07, "loss": 0.0, "num_input_tokens_seen": 214659952, "step": 99520 }, { "epoch": 18.26481923288677, "grad_norm": 5.655019776895642e-05, "learning_rate": 2.2759017278926377e-07, "loss": 0.0, "num_input_tokens_seen": 214671440, "step": 99525 }, { "epoch": 18.265736832446322, "grad_norm": 0.00013733591185882688, "learning_rate": 2.2735139339320366e-07, "loss": 0.0, "num_input_tokens_seen": 214682384, "step": 99530 }, { "epoch": 18.266654432005872, "grad_norm": 0.0002054773212876171, "learning_rate": 2.2711273640818433e-07, "loss": 0.0, "num_input_tokens_seen": 214693488, "step": 99535 }, { "epoch": 18.267572031565425, "grad_norm": 0.002116314833983779, "learning_rate": 2.2687420184032583e-07, "loss": 0.0, "num_input_tokens_seen": 214704720, "step": 99540 }, { "epoch": 18.26848963112498, "grad_norm": 0.00010688660404412076, "learning_rate": 2.266357896957466e-07, "loss": 0.0, "num_input_tokens_seen": 214715216, "step": 99545 }, { "epoch": 18.26940723068453, "grad_norm": 0.0014303915668278933, "learning_rate": 2.263974999805607e-07, "loss": 0.0, "num_input_tokens_seen": 214725968, "step": 99550 }, { "epoch": 18.270324830244082, "grad_norm": 0.00017361062054987997, "learning_rate": 2.2615933270088098e-07, "loss": 0.0, "num_input_tokens_seen": 214737520, "step": 99555 }, { "epoch": 18.271242429803635, "grad_norm": 9.400337876286358e-05, "learning_rate": 2.259212878628153e-07, "loss": 0.0, "num_input_tokens_seen": 214748176, "step": 99560 }, { "epoch": 18.272160029363185, "grad_norm": 5.8631045249057934e-05, "learning_rate": 2.256833654724694e-07, "loss": 0.0, "num_input_tokens_seen": 214758224, "step": 99565 }, { "epoch": 18.27307762892274, "grad_norm": 4.1292994865216315e-05, "learning_rate": 2.254455655359461e-07, "loss": 0.0, "num_input_tokens_seen": 214769168, "step": 99570 }, { "epoch": 18.273995228482292, "grad_norm": 6.0989616031292826e-05, "learning_rate": 2.252078880593439e-07, "loss": 0.0, "num_input_tokens_seen": 214779152, "step": 99575 }, { "epoch": 18.274912828041842, "grad_norm": 7.855281728552654e-05, "learning_rate": 2.2497033304875903e-07, "loss": 0.0, "num_input_tokens_seen": 214789936, "step": 99580 }, { "epoch": 18.275830427601395, "grad_norm": 0.00013693369692191482, "learning_rate": 2.247329005102844e-07, "loss": 0.0, "num_input_tokens_seen": 214800368, "step": 99585 }, { "epoch": 18.27674802716095, "grad_norm": 7.699491106905043e-05, "learning_rate": 2.2449559045001012e-07, "loss": 0.0, "num_input_tokens_seen": 214810128, "step": 99590 }, { "epoch": 18.2776656267205, "grad_norm": 0.004409764427691698, "learning_rate": 2.2425840287402246e-07, "loss": 0.0, "num_input_tokens_seen": 214821296, "step": 99595 }, { "epoch": 18.278583226280052, "grad_norm": 5.867289291927591e-05, "learning_rate": 2.2402133778840484e-07, "loss": 0.0, "num_input_tokens_seen": 214832944, "step": 99600 }, { "epoch": 18.279500825839605, "grad_norm": 0.0010335920378565788, "learning_rate": 2.23784395199238e-07, "loss": 0.0, "num_input_tokens_seen": 214844112, "step": 99605 }, { "epoch": 18.280418425399155, "grad_norm": 6.62741731503047e-05, "learning_rate": 2.2354757511259927e-07, "loss": 0.0, "num_input_tokens_seen": 214854896, "step": 99610 }, { "epoch": 18.28133602495871, "grad_norm": 4.612886914401315e-05, "learning_rate": 2.2331087753456216e-07, "loss": 0.0, "num_input_tokens_seen": 214865936, "step": 99615 }, { "epoch": 18.282253624518262, "grad_norm": 0.0016618123045191169, "learning_rate": 2.2307430247119788e-07, "loss": 0.0, "num_input_tokens_seen": 214876912, "step": 99620 }, { "epoch": 18.28317122407781, "grad_norm": 8.961507410276681e-05, "learning_rate": 2.2283784992857383e-07, "loss": 0.0, "num_input_tokens_seen": 214888400, "step": 99625 }, { "epoch": 18.284088823637365, "grad_norm": 0.0017316179582849145, "learning_rate": 2.226015199127557e-07, "loss": 0.0, "num_input_tokens_seen": 214899760, "step": 99630 }, { "epoch": 18.28500642319692, "grad_norm": 5.997473999741487e-05, "learning_rate": 2.2236531242980364e-07, "loss": 0.0, "num_input_tokens_seen": 214910160, "step": 99635 }, { "epoch": 18.28592402275647, "grad_norm": 8.3683873526752e-05, "learning_rate": 2.2212922748577725e-07, "loss": 0.0, "num_input_tokens_seen": 214921552, "step": 99640 }, { "epoch": 18.28684162231602, "grad_norm": 0.00038967534783296287, "learning_rate": 2.2189326508673114e-07, "loss": 0.0, "num_input_tokens_seen": 214932720, "step": 99645 }, { "epoch": 18.287759221875575, "grad_norm": 0.0002875776553992182, "learning_rate": 2.2165742523871603e-07, "loss": 0.0, "num_input_tokens_seen": 214942480, "step": 99650 }, { "epoch": 18.288676821435125, "grad_norm": 4.3317315430613235e-05, "learning_rate": 2.2142170794778374e-07, "loss": 0.0, "num_input_tokens_seen": 214953328, "step": 99655 }, { "epoch": 18.28959442099468, "grad_norm": 0.0001240057754330337, "learning_rate": 2.211861132199783e-07, "loss": 0.0, "num_input_tokens_seen": 214963728, "step": 99660 }, { "epoch": 18.29051202055423, "grad_norm": 0.0004577704821713269, "learning_rate": 2.2095064106134157e-07, "loss": 0.0, "num_input_tokens_seen": 214974512, "step": 99665 }, { "epoch": 18.29142962011378, "grad_norm": 0.0002048637979896739, "learning_rate": 2.207152914779148e-07, "loss": 0.0, "num_input_tokens_seen": 214984880, "step": 99670 }, { "epoch": 18.292347219673335, "grad_norm": 0.00035451375879347324, "learning_rate": 2.2048006447573377e-07, "loss": 0.0, "num_input_tokens_seen": 214996176, "step": 99675 }, { "epoch": 18.29326481923289, "grad_norm": 6.192890577949584e-05, "learning_rate": 2.202449600608314e-07, "loss": 0.0, "num_input_tokens_seen": 215006480, "step": 99680 }, { "epoch": 18.29418241879244, "grad_norm": 0.00039719470078125596, "learning_rate": 2.200099782392373e-07, "loss": 0.0, "num_input_tokens_seen": 215018256, "step": 99685 }, { "epoch": 18.29510001835199, "grad_norm": 0.00012710476585198194, "learning_rate": 2.1977511901697947e-07, "loss": 0.0, "num_input_tokens_seen": 215028624, "step": 99690 }, { "epoch": 18.296017617911545, "grad_norm": 3.536734948283993e-05, "learning_rate": 2.195403824000808e-07, "loss": 0.0, "num_input_tokens_seen": 215039760, "step": 99695 }, { "epoch": 18.296935217471095, "grad_norm": 0.0007925858371891081, "learning_rate": 2.1930576839456208e-07, "loss": 0.0, "num_input_tokens_seen": 215050832, "step": 99700 }, { "epoch": 18.29785281703065, "grad_norm": 5.752473225584254e-05, "learning_rate": 2.190712770064418e-07, "loss": 0.0, "num_input_tokens_seen": 215062640, "step": 99705 }, { "epoch": 18.2987704165902, "grad_norm": 0.0008581126458011568, "learning_rate": 2.1883690824173354e-07, "loss": 0.0, "num_input_tokens_seen": 215073840, "step": 99710 }, { "epoch": 18.29968801614975, "grad_norm": 4.93868938065134e-05, "learning_rate": 2.18602662106448e-07, "loss": 0.0, "num_input_tokens_seen": 215085808, "step": 99715 }, { "epoch": 18.300605615709305, "grad_norm": 9.563152707414702e-05, "learning_rate": 2.1836853860659312e-07, "loss": 0.0, "num_input_tokens_seen": 215095792, "step": 99720 }, { "epoch": 18.30152321526886, "grad_norm": 0.00027068430790677667, "learning_rate": 2.1813453774817528e-07, "loss": 0.0, "num_input_tokens_seen": 215105584, "step": 99725 }, { "epoch": 18.302440814828408, "grad_norm": 7.081862713675946e-05, "learning_rate": 2.179006595371952e-07, "loss": 0.0, "num_input_tokens_seen": 215116016, "step": 99730 }, { "epoch": 18.30335841438796, "grad_norm": 8.114874799503013e-05, "learning_rate": 2.1766690397965084e-07, "loss": 0.0, "num_input_tokens_seen": 215125776, "step": 99735 }, { "epoch": 18.304276013947515, "grad_norm": 0.00018365563300903887, "learning_rate": 2.174332710815391e-07, "loss": 0.0, "num_input_tokens_seen": 215136816, "step": 99740 }, { "epoch": 18.305193613507065, "grad_norm": 0.0002667877124622464, "learning_rate": 2.1719976084885186e-07, "loss": 0.0, "num_input_tokens_seen": 215147216, "step": 99745 }, { "epoch": 18.306111213066618, "grad_norm": 7.873374124756083e-05, "learning_rate": 2.1696637328757707e-07, "loss": 0.0, "num_input_tokens_seen": 215158096, "step": 99750 }, { "epoch": 18.30702881262617, "grad_norm": 0.0012015009997412562, "learning_rate": 2.1673310840370277e-07, "loss": 0.0, "num_input_tokens_seen": 215167984, "step": 99755 }, { "epoch": 18.30794641218572, "grad_norm": 0.0001333175168838352, "learning_rate": 2.164999662032108e-07, "loss": 0.0, "num_input_tokens_seen": 215177328, "step": 99760 }, { "epoch": 18.308864011745275, "grad_norm": 0.0005466404254548252, "learning_rate": 2.1626694669208082e-07, "loss": 0.0, "num_input_tokens_seen": 215187888, "step": 99765 }, { "epoch": 18.309781611304828, "grad_norm": 5.9883332141907886e-05, "learning_rate": 2.1603404987628918e-07, "loss": 0.0, "num_input_tokens_seen": 215199696, "step": 99770 }, { "epoch": 18.310699210864378, "grad_norm": 0.00013718636182602495, "learning_rate": 2.1580127576180998e-07, "loss": 0.0, "num_input_tokens_seen": 215211376, "step": 99775 }, { "epoch": 18.31161681042393, "grad_norm": 0.0003753137425519526, "learning_rate": 2.1556862435461344e-07, "loss": 0.0, "num_input_tokens_seen": 215221936, "step": 99780 }, { "epoch": 18.312534409983485, "grad_norm": 8.647845970699564e-05, "learning_rate": 2.1533609566066594e-07, "loss": 0.0, "num_input_tokens_seen": 215231248, "step": 99785 }, { "epoch": 18.313452009543035, "grad_norm": 5.1136132242390886e-05, "learning_rate": 2.1510368968593266e-07, "loss": 0.0, "num_input_tokens_seen": 215242512, "step": 99790 }, { "epoch": 18.314369609102588, "grad_norm": 7.845120853744447e-05, "learning_rate": 2.1487140643637328e-07, "loss": 0.0, "num_input_tokens_seen": 215253872, "step": 99795 }, { "epoch": 18.31528720866214, "grad_norm": 5.248725938145071e-05, "learning_rate": 2.1463924591794581e-07, "loss": 0.0, "num_input_tokens_seen": 215264080, "step": 99800 }, { "epoch": 18.31620480822169, "grad_norm": 8.109439659165218e-05, "learning_rate": 2.144072081366061e-07, "loss": 0.0, "num_input_tokens_seen": 215275088, "step": 99805 }, { "epoch": 18.317122407781245, "grad_norm": 6.0376976762199774e-05, "learning_rate": 2.1417529309830376e-07, "loss": 0.0, "num_input_tokens_seen": 215285648, "step": 99810 }, { "epoch": 18.318040007340798, "grad_norm": 7.925173122202978e-05, "learning_rate": 2.1394350080898795e-07, "loss": 0.0, "num_input_tokens_seen": 215295664, "step": 99815 }, { "epoch": 18.318957606900348, "grad_norm": 8.905095455702394e-05, "learning_rate": 2.1371183127460337e-07, "loss": 0.0, "num_input_tokens_seen": 215306352, "step": 99820 }, { "epoch": 18.3198752064599, "grad_norm": 0.00029153216746635735, "learning_rate": 2.134802845010925e-07, "loss": 0.0, "num_input_tokens_seen": 215317296, "step": 99825 }, { "epoch": 18.320792806019455, "grad_norm": 0.00011632392124738544, "learning_rate": 2.1324886049439442e-07, "loss": 0.0, "num_input_tokens_seen": 215329488, "step": 99830 }, { "epoch": 18.321710405579005, "grad_norm": 3.706523420987651e-05, "learning_rate": 2.1301755926044386e-07, "loss": 0.0, "num_input_tokens_seen": 215340592, "step": 99835 }, { "epoch": 18.322628005138558, "grad_norm": 0.00025624310364946723, "learning_rate": 2.127863808051739e-07, "loss": 0.0, "num_input_tokens_seen": 215349616, "step": 99840 }, { "epoch": 18.32354560469811, "grad_norm": 0.00015505017654504627, "learning_rate": 2.1255532513451304e-07, "loss": 0.0, "num_input_tokens_seen": 215360464, "step": 99845 }, { "epoch": 18.32446320425766, "grad_norm": 0.0014570117928087711, "learning_rate": 2.1232439225438883e-07, "loss": 0.0, "num_input_tokens_seen": 215369808, "step": 99850 }, { "epoch": 18.325380803817215, "grad_norm": 6.363566353684291e-05, "learning_rate": 2.1209358217072374e-07, "loss": 0.0, "num_input_tokens_seen": 215379600, "step": 99855 }, { "epoch": 18.326298403376768, "grad_norm": 7.838696183171123e-05, "learning_rate": 2.1186289488943746e-07, "loss": 0.0, "num_input_tokens_seen": 215389808, "step": 99860 }, { "epoch": 18.327216002936318, "grad_norm": 0.0002456664515193552, "learning_rate": 2.1163233041644749e-07, "loss": 0.0, "num_input_tokens_seen": 215400624, "step": 99865 }, { "epoch": 18.32813360249587, "grad_norm": 7.739625289104879e-05, "learning_rate": 2.1140188875766575e-07, "loss": 0.0, "num_input_tokens_seen": 215410416, "step": 99870 }, { "epoch": 18.329051202055425, "grad_norm": 0.00012738992518279701, "learning_rate": 2.1117156991900534e-07, "loss": 0.0, "num_input_tokens_seen": 215421264, "step": 99875 }, { "epoch": 18.329968801614974, "grad_norm": 5.358878115657717e-05, "learning_rate": 2.1094137390637148e-07, "loss": 0.0, "num_input_tokens_seen": 215432208, "step": 99880 }, { "epoch": 18.330886401174528, "grad_norm": 5.113684164825827e-05, "learning_rate": 2.1071130072566836e-07, "loss": 0.0, "num_input_tokens_seen": 215442160, "step": 99885 }, { "epoch": 18.33180400073408, "grad_norm": 0.0002137753035640344, "learning_rate": 2.1048135038279848e-07, "loss": 0.0, "num_input_tokens_seen": 215452496, "step": 99890 }, { "epoch": 18.33272160029363, "grad_norm": 0.0005339835770428181, "learning_rate": 2.102515228836588e-07, "loss": 0.0, "num_input_tokens_seen": 215462832, "step": 99895 }, { "epoch": 18.333639199853184, "grad_norm": 3.655787804746069e-05, "learning_rate": 2.1002181823414458e-07, "loss": 0.0, "num_input_tokens_seen": 215473936, "step": 99900 }, { "epoch": 18.334556799412738, "grad_norm": 0.0013214141363278031, "learning_rate": 2.0979223644014557e-07, "loss": 0.0, "num_input_tokens_seen": 215484048, "step": 99905 }, { "epoch": 18.335474398972288, "grad_norm": 0.00012813530338462442, "learning_rate": 2.0956277750755262e-07, "loss": 0.0, "num_input_tokens_seen": 215494864, "step": 99910 }, { "epoch": 18.33639199853184, "grad_norm": 6.353702337946743e-05, "learning_rate": 2.0933344144224988e-07, "loss": 0.0, "num_input_tokens_seen": 215506608, "step": 99915 }, { "epoch": 18.337309598091394, "grad_norm": 0.0007254562806338072, "learning_rate": 2.0910422825011877e-07, "loss": 0.0, "num_input_tokens_seen": 215516976, "step": 99920 }, { "epoch": 18.338227197650944, "grad_norm": 0.0002567184856161475, "learning_rate": 2.088751379370396e-07, "loss": 0.0, "num_input_tokens_seen": 215527376, "step": 99925 }, { "epoch": 18.339144797210498, "grad_norm": 0.0026752904523164034, "learning_rate": 2.086461705088877e-07, "loss": 0.0, "num_input_tokens_seen": 215538288, "step": 99930 }, { "epoch": 18.34006239677005, "grad_norm": 0.00018978452135343105, "learning_rate": 2.0841732597153498e-07, "loss": 0.0, "num_input_tokens_seen": 215549776, "step": 99935 }, { "epoch": 18.3409799963296, "grad_norm": 0.0005501401610672474, "learning_rate": 2.0818860433085232e-07, "loss": 0.0, "num_input_tokens_seen": 215559792, "step": 99940 }, { "epoch": 18.341897595889154, "grad_norm": 0.0015266913687810302, "learning_rate": 2.0796000559270501e-07, "loss": 0.0, "num_input_tokens_seen": 215570192, "step": 99945 }, { "epoch": 18.342815195448708, "grad_norm": 0.0005259895115159452, "learning_rate": 2.0773152976295673e-07, "loss": 0.0, "num_input_tokens_seen": 215580400, "step": 99950 }, { "epoch": 18.343732795008258, "grad_norm": 6.312947516562417e-05, "learning_rate": 2.0750317684746669e-07, "loss": 0.0, "num_input_tokens_seen": 215590640, "step": 99955 }, { "epoch": 18.34465039456781, "grad_norm": 0.00011367849947419018, "learning_rate": 2.072749468520935e-07, "loss": 0.0, "num_input_tokens_seen": 215602704, "step": 99960 }, { "epoch": 18.345567994127364, "grad_norm": 0.0001057838526321575, "learning_rate": 2.0704683978268913e-07, "loss": 0.0, "num_input_tokens_seen": 215614032, "step": 99965 }, { "epoch": 18.346485593686914, "grad_norm": 0.0001731941447360441, "learning_rate": 2.068188556451045e-07, "loss": 0.0, "num_input_tokens_seen": 215624912, "step": 99970 }, { "epoch": 18.347403193246468, "grad_norm": 5.370075086830184e-05, "learning_rate": 2.0659099444518827e-07, "loss": 0.0, "num_input_tokens_seen": 215636912, "step": 99975 }, { "epoch": 18.34832079280602, "grad_norm": 4.932763840770349e-05, "learning_rate": 2.0636325618878406e-07, "loss": 0.0, "num_input_tokens_seen": 215647504, "step": 99980 }, { "epoch": 18.34923839236557, "grad_norm": 0.00024184546782635152, "learning_rate": 2.061356408817322e-07, "loss": 0.0, "num_input_tokens_seen": 215658640, "step": 99985 }, { "epoch": 18.350155991925124, "grad_norm": 7.482521323254332e-05, "learning_rate": 2.0590814852987194e-07, "loss": 0.0, "num_input_tokens_seen": 215670544, "step": 99990 }, { "epoch": 18.351073591484678, "grad_norm": 0.006871453952044249, "learning_rate": 2.056807791390375e-07, "loss": 0.0, "num_input_tokens_seen": 215680336, "step": 99995 }, { "epoch": 18.351991191044227, "grad_norm": 0.00020297130686230958, "learning_rate": 2.054535327150603e-07, "loss": 0.0, "num_input_tokens_seen": 215690736, "step": 100000 }, { "epoch": 18.35290879060378, "grad_norm": 5.511146810022183e-05, "learning_rate": 2.0522640926376846e-07, "loss": 0.0, "num_input_tokens_seen": 215702128, "step": 100005 }, { "epoch": 18.353826390163334, "grad_norm": 4.645478111342527e-05, "learning_rate": 2.04999408790989e-07, "loss": 0.0, "num_input_tokens_seen": 215712368, "step": 100010 }, { "epoch": 18.354743989722884, "grad_norm": 0.0001612362830201164, "learning_rate": 2.0477253130254283e-07, "loss": 0.0, "num_input_tokens_seen": 215724176, "step": 100015 }, { "epoch": 18.355661589282438, "grad_norm": 0.000208668177947402, "learning_rate": 2.045457768042486e-07, "loss": 0.0, "num_input_tokens_seen": 215735536, "step": 100020 }, { "epoch": 18.35657918884199, "grad_norm": 5.421398236649111e-05, "learning_rate": 2.0431914530192388e-07, "loss": 0.0, "num_input_tokens_seen": 215745744, "step": 100025 }, { "epoch": 18.35749678840154, "grad_norm": 0.00011335669114487246, "learning_rate": 2.0409263680138015e-07, "loss": 0.0, "num_input_tokens_seen": 215756848, "step": 100030 }, { "epoch": 18.358414387961094, "grad_norm": 0.0002919340622611344, "learning_rate": 2.0386625130842774e-07, "loss": 0.0, "num_input_tokens_seen": 215768176, "step": 100035 }, { "epoch": 18.359331987520648, "grad_norm": 3.608272527344525e-05, "learning_rate": 2.0363998882887147e-07, "loss": 0.0, "num_input_tokens_seen": 215779568, "step": 100040 }, { "epoch": 18.360249587080197, "grad_norm": 4.550486846710555e-05, "learning_rate": 2.0341384936851672e-07, "loss": 0.0, "num_input_tokens_seen": 215790448, "step": 100045 }, { "epoch": 18.36116718663975, "grad_norm": 0.0034669034648686647, "learning_rate": 2.0318783293316268e-07, "loss": 0.0, "num_input_tokens_seen": 215801232, "step": 100050 }, { "epoch": 18.362084786199304, "grad_norm": 5.0210757763125e-05, "learning_rate": 2.0296193952860643e-07, "loss": 0.0, "num_input_tokens_seen": 215812688, "step": 100055 }, { "epoch": 18.363002385758854, "grad_norm": 5.975612657493912e-05, "learning_rate": 2.0273616916064165e-07, "loss": 0.0, "num_input_tokens_seen": 215824496, "step": 100060 }, { "epoch": 18.363919985318407, "grad_norm": 0.0003358875692356378, "learning_rate": 2.0251052183505814e-07, "loss": 0.0, "num_input_tokens_seen": 215835824, "step": 100065 }, { "epoch": 18.36483758487796, "grad_norm": 4.7298130084527656e-05, "learning_rate": 2.022849975576452e-07, "loss": 0.0, "num_input_tokens_seen": 215846384, "step": 100070 }, { "epoch": 18.36575518443751, "grad_norm": 0.0024797108490020037, "learning_rate": 2.020595963341865e-07, "loss": 0.0, "num_input_tokens_seen": 215855888, "step": 100075 }, { "epoch": 18.366672783997064, "grad_norm": 0.0001212431670865044, "learning_rate": 2.0183431817046238e-07, "loss": 0.0, "num_input_tokens_seen": 215867216, "step": 100080 }, { "epoch": 18.367590383556617, "grad_norm": 0.00013366997882258147, "learning_rate": 2.016091630722522e-07, "loss": 0.0, "num_input_tokens_seen": 215877520, "step": 100085 }, { "epoch": 18.368507983116167, "grad_norm": 5.456713552121073e-05, "learning_rate": 2.0138413104532904e-07, "loss": 0.0, "num_input_tokens_seen": 215889040, "step": 100090 }, { "epoch": 18.36942558267572, "grad_norm": 0.001080330228433013, "learning_rate": 2.0115922209546667e-07, "loss": 0.0, "num_input_tokens_seen": 215899824, "step": 100095 }, { "epoch": 18.370343182235274, "grad_norm": 0.0001900269417092204, "learning_rate": 2.0093443622843267e-07, "loss": 0.0, "num_input_tokens_seen": 215910896, "step": 100100 }, { "epoch": 18.371260781794824, "grad_norm": 6.999557081144303e-05, "learning_rate": 2.0070977344999186e-07, "loss": 0.0, "num_input_tokens_seen": 215921904, "step": 100105 }, { "epoch": 18.372178381354377, "grad_norm": 5.4277032177196816e-05, "learning_rate": 2.0048523376590745e-07, "loss": 0.0001, "num_input_tokens_seen": 215931504, "step": 100110 }, { "epoch": 18.37309598091393, "grad_norm": 0.000982925877906382, "learning_rate": 2.0026081718193867e-07, "loss": 0.0, "num_input_tokens_seen": 215943376, "step": 100115 }, { "epoch": 18.37401358047348, "grad_norm": 5.1816172344842926e-05, "learning_rate": 2.0003652370383985e-07, "loss": 0.0, "num_input_tokens_seen": 215954000, "step": 100120 }, { "epoch": 18.374931180033034, "grad_norm": 4.7862038627499714e-05, "learning_rate": 1.9981235333736637e-07, "loss": 0.0, "num_input_tokens_seen": 215965040, "step": 100125 }, { "epoch": 18.375848779592587, "grad_norm": 0.00011133919178973883, "learning_rate": 1.9958830608826586e-07, "loss": 0.0, "num_input_tokens_seen": 215974992, "step": 100130 }, { "epoch": 18.376766379152137, "grad_norm": 8.412003080593422e-05, "learning_rate": 1.9936438196228535e-07, "loss": 0.0, "num_input_tokens_seen": 215986128, "step": 100135 }, { "epoch": 18.37768397871169, "grad_norm": 6.653221498709172e-05, "learning_rate": 1.9914058096516753e-07, "loss": 0.0, "num_input_tokens_seen": 215998000, "step": 100140 }, { "epoch": 18.378601578271244, "grad_norm": 6.467157800216228e-05, "learning_rate": 1.9891690310265388e-07, "loss": 0.0, "num_input_tokens_seen": 216009936, "step": 100145 }, { "epoch": 18.379519177830794, "grad_norm": 0.00013358608703128994, "learning_rate": 1.9869334838048038e-07, "loss": 0.0, "num_input_tokens_seen": 216021104, "step": 100150 }, { "epoch": 18.380436777390347, "grad_norm": 0.00022005820937920362, "learning_rate": 1.9846991680438078e-07, "loss": 0.0, "num_input_tokens_seen": 216031632, "step": 100155 }, { "epoch": 18.3813543769499, "grad_norm": 0.00016599784430582076, "learning_rate": 1.9824660838008658e-07, "loss": 0.0, "num_input_tokens_seen": 216043248, "step": 100160 }, { "epoch": 18.38227197650945, "grad_norm": 9.422960283700377e-05, "learning_rate": 1.980234231133249e-07, "loss": 0.0, "num_input_tokens_seen": 216052880, "step": 100165 }, { "epoch": 18.383189576069004, "grad_norm": 6.296681385720149e-05, "learning_rate": 1.9780036100981946e-07, "loss": 0.0, "num_input_tokens_seen": 216063856, "step": 100170 }, { "epoch": 18.384107175628557, "grad_norm": 0.0007599270902574062, "learning_rate": 1.9757742207529296e-07, "loss": 0.0, "num_input_tokens_seen": 216073872, "step": 100175 }, { "epoch": 18.385024775188107, "grad_norm": 7.19065938028507e-05, "learning_rate": 1.973546063154619e-07, "loss": 0.0, "num_input_tokens_seen": 216085168, "step": 100180 }, { "epoch": 18.38594237474766, "grad_norm": 0.0001668614859227091, "learning_rate": 1.9713191373604225e-07, "loss": 0.0, "num_input_tokens_seen": 216096752, "step": 100185 }, { "epoch": 18.386859974307214, "grad_norm": 4.987585998605937e-05, "learning_rate": 1.9690934434274445e-07, "loss": 0.0, "num_input_tokens_seen": 216106864, "step": 100190 }, { "epoch": 18.387777573866764, "grad_norm": 0.00018934308900497854, "learning_rate": 1.966868981412784e-07, "loss": 0.0, "num_input_tokens_seen": 216116976, "step": 100195 }, { "epoch": 18.388695173426317, "grad_norm": 0.00019200211681891233, "learning_rate": 1.9646457513734896e-07, "loss": 0.0, "num_input_tokens_seen": 216128336, "step": 100200 }, { "epoch": 18.38961277298587, "grad_norm": 0.0001188882888527587, "learning_rate": 1.962423753366577e-07, "loss": 0.0, "num_input_tokens_seen": 216138832, "step": 100205 }, { "epoch": 18.39053037254542, "grad_norm": 0.0007877764292061329, "learning_rate": 1.9602029874490502e-07, "loss": 0.0, "num_input_tokens_seen": 216150608, "step": 100210 }, { "epoch": 18.391447972104974, "grad_norm": 6.255539483390749e-05, "learning_rate": 1.9579834536778642e-07, "loss": 0.0, "num_input_tokens_seen": 216160784, "step": 100215 }, { "epoch": 18.392365571664527, "grad_norm": 5.343719749362208e-05, "learning_rate": 1.9557651521099397e-07, "loss": 0.0, "num_input_tokens_seen": 216171088, "step": 100220 }, { "epoch": 18.393283171224077, "grad_norm": 0.00044726079795509577, "learning_rate": 1.9535480828021757e-07, "loss": 0.0, "num_input_tokens_seen": 216181424, "step": 100225 }, { "epoch": 18.39420077078363, "grad_norm": 0.0002379565848968923, "learning_rate": 1.9513322458114438e-07, "loss": 0.0, "num_input_tokens_seen": 216191024, "step": 100230 }, { "epoch": 18.395118370343184, "grad_norm": 0.0013946949038654566, "learning_rate": 1.94911764119457e-07, "loss": 0.0, "num_input_tokens_seen": 216202256, "step": 100235 }, { "epoch": 18.396035969902734, "grad_norm": 4.106072447029874e-05, "learning_rate": 1.9469042690083483e-07, "loss": 0.0, "num_input_tokens_seen": 216213392, "step": 100240 }, { "epoch": 18.396953569462287, "grad_norm": 8.207779319491237e-05, "learning_rate": 1.9446921293095667e-07, "loss": 0.0, "num_input_tokens_seen": 216223472, "step": 100245 }, { "epoch": 18.39787116902184, "grad_norm": 0.00011007829016307369, "learning_rate": 1.942481222154946e-07, "loss": 0.0, "num_input_tokens_seen": 216232752, "step": 100250 }, { "epoch": 18.39878876858139, "grad_norm": 5.194428740651347e-05, "learning_rate": 1.940271547601208e-07, "loss": 0.0, "num_input_tokens_seen": 216244272, "step": 100255 }, { "epoch": 18.399706368140944, "grad_norm": 7.213206845335662e-05, "learning_rate": 1.938063105705007e-07, "loss": 0.0, "num_input_tokens_seen": 216254992, "step": 100260 }, { "epoch": 18.400623967700497, "grad_norm": 0.0019437042064964771, "learning_rate": 1.935855896523009e-07, "loss": 0.0, "num_input_tokens_seen": 216265872, "step": 100265 }, { "epoch": 18.401541567260047, "grad_norm": 9.365612640976906e-05, "learning_rate": 1.9336499201118076e-07, "loss": 0.0, "num_input_tokens_seen": 216276368, "step": 100270 }, { "epoch": 18.4024591668196, "grad_norm": 0.007110481150448322, "learning_rate": 1.9314451765279963e-07, "loss": 0.0, "num_input_tokens_seen": 216287888, "step": 100275 }, { "epoch": 18.403376766379154, "grad_norm": 0.0002010761818382889, "learning_rate": 1.9292416658281132e-07, "loss": 0.0, "num_input_tokens_seen": 216298672, "step": 100280 }, { "epoch": 18.404294365938703, "grad_norm": 0.00030167275690473616, "learning_rate": 1.9270393880686798e-07, "loss": 0.0, "num_input_tokens_seen": 216309328, "step": 100285 }, { "epoch": 18.405211965498257, "grad_norm": 0.00012555185821838677, "learning_rate": 1.9248383433061734e-07, "loss": 0.0, "num_input_tokens_seen": 216319632, "step": 100290 }, { "epoch": 18.40612956505781, "grad_norm": 7.487432594643906e-05, "learning_rate": 1.9226385315970597e-07, "loss": 0.0, "num_input_tokens_seen": 216331024, "step": 100295 }, { "epoch": 18.40704716461736, "grad_norm": 0.00011351649300195277, "learning_rate": 1.9204399529977547e-07, "loss": 0.0, "num_input_tokens_seen": 216340496, "step": 100300 }, { "epoch": 18.407964764176914, "grad_norm": 5.969623089185916e-05, "learning_rate": 1.918242607564641e-07, "loss": 0.0, "num_input_tokens_seen": 216351440, "step": 100305 }, { "epoch": 18.408882363736467, "grad_norm": 6.835864769527689e-05, "learning_rate": 1.9160464953540958e-07, "loss": 0.0, "num_input_tokens_seen": 216360944, "step": 100310 }, { "epoch": 18.409799963296017, "grad_norm": 0.00048682859051041305, "learning_rate": 1.9138516164224298e-07, "loss": 0.0, "num_input_tokens_seen": 216370608, "step": 100315 }, { "epoch": 18.41071756285557, "grad_norm": 7.783454202581197e-05, "learning_rate": 1.911657970825942e-07, "loss": 0.0, "num_input_tokens_seen": 216381200, "step": 100320 }, { "epoch": 18.411635162415124, "grad_norm": 0.00010384734196122736, "learning_rate": 1.9094655586208932e-07, "loss": 0.0, "num_input_tokens_seen": 216392304, "step": 100325 }, { "epoch": 18.412552761974673, "grad_norm": 0.007216764148324728, "learning_rate": 1.9072743798635275e-07, "loss": 0.0, "num_input_tokens_seen": 216402736, "step": 100330 }, { "epoch": 18.413470361534227, "grad_norm": 0.0002030176983680576, "learning_rate": 1.9050844346100329e-07, "loss": 0.0, "num_input_tokens_seen": 216413040, "step": 100335 }, { "epoch": 18.41438796109378, "grad_norm": 3.891356027452275e-05, "learning_rate": 1.9028957229165756e-07, "loss": 0.0, "num_input_tokens_seen": 216423984, "step": 100340 }, { "epoch": 18.41530556065333, "grad_norm": 0.00016420036263298243, "learning_rate": 1.900708244839311e-07, "loss": 0.0, "num_input_tokens_seen": 216434960, "step": 100345 }, { "epoch": 18.416223160212883, "grad_norm": 4.5717391913058236e-05, "learning_rate": 1.8985220004343274e-07, "loss": 0.0, "num_input_tokens_seen": 216445712, "step": 100350 }, { "epoch": 18.417140759772437, "grad_norm": 8.742520003579557e-05, "learning_rate": 1.8963369897577076e-07, "loss": 0.0, "num_input_tokens_seen": 216456496, "step": 100355 }, { "epoch": 18.418058359331987, "grad_norm": 0.00011343280493747443, "learning_rate": 1.8941532128654794e-07, "loss": 0.0, "num_input_tokens_seen": 216467088, "step": 100360 }, { "epoch": 18.41897595889154, "grad_norm": 0.00026512614567764103, "learning_rate": 1.8919706698136753e-07, "loss": 0.0, "num_input_tokens_seen": 216477136, "step": 100365 }, { "epoch": 18.419893558451093, "grad_norm": 0.013168280012905598, "learning_rate": 1.8897893606582562e-07, "loss": 0.0, "num_input_tokens_seen": 216488080, "step": 100370 }, { "epoch": 18.420811158010643, "grad_norm": 6.931152165634558e-05, "learning_rate": 1.8876092854551776e-07, "loss": 0.0, "num_input_tokens_seen": 216499600, "step": 100375 }, { "epoch": 18.421728757570197, "grad_norm": 8.878701919456944e-05, "learning_rate": 1.8854304442603555e-07, "loss": 0.0, "num_input_tokens_seen": 216510576, "step": 100380 }, { "epoch": 18.42264635712975, "grad_norm": 4.531748709268868e-05, "learning_rate": 1.8832528371296733e-07, "loss": 0.0, "num_input_tokens_seen": 216520528, "step": 100385 }, { "epoch": 18.4235639566893, "grad_norm": 8.320452616317198e-05, "learning_rate": 1.8810764641189695e-07, "loss": 0.0, "num_input_tokens_seen": 216531888, "step": 100390 }, { "epoch": 18.424481556248853, "grad_norm": 6.523245974676684e-05, "learning_rate": 1.8789013252840882e-07, "loss": 0.0, "num_input_tokens_seen": 216542736, "step": 100395 }, { "epoch": 18.425399155808407, "grad_norm": 6.928492075530812e-05, "learning_rate": 1.8767274206808073e-07, "loss": 0.0, "num_input_tokens_seen": 216553872, "step": 100400 }, { "epoch": 18.426316755367957, "grad_norm": 0.0015857673715800047, "learning_rate": 1.874554750364882e-07, "loss": 0.0, "num_input_tokens_seen": 216565360, "step": 100405 }, { "epoch": 18.42723435492751, "grad_norm": 0.0001780279417289421, "learning_rate": 1.8723833143920345e-07, "loss": 0.0, "num_input_tokens_seen": 216576048, "step": 100410 }, { "epoch": 18.428151954487063, "grad_norm": 0.0001090239456971176, "learning_rate": 1.8702131128179702e-07, "loss": 0.0, "num_input_tokens_seen": 216587184, "step": 100415 }, { "epoch": 18.429069554046613, "grad_norm": 0.0013247064780443907, "learning_rate": 1.8680441456983446e-07, "loss": 0.0, "num_input_tokens_seen": 216598544, "step": 100420 }, { "epoch": 18.429987153606167, "grad_norm": 0.00019223678100388497, "learning_rate": 1.8658764130887853e-07, "loss": 0.0, "num_input_tokens_seen": 216610288, "step": 100425 }, { "epoch": 18.43090475316572, "grad_norm": 0.0008145890897139907, "learning_rate": 1.863709915044898e-07, "loss": 0.0, "num_input_tokens_seen": 216621680, "step": 100430 }, { "epoch": 18.43182235272527, "grad_norm": 0.0015350963221862912, "learning_rate": 1.861544651622249e-07, "loss": 0.0, "num_input_tokens_seen": 216631696, "step": 100435 }, { "epoch": 18.432739952284823, "grad_norm": 0.0002690997498575598, "learning_rate": 1.859380622876361e-07, "loss": 0.0, "num_input_tokens_seen": 216642896, "step": 100440 }, { "epoch": 18.433657551844377, "grad_norm": 0.00955576915293932, "learning_rate": 1.8572178288627617e-07, "loss": 0.0, "num_input_tokens_seen": 216652944, "step": 100445 }, { "epoch": 18.434575151403926, "grad_norm": 0.00012445631728041917, "learning_rate": 1.8550562696369068e-07, "loss": 0.0, "num_input_tokens_seen": 216663568, "step": 100450 }, { "epoch": 18.43549275096348, "grad_norm": 0.007345785852521658, "learning_rate": 1.8528959452542406e-07, "loss": 0.0, "num_input_tokens_seen": 216673104, "step": 100455 }, { "epoch": 18.436410350523033, "grad_norm": 0.00028916611336171627, "learning_rate": 1.8507368557701687e-07, "loss": 0.0, "num_input_tokens_seen": 216684048, "step": 100460 }, { "epoch": 18.437327950082583, "grad_norm": 0.00014896586071699858, "learning_rate": 1.848579001240075e-07, "loss": 0.0071, "num_input_tokens_seen": 216695152, "step": 100465 }, { "epoch": 18.438245549642136, "grad_norm": 0.0006041672895662487, "learning_rate": 1.8464223817193039e-07, "loss": 0.0, "num_input_tokens_seen": 216705872, "step": 100470 }, { "epoch": 18.43916314920169, "grad_norm": 0.0007990298327058554, "learning_rate": 1.8442669972631665e-07, "loss": 0.0, "num_input_tokens_seen": 216716560, "step": 100475 }, { "epoch": 18.44008074876124, "grad_norm": 3.978185850428417e-05, "learning_rate": 1.8421128479269357e-07, "loss": 0.0, "num_input_tokens_seen": 216727632, "step": 100480 }, { "epoch": 18.440998348320793, "grad_norm": 0.0008854864863678813, "learning_rate": 1.8399599337658836e-07, "loss": 0.0, "num_input_tokens_seen": 216739312, "step": 100485 }, { "epoch": 18.441915947880346, "grad_norm": 0.004070188384503126, "learning_rate": 1.837808254835216e-07, "loss": 0.0, "num_input_tokens_seen": 216750704, "step": 100490 }, { "epoch": 18.442833547439896, "grad_norm": 6.40666694380343e-05, "learning_rate": 1.8356578111901225e-07, "loss": 0.0, "num_input_tokens_seen": 216760880, "step": 100495 }, { "epoch": 18.44375114699945, "grad_norm": 7.34645509510301e-05, "learning_rate": 1.8335086028857585e-07, "loss": 0.0, "num_input_tokens_seen": 216770832, "step": 100500 }, { "epoch": 18.444668746559003, "grad_norm": 5.679196692653932e-05, "learning_rate": 1.8313606299772468e-07, "loss": 0.0, "num_input_tokens_seen": 216780976, "step": 100505 }, { "epoch": 18.445586346118553, "grad_norm": 0.0001591285690665245, "learning_rate": 1.8292138925196767e-07, "loss": 0.0, "num_input_tokens_seen": 216790736, "step": 100510 }, { "epoch": 18.446503945678106, "grad_norm": 0.0027373225893825293, "learning_rate": 1.8270683905681153e-07, "loss": 0.0, "num_input_tokens_seen": 216801328, "step": 100515 }, { "epoch": 18.44742154523766, "grad_norm": 7.059983181534335e-05, "learning_rate": 1.8249241241775906e-07, "loss": 0.0, "num_input_tokens_seen": 216811728, "step": 100520 }, { "epoch": 18.44833914479721, "grad_norm": 7.276901305885985e-05, "learning_rate": 1.822781093403092e-07, "loss": 0.0, "num_input_tokens_seen": 216822928, "step": 100525 }, { "epoch": 18.449256744356763, "grad_norm": 0.00014231112436391413, "learning_rate": 1.8206392982995924e-07, "loss": 0.0, "num_input_tokens_seen": 216833488, "step": 100530 }, { "epoch": 18.450174343916316, "grad_norm": 0.00018695021572057158, "learning_rate": 1.818498738922031e-07, "loss": 0.0, "num_input_tokens_seen": 216844240, "step": 100535 }, { "epoch": 18.451091943475866, "grad_norm": 0.00016891135601326823, "learning_rate": 1.8163594153252972e-07, "loss": 0.0, "num_input_tokens_seen": 216855344, "step": 100540 }, { "epoch": 18.45200954303542, "grad_norm": 0.0001556679926579818, "learning_rate": 1.8142213275642583e-07, "loss": 0.0, "num_input_tokens_seen": 216867312, "step": 100545 }, { "epoch": 18.452927142594973, "grad_norm": 7.58419482735917e-05, "learning_rate": 1.8120844756937705e-07, "loss": 0.0, "num_input_tokens_seen": 216877808, "step": 100550 }, { "epoch": 18.453844742154523, "grad_norm": 0.0017007333226501942, "learning_rate": 1.8099488597686287e-07, "loss": 0.0, "num_input_tokens_seen": 216887984, "step": 100555 }, { "epoch": 18.454762341714076, "grad_norm": 0.00016005393990781158, "learning_rate": 1.8078144798436114e-07, "loss": 0.0, "num_input_tokens_seen": 216898960, "step": 100560 }, { "epoch": 18.45567994127363, "grad_norm": 0.00024633511202409863, "learning_rate": 1.8056813359734583e-07, "loss": 0.0, "num_input_tokens_seen": 216910384, "step": 100565 }, { "epoch": 18.45659754083318, "grad_norm": 0.00046597086475230753, "learning_rate": 1.8035494282128918e-07, "loss": 0.0, "num_input_tokens_seen": 216919760, "step": 100570 }, { "epoch": 18.457515140392733, "grad_norm": 7.111029844963923e-05, "learning_rate": 1.8014187566165743e-07, "loss": 0.0, "num_input_tokens_seen": 216929328, "step": 100575 }, { "epoch": 18.458432739952286, "grad_norm": 0.0021064234897494316, "learning_rate": 1.7992893212391725e-07, "loss": 0.0, "num_input_tokens_seen": 216941744, "step": 100580 }, { "epoch": 18.459350339511836, "grad_norm": 8.260286995209754e-05, "learning_rate": 1.7971611221352991e-07, "loss": 0.0, "num_input_tokens_seen": 216952976, "step": 100585 }, { "epoch": 18.46026793907139, "grad_norm": 0.00019318964041303843, "learning_rate": 1.7950341593595265e-07, "loss": 0.0, "num_input_tokens_seen": 216964432, "step": 100590 }, { "epoch": 18.461185538630943, "grad_norm": 6.933926488272846e-05, "learning_rate": 1.792908432966417e-07, "loss": 0.0, "num_input_tokens_seen": 216975728, "step": 100595 }, { "epoch": 18.462103138190493, "grad_norm": 0.0006085370550863445, "learning_rate": 1.790783943010499e-07, "loss": 0.0, "num_input_tokens_seen": 216986288, "step": 100600 }, { "epoch": 18.463020737750046, "grad_norm": 0.0013988612918183208, "learning_rate": 1.788660689546251e-07, "loss": 0.0, "num_input_tokens_seen": 216997424, "step": 100605 }, { "epoch": 18.4639383373096, "grad_norm": 0.00011039772653020918, "learning_rate": 1.7865386726281352e-07, "loss": 0.0, "num_input_tokens_seen": 217008080, "step": 100610 }, { "epoch": 18.46485593686915, "grad_norm": 8.088498725555837e-05, "learning_rate": 1.7844178923105805e-07, "loss": 0.0, "num_input_tokens_seen": 217019856, "step": 100615 }, { "epoch": 18.465773536428703, "grad_norm": 0.00010247404861729592, "learning_rate": 1.7822983486479762e-07, "loss": 0.0, "num_input_tokens_seen": 217031472, "step": 100620 }, { "epoch": 18.466691135988256, "grad_norm": 4.4194097426952794e-05, "learning_rate": 1.7801800416946902e-07, "loss": 0.0, "num_input_tokens_seen": 217042608, "step": 100625 }, { "epoch": 18.467608735547806, "grad_norm": 0.0003161526983603835, "learning_rate": 1.7780629715050512e-07, "loss": 0.0, "num_input_tokens_seen": 217053136, "step": 100630 }, { "epoch": 18.46852633510736, "grad_norm": 0.00048671808326616883, "learning_rate": 1.775947138133366e-07, "loss": 0.0, "num_input_tokens_seen": 217063856, "step": 100635 }, { "epoch": 18.469443934666913, "grad_norm": 5.783102460554801e-05, "learning_rate": 1.773832541633891e-07, "loss": 0.0, "num_input_tokens_seen": 217074800, "step": 100640 }, { "epoch": 18.470361534226463, "grad_norm": 0.0001988846925087273, "learning_rate": 1.7717191820608604e-07, "loss": 0.0, "num_input_tokens_seen": 217085136, "step": 100645 }, { "epoch": 18.471279133786016, "grad_norm": 0.00016315440007019788, "learning_rate": 1.7696070594684978e-07, "loss": 0.0, "num_input_tokens_seen": 217094640, "step": 100650 }, { "epoch": 18.47219673334557, "grad_norm": 10.634744644165039, "learning_rate": 1.7674961739109597e-07, "loss": 0.0063, "num_input_tokens_seen": 217105456, "step": 100655 }, { "epoch": 18.47311433290512, "grad_norm": 0.000135772381327115, "learning_rate": 1.7653865254423863e-07, "loss": 0.0, "num_input_tokens_seen": 217116528, "step": 100660 }, { "epoch": 18.474031932464673, "grad_norm": 0.0001897386828204617, "learning_rate": 1.7632781141168953e-07, "loss": 0.0, "num_input_tokens_seen": 217127952, "step": 100665 }, { "epoch": 18.474949532024226, "grad_norm": 0.00044453312875702977, "learning_rate": 1.7611709399885657e-07, "loss": 0.0, "num_input_tokens_seen": 217139024, "step": 100670 }, { "epoch": 18.475867131583776, "grad_norm": 0.0002890422474592924, "learning_rate": 1.759065003111432e-07, "loss": 0.0, "num_input_tokens_seen": 217149872, "step": 100675 }, { "epoch": 18.47678473114333, "grad_norm": 5.09167330164928e-05, "learning_rate": 1.756960303539512e-07, "loss": 0.0, "num_input_tokens_seen": 217160272, "step": 100680 }, { "epoch": 18.477702330702883, "grad_norm": 0.0006618123152293265, "learning_rate": 1.7548568413267964e-07, "loss": 0.0, "num_input_tokens_seen": 217170384, "step": 100685 }, { "epoch": 18.478619930262433, "grad_norm": 0.0001408426178386435, "learning_rate": 1.7527546165272302e-07, "loss": 0.0, "num_input_tokens_seen": 217182160, "step": 100690 }, { "epoch": 18.479537529821986, "grad_norm": 7.706502947257832e-05, "learning_rate": 1.750653629194732e-07, "loss": 0.0, "num_input_tokens_seen": 217192368, "step": 100695 }, { "epoch": 18.48045512938154, "grad_norm": 4.19812131440267e-05, "learning_rate": 1.7485538793831858e-07, "loss": 0.0, "num_input_tokens_seen": 217202704, "step": 100700 }, { "epoch": 18.48137272894109, "grad_norm": 5.730350312660448e-05, "learning_rate": 1.7464553671464434e-07, "loss": 0.0, "num_input_tokens_seen": 217213968, "step": 100705 }, { "epoch": 18.482290328500643, "grad_norm": 0.0011524595320224762, "learning_rate": 1.7443580925383397e-07, "loss": 0.0, "num_input_tokens_seen": 217223472, "step": 100710 }, { "epoch": 18.483207928060196, "grad_norm": 4.2415489588165656e-05, "learning_rate": 1.7422620556126647e-07, "loss": 0.0, "num_input_tokens_seen": 217233360, "step": 100715 }, { "epoch": 18.484125527619746, "grad_norm": 0.0002976882387883961, "learning_rate": 1.7401672564231752e-07, "loss": 0.0, "num_input_tokens_seen": 217244496, "step": 100720 }, { "epoch": 18.4850431271793, "grad_norm": 0.0008921109256334603, "learning_rate": 1.738073695023601e-07, "loss": 0.0, "num_input_tokens_seen": 217255088, "step": 100725 }, { "epoch": 18.485960726738853, "grad_norm": 0.00020673745893873274, "learning_rate": 1.7359813714676266e-07, "loss": 0.0, "num_input_tokens_seen": 217264304, "step": 100730 }, { "epoch": 18.486878326298402, "grad_norm": 0.0001343862822977826, "learning_rate": 1.7338902858089367e-07, "loss": 0.0, "num_input_tokens_seen": 217276240, "step": 100735 }, { "epoch": 18.487795925857956, "grad_norm": 0.00011997785622952506, "learning_rate": 1.7318004381011556e-07, "loss": 0.0, "num_input_tokens_seen": 217286512, "step": 100740 }, { "epoch": 18.48871352541751, "grad_norm": 0.0004613522905856371, "learning_rate": 1.7297118283978731e-07, "loss": 0.0, "num_input_tokens_seen": 217298032, "step": 100745 }, { "epoch": 18.48963112497706, "grad_norm": 9.180453344015405e-05, "learning_rate": 1.72762445675268e-07, "loss": 0.0, "num_input_tokens_seen": 217309232, "step": 100750 }, { "epoch": 18.490548724536612, "grad_norm": 0.00018168742826674134, "learning_rate": 1.7255383232191058e-07, "loss": 0.0, "num_input_tokens_seen": 217320336, "step": 100755 }, { "epoch": 18.491466324096166, "grad_norm": 0.00015302124666050076, "learning_rate": 1.7234534278506465e-07, "loss": 0.0, "num_input_tokens_seen": 217331440, "step": 100760 }, { "epoch": 18.492383923655716, "grad_norm": 0.0014874584740027785, "learning_rate": 1.7213697707007927e-07, "loss": 0.0, "num_input_tokens_seen": 217341200, "step": 100765 }, { "epoch": 18.49330152321527, "grad_norm": 0.0001284799218410626, "learning_rate": 1.7192873518229792e-07, "loss": 0.0, "num_input_tokens_seen": 217351664, "step": 100770 }, { "epoch": 18.494219122774822, "grad_norm": 9.42698388826102e-05, "learning_rate": 1.717206171270619e-07, "loss": 0.0, "num_input_tokens_seen": 217360336, "step": 100775 }, { "epoch": 18.495136722334372, "grad_norm": 0.00010314970131730661, "learning_rate": 1.715126229097075e-07, "loss": 0.0, "num_input_tokens_seen": 217371216, "step": 100780 }, { "epoch": 18.496054321893926, "grad_norm": 0.0008811516454443336, "learning_rate": 1.7130475253557211e-07, "loss": 0.0, "num_input_tokens_seen": 217382704, "step": 100785 }, { "epoch": 18.49697192145348, "grad_norm": 6.244134419830516e-05, "learning_rate": 1.7109700600998592e-07, "loss": 0.0, "num_input_tokens_seen": 217393008, "step": 100790 }, { "epoch": 18.49788952101303, "grad_norm": 0.00010577699140412733, "learning_rate": 1.7088938333827688e-07, "loss": 0.0, "num_input_tokens_seen": 217403952, "step": 100795 }, { "epoch": 18.498807120572582, "grad_norm": 0.0001511323353042826, "learning_rate": 1.706818845257713e-07, "loss": 0.0, "num_input_tokens_seen": 217414864, "step": 100800 }, { "epoch": 18.499724720132136, "grad_norm": 8.796394831733778e-05, "learning_rate": 1.7047450957779044e-07, "loss": 0.0, "num_input_tokens_seen": 217425072, "step": 100805 }, { "epoch": 18.500642319691686, "grad_norm": 0.0002775913162622601, "learning_rate": 1.7026725849965341e-07, "loss": 0.0, "num_input_tokens_seen": 217435952, "step": 100810 }, { "epoch": 18.50155991925124, "grad_norm": 0.00040513926069252193, "learning_rate": 1.7006013129667488e-07, "loss": 0.0, "num_input_tokens_seen": 217446480, "step": 100815 }, { "epoch": 18.502477518810792, "grad_norm": 0.00033470612834207714, "learning_rate": 1.698531279741694e-07, "loss": 0.0, "num_input_tokens_seen": 217457040, "step": 100820 }, { "epoch": 18.503395118370342, "grad_norm": 0.00018263858510181308, "learning_rate": 1.6964624853744448e-07, "loss": 0.0, "num_input_tokens_seen": 217467632, "step": 100825 }, { "epoch": 18.504312717929896, "grad_norm": 5.8509805967332795e-05, "learning_rate": 1.6943949299180694e-07, "loss": 0.0, "num_input_tokens_seen": 217479216, "step": 100830 }, { "epoch": 18.50523031748945, "grad_norm": 0.00037191834417171776, "learning_rate": 1.6923286134255978e-07, "loss": 0.0, "num_input_tokens_seen": 217490960, "step": 100835 }, { "epoch": 18.506147917049, "grad_norm": 0.00037309533217921853, "learning_rate": 1.690263535950032e-07, "loss": 0.0, "num_input_tokens_seen": 217502224, "step": 100840 }, { "epoch": 18.507065516608552, "grad_norm": 0.00019815929408650845, "learning_rate": 1.6881996975443239e-07, "loss": 0.0, "num_input_tokens_seen": 217512976, "step": 100845 }, { "epoch": 18.507983116168106, "grad_norm": 0.0009754101047292352, "learning_rate": 1.6861370982614255e-07, "loss": 0.0, "num_input_tokens_seen": 217524432, "step": 100850 }, { "epoch": 18.508900715727655, "grad_norm": 9.492601384408772e-05, "learning_rate": 1.6840757381542284e-07, "loss": 0.0, "num_input_tokens_seen": 217534672, "step": 100855 }, { "epoch": 18.50981831528721, "grad_norm": 8.07539327070117e-05, "learning_rate": 1.6820156172756063e-07, "loss": 0.0, "num_input_tokens_seen": 217545232, "step": 100860 }, { "epoch": 18.510735914846762, "grad_norm": 0.00038556166691705585, "learning_rate": 1.6799567356783953e-07, "loss": 0.0, "num_input_tokens_seen": 217556304, "step": 100865 }, { "epoch": 18.511653514406312, "grad_norm": 0.026578588411211967, "learning_rate": 1.6778990934154082e-07, "loss": 0.0, "num_input_tokens_seen": 217566736, "step": 100870 }, { "epoch": 18.512571113965866, "grad_norm": 0.0010277937399223447, "learning_rate": 1.6758426905394144e-07, "loss": 0.0, "num_input_tokens_seen": 217577584, "step": 100875 }, { "epoch": 18.51348871352542, "grad_norm": 8.834111940814182e-05, "learning_rate": 1.6737875271031546e-07, "loss": 0.0, "num_input_tokens_seen": 217589264, "step": 100880 }, { "epoch": 18.51440631308497, "grad_norm": 0.00047281052684411407, "learning_rate": 1.6717336031593534e-07, "loss": 0.0, "num_input_tokens_seen": 217598768, "step": 100885 }, { "epoch": 18.515323912644522, "grad_norm": 0.0001487100380472839, "learning_rate": 1.6696809187606855e-07, "loss": 0.0, "num_input_tokens_seen": 217611504, "step": 100890 }, { "epoch": 18.516241512204076, "grad_norm": 0.00023586304450873286, "learning_rate": 1.667629473959792e-07, "loss": 0.0, "num_input_tokens_seen": 217620496, "step": 100895 }, { "epoch": 18.517159111763625, "grad_norm": 0.0002464120334479958, "learning_rate": 1.665579268809292e-07, "loss": 0.0, "num_input_tokens_seen": 217630704, "step": 100900 }, { "epoch": 18.51807671132318, "grad_norm": 0.00015125556092243642, "learning_rate": 1.6635303033617767e-07, "loss": 0.0, "num_input_tokens_seen": 217642128, "step": 100905 }, { "epoch": 18.518994310882732, "grad_norm": 0.0003574114525690675, "learning_rate": 1.661482577669793e-07, "loss": 0.0, "num_input_tokens_seen": 217653104, "step": 100910 }, { "epoch": 18.519911910442282, "grad_norm": 5.8780449762707576e-05, "learning_rate": 1.6594360917858655e-07, "loss": 0.0, "num_input_tokens_seen": 217665232, "step": 100915 }, { "epoch": 18.520829510001835, "grad_norm": 0.00010488330008229241, "learning_rate": 1.65739084576248e-07, "loss": 0.0, "num_input_tokens_seen": 217676912, "step": 100920 }, { "epoch": 18.52174710956139, "grad_norm": 0.00028553593438118696, "learning_rate": 1.655346839652089e-07, "loss": 0.0, "num_input_tokens_seen": 217686512, "step": 100925 }, { "epoch": 18.52266470912094, "grad_norm": 0.0004540772351901978, "learning_rate": 1.6533040735071336e-07, "loss": 0.0, "num_input_tokens_seen": 217698480, "step": 100930 }, { "epoch": 18.523582308680492, "grad_norm": 6.337275408441201e-05, "learning_rate": 1.651262547379995e-07, "loss": 0.0, "num_input_tokens_seen": 217709168, "step": 100935 }, { "epoch": 18.524499908240045, "grad_norm": 4.2837680666707456e-05, "learning_rate": 1.6492222613230358e-07, "loss": 0.0, "num_input_tokens_seen": 217719792, "step": 100940 }, { "epoch": 18.525417507799595, "grad_norm": 5.5263706599362195e-05, "learning_rate": 1.6471832153885926e-07, "loss": 0.0, "num_input_tokens_seen": 217728848, "step": 100945 }, { "epoch": 18.52633510735915, "grad_norm": 0.00037174197495914996, "learning_rate": 1.645145409628951e-07, "loss": 0.0, "num_input_tokens_seen": 217739792, "step": 100950 }, { "epoch": 18.527252706918702, "grad_norm": 6.13279698882252e-05, "learning_rate": 1.6431088440963972e-07, "loss": 0.0, "num_input_tokens_seen": 217750064, "step": 100955 }, { "epoch": 18.528170306478252, "grad_norm": 7.548548455815762e-05, "learning_rate": 1.6410735188431503e-07, "loss": 0.0, "num_input_tokens_seen": 217760784, "step": 100960 }, { "epoch": 18.529087906037805, "grad_norm": 6.90653469064273e-05, "learning_rate": 1.6390394339214133e-07, "loss": 0.0, "num_input_tokens_seen": 217769584, "step": 100965 }, { "epoch": 18.53000550559736, "grad_norm": 4.917912883684039e-05, "learning_rate": 1.637006589383372e-07, "loss": 0.0, "num_input_tokens_seen": 217780240, "step": 100970 }, { "epoch": 18.53092310515691, "grad_norm": 0.00012131310359109193, "learning_rate": 1.6349749852811515e-07, "loss": 0.0, "num_input_tokens_seen": 217791696, "step": 100975 }, { "epoch": 18.531840704716462, "grad_norm": 0.00015778605302330106, "learning_rate": 1.6329446216668543e-07, "loss": 0.0, "num_input_tokens_seen": 217801936, "step": 100980 }, { "epoch": 18.532758304276015, "grad_norm": 5.4804611863801256e-05, "learning_rate": 1.630915498592578e-07, "loss": 0.0, "num_input_tokens_seen": 217812656, "step": 100985 }, { "epoch": 18.533675903835565, "grad_norm": 0.0015261303633451462, "learning_rate": 1.6288876161103528e-07, "loss": 0.0, "num_input_tokens_seen": 217823696, "step": 100990 }, { "epoch": 18.53459350339512, "grad_norm": 4.5937627874081954e-05, "learning_rate": 1.6268609742721874e-07, "loss": 0.0, "num_input_tokens_seen": 217834608, "step": 100995 }, { "epoch": 18.535511102954672, "grad_norm": 8.623071335023269e-05, "learning_rate": 1.624835573130068e-07, "loss": 0.0, "num_input_tokens_seen": 217846192, "step": 101000 }, { "epoch": 18.536428702514222, "grad_norm": 9.491982928011566e-05, "learning_rate": 1.622811412735942e-07, "loss": 0.0, "num_input_tokens_seen": 217857360, "step": 101005 }, { "epoch": 18.537346302073775, "grad_norm": 0.0002903333806898445, "learning_rate": 1.620788493141723e-07, "loss": 0.0, "num_input_tokens_seen": 217866192, "step": 101010 }, { "epoch": 18.53826390163333, "grad_norm": 6.779863906558603e-05, "learning_rate": 1.6187668143992974e-07, "loss": 0.0, "num_input_tokens_seen": 217876976, "step": 101015 }, { "epoch": 18.53918150119288, "grad_norm": 0.00017489722813479602, "learning_rate": 1.616746376560524e-07, "loss": 0.0, "num_input_tokens_seen": 217888912, "step": 101020 }, { "epoch": 18.540099100752432, "grad_norm": 9.703320392873138e-05, "learning_rate": 1.6147271796772168e-07, "loss": 0.0, "num_input_tokens_seen": 217898672, "step": 101025 }, { "epoch": 18.541016700311985, "grad_norm": 0.001837518997490406, "learning_rate": 1.6127092238011622e-07, "loss": 0.0, "num_input_tokens_seen": 217910320, "step": 101030 }, { "epoch": 18.541934299871535, "grad_norm": 3.9818882214603946e-05, "learning_rate": 1.6106925089841296e-07, "loss": 0.0, "num_input_tokens_seen": 217921136, "step": 101035 }, { "epoch": 18.54285189943109, "grad_norm": 8.243204501923174e-05, "learning_rate": 1.6086770352778336e-07, "loss": 0.0, "num_input_tokens_seen": 217932784, "step": 101040 }, { "epoch": 18.543769498990642, "grad_norm": 0.00014801144425291568, "learning_rate": 1.606662802733977e-07, "loss": 0.0, "num_input_tokens_seen": 217944464, "step": 101045 }, { "epoch": 18.54468709855019, "grad_norm": 0.0013079475611448288, "learning_rate": 1.6046498114042076e-07, "loss": 0.0, "num_input_tokens_seen": 217954416, "step": 101050 }, { "epoch": 18.545604698109745, "grad_norm": 0.0001231542119057849, "learning_rate": 1.6026380613401726e-07, "loss": 0.0, "num_input_tokens_seen": 217966288, "step": 101055 }, { "epoch": 18.5465222976693, "grad_norm": 0.00012298705405555665, "learning_rate": 1.600627552593459e-07, "loss": 0.0, "num_input_tokens_seen": 217978000, "step": 101060 }, { "epoch": 18.54743989722885, "grad_norm": 0.0014492070768028498, "learning_rate": 1.5986182852156307e-07, "loss": 0.0, "num_input_tokens_seen": 217989104, "step": 101065 }, { "epoch": 18.5483574967884, "grad_norm": 5.475480429595336e-05, "learning_rate": 1.5966102592582356e-07, "loss": 0.0, "num_input_tokens_seen": 217998672, "step": 101070 }, { "epoch": 18.549275096347955, "grad_norm": 0.00010244333680020645, "learning_rate": 1.5946034747727711e-07, "loss": 0.0, "num_input_tokens_seen": 218009200, "step": 101075 }, { "epoch": 18.550192695907505, "grad_norm": 0.0001917923364089802, "learning_rate": 1.5925979318106965e-07, "loss": 0.0, "num_input_tokens_seen": 218019120, "step": 101080 }, { "epoch": 18.55111029546706, "grad_norm": 7.61361006880179e-05, "learning_rate": 1.5905936304234703e-07, "loss": 0.0, "num_input_tokens_seen": 218030000, "step": 101085 }, { "epoch": 18.55202789502661, "grad_norm": 0.0003370389458723366, "learning_rate": 1.5885905706624849e-07, "loss": 0.0, "num_input_tokens_seen": 218041424, "step": 101090 }, { "epoch": 18.55294549458616, "grad_norm": 0.04322950541973114, "learning_rate": 1.5865887525791212e-07, "loss": 0.0, "num_input_tokens_seen": 218052048, "step": 101095 }, { "epoch": 18.553863094145715, "grad_norm": 5.997687185299583e-05, "learning_rate": 1.5845881762247162e-07, "loss": 0.056, "num_input_tokens_seen": 218063088, "step": 101100 }, { "epoch": 18.55478069370527, "grad_norm": 0.0005251697148196399, "learning_rate": 1.5825888416505953e-07, "loss": 0.0, "num_input_tokens_seen": 218073488, "step": 101105 }, { "epoch": 18.55569829326482, "grad_norm": 6.695916090393439e-05, "learning_rate": 1.5805907489080285e-07, "loss": 0.0, "num_input_tokens_seen": 218083440, "step": 101110 }, { "epoch": 18.55661589282437, "grad_norm": 7.299370918190107e-05, "learning_rate": 1.5785938980482696e-07, "loss": 0.0, "num_input_tokens_seen": 218093616, "step": 101115 }, { "epoch": 18.557533492383925, "grad_norm": 6.198046321514994e-05, "learning_rate": 1.576598289122522e-07, "loss": 0.0, "num_input_tokens_seen": 218104656, "step": 101120 }, { "epoch": 18.558451091943475, "grad_norm": 0.0013729927595704794, "learning_rate": 1.574603922181983e-07, "loss": 0.0, "num_input_tokens_seen": 218114864, "step": 101125 }, { "epoch": 18.55936869150303, "grad_norm": 5.9207610320299864e-05, "learning_rate": 1.5726107972778015e-07, "loss": 0.0, "num_input_tokens_seen": 218125232, "step": 101130 }, { "epoch": 18.56028629106258, "grad_norm": 6.767099694116041e-05, "learning_rate": 1.570618914461103e-07, "loss": 0.0, "num_input_tokens_seen": 218135728, "step": 101135 }, { "epoch": 18.56120389062213, "grad_norm": 0.00018008695042226464, "learning_rate": 1.5686282737829627e-07, "loss": 0.0, "num_input_tokens_seen": 218145584, "step": 101140 }, { "epoch": 18.562121490181685, "grad_norm": 0.0012172101996839046, "learning_rate": 1.566638875294446e-07, "loss": 0.0, "num_input_tokens_seen": 218155888, "step": 101145 }, { "epoch": 18.56303908974124, "grad_norm": 0.00020633700478356332, "learning_rate": 1.5646507190465788e-07, "loss": 0.0, "num_input_tokens_seen": 218165904, "step": 101150 }, { "epoch": 18.563956689300788, "grad_norm": 0.00020886170386802405, "learning_rate": 1.562663805090353e-07, "loss": 0.0, "num_input_tokens_seen": 218176080, "step": 101155 }, { "epoch": 18.56487428886034, "grad_norm": 7.259838457684964e-05, "learning_rate": 1.5606781334767285e-07, "loss": 0.0, "num_input_tokens_seen": 218187184, "step": 101160 }, { "epoch": 18.565791888419895, "grad_norm": 5.3270770877134055e-05, "learning_rate": 1.5586937042566365e-07, "loss": 0.0, "num_input_tokens_seen": 218198672, "step": 101165 }, { "epoch": 18.566709487979445, "grad_norm": 5.339800190995447e-05, "learning_rate": 1.5567105174809748e-07, "loss": 0.0, "num_input_tokens_seen": 218209424, "step": 101170 }, { "epoch": 18.567627087538998, "grad_norm": 9.807593596633524e-05, "learning_rate": 1.5547285732006034e-07, "loss": 0.0, "num_input_tokens_seen": 218221008, "step": 101175 }, { "epoch": 18.56854468709855, "grad_norm": 5.348500417312607e-05, "learning_rate": 1.55274787146637e-07, "loss": 0.0, "num_input_tokens_seen": 218231408, "step": 101180 }, { "epoch": 18.5694622866581, "grad_norm": 0.00014006455603521317, "learning_rate": 1.5507684123290567e-07, "loss": 0.0, "num_input_tokens_seen": 218242672, "step": 101185 }, { "epoch": 18.570379886217655, "grad_norm": 0.00021366163855418563, "learning_rate": 1.5487901958394503e-07, "loss": 0.0, "num_input_tokens_seen": 218252112, "step": 101190 }, { "epoch": 18.571297485777208, "grad_norm": 0.00032661951263435185, "learning_rate": 1.5468132220482822e-07, "loss": 0.0, "num_input_tokens_seen": 218263952, "step": 101195 }, { "epoch": 18.572215085336758, "grad_norm": 6.592085992451757e-05, "learning_rate": 1.5448374910062514e-07, "loss": 0.0, "num_input_tokens_seen": 218274576, "step": 101200 }, { "epoch": 18.57313268489631, "grad_norm": 7.797341822879389e-05, "learning_rate": 1.5428630027640502e-07, "loss": 0.0, "num_input_tokens_seen": 218285360, "step": 101205 }, { "epoch": 18.574050284455865, "grad_norm": 0.0013840548926964402, "learning_rate": 1.5408897573723102e-07, "loss": 0.0, "num_input_tokens_seen": 218295664, "step": 101210 }, { "epoch": 18.574967884015415, "grad_norm": 0.00014067966549191624, "learning_rate": 1.5389177548816358e-07, "loss": 0.0, "num_input_tokens_seen": 218306352, "step": 101215 }, { "epoch": 18.575885483574968, "grad_norm": 3.062742325710133e-05, "learning_rate": 1.5369469953426198e-07, "loss": 0.0, "num_input_tokens_seen": 218317136, "step": 101220 }, { "epoch": 18.57680308313452, "grad_norm": 6.37403572909534e-05, "learning_rate": 1.5349774788058048e-07, "loss": 0.0, "num_input_tokens_seen": 218326768, "step": 101225 }, { "epoch": 18.57772068269407, "grad_norm": 1.0397439002990723, "learning_rate": 1.5330092053217005e-07, "loss": 0.0012, "num_input_tokens_seen": 218336528, "step": 101230 }, { "epoch": 18.578638282253625, "grad_norm": 7.671588537050411e-05, "learning_rate": 1.5310421749407888e-07, "loss": 0.0, "num_input_tokens_seen": 218346864, "step": 101235 }, { "epoch": 18.579555881813178, "grad_norm": 5.2833431254839525e-05, "learning_rate": 1.5290763877135295e-07, "loss": 0.0, "num_input_tokens_seen": 218358384, "step": 101240 }, { "epoch": 18.580473481372728, "grad_norm": 0.0009588036336936057, "learning_rate": 1.5271118436903376e-07, "loss": 0.0, "num_input_tokens_seen": 218369360, "step": 101245 }, { "epoch": 18.58139108093228, "grad_norm": 8.207465725718066e-05, "learning_rate": 1.5251485429215952e-07, "loss": 0.0012, "num_input_tokens_seen": 218379824, "step": 101250 }, { "epoch": 18.582308680491835, "grad_norm": 0.0004396488075144589, "learning_rate": 1.5231864854576728e-07, "loss": 0.0, "num_input_tokens_seen": 218390704, "step": 101255 }, { "epoch": 18.583226280051385, "grad_norm": 4.401044498081319e-05, "learning_rate": 1.5212256713488805e-07, "loss": 0.0, "num_input_tokens_seen": 218402000, "step": 101260 }, { "epoch": 18.584143879610938, "grad_norm": 0.0005683597992174327, "learning_rate": 1.5192661006455166e-07, "loss": 0.0, "num_input_tokens_seen": 218412464, "step": 101265 }, { "epoch": 18.58506147917049, "grad_norm": 0.0001247878244612366, "learning_rate": 1.5173077733978304e-07, "loss": 0.0, "num_input_tokens_seen": 218422416, "step": 101270 }, { "epoch": 18.58597907873004, "grad_norm": 7.593745249323547e-05, "learning_rate": 1.5153506896560644e-07, "loss": 0.0, "num_input_tokens_seen": 218433648, "step": 101275 }, { "epoch": 18.586896678289595, "grad_norm": 8.296540181618184e-05, "learning_rate": 1.513394849470412e-07, "loss": 0.0, "num_input_tokens_seen": 218444016, "step": 101280 }, { "epoch": 18.587814277849148, "grad_norm": 0.00033717998303472996, "learning_rate": 1.5114402528910278e-07, "loss": 0.0, "num_input_tokens_seen": 218453456, "step": 101285 }, { "epoch": 18.588731877408698, "grad_norm": 0.00018621158960741013, "learning_rate": 1.5094868999680547e-07, "loss": 0.0, "num_input_tokens_seen": 218464880, "step": 101290 }, { "epoch": 18.58964947696825, "grad_norm": 9.92236819001846e-05, "learning_rate": 1.5075347907515913e-07, "loss": 0.0, "num_input_tokens_seen": 218476080, "step": 101295 }, { "epoch": 18.590567076527805, "grad_norm": 9.861190483206883e-05, "learning_rate": 1.5055839252916981e-07, "loss": 0.0, "num_input_tokens_seen": 218487312, "step": 101300 }, { "epoch": 18.591484676087354, "grad_norm": 0.0007582366815768182, "learning_rate": 1.5036343036384182e-07, "loss": 0.0, "num_input_tokens_seen": 218497904, "step": 101305 }, { "epoch": 18.592402275646908, "grad_norm": 0.0015948531217873096, "learning_rate": 1.5016859258417615e-07, "loss": 0.0, "num_input_tokens_seen": 218507664, "step": 101310 }, { "epoch": 18.59331987520646, "grad_norm": 4.7542696847813204e-05, "learning_rate": 1.4997387919516936e-07, "loss": 0.0, "num_input_tokens_seen": 218517840, "step": 101315 }, { "epoch": 18.59423747476601, "grad_norm": 9.198659972753376e-05, "learning_rate": 1.4977929020181526e-07, "loss": 0.0, "num_input_tokens_seen": 218528624, "step": 101320 }, { "epoch": 18.595155074325564, "grad_norm": 0.00033476072712801397, "learning_rate": 1.4958482560910592e-07, "loss": 0.0, "num_input_tokens_seen": 218538512, "step": 101325 }, { "epoch": 18.596072673885118, "grad_norm": 0.0019297789549455047, "learning_rate": 1.4939048542202795e-07, "loss": 0.0, "num_input_tokens_seen": 218550160, "step": 101330 }, { "epoch": 18.596990273444668, "grad_norm": 4.4373926357366145e-05, "learning_rate": 1.4919626964556622e-07, "loss": 0.0, "num_input_tokens_seen": 218560752, "step": 101335 }, { "epoch": 18.59790787300422, "grad_norm": 4.88101904920768e-05, "learning_rate": 1.4900217828470176e-07, "loss": 0.0, "num_input_tokens_seen": 218572272, "step": 101340 }, { "epoch": 18.598825472563774, "grad_norm": 8.708472887519747e-05, "learning_rate": 1.488082113444139e-07, "loss": 0.0, "num_input_tokens_seen": 218582864, "step": 101345 }, { "epoch": 18.599743072123324, "grad_norm": 5.5805820011300966e-05, "learning_rate": 1.486143688296765e-07, "loss": 0.0, "num_input_tokens_seen": 218593584, "step": 101350 }, { "epoch": 18.600660671682878, "grad_norm": 9.784174471860752e-05, "learning_rate": 1.4842065074546107e-07, "loss": 0.0, "num_input_tokens_seen": 218604880, "step": 101355 }, { "epoch": 18.60157827124243, "grad_norm": 0.00013319366553332657, "learning_rate": 1.4822705709673756e-07, "loss": 0.0, "num_input_tokens_seen": 218615600, "step": 101360 }, { "epoch": 18.60249587080198, "grad_norm": 0.0001993951154872775, "learning_rate": 1.4803358788846977e-07, "loss": 0.0, "num_input_tokens_seen": 218627056, "step": 101365 }, { "epoch": 18.603413470361534, "grad_norm": 0.0012030908837914467, "learning_rate": 1.478402431256204e-07, "loss": 0.0, "num_input_tokens_seen": 218637232, "step": 101370 }, { "epoch": 18.604331069921088, "grad_norm": 0.00011398540664231405, "learning_rate": 1.476470228131488e-07, "loss": 0.0, "num_input_tokens_seen": 218647408, "step": 101375 }, { "epoch": 18.605248669480638, "grad_norm": 0.00011576687393244356, "learning_rate": 1.4745392695601103e-07, "loss": 0.0, "num_input_tokens_seen": 218659120, "step": 101380 }, { "epoch": 18.60616626904019, "grad_norm": 0.00012290863378439099, "learning_rate": 1.4726095555915864e-07, "loss": 0.0, "num_input_tokens_seen": 218669488, "step": 101385 }, { "epoch": 18.607083868599744, "grad_norm": 8.812164014670998e-05, "learning_rate": 1.4706810862754217e-07, "loss": 0.0, "num_input_tokens_seen": 218680848, "step": 101390 }, { "epoch": 18.608001468159294, "grad_norm": 0.000502621871419251, "learning_rate": 1.4687538616610707e-07, "loss": 0.0, "num_input_tokens_seen": 218691184, "step": 101395 }, { "epoch": 18.608919067718848, "grad_norm": 0.0001337985013378784, "learning_rate": 1.4668278817979718e-07, "loss": 0.0, "num_input_tokens_seen": 218701776, "step": 101400 }, { "epoch": 18.6098366672784, "grad_norm": 0.00011990871280431747, "learning_rate": 1.464903146735508e-07, "loss": 0.0, "num_input_tokens_seen": 218712464, "step": 101405 }, { "epoch": 18.61075426683795, "grad_norm": 6.813014624640346e-05, "learning_rate": 1.462979656523067e-07, "loss": 0.0, "num_input_tokens_seen": 218723184, "step": 101410 }, { "epoch": 18.611671866397504, "grad_norm": 8.087451715255156e-05, "learning_rate": 1.4610574112099652e-07, "loss": 0.0, "num_input_tokens_seen": 218733232, "step": 101415 }, { "epoch": 18.612589465957058, "grad_norm": 0.00010527735867071897, "learning_rate": 1.459136410845513e-07, "loss": 0.0, "num_input_tokens_seen": 218744560, "step": 101420 }, { "epoch": 18.613507065516607, "grad_norm": 0.00010655193909769878, "learning_rate": 1.4572166554789825e-07, "loss": 0.0, "num_input_tokens_seen": 218756528, "step": 101425 }, { "epoch": 18.61442466507616, "grad_norm": 0.0035422989167273045, "learning_rate": 1.4552981451596117e-07, "loss": 0.0, "num_input_tokens_seen": 218766608, "step": 101430 }, { "epoch": 18.615342264635714, "grad_norm": 0.0043905810452997684, "learning_rate": 1.4533808799366001e-07, "loss": 0.0, "num_input_tokens_seen": 218775696, "step": 101435 }, { "epoch": 18.616259864195264, "grad_norm": 5.447359217214398e-05, "learning_rate": 1.4514648598591309e-07, "loss": 0.0, "num_input_tokens_seen": 218787376, "step": 101440 }, { "epoch": 18.617177463754818, "grad_norm": 0.0001858557661762461, "learning_rate": 1.449550084976348e-07, "loss": 0.0, "num_input_tokens_seen": 218798032, "step": 101445 }, { "epoch": 18.61809506331437, "grad_norm": 0.001460555475205183, "learning_rate": 1.4476365553373616e-07, "loss": 0.0, "num_input_tokens_seen": 218809840, "step": 101450 }, { "epoch": 18.61901266287392, "grad_norm": 0.00015346652071457356, "learning_rate": 1.445724270991239e-07, "loss": 0.0, "num_input_tokens_seen": 218821648, "step": 101455 }, { "epoch": 18.619930262433474, "grad_norm": 8.758588955970481e-05, "learning_rate": 1.44381323198704e-07, "loss": 0.0, "num_input_tokens_seen": 218832176, "step": 101460 }, { "epoch": 18.620847861993028, "grad_norm": 0.0013387062354013324, "learning_rate": 1.4419034383737817e-07, "loss": 0.0, "num_input_tokens_seen": 218842736, "step": 101465 }, { "epoch": 18.621765461552577, "grad_norm": 0.00016936597239691764, "learning_rate": 1.4399948902004301e-07, "loss": 0.0, "num_input_tokens_seen": 218853936, "step": 101470 }, { "epoch": 18.62268306111213, "grad_norm": 6.74589755362831e-05, "learning_rate": 1.4380875875159572e-07, "loss": 0.0, "num_input_tokens_seen": 218864336, "step": 101475 }, { "epoch": 18.623600660671684, "grad_norm": 0.00010415040014777333, "learning_rate": 1.436181530369274e-07, "loss": 0.0, "num_input_tokens_seen": 218875600, "step": 101480 }, { "epoch": 18.624518260231234, "grad_norm": 0.00013411948748398572, "learning_rate": 1.434276718809263e-07, "loss": 0.0, "num_input_tokens_seen": 218886896, "step": 101485 }, { "epoch": 18.625435859790787, "grad_norm": 4.9060039600590244e-05, "learning_rate": 1.4323731528847862e-07, "loss": 0.0, "num_input_tokens_seen": 218897136, "step": 101490 }, { "epoch": 18.62635345935034, "grad_norm": 0.0003747148730326444, "learning_rate": 1.4304708326446704e-07, "loss": 0.0, "num_input_tokens_seen": 218908432, "step": 101495 }, { "epoch": 18.62727105890989, "grad_norm": 7.726281910436228e-05, "learning_rate": 1.428569758137699e-07, "loss": 0.0, "num_input_tokens_seen": 218917744, "step": 101500 }, { "epoch": 18.628188658469444, "grad_norm": 5.804347529192455e-05, "learning_rate": 1.4266699294126273e-07, "loss": 0.0, "num_input_tokens_seen": 218928400, "step": 101505 }, { "epoch": 18.629106258028997, "grad_norm": 9.263669926440343e-05, "learning_rate": 1.4247713465181946e-07, "loss": 0.0, "num_input_tokens_seen": 218938480, "step": 101510 }, { "epoch": 18.630023857588547, "grad_norm": 0.00030280000646598637, "learning_rate": 1.4228740095030945e-07, "loss": 0.0, "num_input_tokens_seen": 218949648, "step": 101515 }, { "epoch": 18.6309414571481, "grad_norm": 6.408497574739158e-05, "learning_rate": 1.4209779184159832e-07, "loss": 0.0, "num_input_tokens_seen": 218960496, "step": 101520 }, { "epoch": 18.631859056707654, "grad_norm": 0.0003756664227694273, "learning_rate": 1.419083073305505e-07, "loss": 0.0, "num_input_tokens_seen": 218970960, "step": 101525 }, { "epoch": 18.632776656267204, "grad_norm": 0.00044551261817105114, "learning_rate": 1.4171894742202487e-07, "loss": 0.0, "num_input_tokens_seen": 218981744, "step": 101530 }, { "epoch": 18.633694255826757, "grad_norm": 6.0660993767669424e-05, "learning_rate": 1.4152971212087807e-07, "loss": 0.0, "num_input_tokens_seen": 218992624, "step": 101535 }, { "epoch": 18.63461185538631, "grad_norm": 7.533036114182323e-05, "learning_rate": 1.413406014319646e-07, "loss": 0.0, "num_input_tokens_seen": 219003600, "step": 101540 }, { "epoch": 18.63552945494586, "grad_norm": 9.163007780443877e-05, "learning_rate": 1.41151615360135e-07, "loss": 0.0, "num_input_tokens_seen": 219014704, "step": 101545 }, { "epoch": 18.636447054505414, "grad_norm": 0.00015873236407060176, "learning_rate": 1.409627539102354e-07, "loss": 0.0, "num_input_tokens_seen": 219024432, "step": 101550 }, { "epoch": 18.637364654064967, "grad_norm": 5.588047133642249e-05, "learning_rate": 1.407740170871108e-07, "loss": 0.0, "num_input_tokens_seen": 219035504, "step": 101555 }, { "epoch": 18.638282253624517, "grad_norm": 0.0002073434297926724, "learning_rate": 1.4058540489560123e-07, "loss": 0.0, "num_input_tokens_seen": 219047312, "step": 101560 }, { "epoch": 18.63919985318407, "grad_norm": 0.00012678003986366093, "learning_rate": 1.4039691734054396e-07, "loss": 0.0, "num_input_tokens_seen": 219059216, "step": 101565 }, { "epoch": 18.640117452743624, "grad_norm": 0.0006721368408761919, "learning_rate": 1.4020855442677507e-07, "loss": 0.0, "num_input_tokens_seen": 219069104, "step": 101570 }, { "epoch": 18.641035052303174, "grad_norm": 0.00016318091365974396, "learning_rate": 1.400203161591246e-07, "loss": 0.0, "num_input_tokens_seen": 219079472, "step": 101575 }, { "epoch": 18.641952651862727, "grad_norm": 9.00636805454269e-05, "learning_rate": 1.3983220254242036e-07, "loss": 0.0, "num_input_tokens_seen": 219090000, "step": 101580 }, { "epoch": 18.64287025142228, "grad_norm": 7.500939682358876e-05, "learning_rate": 1.3964421358148794e-07, "loss": 0.0, "num_input_tokens_seen": 219100944, "step": 101585 }, { "epoch": 18.64378785098183, "grad_norm": 0.00011500661639729515, "learning_rate": 1.3945634928114794e-07, "loss": 0.0, "num_input_tokens_seen": 219112016, "step": 101590 }, { "epoch": 18.644705450541384, "grad_norm": 0.0005073150969110429, "learning_rate": 1.392686096462198e-07, "loss": 0.0, "num_input_tokens_seen": 219124752, "step": 101595 }, { "epoch": 18.645623050100937, "grad_norm": 0.0005549599882215261, "learning_rate": 1.3908099468151858e-07, "loss": 0.0, "num_input_tokens_seen": 219135568, "step": 101600 }, { "epoch": 18.646540649660487, "grad_norm": 0.0003679068759083748, "learning_rate": 1.3889350439185544e-07, "loss": 0.0, "num_input_tokens_seen": 219146448, "step": 101605 }, { "epoch": 18.64745824922004, "grad_norm": 0.00011653435649350286, "learning_rate": 1.3870613878204042e-07, "loss": 0.0, "num_input_tokens_seen": 219157136, "step": 101610 }, { "epoch": 18.648375848779594, "grad_norm": 0.00014200739678926766, "learning_rate": 1.3851889785687856e-07, "loss": 0.0, "num_input_tokens_seen": 219167792, "step": 101615 }, { "epoch": 18.649293448339144, "grad_norm": 7.821537292329594e-05, "learning_rate": 1.383317816211721e-07, "loss": 0.0, "num_input_tokens_seen": 219177744, "step": 101620 }, { "epoch": 18.650211047898697, "grad_norm": 7.287089829333127e-05, "learning_rate": 1.381447900797206e-07, "loss": 0.0, "num_input_tokens_seen": 219188560, "step": 101625 }, { "epoch": 18.65112864745825, "grad_norm": 0.0005543697625398636, "learning_rate": 1.3795792323732072e-07, "loss": 0.0, "num_input_tokens_seen": 219198672, "step": 101630 }, { "epoch": 18.6520462470178, "grad_norm": 9.07893045223318e-05, "learning_rate": 1.377711810987642e-07, "loss": 0.0, "num_input_tokens_seen": 219209872, "step": 101635 }, { "epoch": 18.652963846577354, "grad_norm": 9.373249486088753e-05, "learning_rate": 1.3758456366884054e-07, "loss": 0.0, "num_input_tokens_seen": 219220112, "step": 101640 }, { "epoch": 18.653881446136907, "grad_norm": 0.0003871135995723307, "learning_rate": 1.373980709523376e-07, "loss": 0.0, "num_input_tokens_seen": 219230736, "step": 101645 }, { "epoch": 18.654799045696457, "grad_norm": 5.872832116438076e-05, "learning_rate": 1.3721170295403709e-07, "loss": 0.0, "num_input_tokens_seen": 219241904, "step": 101650 }, { "epoch": 18.65571664525601, "grad_norm": 7.624323188792914e-05, "learning_rate": 1.3702545967872016e-07, "loss": 0.0, "num_input_tokens_seen": 219252976, "step": 101655 }, { "epoch": 18.656634244815564, "grad_norm": 9.528071677777916e-05, "learning_rate": 1.3683934113116304e-07, "loss": 0.0, "num_input_tokens_seen": 219262864, "step": 101660 }, { "epoch": 18.657551844375114, "grad_norm": 7.831581024220213e-05, "learning_rate": 1.366533473161402e-07, "loss": 0.0, "num_input_tokens_seen": 219274224, "step": 101665 }, { "epoch": 18.658469443934667, "grad_norm": 4.74932421639096e-05, "learning_rate": 1.3646747823842065e-07, "loss": 0.0, "num_input_tokens_seen": 219284368, "step": 101670 }, { "epoch": 18.65938704349422, "grad_norm": 6.358782411552966e-05, "learning_rate": 1.3628173390277278e-07, "loss": 0.0, "num_input_tokens_seen": 219294768, "step": 101675 }, { "epoch": 18.66030464305377, "grad_norm": 0.0015211306745186448, "learning_rate": 1.3609611431396054e-07, "loss": 0.0, "num_input_tokens_seen": 219305168, "step": 101680 }, { "epoch": 18.661222242613324, "grad_norm": 3.79183329641819e-05, "learning_rate": 1.3591061947674455e-07, "loss": 0.0, "num_input_tokens_seen": 219316784, "step": 101685 }, { "epoch": 18.662139842172877, "grad_norm": 0.00025460502365604043, "learning_rate": 1.3572524939588217e-07, "loss": 0.0, "num_input_tokens_seen": 219328688, "step": 101690 }, { "epoch": 18.663057441732427, "grad_norm": 8.659351442474872e-05, "learning_rate": 1.35540004076129e-07, "loss": 0.0, "num_input_tokens_seen": 219339536, "step": 101695 }, { "epoch": 18.66397504129198, "grad_norm": 0.00017115676018875092, "learning_rate": 1.3535488352223513e-07, "loss": 0.0, "num_input_tokens_seen": 219350960, "step": 101700 }, { "epoch": 18.664892640851534, "grad_norm": 0.04401189088821411, "learning_rate": 1.3516988773894845e-07, "loss": 0.0, "num_input_tokens_seen": 219360912, "step": 101705 }, { "epoch": 18.665810240411083, "grad_norm": 4.333233300712891e-05, "learning_rate": 1.349850167310146e-07, "loss": 0.0, "num_input_tokens_seen": 219371312, "step": 101710 }, { "epoch": 18.666727839970637, "grad_norm": 0.0003191651194356382, "learning_rate": 1.3480027050317533e-07, "loss": 0.0, "num_input_tokens_seen": 219383184, "step": 101715 }, { "epoch": 18.66764543953019, "grad_norm": 5.819519719807431e-05, "learning_rate": 1.346156490601691e-07, "loss": 0.0, "num_input_tokens_seen": 219395024, "step": 101720 }, { "epoch": 18.66856303908974, "grad_norm": 6.561056216014549e-05, "learning_rate": 1.3443115240672989e-07, "loss": 0.0, "num_input_tokens_seen": 219405552, "step": 101725 }, { "epoch": 18.669480638649294, "grad_norm": 8.553119550924748e-05, "learning_rate": 1.342467805475911e-07, "loss": 0.0, "num_input_tokens_seen": 219417584, "step": 101730 }, { "epoch": 18.670398238208847, "grad_norm": 9.228837734553963e-05, "learning_rate": 1.3406253348748123e-07, "loss": 0.0, "num_input_tokens_seen": 219428848, "step": 101735 }, { "epoch": 18.671315837768397, "grad_norm": 5.46023620699998e-05, "learning_rate": 1.3387841123112534e-07, "loss": 0.0, "num_input_tokens_seen": 219440144, "step": 101740 }, { "epoch": 18.67223343732795, "grad_norm": 0.00010182037658523768, "learning_rate": 1.336944137832469e-07, "loss": 0.0, "num_input_tokens_seen": 219451024, "step": 101745 }, { "epoch": 18.673151036887504, "grad_norm": 7.539502257714048e-05, "learning_rate": 1.3351054114856488e-07, "loss": 0.0, "num_input_tokens_seen": 219462960, "step": 101750 }, { "epoch": 18.674068636447053, "grad_norm": 5.5667122069280595e-05, "learning_rate": 1.3332679333179443e-07, "loss": 0.0, "num_input_tokens_seen": 219474768, "step": 101755 }, { "epoch": 18.674986236006607, "grad_norm": 0.00019298116967547685, "learning_rate": 1.3314317033764957e-07, "loss": 0.0, "num_input_tokens_seen": 219485680, "step": 101760 }, { "epoch": 18.67590383556616, "grad_norm": 5.419414446805604e-05, "learning_rate": 1.3295967217083926e-07, "loss": 0.0, "num_input_tokens_seen": 219495920, "step": 101765 }, { "epoch": 18.67682143512571, "grad_norm": 8.418795187026262e-05, "learning_rate": 1.3277629883607035e-07, "loss": 0.0, "num_input_tokens_seen": 219507056, "step": 101770 }, { "epoch": 18.677739034685263, "grad_norm": 6.217354530235752e-05, "learning_rate": 1.325930503380457e-07, "loss": 0.0, "num_input_tokens_seen": 219517136, "step": 101775 }, { "epoch": 18.678656634244817, "grad_norm": 7.755881233606488e-05, "learning_rate": 1.3240992668146546e-07, "loss": 0.0, "num_input_tokens_seen": 219528304, "step": 101780 }, { "epoch": 18.679574233804367, "grad_norm": 5.671638064086437e-05, "learning_rate": 1.322269278710264e-07, "loss": 0.0, "num_input_tokens_seen": 219538896, "step": 101785 }, { "epoch": 18.68049183336392, "grad_norm": 0.0010395161807537079, "learning_rate": 1.320440539114226e-07, "loss": 0.0, "num_input_tokens_seen": 219549680, "step": 101790 }, { "epoch": 18.681409432923473, "grad_norm": 0.0002903485728893429, "learning_rate": 1.3186130480734417e-07, "loss": 0.0, "num_input_tokens_seen": 219560592, "step": 101795 }, { "epoch": 18.682327032483023, "grad_norm": 0.0002882521948777139, "learning_rate": 1.3167868056347843e-07, "loss": 0.0, "num_input_tokens_seen": 219572208, "step": 101800 }, { "epoch": 18.683244632042577, "grad_norm": 0.00022955096210353076, "learning_rate": 1.3149618118450836e-07, "loss": 0.0, "num_input_tokens_seen": 219582000, "step": 101805 }, { "epoch": 18.68416223160213, "grad_norm": 7.047117833280936e-05, "learning_rate": 1.3131380667511683e-07, "loss": 0.0, "num_input_tokens_seen": 219591664, "step": 101810 }, { "epoch": 18.68507983116168, "grad_norm": 0.00010824817582033575, "learning_rate": 1.3113155703998016e-07, "loss": 0.0, "num_input_tokens_seen": 219602128, "step": 101815 }, { "epoch": 18.685997430721233, "grad_norm": 0.0033015450462698936, "learning_rate": 1.309494322837729e-07, "loss": 0.0, "num_input_tokens_seen": 219612784, "step": 101820 }, { "epoch": 18.686915030280787, "grad_norm": 6.349672912620008e-05, "learning_rate": 1.3076743241116573e-07, "loss": 0.0, "num_input_tokens_seen": 219623088, "step": 101825 }, { "epoch": 18.687832629840337, "grad_norm": 3.7587436963804066e-05, "learning_rate": 1.3058555742682777e-07, "loss": 0.0, "num_input_tokens_seen": 219633104, "step": 101830 }, { "epoch": 18.68875022939989, "grad_norm": 0.00011786824325099587, "learning_rate": 1.3040380733542356e-07, "loss": 0.0, "num_input_tokens_seen": 219644784, "step": 101835 }, { "epoch": 18.689667828959443, "grad_norm": 0.0004190178879071027, "learning_rate": 1.3022218214161442e-07, "loss": 0.0, "num_input_tokens_seen": 219656560, "step": 101840 }, { "epoch": 18.690585428518993, "grad_norm": 0.0003520628670230508, "learning_rate": 1.3004068185005881e-07, "loss": 0.0, "num_input_tokens_seen": 219667824, "step": 101845 }, { "epoch": 18.691503028078547, "grad_norm": 9.112794941756874e-05, "learning_rate": 1.2985930646541188e-07, "loss": 0.0, "num_input_tokens_seen": 219679088, "step": 101850 }, { "epoch": 18.6924206276381, "grad_norm": 0.00041386135853827, "learning_rate": 1.296780559923261e-07, "loss": 0.0, "num_input_tokens_seen": 219689808, "step": 101855 }, { "epoch": 18.69333822719765, "grad_norm": 4.1452498408034444e-05, "learning_rate": 1.2949693043544875e-07, "loss": 0.0, "num_input_tokens_seen": 219700304, "step": 101860 }, { "epoch": 18.694255826757203, "grad_norm": 0.0001460517814848572, "learning_rate": 1.2931592979942787e-07, "loss": 0.0, "num_input_tokens_seen": 219710192, "step": 101865 }, { "epoch": 18.695173426316757, "grad_norm": 0.0001931852602865547, "learning_rate": 1.2913505408890414e-07, "loss": 0.0, "num_input_tokens_seen": 219721808, "step": 101870 }, { "epoch": 18.696091025876306, "grad_norm": 0.0004967408021911979, "learning_rate": 1.289543033085161e-07, "loss": 0.0, "num_input_tokens_seen": 219733360, "step": 101875 }, { "epoch": 18.69700862543586, "grad_norm": 0.00011652849207166582, "learning_rate": 1.287736774629017e-07, "loss": 0.0, "num_input_tokens_seen": 219743952, "step": 101880 }, { "epoch": 18.697926224995413, "grad_norm": 5.080013579572551e-05, "learning_rate": 1.2859317655669279e-07, "loss": 0.0, "num_input_tokens_seen": 219753808, "step": 101885 }, { "epoch": 18.698843824554963, "grad_norm": 0.00013573562318924814, "learning_rate": 1.2841280059451844e-07, "loss": 0.0, "num_input_tokens_seen": 219764848, "step": 101890 }, { "epoch": 18.699761424114516, "grad_norm": 0.002002336550503969, "learning_rate": 1.2823254958100606e-07, "loss": 0.0, "num_input_tokens_seen": 219776272, "step": 101895 }, { "epoch": 18.70067902367407, "grad_norm": 0.0008203263278119266, "learning_rate": 1.2805242352077808e-07, "loss": 0.0, "num_input_tokens_seen": 219787248, "step": 101900 }, { "epoch": 18.70159662323362, "grad_norm": 0.0025787213817238808, "learning_rate": 1.2787242241845465e-07, "loss": 0.0, "num_input_tokens_seen": 219798288, "step": 101905 }, { "epoch": 18.702514222793173, "grad_norm": 0.0003373691870365292, "learning_rate": 1.2769254627865213e-07, "loss": 0.0, "num_input_tokens_seen": 219809776, "step": 101910 }, { "epoch": 18.703431822352726, "grad_norm": 0.0003977420274168253, "learning_rate": 1.2751279510598458e-07, "loss": 0.0, "num_input_tokens_seen": 219819792, "step": 101915 }, { "epoch": 18.704349421912276, "grad_norm": 5.9113426686963066e-05, "learning_rate": 1.273331689050622e-07, "loss": 0.0, "num_input_tokens_seen": 219831024, "step": 101920 }, { "epoch": 18.70526702147183, "grad_norm": 0.001668085460551083, "learning_rate": 1.2715366768049186e-07, "loss": 0.0, "num_input_tokens_seen": 219841808, "step": 101925 }, { "epoch": 18.706184621031383, "grad_norm": 5.708020398742519e-05, "learning_rate": 1.2697429143687768e-07, "loss": 0.0, "num_input_tokens_seen": 219852336, "step": 101930 }, { "epoch": 18.707102220590933, "grad_norm": 5.203075852477923e-05, "learning_rate": 1.2679504017882094e-07, "loss": 0.0, "num_input_tokens_seen": 219863216, "step": 101935 }, { "epoch": 18.708019820150486, "grad_norm": 5.725140363210812e-05, "learning_rate": 1.2661591391091797e-07, "loss": 0.0, "num_input_tokens_seen": 219874480, "step": 101940 }, { "epoch": 18.70893741971004, "grad_norm": 0.004708779044449329, "learning_rate": 1.2643691263776404e-07, "loss": 0.0, "num_input_tokens_seen": 219883024, "step": 101945 }, { "epoch": 18.70985501926959, "grad_norm": 0.0001241457212017849, "learning_rate": 1.262580363639504e-07, "loss": 0.0, "num_input_tokens_seen": 219894384, "step": 101950 }, { "epoch": 18.710772618829143, "grad_norm": 4.6044355258345604e-05, "learning_rate": 1.2607928509406452e-07, "loss": 0.0, "num_input_tokens_seen": 219904816, "step": 101955 }, { "epoch": 18.711690218388696, "grad_norm": 0.00017097045201808214, "learning_rate": 1.2590065883269e-07, "loss": 0.0, "num_input_tokens_seen": 219915152, "step": 101960 }, { "epoch": 18.712607817948246, "grad_norm": 5.680712274624966e-05, "learning_rate": 1.257221575844103e-07, "loss": 0.0, "num_input_tokens_seen": 219924528, "step": 101965 }, { "epoch": 18.7135254175078, "grad_norm": 4.7572641051374376e-05, "learning_rate": 1.2554378135380296e-07, "loss": 0.0, "num_input_tokens_seen": 219934704, "step": 101970 }, { "epoch": 18.714443017067353, "grad_norm": 5.175309343030676e-05, "learning_rate": 1.2536553014544263e-07, "loss": 0.0, "num_input_tokens_seen": 219945360, "step": 101975 }, { "epoch": 18.715360616626903, "grad_norm": 6.910380761837587e-05, "learning_rate": 1.2518740396390115e-07, "loss": 0.0, "num_input_tokens_seen": 219956304, "step": 101980 }, { "epoch": 18.716278216186456, "grad_norm": 7.65676741139032e-05, "learning_rate": 1.2500940281374774e-07, "loss": 0.0, "num_input_tokens_seen": 219968144, "step": 101985 }, { "epoch": 18.71719581574601, "grad_norm": 5.440140739665367e-05, "learning_rate": 1.2483152669954756e-07, "loss": 0.0, "num_input_tokens_seen": 219979216, "step": 101990 }, { "epoch": 18.71811341530556, "grad_norm": 8.073603385128081e-05, "learning_rate": 1.2465377562586312e-07, "loss": 0.0, "num_input_tokens_seen": 219990352, "step": 101995 }, { "epoch": 18.719031014865113, "grad_norm": 0.0003066153440158814, "learning_rate": 1.244761495972535e-07, "loss": 0.0, "num_input_tokens_seen": 220001776, "step": 102000 }, { "epoch": 18.719948614424666, "grad_norm": 0.00019575562328100204, "learning_rate": 1.2429864861827345e-07, "loss": 0.0, "num_input_tokens_seen": 220012240, "step": 102005 }, { "epoch": 18.720866213984216, "grad_norm": 0.00035440095234662294, "learning_rate": 1.2412127269347653e-07, "loss": 0.0, "num_input_tokens_seen": 220023344, "step": 102010 }, { "epoch": 18.72178381354377, "grad_norm": 0.00039900682168081403, "learning_rate": 1.239440218274124e-07, "loss": 0.0, "num_input_tokens_seen": 220033840, "step": 102015 }, { "epoch": 18.722701413103323, "grad_norm": 7.112749881343916e-05, "learning_rate": 1.237668960246269e-07, "loss": 0.0, "num_input_tokens_seen": 220043888, "step": 102020 }, { "epoch": 18.723619012662873, "grad_norm": 6.341110565699637e-05, "learning_rate": 1.2358989528966303e-07, "loss": 0.0, "num_input_tokens_seen": 220054960, "step": 102025 }, { "epoch": 18.724536612222426, "grad_norm": 0.00011276639997959137, "learning_rate": 1.2341301962706054e-07, "loss": 0.0, "num_input_tokens_seen": 220066224, "step": 102030 }, { "epoch": 18.72545421178198, "grad_norm": 3.74700830434449e-05, "learning_rate": 1.2323626904135578e-07, "loss": 0.0, "num_input_tokens_seen": 220077296, "step": 102035 }, { "epoch": 18.72637181134153, "grad_norm": 0.00011642361641861498, "learning_rate": 1.2305964353708289e-07, "loss": 0.0, "num_input_tokens_seen": 220088624, "step": 102040 }, { "epoch": 18.727289410901083, "grad_norm": 0.00012912646343465894, "learning_rate": 1.2288314311877103e-07, "loss": 0.0, "num_input_tokens_seen": 220099952, "step": 102045 }, { "epoch": 18.728207010460636, "grad_norm": 0.0001181327024823986, "learning_rate": 1.2270676779094827e-07, "loss": 0.0, "num_input_tokens_seen": 220110192, "step": 102050 }, { "epoch": 18.729124610020186, "grad_norm": 4.647509558708407e-05, "learning_rate": 1.225305175581376e-07, "loss": 0.0, "num_input_tokens_seen": 220120816, "step": 102055 }, { "epoch": 18.73004220957974, "grad_norm": 0.000920722377486527, "learning_rate": 1.2235439242485937e-07, "loss": 0.0, "num_input_tokens_seen": 220131504, "step": 102060 }, { "epoch": 18.730959809139293, "grad_norm": 9.45612700888887e-05, "learning_rate": 1.2217839239563156e-07, "loss": 0.0, "num_input_tokens_seen": 220142672, "step": 102065 }, { "epoch": 18.731877408698843, "grad_norm": 0.00012582752970047295, "learning_rate": 1.2200251747496838e-07, "loss": 0.0, "num_input_tokens_seen": 220152752, "step": 102070 }, { "epoch": 18.732795008258396, "grad_norm": 7.570716115878895e-05, "learning_rate": 1.2182676766738012e-07, "loss": 0.0, "num_input_tokens_seen": 220162480, "step": 102075 }, { "epoch": 18.73371260781795, "grad_norm": 0.00018781448306981474, "learning_rate": 1.216511429773748e-07, "loss": 0.0, "num_input_tokens_seen": 220173200, "step": 102080 }, { "epoch": 18.7346302073775, "grad_norm": 0.00579188484698534, "learning_rate": 1.2147564340945718e-07, "loss": 0.0, "num_input_tokens_seen": 220184464, "step": 102085 }, { "epoch": 18.735547806937053, "grad_norm": 8.496928057866171e-05, "learning_rate": 1.2130026896812809e-07, "loss": 0.0, "num_input_tokens_seen": 220195984, "step": 102090 }, { "epoch": 18.736465406496606, "grad_norm": 0.0003414596721995622, "learning_rate": 1.2112501965788558e-07, "loss": 0.0, "num_input_tokens_seen": 220206864, "step": 102095 }, { "epoch": 18.737383006056156, "grad_norm": 0.0004061747749801725, "learning_rate": 1.20949895483225e-07, "loss": 0.0, "num_input_tokens_seen": 220217680, "step": 102100 }, { "epoch": 18.73830060561571, "grad_norm": 5.304738078848459e-05, "learning_rate": 1.207748964486377e-07, "loss": 0.0, "num_input_tokens_seen": 220230160, "step": 102105 }, { "epoch": 18.739218205175263, "grad_norm": 6.479444709839299e-05, "learning_rate": 1.206000225586118e-07, "loss": 0.0, "num_input_tokens_seen": 220241712, "step": 102110 }, { "epoch": 18.740135804734813, "grad_norm": 7.456284947693348e-05, "learning_rate": 1.2042527381763313e-07, "loss": 0.0, "num_input_tokens_seen": 220253648, "step": 102115 }, { "epoch": 18.741053404294366, "grad_norm": 0.0001345051423413679, "learning_rate": 1.2025065023018423e-07, "loss": 0.0, "num_input_tokens_seen": 220265040, "step": 102120 }, { "epoch": 18.74197100385392, "grad_norm": 7.434100552927703e-05, "learning_rate": 1.2007615180074206e-07, "loss": 0.0, "num_input_tokens_seen": 220275536, "step": 102125 }, { "epoch": 18.74288860341347, "grad_norm": 7.99116023699753e-05, "learning_rate": 1.1990177853378415e-07, "loss": 0.0, "num_input_tokens_seen": 220286000, "step": 102130 }, { "epoch": 18.743806202973023, "grad_norm": 0.00014380675565917045, "learning_rate": 1.197275304337825e-07, "loss": 0.0, "num_input_tokens_seen": 220296656, "step": 102135 }, { "epoch": 18.744723802532576, "grad_norm": 9.250835137208924e-05, "learning_rate": 1.1955340750520516e-07, "loss": 0.0, "num_input_tokens_seen": 220308208, "step": 102140 }, { "epoch": 18.745641402092126, "grad_norm": 6.172264693304896e-05, "learning_rate": 1.1937940975251916e-07, "loss": 0.0, "num_input_tokens_seen": 220319952, "step": 102145 }, { "epoch": 18.74655900165168, "grad_norm": 9.262995445169508e-05, "learning_rate": 1.1920553718018702e-07, "loss": 0.0, "num_input_tokens_seen": 220330384, "step": 102150 }, { "epoch": 18.747476601211233, "grad_norm": 5.84958543186076e-05, "learning_rate": 1.1903178979266905e-07, "loss": 0.0, "num_input_tokens_seen": 220341136, "step": 102155 }, { "epoch": 18.748394200770782, "grad_norm": 0.00016153769684024155, "learning_rate": 1.1885816759441948e-07, "loss": 0.0, "num_input_tokens_seen": 220352208, "step": 102160 }, { "epoch": 18.749311800330336, "grad_norm": 0.0014231824316084385, "learning_rate": 1.1868467058989364e-07, "loss": 0.0, "num_input_tokens_seen": 220363792, "step": 102165 }, { "epoch": 18.75022939988989, "grad_norm": 9.155632142210379e-05, "learning_rate": 1.1851129878354072e-07, "loss": 0.0, "num_input_tokens_seen": 220374800, "step": 102170 }, { "epoch": 18.75114699944944, "grad_norm": 4.1412644350202754e-05, "learning_rate": 1.183380521798072e-07, "loss": 0.0, "num_input_tokens_seen": 220386192, "step": 102175 }, { "epoch": 18.752064599008992, "grad_norm": 0.0002839638327714056, "learning_rate": 1.1816493078313674e-07, "loss": 0.0, "num_input_tokens_seen": 220397296, "step": 102180 }, { "epoch": 18.752982198568546, "grad_norm": 4.6096280129859224e-05, "learning_rate": 1.1799193459796965e-07, "loss": 0.0, "num_input_tokens_seen": 220408304, "step": 102185 }, { "epoch": 18.753899798128096, "grad_norm": 0.0002210037928307429, "learning_rate": 1.1781906362874296e-07, "loss": 0.0, "num_input_tokens_seen": 220418800, "step": 102190 }, { "epoch": 18.75481739768765, "grad_norm": 0.0001406310620950535, "learning_rate": 1.176463178798909e-07, "loss": 0.0, "num_input_tokens_seen": 220429232, "step": 102195 }, { "epoch": 18.755734997247203, "grad_norm": 0.0006522493786178529, "learning_rate": 1.1747369735584324e-07, "loss": 0.0, "num_input_tokens_seen": 220440496, "step": 102200 }, { "epoch": 18.756652596806752, "grad_norm": 0.0002257504384033382, "learning_rate": 1.1730120206102869e-07, "loss": 0.0, "num_input_tokens_seen": 220449680, "step": 102205 }, { "epoch": 18.757570196366306, "grad_norm": 9.279829828301445e-05, "learning_rate": 1.1712883199987035e-07, "loss": 0.0, "num_input_tokens_seen": 220459312, "step": 102210 }, { "epoch": 18.75848779592586, "grad_norm": 0.0002158431161660701, "learning_rate": 1.1695658717679026e-07, "loss": 0.0, "num_input_tokens_seen": 220469936, "step": 102215 }, { "epoch": 18.75940539548541, "grad_norm": 4.347172216512263e-05, "learning_rate": 1.1678446759620543e-07, "loss": 0.0, "num_input_tokens_seen": 220480400, "step": 102220 }, { "epoch": 18.760322995044962, "grad_norm": 8.598734711995348e-05, "learning_rate": 1.1661247326253011e-07, "loss": 0.0, "num_input_tokens_seen": 220489456, "step": 102225 }, { "epoch": 18.761240594604516, "grad_norm": 0.0007151756435632706, "learning_rate": 1.1644060418017689e-07, "loss": 0.0, "num_input_tokens_seen": 220500528, "step": 102230 }, { "epoch": 18.762158194164066, "grad_norm": 4.94617925141938e-05, "learning_rate": 1.1626886035355334e-07, "loss": 0.0, "num_input_tokens_seen": 220512752, "step": 102235 }, { "epoch": 18.76307579372362, "grad_norm": 5.626420897897333e-05, "learning_rate": 1.1609724178706427e-07, "loss": 0.0, "num_input_tokens_seen": 220522928, "step": 102240 }, { "epoch": 18.763993393283172, "grad_norm": 5.63460671401117e-05, "learning_rate": 1.1592574848511118e-07, "loss": 0.0, "num_input_tokens_seen": 220533424, "step": 102245 }, { "epoch": 18.764910992842722, "grad_norm": 7.275010284502059e-05, "learning_rate": 1.1575438045209331e-07, "loss": 0.0, "num_input_tokens_seen": 220544400, "step": 102250 }, { "epoch": 18.765828592402276, "grad_norm": 0.00023477032664231956, "learning_rate": 1.1558313769240603e-07, "loss": 0.0, "num_input_tokens_seen": 220554640, "step": 102255 }, { "epoch": 18.76674619196183, "grad_norm": 0.0005930551560595632, "learning_rate": 1.1541202021044029e-07, "loss": 0.0, "num_input_tokens_seen": 220564496, "step": 102260 }, { "epoch": 18.76766379152138, "grad_norm": 0.0001152294862549752, "learning_rate": 1.1524102801058646e-07, "loss": 0.0, "num_input_tokens_seen": 220575056, "step": 102265 }, { "epoch": 18.768581391080932, "grad_norm": 7.784117042319849e-05, "learning_rate": 1.150701610972299e-07, "loss": 0.0, "num_input_tokens_seen": 220585584, "step": 102270 }, { "epoch": 18.769498990640486, "grad_norm": 5.892195622436702e-05, "learning_rate": 1.1489941947475213e-07, "loss": 0.0, "num_input_tokens_seen": 220595792, "step": 102275 }, { "epoch": 18.770416590200036, "grad_norm": 6.729485903633758e-05, "learning_rate": 1.1472880314753299e-07, "loss": 0.0, "num_input_tokens_seen": 220605776, "step": 102280 }, { "epoch": 18.77133418975959, "grad_norm": 0.00010166379797738045, "learning_rate": 1.1455831211994895e-07, "loss": 0.0, "num_input_tokens_seen": 220617168, "step": 102285 }, { "epoch": 18.772251789319142, "grad_norm": 7.459484186256304e-05, "learning_rate": 1.1438794639637264e-07, "loss": 0.0, "num_input_tokens_seen": 220628176, "step": 102290 }, { "epoch": 18.773169388878692, "grad_norm": 0.001570180058479309, "learning_rate": 1.1421770598117276e-07, "loss": 0.0, "num_input_tokens_seen": 220639728, "step": 102295 }, { "epoch": 18.774086988438246, "grad_norm": 0.00015889097994659096, "learning_rate": 1.1404759087871697e-07, "loss": 0.0, "num_input_tokens_seen": 220651088, "step": 102300 }, { "epoch": 18.7750045879978, "grad_norm": 0.00024364149430766702, "learning_rate": 1.1387760109336788e-07, "loss": 0.0, "num_input_tokens_seen": 220660688, "step": 102305 }, { "epoch": 18.77592218755735, "grad_norm": 0.00024423268041573465, "learning_rate": 1.1370773662948532e-07, "loss": 0.0, "num_input_tokens_seen": 220671984, "step": 102310 }, { "epoch": 18.776839787116902, "grad_norm": 0.00046890441444702446, "learning_rate": 1.1353799749142636e-07, "loss": 0.0, "num_input_tokens_seen": 220683984, "step": 102315 }, { "epoch": 18.777757386676456, "grad_norm": 0.0007323520840145648, "learning_rate": 1.1336838368354419e-07, "loss": 0.0, "num_input_tokens_seen": 220694800, "step": 102320 }, { "epoch": 18.778674986236005, "grad_norm": 5.16679574502632e-05, "learning_rate": 1.1319889521018978e-07, "loss": 0.0, "num_input_tokens_seen": 220705296, "step": 102325 }, { "epoch": 18.77959258579556, "grad_norm": 7.969063153723255e-05, "learning_rate": 1.1302953207570965e-07, "loss": 0.0, "num_input_tokens_seen": 220715664, "step": 102330 }, { "epoch": 18.780510185355112, "grad_norm": 0.004195498768240213, "learning_rate": 1.1286029428444812e-07, "loss": 0.0, "num_input_tokens_seen": 220725328, "step": 102335 }, { "epoch": 18.781427784914662, "grad_norm": 9.8874639661517e-05, "learning_rate": 1.1269118184074556e-07, "loss": 0.0, "num_input_tokens_seen": 220735632, "step": 102340 }, { "epoch": 18.782345384474215, "grad_norm": 0.0006181288044899702, "learning_rate": 1.125221947489391e-07, "loss": 0.0, "num_input_tokens_seen": 220746160, "step": 102345 }, { "epoch": 18.78326298403377, "grad_norm": 6.884142203489318e-05, "learning_rate": 1.1235333301336415e-07, "loss": 0.0329, "num_input_tokens_seen": 220756880, "step": 102350 }, { "epoch": 18.78418058359332, "grad_norm": 0.00041278405115008354, "learning_rate": 1.1218459663835058e-07, "loss": 0.0, "num_input_tokens_seen": 220767952, "step": 102355 }, { "epoch": 18.785098183152872, "grad_norm": 0.0064937942661345005, "learning_rate": 1.1201598562822713e-07, "loss": 0.0, "num_input_tokens_seen": 220778800, "step": 102360 }, { "epoch": 18.786015782712425, "grad_norm": 4.962446473655291e-05, "learning_rate": 1.1184749998731703e-07, "loss": 0.0, "num_input_tokens_seen": 220789264, "step": 102365 }, { "epoch": 18.786933382271975, "grad_norm": 3.435588951106183e-05, "learning_rate": 1.1167913971994348e-07, "loss": 0.0, "num_input_tokens_seen": 220800400, "step": 102370 }, { "epoch": 18.78785098183153, "grad_norm": 7.924600504338741e-05, "learning_rate": 1.1151090483042359e-07, "loss": 0.0, "num_input_tokens_seen": 220811536, "step": 102375 }, { "epoch": 18.788768581391082, "grad_norm": 0.0023183710873126984, "learning_rate": 1.1134279532307224e-07, "loss": 0.0, "num_input_tokens_seen": 220821680, "step": 102380 }, { "epoch": 18.789686180950632, "grad_norm": 0.00010085143003379926, "learning_rate": 1.1117481120220208e-07, "loss": 0.0, "num_input_tokens_seen": 220832912, "step": 102385 }, { "epoch": 18.790603780510185, "grad_norm": 0.0001676558458711952, "learning_rate": 1.1100695247212079e-07, "loss": 0.0, "num_input_tokens_seen": 220844848, "step": 102390 }, { "epoch": 18.79152138006974, "grad_norm": 0.0006915385602042079, "learning_rate": 1.1083921913713325e-07, "loss": 0.0, "num_input_tokens_seen": 220855632, "step": 102395 }, { "epoch": 18.79243897962929, "grad_norm": 9.768067684490234e-05, "learning_rate": 1.1067161120154268e-07, "loss": 0.0, "num_input_tokens_seen": 220866064, "step": 102400 }, { "epoch": 18.793356579188842, "grad_norm": 7.038481999188662e-05, "learning_rate": 1.1050412866964789e-07, "loss": 0.0, "num_input_tokens_seen": 220875312, "step": 102405 }, { "epoch": 18.794274178748395, "grad_norm": 8.710323891136795e-05, "learning_rate": 1.1033677154574373e-07, "loss": 0.0, "num_input_tokens_seen": 220886352, "step": 102410 }, { "epoch": 18.795191778307945, "grad_norm": 3.873493187711574e-05, "learning_rate": 1.1016953983412349e-07, "loss": 0.0, "num_input_tokens_seen": 220897040, "step": 102415 }, { "epoch": 18.7961093778675, "grad_norm": 0.00018519940203987062, "learning_rate": 1.1000243353907536e-07, "loss": 0.0, "num_input_tokens_seen": 220908752, "step": 102420 }, { "epoch": 18.797026977427052, "grad_norm": 9.909279469866306e-05, "learning_rate": 1.098354526648865e-07, "loss": 0.0, "num_input_tokens_seen": 220920144, "step": 102425 }, { "epoch": 18.797944576986602, "grad_norm": 0.0006341963307932019, "learning_rate": 1.09668597215839e-07, "loss": 0.0, "num_input_tokens_seen": 220930480, "step": 102430 }, { "epoch": 18.798862176546155, "grad_norm": 0.0004779851878993213, "learning_rate": 1.095018671962128e-07, "loss": 0.0, "num_input_tokens_seen": 220941904, "step": 102435 }, { "epoch": 18.79977977610571, "grad_norm": 0.0003253152535762638, "learning_rate": 1.0933526261028449e-07, "loss": 0.0, "num_input_tokens_seen": 220954000, "step": 102440 }, { "epoch": 18.80069737566526, "grad_norm": 8.272096602013335e-05, "learning_rate": 1.0916878346232618e-07, "loss": 0.0, "num_input_tokens_seen": 220963920, "step": 102445 }, { "epoch": 18.801614975224812, "grad_norm": 3.416230538277887e-05, "learning_rate": 1.0900242975660835e-07, "loss": 0.0, "num_input_tokens_seen": 220974640, "step": 102450 }, { "epoch": 18.802532574784365, "grad_norm": 6.081823448766954e-05, "learning_rate": 1.0883620149739871e-07, "loss": 0.0, "num_input_tokens_seen": 220985936, "step": 102455 }, { "epoch": 18.803450174343915, "grad_norm": 5.329707346390933e-05, "learning_rate": 1.0867009868895939e-07, "loss": 0.0, "num_input_tokens_seen": 220997136, "step": 102460 }, { "epoch": 18.80436777390347, "grad_norm": 6.537114677485079e-05, "learning_rate": 1.0850412133555088e-07, "loss": 0.0, "num_input_tokens_seen": 221008304, "step": 102465 }, { "epoch": 18.805285373463022, "grad_norm": 7.368376827798784e-05, "learning_rate": 1.0833826944143089e-07, "loss": 0.0, "num_input_tokens_seen": 221018896, "step": 102470 }, { "epoch": 18.80620297302257, "grad_norm": 8.490230538882315e-05, "learning_rate": 1.0817254301085267e-07, "loss": 0.0, "num_input_tokens_seen": 221029616, "step": 102475 }, { "epoch": 18.807120572582125, "grad_norm": 4.676240496337414e-05, "learning_rate": 1.0800694204806672e-07, "loss": 0.0, "num_input_tokens_seen": 221041328, "step": 102480 }, { "epoch": 18.80803817214168, "grad_norm": 0.00010366500646341592, "learning_rate": 1.078414665573213e-07, "loss": 0.0, "num_input_tokens_seen": 221052624, "step": 102485 }, { "epoch": 18.80895577170123, "grad_norm": 5.508357207872905e-05, "learning_rate": 1.0767611654286025e-07, "loss": 0.0, "num_input_tokens_seen": 221063984, "step": 102490 }, { "epoch": 18.80987337126078, "grad_norm": 6.782417767681181e-05, "learning_rate": 1.0751089200892461e-07, "loss": 0.0, "num_input_tokens_seen": 221074896, "step": 102495 }, { "epoch": 18.810790970820335, "grad_norm": 0.0003439837309997529, "learning_rate": 1.0734579295975101e-07, "loss": 0.0, "num_input_tokens_seen": 221086672, "step": 102500 }, { "epoch": 18.811708570379885, "grad_norm": 0.00018007440667133778, "learning_rate": 1.0718081939957548e-07, "loss": 0.0, "num_input_tokens_seen": 221097200, "step": 102505 }, { "epoch": 18.81262616993944, "grad_norm": 3.3907108445419e-05, "learning_rate": 1.0701597133262908e-07, "loss": 0.0, "num_input_tokens_seen": 221109168, "step": 102510 }, { "epoch": 18.81354376949899, "grad_norm": 9.94526781141758e-05, "learning_rate": 1.0685124876313901e-07, "loss": 0.0, "num_input_tokens_seen": 221119152, "step": 102515 }, { "epoch": 18.81446136905854, "grad_norm": 4.706934851128608e-05, "learning_rate": 1.0668665169533076e-07, "loss": 0.0, "num_input_tokens_seen": 221129200, "step": 102520 }, { "epoch": 18.815378968618095, "grad_norm": 4.805213393410668e-05, "learning_rate": 1.0652218013342596e-07, "loss": 0.0, "num_input_tokens_seen": 221140080, "step": 102525 }, { "epoch": 18.81629656817765, "grad_norm": 0.0029721837490797043, "learning_rate": 1.0635783408164291e-07, "loss": 0.0, "num_input_tokens_seen": 221151088, "step": 102530 }, { "epoch": 18.8172141677372, "grad_norm": 4.0829174395184964e-05, "learning_rate": 1.0619361354419766e-07, "loss": 0.0, "num_input_tokens_seen": 221160688, "step": 102535 }, { "epoch": 18.81813176729675, "grad_norm": 8.158809941960499e-05, "learning_rate": 1.0602951852530075e-07, "loss": 0.0, "num_input_tokens_seen": 221171952, "step": 102540 }, { "epoch": 18.819049366856305, "grad_norm": 9.805244917515665e-05, "learning_rate": 1.0586554902916214e-07, "loss": 0.0, "num_input_tokens_seen": 221183312, "step": 102545 }, { "epoch": 18.819966966415855, "grad_norm": 0.0002790113212540746, "learning_rate": 1.0570170505998679e-07, "loss": 0.0, "num_input_tokens_seen": 221194096, "step": 102550 }, { "epoch": 18.82088456597541, "grad_norm": 0.001735315890982747, "learning_rate": 1.0553798662197745e-07, "loss": 0.0, "num_input_tokens_seen": 221204976, "step": 102555 }, { "epoch": 18.82180216553496, "grad_norm": 6.381617276929319e-05, "learning_rate": 1.05374393719333e-07, "loss": 0.0, "num_input_tokens_seen": 221216784, "step": 102560 }, { "epoch": 18.82271976509451, "grad_norm": 6.993652641540393e-05, "learning_rate": 1.0521092635624897e-07, "loss": 0.0, "num_input_tokens_seen": 221228080, "step": 102565 }, { "epoch": 18.823637364654065, "grad_norm": 5.851400055689737e-05, "learning_rate": 1.0504758453691866e-07, "loss": 0.0, "num_input_tokens_seen": 221239312, "step": 102570 }, { "epoch": 18.82455496421362, "grad_norm": 0.0002981102152261883, "learning_rate": 1.048843682655315e-07, "loss": 0.0, "num_input_tokens_seen": 221250512, "step": 102575 }, { "epoch": 18.825472563773168, "grad_norm": 0.00030719314236193895, "learning_rate": 1.0472127754627304e-07, "loss": 0.0, "num_input_tokens_seen": 221261744, "step": 102580 }, { "epoch": 18.82639016333272, "grad_norm": 9.143204079009593e-05, "learning_rate": 1.045583123833277e-07, "loss": 0.0, "num_input_tokens_seen": 221273296, "step": 102585 }, { "epoch": 18.827307762892275, "grad_norm": 7.847720553399995e-05, "learning_rate": 1.043954727808738e-07, "loss": 0.0, "num_input_tokens_seen": 221284592, "step": 102590 }, { "epoch": 18.828225362451825, "grad_norm": 0.001017986098304391, "learning_rate": 1.0423275874308858e-07, "loss": 0.0, "num_input_tokens_seen": 221295696, "step": 102595 }, { "epoch": 18.829142962011378, "grad_norm": 6.23643645667471e-05, "learning_rate": 1.0407017027414535e-07, "loss": 0.0, "num_input_tokens_seen": 221306064, "step": 102600 }, { "epoch": 18.83006056157093, "grad_norm": 0.00018657381588127464, "learning_rate": 1.039077073782141e-07, "loss": 0.0, "num_input_tokens_seen": 221316880, "step": 102605 }, { "epoch": 18.83097816113048, "grad_norm": 5.707895616069436e-05, "learning_rate": 1.0374537005946261e-07, "loss": 0.0, "num_input_tokens_seen": 221327792, "step": 102610 }, { "epoch": 18.831895760690035, "grad_norm": 0.00036682674544863403, "learning_rate": 1.0358315832205257e-07, "loss": 0.0, "num_input_tokens_seen": 221338992, "step": 102615 }, { "epoch": 18.832813360249588, "grad_norm": 0.00011170373181812465, "learning_rate": 1.0342107217014674e-07, "loss": 0.0, "num_input_tokens_seen": 221350288, "step": 102620 }, { "epoch": 18.833730959809138, "grad_norm": 4.4757038267562166e-05, "learning_rate": 1.0325911160790126e-07, "loss": 0.0, "num_input_tokens_seen": 221360496, "step": 102625 }, { "epoch": 18.83464855936869, "grad_norm": 0.00015594306751154363, "learning_rate": 1.0309727663947055e-07, "loss": 0.0, "num_input_tokens_seen": 221371472, "step": 102630 }, { "epoch": 18.835566158928245, "grad_norm": 6.387633038684726e-05, "learning_rate": 1.0293556726900522e-07, "loss": 0.0, "num_input_tokens_seen": 221381552, "step": 102635 }, { "epoch": 18.836483758487795, "grad_norm": 0.00010487446706974879, "learning_rate": 1.0277398350065249e-07, "loss": 0.0, "num_input_tokens_seen": 221392368, "step": 102640 }, { "epoch": 18.837401358047348, "grad_norm": 8.443247497780249e-05, "learning_rate": 1.0261252533855681e-07, "loss": 0.0, "num_input_tokens_seen": 221402864, "step": 102645 }, { "epoch": 18.8383189576069, "grad_norm": 0.00012598282773979008, "learning_rate": 1.0245119278685989e-07, "loss": 0.0, "num_input_tokens_seen": 221413296, "step": 102650 }, { "epoch": 18.83923655716645, "grad_norm": 3.990321420133114e-05, "learning_rate": 1.0228998584969951e-07, "loss": 0.0, "num_input_tokens_seen": 221423440, "step": 102655 }, { "epoch": 18.840154156726005, "grad_norm": 2.581475564511493e-05, "learning_rate": 1.0212890453121016e-07, "loss": 0.0284, "num_input_tokens_seen": 221433136, "step": 102660 }, { "epoch": 18.841071756285558, "grad_norm": 0.000633329909760505, "learning_rate": 1.0196794883552296e-07, "loss": 0.0, "num_input_tokens_seen": 221444432, "step": 102665 }, { "epoch": 18.841989355845108, "grad_norm": 3.761745392694138e-05, "learning_rate": 1.0180711876676686e-07, "loss": 0.0, "num_input_tokens_seen": 221455504, "step": 102670 }, { "epoch": 18.84290695540466, "grad_norm": 0.0002928038884419948, "learning_rate": 1.0164641432906686e-07, "loss": 0.0, "num_input_tokens_seen": 221467600, "step": 102675 }, { "epoch": 18.843824554964215, "grad_norm": 7.148359873099253e-05, "learning_rate": 1.0148583552654467e-07, "loss": 0.0, "num_input_tokens_seen": 221478512, "step": 102680 }, { "epoch": 18.844742154523765, "grad_norm": 0.00011649748921627179, "learning_rate": 1.0132538236331813e-07, "loss": 0.0, "num_input_tokens_seen": 221488496, "step": 102685 }, { "epoch": 18.845659754083318, "grad_norm": 0.0007793043623678386, "learning_rate": 1.0116505484350392e-07, "loss": 0.0, "num_input_tokens_seen": 221498960, "step": 102690 }, { "epoch": 18.84657735364287, "grad_norm": 8.899450767785311e-05, "learning_rate": 1.0100485297121321e-07, "loss": 0.0, "num_input_tokens_seen": 221511088, "step": 102695 }, { "epoch": 18.84749495320242, "grad_norm": 0.002070442773401737, "learning_rate": 1.0084477675055548e-07, "loss": 0.0, "num_input_tokens_seen": 221521808, "step": 102700 }, { "epoch": 18.848412552761975, "grad_norm": 12.123217582702637, "learning_rate": 1.006848261856358e-07, "loss": 0.0284, "num_input_tokens_seen": 221533232, "step": 102705 }, { "epoch": 18.849330152321528, "grad_norm": 7.750363147351891e-05, "learning_rate": 1.0052500128055753e-07, "loss": 0.0, "num_input_tokens_seen": 221545520, "step": 102710 }, { "epoch": 18.850247751881078, "grad_norm": 0.000920455961022526, "learning_rate": 1.0036530203941908e-07, "loss": 0.0, "num_input_tokens_seen": 221556016, "step": 102715 }, { "epoch": 18.85116535144063, "grad_norm": 0.0001315951521974057, "learning_rate": 1.0020572846631771e-07, "loss": 0.0, "num_input_tokens_seen": 221564912, "step": 102720 }, { "epoch": 18.852082951000185, "grad_norm": 7.134001498343423e-05, "learning_rate": 1.000462805653446e-07, "loss": 0.0, "num_input_tokens_seen": 221576208, "step": 102725 }, { "epoch": 18.853000550559734, "grad_norm": 0.00025609639124013484, "learning_rate": 9.988695834059092e-08, "loss": 0.0, "num_input_tokens_seen": 221586576, "step": 102730 }, { "epoch": 18.853918150119288, "grad_norm": 0.0003820870188064873, "learning_rate": 9.972776179614118e-08, "loss": 0.0, "num_input_tokens_seen": 221596944, "step": 102735 }, { "epoch": 18.85483574967884, "grad_norm": 0.0022004232741892338, "learning_rate": 9.956869093608046e-08, "loss": 0.0, "num_input_tokens_seen": 221606352, "step": 102740 }, { "epoch": 18.85575334923839, "grad_norm": 5.2890773076796904e-05, "learning_rate": 9.94097457644877e-08, "loss": 0.0, "num_input_tokens_seen": 221617808, "step": 102745 }, { "epoch": 18.856670948797944, "grad_norm": 0.0015671132132411003, "learning_rate": 9.925092628543908e-08, "loss": 0.0, "num_input_tokens_seen": 221628432, "step": 102750 }, { "epoch": 18.857588548357498, "grad_norm": 5.0571190513437614e-05, "learning_rate": 9.90922325030097e-08, "loss": 0.0, "num_input_tokens_seen": 221638928, "step": 102755 }, { "epoch": 18.858506147917048, "grad_norm": 0.0002444011624902487, "learning_rate": 9.89336644212685e-08, "loss": 0.004, "num_input_tokens_seen": 221648304, "step": 102760 }, { "epoch": 18.8594237474766, "grad_norm": 0.0010114508913829923, "learning_rate": 9.877522204428225e-08, "loss": 0.0, "num_input_tokens_seen": 221658704, "step": 102765 }, { "epoch": 18.860341347036155, "grad_norm": 0.0004980329540558159, "learning_rate": 9.861690537611601e-08, "loss": 0.0, "num_input_tokens_seen": 221669840, "step": 102770 }, { "epoch": 18.861258946595704, "grad_norm": 5.60886473977007e-05, "learning_rate": 9.845871442082989e-08, "loss": 0.0, "num_input_tokens_seen": 221682640, "step": 102775 }, { "epoch": 18.862176546155258, "grad_norm": 0.0018293455941602588, "learning_rate": 9.830064918248061e-08, "loss": 0.0, "num_input_tokens_seen": 221694000, "step": 102780 }, { "epoch": 18.86309414571481, "grad_norm": 7.220797124318779e-05, "learning_rate": 9.814270966512218e-08, "loss": 0.0, "num_input_tokens_seen": 221704592, "step": 102785 }, { "epoch": 18.86401174527436, "grad_norm": 0.000138570845592767, "learning_rate": 9.79848958728069e-08, "loss": 0.0, "num_input_tokens_seen": 221715632, "step": 102790 }, { "epoch": 18.864929344833914, "grad_norm": 7.078857015585527e-05, "learning_rate": 9.782720780958155e-08, "loss": 0.0, "num_input_tokens_seen": 221726128, "step": 102795 }, { "epoch": 18.865846944393468, "grad_norm": 5.869085725862533e-05, "learning_rate": 9.766964547949009e-08, "loss": 0.0, "num_input_tokens_seen": 221735440, "step": 102800 }, { "epoch": 18.866764543953018, "grad_norm": 0.00012379858526401222, "learning_rate": 9.751220888657486e-08, "loss": 0.0, "num_input_tokens_seen": 221745872, "step": 102805 }, { "epoch": 18.86768214351257, "grad_norm": 0.0005772049189545214, "learning_rate": 9.73548980348732e-08, "loss": 0.0, "num_input_tokens_seen": 221757136, "step": 102810 }, { "epoch": 18.868599743072124, "grad_norm": 8.853145845932886e-05, "learning_rate": 9.719771292842017e-08, "loss": 0.0, "num_input_tokens_seen": 221769552, "step": 102815 }, { "epoch": 18.869517342631674, "grad_norm": 0.0001716455299174413, "learning_rate": 9.704065357124648e-08, "loss": 0.0, "num_input_tokens_seen": 221779376, "step": 102820 }, { "epoch": 18.870434942191228, "grad_norm": 0.00011291584087302908, "learning_rate": 9.688371996738166e-08, "loss": 0.0, "num_input_tokens_seen": 221789584, "step": 102825 }, { "epoch": 18.87135254175078, "grad_norm": 4.5965334720676765e-05, "learning_rate": 9.672691212085028e-08, "loss": 0.0, "num_input_tokens_seen": 221800304, "step": 102830 }, { "epoch": 18.87227014131033, "grad_norm": 0.00035554522764869034, "learning_rate": 9.657023003567411e-08, "loss": 0.0, "num_input_tokens_seen": 221811888, "step": 102835 }, { "epoch": 18.873187740869884, "grad_norm": 5.674310523318127e-05, "learning_rate": 9.641367371587163e-08, "loss": 0.0, "num_input_tokens_seen": 221822448, "step": 102840 }, { "epoch": 18.874105340429438, "grad_norm": 7.868700049584731e-05, "learning_rate": 9.625724316545904e-08, "loss": 0.0, "num_input_tokens_seen": 221832080, "step": 102845 }, { "epoch": 18.875022939988988, "grad_norm": 0.0001693844242254272, "learning_rate": 9.610093838844814e-08, "loss": 0.0, "num_input_tokens_seen": 221843472, "step": 102850 }, { "epoch": 18.87594053954854, "grad_norm": 5.5791184422560036e-05, "learning_rate": 9.594475938884739e-08, "loss": 0.0, "num_input_tokens_seen": 221853712, "step": 102855 }, { "epoch": 18.876858139108094, "grad_norm": 4.207495294394903e-05, "learning_rate": 9.57887061706636e-08, "loss": 0.0, "num_input_tokens_seen": 221864880, "step": 102860 }, { "epoch": 18.877775738667644, "grad_norm": 4.724830068880692e-05, "learning_rate": 9.563277873789745e-08, "loss": 0.0, "num_input_tokens_seen": 221877200, "step": 102865 }, { "epoch": 18.878693338227198, "grad_norm": 0.00017906606080941856, "learning_rate": 9.547697709455073e-08, "loss": 0.0, "num_input_tokens_seen": 221886800, "step": 102870 }, { "epoch": 18.87961093778675, "grad_norm": 0.0015732505125924945, "learning_rate": 9.532130124461747e-08, "loss": 0.0, "num_input_tokens_seen": 221898224, "step": 102875 }, { "epoch": 18.8805285373463, "grad_norm": 4.1256706026615575e-05, "learning_rate": 9.516575119209171e-08, "loss": 0.0, "num_input_tokens_seen": 221910640, "step": 102880 }, { "epoch": 18.881446136905854, "grad_norm": 9.581432095728815e-05, "learning_rate": 9.50103269409619e-08, "loss": 0.0, "num_input_tokens_seen": 221921872, "step": 102885 }, { "epoch": 18.882363736465408, "grad_norm": 0.00012737957877106965, "learning_rate": 9.485502849521599e-08, "loss": 0.0, "num_input_tokens_seen": 221932240, "step": 102890 }, { "epoch": 18.883281336024957, "grad_norm": 0.00012038958811899647, "learning_rate": 9.469985585883579e-08, "loss": 0.0, "num_input_tokens_seen": 221942512, "step": 102895 }, { "epoch": 18.88419893558451, "grad_norm": 0.0002035833749687299, "learning_rate": 9.454480903580143e-08, "loss": 0.0, "num_input_tokens_seen": 221953456, "step": 102900 }, { "epoch": 18.885116535144064, "grad_norm": 0.00485489284619689, "learning_rate": 9.438988803009086e-08, "loss": 0.0, "num_input_tokens_seen": 221964176, "step": 102905 }, { "epoch": 18.886034134703614, "grad_norm": 0.00013557085185311735, "learning_rate": 9.423509284567645e-08, "loss": 0.0, "num_input_tokens_seen": 221975984, "step": 102910 }, { "epoch": 18.886951734263167, "grad_norm": 0.0006492721731774509, "learning_rate": 9.408042348652835e-08, "loss": 0.0, "num_input_tokens_seen": 221985840, "step": 102915 }, { "epoch": 18.88786933382272, "grad_norm": 5.203695400268771e-05, "learning_rate": 9.392587995661396e-08, "loss": 0.0, "num_input_tokens_seen": 221994128, "step": 102920 }, { "epoch": 18.88878693338227, "grad_norm": 4.7340512537630275e-05, "learning_rate": 9.377146225989676e-08, "loss": 0.0, "num_input_tokens_seen": 222005424, "step": 102925 }, { "epoch": 18.889704532941824, "grad_norm": 8.169724605977535e-05, "learning_rate": 9.361717040033802e-08, "loss": 0.0, "num_input_tokens_seen": 222017072, "step": 102930 }, { "epoch": 18.890622132501377, "grad_norm": 0.013645310886204243, "learning_rate": 9.34630043818946e-08, "loss": 0.0, "num_input_tokens_seen": 222027024, "step": 102935 }, { "epoch": 18.891539732060927, "grad_norm": 0.00010398677113698795, "learning_rate": 9.330896420852054e-08, "loss": 0.0, "num_input_tokens_seen": 222038480, "step": 102940 }, { "epoch": 18.89245733162048, "grad_norm": 0.00015463567979168147, "learning_rate": 9.315504988416713e-08, "loss": 0.0, "num_input_tokens_seen": 222049392, "step": 102945 }, { "epoch": 18.893374931180034, "grad_norm": 0.00011467710282886401, "learning_rate": 9.300126141278177e-08, "loss": 0.0, "num_input_tokens_seen": 222059984, "step": 102950 }, { "epoch": 18.894292530739584, "grad_norm": 8.294336294056848e-05, "learning_rate": 9.284759879830796e-08, "loss": 0.0, "num_input_tokens_seen": 222071984, "step": 102955 }, { "epoch": 18.895210130299137, "grad_norm": 0.00015192093269433826, "learning_rate": 9.269406204468867e-08, "loss": 0.0, "num_input_tokens_seen": 222083408, "step": 102960 }, { "epoch": 18.89612772985869, "grad_norm": 7.749677752144635e-05, "learning_rate": 9.25406511558613e-08, "loss": 0.0, "num_input_tokens_seen": 222093328, "step": 102965 }, { "epoch": 18.89704532941824, "grad_norm": 7.427884702337906e-05, "learning_rate": 9.238736613575994e-08, "loss": 0.0, "num_input_tokens_seen": 222104624, "step": 102970 }, { "epoch": 18.897962928977794, "grad_norm": 0.6607380509376526, "learning_rate": 9.223420698831642e-08, "loss": 0.0004, "num_input_tokens_seen": 222116336, "step": 102975 }, { "epoch": 18.898880528537347, "grad_norm": 0.0010511388536542654, "learning_rate": 9.208117371745928e-08, "loss": 0.0, "num_input_tokens_seen": 222125232, "step": 102980 }, { "epoch": 18.899798128096897, "grad_norm": 0.0004612373886629939, "learning_rate": 9.192826632711315e-08, "loss": 0.0, "num_input_tokens_seen": 222135024, "step": 102985 }, { "epoch": 18.90071572765645, "grad_norm": 8.28008123789914e-05, "learning_rate": 9.177548482120102e-08, "loss": 0.0, "num_input_tokens_seen": 222144432, "step": 102990 }, { "epoch": 18.901633327216004, "grad_norm": 0.00012101029278710485, "learning_rate": 9.16228292036403e-08, "loss": 0.0, "num_input_tokens_seen": 222155184, "step": 102995 }, { "epoch": 18.902550926775554, "grad_norm": 0.0002713511639740318, "learning_rate": 9.147029947834618e-08, "loss": 0.0, "num_input_tokens_seen": 222167536, "step": 103000 }, { "epoch": 18.903468526335107, "grad_norm": 5.31248479092028e-05, "learning_rate": 9.131789564923166e-08, "loss": 0.0, "num_input_tokens_seen": 222178032, "step": 103005 }, { "epoch": 18.90438612589466, "grad_norm": 8.298781176563352e-05, "learning_rate": 9.116561772020527e-08, "loss": 0.0, "num_input_tokens_seen": 222189936, "step": 103010 }, { "epoch": 18.90530372545421, "grad_norm": 0.00016003847122192383, "learning_rate": 9.101346569517334e-08, "loss": 0.0, "num_input_tokens_seen": 222199440, "step": 103015 }, { "epoch": 18.906221325013764, "grad_norm": 9.084679913939908e-05, "learning_rate": 9.086143957803717e-08, "loss": 0.0, "num_input_tokens_seen": 222209488, "step": 103020 }, { "epoch": 18.907138924573317, "grad_norm": 0.0026810888666659594, "learning_rate": 9.070953937269645e-08, "loss": 0.0, "num_input_tokens_seen": 222220016, "step": 103025 }, { "epoch": 18.908056524132867, "grad_norm": 4.8446283472003415e-05, "learning_rate": 9.055776508304804e-08, "loss": 0.0, "num_input_tokens_seen": 222231312, "step": 103030 }, { "epoch": 18.90897412369242, "grad_norm": 0.0021128349471837282, "learning_rate": 9.040611671298327e-08, "loss": 0.0, "num_input_tokens_seen": 222241968, "step": 103035 }, { "epoch": 18.909891723251974, "grad_norm": 0.004899407736957073, "learning_rate": 9.025459426639294e-08, "loss": 0.0, "num_input_tokens_seen": 222253424, "step": 103040 }, { "epoch": 18.910809322811524, "grad_norm": 0.0008615981787443161, "learning_rate": 9.010319774716281e-08, "loss": 0.0, "num_input_tokens_seen": 222264720, "step": 103045 }, { "epoch": 18.911726922371077, "grad_norm": 0.000112360947241541, "learning_rate": 8.995192715917588e-08, "loss": 0.0, "num_input_tokens_seen": 222275504, "step": 103050 }, { "epoch": 18.91264452193063, "grad_norm": 0.000106282634078525, "learning_rate": 8.980078250631241e-08, "loss": 0.0, "num_input_tokens_seen": 222286544, "step": 103055 }, { "epoch": 18.91356212149018, "grad_norm": 0.0006887006456963718, "learning_rate": 8.964976379244816e-08, "loss": 0.0, "num_input_tokens_seen": 222295248, "step": 103060 }, { "epoch": 18.914479721049734, "grad_norm": 0.002057145582512021, "learning_rate": 8.949887102145783e-08, "loss": 0.0, "num_input_tokens_seen": 222306864, "step": 103065 }, { "epoch": 18.915397320609287, "grad_norm": 0.00013090085121802986, "learning_rate": 8.934810419721052e-08, "loss": 0.0, "num_input_tokens_seen": 222318128, "step": 103070 }, { "epoch": 18.916314920168837, "grad_norm": 8.980504935607314e-05, "learning_rate": 8.91974633235737e-08, "loss": 0.0, "num_input_tokens_seen": 222329200, "step": 103075 }, { "epoch": 18.91723251972839, "grad_norm": 0.0004311940283514559, "learning_rate": 8.904694840441041e-08, "loss": 0.0, "num_input_tokens_seen": 222339344, "step": 103080 }, { "epoch": 18.918150119287944, "grad_norm": 0.0004791893297806382, "learning_rate": 8.8896559443582e-08, "loss": 0.0, "num_input_tokens_seen": 222350800, "step": 103085 }, { "epoch": 18.919067718847494, "grad_norm": 0.00010027178359450772, "learning_rate": 8.874629644494481e-08, "loss": 0.0, "num_input_tokens_seen": 222361648, "step": 103090 }, { "epoch": 18.919985318407047, "grad_norm": 0.025323882699012756, "learning_rate": 8.85961594123541e-08, "loss": 0.0, "num_input_tokens_seen": 222370480, "step": 103095 }, { "epoch": 18.9209029179666, "grad_norm": 8.648099901620299e-05, "learning_rate": 8.844614834965959e-08, "loss": 0.0, "num_input_tokens_seen": 222380144, "step": 103100 }, { "epoch": 18.92182051752615, "grad_norm": 0.0001516462943982333, "learning_rate": 8.829626326070872e-08, "loss": 0.0, "num_input_tokens_seen": 222391472, "step": 103105 }, { "epoch": 18.922738117085704, "grad_norm": 7.737778651062399e-05, "learning_rate": 8.814650414934677e-08, "loss": 0.0, "num_input_tokens_seen": 222402064, "step": 103110 }, { "epoch": 18.923655716645257, "grad_norm": 9.288352885050699e-05, "learning_rate": 8.799687101941456e-08, "loss": 0.0, "num_input_tokens_seen": 222413488, "step": 103115 }, { "epoch": 18.924573316204807, "grad_norm": 0.00022753624944016337, "learning_rate": 8.784736387474902e-08, "loss": 0.0, "num_input_tokens_seen": 222422992, "step": 103120 }, { "epoch": 18.92549091576436, "grad_norm": 0.00028597633354365826, "learning_rate": 8.769798271918595e-08, "loss": 0.0, "num_input_tokens_seen": 222432784, "step": 103125 }, { "epoch": 18.926408515323914, "grad_norm": 0.001737375045195222, "learning_rate": 8.75487275565562e-08, "loss": 0.0, "num_input_tokens_seen": 222444400, "step": 103130 }, { "epoch": 18.927326114883464, "grad_norm": 0.0036447036545723677, "learning_rate": 8.739959839068779e-08, "loss": 0.0, "num_input_tokens_seen": 222456144, "step": 103135 }, { "epoch": 18.928243714443017, "grad_norm": 7.700309652136639e-05, "learning_rate": 8.725059522540546e-08, "loss": 0.0, "num_input_tokens_seen": 222467504, "step": 103140 }, { "epoch": 18.92916131400257, "grad_norm": 5.4482654377352446e-05, "learning_rate": 8.710171806453171e-08, "loss": 0.0, "num_input_tokens_seen": 222477712, "step": 103145 }, { "epoch": 18.93007891356212, "grad_norm": 0.00010133312025573105, "learning_rate": 8.695296691188514e-08, "loss": 0.0, "num_input_tokens_seen": 222488304, "step": 103150 }, { "epoch": 18.930996513121674, "grad_norm": 6.868797208881006e-05, "learning_rate": 8.680434177127938e-08, "loss": 0.0002, "num_input_tokens_seen": 222499728, "step": 103155 }, { "epoch": 18.931914112681227, "grad_norm": 6.0885755374329165e-05, "learning_rate": 8.665584264652805e-08, "loss": 0.0, "num_input_tokens_seen": 222510800, "step": 103160 }, { "epoch": 18.932831712240777, "grad_norm": 5.745806629420258e-05, "learning_rate": 8.650746954143919e-08, "loss": 0.0, "num_input_tokens_seen": 222519536, "step": 103165 }, { "epoch": 18.93374931180033, "grad_norm": 0.002643370535224676, "learning_rate": 8.635922245981865e-08, "loss": 0.0, "num_input_tokens_seen": 222529808, "step": 103170 }, { "epoch": 18.934666911359884, "grad_norm": 0.00019812840037047863, "learning_rate": 8.62111014054684e-08, "loss": 0.0002, "num_input_tokens_seen": 222540144, "step": 103175 }, { "epoch": 18.935584510919433, "grad_norm": 0.00011748390534194186, "learning_rate": 8.606310638218818e-08, "loss": 0.0, "num_input_tokens_seen": 222550704, "step": 103180 }, { "epoch": 18.936502110478987, "grad_norm": 0.000513751816470176, "learning_rate": 8.591523739377328e-08, "loss": 0.0, "num_input_tokens_seen": 222561424, "step": 103185 }, { "epoch": 18.93741971003854, "grad_norm": 6.846670294180512e-05, "learning_rate": 8.576749444401566e-08, "loss": 0.0, "num_input_tokens_seen": 222572112, "step": 103190 }, { "epoch": 18.93833730959809, "grad_norm": 3.574324000510387e-05, "learning_rate": 8.56198775367062e-08, "loss": 0.0, "num_input_tokens_seen": 222581744, "step": 103195 }, { "epoch": 18.939254909157643, "grad_norm": 0.00012103199696866795, "learning_rate": 8.547238667563018e-08, "loss": 0.0, "num_input_tokens_seen": 222592784, "step": 103200 }, { "epoch": 18.940172508717197, "grad_norm": 4.445877129910514e-05, "learning_rate": 8.532502186457014e-08, "loss": 0.0, "num_input_tokens_seen": 222603344, "step": 103205 }, { "epoch": 18.941090108276747, "grad_norm": 0.00011417722998885438, "learning_rate": 8.517778310730696e-08, "loss": 0.0, "num_input_tokens_seen": 222615312, "step": 103210 }, { "epoch": 18.9420077078363, "grad_norm": 4.119443838135339e-05, "learning_rate": 8.503067040761593e-08, "loss": 0.0, "num_input_tokens_seen": 222627120, "step": 103215 }, { "epoch": 18.942925307395853, "grad_norm": 8.768808766035363e-05, "learning_rate": 8.48836837692707e-08, "loss": 0.0, "num_input_tokens_seen": 222638320, "step": 103220 }, { "epoch": 18.943842906955403, "grad_norm": 0.00010623879643389955, "learning_rate": 8.473682319604104e-08, "loss": 0.0, "num_input_tokens_seen": 222648720, "step": 103225 }, { "epoch": 18.944760506514957, "grad_norm": 0.00013904976367484778, "learning_rate": 8.45900886916945e-08, "loss": 0.0, "num_input_tokens_seen": 222658864, "step": 103230 }, { "epoch": 18.94567810607451, "grad_norm": 0.00014467921573668718, "learning_rate": 8.44434802599936e-08, "loss": 0.0, "num_input_tokens_seen": 222670416, "step": 103235 }, { "epoch": 18.94659570563406, "grad_norm": 7.084834214765579e-05, "learning_rate": 8.429699790469869e-08, "loss": 0.0, "num_input_tokens_seen": 222680912, "step": 103240 }, { "epoch": 18.947513305193613, "grad_norm": 0.014189896173775196, "learning_rate": 8.415064162956787e-08, "loss": 0.0, "num_input_tokens_seen": 222692240, "step": 103245 }, { "epoch": 18.948430904753167, "grad_norm": 0.00602451479062438, "learning_rate": 8.40044114383537e-08, "loss": 0.0, "num_input_tokens_seen": 222701488, "step": 103250 }, { "epoch": 18.949348504312717, "grad_norm": 0.0003275301423855126, "learning_rate": 8.38583073348076e-08, "loss": 0.0, "num_input_tokens_seen": 222712720, "step": 103255 }, { "epoch": 18.95026610387227, "grad_norm": 7.894611917436123e-05, "learning_rate": 8.371232932267603e-08, "loss": 0.0, "num_input_tokens_seen": 222723600, "step": 103260 }, { "epoch": 18.951183703431823, "grad_norm": 0.00012835134111810476, "learning_rate": 8.356647740570434e-08, "loss": 0.0, "num_input_tokens_seen": 222733488, "step": 103265 }, { "epoch": 18.952101302991373, "grad_norm": 0.0002301429776707664, "learning_rate": 8.34207515876323e-08, "loss": 0.0, "num_input_tokens_seen": 222744624, "step": 103270 }, { "epoch": 18.953018902550927, "grad_norm": 0.0007754253456369042, "learning_rate": 8.327515187219859e-08, "loss": 0.0, "num_input_tokens_seen": 222755600, "step": 103275 }, { "epoch": 18.95393650211048, "grad_norm": 0.00018623648793436587, "learning_rate": 8.312967826313633e-08, "loss": 0.0, "num_input_tokens_seen": 222766480, "step": 103280 }, { "epoch": 18.95485410167003, "grad_norm": 0.00016528950072824955, "learning_rate": 8.298433076417755e-08, "loss": 0.0, "num_input_tokens_seen": 222776304, "step": 103285 }, { "epoch": 18.955771701229583, "grad_norm": 5.618434079224244e-05, "learning_rate": 8.283910937904981e-08, "loss": 0.0, "num_input_tokens_seen": 222788016, "step": 103290 }, { "epoch": 18.956689300789137, "grad_norm": 0.012891157530248165, "learning_rate": 8.269401411147848e-08, "loss": 0.0, "num_input_tokens_seen": 222798576, "step": 103295 }, { "epoch": 18.957606900348686, "grad_norm": 0.0002613214892335236, "learning_rate": 8.254904496518446e-08, "loss": 0.0, "num_input_tokens_seen": 222809360, "step": 103300 }, { "epoch": 18.95852449990824, "grad_norm": 0.0003763777785934508, "learning_rate": 8.240420194388532e-08, "loss": 0.0, "num_input_tokens_seen": 222820592, "step": 103305 }, { "epoch": 18.959442099467793, "grad_norm": 0.00016742938896641135, "learning_rate": 8.225948505129755e-08, "loss": 0.0, "num_input_tokens_seen": 222831632, "step": 103310 }, { "epoch": 18.960359699027343, "grad_norm": 0.0017651129746809602, "learning_rate": 8.211489429113206e-08, "loss": 0.0, "num_input_tokens_seen": 222844368, "step": 103315 }, { "epoch": 18.961277298586896, "grad_norm": 6.001633300911635e-05, "learning_rate": 8.197042966709756e-08, "loss": 0.0, "num_input_tokens_seen": 222855376, "step": 103320 }, { "epoch": 18.96219489814645, "grad_norm": 7.551857561338693e-05, "learning_rate": 8.182609118289886e-08, "loss": 0.0, "num_input_tokens_seen": 222867696, "step": 103325 }, { "epoch": 18.963112497706, "grad_norm": 0.0018094175029546022, "learning_rate": 8.168187884223911e-08, "loss": 0.0, "num_input_tokens_seen": 222878224, "step": 103330 }, { "epoch": 18.964030097265553, "grad_norm": 6.762252451153472e-05, "learning_rate": 8.15377926488159e-08, "loss": 0.0, "num_input_tokens_seen": 222888816, "step": 103335 }, { "epoch": 18.964947696825107, "grad_norm": 0.0001971715537365526, "learning_rate": 8.139383260632571e-08, "loss": 0.0, "num_input_tokens_seen": 222900144, "step": 103340 }, { "epoch": 18.965865296384656, "grad_norm": 0.00026287022046744823, "learning_rate": 8.124999871846062e-08, "loss": 0.0, "num_input_tokens_seen": 222911152, "step": 103345 }, { "epoch": 18.96678289594421, "grad_norm": 8.279189933091402e-05, "learning_rate": 8.110629098890932e-08, "loss": 0.0, "num_input_tokens_seen": 222921424, "step": 103350 }, { "epoch": 18.967700495503763, "grad_norm": 5.7258421293227e-05, "learning_rate": 8.096270942135776e-08, "loss": 0.0, "num_input_tokens_seen": 222932080, "step": 103355 }, { "epoch": 18.968618095063313, "grad_norm": 0.0009298687218688428, "learning_rate": 8.081925401948964e-08, "loss": 0.0, "num_input_tokens_seen": 222942928, "step": 103360 }, { "epoch": 18.969535694622866, "grad_norm": 7.207965973066166e-05, "learning_rate": 8.067592478698371e-08, "loss": 0.0, "num_input_tokens_seen": 222952848, "step": 103365 }, { "epoch": 18.97045329418242, "grad_norm": 0.00015746167628094554, "learning_rate": 8.053272172751591e-08, "loss": 0.0, "num_input_tokens_seen": 222963664, "step": 103370 }, { "epoch": 18.97137089374197, "grad_norm": 0.00018280668882653117, "learning_rate": 8.038964484475886e-08, "loss": 0.0, "num_input_tokens_seen": 222974832, "step": 103375 }, { "epoch": 18.972288493301523, "grad_norm": 0.00022961974900681525, "learning_rate": 8.024669414238295e-08, "loss": 0.0, "num_input_tokens_seen": 222984816, "step": 103380 }, { "epoch": 18.973206092861076, "grad_norm": 8.405819971812889e-05, "learning_rate": 8.010386962405415e-08, "loss": 0.0001, "num_input_tokens_seen": 222996016, "step": 103385 }, { "epoch": 18.974123692420626, "grad_norm": 5.957404209766537e-05, "learning_rate": 7.996117129343616e-08, "loss": 0.0, "num_input_tokens_seen": 223005904, "step": 103390 }, { "epoch": 18.97504129198018, "grad_norm": 6.169827247504145e-05, "learning_rate": 7.981859915418888e-08, "loss": 0.0, "num_input_tokens_seen": 223017232, "step": 103395 }, { "epoch": 18.975958891539733, "grad_norm": 0.0008609246578998864, "learning_rate": 7.96761532099688e-08, "loss": 0.0, "num_input_tokens_seen": 223027792, "step": 103400 }, { "epoch": 18.976876491099283, "grad_norm": 0.002385898260399699, "learning_rate": 7.953383346443022e-08, "loss": 0.0, "num_input_tokens_seen": 223039376, "step": 103405 }, { "epoch": 18.977794090658836, "grad_norm": 6.871274672448635e-05, "learning_rate": 7.939163992122189e-08, "loss": 0.0, "num_input_tokens_seen": 223048656, "step": 103410 }, { "epoch": 18.97871169021839, "grad_norm": 0.0001334058033535257, "learning_rate": 7.924957258399202e-08, "loss": 0.0, "num_input_tokens_seen": 223059216, "step": 103415 }, { "epoch": 18.97962928977794, "grad_norm": 0.0008472769986838102, "learning_rate": 7.910763145638434e-08, "loss": 0.0, "num_input_tokens_seen": 223070160, "step": 103420 }, { "epoch": 18.980546889337493, "grad_norm": 0.0002443295088596642, "learning_rate": 7.896581654203872e-08, "loss": 0.0, "num_input_tokens_seen": 223080464, "step": 103425 }, { "epoch": 18.981464488897046, "grad_norm": 5.0615046347957104e-05, "learning_rate": 7.882412784459336e-08, "loss": 0.0, "num_input_tokens_seen": 223092272, "step": 103430 }, { "epoch": 18.982382088456596, "grad_norm": 0.00012428256741259247, "learning_rate": 7.868256536768203e-08, "loss": 0.0, "num_input_tokens_seen": 223102384, "step": 103435 }, { "epoch": 18.98329968801615, "grad_norm": 0.0002942747378256172, "learning_rate": 7.854112911493516e-08, "loss": 0.0, "num_input_tokens_seen": 223114416, "step": 103440 }, { "epoch": 18.984217287575703, "grad_norm": 0.00017434328037779778, "learning_rate": 7.839981908998151e-08, "loss": 0.0, "num_input_tokens_seen": 223125328, "step": 103445 }, { "epoch": 18.985134887135253, "grad_norm": 7.977423956617713e-05, "learning_rate": 7.825863529644429e-08, "loss": 0.0, "num_input_tokens_seen": 223136944, "step": 103450 }, { "epoch": 18.986052486694806, "grad_norm": 0.0013258906546980143, "learning_rate": 7.811757773794504e-08, "loss": 0.0, "num_input_tokens_seen": 223147312, "step": 103455 }, { "epoch": 18.98697008625436, "grad_norm": 0.0013442097697407007, "learning_rate": 7.797664641810143e-08, "loss": 0.0, "num_input_tokens_seen": 223158160, "step": 103460 }, { "epoch": 18.98788768581391, "grad_norm": 9.640344796935096e-05, "learning_rate": 7.783584134052891e-08, "loss": 0.0, "num_input_tokens_seen": 223169392, "step": 103465 }, { "epoch": 18.988805285373463, "grad_norm": 5.1112594519509e-05, "learning_rate": 7.769516250883846e-08, "loss": 0.0, "num_input_tokens_seen": 223181008, "step": 103470 }, { "epoch": 18.989722884933016, "grad_norm": 8.061002881731838e-05, "learning_rate": 7.755460992663722e-08, "loss": 0.0, "num_input_tokens_seen": 223191504, "step": 103475 }, { "epoch": 18.990640484492566, "grad_norm": 9.712482278700918e-05, "learning_rate": 7.741418359753228e-08, "loss": 0.0, "num_input_tokens_seen": 223202960, "step": 103480 }, { "epoch": 18.99155808405212, "grad_norm": 0.00014681759057566524, "learning_rate": 7.727388352512355e-08, "loss": 0.0, "num_input_tokens_seen": 223213456, "step": 103485 }, { "epoch": 18.992475683611673, "grad_norm": 4.017214087070897e-05, "learning_rate": 7.713370971301093e-08, "loss": 0.0, "num_input_tokens_seen": 223223184, "step": 103490 }, { "epoch": 18.993393283171223, "grad_norm": 3.114025457762182e-05, "learning_rate": 7.699366216478821e-08, "loss": 0.0, "num_input_tokens_seen": 223233328, "step": 103495 }, { "epoch": 18.994310882730776, "grad_norm": 0.0007343744509853423, "learning_rate": 7.685374088404807e-08, "loss": 0.0, "num_input_tokens_seen": 223244592, "step": 103500 }, { "epoch": 18.99522848229033, "grad_norm": 0.001151786302216351, "learning_rate": 7.671394587437931e-08, "loss": 0.0, "num_input_tokens_seen": 223254352, "step": 103505 }, { "epoch": 18.99614608184988, "grad_norm": 0.00014667473442386836, "learning_rate": 7.657427713936794e-08, "loss": 0.0, "num_input_tokens_seen": 223265040, "step": 103510 }, { "epoch": 18.997063681409433, "grad_norm": 4.28608400397934e-05, "learning_rate": 7.643473468259554e-08, "loss": 0.0, "num_input_tokens_seen": 223275344, "step": 103515 }, { "epoch": 18.997981280968986, "grad_norm": 0.0004109037108719349, "learning_rate": 7.62953185076415e-08, "loss": 0.0, "num_input_tokens_seen": 223287440, "step": 103520 }, { "epoch": 18.998898880528536, "grad_norm": 0.0035238454584032297, "learning_rate": 7.615602861808069e-08, "loss": 0.0, "num_input_tokens_seen": 223299248, "step": 103525 }, { "epoch": 18.99981648008809, "grad_norm": 9.142232738668099e-05, "learning_rate": 7.601686501748695e-08, "loss": 0.0, "num_input_tokens_seen": 223310032, "step": 103530 }, { "epoch": 19.000734079647643, "grad_norm": 8.079322287812829e-05, "learning_rate": 7.587782770942965e-08, "loss": 0.0, "num_input_tokens_seen": 223320816, "step": 103535 }, { "epoch": 19.001651679207193, "grad_norm": 8.858242654241621e-05, "learning_rate": 7.573891669747369e-08, "loss": 0.0, "num_input_tokens_seen": 223331280, "step": 103540 }, { "epoch": 19.002569278766746, "grad_norm": 6.12023432040587e-05, "learning_rate": 7.56001319851829e-08, "loss": 0.0, "num_input_tokens_seen": 223342672, "step": 103545 }, { "epoch": 19.0034868783263, "grad_norm": 0.00011006428394466639, "learning_rate": 7.546147357611666e-08, "loss": 0.0, "num_input_tokens_seen": 223353872, "step": 103550 }, { "epoch": 19.00440447788585, "grad_norm": 0.00013455832959152758, "learning_rate": 7.532294147383101e-08, "loss": 0.0, "num_input_tokens_seen": 223365616, "step": 103555 }, { "epoch": 19.005322077445403, "grad_norm": 8.06097814347595e-05, "learning_rate": 7.518453568187922e-08, "loss": 0.0, "num_input_tokens_seen": 223376048, "step": 103560 }, { "epoch": 19.006239677004956, "grad_norm": 0.00012204933591419831, "learning_rate": 7.504625620381178e-08, "loss": 0.0, "num_input_tokens_seen": 223384976, "step": 103565 }, { "epoch": 19.007157276564506, "grad_norm": 2.0600028038024902, "learning_rate": 7.490810304317475e-08, "loss": 0.0057, "num_input_tokens_seen": 223395504, "step": 103570 }, { "epoch": 19.00807487612406, "grad_norm": 8.799249917501584e-05, "learning_rate": 7.47700762035114e-08, "loss": 0.0, "num_input_tokens_seen": 223406448, "step": 103575 }, { "epoch": 19.008992475683613, "grad_norm": 6.803705036872998e-05, "learning_rate": 7.463217568836222e-08, "loss": 0.0, "num_input_tokens_seen": 223417520, "step": 103580 }, { "epoch": 19.009910075243162, "grad_norm": 4.8760397476144135e-05, "learning_rate": 7.449440150126441e-08, "loss": 0.0, "num_input_tokens_seen": 223429200, "step": 103585 }, { "epoch": 19.010827674802716, "grad_norm": 8.99481019587256e-05, "learning_rate": 7.435675364575124e-08, "loss": 0.0, "num_input_tokens_seen": 223440464, "step": 103590 }, { "epoch": 19.01174527436227, "grad_norm": 0.19992059469223022, "learning_rate": 7.42192321253532e-08, "loss": 0.0001, "num_input_tokens_seen": 223450992, "step": 103595 }, { "epoch": 19.01266287392182, "grad_norm": 8.73578610480763e-05, "learning_rate": 7.40818369435975e-08, "loss": 0.0, "num_input_tokens_seen": 223462288, "step": 103600 }, { "epoch": 19.013580473481372, "grad_norm": 0.0011364545207470655, "learning_rate": 7.394456810400852e-08, "loss": 0.0, "num_input_tokens_seen": 223473264, "step": 103605 }, { "epoch": 19.014498073040926, "grad_norm": 0.0002462373813614249, "learning_rate": 7.380742561010623e-08, "loss": 0.0, "num_input_tokens_seen": 223484528, "step": 103610 }, { "epoch": 19.015415672600476, "grad_norm": 0.00011633601388894022, "learning_rate": 7.367040946540894e-08, "loss": 0.0, "num_input_tokens_seen": 223495056, "step": 103615 }, { "epoch": 19.01633327216003, "grad_norm": 0.0003977287851739675, "learning_rate": 7.353351967343048e-08, "loss": 0.0, "num_input_tokens_seen": 223505744, "step": 103620 }, { "epoch": 19.017250871719583, "grad_norm": 0.00010117802594322711, "learning_rate": 7.339675623768194e-08, "loss": 0.0, "num_input_tokens_seen": 223517104, "step": 103625 }, { "epoch": 19.018168471279132, "grad_norm": 5.445088027045131e-05, "learning_rate": 7.326011916167108e-08, "loss": 0.0, "num_input_tokens_seen": 223527568, "step": 103630 }, { "epoch": 19.019086070838686, "grad_norm": 4.7791774704819545e-05, "learning_rate": 7.312360844890232e-08, "loss": 0.0, "num_input_tokens_seen": 223537232, "step": 103635 }, { "epoch": 19.02000367039824, "grad_norm": 6.91981113050133e-05, "learning_rate": 7.298722410287728e-08, "loss": 0.0, "num_input_tokens_seen": 223547248, "step": 103640 }, { "epoch": 19.02092126995779, "grad_norm": 0.0034907616209238768, "learning_rate": 7.28509661270932e-08, "loss": 0.0, "num_input_tokens_seen": 223556688, "step": 103645 }, { "epoch": 19.021838869517342, "grad_norm": 0.0007623977144248784, "learning_rate": 7.271483452504557e-08, "loss": 0.0, "num_input_tokens_seen": 223567152, "step": 103650 }, { "epoch": 19.022756469076896, "grad_norm": 0.0010772711830213666, "learning_rate": 7.257882930022608e-08, "loss": 0.0, "num_input_tokens_seen": 223577744, "step": 103655 }, { "epoch": 19.023674068636446, "grad_norm": 0.0003137778548989445, "learning_rate": 7.244295045612249e-08, "loss": 0.0, "num_input_tokens_seen": 223589808, "step": 103660 }, { "epoch": 19.024591668196, "grad_norm": 8.3079474279657e-05, "learning_rate": 7.230719799622087e-08, "loss": 0.0, "num_input_tokens_seen": 223600528, "step": 103665 }, { "epoch": 19.025509267755552, "grad_norm": 6.172440043883398e-05, "learning_rate": 7.217157192400181e-08, "loss": 0.0, "num_input_tokens_seen": 223611056, "step": 103670 }, { "epoch": 19.026426867315102, "grad_norm": 0.00014127869508229196, "learning_rate": 7.203607224294473e-08, "loss": 0.0, "num_input_tokens_seen": 223623152, "step": 103675 }, { "epoch": 19.027344466874656, "grad_norm": 0.00013472825230564922, "learning_rate": 7.190069895652463e-08, "loss": 0.0, "num_input_tokens_seen": 223633552, "step": 103680 }, { "epoch": 19.02826206643421, "grad_norm": 9.491875243838876e-05, "learning_rate": 7.176545206821373e-08, "loss": 0.0, "num_input_tokens_seen": 223642288, "step": 103685 }, { "epoch": 19.02917966599376, "grad_norm": 7.325942715397105e-05, "learning_rate": 7.163033158148147e-08, "loss": 0.0, "num_input_tokens_seen": 223653360, "step": 103690 }, { "epoch": 19.030097265553312, "grad_norm": 9.439504356123507e-05, "learning_rate": 7.149533749979176e-08, "loss": 0.0, "num_input_tokens_seen": 223663952, "step": 103695 }, { "epoch": 19.031014865112866, "grad_norm": 9.359046816825867e-05, "learning_rate": 7.136046982660904e-08, "loss": 0.0, "num_input_tokens_seen": 223675632, "step": 103700 }, { "epoch": 19.031932464672416, "grad_norm": 0.00043650317820720375, "learning_rate": 7.122572856539167e-08, "loss": 0.0, "num_input_tokens_seen": 223686160, "step": 103705 }, { "epoch": 19.03285006423197, "grad_norm": 7.645934238098562e-05, "learning_rate": 7.109111371959521e-08, "loss": 0.0, "num_input_tokens_seen": 223696688, "step": 103710 }, { "epoch": 19.033767663791522, "grad_norm": 9.104630589717999e-05, "learning_rate": 7.095662529267244e-08, "loss": 0.0, "num_input_tokens_seen": 223708080, "step": 103715 }, { "epoch": 19.034685263351072, "grad_norm": 7.971753075253218e-05, "learning_rate": 7.082226328807285e-08, "loss": 0.0, "num_input_tokens_seen": 223718640, "step": 103720 }, { "epoch": 19.035602862910626, "grad_norm": 0.00010737864795373753, "learning_rate": 7.068802770924255e-08, "loss": 0.0, "num_input_tokens_seen": 223729200, "step": 103725 }, { "epoch": 19.03652046247018, "grad_norm": 4.147215076955035e-05, "learning_rate": 7.05539185596249e-08, "loss": 0.0, "num_input_tokens_seen": 223739952, "step": 103730 }, { "epoch": 19.03743806202973, "grad_norm": 0.000669115805067122, "learning_rate": 7.041993584265938e-08, "loss": 0.0, "num_input_tokens_seen": 223751184, "step": 103735 }, { "epoch": 19.038355661589282, "grad_norm": 6.040386142558418e-05, "learning_rate": 7.028607956178268e-08, "loss": 0.0, "num_input_tokens_seen": 223761968, "step": 103740 }, { "epoch": 19.039273261148836, "grad_norm": 9.619263437343761e-05, "learning_rate": 7.015234972042651e-08, "loss": 0.0144, "num_input_tokens_seen": 223770640, "step": 103745 }, { "epoch": 19.040190860708385, "grad_norm": 4.915808676742017e-05, "learning_rate": 7.00187463220231e-08, "loss": 0.0, "num_input_tokens_seen": 223782064, "step": 103750 }, { "epoch": 19.04110846026794, "grad_norm": 0.0006616205791942775, "learning_rate": 6.988526936999751e-08, "loss": 0.0002, "num_input_tokens_seen": 223793232, "step": 103755 }, { "epoch": 19.042026059827492, "grad_norm": 0.00013415183639153838, "learning_rate": 6.975191886777366e-08, "loss": 0.0, "num_input_tokens_seen": 223803792, "step": 103760 }, { "epoch": 19.042943659387042, "grad_norm": 0.0001709965436020866, "learning_rate": 6.961869481877215e-08, "loss": 0.0, "num_input_tokens_seen": 223815440, "step": 103765 }, { "epoch": 19.043861258946595, "grad_norm": 5.476340083987452e-05, "learning_rate": 6.948559722641024e-08, "loss": 0.0, "num_input_tokens_seen": 223826992, "step": 103770 }, { "epoch": 19.04477885850615, "grad_norm": 0.00019205651187803596, "learning_rate": 6.935262609410076e-08, "loss": 0.0, "num_input_tokens_seen": 223838224, "step": 103775 }, { "epoch": 19.0456964580657, "grad_norm": 8.561044523958117e-05, "learning_rate": 6.921978142525376e-08, "loss": 0.0, "num_input_tokens_seen": 223849648, "step": 103780 }, { "epoch": 19.046614057625252, "grad_norm": 0.001211678609251976, "learning_rate": 6.908706322327818e-08, "loss": 0.0, "num_input_tokens_seen": 223860976, "step": 103785 }, { "epoch": 19.047531657184805, "grad_norm": 0.014142343774437904, "learning_rate": 6.895447149157741e-08, "loss": 0.0, "num_input_tokens_seen": 223872848, "step": 103790 }, { "epoch": 19.048449256744355, "grad_norm": 0.010675999335944653, "learning_rate": 6.882200623355151e-08, "loss": 0.0, "num_input_tokens_seen": 223882768, "step": 103795 }, { "epoch": 19.04936685630391, "grad_norm": 5.009966480429284e-05, "learning_rate": 6.868966745259886e-08, "loss": 0.0, "num_input_tokens_seen": 223894448, "step": 103800 }, { "epoch": 19.050284455863462, "grad_norm": 0.00012427005276549608, "learning_rate": 6.855745515211343e-08, "loss": 0.0, "num_input_tokens_seen": 223904208, "step": 103805 }, { "epoch": 19.051202055423012, "grad_norm": 5.2574745495803654e-05, "learning_rate": 6.842536933548583e-08, "loss": 0.0, "num_input_tokens_seen": 223914928, "step": 103810 }, { "epoch": 19.052119654982565, "grad_norm": 0.0001463451044401154, "learning_rate": 6.829341000610445e-08, "loss": 0.0, "num_input_tokens_seen": 223925680, "step": 103815 }, { "epoch": 19.05303725454212, "grad_norm": 0.00020105055591557175, "learning_rate": 6.816157716735383e-08, "loss": 0.0, "num_input_tokens_seen": 223936080, "step": 103820 }, { "epoch": 19.05395485410167, "grad_norm": 0.00010569637379376218, "learning_rate": 6.802987082261514e-08, "loss": 0.0, "num_input_tokens_seen": 223946704, "step": 103825 }, { "epoch": 19.054872453661222, "grad_norm": 0.0004345103516243398, "learning_rate": 6.789829097526569e-08, "loss": 0.0, "num_input_tokens_seen": 223956880, "step": 103830 }, { "epoch": 19.055790053220775, "grad_norm": 0.00028163217939436436, "learning_rate": 6.776683762868164e-08, "loss": 0.0, "num_input_tokens_seen": 223967344, "step": 103835 }, { "epoch": 19.056707652780325, "grad_norm": 9.844174928730354e-05, "learning_rate": 6.76355107862342e-08, "loss": 0.0, "num_input_tokens_seen": 223978160, "step": 103840 }, { "epoch": 19.05762525233988, "grad_norm": 8.90587834874168e-05, "learning_rate": 6.750431045129069e-08, "loss": 0.0, "num_input_tokens_seen": 223988720, "step": 103845 }, { "epoch": 19.058542851899432, "grad_norm": 4.6796321839792654e-05, "learning_rate": 6.737323662721728e-08, "loss": 0.0, "num_input_tokens_seen": 224000560, "step": 103850 }, { "epoch": 19.059460451458982, "grad_norm": 6.0381942603271455e-05, "learning_rate": 6.724228931737576e-08, "loss": 0.0, "num_input_tokens_seen": 224011120, "step": 103855 }, { "epoch": 19.060378051018535, "grad_norm": 0.0002922284184023738, "learning_rate": 6.711146852512395e-08, "loss": 0.0, "num_input_tokens_seen": 224021264, "step": 103860 }, { "epoch": 19.06129565057809, "grad_norm": 0.0002743445220403373, "learning_rate": 6.69807742538181e-08, "loss": 0.0, "num_input_tokens_seen": 224029712, "step": 103865 }, { "epoch": 19.06221325013764, "grad_norm": 0.00022937532048672438, "learning_rate": 6.68502065068094e-08, "loss": 0.0, "num_input_tokens_seen": 224039632, "step": 103870 }, { "epoch": 19.063130849697192, "grad_norm": 7.771601667627692e-05, "learning_rate": 6.671976528744795e-08, "loss": 0.0, "num_input_tokens_seen": 224049648, "step": 103875 }, { "epoch": 19.064048449256745, "grad_norm": 8.982184226624668e-05, "learning_rate": 6.658945059907773e-08, "loss": 0.0, "num_input_tokens_seen": 224060208, "step": 103880 }, { "epoch": 19.064966048816295, "grad_norm": 0.00012905418407171965, "learning_rate": 6.645926244504275e-08, "loss": 0.0, "num_input_tokens_seen": 224071760, "step": 103885 }, { "epoch": 19.06588364837585, "grad_norm": 7.282940350705758e-05, "learning_rate": 6.632920082868144e-08, "loss": 0.0, "num_input_tokens_seen": 224083216, "step": 103890 }, { "epoch": 19.066801247935402, "grad_norm": 6.446366023737937e-05, "learning_rate": 6.619926575332891e-08, "loss": 0.0, "num_input_tokens_seen": 224093744, "step": 103895 }, { "epoch": 19.06771884749495, "grad_norm": 0.0001515356998424977, "learning_rate": 6.606945722231916e-08, "loss": 0.0, "num_input_tokens_seen": 224103664, "step": 103900 }, { "epoch": 19.068636447054505, "grad_norm": 0.0009158870088867843, "learning_rate": 6.593977523898066e-08, "loss": 0.0, "num_input_tokens_seen": 224113712, "step": 103905 }, { "epoch": 19.06955404661406, "grad_norm": 7.131843449315056e-05, "learning_rate": 6.581021980664015e-08, "loss": 0.0, "num_input_tokens_seen": 224124944, "step": 103910 }, { "epoch": 19.07047164617361, "grad_norm": 6.0052956541767344e-05, "learning_rate": 6.568079092862e-08, "loss": 0.0, "num_input_tokens_seen": 224134000, "step": 103915 }, { "epoch": 19.07138924573316, "grad_norm": 0.002575626829639077, "learning_rate": 6.555148860823979e-08, "loss": 0.0, "num_input_tokens_seen": 224144624, "step": 103920 }, { "epoch": 19.072306845292715, "grad_norm": 0.0001677798863966018, "learning_rate": 6.542231284881628e-08, "loss": 0.0, "num_input_tokens_seen": 224156368, "step": 103925 }, { "epoch": 19.073224444852265, "grad_norm": 5.577411502599716e-05, "learning_rate": 6.529326365366295e-08, "loss": 0.0, "num_input_tokens_seen": 224166928, "step": 103930 }, { "epoch": 19.07414204441182, "grad_norm": 0.0006602579960599542, "learning_rate": 6.516434102608882e-08, "loss": 0.0, "num_input_tokens_seen": 224177328, "step": 103935 }, { "epoch": 19.07505964397137, "grad_norm": 0.000658995530102402, "learning_rate": 6.503554496940123e-08, "loss": 0.0, "num_input_tokens_seen": 224188112, "step": 103940 }, { "epoch": 19.07597724353092, "grad_norm": 0.0007399937021546066, "learning_rate": 6.490687548690366e-08, "loss": 0.0, "num_input_tokens_seen": 224199856, "step": 103945 }, { "epoch": 19.076894843090475, "grad_norm": 0.0006987724918872118, "learning_rate": 6.47783325818957e-08, "loss": 0.0, "num_input_tokens_seen": 224210448, "step": 103950 }, { "epoch": 19.07781244265003, "grad_norm": 0.0012737358920276165, "learning_rate": 6.464991625767469e-08, "loss": 0.0, "num_input_tokens_seen": 224221616, "step": 103955 }, { "epoch": 19.07873004220958, "grad_norm": 6.738750380463898e-05, "learning_rate": 6.452162651753413e-08, "loss": 0.0, "num_input_tokens_seen": 224233456, "step": 103960 }, { "epoch": 19.07964764176913, "grad_norm": 5.5203574447659776e-05, "learning_rate": 6.43934633647647e-08, "loss": 0.0, "num_input_tokens_seen": 224244432, "step": 103965 }, { "epoch": 19.080565241328685, "grad_norm": 9.068510553333908e-05, "learning_rate": 6.426542680265324e-08, "loss": 0.0, "num_input_tokens_seen": 224255696, "step": 103970 }, { "epoch": 19.081482840888235, "grad_norm": 5.974180021439679e-05, "learning_rate": 6.413751683448432e-08, "loss": 0.0, "num_input_tokens_seen": 224268304, "step": 103975 }, { "epoch": 19.08240044044779, "grad_norm": 0.0002031678450293839, "learning_rate": 6.400973346353756e-08, "loss": 0.0, "num_input_tokens_seen": 224279600, "step": 103980 }, { "epoch": 19.08331804000734, "grad_norm": 0.0006793444626964629, "learning_rate": 6.388207669309143e-08, "loss": 0.0, "num_input_tokens_seen": 224290608, "step": 103985 }, { "epoch": 19.08423563956689, "grad_norm": 0.00018810812616720796, "learning_rate": 6.375454652641999e-08, "loss": 0.0, "num_input_tokens_seen": 224301776, "step": 103990 }, { "epoch": 19.085153239126445, "grad_norm": 0.00012018286361126229, "learning_rate": 6.362714296679396e-08, "loss": 0.0, "num_input_tokens_seen": 224312848, "step": 103995 }, { "epoch": 19.086070838686, "grad_norm": 0.0017878072103485465, "learning_rate": 6.349986601748015e-08, "loss": 0.0, "num_input_tokens_seen": 224323472, "step": 104000 }, { "epoch": 19.086988438245548, "grad_norm": 0.00018932089733425528, "learning_rate": 6.337271568174485e-08, "loss": 0.0, "num_input_tokens_seen": 224333520, "step": 104005 }, { "epoch": 19.0879060378051, "grad_norm": 6.891418888699263e-05, "learning_rate": 6.324569196284768e-08, "loss": 0.0, "num_input_tokens_seen": 224344496, "step": 104010 }, { "epoch": 19.088823637364655, "grad_norm": 4.9461028538644314e-05, "learning_rate": 6.31187948640477e-08, "loss": 0.0, "num_input_tokens_seen": 224355248, "step": 104015 }, { "epoch": 19.089741236924205, "grad_norm": 0.00023075735953170806, "learning_rate": 6.299202438859898e-08, "loss": 0.0, "num_input_tokens_seen": 224366480, "step": 104020 }, { "epoch": 19.090658836483758, "grad_norm": 4.570373130263761e-05, "learning_rate": 6.286538053975333e-08, "loss": 0.0, "num_input_tokens_seen": 224377072, "step": 104025 }, { "epoch": 19.09157643604331, "grad_norm": 4.761695527122356e-05, "learning_rate": 6.273886332075818e-08, "loss": 0.0, "num_input_tokens_seen": 224387920, "step": 104030 }, { "epoch": 19.09249403560286, "grad_norm": 9.614535520086065e-05, "learning_rate": 6.261247273485981e-08, "loss": 0.0, "num_input_tokens_seen": 224398640, "step": 104035 }, { "epoch": 19.093411635162415, "grad_norm": 0.00020589142513927072, "learning_rate": 6.248620878529898e-08, "loss": 0.0, "num_input_tokens_seen": 224408688, "step": 104040 }, { "epoch": 19.094329234721968, "grad_norm": 0.001333202701061964, "learning_rate": 6.236007147531475e-08, "loss": 0.0, "num_input_tokens_seen": 224419920, "step": 104045 }, { "epoch": 19.095246834281518, "grad_norm": 0.0003069186059292406, "learning_rate": 6.223406080814121e-08, "loss": 0.0, "num_input_tokens_seen": 224430224, "step": 104050 }, { "epoch": 19.09616443384107, "grad_norm": 7.805498171364889e-05, "learning_rate": 6.210817678701187e-08, "loss": 0.0, "num_input_tokens_seen": 224440048, "step": 104055 }, { "epoch": 19.097082033400625, "grad_norm": 0.00021567889780271798, "learning_rate": 6.19824194151547e-08, "loss": 0.0, "num_input_tokens_seen": 224450000, "step": 104060 }, { "epoch": 19.097999632960175, "grad_norm": 6.236512854229659e-05, "learning_rate": 6.185678869579492e-08, "loss": 0.0, "num_input_tokens_seen": 224461072, "step": 104065 }, { "epoch": 19.098917232519728, "grad_norm": 5.8485151384957135e-05, "learning_rate": 6.17312846321555e-08, "loss": 0.0, "num_input_tokens_seen": 224472880, "step": 104070 }, { "epoch": 19.09983483207928, "grad_norm": 5.207874346524477e-05, "learning_rate": 6.160590722745496e-08, "loss": 0.0, "num_input_tokens_seen": 224484624, "step": 104075 }, { "epoch": 19.10075243163883, "grad_norm": 3.5195665986975655e-05, "learning_rate": 6.148065648490852e-08, "loss": 0.0, "num_input_tokens_seen": 224494096, "step": 104080 }, { "epoch": 19.101670031198385, "grad_norm": 0.0001444983499823138, "learning_rate": 6.135553240772973e-08, "loss": 0.0, "num_input_tokens_seen": 224504496, "step": 104085 }, { "epoch": 19.102587630757938, "grad_norm": 6.372670031851158e-05, "learning_rate": 6.123053499912768e-08, "loss": 0.0, "num_input_tokens_seen": 224515792, "step": 104090 }, { "epoch": 19.103505230317488, "grad_norm": 0.0001326282072113827, "learning_rate": 6.110566426230758e-08, "loss": 0.0, "num_input_tokens_seen": 224527376, "step": 104095 }, { "epoch": 19.10442282987704, "grad_norm": 0.0006949591333977878, "learning_rate": 6.098092020047242e-08, "loss": 0.0, "num_input_tokens_seen": 224539024, "step": 104100 }, { "epoch": 19.105340429436595, "grad_norm": 0.0013130315346643329, "learning_rate": 6.085630281682187e-08, "loss": 0.0, "num_input_tokens_seen": 224550192, "step": 104105 }, { "epoch": 19.106258028996145, "grad_norm": 4.5300479541765526e-05, "learning_rate": 6.073181211455281e-08, "loss": 0.0, "num_input_tokens_seen": 224561136, "step": 104110 }, { "epoch": 19.107175628555698, "grad_norm": 5.905970829189755e-05, "learning_rate": 6.06074480968566e-08, "loss": 0.0, "num_input_tokens_seen": 224571152, "step": 104115 }, { "epoch": 19.10809322811525, "grad_norm": 5.67807765037287e-05, "learning_rate": 6.048321076692454e-08, "loss": 0.0, "num_input_tokens_seen": 224582000, "step": 104120 }, { "epoch": 19.1090108276748, "grad_norm": 0.00024453981313854456, "learning_rate": 6.035910012794299e-08, "loss": 0.0, "num_input_tokens_seen": 224593168, "step": 104125 }, { "epoch": 19.109928427234355, "grad_norm": 0.0006176958559080958, "learning_rate": 6.023511618309441e-08, "loss": 0.0, "num_input_tokens_seen": 224604272, "step": 104130 }, { "epoch": 19.110846026793908, "grad_norm": 5.992675869492814e-05, "learning_rate": 6.011125893555902e-08, "loss": 0.0, "num_input_tokens_seen": 224614800, "step": 104135 }, { "epoch": 19.111763626353458, "grad_norm": 0.00016527737898286432, "learning_rate": 5.998752838851374e-08, "loss": 0.0, "num_input_tokens_seen": 224625776, "step": 104140 }, { "epoch": 19.11268122591301, "grad_norm": 6.292972102528438e-05, "learning_rate": 5.986392454513213e-08, "loss": 0.0, "num_input_tokens_seen": 224637552, "step": 104145 }, { "epoch": 19.113598825472565, "grad_norm": 0.00020501580729614943, "learning_rate": 5.974044740858386e-08, "loss": 0.0, "num_input_tokens_seen": 224648112, "step": 104150 }, { "epoch": 19.114516425032114, "grad_norm": 0.00018082653696183115, "learning_rate": 5.961709698203699e-08, "loss": 0.0, "num_input_tokens_seen": 224659472, "step": 104155 }, { "epoch": 19.115434024591668, "grad_norm": 0.0001277392148040235, "learning_rate": 5.9493873268654524e-08, "loss": 0.0, "num_input_tokens_seen": 224669808, "step": 104160 }, { "epoch": 19.11635162415122, "grad_norm": 4.91764658363536e-05, "learning_rate": 5.937077627159726e-08, "loss": 0.0, "num_input_tokens_seen": 224679504, "step": 104165 }, { "epoch": 19.11726922371077, "grad_norm": 5.6991124438354746e-05, "learning_rate": 5.924780599402213e-08, "loss": 0.0, "num_input_tokens_seen": 224690192, "step": 104170 }, { "epoch": 19.118186823270324, "grad_norm": 0.00010701955761760473, "learning_rate": 5.9124962439083274e-08, "loss": 0.0, "num_input_tokens_seen": 224700720, "step": 104175 }, { "epoch": 19.119104422829878, "grad_norm": 0.0016477524768561125, "learning_rate": 5.900224560993151e-08, "loss": 0.0, "num_input_tokens_seen": 224711152, "step": 104180 }, { "epoch": 19.120022022389428, "grad_norm": 0.0003963121271226555, "learning_rate": 5.8879655509714306e-08, "loss": 0.0, "num_input_tokens_seen": 224721872, "step": 104185 }, { "epoch": 19.12093962194898, "grad_norm": 0.00018672380247153342, "learning_rate": 5.875719214157582e-08, "loss": 0.0, "num_input_tokens_seen": 224731504, "step": 104190 }, { "epoch": 19.121857221508535, "grad_norm": 6.626259710174054e-05, "learning_rate": 5.863485550865744e-08, "loss": 0.0, "num_input_tokens_seen": 224742896, "step": 104195 }, { "epoch": 19.122774821068084, "grad_norm": 3.8810343539807945e-05, "learning_rate": 5.8512645614096086e-08, "loss": 0.0, "num_input_tokens_seen": 224752912, "step": 104200 }, { "epoch": 19.123692420627638, "grad_norm": 4.614250792656094e-05, "learning_rate": 5.839056246102703e-08, "loss": 0.0, "num_input_tokens_seen": 224763504, "step": 104205 }, { "epoch": 19.12461002018719, "grad_norm": 5.558696648222394e-05, "learning_rate": 5.826860605258111e-08, "loss": 0.0, "num_input_tokens_seen": 224774000, "step": 104210 }, { "epoch": 19.12552761974674, "grad_norm": 0.00042249239049851894, "learning_rate": 5.814677639188637e-08, "loss": 0.0, "num_input_tokens_seen": 224785584, "step": 104215 }, { "epoch": 19.126445219306294, "grad_norm": 0.0002706884406507015, "learning_rate": 5.8025073482068095e-08, "loss": 0.0, "num_input_tokens_seen": 224796560, "step": 104220 }, { "epoch": 19.127362818865848, "grad_norm": 0.00010917787585640326, "learning_rate": 5.7903497326247116e-08, "loss": 0.0, "num_input_tokens_seen": 224807152, "step": 104225 }, { "epoch": 19.128280418425398, "grad_norm": 9.232132288161665e-05, "learning_rate": 5.778204792754205e-08, "loss": 0.0, "num_input_tokens_seen": 224818128, "step": 104230 }, { "epoch": 19.12919801798495, "grad_norm": 5.551598223973997e-05, "learning_rate": 5.766072528906708e-08, "loss": 0.0, "num_input_tokens_seen": 224829488, "step": 104235 }, { "epoch": 19.130115617544504, "grad_norm": 0.00035717239370569587, "learning_rate": 5.753952941393526e-08, "loss": 0.0, "num_input_tokens_seen": 224840144, "step": 104240 }, { "epoch": 19.131033217104054, "grad_norm": 0.0006543067283928394, "learning_rate": 5.741846030525411e-08, "loss": 0.0, "num_input_tokens_seen": 224850352, "step": 104245 }, { "epoch": 19.131950816663608, "grad_norm": 0.011329884640872478, "learning_rate": 5.7297517966128926e-08, "loss": 0.0, "num_input_tokens_seen": 224861744, "step": 104250 }, { "epoch": 19.13286841622316, "grad_norm": 0.00013038053293712437, "learning_rate": 5.717670239966222e-08, "loss": 0.0, "num_input_tokens_seen": 224871440, "step": 104255 }, { "epoch": 19.13378601578271, "grad_norm": 0.0005755224265158176, "learning_rate": 5.705601360895263e-08, "loss": 0.0, "num_input_tokens_seen": 224882192, "step": 104260 }, { "epoch": 19.134703615342264, "grad_norm": 0.006121719256043434, "learning_rate": 5.693545159709491e-08, "loss": 0.0, "num_input_tokens_seen": 224893392, "step": 104265 }, { "epoch": 19.135621214901818, "grad_norm": 0.00035104964626953006, "learning_rate": 5.6815016367181564e-08, "loss": 0.0, "num_input_tokens_seen": 224902384, "step": 104270 }, { "epoch": 19.136538814461368, "grad_norm": 0.00017720785399433225, "learning_rate": 5.669470792230236e-08, "loss": 0.0, "num_input_tokens_seen": 224914480, "step": 104275 }, { "epoch": 19.13745641402092, "grad_norm": 0.0006779178511351347, "learning_rate": 5.657452626554261e-08, "loss": 0.0, "num_input_tokens_seen": 224924400, "step": 104280 }, { "epoch": 19.138374013580474, "grad_norm": 7.273453957168385e-05, "learning_rate": 5.6454471399984275e-08, "loss": 0.0, "num_input_tokens_seen": 224933424, "step": 104285 }, { "epoch": 19.139291613140024, "grad_norm": 0.00010102671512868255, "learning_rate": 5.6334543328707134e-08, "loss": 0.0, "num_input_tokens_seen": 224944080, "step": 104290 }, { "epoch": 19.140209212699578, "grad_norm": 0.0001285149046452716, "learning_rate": 5.621474205478705e-08, "loss": 0.0, "num_input_tokens_seen": 224954512, "step": 104295 }, { "epoch": 19.14112681225913, "grad_norm": 4.481868018046953e-05, "learning_rate": 5.609506758129601e-08, "loss": 0.0, "num_input_tokens_seen": 224965744, "step": 104300 }, { "epoch": 19.14204441181868, "grad_norm": 5.6435022997902706e-05, "learning_rate": 5.59755199113049e-08, "loss": 0.0, "num_input_tokens_seen": 224976080, "step": 104305 }, { "epoch": 19.142962011378234, "grad_norm": 5.698386303265579e-05, "learning_rate": 5.585609904787903e-08, "loss": 0.0, "num_input_tokens_seen": 224986192, "step": 104310 }, { "epoch": 19.143879610937788, "grad_norm": 0.00020347685494925827, "learning_rate": 5.5736804994081515e-08, "loss": 0.0, "num_input_tokens_seen": 224996656, "step": 104315 }, { "epoch": 19.144797210497337, "grad_norm": 7.154939521569759e-05, "learning_rate": 5.5617637752971575e-08, "loss": 0.0, "num_input_tokens_seen": 225007312, "step": 104320 }, { "epoch": 19.14571481005689, "grad_norm": 5.290695116855204e-05, "learning_rate": 5.549859732760676e-08, "loss": 0.0, "num_input_tokens_seen": 225018832, "step": 104325 }, { "epoch": 19.146632409616444, "grad_norm": 0.0002895962679758668, "learning_rate": 5.537968372103908e-08, "loss": 0.0, "num_input_tokens_seen": 225029456, "step": 104330 }, { "epoch": 19.147550009175994, "grad_norm": 0.0001250218047061935, "learning_rate": 5.526089693631942e-08, "loss": 0.0028, "num_input_tokens_seen": 225039440, "step": 104335 }, { "epoch": 19.148467608735547, "grad_norm": 7.190530595835298e-05, "learning_rate": 5.514223697649368e-08, "loss": 0.0, "num_input_tokens_seen": 225049360, "step": 104340 }, { "epoch": 19.1493852082951, "grad_norm": 0.00010219809337286279, "learning_rate": 5.502370384460609e-08, "loss": 0.0, "num_input_tokens_seen": 225059152, "step": 104345 }, { "epoch": 19.15030280785465, "grad_norm": 0.00010219814430456609, "learning_rate": 5.4905297543696446e-08, "loss": 0.0, "num_input_tokens_seen": 225070064, "step": 104350 }, { "epoch": 19.151220407414204, "grad_norm": 9.055864939000458e-05, "learning_rate": 5.478701807680176e-08, "loss": 0.0, "num_input_tokens_seen": 225081424, "step": 104355 }, { "epoch": 19.152138006973757, "grad_norm": 8.521631389157847e-05, "learning_rate": 5.466886544695571e-08, "loss": 0.0, "num_input_tokens_seen": 225092368, "step": 104360 }, { "epoch": 19.153055606533307, "grad_norm": 0.00021354386990424246, "learning_rate": 5.455083965718866e-08, "loss": 0.0, "num_input_tokens_seen": 225103248, "step": 104365 }, { "epoch": 19.15397320609286, "grad_norm": 5.6919874623417854e-05, "learning_rate": 5.443294071052763e-08, "loss": 0.0, "num_input_tokens_seen": 225115216, "step": 104370 }, { "epoch": 19.154890805652414, "grad_norm": 5.279365723254159e-05, "learning_rate": 5.431516860999686e-08, "loss": 0.0, "num_input_tokens_seen": 225126000, "step": 104375 }, { "epoch": 19.155808405211964, "grad_norm": 5.256224540062249e-05, "learning_rate": 5.4197523358617276e-08, "loss": 0.0, "num_input_tokens_seen": 225135824, "step": 104380 }, { "epoch": 19.156726004771517, "grad_norm": 0.000255072140134871, "learning_rate": 5.40800049594048e-08, "loss": 0.0, "num_input_tokens_seen": 225146896, "step": 104385 }, { "epoch": 19.15764360433107, "grad_norm": 7.383196498267353e-05, "learning_rate": 5.3962613415375895e-08, "loss": 0.0, "num_input_tokens_seen": 225157520, "step": 104390 }, { "epoch": 19.15856120389062, "grad_norm": 0.007908900268375874, "learning_rate": 5.384534872953984e-08, "loss": 0.0, "num_input_tokens_seen": 225168624, "step": 104395 }, { "epoch": 19.159478803450174, "grad_norm": 0.0007090280414558947, "learning_rate": 5.372821090490421e-08, "loss": 0.0, "num_input_tokens_seen": 225178384, "step": 104400 }, { "epoch": 19.160396403009727, "grad_norm": 0.00030484600574709475, "learning_rate": 5.3611199944474946e-08, "loss": 0.0, "num_input_tokens_seen": 225188880, "step": 104405 }, { "epoch": 19.161314002569277, "grad_norm": 0.000299671693937853, "learning_rate": 5.3494315851251866e-08, "loss": 0.0, "num_input_tokens_seen": 225199632, "step": 104410 }, { "epoch": 19.16223160212883, "grad_norm": 21.291881561279297, "learning_rate": 5.337755862823313e-08, "loss": 0.0306, "num_input_tokens_seen": 225210960, "step": 104415 }, { "epoch": 19.163149201688384, "grad_norm": 8.689038804732263e-05, "learning_rate": 5.3260928278413006e-08, "loss": 0.0, "num_input_tokens_seen": 225221872, "step": 104420 }, { "epoch": 19.164066801247934, "grad_norm": 7.950581493787467e-05, "learning_rate": 5.3144424804783545e-08, "loss": 0.0, "num_input_tokens_seen": 225231728, "step": 104425 }, { "epoch": 19.164984400807487, "grad_norm": 0.0003159632033202797, "learning_rate": 5.302804821033292e-08, "loss": 0.0, "num_input_tokens_seen": 225241840, "step": 104430 }, { "epoch": 19.16590200036704, "grad_norm": 7.994411862455308e-05, "learning_rate": 5.29117984980454e-08, "loss": 0.0, "num_input_tokens_seen": 225252944, "step": 104435 }, { "epoch": 19.16681959992659, "grad_norm": 0.00010343293979531154, "learning_rate": 5.2795675670903044e-08, "loss": 0.0, "num_input_tokens_seen": 225263312, "step": 104440 }, { "epoch": 19.167737199486144, "grad_norm": 0.00018730800366029143, "learning_rate": 5.2679679731884595e-08, "loss": 0.0, "num_input_tokens_seen": 225274384, "step": 104445 }, { "epoch": 19.168654799045697, "grad_norm": 0.0001463748631067574, "learning_rate": 5.256381068396432e-08, "loss": 0.0, "num_input_tokens_seen": 225285648, "step": 104450 }, { "epoch": 19.169572398605247, "grad_norm": 6.609714182559401e-05, "learning_rate": 5.24480685301143e-08, "loss": 0.0, "num_input_tokens_seen": 225295664, "step": 104455 }, { "epoch": 19.1704899981648, "grad_norm": 0.00033940337016247213, "learning_rate": 5.2332453273303827e-08, "loss": 0.0, "num_input_tokens_seen": 225307024, "step": 104460 }, { "epoch": 19.171407597724354, "grad_norm": 3.330289837322198e-05, "learning_rate": 5.221696491649775e-08, "loss": 0.0, "num_input_tokens_seen": 225318160, "step": 104465 }, { "epoch": 19.172325197283904, "grad_norm": 0.0004618088423740119, "learning_rate": 5.2101603462657576e-08, "loss": 0.0, "num_input_tokens_seen": 225329456, "step": 104470 }, { "epoch": 19.173242796843457, "grad_norm": 0.00021409281180240214, "learning_rate": 5.198636891474262e-08, "loss": 0.0, "num_input_tokens_seen": 225340496, "step": 104475 }, { "epoch": 19.17416039640301, "grad_norm": 7.499857747461647e-05, "learning_rate": 5.1871261275709405e-08, "loss": 0.0, "num_input_tokens_seen": 225350864, "step": 104480 }, { "epoch": 19.17507799596256, "grad_norm": 9.451901132706553e-05, "learning_rate": 5.17562805485089e-08, "loss": 0.0, "num_input_tokens_seen": 225361328, "step": 104485 }, { "epoch": 19.175995595522114, "grad_norm": 7.55990986363031e-05, "learning_rate": 5.164142673609041e-08, "loss": 0.0, "num_input_tokens_seen": 225373232, "step": 104490 }, { "epoch": 19.176913195081667, "grad_norm": 0.001333769760094583, "learning_rate": 5.152669984140102e-08, "loss": 0.0, "num_input_tokens_seen": 225384592, "step": 104495 }, { "epoch": 19.177830794641217, "grad_norm": 8.282812632387504e-05, "learning_rate": 5.1412099867381715e-08, "loss": 0.0, "num_input_tokens_seen": 225395440, "step": 104500 }, { "epoch": 19.17874839420077, "grad_norm": 0.0001477049372624606, "learning_rate": 5.129762681697237e-08, "loss": 0.0, "num_input_tokens_seen": 225406096, "step": 104505 }, { "epoch": 19.179665993760324, "grad_norm": 8.970321505330503e-05, "learning_rate": 5.118328069310896e-08, "loss": 0.0, "num_input_tokens_seen": 225416400, "step": 104510 }, { "epoch": 19.180583593319874, "grad_norm": 6.627944821957499e-05, "learning_rate": 5.10690614987247e-08, "loss": 0.0, "num_input_tokens_seen": 225426160, "step": 104515 }, { "epoch": 19.181501192879427, "grad_norm": 6.017565101501532e-05, "learning_rate": 5.095496923674892e-08, "loss": 0.0, "num_input_tokens_seen": 225438160, "step": 104520 }, { "epoch": 19.18241879243898, "grad_norm": 0.0011366520775482059, "learning_rate": 5.08410039101076e-08, "loss": 0.0, "num_input_tokens_seen": 225448784, "step": 104525 }, { "epoch": 19.18333639199853, "grad_norm": 0.0028677319642156363, "learning_rate": 5.072716552172452e-08, "loss": 0.0, "num_input_tokens_seen": 225459856, "step": 104530 }, { "epoch": 19.184253991558084, "grad_norm": 0.0005291841807775199, "learning_rate": 5.0613454074518455e-08, "loss": 0.0, "num_input_tokens_seen": 225470512, "step": 104535 }, { "epoch": 19.185171591117637, "grad_norm": 0.00018738768994808197, "learning_rate": 5.049986957140651e-08, "loss": 0.0, "num_input_tokens_seen": 225480400, "step": 104540 }, { "epoch": 19.186089190677187, "grad_norm": 5.299362965160981e-05, "learning_rate": 5.0386412015302475e-08, "loss": 0.0, "num_input_tokens_seen": 225491696, "step": 104545 }, { "epoch": 19.18700679023674, "grad_norm": 5.02268121636007e-05, "learning_rate": 5.027308140911513e-08, "loss": 0.0, "num_input_tokens_seen": 225503600, "step": 104550 }, { "epoch": 19.187924389796294, "grad_norm": 9.471666999161243e-05, "learning_rate": 5.015987775575215e-08, "loss": 0.0, "num_input_tokens_seen": 225515632, "step": 104555 }, { "epoch": 19.188841989355844, "grad_norm": 0.00010350440425099805, "learning_rate": 5.004680105811677e-08, "loss": 0.0, "num_input_tokens_seen": 225526896, "step": 104560 }, { "epoch": 19.189759588915397, "grad_norm": 0.0002034873905358836, "learning_rate": 4.993385131910888e-08, "loss": 0.0, "num_input_tokens_seen": 225537872, "step": 104565 }, { "epoch": 19.19067718847495, "grad_norm": 0.00022259337129071355, "learning_rate": 4.982102854162618e-08, "loss": 0.0, "num_input_tokens_seen": 225548496, "step": 104570 }, { "epoch": 19.1915947880345, "grad_norm": 0.0001915444590849802, "learning_rate": 4.97083327285619e-08, "loss": 0.0, "num_input_tokens_seen": 225557808, "step": 104575 }, { "epoch": 19.192512387594054, "grad_norm": 0.00023175585374701768, "learning_rate": 4.9595763882806514e-08, "loss": 0.0, "num_input_tokens_seen": 225568784, "step": 104580 }, { "epoch": 19.193429987153607, "grad_norm": 9.246491390513256e-05, "learning_rate": 4.9483322007247145e-08, "loss": 0.0, "num_input_tokens_seen": 225577616, "step": 104585 }, { "epoch": 19.194347586713157, "grad_norm": 3.773721618927084e-05, "learning_rate": 4.937100710476872e-08, "loss": 0.0, "num_input_tokens_seen": 225588592, "step": 104590 }, { "epoch": 19.19526518627271, "grad_norm": 9.566129301674664e-05, "learning_rate": 4.9258819178250596e-08, "loss": 0.004, "num_input_tokens_seen": 225598288, "step": 104595 }, { "epoch": 19.196182785832264, "grad_norm": 5.181245069252327e-05, "learning_rate": 4.914675823057102e-08, "loss": 0.0, "num_input_tokens_seen": 225609776, "step": 104600 }, { "epoch": 19.197100385391813, "grad_norm": 0.00026194521342404187, "learning_rate": 4.903482426460382e-08, "loss": 0.0, "num_input_tokens_seen": 225622096, "step": 104605 }, { "epoch": 19.198017984951367, "grad_norm": 7.304782775463536e-05, "learning_rate": 4.892301728322002e-08, "loss": 0.0, "num_input_tokens_seen": 225633552, "step": 104610 }, { "epoch": 19.19893558451092, "grad_norm": 9.494636469753459e-05, "learning_rate": 4.881133728928733e-08, "loss": 0.0, "num_input_tokens_seen": 225644688, "step": 104615 }, { "epoch": 19.19985318407047, "grad_norm": 6.158666656119749e-05, "learning_rate": 4.869978428567012e-08, "loss": 0.0, "num_input_tokens_seen": 225655184, "step": 104620 }, { "epoch": 19.200770783630023, "grad_norm": 6.332543125608936e-05, "learning_rate": 4.858835827523001e-08, "loss": 0.0, "num_input_tokens_seen": 225665520, "step": 104625 }, { "epoch": 19.201688383189577, "grad_norm": 0.00041371153201907873, "learning_rate": 4.8477059260824685e-08, "loss": 0.0, "num_input_tokens_seen": 225675888, "step": 104630 }, { "epoch": 19.202605982749127, "grad_norm": 0.00038705748738721013, "learning_rate": 4.8365887245308e-08, "loss": 0.0, "num_input_tokens_seen": 225685968, "step": 104635 }, { "epoch": 19.20352358230868, "grad_norm": 0.0001438625913579017, "learning_rate": 4.8254842231532095e-08, "loss": 0.0, "num_input_tokens_seen": 225697040, "step": 104640 }, { "epoch": 19.204441181868233, "grad_norm": 8.713112765690312e-05, "learning_rate": 4.814392422234526e-08, "loss": 0.0, "num_input_tokens_seen": 225708560, "step": 104645 }, { "epoch": 19.205358781427783, "grad_norm": 5.729836630052887e-05, "learning_rate": 4.803313322059189e-08, "loss": 0.0, "num_input_tokens_seen": 225719312, "step": 104650 }, { "epoch": 19.206276380987337, "grad_norm": 3.822282451437786e-05, "learning_rate": 4.792246922911359e-08, "loss": 0.0, "num_input_tokens_seen": 225730320, "step": 104655 }, { "epoch": 19.20719398054689, "grad_norm": 4.221151539240964e-05, "learning_rate": 4.7811932250749205e-08, "loss": 0.0, "num_input_tokens_seen": 225740912, "step": 104660 }, { "epoch": 19.20811158010644, "grad_norm": 0.00018547284707892686, "learning_rate": 4.7701522288333694e-08, "loss": 0.0, "num_input_tokens_seen": 225753456, "step": 104665 }, { "epoch": 19.209029179665993, "grad_norm": 0.00011569372145459056, "learning_rate": 4.759123934469867e-08, "loss": 0.0, "num_input_tokens_seen": 225764016, "step": 104670 }, { "epoch": 19.209946779225547, "grad_norm": 0.0019033907447010279, "learning_rate": 4.7481083422672435e-08, "loss": 0.0, "num_input_tokens_seen": 225774384, "step": 104675 }, { "epoch": 19.210864378785097, "grad_norm": 9.079655865207314e-05, "learning_rate": 4.737105452508106e-08, "loss": 0.0, "num_input_tokens_seen": 225785616, "step": 104680 }, { "epoch": 19.21178197834465, "grad_norm": 0.0003714035847224295, "learning_rate": 4.726115265474673e-08, "loss": 0.0, "num_input_tokens_seen": 225796592, "step": 104685 }, { "epoch": 19.212699577904203, "grad_norm": 0.00015465157048311085, "learning_rate": 4.715137781448664e-08, "loss": 0.0, "num_input_tokens_seen": 225806800, "step": 104690 }, { "epoch": 19.213617177463753, "grad_norm": 8.911301847547293e-05, "learning_rate": 4.7041730007118536e-08, "loss": 0.0, "num_input_tokens_seen": 225817872, "step": 104695 }, { "epoch": 19.214534777023307, "grad_norm": 0.00013757481065113097, "learning_rate": 4.69322092354535e-08, "loss": 0.0, "num_input_tokens_seen": 225829168, "step": 104700 }, { "epoch": 19.21545237658286, "grad_norm": 9.292100730817765e-05, "learning_rate": 4.6822815502299834e-08, "loss": 0.0, "num_input_tokens_seen": 225840784, "step": 104705 }, { "epoch": 19.21636997614241, "grad_norm": 0.0003350919869262725, "learning_rate": 4.67135488104653e-08, "loss": 0.0, "num_input_tokens_seen": 225851920, "step": 104710 }, { "epoch": 19.217287575701963, "grad_norm": 0.00011867550347233191, "learning_rate": 4.6604409162750995e-08, "loss": 0.0, "num_input_tokens_seen": 225861776, "step": 104715 }, { "epoch": 19.218205175261517, "grad_norm": 5.789556598756462e-05, "learning_rate": 4.649539656195634e-08, "loss": 0.0, "num_input_tokens_seen": 225873264, "step": 104720 }, { "epoch": 19.219122774821066, "grad_norm": 6.301964458543807e-05, "learning_rate": 4.6386511010877435e-08, "loss": 0.0, "num_input_tokens_seen": 225884432, "step": 104725 }, { "epoch": 19.22004037438062, "grad_norm": 0.00010077454498969018, "learning_rate": 4.6277752512307595e-08, "loss": 0.0, "num_input_tokens_seen": 225895632, "step": 104730 }, { "epoch": 19.220957973940173, "grad_norm": 9.18779187486507e-05, "learning_rate": 4.6169121069035703e-08, "loss": 0.0, "num_input_tokens_seen": 225907024, "step": 104735 }, { "epoch": 19.221875573499723, "grad_norm": 6.874706014059484e-05, "learning_rate": 4.606061668384787e-08, "loss": 0.0, "num_input_tokens_seen": 225917584, "step": 104740 }, { "epoch": 19.222793173059276, "grad_norm": 0.00013077701441943645, "learning_rate": 4.5952239359527416e-08, "loss": 0.0, "num_input_tokens_seen": 225927824, "step": 104745 }, { "epoch": 19.22371077261883, "grad_norm": 0.00012029671052005142, "learning_rate": 4.584398909885379e-08, "loss": 0.0, "num_input_tokens_seen": 225938864, "step": 104750 }, { "epoch": 19.22462837217838, "grad_norm": 0.00033457769313827157, "learning_rate": 4.573586590460366e-08, "loss": 0.0, "num_input_tokens_seen": 225950032, "step": 104755 }, { "epoch": 19.225545971737933, "grad_norm": 0.00012190300913061947, "learning_rate": 4.562786977955036e-08, "loss": 0.0, "num_input_tokens_seen": 225961616, "step": 104760 }, { "epoch": 19.226463571297487, "grad_norm": 0.0010749693028628826, "learning_rate": 4.552000072646334e-08, "loss": 0.0, "num_input_tokens_seen": 225971984, "step": 104765 }, { "epoch": 19.227381170857036, "grad_norm": 0.0001668542536208406, "learning_rate": 4.541225874810984e-08, "loss": 0.0, "num_input_tokens_seen": 225983120, "step": 104770 }, { "epoch": 19.22829877041659, "grad_norm": 0.000938552781008184, "learning_rate": 4.5304643847252636e-08, "loss": 0.0, "num_input_tokens_seen": 225994384, "step": 104775 }, { "epoch": 19.229216369976143, "grad_norm": 7.068790000630543e-05, "learning_rate": 4.5197156026652866e-08, "loss": 0.0, "num_input_tokens_seen": 226004496, "step": 104780 }, { "epoch": 19.230133969535693, "grad_norm": 4.1431492718402296e-05, "learning_rate": 4.508979528906609e-08, "loss": 0.0, "num_input_tokens_seen": 226015600, "step": 104785 }, { "epoch": 19.231051569095246, "grad_norm": 4.005302980658598e-05, "learning_rate": 4.4982561637247346e-08, "loss": 0.0, "num_input_tokens_seen": 226026256, "step": 104790 }, { "epoch": 19.2319691686548, "grad_norm": 0.00011352221918059513, "learning_rate": 4.487545507394608e-08, "loss": 0.0, "num_input_tokens_seen": 226036688, "step": 104795 }, { "epoch": 19.23288676821435, "grad_norm": 7.04252888681367e-05, "learning_rate": 4.476847560190956e-08, "loss": 0.0, "num_input_tokens_seen": 226047504, "step": 104800 }, { "epoch": 19.233804367773903, "grad_norm": 0.00012597393651958555, "learning_rate": 4.466162322388112e-08, "loss": 0.0, "num_input_tokens_seen": 226057136, "step": 104805 }, { "epoch": 19.234721967333456, "grad_norm": 5.059036266175099e-05, "learning_rate": 4.4554897942603034e-08, "loss": 0.0, "num_input_tokens_seen": 226067408, "step": 104810 }, { "epoch": 19.235639566893006, "grad_norm": 0.00108096853364259, "learning_rate": 4.4448299760810884e-08, "loss": 0.0, "num_input_tokens_seen": 226076720, "step": 104815 }, { "epoch": 19.23655716645256, "grad_norm": 0.0002930315677076578, "learning_rate": 4.434182868123971e-08, "loss": 0.0, "num_input_tokens_seen": 226086256, "step": 104820 }, { "epoch": 19.237474766012113, "grad_norm": 0.0001522265956737101, "learning_rate": 4.4235484706619535e-08, "loss": 0.0, "num_input_tokens_seen": 226095952, "step": 104825 }, { "epoch": 19.238392365571663, "grad_norm": 7.977609493536875e-05, "learning_rate": 4.4129267839679305e-08, "loss": 0.0, "num_input_tokens_seen": 226107472, "step": 104830 }, { "epoch": 19.239309965131216, "grad_norm": 0.0003464520559646189, "learning_rate": 4.402317808314183e-08, "loss": 0.0, "num_input_tokens_seen": 226118576, "step": 104835 }, { "epoch": 19.24022756469077, "grad_norm": 0.00107179197948426, "learning_rate": 4.3917215439728824e-08, "loss": 0.0, "num_input_tokens_seen": 226129360, "step": 104840 }, { "epoch": 19.24114516425032, "grad_norm": 0.00012370396871119738, "learning_rate": 4.3811379912158114e-08, "loss": 0.0, "num_input_tokens_seen": 226140560, "step": 104845 }, { "epoch": 19.242062763809873, "grad_norm": 4.92054496135097e-05, "learning_rate": 4.37056715031442e-08, "loss": 0.0, "num_input_tokens_seen": 226151760, "step": 104850 }, { "epoch": 19.242980363369426, "grad_norm": 9.262399544240907e-05, "learning_rate": 4.360009021539768e-08, "loss": 0.0, "num_input_tokens_seen": 226160880, "step": 104855 }, { "epoch": 19.243897962928976, "grad_norm": 6.24179228907451e-05, "learning_rate": 4.349463605162807e-08, "loss": 0.0, "num_input_tokens_seen": 226170800, "step": 104860 }, { "epoch": 19.24481556248853, "grad_norm": 0.00017007226415444165, "learning_rate": 4.338930901453875e-08, "loss": 0.0, "num_input_tokens_seen": 226181904, "step": 104865 }, { "epoch": 19.245733162048083, "grad_norm": 0.0001299805735470727, "learning_rate": 4.328410910683145e-08, "loss": 0.0, "num_input_tokens_seen": 226192400, "step": 104870 }, { "epoch": 19.246650761607633, "grad_norm": 6.536541332025081e-05, "learning_rate": 4.317903633120457e-08, "loss": 0.0, "num_input_tokens_seen": 226203568, "step": 104875 }, { "epoch": 19.247568361167186, "grad_norm": 8.216431160690263e-05, "learning_rate": 4.3074090690353175e-08, "loss": 0.0, "num_input_tokens_seen": 226214704, "step": 104880 }, { "epoch": 19.24848596072674, "grad_norm": 0.00548659497871995, "learning_rate": 4.2969272186969004e-08, "loss": 0.0, "num_input_tokens_seen": 226225552, "step": 104885 }, { "epoch": 19.24940356028629, "grad_norm": 7.636423106305301e-05, "learning_rate": 4.286458082373934e-08, "loss": 0.0, "num_input_tokens_seen": 226235952, "step": 104890 }, { "epoch": 19.250321159845843, "grad_norm": 0.00016335307736881077, "learning_rate": 4.2760016603351493e-08, "loss": 0.0, "num_input_tokens_seen": 226246608, "step": 104895 }, { "epoch": 19.251238759405396, "grad_norm": 7.375497807515785e-05, "learning_rate": 4.265557952848554e-08, "loss": 0.0, "num_input_tokens_seen": 226257840, "step": 104900 }, { "epoch": 19.252156358964946, "grad_norm": 0.0006072645192034543, "learning_rate": 4.255126960182099e-08, "loss": 0.0, "num_input_tokens_seen": 226268336, "step": 104905 }, { "epoch": 19.2530739585245, "grad_norm": 0.00017133765504695475, "learning_rate": 4.244708682603293e-08, "loss": 0.0, "num_input_tokens_seen": 226278480, "step": 104910 }, { "epoch": 19.253991558084053, "grad_norm": 0.00018168111273553222, "learning_rate": 4.234303120379368e-08, "loss": 0.0, "num_input_tokens_seen": 226289328, "step": 104915 }, { "epoch": 19.254909157643603, "grad_norm": 0.0010285030584782362, "learning_rate": 4.2239102737772207e-08, "loss": 0.0, "num_input_tokens_seen": 226299856, "step": 104920 }, { "epoch": 19.255826757203156, "grad_norm": 0.00015807244926691055, "learning_rate": 4.2135301430633046e-08, "loss": 0.0, "num_input_tokens_seen": 226310192, "step": 104925 }, { "epoch": 19.25674435676271, "grad_norm": 0.0005024176207371056, "learning_rate": 4.203162728504018e-08, "loss": 0.0, "num_input_tokens_seen": 226320912, "step": 104930 }, { "epoch": 19.25766195632226, "grad_norm": 0.001168909133411944, "learning_rate": 4.192808030365203e-08, "loss": 0.0, "num_input_tokens_seen": 226332624, "step": 104935 }, { "epoch": 19.258579555881813, "grad_norm": 0.00020769414550159127, "learning_rate": 4.1824660489123705e-08, "loss": 0.0, "num_input_tokens_seen": 226344880, "step": 104940 }, { "epoch": 19.259497155441366, "grad_norm": 0.0003706834395416081, "learning_rate": 4.172136784410918e-08, "loss": 0.0, "num_input_tokens_seen": 226356848, "step": 104945 }, { "epoch": 19.260414755000916, "grad_norm": 0.0004845541261602193, "learning_rate": 4.1618202371256355e-08, "loss": 0.0, "num_input_tokens_seen": 226367664, "step": 104950 }, { "epoch": 19.26133235456047, "grad_norm": 0.00017204444156959653, "learning_rate": 4.1515164073211987e-08, "loss": 0.0, "num_input_tokens_seen": 226379344, "step": 104955 }, { "epoch": 19.262249954120023, "grad_norm": 0.0001673900696914643, "learning_rate": 4.141225295261842e-08, "loss": 0.0, "num_input_tokens_seen": 226388880, "step": 104960 }, { "epoch": 19.263167553679573, "grad_norm": 0.0008532290230505168, "learning_rate": 4.1309469012115744e-08, "loss": 0.0, "num_input_tokens_seen": 226399568, "step": 104965 }, { "epoch": 19.264085153239126, "grad_norm": 6.074182238080539e-05, "learning_rate": 4.1206812254340204e-08, "loss": 0.0, "num_input_tokens_seen": 226410800, "step": 104970 }, { "epoch": 19.26500275279868, "grad_norm": 0.0002790990110952407, "learning_rate": 4.110428268192412e-08, "loss": 0.0, "num_input_tokens_seen": 226421840, "step": 104975 }, { "epoch": 19.26592035235823, "grad_norm": 0.0001441125787096098, "learning_rate": 4.100188029749763e-08, "loss": 0.0, "num_input_tokens_seen": 226432112, "step": 104980 }, { "epoch": 19.266837951917783, "grad_norm": 0.001533434959128499, "learning_rate": 4.0899605103686956e-08, "loss": 0.0, "num_input_tokens_seen": 226443600, "step": 104985 }, { "epoch": 19.267755551477336, "grad_norm": 5.936638262937777e-05, "learning_rate": 4.079745710311611e-08, "loss": 0.0, "num_input_tokens_seen": 226455600, "step": 104990 }, { "epoch": 19.268673151036886, "grad_norm": 9.574431169312447e-05, "learning_rate": 4.069543629840411e-08, "loss": 0.0, "num_input_tokens_seen": 226466256, "step": 104995 }, { "epoch": 19.26959075059644, "grad_norm": 5.639901792164892e-05, "learning_rate": 4.0593542692167755e-08, "loss": 0.0, "num_input_tokens_seen": 226478352, "step": 105000 }, { "epoch": 19.270508350155993, "grad_norm": 6.717738870065659e-05, "learning_rate": 4.04917762870205e-08, "loss": 0.0, "num_input_tokens_seen": 226489456, "step": 105005 }, { "epoch": 19.271425949715542, "grad_norm": 0.0032953121699392796, "learning_rate": 4.0390137085573046e-08, "loss": 0.0, "num_input_tokens_seen": 226498416, "step": 105010 }, { "epoch": 19.272343549275096, "grad_norm": 7.739016291452572e-05, "learning_rate": 4.0288625090431634e-08, "loss": 0.0, "num_input_tokens_seen": 226509616, "step": 105015 }, { "epoch": 19.27326114883465, "grad_norm": 5.254355346551165e-05, "learning_rate": 4.0187240304200294e-08, "loss": 0.0, "num_input_tokens_seen": 226520496, "step": 105020 }, { "epoch": 19.2741787483942, "grad_norm": 7.059674680931494e-05, "learning_rate": 4.008598272947917e-08, "loss": 0.0, "num_input_tokens_seen": 226532016, "step": 105025 }, { "epoch": 19.275096347953752, "grad_norm": 6.224456592462957e-05, "learning_rate": 3.9984852368865626e-08, "loss": 0.0, "num_input_tokens_seen": 226542288, "step": 105030 }, { "epoch": 19.276013947513306, "grad_norm": 0.0001251145004061982, "learning_rate": 3.988384922495314e-08, "loss": 0.0, "num_input_tokens_seen": 226553232, "step": 105035 }, { "epoch": 19.276931547072856, "grad_norm": 4.653623909689486e-05, "learning_rate": 3.978297330033187e-08, "loss": 0.0, "num_input_tokens_seen": 226562672, "step": 105040 }, { "epoch": 19.27784914663241, "grad_norm": 4.497521513258107e-05, "learning_rate": 3.9682224597590303e-08, "loss": 0.0, "num_input_tokens_seen": 226572304, "step": 105045 }, { "epoch": 19.278766746191963, "grad_norm": 0.000282889959635213, "learning_rate": 3.9581603119311915e-08, "loss": 0.0, "num_input_tokens_seen": 226582896, "step": 105050 }, { "epoch": 19.279684345751512, "grad_norm": 4.3741743866121396e-05, "learning_rate": 3.948110886807743e-08, "loss": 0.0, "num_input_tokens_seen": 226593904, "step": 105055 }, { "epoch": 19.280601945311066, "grad_norm": 5.7382618251722306e-05, "learning_rate": 3.938074184646423e-08, "loss": 0.0, "num_input_tokens_seen": 226604144, "step": 105060 }, { "epoch": 19.28151954487062, "grad_norm": 7.3581664764788e-05, "learning_rate": 3.928050205704692e-08, "loss": 0.0, "num_input_tokens_seen": 226615152, "step": 105065 }, { "epoch": 19.28243714443017, "grad_norm": 6.913067772984505e-05, "learning_rate": 3.9180389502396224e-08, "loss": 0.0, "num_input_tokens_seen": 226626000, "step": 105070 }, { "epoch": 19.283354743989722, "grad_norm": 0.00010315753752365708, "learning_rate": 3.9080404185079524e-08, "loss": 0.0, "num_input_tokens_seen": 226636656, "step": 105075 }, { "epoch": 19.284272343549276, "grad_norm": 5.2691892051370814e-05, "learning_rate": 3.898054610766255e-08, "loss": 0.0, "num_input_tokens_seen": 226647248, "step": 105080 }, { "epoch": 19.285189943108826, "grad_norm": 5.725157097913325e-05, "learning_rate": 3.888081527270493e-08, "loss": 0.0, "num_input_tokens_seen": 226657200, "step": 105085 }, { "epoch": 19.28610754266838, "grad_norm": 0.0004858085885643959, "learning_rate": 3.8781211682765716e-08, "loss": 0.0, "num_input_tokens_seen": 226666640, "step": 105090 }, { "epoch": 19.287025142227932, "grad_norm": 0.0003919348237104714, "learning_rate": 3.8681735340398984e-08, "loss": 0.0, "num_input_tokens_seen": 226676240, "step": 105095 }, { "epoch": 19.287942741787482, "grad_norm": 9.669655264588073e-05, "learning_rate": 3.8582386248157133e-08, "loss": 0.0, "num_input_tokens_seen": 226686512, "step": 105100 }, { "epoch": 19.288860341347036, "grad_norm": 9.383070573676378e-05, "learning_rate": 3.8483164408587016e-08, "loss": 0.0, "num_input_tokens_seen": 226697008, "step": 105105 }, { "epoch": 19.28977794090659, "grad_norm": 0.0001114110418711789, "learning_rate": 3.838406982423382e-08, "loss": 0.0, "num_input_tokens_seen": 226707440, "step": 105110 }, { "epoch": 19.29069554046614, "grad_norm": 5.300208067637868e-05, "learning_rate": 3.828510249763995e-08, "loss": 0.0, "num_input_tokens_seen": 226717776, "step": 105115 }, { "epoch": 19.291613140025692, "grad_norm": 0.00024384087009821087, "learning_rate": 3.8186262431342826e-08, "loss": 0.0, "num_input_tokens_seen": 226727920, "step": 105120 }, { "epoch": 19.292530739585246, "grad_norm": 0.00010387610382167622, "learning_rate": 3.8087549627878196e-08, "loss": 0.0, "num_input_tokens_seen": 226739088, "step": 105125 }, { "epoch": 19.293448339144796, "grad_norm": 9.09728987608105e-05, "learning_rate": 3.798896408977737e-08, "loss": 0.0, "num_input_tokens_seen": 226749776, "step": 105130 }, { "epoch": 19.29436593870435, "grad_norm": 0.00034186019911430776, "learning_rate": 3.7890505819569435e-08, "loss": 0.0, "num_input_tokens_seen": 226760432, "step": 105135 }, { "epoch": 19.295283538263902, "grad_norm": 0.0008070315234363079, "learning_rate": 3.779217481977959e-08, "loss": 0.0, "num_input_tokens_seen": 226771696, "step": 105140 }, { "epoch": 19.296201137823452, "grad_norm": 0.0001925333490362391, "learning_rate": 3.769397109292971e-08, "loss": 0.0, "num_input_tokens_seen": 226782960, "step": 105145 }, { "epoch": 19.297118737383006, "grad_norm": 5.542365033761598e-05, "learning_rate": 3.759589464153834e-08, "loss": 0.0, "num_input_tokens_seen": 226794064, "step": 105150 }, { "epoch": 19.29803633694256, "grad_norm": 0.0007707609911449254, "learning_rate": 3.7497945468121244e-08, "loss": 0.0, "num_input_tokens_seen": 226804560, "step": 105155 }, { "epoch": 19.29895393650211, "grad_norm": 0.00011856284982059151, "learning_rate": 3.740012357519085e-08, "loss": 0.0, "num_input_tokens_seen": 226815344, "step": 105160 }, { "epoch": 19.299871536061662, "grad_norm": 0.00012794659414794296, "learning_rate": 3.7302428965256263e-08, "loss": 0.0, "num_input_tokens_seen": 226826448, "step": 105165 }, { "epoch": 19.300789135621216, "grad_norm": 0.00015571349649690092, "learning_rate": 3.7204861640822154e-08, "loss": 0.0, "num_input_tokens_seen": 226837776, "step": 105170 }, { "epoch": 19.301706735180765, "grad_norm": 0.00015582016203552485, "learning_rate": 3.710742160439207e-08, "loss": 0.0, "num_input_tokens_seen": 226848880, "step": 105175 }, { "epoch": 19.30262433474032, "grad_norm": 0.00030199435423128307, "learning_rate": 3.701010885846512e-08, "loss": 0.0, "num_input_tokens_seen": 226859440, "step": 105180 }, { "epoch": 19.303541934299872, "grad_norm": 5.5728327424731106e-05, "learning_rate": 3.691292340553654e-08, "loss": 0.0, "num_input_tokens_seen": 226870512, "step": 105185 }, { "epoch": 19.304459533859422, "grad_norm": 0.00012781568511854857, "learning_rate": 3.681586524809932e-08, "loss": 0.0, "num_input_tokens_seen": 226880976, "step": 105190 }, { "epoch": 19.305377133418975, "grad_norm": 9.446660988032818e-05, "learning_rate": 3.671893438864316e-08, "loss": 0.0, "num_input_tokens_seen": 226893008, "step": 105195 }, { "epoch": 19.30629473297853, "grad_norm": 5.593520472757518e-05, "learning_rate": 3.662213082965383e-08, "loss": 0.0, "num_input_tokens_seen": 226903472, "step": 105200 }, { "epoch": 19.30721233253808, "grad_norm": 0.0002572928788140416, "learning_rate": 3.652545457361489e-08, "loss": 0.0, "num_input_tokens_seen": 226914192, "step": 105205 }, { "epoch": 19.308129932097632, "grad_norm": 6.023603418725543e-05, "learning_rate": 3.642890562300439e-08, "loss": 0.0, "num_input_tokens_seen": 226926320, "step": 105210 }, { "epoch": 19.309047531657185, "grad_norm": 0.00042312746518291533, "learning_rate": 3.6332483980300316e-08, "loss": 0.0, "num_input_tokens_seen": 226937296, "step": 105215 }, { "epoch": 19.309965131216735, "grad_norm": 6.317556108115241e-05, "learning_rate": 3.62361896479746e-08, "loss": 0.0, "num_input_tokens_seen": 226946256, "step": 105220 }, { "epoch": 19.31088273077629, "grad_norm": 0.00015487709606532007, "learning_rate": 3.614002262849803e-08, "loss": 0.0, "num_input_tokens_seen": 226956688, "step": 105225 }, { "epoch": 19.311800330335842, "grad_norm": 9.811278141569346e-05, "learning_rate": 3.604398292433586e-08, "loss": 0.0, "num_input_tokens_seen": 226967952, "step": 105230 }, { "epoch": 19.312717929895392, "grad_norm": 7.352573447860777e-05, "learning_rate": 3.5948070537952796e-08, "loss": 0.0, "num_input_tokens_seen": 226979696, "step": 105235 }, { "epoch": 19.313635529454945, "grad_norm": 6.237437628442422e-05, "learning_rate": 3.585228547180797e-08, "loss": 0.0, "num_input_tokens_seen": 226990160, "step": 105240 }, { "epoch": 19.3145531290145, "grad_norm": 9.226513793691993e-05, "learning_rate": 3.575662772835775e-08, "loss": 0.0, "num_input_tokens_seen": 226999600, "step": 105245 }, { "epoch": 19.31547072857405, "grad_norm": 7.327328057726845e-05, "learning_rate": 3.5661097310056846e-08, "loss": 0.0, "num_input_tokens_seen": 227009904, "step": 105250 }, { "epoch": 19.316388328133602, "grad_norm": 4.335638368502259e-05, "learning_rate": 3.5565694219354406e-08, "loss": 0.0, "num_input_tokens_seen": 227019280, "step": 105255 }, { "epoch": 19.317305927693155, "grad_norm": 5.099318514112383e-05, "learning_rate": 3.5470418458697365e-08, "loss": 0.0, "num_input_tokens_seen": 227030512, "step": 105260 }, { "epoch": 19.318223527252705, "grad_norm": 4.0431976231047884e-05, "learning_rate": 3.5375270030530427e-08, "loss": 0.0, "num_input_tokens_seen": 227041200, "step": 105265 }, { "epoch": 19.31914112681226, "grad_norm": 6.689353176625445e-05, "learning_rate": 3.5280248937293316e-08, "loss": 0.0, "num_input_tokens_seen": 227051344, "step": 105270 }, { "epoch": 19.320058726371812, "grad_norm": 8.138668636092916e-05, "learning_rate": 3.518535518142297e-08, "loss": 0.0, "num_input_tokens_seen": 227062064, "step": 105275 }, { "epoch": 19.320976325931362, "grad_norm": 0.00010044247756013647, "learning_rate": 3.509058876535354e-08, "loss": 0.0001, "num_input_tokens_seen": 227072848, "step": 105280 }, { "epoch": 19.321893925490915, "grad_norm": 0.003248698776587844, "learning_rate": 3.499594969151532e-08, "loss": 0.0, "num_input_tokens_seen": 227082320, "step": 105285 }, { "epoch": 19.32281152505047, "grad_norm": 0.00019415217684581876, "learning_rate": 3.490143796233636e-08, "loss": 0.0, "num_input_tokens_seen": 227094512, "step": 105290 }, { "epoch": 19.32372912461002, "grad_norm": 0.00020563017460517585, "learning_rate": 3.4807053580239726e-08, "loss": 0.0, "num_input_tokens_seen": 227105424, "step": 105295 }, { "epoch": 19.324646724169572, "grad_norm": 9.100939496420324e-05, "learning_rate": 3.471279654764737e-08, "loss": 0.0, "num_input_tokens_seen": 227115888, "step": 105300 }, { "epoch": 19.325564323729125, "grad_norm": 5.4536343668587506e-05, "learning_rate": 3.461866686697624e-08, "loss": 0.0, "num_input_tokens_seen": 227125744, "step": 105305 }, { "epoch": 19.326481923288675, "grad_norm": 6.260697409743443e-05, "learning_rate": 3.4524664540640515e-08, "loss": 0.0, "num_input_tokens_seen": 227137136, "step": 105310 }, { "epoch": 19.32739952284823, "grad_norm": 0.00013621954713016748, "learning_rate": 3.4430789571051615e-08, "loss": 0.0, "num_input_tokens_seen": 227148432, "step": 105315 }, { "epoch": 19.328317122407782, "grad_norm": 0.0003483487234916538, "learning_rate": 3.4337041960616494e-08, "loss": 0.0, "num_input_tokens_seen": 227160080, "step": 105320 }, { "epoch": 19.32923472196733, "grad_norm": 8.857586362864822e-05, "learning_rate": 3.4243421711740996e-08, "loss": 0.0, "num_input_tokens_seen": 227171184, "step": 105325 }, { "epoch": 19.330152321526885, "grad_norm": 6.602924986509606e-05, "learning_rate": 3.414992882682433e-08, "loss": 0.0, "num_input_tokens_seen": 227181776, "step": 105330 }, { "epoch": 19.33106992108644, "grad_norm": 0.00015307540888898075, "learning_rate": 3.405656330826679e-08, "loss": 0.0, "num_input_tokens_seen": 227191216, "step": 105335 }, { "epoch": 19.33198752064599, "grad_norm": 4.3469826778164133e-05, "learning_rate": 3.396332515846146e-08, "loss": 0.0, "num_input_tokens_seen": 227202928, "step": 105340 }, { "epoch": 19.33290512020554, "grad_norm": 0.0003267607244197279, "learning_rate": 3.387021437979976e-08, "loss": 0.0, "num_input_tokens_seen": 227214032, "step": 105345 }, { "epoch": 19.333822719765095, "grad_norm": 8.649624942336231e-05, "learning_rate": 3.377723097467089e-08, "loss": 0.0, "num_input_tokens_seen": 227224720, "step": 105350 }, { "epoch": 19.334740319324645, "grad_norm": 0.0001340151793556288, "learning_rate": 3.3684374945458506e-08, "loss": 0.0, "num_input_tokens_seen": 227237232, "step": 105355 }, { "epoch": 19.3356579188842, "grad_norm": 0.00103692093398422, "learning_rate": 3.3591646294545146e-08, "loss": 0.0, "num_input_tokens_seen": 227248048, "step": 105360 }, { "epoch": 19.33657551844375, "grad_norm": 9.90066328085959e-05, "learning_rate": 3.34990450243089e-08, "loss": 0.0, "num_input_tokens_seen": 227258992, "step": 105365 }, { "epoch": 19.3374931180033, "grad_norm": 0.0001693109079496935, "learning_rate": 3.340657113712453e-08, "loss": 0.0, "num_input_tokens_seen": 227270288, "step": 105370 }, { "epoch": 19.338410717562855, "grad_norm": 0.00011660382733680308, "learning_rate": 3.3314224635364603e-08, "loss": 0.0, "num_input_tokens_seen": 227281776, "step": 105375 }, { "epoch": 19.33932831712241, "grad_norm": 0.006548000033944845, "learning_rate": 3.3222005521396097e-08, "loss": 0.0, "num_input_tokens_seen": 227292912, "step": 105380 }, { "epoch": 19.34024591668196, "grad_norm": 0.0001427802926627919, "learning_rate": 3.312991379758657e-08, "loss": 0.0, "num_input_tokens_seen": 227303408, "step": 105385 }, { "epoch": 19.34116351624151, "grad_norm": 0.0012517486466094851, "learning_rate": 3.303794946629635e-08, "loss": 0.0, "num_input_tokens_seen": 227313968, "step": 105390 }, { "epoch": 19.342081115801065, "grad_norm": 9.36250580707565e-05, "learning_rate": 3.294611252988411e-08, "loss": 0.0, "num_input_tokens_seen": 227325264, "step": 105395 }, { "epoch": 19.342998715360615, "grad_norm": 6.656989717157558e-05, "learning_rate": 3.2854402990706305e-08, "loss": 0.0, "num_input_tokens_seen": 227335504, "step": 105400 }, { "epoch": 19.34391631492017, "grad_norm": 0.00042197052971459925, "learning_rate": 3.276282085111493e-08, "loss": 0.0, "num_input_tokens_seen": 227345680, "step": 105405 }, { "epoch": 19.34483391447972, "grad_norm": 0.00014551926869899035, "learning_rate": 3.267136611345812e-08, "loss": 0.0, "num_input_tokens_seen": 227356912, "step": 105410 }, { "epoch": 19.34575151403927, "grad_norm": 0.00011284769425401464, "learning_rate": 3.2580038780082315e-08, "loss": 0.0, "num_input_tokens_seen": 227367024, "step": 105415 }, { "epoch": 19.346669113598825, "grad_norm": 0.004462110344320536, "learning_rate": 3.2488838853330096e-08, "loss": 0.0, "num_input_tokens_seen": 227377360, "step": 105420 }, { "epoch": 19.34758671315838, "grad_norm": 6.698020297335461e-05, "learning_rate": 3.239776633553959e-08, "loss": 0.0, "num_input_tokens_seen": 227387984, "step": 105425 }, { "epoch": 19.348504312717928, "grad_norm": 7.37574155209586e-05, "learning_rate": 3.2306821229047826e-08, "loss": 0.0, "num_input_tokens_seen": 227397840, "step": 105430 }, { "epoch": 19.34942191227748, "grad_norm": 0.0010424493812024593, "learning_rate": 3.221600353618681e-08, "loss": 0.0, "num_input_tokens_seen": 227408688, "step": 105435 }, { "epoch": 19.350339511837035, "grad_norm": 0.00013520708307623863, "learning_rate": 3.212531325928525e-08, "loss": 0.0, "num_input_tokens_seen": 227420496, "step": 105440 }, { "epoch": 19.351257111396585, "grad_norm": 0.00011576856195461005, "learning_rate": 3.203475040067072e-08, "loss": 0.0, "num_input_tokens_seen": 227430800, "step": 105445 }, { "epoch": 19.352174710956138, "grad_norm": 3.785676744882949e-05, "learning_rate": 3.19443149626647e-08, "loss": 0.0, "num_input_tokens_seen": 227441072, "step": 105450 }, { "epoch": 19.35309231051569, "grad_norm": 0.00010180986282648519, "learning_rate": 3.1854006947587554e-08, "loss": 0.0, "num_input_tokens_seen": 227451856, "step": 105455 }, { "epoch": 19.35400991007524, "grad_norm": 0.00036719811032526195, "learning_rate": 3.1763826357755214e-08, "loss": 0.0, "num_input_tokens_seen": 227464016, "step": 105460 }, { "epoch": 19.354927509634795, "grad_norm": 7.02727265888825e-05, "learning_rate": 3.167377319548026e-08, "loss": 0.0, "num_input_tokens_seen": 227474448, "step": 105465 }, { "epoch": 19.355845109194348, "grad_norm": 0.00025204880512319505, "learning_rate": 3.158384746307308e-08, "loss": 0.0, "num_input_tokens_seen": 227485840, "step": 105470 }, { "epoch": 19.356762708753898, "grad_norm": 0.0004157026414759457, "learning_rate": 3.1494049162840155e-08, "loss": 0.0, "num_input_tokens_seen": 227495568, "step": 105475 }, { "epoch": 19.35768030831345, "grad_norm": 7.398334128083661e-05, "learning_rate": 3.140437829708354e-08, "loss": 0.0, "num_input_tokens_seen": 227506288, "step": 105480 }, { "epoch": 19.358597907873005, "grad_norm": 0.0012148503446951509, "learning_rate": 3.131483486810472e-08, "loss": 0.0, "num_input_tokens_seen": 227518544, "step": 105485 }, { "epoch": 19.359515507432555, "grad_norm": 4.5416978537105024e-05, "learning_rate": 3.1225418878199635e-08, "loss": 0.0, "num_input_tokens_seen": 227529136, "step": 105490 }, { "epoch": 19.360433106992108, "grad_norm": 0.00013898912584409118, "learning_rate": 3.1136130329661455e-08, "loss": 0.0, "num_input_tokens_seen": 227540464, "step": 105495 }, { "epoch": 19.36135070655166, "grad_norm": 7.71589984651655e-05, "learning_rate": 3.104696922478057e-08, "loss": 0.0, "num_input_tokens_seen": 227550928, "step": 105500 }, { "epoch": 19.36226830611121, "grad_norm": 0.002467480953782797, "learning_rate": 3.095793556584348e-08, "loss": 0.0, "num_input_tokens_seen": 227561648, "step": 105505 }, { "epoch": 19.363185905670765, "grad_norm": 0.00012087136565241963, "learning_rate": 3.0869029355134474e-08, "loss": 0.0, "num_input_tokens_seen": 227572976, "step": 105510 }, { "epoch": 19.364103505230318, "grad_norm": 5.91083153267391e-05, "learning_rate": 3.0780250594932836e-08, "loss": 0.0, "num_input_tokens_seen": 227583824, "step": 105515 }, { "epoch": 19.365021104789868, "grad_norm": 0.005735411308705807, "learning_rate": 3.069159928751675e-08, "loss": 0.0, "num_input_tokens_seen": 227595856, "step": 105520 }, { "epoch": 19.36593870434942, "grad_norm": 7.498998456867412e-05, "learning_rate": 3.0603075435158836e-08, "loss": 0.0, "num_input_tokens_seen": 227606352, "step": 105525 }, { "epoch": 19.366856303908975, "grad_norm": 6.040549124008976e-05, "learning_rate": 3.0514679040130614e-08, "loss": 0.0, "num_input_tokens_seen": 227616720, "step": 105530 }, { "epoch": 19.367773903468525, "grad_norm": 0.00015060305304359645, "learning_rate": 3.04264101046986e-08, "loss": 0.0, "num_input_tokens_seen": 227628624, "step": 105535 }, { "epoch": 19.368691503028078, "grad_norm": 0.0008050713804550469, "learning_rate": 3.03382686311271e-08, "loss": 0.0, "num_input_tokens_seen": 227638960, "step": 105540 }, { "epoch": 19.36960910258763, "grad_norm": 0.00032017662306316197, "learning_rate": 3.0250254621677077e-08, "loss": 0.0, "num_input_tokens_seen": 227650704, "step": 105545 }, { "epoch": 19.37052670214718, "grad_norm": 6.722759280819446e-05, "learning_rate": 3.016236807860506e-08, "loss": 0.0, "num_input_tokens_seen": 227661392, "step": 105550 }, { "epoch": 19.371444301706735, "grad_norm": 0.0005831619491800666, "learning_rate": 3.007460900416592e-08, "loss": 0.0, "num_input_tokens_seen": 227671504, "step": 105555 }, { "epoch": 19.372361901266288, "grad_norm": 0.0012192960130050778, "learning_rate": 2.998697740061063e-08, "loss": 0.0, "num_input_tokens_seen": 227682480, "step": 105560 }, { "epoch": 19.373279500825838, "grad_norm": 7.874715083744377e-05, "learning_rate": 2.9899473270186276e-08, "loss": 0.0, "num_input_tokens_seen": 227693488, "step": 105565 }, { "epoch": 19.37419710038539, "grad_norm": 6.358668906614184e-05, "learning_rate": 2.981209661513773e-08, "loss": 0.0, "num_input_tokens_seen": 227703760, "step": 105570 }, { "epoch": 19.375114699944945, "grad_norm": 0.00010400104656582698, "learning_rate": 2.9724847437705428e-08, "loss": 0.0, "num_input_tokens_seen": 227714032, "step": 105575 }, { "epoch": 19.376032299504494, "grad_norm": 0.0005047513986937702, "learning_rate": 2.9637725740127578e-08, "loss": 0.0, "num_input_tokens_seen": 227723344, "step": 105580 }, { "epoch": 19.376949899064048, "grad_norm": 5.0326674681855366e-05, "learning_rate": 2.9550731524639053e-08, "loss": 0.0, "num_input_tokens_seen": 227733872, "step": 105585 }, { "epoch": 19.3778674986236, "grad_norm": 6.337231025099754e-05, "learning_rate": 2.9463864793470853e-08, "loss": 0.0, "num_input_tokens_seen": 227743696, "step": 105590 }, { "epoch": 19.37878509818315, "grad_norm": 8.629118383396417e-05, "learning_rate": 2.9377125548850638e-08, "loss": 0.0, "num_input_tokens_seen": 227754704, "step": 105595 }, { "epoch": 19.379702697742704, "grad_norm": 0.00012296209752094, "learning_rate": 2.9290513793003294e-08, "loss": 0.0, "num_input_tokens_seen": 227766480, "step": 105600 }, { "epoch": 19.380620297302258, "grad_norm": 0.0001368658267892897, "learning_rate": 2.9204029528150378e-08, "loss": 0.0, "num_input_tokens_seen": 227778096, "step": 105605 }, { "epoch": 19.381537896861808, "grad_norm": 0.00030407257145270705, "learning_rate": 2.9117672756510673e-08, "loss": 0.0, "num_input_tokens_seen": 227789072, "step": 105610 }, { "epoch": 19.38245549642136, "grad_norm": 7.170392200350761e-05, "learning_rate": 2.9031443480297406e-08, "loss": 0.0, "num_input_tokens_seen": 227800048, "step": 105615 }, { "epoch": 19.383373095980915, "grad_norm": 0.00021700403885915875, "learning_rate": 2.894534170172436e-08, "loss": 0.0, "num_input_tokens_seen": 227811056, "step": 105620 }, { "epoch": 19.384290695540464, "grad_norm": 9.285874693887308e-05, "learning_rate": 2.8859367422998108e-08, "loss": 0.0, "num_input_tokens_seen": 227822544, "step": 105625 }, { "epoch": 19.385208295100018, "grad_norm": 0.00533651327714324, "learning_rate": 2.8773520646325214e-08, "loss": 0.0, "num_input_tokens_seen": 227832880, "step": 105630 }, { "epoch": 19.38612589465957, "grad_norm": 7.509940769523382e-05, "learning_rate": 2.8687801373906142e-08, "loss": 0.0, "num_input_tokens_seen": 227843344, "step": 105635 }, { "epoch": 19.38704349421912, "grad_norm": 8.708989480510354e-05, "learning_rate": 2.8602209607940247e-08, "loss": 0.0, "num_input_tokens_seen": 227852688, "step": 105640 }, { "epoch": 19.387961093778674, "grad_norm": 0.01413606759160757, "learning_rate": 2.8516745350622987e-08, "loss": 0.0, "num_input_tokens_seen": 227862992, "step": 105645 }, { "epoch": 19.388878693338228, "grad_norm": 9.317765216110274e-05, "learning_rate": 2.8431408604145948e-08, "loss": 0.0, "num_input_tokens_seen": 227873424, "step": 105650 }, { "epoch": 19.389796292897778, "grad_norm": 0.00020022223179694265, "learning_rate": 2.8346199370698492e-08, "loss": 0.0, "num_input_tokens_seen": 227884240, "step": 105655 }, { "epoch": 19.39071389245733, "grad_norm": 7.037428440526128e-05, "learning_rate": 2.8261117652464974e-08, "loss": 0.0, "num_input_tokens_seen": 227895376, "step": 105660 }, { "epoch": 19.391631492016884, "grad_norm": 0.00012936470739077777, "learning_rate": 2.8176163451628656e-08, "loss": 0.0, "num_input_tokens_seen": 227905200, "step": 105665 }, { "epoch": 19.392549091576434, "grad_norm": 8.433518814854324e-05, "learning_rate": 2.8091336770367794e-08, "loss": 0.0, "num_input_tokens_seen": 227914672, "step": 105670 }, { "epoch": 19.393466691135988, "grad_norm": 0.00041240229620598257, "learning_rate": 2.8006637610858976e-08, "loss": 0.0, "num_input_tokens_seen": 227925136, "step": 105675 }, { "epoch": 19.39438429069554, "grad_norm": 4.243976582074538e-05, "learning_rate": 2.7922065975273806e-08, "loss": 0.0, "num_input_tokens_seen": 227935568, "step": 105680 }, { "epoch": 19.39530189025509, "grad_norm": 0.00033833261113613844, "learning_rate": 2.7837621865781094e-08, "loss": 0.0, "num_input_tokens_seen": 227946640, "step": 105685 }, { "epoch": 19.396219489814644, "grad_norm": 0.00012179219629615545, "learning_rate": 2.775330528454745e-08, "loss": 0.0, "num_input_tokens_seen": 227957584, "step": 105690 }, { "epoch": 19.397137089374198, "grad_norm": 6.156572635518387e-05, "learning_rate": 2.7669116233735584e-08, "loss": 0.0, "num_input_tokens_seen": 227968752, "step": 105695 }, { "epoch": 19.398054688933748, "grad_norm": 6.697713979519904e-05, "learning_rate": 2.7585054715504324e-08, "loss": 0.0, "num_input_tokens_seen": 227979344, "step": 105700 }, { "epoch": 19.3989722884933, "grad_norm": 5.6899221817729995e-05, "learning_rate": 2.7501120732009722e-08, "loss": 0.0, "num_input_tokens_seen": 227990224, "step": 105705 }, { "epoch": 19.399889888052854, "grad_norm": 0.0008975224918685853, "learning_rate": 2.7417314285405062e-08, "loss": 0.0, "num_input_tokens_seen": 228000784, "step": 105710 }, { "epoch": 19.400807487612404, "grad_norm": 0.0006252197781577706, "learning_rate": 2.733363537783862e-08, "loss": 0.0, "num_input_tokens_seen": 228011536, "step": 105715 }, { "epoch": 19.401725087171958, "grad_norm": 4.404336505103856e-05, "learning_rate": 2.7250084011458122e-08, "loss": 0.0, "num_input_tokens_seen": 228021904, "step": 105720 }, { "epoch": 19.40264268673151, "grad_norm": 0.00024050472711678594, "learning_rate": 2.71666601884063e-08, "loss": 0.0, "num_input_tokens_seen": 228032528, "step": 105725 }, { "epoch": 19.40356028629106, "grad_norm": 9.25688145798631e-05, "learning_rate": 2.7083363910822004e-08, "loss": 0.0, "num_input_tokens_seen": 228042928, "step": 105730 }, { "epoch": 19.404477885850614, "grad_norm": 0.0005808359710499644, "learning_rate": 2.7000195180841848e-08, "loss": 0.0, "num_input_tokens_seen": 228054416, "step": 105735 }, { "epoch": 19.405395485410168, "grad_norm": 0.00010284404561389238, "learning_rate": 2.6917154000599688e-08, "loss": 0.0, "num_input_tokens_seen": 228065616, "step": 105740 }, { "epoch": 19.40631308496972, "grad_norm": 5.735021841246635e-05, "learning_rate": 2.683424037222493e-08, "loss": 0.0, "num_input_tokens_seen": 228076784, "step": 105745 }, { "epoch": 19.40723068452927, "grad_norm": 0.00039317470509558916, "learning_rate": 2.675145429784365e-08, "loss": 0.0, "num_input_tokens_seen": 228087408, "step": 105750 }, { "epoch": 19.408148284088824, "grad_norm": 0.008649945259094238, "learning_rate": 2.666879577958026e-08, "loss": 0.0, "num_input_tokens_seen": 228099888, "step": 105755 }, { "epoch": 19.409065883648378, "grad_norm": 8.915305807022378e-05, "learning_rate": 2.6586264819554175e-08, "loss": 0.0, "num_input_tokens_seen": 228111312, "step": 105760 }, { "epoch": 19.409983483207927, "grad_norm": 0.0007985577685758471, "learning_rate": 2.6503861419882036e-08, "loss": 0.0, "num_input_tokens_seen": 228120560, "step": 105765 }, { "epoch": 19.41090108276748, "grad_norm": 0.0005020176176913083, "learning_rate": 2.6421585582678266e-08, "loss": 0.0, "num_input_tokens_seen": 228132112, "step": 105770 }, { "epoch": 19.411818682327034, "grad_norm": 3.776254743570462e-05, "learning_rate": 2.633943731005173e-08, "loss": 0.0, "num_input_tokens_seen": 228142320, "step": 105775 }, { "epoch": 19.412736281886584, "grad_norm": 0.0001638582325540483, "learning_rate": 2.6257416604110742e-08, "loss": 0.0, "num_input_tokens_seen": 228153456, "step": 105780 }, { "epoch": 19.413653881446137, "grad_norm": 0.00013796576240565628, "learning_rate": 2.6175523466958063e-08, "loss": 0.0, "num_input_tokens_seen": 228162832, "step": 105785 }, { "epoch": 19.41457148100569, "grad_norm": 0.00013941062206868082, "learning_rate": 2.6093757900694795e-08, "loss": 0.0, "num_input_tokens_seen": 228173776, "step": 105790 }, { "epoch": 19.41548908056524, "grad_norm": 0.00035601810668595135, "learning_rate": 2.601211990741759e-08, "loss": 0.0, "num_input_tokens_seen": 228184624, "step": 105795 }, { "epoch": 19.416406680124794, "grad_norm": 0.00038088145083747804, "learning_rate": 2.5930609489220327e-08, "loss": 0.0, "num_input_tokens_seen": 228195472, "step": 105800 }, { "epoch": 19.417324279684347, "grad_norm": 8.545996388420463e-05, "learning_rate": 2.5849226648194115e-08, "loss": 0.0, "num_input_tokens_seen": 228206800, "step": 105805 }, { "epoch": 19.418241879243897, "grad_norm": 7.207535236375406e-05, "learning_rate": 2.5767971386425616e-08, "loss": 0.0, "num_input_tokens_seen": 228216368, "step": 105810 }, { "epoch": 19.41915947880345, "grad_norm": 0.00013478212349582464, "learning_rate": 2.568684370599983e-08, "loss": 0.0, "num_input_tokens_seen": 228226128, "step": 105815 }, { "epoch": 19.420077078363004, "grad_norm": 8.387106208829209e-05, "learning_rate": 2.560584360899676e-08, "loss": 0.0, "num_input_tokens_seen": 228236624, "step": 105820 }, { "epoch": 19.420994677922554, "grad_norm": 7.520960207330063e-05, "learning_rate": 2.552497109749419e-08, "loss": 0.0, "num_input_tokens_seen": 228247344, "step": 105825 }, { "epoch": 19.421912277482107, "grad_norm": 0.0039033740758895874, "learning_rate": 2.5444226173566565e-08, "loss": 0.0, "num_input_tokens_seen": 228258032, "step": 105830 }, { "epoch": 19.42282987704166, "grad_norm": 8.189569052774459e-05, "learning_rate": 2.5363608839283905e-08, "loss": 0.0, "num_input_tokens_seen": 228269424, "step": 105835 }, { "epoch": 19.42374747660121, "grad_norm": 5.206226705922745e-05, "learning_rate": 2.5283119096715658e-08, "loss": 0.0, "num_input_tokens_seen": 228278288, "step": 105840 }, { "epoch": 19.424665076160764, "grad_norm": 0.0006917500286363065, "learning_rate": 2.5202756947925178e-08, "loss": 0.0, "num_input_tokens_seen": 228289552, "step": 105845 }, { "epoch": 19.425582675720317, "grad_norm": 6.0859136283397675e-05, "learning_rate": 2.5122522394973037e-08, "loss": 0.0, "num_input_tokens_seen": 228299664, "step": 105850 }, { "epoch": 19.426500275279867, "grad_norm": 8.788672130322084e-05, "learning_rate": 2.5042415439918145e-08, "loss": 0.0, "num_input_tokens_seen": 228309200, "step": 105855 }, { "epoch": 19.42741787483942, "grad_norm": 7.957030175020918e-05, "learning_rate": 2.4962436084814966e-08, "loss": 0.0, "num_input_tokens_seen": 228319824, "step": 105860 }, { "epoch": 19.428335474398974, "grad_norm": 5.876452996744774e-05, "learning_rate": 2.48825843317152e-08, "loss": 0.0, "num_input_tokens_seen": 228329936, "step": 105865 }, { "epoch": 19.429253073958524, "grad_norm": 5.317258546710946e-05, "learning_rate": 2.4802860182665533e-08, "loss": 0.0, "num_input_tokens_seen": 228341008, "step": 105870 }, { "epoch": 19.430170673518077, "grad_norm": 4.345148045103997e-05, "learning_rate": 2.4723263639712114e-08, "loss": 0.0, "num_input_tokens_seen": 228351760, "step": 105875 }, { "epoch": 19.43108827307763, "grad_norm": 7.251588249346241e-05, "learning_rate": 2.4643794704896083e-08, "loss": 0.0, "num_input_tokens_seen": 228361936, "step": 105880 }, { "epoch": 19.43200587263718, "grad_norm": 0.00017204388859681785, "learning_rate": 2.456445338025526e-08, "loss": 0.0, "num_input_tokens_seen": 228372944, "step": 105885 }, { "epoch": 19.432923472196734, "grad_norm": 7.502431981265545e-05, "learning_rate": 2.4485239667825234e-08, "loss": 0.0, "num_input_tokens_seen": 228382800, "step": 105890 }, { "epoch": 19.433841071756287, "grad_norm": 0.00040286415605805814, "learning_rate": 2.4406153569637157e-08, "loss": 0.0, "num_input_tokens_seen": 228393616, "step": 105895 }, { "epoch": 19.434758671315837, "grad_norm": 0.0007525795372202992, "learning_rate": 2.432719508771997e-08, "loss": 0.0, "num_input_tokens_seen": 228405520, "step": 105900 }, { "epoch": 19.43567627087539, "grad_norm": 0.0007456834428012371, "learning_rate": 2.424836422409871e-08, "loss": 0.0, "num_input_tokens_seen": 228415632, "step": 105905 }, { "epoch": 19.436593870434944, "grad_norm": 8.312408317578956e-05, "learning_rate": 2.4169660980795095e-08, "loss": 0.0, "num_input_tokens_seen": 228426992, "step": 105910 }, { "epoch": 19.437511469994494, "grad_norm": 0.0002014144993154332, "learning_rate": 2.409108535982807e-08, "loss": 0.0, "num_input_tokens_seen": 228438128, "step": 105915 }, { "epoch": 19.438429069554047, "grad_norm": 7.588474545627832e-05, "learning_rate": 2.4012637363212133e-08, "loss": 0.0, "num_input_tokens_seen": 228448720, "step": 105920 }, { "epoch": 19.4393466691136, "grad_norm": 0.000162579701282084, "learning_rate": 2.3934316992960673e-08, "loss": 0.0, "num_input_tokens_seen": 228458640, "step": 105925 }, { "epoch": 19.44026426867315, "grad_norm": 0.0018196284072473645, "learning_rate": 2.3856124251081525e-08, "loss": 0.0, "num_input_tokens_seen": 228469392, "step": 105930 }, { "epoch": 19.441181868232704, "grad_norm": 0.00022704513685312122, "learning_rate": 2.377805913958031e-08, "loss": 0.0, "num_input_tokens_seen": 228479952, "step": 105935 }, { "epoch": 19.442099467792257, "grad_norm": 0.0006215041503310204, "learning_rate": 2.370012166045932e-08, "loss": 0.0, "num_input_tokens_seen": 228491728, "step": 105940 }, { "epoch": 19.443017067351807, "grad_norm": 0.0001631277846172452, "learning_rate": 2.3622311815718058e-08, "loss": 0.0, "num_input_tokens_seen": 228501456, "step": 105945 }, { "epoch": 19.44393466691136, "grad_norm": 0.00013471819693222642, "learning_rate": 2.35446296073516e-08, "loss": 0.0, "num_input_tokens_seen": 228511472, "step": 105950 }, { "epoch": 19.444852266470914, "grad_norm": 8.790908032096922e-05, "learning_rate": 2.3467075037352795e-08, "loss": 0.0, "num_input_tokens_seen": 228522384, "step": 105955 }, { "epoch": 19.445769866030464, "grad_norm": 0.00013122742529958487, "learning_rate": 2.3389648107710605e-08, "loss": 0.0, "num_input_tokens_seen": 228533040, "step": 105960 }, { "epoch": 19.446687465590017, "grad_norm": 6.146902887849137e-05, "learning_rate": 2.3312348820410668e-08, "loss": 0.0, "num_input_tokens_seen": 228542288, "step": 105965 }, { "epoch": 19.44760506514957, "grad_norm": 8.138611883623526e-05, "learning_rate": 2.3235177177435287e-08, "loss": 0.0, "num_input_tokens_seen": 228553584, "step": 105970 }, { "epoch": 19.44852266470912, "grad_norm": 9.776390652405098e-05, "learning_rate": 2.3158133180765097e-08, "loss": 0.0, "num_input_tokens_seen": 228564400, "step": 105975 }, { "epoch": 19.449440264268674, "grad_norm": 7.29749008314684e-05, "learning_rate": 2.3081216832375187e-08, "loss": 0.0, "num_input_tokens_seen": 228576528, "step": 105980 }, { "epoch": 19.450357863828227, "grad_norm": 0.00029835535679012537, "learning_rate": 2.3004428134238423e-08, "loss": 0.0, "num_input_tokens_seen": 228586928, "step": 105985 }, { "epoch": 19.451275463387777, "grad_norm": 0.0009575590956956148, "learning_rate": 2.2927767088324338e-08, "loss": 0.0, "num_input_tokens_seen": 228597744, "step": 105990 }, { "epoch": 19.45219306294733, "grad_norm": 0.00015609196270816028, "learning_rate": 2.2851233696599696e-08, "loss": 0.0, "num_input_tokens_seen": 228607600, "step": 105995 }, { "epoch": 19.453110662506884, "grad_norm": 0.00011132985673611984, "learning_rate": 2.277482796102681e-08, "loss": 0.0, "num_input_tokens_seen": 228616880, "step": 106000 }, { "epoch": 19.454028262066434, "grad_norm": 0.00010777961870189756, "learning_rate": 2.269854988356579e-08, "loss": 0.0, "num_input_tokens_seen": 228627312, "step": 106005 }, { "epoch": 19.454945861625987, "grad_norm": 0.00011224547779420391, "learning_rate": 2.2622399466172286e-08, "loss": 0.0, "num_input_tokens_seen": 228637712, "step": 106010 }, { "epoch": 19.45586346118554, "grad_norm": 5.42380366823636e-05, "learning_rate": 2.2546376710800287e-08, "loss": 0.0, "num_input_tokens_seen": 228648016, "step": 106015 }, { "epoch": 19.45678106074509, "grad_norm": 6.258556095417589e-05, "learning_rate": 2.2470481619399354e-08, "loss": 0.0, "num_input_tokens_seen": 228659632, "step": 106020 }, { "epoch": 19.457698660304644, "grad_norm": 4.99754496559035e-05, "learning_rate": 2.2394714193916257e-08, "loss": 0.0, "num_input_tokens_seen": 228670608, "step": 106025 }, { "epoch": 19.458616259864197, "grad_norm": 8.961561252363026e-05, "learning_rate": 2.2319074436294442e-08, "loss": 0.0, "num_input_tokens_seen": 228682736, "step": 106030 }, { "epoch": 19.459533859423747, "grad_norm": 0.0002100073324982077, "learning_rate": 2.2243562348472915e-08, "loss": 0.0, "num_input_tokens_seen": 228694224, "step": 106035 }, { "epoch": 19.4604514589833, "grad_norm": 0.00011379117495380342, "learning_rate": 2.2168177932389566e-08, "loss": 0.0, "num_input_tokens_seen": 228704592, "step": 106040 }, { "epoch": 19.461369058542854, "grad_norm": 0.0001375834981445223, "learning_rate": 2.2092921189977856e-08, "loss": 0.0, "num_input_tokens_seen": 228715728, "step": 106045 }, { "epoch": 19.462286658102403, "grad_norm": 0.0009586148662492633, "learning_rate": 2.2017792123167348e-08, "loss": 0.0, "num_input_tokens_seen": 228725968, "step": 106050 }, { "epoch": 19.463204257661957, "grad_norm": 0.00017160132119897753, "learning_rate": 2.1942790733884833e-08, "loss": 0.0, "num_input_tokens_seen": 228736208, "step": 106055 }, { "epoch": 19.46412185722151, "grad_norm": 0.00011243273911532015, "learning_rate": 2.1867917024054886e-08, "loss": 0.0, "num_input_tokens_seen": 228747024, "step": 106060 }, { "epoch": 19.46503945678106, "grad_norm": 0.0004359486629255116, "learning_rate": 2.1793170995597636e-08, "loss": 0.0, "num_input_tokens_seen": 228758224, "step": 106065 }, { "epoch": 19.465957056340613, "grad_norm": 5.581090590567328e-05, "learning_rate": 2.171855265042988e-08, "loss": 0.0, "num_input_tokens_seen": 228768176, "step": 106070 }, { "epoch": 19.466874655900167, "grad_norm": 9.570734255248681e-05, "learning_rate": 2.1644061990465647e-08, "loss": 0.0, "num_input_tokens_seen": 228780560, "step": 106075 }, { "epoch": 19.467792255459717, "grad_norm": 0.00011246942449361086, "learning_rate": 2.1569699017615076e-08, "loss": 0.0, "num_input_tokens_seen": 228792304, "step": 106080 }, { "epoch": 19.46870985501927, "grad_norm": 0.0013311798684298992, "learning_rate": 2.1495463733786082e-08, "loss": 0.0, "num_input_tokens_seen": 228804400, "step": 106085 }, { "epoch": 19.469627454578823, "grad_norm": 0.00010049495904240757, "learning_rate": 2.14213561408827e-08, "loss": 0.0, "num_input_tokens_seen": 228814384, "step": 106090 }, { "epoch": 19.470545054138373, "grad_norm": 0.006955542601644993, "learning_rate": 2.1347376240805627e-08, "loss": 0.0, "num_input_tokens_seen": 228825808, "step": 106095 }, { "epoch": 19.471462653697927, "grad_norm": 0.0014620739966630936, "learning_rate": 2.1273524035451687e-08, "loss": 0.0, "num_input_tokens_seen": 228837616, "step": 106100 }, { "epoch": 19.47238025325748, "grad_norm": 6.74713883199729e-05, "learning_rate": 2.119979952671547e-08, "loss": 0.0, "num_input_tokens_seen": 228849328, "step": 106105 }, { "epoch": 19.47329785281703, "grad_norm": 0.0011382750235497952, "learning_rate": 2.112620271648824e-08, "loss": 0.0, "num_input_tokens_seen": 228860560, "step": 106110 }, { "epoch": 19.474215452376583, "grad_norm": 0.0001409446558682248, "learning_rate": 2.1052733606657382e-08, "loss": 0.0, "num_input_tokens_seen": 228870864, "step": 106115 }, { "epoch": 19.475133051936137, "grad_norm": 0.0006514985579997301, "learning_rate": 2.097939219910694e-08, "loss": 0.0, "num_input_tokens_seen": 228882320, "step": 106120 }, { "epoch": 19.476050651495687, "grad_norm": 0.00011448860459495336, "learning_rate": 2.0906178495718187e-08, "loss": 0.0, "num_input_tokens_seen": 228894192, "step": 106125 }, { "epoch": 19.47696825105524, "grad_norm": 6.103545456426218e-05, "learning_rate": 2.0833092498369624e-08, "loss": 0.0, "num_input_tokens_seen": 228905744, "step": 106130 }, { "epoch": 19.477885850614793, "grad_norm": 8.931959018809721e-05, "learning_rate": 2.0760134208934747e-08, "loss": 0.0, "num_input_tokens_seen": 228916304, "step": 106135 }, { "epoch": 19.478803450174343, "grad_norm": 8.624937618151307e-05, "learning_rate": 2.0687303629285393e-08, "loss": 0.0, "num_input_tokens_seen": 228927344, "step": 106140 }, { "epoch": 19.479721049733897, "grad_norm": 7.316987466765568e-05, "learning_rate": 2.061460076129007e-08, "loss": 0.0, "num_input_tokens_seen": 228938640, "step": 106145 }, { "epoch": 19.48063864929345, "grad_norm": 0.00020989628683310002, "learning_rate": 2.0542025606812287e-08, "loss": 0.0, "num_input_tokens_seen": 228950288, "step": 106150 }, { "epoch": 19.481556248853, "grad_norm": 6.230292638065293e-05, "learning_rate": 2.046957816771389e-08, "loss": 0.0, "num_input_tokens_seen": 228961104, "step": 106155 }, { "epoch": 19.482473848412553, "grad_norm": 5.3843672503717244e-05, "learning_rate": 2.039725844585394e-08, "loss": 0.0, "num_input_tokens_seen": 228971696, "step": 106160 }, { "epoch": 19.483391447972107, "grad_norm": 0.0006339914980344474, "learning_rate": 2.0325066443085962e-08, "loss": 0.0, "num_input_tokens_seen": 228983120, "step": 106165 }, { "epoch": 19.484309047531656, "grad_norm": 0.0003650745202321559, "learning_rate": 2.0253002161262358e-08, "loss": 0.0, "num_input_tokens_seen": 228993520, "step": 106170 }, { "epoch": 19.48522664709121, "grad_norm": 6.95780836394988e-05, "learning_rate": 2.0181065602231652e-08, "loss": 0.0, "num_input_tokens_seen": 229004080, "step": 106175 }, { "epoch": 19.486144246650763, "grad_norm": 0.00023169501218944788, "learning_rate": 2.010925676783848e-08, "loss": 0.0, "num_input_tokens_seen": 229015984, "step": 106180 }, { "epoch": 19.487061846210313, "grad_norm": 7.823905616533011e-05, "learning_rate": 2.0037575659924703e-08, "loss": 0.0, "num_input_tokens_seen": 229026448, "step": 106185 }, { "epoch": 19.487979445769867, "grad_norm": 0.00012900946603622288, "learning_rate": 1.9966022280328845e-08, "loss": 0.0, "num_input_tokens_seen": 229038096, "step": 106190 }, { "epoch": 19.48889704532942, "grad_norm": 0.00014530960470438004, "learning_rate": 1.9894596630886108e-08, "loss": 0.0, "num_input_tokens_seen": 229048656, "step": 106195 }, { "epoch": 19.48981464488897, "grad_norm": 0.0029455209150910378, "learning_rate": 1.9823298713428363e-08, "loss": 0.0, "num_input_tokens_seen": 229059248, "step": 106200 }, { "epoch": 19.490732244448523, "grad_norm": 6.214142922544852e-05, "learning_rate": 1.9752128529784696e-08, "loss": 0.0, "num_input_tokens_seen": 229070032, "step": 106205 }, { "epoch": 19.491649844008077, "grad_norm": 0.00031472466071136296, "learning_rate": 1.968108608178032e-08, "loss": 0.0, "num_input_tokens_seen": 229080720, "step": 106210 }, { "epoch": 19.492567443567626, "grad_norm": 0.00017701691831462085, "learning_rate": 1.9610171371237107e-08, "loss": 0.0, "num_input_tokens_seen": 229092144, "step": 106215 }, { "epoch": 19.49348504312718, "grad_norm": 5.872662222827785e-05, "learning_rate": 1.953938439997416e-08, "loss": 0.0, "num_input_tokens_seen": 229102384, "step": 106220 }, { "epoch": 19.494402642686733, "grad_norm": 0.0006148869288153946, "learning_rate": 1.946872516980669e-08, "loss": 0.0, "num_input_tokens_seen": 229113680, "step": 106225 }, { "epoch": 19.495320242246283, "grad_norm": 0.00012708066788036376, "learning_rate": 1.9398193682547693e-08, "loss": 0.0, "num_input_tokens_seen": 229125232, "step": 106230 }, { "epoch": 19.496237841805836, "grad_norm": 0.00138440215960145, "learning_rate": 1.9327789940005727e-08, "loss": 0.0, "num_input_tokens_seen": 229136240, "step": 106235 }, { "epoch": 19.49715544136539, "grad_norm": 0.000122555808047764, "learning_rate": 1.925751394398656e-08, "loss": 0.0, "num_input_tokens_seen": 229147600, "step": 106240 }, { "epoch": 19.49807304092494, "grad_norm": 0.00013230214244686067, "learning_rate": 1.9187365696292647e-08, "loss": 0.0, "num_input_tokens_seen": 229158928, "step": 106245 }, { "epoch": 19.498990640484493, "grad_norm": 7.183167326729745e-05, "learning_rate": 1.911734519872366e-08, "loss": 0.0, "num_input_tokens_seen": 229170224, "step": 106250 }, { "epoch": 19.499908240044046, "grad_norm": 0.00012729571608360857, "learning_rate": 1.9047452453074268e-08, "loss": 0.0, "num_input_tokens_seen": 229181936, "step": 106255 }, { "epoch": 19.500825839603596, "grad_norm": 4.854392318520695e-05, "learning_rate": 1.8977687461138596e-08, "loss": 0.0, "num_input_tokens_seen": 229192208, "step": 106260 }, { "epoch": 19.50174343916315, "grad_norm": 4.383219493320212e-05, "learning_rate": 1.890805022470521e-08, "loss": 0.0, "num_input_tokens_seen": 229201648, "step": 106265 }, { "epoch": 19.502661038722703, "grad_norm": 0.00044287837226875126, "learning_rate": 1.8838540745560465e-08, "loss": 0.0, "num_input_tokens_seen": 229212976, "step": 106270 }, { "epoch": 19.503578638282253, "grad_norm": 6.161651253933087e-05, "learning_rate": 1.8769159025486816e-08, "loss": 0.0, "num_input_tokens_seen": 229222896, "step": 106275 }, { "epoch": 19.504496237841806, "grad_norm": 6.987110828049481e-05, "learning_rate": 1.8699905066263958e-08, "loss": 0.0, "num_input_tokens_seen": 229233776, "step": 106280 }, { "epoch": 19.50541383740136, "grad_norm": 0.00010751489753602073, "learning_rate": 1.8630778869668244e-08, "loss": 0.0, "num_input_tokens_seen": 229246416, "step": 106285 }, { "epoch": 19.50633143696091, "grad_norm": 0.0003765913425013423, "learning_rate": 1.8561780437473254e-08, "loss": 0.0, "num_input_tokens_seen": 229257552, "step": 106290 }, { "epoch": 19.507249036520463, "grad_norm": 0.00023930998577270657, "learning_rate": 1.8492909771447575e-08, "loss": 0.0, "num_input_tokens_seen": 229267760, "step": 106295 }, { "epoch": 19.508166636080016, "grad_norm": 8.425147098023444e-05, "learning_rate": 1.8424166873357573e-08, "loss": 0.0, "num_input_tokens_seen": 229279664, "step": 106300 }, { "epoch": 19.509084235639566, "grad_norm": 0.0016539416974410415, "learning_rate": 1.8355551744967947e-08, "loss": 0.0, "num_input_tokens_seen": 229291984, "step": 106305 }, { "epoch": 19.51000183519912, "grad_norm": 3.618825940066017e-05, "learning_rate": 1.8287064388036736e-08, "loss": 0.0, "num_input_tokens_seen": 229302928, "step": 106310 }, { "epoch": 19.510919434758673, "grad_norm": 3.9223166822921485e-05, "learning_rate": 1.8218704804321973e-08, "loss": 0.0, "num_input_tokens_seen": 229312976, "step": 106315 }, { "epoch": 19.511837034318223, "grad_norm": 9.263995889341459e-05, "learning_rate": 1.815047299557615e-08, "loss": 0.0, "num_input_tokens_seen": 229324176, "step": 106320 }, { "epoch": 19.512754633877776, "grad_norm": 0.0009533480042591691, "learning_rate": 1.8082368963549533e-08, "loss": 0.0, "num_input_tokens_seen": 229336304, "step": 106325 }, { "epoch": 19.51367223343733, "grad_norm": 0.0009081107564270496, "learning_rate": 1.801439270998906e-08, "loss": 0.0, "num_input_tokens_seen": 229347376, "step": 106330 }, { "epoch": 19.51458983299688, "grad_norm": 9.861899161478505e-05, "learning_rate": 1.7946544236637774e-08, "loss": 0.0, "num_input_tokens_seen": 229357648, "step": 106335 }, { "epoch": 19.515507432556433, "grad_norm": 7.792914402671158e-05, "learning_rate": 1.7878823545235956e-08, "loss": 0.0, "num_input_tokens_seen": 229367792, "step": 106340 }, { "epoch": 19.516425032115986, "grad_norm": 6.809912883909419e-05, "learning_rate": 1.7811230637521105e-08, "loss": 0.0, "num_input_tokens_seen": 229378032, "step": 106345 }, { "epoch": 19.517342631675536, "grad_norm": 0.0006134019349701703, "learning_rate": 1.7743765515226274e-08, "loss": 0.0, "num_input_tokens_seen": 229387920, "step": 106350 }, { "epoch": 19.51826023123509, "grad_norm": 0.0013970667496323586, "learning_rate": 1.7676428180082305e-08, "loss": 0.0, "num_input_tokens_seen": 229398544, "step": 106355 }, { "epoch": 19.519177830794643, "grad_norm": 8.165035978890955e-05, "learning_rate": 1.7609218633815596e-08, "loss": 0.0, "num_input_tokens_seen": 229409648, "step": 106360 }, { "epoch": 19.520095430354193, "grad_norm": 6.888109783176333e-05, "learning_rate": 1.7542136878150873e-08, "loss": 0.0, "num_input_tokens_seen": 229421296, "step": 106365 }, { "epoch": 19.521013029913746, "grad_norm": 6.313883204711601e-05, "learning_rate": 1.7475182914808432e-08, "loss": 0.0, "num_input_tokens_seen": 229431536, "step": 106370 }, { "epoch": 19.5219306294733, "grad_norm": 0.0011684331111609936, "learning_rate": 1.7408356745504674e-08, "loss": 0.0, "num_input_tokens_seen": 229441904, "step": 106375 }, { "epoch": 19.52284822903285, "grad_norm": 0.00026870056171901524, "learning_rate": 1.7341658371954896e-08, "loss": 0.0, "num_input_tokens_seen": 229452016, "step": 106380 }, { "epoch": 19.523765828592403, "grad_norm": 0.00021932179515715688, "learning_rate": 1.7275087795868837e-08, "loss": 0.0, "num_input_tokens_seen": 229462640, "step": 106385 }, { "epoch": 19.524683428151956, "grad_norm": 0.0008509682375006378, "learning_rate": 1.7208645018954028e-08, "loss": 0.0, "num_input_tokens_seen": 229472656, "step": 106390 }, { "epoch": 19.525601027711506, "grad_norm": 5.6653858337085694e-05, "learning_rate": 1.714233004291521e-08, "loss": 0.0, "num_input_tokens_seen": 229483920, "step": 106395 }, { "epoch": 19.52651862727106, "grad_norm": 5.8060359151568264e-05, "learning_rate": 1.7076142869452694e-08, "loss": 0.0, "num_input_tokens_seen": 229494512, "step": 106400 }, { "epoch": 19.527436226830613, "grad_norm": 0.0003001389268320054, "learning_rate": 1.7010083500264006e-08, "loss": 0.0, "num_input_tokens_seen": 229505264, "step": 106405 }, { "epoch": 19.528353826390163, "grad_norm": 4.174772766418755e-05, "learning_rate": 1.6944151937044463e-08, "loss": 0.0, "num_input_tokens_seen": 229516752, "step": 106410 }, { "epoch": 19.529271425949716, "grad_norm": 5.865464117960073e-05, "learning_rate": 1.6878348181483816e-08, "loss": 0.0, "num_input_tokens_seen": 229527504, "step": 106415 }, { "epoch": 19.53018902550927, "grad_norm": 4.123633334529586e-05, "learning_rate": 1.681267223527072e-08, "loss": 0.0, "num_input_tokens_seen": 229540176, "step": 106420 }, { "epoch": 19.53110662506882, "grad_norm": 0.0002368995628785342, "learning_rate": 1.674712410008883e-08, "loss": 0.0, "num_input_tokens_seen": 229550096, "step": 106425 }, { "epoch": 19.532024224628373, "grad_norm": 7.597107469337061e-05, "learning_rate": 1.6681703777620683e-08, "loss": 0.0, "num_input_tokens_seen": 229561168, "step": 106430 }, { "epoch": 19.532941824187926, "grad_norm": 6.400964775821194e-05, "learning_rate": 1.6616411269542722e-08, "loss": 0.0, "num_input_tokens_seen": 229572176, "step": 106435 }, { "epoch": 19.533859423747476, "grad_norm": 3.8177400710992515e-05, "learning_rate": 1.6551246577530267e-08, "loss": 0.0, "num_input_tokens_seen": 229582928, "step": 106440 }, { "epoch": 19.53477702330703, "grad_norm": 0.00010904329246841371, "learning_rate": 1.6486209703255318e-08, "loss": 0.0, "num_input_tokens_seen": 229594128, "step": 106445 }, { "epoch": 19.535694622866583, "grad_norm": 6.789697363274172e-05, "learning_rate": 1.6421300648384876e-08, "loss": 0.0, "num_input_tokens_seen": 229604880, "step": 106450 }, { "epoch": 19.536612222426132, "grad_norm": 0.00020086322911083698, "learning_rate": 1.635651941458427e-08, "loss": 0.0, "num_input_tokens_seen": 229615920, "step": 106455 }, { "epoch": 19.537529821985686, "grad_norm": 7.997518696356565e-05, "learning_rate": 1.6291866003514957e-08, "loss": 0.0, "num_input_tokens_seen": 229626480, "step": 106460 }, { "epoch": 19.53844742154524, "grad_norm": 9.774771024240181e-05, "learning_rate": 1.6227340416835047e-08, "loss": 0.0, "num_input_tokens_seen": 229637200, "step": 106465 }, { "epoch": 19.53936502110479, "grad_norm": 0.0001390355173498392, "learning_rate": 1.6162942656200443e-08, "loss": 0.0, "num_input_tokens_seen": 229647376, "step": 106470 }, { "epoch": 19.540282620664343, "grad_norm": 0.0009344587451778352, "learning_rate": 1.6098672723261487e-08, "loss": 0.0, "num_input_tokens_seen": 229659856, "step": 106475 }, { "epoch": 19.541200220223896, "grad_norm": 0.0005096435197629035, "learning_rate": 1.603453061966742e-08, "loss": 0.0, "num_input_tokens_seen": 229671184, "step": 106480 }, { "epoch": 19.542117819783446, "grad_norm": 0.00017337371536996216, "learning_rate": 1.5970516347063037e-08, "loss": 0.0, "num_input_tokens_seen": 229682224, "step": 106485 }, { "epoch": 19.543035419343, "grad_norm": 10.266154289245605, "learning_rate": 1.5906629907090355e-08, "loss": 0.056, "num_input_tokens_seen": 229692880, "step": 106490 }, { "epoch": 19.543953018902553, "grad_norm": 0.0001256446266779676, "learning_rate": 1.5842871301388064e-08, "loss": 0.0, "num_input_tokens_seen": 229704432, "step": 106495 }, { "epoch": 19.544870618462102, "grad_norm": 0.0001405704824719578, "learning_rate": 1.577924053159152e-08, "loss": 0.0451, "num_input_tokens_seen": 229716304, "step": 106500 }, { "epoch": 19.545788218021656, "grad_norm": 0.0019408214138820767, "learning_rate": 1.5715737599332203e-08, "loss": 0.0, "num_input_tokens_seen": 229727728, "step": 106505 }, { "epoch": 19.54670581758121, "grad_norm": 9.185011003864929e-05, "learning_rate": 1.5652362506239915e-08, "loss": 0.0, "num_input_tokens_seen": 229739280, "step": 106510 }, { "epoch": 19.54762341714076, "grad_norm": 9.702846000436693e-05, "learning_rate": 1.5589115253938914e-08, "loss": 0.0, "num_input_tokens_seen": 229749744, "step": 106515 }, { "epoch": 19.548541016700312, "grad_norm": 5.444053385872394e-05, "learning_rate": 1.5525995844052345e-08, "loss": 0.0, "num_input_tokens_seen": 229760240, "step": 106520 }, { "epoch": 19.549458616259866, "grad_norm": 0.00017122880672104657, "learning_rate": 1.546300427819891e-08, "loss": 0.0, "num_input_tokens_seen": 229769648, "step": 106525 }, { "epoch": 19.550376215819416, "grad_norm": 5.611351298284717e-05, "learning_rate": 1.5400140557993437e-08, "loss": 0.0, "num_input_tokens_seen": 229780496, "step": 106530 }, { "epoch": 19.55129381537897, "grad_norm": 9.676439367467538e-05, "learning_rate": 1.533740468504963e-08, "loss": 0.0, "num_input_tokens_seen": 229789616, "step": 106535 }, { "epoch": 19.552211414938522, "grad_norm": 6.714254413964227e-05, "learning_rate": 1.527479666097509e-08, "loss": 0.0, "num_input_tokens_seen": 229801872, "step": 106540 }, { "epoch": 19.553129014498072, "grad_norm": 0.001063857227563858, "learning_rate": 1.5212316487376867e-08, "loss": 0.0, "num_input_tokens_seen": 229812560, "step": 106545 }, { "epoch": 19.554046614057626, "grad_norm": 0.00041757113649509847, "learning_rate": 1.514996416585701e-08, "loss": 0.0, "num_input_tokens_seen": 229823824, "step": 106550 }, { "epoch": 19.55496421361718, "grad_norm": 0.00032223740709014237, "learning_rate": 1.5087739698014804e-08, "loss": 0.0, "num_input_tokens_seen": 229834928, "step": 106555 }, { "epoch": 19.55588181317673, "grad_norm": 0.0003782181302085519, "learning_rate": 1.5025643085446183e-08, "loss": 0.0, "num_input_tokens_seen": 229845104, "step": 106560 }, { "epoch": 19.556799412736282, "grad_norm": 5.312514986144379e-05, "learning_rate": 1.4963674329743216e-08, "loss": 0.0, "num_input_tokens_seen": 229857072, "step": 106565 }, { "epoch": 19.557717012295836, "grad_norm": 5.977005639579147e-05, "learning_rate": 1.490183343249685e-08, "loss": 0.0, "num_input_tokens_seen": 229867984, "step": 106570 }, { "epoch": 19.558634611855386, "grad_norm": 4.938054189551622e-05, "learning_rate": 1.4840120395291368e-08, "loss": 0.0, "num_input_tokens_seen": 229879056, "step": 106575 }, { "epoch": 19.55955221141494, "grad_norm": 0.0001333926193183288, "learning_rate": 1.4778535219711066e-08, "loss": 0.0, "num_input_tokens_seen": 229889488, "step": 106580 }, { "epoch": 19.560469810974492, "grad_norm": 0.0003318301751278341, "learning_rate": 1.4717077907334675e-08, "loss": 0.0, "num_input_tokens_seen": 229900144, "step": 106585 }, { "epoch": 19.561387410534042, "grad_norm": 0.0016408907249569893, "learning_rate": 1.4655748459738717e-08, "loss": 0.0, "num_input_tokens_seen": 229911024, "step": 106590 }, { "epoch": 19.562305010093596, "grad_norm": 0.00010264777665724978, "learning_rate": 1.4594546878496373e-08, "loss": 0.0, "num_input_tokens_seen": 229922544, "step": 106595 }, { "epoch": 19.56322260965315, "grad_norm": 0.0012868375051766634, "learning_rate": 1.4533473165177503e-08, "loss": 0.0, "num_input_tokens_seen": 229932816, "step": 106600 }, { "epoch": 19.5641402092127, "grad_norm": 0.0003196910838596523, "learning_rate": 1.4472527321348074e-08, "loss": 0.0, "num_input_tokens_seen": 229944304, "step": 106605 }, { "epoch": 19.565057808772252, "grad_norm": 6.918616418261081e-05, "learning_rate": 1.4411709348570724e-08, "loss": 0.0, "num_input_tokens_seen": 229955280, "step": 106610 }, { "epoch": 19.565975408331806, "grad_norm": 0.00013358675641939044, "learning_rate": 1.4351019248406983e-08, "loss": 0.0, "num_input_tokens_seen": 229966128, "step": 106615 }, { "epoch": 19.566893007891355, "grad_norm": 4.471218562684953e-05, "learning_rate": 1.4290457022412274e-08, "loss": 0.0, "num_input_tokens_seen": 229976880, "step": 106620 }, { "epoch": 19.56781060745091, "grad_norm": 8.277397137135267e-05, "learning_rate": 1.4230022672139799e-08, "loss": 0.0, "num_input_tokens_seen": 229987760, "step": 106625 }, { "epoch": 19.568728207010462, "grad_norm": 5.3299430874176323e-05, "learning_rate": 1.4169716199140537e-08, "loss": 0.0, "num_input_tokens_seen": 229998704, "step": 106630 }, { "epoch": 19.569645806570012, "grad_norm": 4.8795373004395515e-05, "learning_rate": 1.4109537604960477e-08, "loss": 0.0, "num_input_tokens_seen": 230010736, "step": 106635 }, { "epoch": 19.570563406129565, "grad_norm": 0.00022720279230270535, "learning_rate": 1.4049486891143938e-08, "loss": 0.0, "num_input_tokens_seen": 230021744, "step": 106640 }, { "epoch": 19.57148100568912, "grad_norm": 0.0001798812736524269, "learning_rate": 1.3989564059229687e-08, "loss": 0.0, "num_input_tokens_seen": 230033040, "step": 106645 }, { "epoch": 19.57239860524867, "grad_norm": 0.00013543356908485293, "learning_rate": 1.3929769110755943e-08, "loss": 0.0, "num_input_tokens_seen": 230044048, "step": 106650 }, { "epoch": 19.573316204808222, "grad_norm": 0.0003184440720360726, "learning_rate": 1.3870102047255917e-08, "loss": 0.0, "num_input_tokens_seen": 230054480, "step": 106655 }, { "epoch": 19.574233804367775, "grad_norm": 4.809352685697377e-05, "learning_rate": 1.3810562870259504e-08, "loss": 0.0, "num_input_tokens_seen": 230064944, "step": 106660 }, { "epoch": 19.575151403927325, "grad_norm": 0.00041422407957725227, "learning_rate": 1.3751151581294919e-08, "loss": 0.0, "num_input_tokens_seen": 230076432, "step": 106665 }, { "epoch": 19.57606900348688, "grad_norm": 0.00012947253708261997, "learning_rate": 1.3691868181884838e-08, "loss": 0.0, "num_input_tokens_seen": 230086928, "step": 106670 }, { "epoch": 19.576986603046432, "grad_norm": 0.0006234536413103342, "learning_rate": 1.3632712673550263e-08, "loss": 0.0, "num_input_tokens_seen": 230098224, "step": 106675 }, { "epoch": 19.577904202605982, "grad_norm": 0.0004248645273037255, "learning_rate": 1.3573685057808872e-08, "loss": 0.0, "num_input_tokens_seen": 230108560, "step": 106680 }, { "epoch": 19.578821802165535, "grad_norm": 0.0001370488025713712, "learning_rate": 1.3514785336173897e-08, "loss": 0.0, "num_input_tokens_seen": 230119920, "step": 106685 }, { "epoch": 19.57973940172509, "grad_norm": 4.553162943921052e-05, "learning_rate": 1.3456013510156351e-08, "loss": 0.0, "num_input_tokens_seen": 230131760, "step": 106690 }, { "epoch": 19.58065700128464, "grad_norm": 6.386953464243561e-05, "learning_rate": 1.3397369581263364e-08, "loss": 0.0, "num_input_tokens_seen": 230143216, "step": 106695 }, { "epoch": 19.581574600844192, "grad_norm": 7.789294613758102e-05, "learning_rate": 1.3338853550999841e-08, "loss": 0.0, "num_input_tokens_seen": 230154384, "step": 106700 }, { "epoch": 19.582492200403745, "grad_norm": 0.00025641368119977415, "learning_rate": 1.3280465420865695e-08, "loss": 0.0, "num_input_tokens_seen": 230164112, "step": 106705 }, { "epoch": 19.583409799963295, "grad_norm": 6.159774056868628e-05, "learning_rate": 1.3222205192359172e-08, "loss": 0.0, "num_input_tokens_seen": 230174096, "step": 106710 }, { "epoch": 19.58432739952285, "grad_norm": 0.0003242136735934764, "learning_rate": 1.3164072866974076e-08, "loss": 0.0, "num_input_tokens_seen": 230184688, "step": 106715 }, { "epoch": 19.585244999082402, "grad_norm": 0.00026328637613914907, "learning_rate": 1.3106068446201991e-08, "loss": 0.0, "num_input_tokens_seen": 230195696, "step": 106720 }, { "epoch": 19.586162598641952, "grad_norm": 0.00014795603055972606, "learning_rate": 1.3048191931529508e-08, "loss": 0.0, "num_input_tokens_seen": 230206544, "step": 106725 }, { "epoch": 19.587080198201505, "grad_norm": 9.600802877685055e-05, "learning_rate": 1.299044332444266e-08, "loss": 0.0, "num_input_tokens_seen": 230217328, "step": 106730 }, { "epoch": 19.58799779776106, "grad_norm": 0.00010452798596816137, "learning_rate": 1.293282262642137e-08, "loss": 0.0, "num_input_tokens_seen": 230229488, "step": 106735 }, { "epoch": 19.58891539732061, "grad_norm": 0.0002899510436691344, "learning_rate": 1.2875329838943907e-08, "loss": 0.0, "num_input_tokens_seen": 230239920, "step": 106740 }, { "epoch": 19.589832996880162, "grad_norm": 0.0001020263007376343, "learning_rate": 1.2817964963484641e-08, "loss": 0.0, "num_input_tokens_seen": 230250672, "step": 106745 }, { "epoch": 19.590750596439715, "grad_norm": 9.860092541202903e-05, "learning_rate": 1.2760728001515733e-08, "loss": 0.0, "num_input_tokens_seen": 230261904, "step": 106750 }, { "epoch": 19.591668195999265, "grad_norm": 0.0006738306838087738, "learning_rate": 1.2703618954504337e-08, "loss": 0.0, "num_input_tokens_seen": 230272944, "step": 106755 }, { "epoch": 19.59258579555882, "grad_norm": 8.846721902955323e-05, "learning_rate": 1.2646637823915397e-08, "loss": 0.0, "num_input_tokens_seen": 230283664, "step": 106760 }, { "epoch": 19.593503395118372, "grad_norm": 8.373118180315942e-05, "learning_rate": 1.258978461121052e-08, "loss": 0.0, "num_input_tokens_seen": 230294192, "step": 106765 }, { "epoch": 19.59442099467792, "grad_norm": 0.00017708251834847033, "learning_rate": 1.2533059317847985e-08, "loss": 0.0, "num_input_tokens_seen": 230305840, "step": 106770 }, { "epoch": 19.595338594237475, "grad_norm": 7.92562059359625e-05, "learning_rate": 1.2476461945282736e-08, "loss": 0.0, "num_input_tokens_seen": 230315216, "step": 106775 }, { "epoch": 19.59625619379703, "grad_norm": 6.309236050583422e-05, "learning_rate": 1.2419992494965837e-08, "loss": 0.0, "num_input_tokens_seen": 230325776, "step": 106780 }, { "epoch": 19.59717379335658, "grad_norm": 6.678484351141378e-05, "learning_rate": 1.2363650968346685e-08, "loss": 0.0, "num_input_tokens_seen": 230335472, "step": 106785 }, { "epoch": 19.598091392916132, "grad_norm": 9.623201913200319e-05, "learning_rate": 1.2307437366869679e-08, "loss": 0.0, "num_input_tokens_seen": 230345936, "step": 106790 }, { "epoch": 19.599008992475685, "grad_norm": 8.354327292181551e-05, "learning_rate": 1.2251351691975888e-08, "loss": 0.0, "num_input_tokens_seen": 230356784, "step": 106795 }, { "epoch": 19.599926592035235, "grad_norm": 4.942352825310081e-05, "learning_rate": 1.2195393945105271e-08, "loss": 0.0, "num_input_tokens_seen": 230367632, "step": 106800 }, { "epoch": 19.60084419159479, "grad_norm": 0.0005392381572164595, "learning_rate": 1.2139564127692238e-08, "loss": 0.0, "num_input_tokens_seen": 230378256, "step": 106805 }, { "epoch": 19.601761791154342, "grad_norm": 9.066574421012774e-05, "learning_rate": 1.2083862241168976e-08, "loss": 0.0, "num_input_tokens_seen": 230389584, "step": 106810 }, { "epoch": 19.60267939071389, "grad_norm": 0.00010584181291051209, "learning_rate": 1.202828828696434e-08, "loss": 0.0, "num_input_tokens_seen": 230400048, "step": 106815 }, { "epoch": 19.603596990273445, "grad_norm": 0.00031184268300421536, "learning_rate": 1.1972842266503305e-08, "loss": 0.0, "num_input_tokens_seen": 230412304, "step": 106820 }, { "epoch": 19.604514589833, "grad_norm": 5.228004738455638e-05, "learning_rate": 1.1917524181208063e-08, "loss": 0.0003, "num_input_tokens_seen": 230421648, "step": 106825 }, { "epoch": 19.60543218939255, "grad_norm": 5.4894200729904696e-05, "learning_rate": 1.1862334032496925e-08, "loss": 0.0, "num_input_tokens_seen": 230431792, "step": 106830 }, { "epoch": 19.6063497889521, "grad_norm": 5.521619095816277e-05, "learning_rate": 1.1807271821786536e-08, "loss": 0.0, "num_input_tokens_seen": 230441296, "step": 106835 }, { "epoch": 19.607267388511655, "grad_norm": 0.0003333145286887884, "learning_rate": 1.1752337550489101e-08, "loss": 0.0, "num_input_tokens_seen": 230451696, "step": 106840 }, { "epoch": 19.608184988071205, "grad_norm": 0.00014084302529226989, "learning_rate": 1.1697531220012382e-08, "loss": 0.0, "num_input_tokens_seen": 230462960, "step": 106845 }, { "epoch": 19.60910258763076, "grad_norm": 0.00010465597006259486, "learning_rate": 1.1642852831763029e-08, "loss": 0.0, "num_input_tokens_seen": 230473584, "step": 106850 }, { "epoch": 19.61002018719031, "grad_norm": 7.047391409287229e-05, "learning_rate": 1.1588302387143257e-08, "loss": 0.0, "num_input_tokens_seen": 230484912, "step": 106855 }, { "epoch": 19.61093778674986, "grad_norm": 6.108123488957062e-05, "learning_rate": 1.1533879887551947e-08, "loss": 0.0, "num_input_tokens_seen": 230495792, "step": 106860 }, { "epoch": 19.611855386309415, "grad_norm": 7.788676884956658e-05, "learning_rate": 1.1479585334385757e-08, "loss": 0.0, "num_input_tokens_seen": 230506320, "step": 106865 }, { "epoch": 19.61277298586897, "grad_norm": 0.00037786949542351067, "learning_rate": 1.1425418729036353e-08, "loss": 0.0, "num_input_tokens_seen": 230516528, "step": 106870 }, { "epoch": 19.613690585428518, "grad_norm": 0.00013718822447117418, "learning_rate": 1.1371380072893735e-08, "loss": 0.0, "num_input_tokens_seen": 230526352, "step": 106875 }, { "epoch": 19.61460818498807, "grad_norm": 0.00011137032561236992, "learning_rate": 1.1317469367342903e-08, "loss": 0.0, "num_input_tokens_seen": 230537616, "step": 106880 }, { "epoch": 19.615525784547625, "grad_norm": 6.476962880697101e-05, "learning_rate": 1.1263686613767755e-08, "loss": 0.0, "num_input_tokens_seen": 230548848, "step": 106885 }, { "epoch": 19.616443384107175, "grad_norm": 7.128671859391034e-05, "learning_rate": 1.1210031813547185e-08, "loss": 0.0, "num_input_tokens_seen": 230559088, "step": 106890 }, { "epoch": 19.617360983666728, "grad_norm": 0.00045694454456679523, "learning_rate": 1.1156504968056758e-08, "loss": 0.0, "num_input_tokens_seen": 230569520, "step": 106895 }, { "epoch": 19.61827858322628, "grad_norm": 3.75947893189732e-05, "learning_rate": 1.1103106078670378e-08, "loss": 0.0, "num_input_tokens_seen": 230580464, "step": 106900 }, { "epoch": 19.61919618278583, "grad_norm": 5.9612088080029935e-05, "learning_rate": 1.1049835146757504e-08, "loss": 0.0, "num_input_tokens_seen": 230591376, "step": 106905 }, { "epoch": 19.620113782345385, "grad_norm": 7.173026097007096e-05, "learning_rate": 1.0996692173684265e-08, "loss": 0.0, "num_input_tokens_seen": 230600240, "step": 106910 }, { "epoch": 19.62103138190494, "grad_norm": 0.00026734487619251013, "learning_rate": 1.0943677160812904e-08, "loss": 0.0, "num_input_tokens_seen": 230611216, "step": 106915 }, { "epoch": 19.621948981464488, "grad_norm": 5.740203050663695e-05, "learning_rate": 1.0890790109504556e-08, "loss": 0.0, "num_input_tokens_seen": 230620336, "step": 106920 }, { "epoch": 19.62286658102404, "grad_norm": 5.005842831451446e-05, "learning_rate": 1.0838031021114803e-08, "loss": 0.0, "num_input_tokens_seen": 230631056, "step": 106925 }, { "epoch": 19.623784180583595, "grad_norm": 6.351769116008654e-05, "learning_rate": 1.0785399896997007e-08, "loss": 0.0, "num_input_tokens_seen": 230642128, "step": 106930 }, { "epoch": 19.624701780143145, "grad_norm": 0.00020869067520834506, "learning_rate": 1.0732896738500643e-08, "loss": 0.0, "num_input_tokens_seen": 230652752, "step": 106935 }, { "epoch": 19.625619379702698, "grad_norm": 0.003228261135518551, "learning_rate": 1.0680521546973522e-08, "loss": 0.0, "num_input_tokens_seen": 230662960, "step": 106940 }, { "epoch": 19.62653697926225, "grad_norm": 0.003175682621076703, "learning_rate": 1.0628274323757903e-08, "loss": 0.0, "num_input_tokens_seen": 230674000, "step": 106945 }, { "epoch": 19.6274545788218, "grad_norm": 0.00010047792602563277, "learning_rate": 1.0576155070194939e-08, "loss": 0.0, "num_input_tokens_seen": 230685040, "step": 106950 }, { "epoch": 19.628372178381355, "grad_norm": 0.00010097713675349951, "learning_rate": 1.0524163787619669e-08, "loss": 0.0, "num_input_tokens_seen": 230695568, "step": 106955 }, { "epoch": 19.629289777940908, "grad_norm": 0.00010527370613999665, "learning_rate": 1.0472300477367137e-08, "loss": 0.0, "num_input_tokens_seen": 230706000, "step": 106960 }, { "epoch": 19.630207377500458, "grad_norm": 0.0001524480030639097, "learning_rate": 1.0420565140766837e-08, "loss": 0.0, "num_input_tokens_seen": 230716144, "step": 106965 }, { "epoch": 19.63112497706001, "grad_norm": 0.0002975229872390628, "learning_rate": 1.0368957779146039e-08, "loss": 0.0, "num_input_tokens_seen": 230726672, "step": 106970 }, { "epoch": 19.632042576619565, "grad_norm": 0.00037260932731442153, "learning_rate": 1.0317478393828684e-08, "loss": 0.0, "num_input_tokens_seen": 230736240, "step": 106975 }, { "epoch": 19.632960176179115, "grad_norm": 5.011606481275521e-05, "learning_rate": 1.0266126986133718e-08, "loss": 0.0, "num_input_tokens_seen": 230745392, "step": 106980 }, { "epoch": 19.633877775738668, "grad_norm": 4.588230876834132e-05, "learning_rate": 1.0214903557380085e-08, "loss": 0.0, "num_input_tokens_seen": 230755984, "step": 106985 }, { "epoch": 19.63479537529822, "grad_norm": 7.163840200519189e-05, "learning_rate": 1.016380810888007e-08, "loss": 0.0, "num_input_tokens_seen": 230767216, "step": 106990 }, { "epoch": 19.63571297485777, "grad_norm": 0.0011716597946360707, "learning_rate": 1.0112840641945399e-08, "loss": 0.0, "num_input_tokens_seen": 230776240, "step": 106995 }, { "epoch": 19.636630574417325, "grad_norm": 9.060402226168662e-05, "learning_rate": 1.0062001157882251e-08, "loss": 0.0, "num_input_tokens_seen": 230787216, "step": 107000 }, { "epoch": 19.637548173976878, "grad_norm": 7.574644405394793e-05, "learning_rate": 1.0011289657995693e-08, "loss": 0.0, "num_input_tokens_seen": 230798576, "step": 107005 }, { "epoch": 19.638465773536428, "grad_norm": 9.990543912863359e-05, "learning_rate": 9.960706143585241e-09, "loss": 0.0, "num_input_tokens_seen": 230809840, "step": 107010 }, { "epoch": 19.63938337309598, "grad_norm": 6.317932275123894e-05, "learning_rate": 9.910250615948747e-09, "loss": 0.0, "num_input_tokens_seen": 230820464, "step": 107015 }, { "epoch": 19.640300972655535, "grad_norm": 8.259643072960898e-05, "learning_rate": 9.859923076380728e-09, "loss": 0.0, "num_input_tokens_seen": 230830736, "step": 107020 }, { "epoch": 19.641218572215084, "grad_norm": 5.037959272158332e-05, "learning_rate": 9.809723526171266e-09, "loss": 0.0, "num_input_tokens_seen": 230840880, "step": 107025 }, { "epoch": 19.642136171774638, "grad_norm": 0.001216303906403482, "learning_rate": 9.759651966608774e-09, "loss": 0.0, "num_input_tokens_seen": 230851824, "step": 107030 }, { "epoch": 19.64305377133419, "grad_norm": 0.0001424730580765754, "learning_rate": 9.709708398976669e-09, "loss": 0.0, "num_input_tokens_seen": 230863408, "step": 107035 }, { "epoch": 19.64397137089374, "grad_norm": 5.2926196076441556e-05, "learning_rate": 9.659892824556705e-09, "loss": 0.0, "num_input_tokens_seen": 230874352, "step": 107040 }, { "epoch": 19.644888970453295, "grad_norm": 0.00028811805532313883, "learning_rate": 9.610205244625637e-09, "loss": 0.0, "num_input_tokens_seen": 230884528, "step": 107045 }, { "epoch": 19.645806570012848, "grad_norm": 0.0009397337562404573, "learning_rate": 9.560645660458556e-09, "loss": 0.0, "num_input_tokens_seen": 230894096, "step": 107050 }, { "epoch": 19.646724169572398, "grad_norm": 0.00018347844888921827, "learning_rate": 9.511214073326668e-09, "loss": 0.0, "num_input_tokens_seen": 230904528, "step": 107055 }, { "epoch": 19.64764176913195, "grad_norm": 0.0001279770367546007, "learning_rate": 9.461910484497294e-09, "loss": 0.0, "num_input_tokens_seen": 230914608, "step": 107060 }, { "epoch": 19.648559368691505, "grad_norm": 0.00014682579785585403, "learning_rate": 9.412734895235532e-09, "loss": 0.0, "num_input_tokens_seen": 230924464, "step": 107065 }, { "epoch": 19.649476968251054, "grad_norm": 0.00016195506032090634, "learning_rate": 9.363687306802594e-09, "loss": 0.0, "num_input_tokens_seen": 230935184, "step": 107070 }, { "epoch": 19.650394567810608, "grad_norm": 0.0001101793022826314, "learning_rate": 9.314767720455809e-09, "loss": 0.0, "num_input_tokens_seen": 230945264, "step": 107075 }, { "epoch": 19.65131216737016, "grad_norm": 5.737020546803251e-05, "learning_rate": 9.265976137450284e-09, "loss": 0.0, "num_input_tokens_seen": 230956560, "step": 107080 }, { "epoch": 19.65222976692971, "grad_norm": 7.816699508111924e-05, "learning_rate": 9.21731255903835e-09, "loss": 0.0, "num_input_tokens_seen": 230967824, "step": 107085 }, { "epoch": 19.653147366489264, "grad_norm": 0.0006335593643598258, "learning_rate": 9.168776986466787e-09, "loss": 0.0, "num_input_tokens_seen": 230978096, "step": 107090 }, { "epoch": 19.654064966048818, "grad_norm": 5.838732613483444e-05, "learning_rate": 9.120369420980712e-09, "loss": 0.0, "num_input_tokens_seen": 230987088, "step": 107095 }, { "epoch": 19.654982565608368, "grad_norm": 5.8285520935896784e-05, "learning_rate": 9.072089863822464e-09, "loss": 0.0, "num_input_tokens_seen": 230998704, "step": 107100 }, { "epoch": 19.65590016516792, "grad_norm": 0.00037962375790812075, "learning_rate": 9.023938316229941e-09, "loss": 0.0, "num_input_tokens_seen": 231008880, "step": 107105 }, { "epoch": 19.656817764727474, "grad_norm": 5.032467015553266e-05, "learning_rate": 8.97591477943771e-09, "loss": 0.0, "num_input_tokens_seen": 231018512, "step": 107110 }, { "epoch": 19.657735364287024, "grad_norm": 0.0005928323371335864, "learning_rate": 8.928019254678123e-09, "loss": 0.0, "num_input_tokens_seen": 231029328, "step": 107115 }, { "epoch": 19.658652963846578, "grad_norm": 0.0002465144789312035, "learning_rate": 8.880251743179081e-09, "loss": 0.0, "num_input_tokens_seen": 231041360, "step": 107120 }, { "epoch": 19.65957056340613, "grad_norm": 0.00029149939655326307, "learning_rate": 8.832612246166273e-09, "loss": 0.0, "num_input_tokens_seen": 231052848, "step": 107125 }, { "epoch": 19.66048816296568, "grad_norm": 4.996037750970572e-05, "learning_rate": 8.785100764861498e-09, "loss": 0.0, "num_input_tokens_seen": 231064048, "step": 107130 }, { "epoch": 19.661405762525234, "grad_norm": 0.00016710533236619085, "learning_rate": 8.737717300483228e-09, "loss": 0.0, "num_input_tokens_seen": 231076240, "step": 107135 }, { "epoch": 19.662323362084788, "grad_norm": 0.0001664945884840563, "learning_rate": 8.690461854246601e-09, "loss": 0.0, "num_input_tokens_seen": 231085264, "step": 107140 }, { "epoch": 19.663240961644338, "grad_norm": 0.0004369569360278547, "learning_rate": 8.643334427363425e-09, "loss": 0.0, "num_input_tokens_seen": 231096432, "step": 107145 }, { "epoch": 19.66415856120389, "grad_norm": 5.1814546168316156e-05, "learning_rate": 8.59633502104329e-09, "loss": 0.0, "num_input_tokens_seen": 231107312, "step": 107150 }, { "epoch": 19.665076160763444, "grad_norm": 8.848699508234859e-05, "learning_rate": 8.549463636491339e-09, "loss": 0.0, "num_input_tokens_seen": 231118320, "step": 107155 }, { "epoch": 19.665993760322994, "grad_norm": 0.001218097168020904, "learning_rate": 8.502720274909392e-09, "loss": 0.0, "num_input_tokens_seen": 231129712, "step": 107160 }, { "epoch": 19.666911359882548, "grad_norm": 9.749852324603125e-05, "learning_rate": 8.45610493749649e-09, "loss": 0.0, "num_input_tokens_seen": 231141040, "step": 107165 }, { "epoch": 19.6678289594421, "grad_norm": 7.801611354807392e-05, "learning_rate": 8.409617625448342e-09, "loss": 0.0, "num_input_tokens_seen": 231151600, "step": 107170 }, { "epoch": 19.66874655900165, "grad_norm": 8.875910862116143e-05, "learning_rate": 8.36325833995677e-09, "loss": 0.0, "num_input_tokens_seen": 231162064, "step": 107175 }, { "epoch": 19.669664158561204, "grad_norm": 4.020957203465514e-05, "learning_rate": 8.317027082211937e-09, "loss": 0.0, "num_input_tokens_seen": 231173328, "step": 107180 }, { "epoch": 19.670581758120758, "grad_norm": 0.00019416875147726387, "learning_rate": 8.270923853399004e-09, "loss": 0.0, "num_input_tokens_seen": 231184560, "step": 107185 }, { "epoch": 19.671499357680307, "grad_norm": 9.551879338687286e-05, "learning_rate": 8.224948654699806e-09, "loss": 0.0, "num_input_tokens_seen": 231195568, "step": 107190 }, { "epoch": 19.67241695723986, "grad_norm": 0.00014326951350085437, "learning_rate": 8.179101487294505e-09, "loss": 0.0001, "num_input_tokens_seen": 231205200, "step": 107195 }, { "epoch": 19.673334556799414, "grad_norm": 7.842704508220777e-05, "learning_rate": 8.133382352358276e-09, "loss": 0.0, "num_input_tokens_seen": 231215440, "step": 107200 }, { "epoch": 19.674252156358964, "grad_norm": 8.957168029155582e-05, "learning_rate": 8.087791251064624e-09, "loss": 0.0, "num_input_tokens_seen": 231225360, "step": 107205 }, { "epoch": 19.675169755918517, "grad_norm": 4.163054109085351e-05, "learning_rate": 8.04232818458206e-09, "loss": 0.0, "num_input_tokens_seen": 231235728, "step": 107210 }, { "epoch": 19.67608735547807, "grad_norm": 0.00018120990716852248, "learning_rate": 7.996993154076871e-09, "loss": 0.0, "num_input_tokens_seen": 231247056, "step": 107215 }, { "epoch": 19.67700495503762, "grad_norm": 5.266943117021583e-05, "learning_rate": 7.95178616071257e-09, "loss": 0.0, "num_input_tokens_seen": 231256560, "step": 107220 }, { "epoch": 19.677922554597174, "grad_norm": 0.01658383384346962, "learning_rate": 7.906707205647124e-09, "loss": 0.0, "num_input_tokens_seen": 231266640, "step": 107225 }, { "epoch": 19.678840154156727, "grad_norm": 0.00011259208258707076, "learning_rate": 7.861756290037936e-09, "loss": 0.0, "num_input_tokens_seen": 231278672, "step": 107230 }, { "epoch": 19.679757753716277, "grad_norm": 0.0007090811268426478, "learning_rate": 7.81693341503742e-09, "loss": 0.0, "num_input_tokens_seen": 231289616, "step": 107235 }, { "epoch": 19.68067535327583, "grad_norm": 4.576929131872021e-05, "learning_rate": 7.772238581795766e-09, "loss": 0.0, "num_input_tokens_seen": 231301424, "step": 107240 }, { "epoch": 19.681592952835384, "grad_norm": 0.0005255632568150759, "learning_rate": 7.727671791458724e-09, "loss": 0.0, "num_input_tokens_seen": 231312592, "step": 107245 }, { "epoch": 19.682510552394934, "grad_norm": 5.798468555440195e-05, "learning_rate": 7.683233045169825e-09, "loss": 0.0, "num_input_tokens_seen": 231323408, "step": 107250 }, { "epoch": 19.683428151954487, "grad_norm": 7.132702739909291e-05, "learning_rate": 7.638922344068156e-09, "loss": 0.0, "num_input_tokens_seen": 231333744, "step": 107255 }, { "epoch": 19.68434575151404, "grad_norm": 0.028517678380012512, "learning_rate": 7.594739689291142e-09, "loss": 0.0, "num_input_tokens_seen": 231344752, "step": 107260 }, { "epoch": 19.68526335107359, "grad_norm": 4.219100810587406e-05, "learning_rate": 7.550685081970655e-09, "loss": 0.0, "num_input_tokens_seen": 231355024, "step": 107265 }, { "epoch": 19.686180950633144, "grad_norm": 0.0007796098943799734, "learning_rate": 7.506758523238567e-09, "loss": 0.0, "num_input_tokens_seen": 231365552, "step": 107270 }, { "epoch": 19.687098550192697, "grad_norm": 0.0008091417257674038, "learning_rate": 7.46296001422009e-09, "loss": 0.0, "num_input_tokens_seen": 231374800, "step": 107275 }, { "epoch": 19.688016149752247, "grad_norm": 0.00010211901098955423, "learning_rate": 7.4192895560387665e-09, "loss": 0.0, "num_input_tokens_seen": 231385936, "step": 107280 }, { "epoch": 19.6889337493118, "grad_norm": 5.682701157638803e-05, "learning_rate": 7.3757471498148156e-09, "loss": 0.0, "num_input_tokens_seen": 231397104, "step": 107285 }, { "epoch": 19.689851348871354, "grad_norm": 0.00016635798965580761, "learning_rate": 7.3323327966651206e-09, "loss": 0.0, "num_input_tokens_seen": 231408400, "step": 107290 }, { "epoch": 19.690768948430904, "grad_norm": 0.00011156611435580999, "learning_rate": 7.289046497703234e-09, "loss": 0.0, "num_input_tokens_seen": 231418704, "step": 107295 }, { "epoch": 19.691686547990457, "grad_norm": 6.936130375834182e-05, "learning_rate": 7.245888254039379e-09, "loss": 0.0, "num_input_tokens_seen": 231429936, "step": 107300 }, { "epoch": 19.69260414755001, "grad_norm": 7.494654710171744e-05, "learning_rate": 7.202858066780449e-09, "loss": 0.0, "num_input_tokens_seen": 231440464, "step": 107305 }, { "epoch": 19.69352174710956, "grad_norm": 0.000511953083332628, "learning_rate": 7.159955937030005e-09, "loss": 0.0, "num_input_tokens_seen": 231451824, "step": 107310 }, { "epoch": 19.694439346669114, "grad_norm": 6.36623299214989e-05, "learning_rate": 7.117181865888279e-09, "loss": 0.0, "num_input_tokens_seen": 231461904, "step": 107315 }, { "epoch": 19.695356946228667, "grad_norm": 4.656318196794018e-05, "learning_rate": 7.0745358544527246e-09, "loss": 0.0, "num_input_tokens_seen": 231473104, "step": 107320 }, { "epoch": 19.696274545788217, "grad_norm": 0.00015630309644620866, "learning_rate": 7.032017903817467e-09, "loss": 0.0, "num_input_tokens_seen": 231482576, "step": 107325 }, { "epoch": 19.69719214534777, "grad_norm": 0.0037762457504868507, "learning_rate": 6.989628015072192e-09, "loss": 0.0, "num_input_tokens_seen": 231492880, "step": 107330 }, { "epoch": 19.698109744907324, "grad_norm": 0.0013711608480662107, "learning_rate": 6.9473661893043605e-09, "loss": 0.0, "num_input_tokens_seen": 231504016, "step": 107335 }, { "epoch": 19.699027344466874, "grad_norm": 0.0002992894151248038, "learning_rate": 6.905232427598108e-09, "loss": 0.0, "num_input_tokens_seen": 231514480, "step": 107340 }, { "epoch": 19.699944944026427, "grad_norm": 0.0013108605053275824, "learning_rate": 6.863226731034234e-09, "loss": 0.0, "num_input_tokens_seen": 231526000, "step": 107345 }, { "epoch": 19.70086254358598, "grad_norm": 5.736163802794181e-05, "learning_rate": 6.821349100689656e-09, "loss": 0.0, "num_input_tokens_seen": 231537008, "step": 107350 }, { "epoch": 19.70178014314553, "grad_norm": 0.0002611797535791993, "learning_rate": 6.779599537638515e-09, "loss": 0.0, "num_input_tokens_seen": 231547600, "step": 107355 }, { "epoch": 19.702697742705084, "grad_norm": 0.0008289712714031339, "learning_rate": 6.737978042952176e-09, "loss": 0.0, "num_input_tokens_seen": 231559152, "step": 107360 }, { "epoch": 19.703615342264637, "grad_norm": 4.5497235987568274e-05, "learning_rate": 6.696484617698118e-09, "loss": 0.0, "num_input_tokens_seen": 231570256, "step": 107365 }, { "epoch": 19.704532941824187, "grad_norm": 0.00011073536734329537, "learning_rate": 6.655119262939935e-09, "loss": 0.0, "num_input_tokens_seen": 231580112, "step": 107370 }, { "epoch": 19.70545054138374, "grad_norm": 0.019284479320049286, "learning_rate": 6.613881979739556e-09, "loss": 0.0, "num_input_tokens_seen": 231591696, "step": 107375 }, { "epoch": 19.706368140943294, "grad_norm": 0.0007230970077216625, "learning_rate": 6.572772769153357e-09, "loss": 0.0, "num_input_tokens_seen": 231602576, "step": 107380 }, { "epoch": 19.707285740502844, "grad_norm": 0.0005747420946136117, "learning_rate": 6.531791632236606e-09, "loss": 0.0, "num_input_tokens_seen": 231612048, "step": 107385 }, { "epoch": 19.708203340062397, "grad_norm": 9.345808939542621e-05, "learning_rate": 6.490938570040128e-09, "loss": 0.0, "num_input_tokens_seen": 231621872, "step": 107390 }, { "epoch": 19.70912093962195, "grad_norm": 4.825895302928984e-05, "learning_rate": 6.450213583611975e-09, "loss": 0.0, "num_input_tokens_seen": 231632496, "step": 107395 }, { "epoch": 19.7100385391815, "grad_norm": 0.00019574671750888228, "learning_rate": 6.4096166739968655e-09, "loss": 0.0, "num_input_tokens_seen": 231642480, "step": 107400 }, { "epoch": 19.710956138741054, "grad_norm": 0.000386141735361889, "learning_rate": 6.369147842235079e-09, "loss": 0.0, "num_input_tokens_seen": 231652496, "step": 107405 }, { "epoch": 19.711873738300607, "grad_norm": 0.00013396679423749447, "learning_rate": 6.328807089365785e-09, "loss": 0.0, "num_input_tokens_seen": 231663408, "step": 107410 }, { "epoch": 19.712791337860157, "grad_norm": 4.367494329926558e-05, "learning_rate": 6.288594416423155e-09, "loss": 0.0, "num_input_tokens_seen": 231673552, "step": 107415 }, { "epoch": 19.71370893741971, "grad_norm": 6.507657963084057e-05, "learning_rate": 6.248509824438032e-09, "loss": 0.0, "num_input_tokens_seen": 231682704, "step": 107420 }, { "epoch": 19.714626536979264, "grad_norm": 8.617912681074813e-05, "learning_rate": 6.208553314439037e-09, "loss": 0.0, "num_input_tokens_seen": 231693520, "step": 107425 }, { "epoch": 19.715544136538814, "grad_norm": 4.623015411198139e-05, "learning_rate": 6.1687248874514605e-09, "loss": 0.0, "num_input_tokens_seen": 231703600, "step": 107430 }, { "epoch": 19.716461736098367, "grad_norm": 0.020520789548754692, "learning_rate": 6.129024544496154e-09, "loss": 0.0, "num_input_tokens_seen": 231713616, "step": 107435 }, { "epoch": 19.71737933565792, "grad_norm": 7.161923713283613e-05, "learning_rate": 6.089452286591191e-09, "loss": 0.0, "num_input_tokens_seen": 231724432, "step": 107440 }, { "epoch": 19.71829693521747, "grad_norm": 6.446905172197148e-05, "learning_rate": 6.050008114752426e-09, "loss": 0.0, "num_input_tokens_seen": 231735888, "step": 107445 }, { "epoch": 19.719214534777024, "grad_norm": 0.00015980633907020092, "learning_rate": 6.010692029990717e-09, "loss": 0.0, "num_input_tokens_seen": 231747024, "step": 107450 }, { "epoch": 19.720132134336577, "grad_norm": 6.177159957587719e-05, "learning_rate": 5.971504033314701e-09, "loss": 0.0, "num_input_tokens_seen": 231757520, "step": 107455 }, { "epoch": 19.721049733896127, "grad_norm": 0.00012183484068373218, "learning_rate": 5.932444125729686e-09, "loss": 0.0, "num_input_tokens_seen": 231770064, "step": 107460 }, { "epoch": 19.72196733345568, "grad_norm": 0.000529629469383508, "learning_rate": 5.8935123082376475e-09, "loss": 0.0, "num_input_tokens_seen": 231779056, "step": 107465 }, { "epoch": 19.722884933015234, "grad_norm": 9.863974992185831e-05, "learning_rate": 5.854708581836677e-09, "loss": 0.0, "num_input_tokens_seen": 231790544, "step": 107470 }, { "epoch": 19.723802532574783, "grad_norm": 7.470568380085751e-05, "learning_rate": 5.816032947522088e-09, "loss": 0.0, "num_input_tokens_seen": 231801808, "step": 107475 }, { "epoch": 19.724720132134337, "grad_norm": 9.672666783444583e-05, "learning_rate": 5.777485406285866e-09, "loss": 0.0, "num_input_tokens_seen": 231812528, "step": 107480 }, { "epoch": 19.72563773169389, "grad_norm": 5.077959212940186e-05, "learning_rate": 5.739065959117218e-09, "loss": 0.0, "num_input_tokens_seen": 231823216, "step": 107485 }, { "epoch": 19.72655533125344, "grad_norm": 5.9350935771362856e-05, "learning_rate": 5.700774607000914e-09, "loss": 0.0, "num_input_tokens_seen": 231833936, "step": 107490 }, { "epoch": 19.727472930812993, "grad_norm": 9.433466766495258e-05, "learning_rate": 5.662611350918945e-09, "loss": 0.0, "num_input_tokens_seen": 231845936, "step": 107495 }, { "epoch": 19.728390530372547, "grad_norm": 7.895006274338812e-05, "learning_rate": 5.624576191851084e-09, "loss": 0.0, "num_input_tokens_seen": 231856816, "step": 107500 }, { "epoch": 19.729308129932097, "grad_norm": 3.1583265808876604e-05, "learning_rate": 5.586669130771549e-09, "loss": 0.0, "num_input_tokens_seen": 231868784, "step": 107505 }, { "epoch": 19.73022572949165, "grad_norm": 0.0008614119724370539, "learning_rate": 5.548890168654008e-09, "loss": 0.0, "num_input_tokens_seen": 231880336, "step": 107510 }, { "epoch": 19.731143329051203, "grad_norm": 5.24903298355639e-05, "learning_rate": 5.511239306466576e-09, "loss": 0.0, "num_input_tokens_seen": 231890192, "step": 107515 }, { "epoch": 19.732060928610753, "grad_norm": 0.00033487004111520946, "learning_rate": 5.47371654517459e-09, "loss": 0.0, "num_input_tokens_seen": 231902064, "step": 107520 }, { "epoch": 19.732978528170307, "grad_norm": 8.434396295342594e-05, "learning_rate": 5.436321885741725e-09, "loss": 0.0, "num_input_tokens_seen": 231913936, "step": 107525 }, { "epoch": 19.73389612772986, "grad_norm": 0.0001905081153381616, "learning_rate": 5.399055329126102e-09, "loss": 0.0, "num_input_tokens_seen": 231923056, "step": 107530 }, { "epoch": 19.73481372728941, "grad_norm": 0.002892453223466873, "learning_rate": 5.361916876283069e-09, "loss": 0.0, "num_input_tokens_seen": 231932976, "step": 107535 }, { "epoch": 19.735731326848963, "grad_norm": 7.846754306228831e-05, "learning_rate": 5.324906528166862e-09, "loss": 0.0, "num_input_tokens_seen": 231942640, "step": 107540 }, { "epoch": 19.736648926408517, "grad_norm": 8.33487429190427e-05, "learning_rate": 5.288024285725057e-09, "loss": 0.0, "num_input_tokens_seen": 231953616, "step": 107545 }, { "epoch": 19.737566525968067, "grad_norm": 6.88251166138798e-05, "learning_rate": 5.251270149904675e-09, "loss": 0.0, "num_input_tokens_seen": 231963216, "step": 107550 }, { "epoch": 19.73848412552762, "grad_norm": 4.161669858149253e-05, "learning_rate": 5.214644121648293e-09, "loss": 0.0, "num_input_tokens_seen": 231973776, "step": 107555 }, { "epoch": 19.739401725087173, "grad_norm": 0.0008094420190900564, "learning_rate": 5.178146201894607e-09, "loss": 0.0, "num_input_tokens_seen": 231983760, "step": 107560 }, { "epoch": 19.740319324646723, "grad_norm": 4.032691867905669e-05, "learning_rate": 5.1417763915800885e-09, "loss": 0.0, "num_input_tokens_seen": 231993904, "step": 107565 }, { "epoch": 19.741236924206277, "grad_norm": 0.00258346414193511, "learning_rate": 5.105534691638437e-09, "loss": 0.0, "num_input_tokens_seen": 232005712, "step": 107570 }, { "epoch": 19.74215452376583, "grad_norm": 0.0007690873462706804, "learning_rate": 5.069421102997796e-09, "loss": 0.0, "num_input_tokens_seen": 232016496, "step": 107575 }, { "epoch": 19.74307212332538, "grad_norm": 5.171436714590527e-05, "learning_rate": 5.033435626585204e-09, "loss": 0.0, "num_input_tokens_seen": 232027280, "step": 107580 }, { "epoch": 19.743989722884933, "grad_norm": 0.0016396078281104565, "learning_rate": 4.997578263323255e-09, "loss": 0.0, "num_input_tokens_seen": 232037808, "step": 107585 }, { "epoch": 19.744907322444487, "grad_norm": 6.268532160902396e-05, "learning_rate": 4.961849014132325e-09, "loss": 0.0, "num_input_tokens_seen": 232049584, "step": 107590 }, { "epoch": 19.745824922004036, "grad_norm": 0.41271960735321045, "learning_rate": 4.926247879928348e-09, "loss": 0.0004, "num_input_tokens_seen": 232060112, "step": 107595 }, { "epoch": 19.74674252156359, "grad_norm": 7.480438944185153e-05, "learning_rate": 4.890774861623926e-09, "loss": 0.0, "num_input_tokens_seen": 232071344, "step": 107600 }, { "epoch": 19.747660121123143, "grad_norm": 0.0011661864118650556, "learning_rate": 4.855429960129998e-09, "loss": 0.0, "num_input_tokens_seen": 232081936, "step": 107605 }, { "epoch": 19.748577720682693, "grad_norm": 3.7333757063606754e-05, "learning_rate": 4.8202131763519515e-09, "loss": 0.0, "num_input_tokens_seen": 232094288, "step": 107610 }, { "epoch": 19.749495320242247, "grad_norm": 0.0002543798182159662, "learning_rate": 4.785124511194061e-09, "loss": 0.0, "num_input_tokens_seen": 232104880, "step": 107615 }, { "epoch": 19.7504129198018, "grad_norm": 9.321023389929906e-05, "learning_rate": 4.75016396555561e-09, "loss": 0.0, "num_input_tokens_seen": 232115600, "step": 107620 }, { "epoch": 19.75133051936135, "grad_norm": 7.946640107547864e-05, "learning_rate": 4.715331540333656e-09, "loss": 0.0, "num_input_tokens_seen": 232127120, "step": 107625 }, { "epoch": 19.752248118920903, "grad_norm": 7.12148321326822e-05, "learning_rate": 4.6806272364213754e-09, "loss": 0.0, "num_input_tokens_seen": 232138704, "step": 107630 }, { "epoch": 19.753165718480457, "grad_norm": 8.839499787427485e-05, "learning_rate": 4.646051054709166e-09, "loss": 0.0, "num_input_tokens_seen": 232148784, "step": 107635 }, { "epoch": 19.754083318040006, "grad_norm": 0.0036035911180078983, "learning_rate": 4.6116029960835415e-09, "loss": 0.0, "num_input_tokens_seen": 232160528, "step": 107640 }, { "epoch": 19.75500091759956, "grad_norm": 9.070448868442327e-05, "learning_rate": 4.577283061428239e-09, "loss": 0.0, "num_input_tokens_seen": 232170608, "step": 107645 }, { "epoch": 19.755918517159113, "grad_norm": 5.347848855308257e-05, "learning_rate": 4.543091251623111e-09, "loss": 0.0, "num_input_tokens_seen": 232181584, "step": 107650 }, { "epoch": 19.756836116718663, "grad_norm": 0.00011740536865545437, "learning_rate": 4.5090275675452326e-09, "loss": 0.0, "num_input_tokens_seen": 232193104, "step": 107655 }, { "epoch": 19.757753716278216, "grad_norm": 0.00014658234431408346, "learning_rate": 4.475092010068905e-09, "loss": 0.0, "num_input_tokens_seen": 232205200, "step": 107660 }, { "epoch": 19.75867131583777, "grad_norm": 6.228886923054233e-05, "learning_rate": 4.441284580064542e-09, "loss": 0.0, "num_input_tokens_seen": 232215024, "step": 107665 }, { "epoch": 19.75958891539732, "grad_norm": 9.819807019084692e-05, "learning_rate": 4.407605278398119e-09, "loss": 0.0, "num_input_tokens_seen": 232225968, "step": 107670 }, { "epoch": 19.760506514956873, "grad_norm": 6.869861681479961e-05, "learning_rate": 4.3740541059345e-09, "loss": 0.0, "num_input_tokens_seen": 232236016, "step": 107675 }, { "epoch": 19.761424114516426, "grad_norm": 4.684209488914348e-05, "learning_rate": 4.340631063533551e-09, "loss": 0.0, "num_input_tokens_seen": 232246384, "step": 107680 }, { "epoch": 19.762341714075976, "grad_norm": 4.495500979828648e-05, "learning_rate": 4.307336152052921e-09, "loss": 0.0, "num_input_tokens_seen": 232258160, "step": 107685 }, { "epoch": 19.76325931363553, "grad_norm": 5.397627319325693e-05, "learning_rate": 4.2741693723469255e-09, "loss": 0.0, "num_input_tokens_seen": 232268976, "step": 107690 }, { "epoch": 19.764176913195083, "grad_norm": 0.0018099015578627586, "learning_rate": 4.241130725265441e-09, "loss": 0.0, "num_input_tokens_seen": 232279408, "step": 107695 }, { "epoch": 19.765094512754633, "grad_norm": 5.12487422383856e-05, "learning_rate": 4.208220211656122e-09, "loss": 0.0, "num_input_tokens_seen": 232290416, "step": 107700 }, { "epoch": 19.766012112314186, "grad_norm": 5.43341193406377e-05, "learning_rate": 4.175437832363294e-09, "loss": 0.0, "num_input_tokens_seen": 232301616, "step": 107705 }, { "epoch": 19.76692971187374, "grad_norm": 0.00016162902466021478, "learning_rate": 4.14278358822795e-09, "loss": 0.0, "num_input_tokens_seen": 232311984, "step": 107710 }, { "epoch": 19.76784731143329, "grad_norm": 0.00035709404619410634, "learning_rate": 4.110257480086644e-09, "loss": 0.0, "num_input_tokens_seen": 232323152, "step": 107715 }, { "epoch": 19.768764910992843, "grad_norm": 0.00011063359852414578, "learning_rate": 4.077859508774817e-09, "loss": 0.0, "num_input_tokens_seen": 232334576, "step": 107720 }, { "epoch": 19.769682510552396, "grad_norm": 7.568094588350505e-05, "learning_rate": 4.045589675122919e-09, "loss": 0.0, "num_input_tokens_seen": 232345808, "step": 107725 }, { "epoch": 19.770600110111946, "grad_norm": 0.001003127545118332, "learning_rate": 4.013447979958618e-09, "loss": 0.0, "num_input_tokens_seen": 232356944, "step": 107730 }, { "epoch": 19.7715177096715, "grad_norm": 8.802199590718374e-05, "learning_rate": 3.981434424106256e-09, "loss": 0.0, "num_input_tokens_seen": 232368528, "step": 107735 }, { "epoch": 19.772435309231053, "grad_norm": 7.557071512565017e-05, "learning_rate": 3.949549008386844e-09, "loss": 0.0, "num_input_tokens_seen": 232378736, "step": 107740 }, { "epoch": 19.773352908790603, "grad_norm": 4.239321788190864e-05, "learning_rate": 3.917791733618614e-09, "loss": 0.0, "num_input_tokens_seen": 232389808, "step": 107745 }, { "epoch": 19.774270508350156, "grad_norm": 0.00015849921328481287, "learning_rate": 3.886162600615362e-09, "loss": 0.0, "num_input_tokens_seen": 232401392, "step": 107750 }, { "epoch": 19.77518810790971, "grad_norm": 0.00011218953295610845, "learning_rate": 3.854661610189214e-09, "loss": 0.0, "num_input_tokens_seen": 232413456, "step": 107755 }, { "epoch": 19.77610570746926, "grad_norm": 0.0004427574167493731, "learning_rate": 3.823288763147304e-09, "loss": 0.0, "num_input_tokens_seen": 232425008, "step": 107760 }, { "epoch": 19.777023307028813, "grad_norm": 0.0010916879400610924, "learning_rate": 3.792044060295097e-09, "loss": 0.0189, "num_input_tokens_seen": 232435952, "step": 107765 }, { "epoch": 19.777940906588366, "grad_norm": 0.00018350569007452577, "learning_rate": 3.760927502433065e-09, "loss": 0.0, "num_input_tokens_seen": 232448112, "step": 107770 }, { "epoch": 19.778858506147916, "grad_norm": 5.554862946155481e-05, "learning_rate": 3.729939090360013e-09, "loss": 0.0, "num_input_tokens_seen": 232457872, "step": 107775 }, { "epoch": 19.77977610570747, "grad_norm": 8.469614840578288e-05, "learning_rate": 3.699078824870306e-09, "loss": 0.0, "num_input_tokens_seen": 232470192, "step": 107780 }, { "epoch": 19.780693705267023, "grad_norm": 4.999136945116334e-05, "learning_rate": 3.6683467067560872e-09, "loss": 0.0, "num_input_tokens_seen": 232481776, "step": 107785 }, { "epoch": 19.781611304826573, "grad_norm": 0.00014475264470092952, "learning_rate": 3.637742736805061e-09, "loss": 0.0, "num_input_tokens_seen": 232492752, "step": 107790 }, { "epoch": 19.782528904386126, "grad_norm": 6.398391997208819e-05, "learning_rate": 3.6072669158021544e-09, "loss": 0.0, "num_input_tokens_seen": 232504528, "step": 107795 }, { "epoch": 19.78344650394568, "grad_norm": 7.06068822182715e-05, "learning_rate": 3.576919244528965e-09, "loss": 0.0, "num_input_tokens_seen": 232515728, "step": 107800 }, { "epoch": 19.78436410350523, "grad_norm": 0.004647620487958193, "learning_rate": 3.546699723764313e-09, "loss": 0.0, "num_input_tokens_seen": 232525264, "step": 107805 }, { "epoch": 19.785281703064783, "grad_norm": 0.00035328924423083663, "learning_rate": 3.51660835428258e-09, "loss": 0.0, "num_input_tokens_seen": 232536400, "step": 107810 }, { "epoch": 19.786199302624336, "grad_norm": 0.0002497912500984967, "learning_rate": 3.4866451368564812e-09, "loss": 0.0, "num_input_tokens_seen": 232546768, "step": 107815 }, { "epoch": 19.787116902183886, "grad_norm": 5.8939876907970756e-05, "learning_rate": 3.4568100722537358e-09, "loss": 0.0, "num_input_tokens_seen": 232556464, "step": 107820 }, { "epoch": 19.78803450174344, "grad_norm": 0.0002349492715438828, "learning_rate": 3.427103161240397e-09, "loss": 0.0, "num_input_tokens_seen": 232567504, "step": 107825 }, { "epoch": 19.788952101302993, "grad_norm": 8.31533907330595e-05, "learning_rate": 3.3975244045775234e-09, "loss": 0.0, "num_input_tokens_seen": 232578032, "step": 107830 }, { "epoch": 19.789869700862543, "grad_norm": 0.0005733907455578446, "learning_rate": 3.368073803023952e-09, "loss": 0.0, "num_input_tokens_seen": 232588688, "step": 107835 }, { "epoch": 19.790787300422096, "grad_norm": 6.252981984289363e-05, "learning_rate": 3.3387513573351902e-09, "loss": 0.0, "num_input_tokens_seen": 232599792, "step": 107840 }, { "epoch": 19.79170489998165, "grad_norm": 0.00028581751394085586, "learning_rate": 3.3095570682634136e-09, "loss": 0.0, "num_input_tokens_seen": 232609424, "step": 107845 }, { "epoch": 19.7926224995412, "grad_norm": 6.47936831228435e-05, "learning_rate": 3.2804909365574676e-09, "loss": 0.0, "num_input_tokens_seen": 232620688, "step": 107850 }, { "epoch": 19.793540099100753, "grad_norm": 4.777952199219726e-05, "learning_rate": 3.2515529629628674e-09, "loss": 0.0, "num_input_tokens_seen": 232630800, "step": 107855 }, { "epoch": 19.794457698660306, "grad_norm": 4.8023190174717456e-05, "learning_rate": 3.2227431482212413e-09, "loss": 0.0, "num_input_tokens_seen": 232641904, "step": 107860 }, { "epoch": 19.795375298219856, "grad_norm": 0.0007529498543590307, "learning_rate": 3.194061493071998e-09, "loss": 0.0, "num_input_tokens_seen": 232653616, "step": 107865 }, { "epoch": 19.79629289777941, "grad_norm": 9.327178850071505e-05, "learning_rate": 3.165507998251216e-09, "loss": 0.0, "num_input_tokens_seen": 232664240, "step": 107870 }, { "epoch": 19.797210497338963, "grad_norm": 0.00010989547445205972, "learning_rate": 3.1370826644899764e-09, "loss": 0.0, "num_input_tokens_seen": 232675120, "step": 107875 }, { "epoch": 19.798128096898512, "grad_norm": 4.415409057401121e-05, "learning_rate": 3.1087854925188067e-09, "loss": 0.0006, "num_input_tokens_seen": 232686448, "step": 107880 }, { "epoch": 19.799045696458066, "grad_norm": 0.00015755397907923907, "learning_rate": 3.080616483062682e-09, "loss": 0.0, "num_input_tokens_seen": 232695824, "step": 107885 }, { "epoch": 19.79996329601762, "grad_norm": 7.12845430825837e-05, "learning_rate": 3.052575636843802e-09, "loss": 0.0, "num_input_tokens_seen": 232706288, "step": 107890 }, { "epoch": 19.80088089557717, "grad_norm": 0.00011254091077717021, "learning_rate": 3.024662954582147e-09, "loss": 0.0, "num_input_tokens_seen": 232716080, "step": 107895 }, { "epoch": 19.801798495136723, "grad_norm": 0.00011779471969930455, "learning_rate": 2.9968784369932557e-09, "loss": 0.0, "num_input_tokens_seen": 232727280, "step": 107900 }, { "epoch": 19.802716094696276, "grad_norm": 0.0001634712389204651, "learning_rate": 2.969222084789891e-09, "loss": 0.0, "num_input_tokens_seen": 232737776, "step": 107905 }, { "epoch": 19.803633694255826, "grad_norm": 0.003072350984439254, "learning_rate": 2.9416938986814857e-09, "loss": 0.0, "num_input_tokens_seen": 232748944, "step": 107910 }, { "epoch": 19.80455129381538, "grad_norm": 0.003573861438781023, "learning_rate": 2.9142938793735862e-09, "loss": 0.0, "num_input_tokens_seen": 232759888, "step": 107915 }, { "epoch": 19.805468893374933, "grad_norm": 0.00363877578638494, "learning_rate": 2.887022027568964e-09, "loss": 0.0, "num_input_tokens_seen": 232771600, "step": 107920 }, { "epoch": 19.806386492934482, "grad_norm": 4.8567057092441246e-05, "learning_rate": 2.8598783439676147e-09, "loss": 0.0, "num_input_tokens_seen": 232783152, "step": 107925 }, { "epoch": 19.807304092494036, "grad_norm": 7.490635471185669e-05, "learning_rate": 2.8328628292656477e-09, "loss": 0.0, "num_input_tokens_seen": 232794672, "step": 107930 }, { "epoch": 19.80822169205359, "grad_norm": 0.00022786916815675795, "learning_rate": 2.805975484155843e-09, "loss": 0.0, "num_input_tokens_seen": 232805264, "step": 107935 }, { "epoch": 19.80913929161314, "grad_norm": 0.001148262177594006, "learning_rate": 2.779216309327648e-09, "loss": 0.0, "num_input_tokens_seen": 232816624, "step": 107940 }, { "epoch": 19.810056891172692, "grad_norm": 2.9669226933037862e-05, "learning_rate": 2.7525853054677367e-09, "loss": 0.0, "num_input_tokens_seen": 232827312, "step": 107945 }, { "epoch": 19.810974490732246, "grad_norm": 0.00012677631457336247, "learning_rate": 2.7260824732588954e-09, "loss": 0.0, "num_input_tokens_seen": 232837040, "step": 107950 }, { "epoch": 19.811892090291796, "grad_norm": 5.2248797146603465e-05, "learning_rate": 2.6997078133811363e-09, "loss": 0.0, "num_input_tokens_seen": 232848048, "step": 107955 }, { "epoch": 19.81280968985135, "grad_norm": 0.0001530270674265921, "learning_rate": 2.673461326510585e-09, "loss": 0.0, "num_input_tokens_seen": 232858768, "step": 107960 }, { "epoch": 19.813727289410902, "grad_norm": 0.0005103322910144925, "learning_rate": 2.6473430133205913e-09, "loss": 0.0, "num_input_tokens_seen": 232868208, "step": 107965 }, { "epoch": 19.814644888970452, "grad_norm": 0.00015655024617444724, "learning_rate": 2.6213528744811757e-09, "loss": 0.0, "num_input_tokens_seen": 232879440, "step": 107970 }, { "epoch": 19.815562488530006, "grad_norm": 6.345628935378045e-05, "learning_rate": 2.5954909106590266e-09, "loss": 0.0, "num_input_tokens_seen": 232890576, "step": 107975 }, { "epoch": 19.81648008808956, "grad_norm": 0.00012166154192527756, "learning_rate": 2.5697571225169473e-09, "loss": 0.0, "num_input_tokens_seen": 232900496, "step": 107980 }, { "epoch": 19.81739768764911, "grad_norm": 0.00014306198863778263, "learning_rate": 2.544151510714965e-09, "loss": 0.0, "num_input_tokens_seen": 232911152, "step": 107985 }, { "epoch": 19.818315287208662, "grad_norm": 0.0002583457971923053, "learning_rate": 2.5186740759108876e-09, "loss": 0.0, "num_input_tokens_seen": 232921200, "step": 107990 }, { "epoch": 19.819232886768216, "grad_norm": 6.328992458293214e-05, "learning_rate": 2.4933248187569703e-09, "loss": 0.0, "num_input_tokens_seen": 232931984, "step": 107995 }, { "epoch": 19.820150486327766, "grad_norm": 0.00048212215187959373, "learning_rate": 2.468103739903804e-09, "loss": 0.0, "num_input_tokens_seen": 232943632, "step": 108000 }, { "epoch": 19.82106808588732, "grad_norm": 0.0003297868825029582, "learning_rate": 2.4430108399986495e-09, "loss": 0.0, "num_input_tokens_seen": 232955824, "step": 108005 }, { "epoch": 19.821985685446872, "grad_norm": 3.895803092746064e-05, "learning_rate": 2.418046119684325e-09, "loss": 0.0, "num_input_tokens_seen": 232967152, "step": 108010 }, { "epoch": 19.822903285006422, "grad_norm": 0.0023275730200111866, "learning_rate": 2.3932095796014297e-09, "loss": 0.0, "num_input_tokens_seen": 232978288, "step": 108015 }, { "epoch": 19.823820884565976, "grad_norm": 4.691296271630563e-05, "learning_rate": 2.3685012203877867e-09, "loss": 0.0, "num_input_tokens_seen": 232988528, "step": 108020 }, { "epoch": 19.82473848412553, "grad_norm": 0.0013831579126417637, "learning_rate": 2.3439210426762225e-09, "loss": 0.0, "num_input_tokens_seen": 232999184, "step": 108025 }, { "epoch": 19.82565608368508, "grad_norm": 7.694066152907908e-05, "learning_rate": 2.319469047097345e-09, "loss": 0.0, "num_input_tokens_seen": 233010608, "step": 108030 }, { "epoch": 19.826573683244632, "grad_norm": 0.000101519312011078, "learning_rate": 2.2951452342784287e-09, "loss": 0.0, "num_input_tokens_seen": 233021072, "step": 108035 }, { "epoch": 19.827491282804186, "grad_norm": 4.368924783193506e-05, "learning_rate": 2.2709496048428647e-09, "loss": 0.0, "num_input_tokens_seen": 233031632, "step": 108040 }, { "epoch": 19.828408882363735, "grad_norm": 5.0821970944525674e-05, "learning_rate": 2.246882159411823e-09, "loss": 0.0, "num_input_tokens_seen": 233043184, "step": 108045 }, { "epoch": 19.82932648192329, "grad_norm": 0.0001816434523789212, "learning_rate": 2.222942898603142e-09, "loss": 0.0, "num_input_tokens_seen": 233053488, "step": 108050 }, { "epoch": 19.830244081482842, "grad_norm": 0.00013689561455976218, "learning_rate": 2.1991318230296655e-09, "loss": 0.0, "num_input_tokens_seen": 233065552, "step": 108055 }, { "epoch": 19.831161681042392, "grad_norm": 0.00035057502100244164, "learning_rate": 2.1754489333020156e-09, "loss": 0.0, "num_input_tokens_seen": 233076400, "step": 108060 }, { "epoch": 19.832079280601945, "grad_norm": 5.980389323667623e-05, "learning_rate": 2.15189423002915e-09, "loss": 0.0, "num_input_tokens_seen": 233086864, "step": 108065 }, { "epoch": 19.8329968801615, "grad_norm": 0.00019018496095668525, "learning_rate": 2.1284677138133648e-09, "loss": 0.0, "num_input_tokens_seen": 233096944, "step": 108070 }, { "epoch": 19.83391447972105, "grad_norm": 0.00012733372568618506, "learning_rate": 2.1051693852569555e-09, "loss": 0.0, "num_input_tokens_seen": 233107344, "step": 108075 }, { "epoch": 19.834832079280602, "grad_norm": 7.311480294447392e-05, "learning_rate": 2.081999244956667e-09, "loss": 0.0, "num_input_tokens_seen": 233116784, "step": 108080 }, { "epoch": 19.835749678840155, "grad_norm": 9.093289554584771e-05, "learning_rate": 2.0589572935070247e-09, "loss": 0.0, "num_input_tokens_seen": 233127440, "step": 108085 }, { "epoch": 19.836667278399705, "grad_norm": 0.0008848941652104259, "learning_rate": 2.036043531499221e-09, "loss": 0.0, "num_input_tokens_seen": 233138384, "step": 108090 }, { "epoch": 19.83758487795926, "grad_norm": 4.298644489608705e-05, "learning_rate": 2.0132579595205646e-09, "loss": 0.0, "num_input_tokens_seen": 233147824, "step": 108095 }, { "epoch": 19.838502477518812, "grad_norm": 0.00012273313768673688, "learning_rate": 1.990600578155588e-09, "loss": 0.0, "num_input_tokens_seen": 233158288, "step": 108100 }, { "epoch": 19.839420077078362, "grad_norm": 0.0004886295064352453, "learning_rate": 1.968071387986048e-09, "loss": 0.0, "num_input_tokens_seen": 233169616, "step": 108105 }, { "epoch": 19.840337676637915, "grad_norm": 0.00010253785876557231, "learning_rate": 1.9456703895887054e-09, "loss": 0.0, "num_input_tokens_seen": 233179696, "step": 108110 }, { "epoch": 19.84125527619747, "grad_norm": 0.00011577706027310342, "learning_rate": 1.9233975835386553e-09, "loss": 0.0, "num_input_tokens_seen": 233190704, "step": 108115 }, { "epoch": 19.84217287575702, "grad_norm": 5.958466499578208e-05, "learning_rate": 1.901252970407108e-09, "loss": 0.0, "num_input_tokens_seen": 233202736, "step": 108120 }, { "epoch": 19.843090475316572, "grad_norm": 8.356956823263317e-05, "learning_rate": 1.8792365507624975e-09, "loss": 0.0, "num_input_tokens_seen": 233214000, "step": 108125 }, { "epoch": 19.844008074876125, "grad_norm": 0.0008324837544932961, "learning_rate": 1.8573483251688173e-09, "loss": 0.0, "num_input_tokens_seen": 233224368, "step": 108130 }, { "epoch": 19.844925674435675, "grad_norm": 0.0020709065720438957, "learning_rate": 1.835588294187285e-09, "loss": 0.0, "num_input_tokens_seen": 233234544, "step": 108135 }, { "epoch": 19.84584327399523, "grad_norm": 5.1039813115494326e-05, "learning_rate": 1.8139564583768977e-09, "loss": 0.0, "num_input_tokens_seen": 233244848, "step": 108140 }, { "epoch": 19.846760873554782, "grad_norm": 0.0002631019160617143, "learning_rate": 1.792452818292212e-09, "loss": 0.0, "num_input_tokens_seen": 233256272, "step": 108145 }, { "epoch": 19.847678473114332, "grad_norm": 0.00022121038637124002, "learning_rate": 1.7710773744844533e-09, "loss": 0.0, "num_input_tokens_seen": 233267600, "step": 108150 }, { "epoch": 19.848596072673885, "grad_norm": 8.446793799521402e-05, "learning_rate": 1.7498301275020724e-09, "loss": 0.0, "num_input_tokens_seen": 233277968, "step": 108155 }, { "epoch": 19.84951367223344, "grad_norm": 5.8817204262595624e-05, "learning_rate": 1.7287110778896333e-09, "loss": 0.0, "num_input_tokens_seen": 233289488, "step": 108160 }, { "epoch": 19.85043127179299, "grad_norm": 0.0003088217054028064, "learning_rate": 1.7077202261894798e-09, "loss": 0.0, "num_input_tokens_seen": 233300368, "step": 108165 }, { "epoch": 19.851348871352542, "grad_norm": 9.07499561435543e-05, "learning_rate": 1.6868575729395154e-09, "loss": 0.0, "num_input_tokens_seen": 233310832, "step": 108170 }, { "epoch": 19.852266470912095, "grad_norm": 5.7046891015488654e-05, "learning_rate": 1.6661231186748673e-09, "loss": 0.0, "num_input_tokens_seen": 233321840, "step": 108175 }, { "epoch": 19.853184070471645, "grad_norm": 0.00011001677194144577, "learning_rate": 1.6455168639273322e-09, "loss": 0.0, "num_input_tokens_seen": 233331184, "step": 108180 }, { "epoch": 19.8541016700312, "grad_norm": 6.204216333571821e-05, "learning_rate": 1.6250388092259317e-09, "loss": 0.0, "num_input_tokens_seen": 233341840, "step": 108185 }, { "epoch": 19.855019269590752, "grad_norm": 5.64173205930274e-05, "learning_rate": 1.604688955095246e-09, "loss": 0.0, "num_input_tokens_seen": 233351920, "step": 108190 }, { "epoch": 19.8559368691503, "grad_norm": 6.992151611484587e-05, "learning_rate": 1.5844673020576351e-09, "loss": 0.0, "num_input_tokens_seen": 233362928, "step": 108195 }, { "epoch": 19.856854468709855, "grad_norm": 0.00011467295553302392, "learning_rate": 1.5643738506315731e-09, "loss": 0.0, "num_input_tokens_seen": 233373712, "step": 108200 }, { "epoch": 19.85777206826941, "grad_norm": 7.109736907295883e-05, "learning_rate": 1.5444086013327586e-09, "loss": 0.0, "num_input_tokens_seen": 233384656, "step": 108205 }, { "epoch": 19.85868966782896, "grad_norm": 0.00020570543711073697, "learning_rate": 1.5245715546724493e-09, "loss": 0.0, "num_input_tokens_seen": 233396496, "step": 108210 }, { "epoch": 19.859607267388512, "grad_norm": 0.0020651291124522686, "learning_rate": 1.5048627111602376e-09, "loss": 0.0, "num_input_tokens_seen": 233408048, "step": 108215 }, { "epoch": 19.860524866948065, "grad_norm": 0.00011218841973459348, "learning_rate": 1.48528207130072e-09, "loss": 0.0, "num_input_tokens_seen": 233418992, "step": 108220 }, { "epoch": 19.861442466507615, "grad_norm": 0.0009468509815633297, "learning_rate": 1.4658296355973822e-09, "loss": 0.0, "num_input_tokens_seen": 233429776, "step": 108225 }, { "epoch": 19.86236006606717, "grad_norm": 7.779199950164184e-05, "learning_rate": 1.4465054045481597e-09, "loss": 0.0, "num_input_tokens_seen": 233439376, "step": 108230 }, { "epoch": 19.863277665626722, "grad_norm": 0.0004469699051696807, "learning_rate": 1.427309378649322e-09, "loss": 0.0, "num_input_tokens_seen": 233449456, "step": 108235 }, { "epoch": 19.86419526518627, "grad_norm": 8.662663458380848e-05, "learning_rate": 1.408241558392698e-09, "loss": 0.0, "num_input_tokens_seen": 233459696, "step": 108240 }, { "epoch": 19.865112864745825, "grad_norm": 7.804617780493572e-05, "learning_rate": 1.3893019442678958e-09, "loss": 0.0, "num_input_tokens_seen": 233470384, "step": 108245 }, { "epoch": 19.86603046430538, "grad_norm": 0.000504668743815273, "learning_rate": 1.3704905367600829e-09, "loss": 0.0, "num_input_tokens_seen": 233481840, "step": 108250 }, { "epoch": 19.86694806386493, "grad_norm": 0.0001550045853946358, "learning_rate": 1.3518073363516515e-09, "loss": 0.0, "num_input_tokens_seen": 233493296, "step": 108255 }, { "epoch": 19.86786566342448, "grad_norm": 5.2166389650665224e-05, "learning_rate": 1.3332523435227728e-09, "loss": 0.0, "num_input_tokens_seen": 233504560, "step": 108260 }, { "epoch": 19.868783262984035, "grad_norm": 8.46188486320898e-05, "learning_rate": 1.3148255587486225e-09, "loss": 0.0, "num_input_tokens_seen": 233515280, "step": 108265 }, { "epoch": 19.869700862543585, "grad_norm": 4.413169153849594e-05, "learning_rate": 1.2965269825016002e-09, "loss": 0.0, "num_input_tokens_seen": 233526192, "step": 108270 }, { "epoch": 19.87061846210314, "grad_norm": 0.00010044759255833924, "learning_rate": 1.2783566152518856e-09, "loss": 0.0, "num_input_tokens_seen": 233536976, "step": 108275 }, { "epoch": 19.87153606166269, "grad_norm": 0.00035205489257350564, "learning_rate": 1.260314457464662e-09, "loss": 0.0, "num_input_tokens_seen": 233546288, "step": 108280 }, { "epoch": 19.87245366122224, "grad_norm": 7.678916881559417e-05, "learning_rate": 1.2424005096028925e-09, "loss": 0.0, "num_input_tokens_seen": 233557616, "step": 108285 }, { "epoch": 19.873371260781795, "grad_norm": 2.9936603823443875e-05, "learning_rate": 1.2246147721262092e-09, "loss": 0.0, "num_input_tokens_seen": 233568624, "step": 108290 }, { "epoch": 19.87428886034135, "grad_norm": 6.048694558558054e-05, "learning_rate": 1.2069572454909139e-09, "loss": 0.0, "num_input_tokens_seen": 233579792, "step": 108295 }, { "epoch": 19.875206459900898, "grad_norm": 0.00010474607552168891, "learning_rate": 1.1894279301499777e-09, "loss": 0.0, "num_input_tokens_seen": 233590896, "step": 108300 }, { "epoch": 19.87612405946045, "grad_norm": 5.3484043746721e-05, "learning_rate": 1.1720268265524858e-09, "loss": 0.0, "num_input_tokens_seen": 233601104, "step": 108305 }, { "epoch": 19.877041659020005, "grad_norm": 4.7493336751358584e-05, "learning_rate": 1.154753935144748e-09, "loss": 0.0, "num_input_tokens_seen": 233612112, "step": 108310 }, { "epoch": 19.877959258579555, "grad_norm": 0.00019952960428781807, "learning_rate": 1.137609256370298e-09, "loss": 0.0, "num_input_tokens_seen": 233622064, "step": 108315 }, { "epoch": 19.878876858139108, "grad_norm": 9.949654486263171e-05, "learning_rate": 1.1205927906687842e-09, "loss": 0.0, "num_input_tokens_seen": 233632880, "step": 108320 }, { "epoch": 19.87979445769866, "grad_norm": 6.0788661357946694e-05, "learning_rate": 1.1037045384765244e-09, "loss": 0.0, "num_input_tokens_seen": 233642896, "step": 108325 }, { "epoch": 19.88071205725821, "grad_norm": 0.00010257217945763841, "learning_rate": 1.0869445002265056e-09, "loss": 0.0, "num_input_tokens_seen": 233653328, "step": 108330 }, { "epoch": 19.881629656817765, "grad_norm": 8.786407124716789e-05, "learning_rate": 1.070312676348939e-09, "loss": 0.0, "num_input_tokens_seen": 233663536, "step": 108335 }, { "epoch": 19.88254725637732, "grad_norm": 8.112248178804293e-05, "learning_rate": 1.0538090672701506e-09, "loss": 0.0, "num_input_tokens_seen": 233674448, "step": 108340 }, { "epoch": 19.883464855936868, "grad_norm": 6.850489444332197e-05, "learning_rate": 1.0374336734131352e-09, "loss": 0.0, "num_input_tokens_seen": 233685456, "step": 108345 }, { "epoch": 19.88438245549642, "grad_norm": 0.00032845683745108545, "learning_rate": 1.0211864951986671e-09, "loss": 0.0, "num_input_tokens_seen": 233696848, "step": 108350 }, { "epoch": 19.885300055055975, "grad_norm": 0.00011129787162644789, "learning_rate": 1.0050675330430803e-09, "loss": 0.0, "num_input_tokens_seen": 233706800, "step": 108355 }, { "epoch": 19.886217654615525, "grad_norm": 0.0002331368305021897, "learning_rate": 9.890767873593777e-10, "loss": 0.0, "num_input_tokens_seen": 233718256, "step": 108360 }, { "epoch": 19.887135254175078, "grad_norm": 5.4480340622831136e-05, "learning_rate": 9.732142585583416e-10, "loss": 0.0, "num_input_tokens_seen": 233728912, "step": 108365 }, { "epoch": 19.88805285373463, "grad_norm": 8.555220847483724e-05, "learning_rate": 9.574799470463137e-10, "loss": 0.0, "num_input_tokens_seen": 233738640, "step": 108370 }, { "epoch": 19.88897045329418, "grad_norm": 6.214427412487566e-05, "learning_rate": 9.418738532274151e-10, "loss": 0.0, "num_input_tokens_seen": 233748720, "step": 108375 }, { "epoch": 19.889888052853735, "grad_norm": 0.00012422914733178914, "learning_rate": 9.263959775018816e-10, "loss": 0.0, "num_input_tokens_seen": 233759536, "step": 108380 }, { "epoch": 19.890805652413288, "grad_norm": 0.0003022407472599298, "learning_rate": 9.110463202660625e-10, "loss": 0.0, "num_input_tokens_seen": 233770480, "step": 108385 }, { "epoch": 19.891723251972838, "grad_norm": 5.0227539759362116e-05, "learning_rate": 8.958248819140869e-10, "loss": 0.0, "num_input_tokens_seen": 233780080, "step": 108390 }, { "epoch": 19.89264085153239, "grad_norm": 4.81502793263644e-05, "learning_rate": 8.807316628361984e-10, "loss": 0.0, "num_input_tokens_seen": 233790352, "step": 108395 }, { "epoch": 19.893558451091945, "grad_norm": 0.0011030785972252488, "learning_rate": 8.657666634193096e-10, "loss": 0.0, "num_input_tokens_seen": 233801072, "step": 108400 }, { "epoch": 19.894476050651495, "grad_norm": 0.0010953106684610248, "learning_rate": 8.509298840481128e-10, "loss": 0.0, "num_input_tokens_seen": 233811920, "step": 108405 }, { "epoch": 19.895393650211048, "grad_norm": 0.0013723776210099459, "learning_rate": 8.362213251023044e-10, "loss": 0.0, "num_input_tokens_seen": 233823760, "step": 108410 }, { "epoch": 19.8963112497706, "grad_norm": 0.00015313008043449372, "learning_rate": 8.2164098695936e-10, "loss": 0.0, "num_input_tokens_seen": 233836400, "step": 108415 }, { "epoch": 19.89722884933015, "grad_norm": 7.663480937480927e-05, "learning_rate": 8.07188869993425e-10, "loss": 0.0, "num_input_tokens_seen": 233847568, "step": 108420 }, { "epoch": 19.898146448889705, "grad_norm": 9.456997213419527e-05, "learning_rate": 7.928649745753136e-10, "loss": 0.0, "num_input_tokens_seen": 233858832, "step": 108425 }, { "epoch": 19.899064048449258, "grad_norm": 9.534077253192663e-05, "learning_rate": 7.786693010719548e-10, "loss": 0.0, "num_input_tokens_seen": 233869488, "step": 108430 }, { "epoch": 19.899981648008808, "grad_norm": 0.00011799211642937735, "learning_rate": 7.646018498475016e-10, "loss": 0.0, "num_input_tokens_seen": 233879408, "step": 108435 }, { "epoch": 19.90089924756836, "grad_norm": 7.868189277360216e-05, "learning_rate": 7.506626212627766e-10, "loss": 0.0001, "num_input_tokens_seen": 233889680, "step": 108440 }, { "epoch": 19.901816847127915, "grad_norm": 0.0014205507468432188, "learning_rate": 7.368516156758266e-10, "loss": 0.0, "num_input_tokens_seen": 233900240, "step": 108445 }, { "epoch": 19.902734446687464, "grad_norm": 0.0012528053484857082, "learning_rate": 7.231688334402576e-10, "loss": 0.0, "num_input_tokens_seen": 233911376, "step": 108450 }, { "epoch": 19.903652046247018, "grad_norm": 0.0001873708824859932, "learning_rate": 7.096142749074553e-10, "loss": 0.0, "num_input_tokens_seen": 233923152, "step": 108455 }, { "epoch": 19.90456964580657, "grad_norm": 0.0022310654167085886, "learning_rate": 6.961879404243643e-10, "loss": 0.0, "num_input_tokens_seen": 233933424, "step": 108460 }, { "epoch": 19.90548724536612, "grad_norm": 0.0003027090278919786, "learning_rate": 6.828898303362641e-10, "loss": 0.0, "num_input_tokens_seen": 233943728, "step": 108465 }, { "epoch": 19.906404844925675, "grad_norm": 5.376795161282644e-05, "learning_rate": 6.69719944983438e-10, "loss": 0.0, "num_input_tokens_seen": 233954992, "step": 108470 }, { "epoch": 19.907322444485228, "grad_norm": 4.803364208783023e-05, "learning_rate": 6.566782847045039e-10, "loss": 0.0, "num_input_tokens_seen": 233966352, "step": 108475 }, { "epoch": 19.908240044044778, "grad_norm": 0.00018366356380283833, "learning_rate": 6.43764849833084e-10, "loss": 0.0, "num_input_tokens_seen": 233977488, "step": 108480 }, { "epoch": 19.90915764360433, "grad_norm": 9.640889038564637e-05, "learning_rate": 6.309796407005797e-10, "loss": 0.0, "num_input_tokens_seen": 233988144, "step": 108485 }, { "epoch": 19.910075243163885, "grad_norm": 0.00018267688574269414, "learning_rate": 6.183226576356172e-10, "loss": 0.0, "num_input_tokens_seen": 233999600, "step": 108490 }, { "epoch": 19.910992842723434, "grad_norm": 9.366118320031092e-05, "learning_rate": 6.057939009623815e-10, "loss": 0.0, "num_input_tokens_seen": 234009360, "step": 108495 }, { "epoch": 19.911910442282988, "grad_norm": 4.121442179894075e-05, "learning_rate": 5.93393371001727e-10, "loss": 0.0, "num_input_tokens_seen": 234020400, "step": 108500 }, { "epoch": 19.91282804184254, "grad_norm": 0.00021859313710592687, "learning_rate": 5.811210680728429e-10, "loss": 0.0, "num_input_tokens_seen": 234030128, "step": 108505 }, { "epoch": 19.91374564140209, "grad_norm": 0.00013423558266367763, "learning_rate": 5.689769924893673e-10, "loss": 0.0, "num_input_tokens_seen": 234040336, "step": 108510 }, { "epoch": 19.914663240961644, "grad_norm": 0.00010092918091686442, "learning_rate": 5.569611445632728e-10, "loss": 0.0, "num_input_tokens_seen": 234050928, "step": 108515 }, { "epoch": 19.915580840521198, "grad_norm": 0.00016765542386565357, "learning_rate": 5.450735246026462e-10, "loss": 0.0, "num_input_tokens_seen": 234061840, "step": 108520 }, { "epoch": 19.916498440080748, "grad_norm": 0.00014205284242052585, "learning_rate": 5.333141329122438e-10, "loss": 0.0, "num_input_tokens_seen": 234072176, "step": 108525 }, { "epoch": 19.9174160396403, "grad_norm": 0.00013925238454248756, "learning_rate": 5.216829697940462e-10, "loss": 0.0, "num_input_tokens_seen": 234082864, "step": 108530 }, { "epoch": 19.918333639199854, "grad_norm": 0.00022193920449353755, "learning_rate": 5.101800355461483e-10, "loss": 0.0, "num_input_tokens_seen": 234093712, "step": 108535 }, { "epoch": 19.919251238759404, "grad_norm": 8.212921238737181e-05, "learning_rate": 4.988053304638696e-10, "loss": 0.0, "num_input_tokens_seen": 234105104, "step": 108540 }, { "epoch": 19.920168838318958, "grad_norm": 0.001019825809635222, "learning_rate": 4.875588548380883e-10, "loss": 0.0, "num_input_tokens_seen": 234115600, "step": 108545 }, { "epoch": 19.92108643787851, "grad_norm": 0.0008680210448801517, "learning_rate": 4.764406089585727e-10, "loss": 0.0, "num_input_tokens_seen": 234126128, "step": 108550 }, { "epoch": 19.92200403743806, "grad_norm": 5.385343320085667e-05, "learning_rate": 4.6545059310953986e-10, "loss": 0.0, "num_input_tokens_seen": 234136240, "step": 108555 }, { "epoch": 19.922921636997614, "grad_norm": 9.701929229777306e-05, "learning_rate": 4.5458880757298653e-10, "loss": 0.0, "num_input_tokens_seen": 234147280, "step": 108560 }, { "epoch": 19.923839236557168, "grad_norm": 6.0855032643303275e-05, "learning_rate": 4.438552526281337e-10, "loss": 0.0, "num_input_tokens_seen": 234157712, "step": 108565 }, { "epoch": 19.924756836116718, "grad_norm": 5.035777576267719e-05, "learning_rate": 4.3324992854920645e-10, "loss": 0.0, "num_input_tokens_seen": 234168080, "step": 108570 }, { "epoch": 19.92567443567627, "grad_norm": 0.00021454256784636527, "learning_rate": 4.227728356087646e-10, "loss": 0.0, "num_input_tokens_seen": 234179024, "step": 108575 }, { "epoch": 19.926592035235824, "grad_norm": 0.0010059673804789782, "learning_rate": 4.1242397407603717e-10, "loss": 0.0, "num_input_tokens_seen": 234190000, "step": 108580 }, { "epoch": 19.927509634795374, "grad_norm": 0.00015846005408093333, "learning_rate": 4.0220334421581244e-10, "loss": 0.0, "num_input_tokens_seen": 234200976, "step": 108585 }, { "epoch": 19.928427234354928, "grad_norm": 0.00010625104187056422, "learning_rate": 3.921109462901029e-10, "loss": 0.0, "num_input_tokens_seen": 234211952, "step": 108590 }, { "epoch": 19.92934483391448, "grad_norm": 0.00013900957128498703, "learning_rate": 3.821467805581458e-10, "loss": 0.0, "num_input_tokens_seen": 234223664, "step": 108595 }, { "epoch": 19.93026243347403, "grad_norm": 0.00042933301301673055, "learning_rate": 3.723108472758474e-10, "loss": 0.0, "num_input_tokens_seen": 234234800, "step": 108600 }, { "epoch": 19.931180033033584, "grad_norm": 0.00022087048273533583, "learning_rate": 3.626031466946733e-10, "loss": 0.0, "num_input_tokens_seen": 234246928, "step": 108605 }, { "epoch": 19.932097632593138, "grad_norm": 0.0002448798040859401, "learning_rate": 3.530236790638686e-10, "loss": 0.0, "num_input_tokens_seen": 234258384, "step": 108610 }, { "epoch": 19.933015232152687, "grad_norm": 7.026140519883484e-05, "learning_rate": 3.435724446299027e-10, "loss": 0.0, "num_input_tokens_seen": 234269520, "step": 108615 }, { "epoch": 19.93393283171224, "grad_norm": 5.552982838707976e-05, "learning_rate": 3.3424944363369405e-10, "loss": 0.0285, "num_input_tokens_seen": 234280112, "step": 108620 }, { "epoch": 19.934850431271794, "grad_norm": 0.00013446588127408177, "learning_rate": 3.250546763156059e-10, "loss": 0.0, "num_input_tokens_seen": 234290192, "step": 108625 }, { "epoch": 19.935768030831344, "grad_norm": 0.00017219212895724922, "learning_rate": 3.1598814291100563e-10, "loss": 0.0, "num_input_tokens_seen": 234299888, "step": 108630 }, { "epoch": 19.936685630390897, "grad_norm": 0.0007070529391057789, "learning_rate": 3.0704984365304e-10, "loss": 0.0, "num_input_tokens_seen": 234310032, "step": 108635 }, { "epoch": 19.93760322995045, "grad_norm": 5.568302367464639e-05, "learning_rate": 2.982397787698599e-10, "loss": 0.0, "num_input_tokens_seen": 234322896, "step": 108640 }, { "epoch": 19.93852082951, "grad_norm": 6.156768358778208e-05, "learning_rate": 2.895579484879507e-10, "loss": 0.0, "num_input_tokens_seen": 234333680, "step": 108645 }, { "epoch": 19.939438429069554, "grad_norm": 0.0027330087032169104, "learning_rate": 2.8100435303046737e-10, "loss": 0.0, "num_input_tokens_seen": 234344336, "step": 108650 }, { "epoch": 19.940356028629107, "grad_norm": 6.697984645143151e-05, "learning_rate": 2.725789926155686e-10, "loss": 0.0, "num_input_tokens_seen": 234354672, "step": 108655 }, { "epoch": 19.941273628188657, "grad_norm": 0.0007166539435274899, "learning_rate": 2.6428186746085827e-10, "loss": 0.0, "num_input_tokens_seen": 234364368, "step": 108660 }, { "epoch": 19.94219122774821, "grad_norm": 0.002227888209745288, "learning_rate": 2.5611297777838886e-10, "loss": 0.0, "num_input_tokens_seen": 234376016, "step": 108665 }, { "epoch": 19.943108827307764, "grad_norm": 6.136537558631971e-05, "learning_rate": 2.480723237774374e-10, "loss": 0.0, "num_input_tokens_seen": 234387120, "step": 108670 }, { "epoch": 19.944026426867314, "grad_norm": 8.319163316627964e-05, "learning_rate": 2.4015990566450543e-10, "loss": 0.0, "num_input_tokens_seen": 234398832, "step": 108675 }, { "epoch": 19.944944026426867, "grad_norm": 0.0006851259968243539, "learning_rate": 2.3237572364276374e-10, "loss": 0.0, "num_input_tokens_seen": 234409680, "step": 108680 }, { "epoch": 19.94586162598642, "grad_norm": 0.000255233229836449, "learning_rate": 2.2471977791149735e-10, "loss": 0.0, "num_input_tokens_seen": 234420080, "step": 108685 }, { "epoch": 19.94677922554597, "grad_norm": 0.00010389300587121397, "learning_rate": 2.1719206866721575e-10, "loss": 0.0, "num_input_tokens_seen": 234429744, "step": 108690 }, { "epoch": 19.947696825105524, "grad_norm": 8.338200859725475e-05, "learning_rate": 2.0979259610309776e-10, "loss": 0.0, "num_input_tokens_seen": 234438736, "step": 108695 }, { "epoch": 19.948614424665077, "grad_norm": 9.172565478365868e-05, "learning_rate": 2.0252136040899152e-10, "loss": 0.0, "num_input_tokens_seen": 234448816, "step": 108700 }, { "epoch": 19.949532024224627, "grad_norm": 6.385876622516662e-05, "learning_rate": 1.9537836177085934e-10, "loss": 0.0, "num_input_tokens_seen": 234460304, "step": 108705 }, { "epoch": 19.95044962378418, "grad_norm": 5.742858775192872e-05, "learning_rate": 1.8836360037244316e-10, "loss": 0.0, "num_input_tokens_seen": 234470608, "step": 108710 }, { "epoch": 19.951367223343734, "grad_norm": 9.125927317654714e-05, "learning_rate": 1.8147707639359913e-10, "loss": 0.0, "num_input_tokens_seen": 234481008, "step": 108715 }, { "epoch": 19.952284822903284, "grad_norm": 0.00048210026579909027, "learning_rate": 1.747187900108527e-10, "loss": 0.0, "num_input_tokens_seen": 234493200, "step": 108720 }, { "epoch": 19.953202422462837, "grad_norm": 3.906090933014639e-05, "learning_rate": 1.680887413973986e-10, "loss": 0.0, "num_input_tokens_seen": 234504368, "step": 108725 }, { "epoch": 19.95412002202239, "grad_norm": 7.792716496624053e-05, "learning_rate": 1.6158693072310106e-10, "loss": 0.0, "num_input_tokens_seen": 234513936, "step": 108730 }, { "epoch": 19.95503762158194, "grad_norm": 6.0337599279591814e-05, "learning_rate": 1.5521335815560367e-10, "loss": 0.0, "num_input_tokens_seen": 234524336, "step": 108735 }, { "epoch": 19.955955221141494, "grad_norm": 7.083160016918555e-05, "learning_rate": 1.4896802385755415e-10, "loss": 0.0, "num_input_tokens_seen": 234534608, "step": 108740 }, { "epoch": 19.956872820701047, "grad_norm": 0.001000792603008449, "learning_rate": 1.4285092798937972e-10, "loss": 0.0, "num_input_tokens_seen": 234546128, "step": 108745 }, { "epoch": 19.957790420260597, "grad_norm": 6.590512202819809e-05, "learning_rate": 1.3686207070817693e-10, "loss": 0.0, "num_input_tokens_seen": 234557488, "step": 108750 }, { "epoch": 19.95870801982015, "grad_norm": 0.0005649778177030385, "learning_rate": 1.3100145216715653e-10, "loss": 0.0, "num_input_tokens_seen": 234567600, "step": 108755 }, { "epoch": 19.959625619379704, "grad_norm": 0.0005028278101235628, "learning_rate": 1.252690725173089e-10, "loss": 0.0, "num_input_tokens_seen": 234576976, "step": 108760 }, { "epoch": 19.960543218939254, "grad_norm": 4.8429195885546505e-05, "learning_rate": 1.1966493190462836e-10, "loss": 0.0, "num_input_tokens_seen": 234587952, "step": 108765 }, { "epoch": 19.961460818498807, "grad_norm": 8.455033093923703e-05, "learning_rate": 1.1418903047399899e-10, "loss": 0.0, "num_input_tokens_seen": 234597968, "step": 108770 }, { "epoch": 19.96237841805836, "grad_norm": 6.369220500346273e-05, "learning_rate": 1.0884136836475378e-10, "loss": 0.0, "num_input_tokens_seen": 234609232, "step": 108775 }, { "epoch": 19.96329601761791, "grad_norm": 0.0007293576491065323, "learning_rate": 1.0362194571511552e-10, "loss": 0.0, "num_input_tokens_seen": 234622032, "step": 108780 }, { "epoch": 19.964213617177464, "grad_norm": 0.00012435612734407187, "learning_rate": 9.853076265831096e-11, "loss": 0.0, "num_input_tokens_seen": 234632368, "step": 108785 }, { "epoch": 19.965131216737017, "grad_norm": 6.51568770990707e-05, "learning_rate": 9.356781932479131e-11, "loss": 0.0, "num_input_tokens_seen": 234642384, "step": 108790 }, { "epoch": 19.966048816296567, "grad_norm": 0.0001283724996028468, "learning_rate": 8.87331158422322e-11, "loss": 0.0, "num_input_tokens_seen": 234653264, "step": 108795 }, { "epoch": 19.96696641585612, "grad_norm": 0.00015572032134514302, "learning_rate": 8.402665233442352e-11, "loss": 0.0, "num_input_tokens_seen": 234664336, "step": 108800 }, { "epoch": 19.967884015415674, "grad_norm": 6.976494478294626e-05, "learning_rate": 7.944842892237958e-11, "loss": 0.0, "num_input_tokens_seen": 234676048, "step": 108805 }, { "epoch": 19.968801614975224, "grad_norm": 9.768980089575052e-05, "learning_rate": 7.49984457232289e-11, "loss": 0.0, "num_input_tokens_seen": 234686000, "step": 108810 }, { "epoch": 19.969719214534777, "grad_norm": 0.00015274606994353235, "learning_rate": 7.067670285076933e-11, "loss": 0.0, "num_input_tokens_seen": 234697328, "step": 108815 }, { "epoch": 19.97063681409433, "grad_norm": 3.773428397835232e-05, "learning_rate": 6.64832004165783e-11, "loss": 0.0, "num_input_tokens_seen": 234707728, "step": 108820 }, { "epoch": 19.97155441365388, "grad_norm": 0.00010175124043598771, "learning_rate": 6.241793852834743e-11, "loss": 0.0, "num_input_tokens_seen": 234719600, "step": 108825 }, { "epoch": 19.972472013213434, "grad_norm": 0.0009300797246396542, "learning_rate": 5.848091728932748e-11, "loss": 0.0, "num_input_tokens_seen": 234729680, "step": 108830 }, { "epoch": 19.973389612772987, "grad_norm": 7.3655741289258e-05, "learning_rate": 5.4672136801103836e-11, "loss": 0.0, "num_input_tokens_seen": 234739696, "step": 108835 }, { "epoch": 19.974307212332537, "grad_norm": 0.00011770258424803615, "learning_rate": 5.099159716137614e-11, "loss": 0.0, "num_input_tokens_seen": 234750704, "step": 108840 }, { "epoch": 19.97522481189209, "grad_norm": 3.7277379306033254e-05, "learning_rate": 4.7439298464513337e-11, "loss": 0.0, "num_input_tokens_seen": 234761232, "step": 108845 }, { "epoch": 19.976142411451644, "grad_norm": 5.6870914704632014e-05, "learning_rate": 4.401524080210884e-11, "loss": 0.0, "num_input_tokens_seen": 234772304, "step": 108850 }, { "epoch": 19.977060011011194, "grad_norm": 0.00015625816013198346, "learning_rate": 4.0719424260760035e-11, "loss": 0.0, "num_input_tokens_seen": 234782704, "step": 108855 }, { "epoch": 19.977977610570747, "grad_norm": 5.315787348081358e-05, "learning_rate": 3.755184892595409e-11, "loss": 0.0, "num_input_tokens_seen": 234792112, "step": 108860 }, { "epoch": 19.9788952101303, "grad_norm": 0.0003517511358950287, "learning_rate": 3.45125148787373e-11, "loss": 0.0, "num_input_tokens_seen": 234802960, "step": 108865 }, { "epoch": 19.97981280968985, "grad_norm": 0.00010750824003480375, "learning_rate": 3.1601422197380383e-11, "loss": 0.0, "num_input_tokens_seen": 234812784, "step": 108870 }, { "epoch": 19.980730409249404, "grad_norm": 0.00044936142512597144, "learning_rate": 2.8818570955713166e-11, "loss": 0.0, "num_input_tokens_seen": 234823248, "step": 108875 }, { "epoch": 19.981648008808957, "grad_norm": 7.453910075128078e-05, "learning_rate": 2.616396122590015e-11, "loss": 0.0451, "num_input_tokens_seen": 234833392, "step": 108880 }, { "epoch": 19.982565608368507, "grad_norm": 0.00046602330985479057, "learning_rate": 2.3637593075664933e-11, "loss": 0.0, "num_input_tokens_seen": 234843024, "step": 108885 }, { "epoch": 19.98348320792806, "grad_norm": 6.770318577764556e-05, "learning_rate": 2.1239466569400458e-11, "loss": 0.0, "num_input_tokens_seen": 234854640, "step": 108890 }, { "epoch": 19.984400807487614, "grad_norm": 0.00034994614543393254, "learning_rate": 1.8969581769834324e-11, "loss": 0.0, "num_input_tokens_seen": 234864560, "step": 108895 }, { "epoch": 19.985318407047163, "grad_norm": 0.0001736355188768357, "learning_rate": 1.6827938733587902e-11, "loss": 0.0, "num_input_tokens_seen": 234876240, "step": 108900 }, { "epoch": 19.986236006606717, "grad_norm": 8.111532224575058e-05, "learning_rate": 1.4814537517282566e-11, "loss": 0.0, "num_input_tokens_seen": 234887024, "step": 108905 }, { "epoch": 19.98715360616627, "grad_norm": 5.460376269184053e-05, "learning_rate": 1.2929378170878359e-11, "loss": 0.0, "num_input_tokens_seen": 234897648, "step": 108910 }, { "epoch": 19.98807120572582, "grad_norm": 7.180396642070264e-05, "learning_rate": 1.1172460744335311e-11, "loss": 0.0, "num_input_tokens_seen": 234909776, "step": 108915 }, { "epoch": 19.988988805285373, "grad_norm": 8.425516716670245e-05, "learning_rate": 9.543785281507235e-12, "loss": 0.0, "num_input_tokens_seen": 234920688, "step": 108920 }, { "epoch": 19.989906404844927, "grad_norm": 0.00011527138849487528, "learning_rate": 8.043351824582601e-12, "loss": 0.0, "num_input_tokens_seen": 234931728, "step": 108925 }, { "epoch": 19.990824004404477, "grad_norm": 5.051324114901945e-05, "learning_rate": 6.67116041241922e-12, "loss": 0.0, "num_input_tokens_seen": 234944112, "step": 108930 }, { "epoch": 19.99174160396403, "grad_norm": 3.521583494148217e-05, "learning_rate": 5.427211079434002e-12, "loss": 0.0, "num_input_tokens_seen": 234954864, "step": 108935 }, { "epoch": 19.992659203523583, "grad_norm": 9.99123731162399e-05, "learning_rate": 4.31150385837853e-12, "loss": 0.0, "num_input_tokens_seen": 234966128, "step": 108940 }, { "epoch": 19.993576803083133, "grad_norm": 4.062967491336167e-05, "learning_rate": 3.324038777008376e-12, "loss": 0.0, "num_input_tokens_seen": 234977168, "step": 108945 }, { "epoch": 19.994494402642687, "grad_norm": 0.00025976018514484167, "learning_rate": 2.464815861413783e-12, "loss": 0.0, "num_input_tokens_seen": 234988208, "step": 108950 }, { "epoch": 19.99541200220224, "grad_norm": 8.218088623834774e-05, "learning_rate": 1.7338351332440994e-12, "loss": 0.0, "num_input_tokens_seen": 235000016, "step": 108955 }, { "epoch": 19.99632960176179, "grad_norm": 0.0025979841593652964, "learning_rate": 1.1310966108180055e-12, "loss": 0.0, "num_input_tokens_seen": 235010640, "step": 108960 }, { "epoch": 19.997247201321343, "grad_norm": 7.899785123299807e-05, "learning_rate": 6.566003107888464e-13, "loss": 0.0, "num_input_tokens_seen": 235021104, "step": 108965 }, { "epoch": 19.998164800880897, "grad_norm": 6.854497769381851e-05, "learning_rate": 3.103462442588523e-13, "loss": 0.0, "num_input_tokens_seen": 235032496, "step": 108970 }, { "epoch": 19.999082400440447, "grad_norm": 6.372784991981462e-05, "learning_rate": 9.233442066491904e-14, "loss": 0.0, "num_input_tokens_seen": 235044080, "step": 108975 }, { "epoch": 20.0, "grad_norm": 8.783608791418374e-05, "learning_rate": 2.564845003050209e-15, "loss": 0.0, "num_input_tokens_seen": 235053712, "step": 108980 }, { "epoch": 20.0, "eval_loss": 0.6625613570213318, "eval_runtime": 178.7859, "eval_samples_per_second": 30.478, "eval_steps_per_second": 7.624, "num_input_tokens_seen": 235053712, "step": 108980 }, { "epoch": 20.0, "num_input_tokens_seen": 235053712, "step": 108980, "total_flos": 1.0584362414603895e+19, "train_loss": 0.11991507525748572, "train_runtime": 34589.6479, "train_samples_per_second": 12.601, "train_steps_per_second": 3.151 } ], "logging_steps": 5, "max_steps": 108980, "num_input_tokens_seen": 235053712, "num_train_epochs": 20, "save_steps": 10898, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0584362414603895e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }