{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1431, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006988120195667365, "grad_norm": 20.01421639294812, "learning_rate": 0.0, "loss": 26.1109, "num_tokens": 7466.0, "step": 1 }, { "epoch": 0.001397624039133473, "grad_norm": 15.36930185770397, "learning_rate": 4.651162790697674e-06, "loss": 26.1453, "num_tokens": 14867.0, "step": 2 }, { "epoch": 0.0020964360587002098, "grad_norm": 17.05352336534915, "learning_rate": 9.302325581395349e-06, "loss": 26.0888, "num_tokens": 22758.0, "step": 3 }, { "epoch": 0.002795248078266946, "grad_norm": 19.04711212026682, "learning_rate": 1.3953488372093024e-05, "loss": 25.6631, "num_tokens": 30251.0, "step": 4 }, { "epoch": 0.0034940600978336828, "grad_norm": 28.812900832385424, "learning_rate": 1.8604651162790697e-05, "loss": 25.3928, "num_tokens": 37442.0, "step": 5 }, { "epoch": 0.0041928721174004195, "grad_norm": 63.640698917817346, "learning_rate": 2.3255813953488374e-05, "loss": 26.0258, "num_tokens": 44450.0, "step": 6 }, { "epoch": 0.004891684136967156, "grad_norm": 60.46179037964818, "learning_rate": 2.7906976744186048e-05, "loss": 25.6428, "num_tokens": 51596.0, "step": 7 }, { "epoch": 0.005590496156533892, "grad_norm": 31.396116729467284, "learning_rate": 3.2558139534883724e-05, "loss": 24.7731, "num_tokens": 58501.0, "step": 8 }, { "epoch": 0.006289308176100629, "grad_norm": 33.63353151516081, "learning_rate": 3.7209302325581394e-05, "loss": 23.8918, "num_tokens": 65085.0, "step": 9 }, { "epoch": 0.0069881201956673656, "grad_norm": 22.00249182717323, "learning_rate": 4.186046511627907e-05, "loss": 23.6714, "num_tokens": 72520.0, "step": 10 }, { "epoch": 0.007686932215234102, "grad_norm": 21.52192510277941, "learning_rate": 4.651162790697675e-05, "loss": 23.4993, "num_tokens": 79490.0, "step": 11 }, { "epoch": 0.008385744234800839, "grad_norm": 18.356839468525436, "learning_rate": 5.1162790697674425e-05, "loss": 23.458, "num_tokens": 86902.0, "step": 12 }, { "epoch": 0.009084556254367574, "grad_norm": 11.54924819578326, "learning_rate": 5.5813953488372095e-05, "loss": 22.736, "num_tokens": 92738.0, "step": 13 }, { "epoch": 0.009783368273934312, "grad_norm": 36.52460306155796, "learning_rate": 6.0465116279069765e-05, "loss": 23.2197, "num_tokens": 100060.0, "step": 14 }, { "epoch": 0.010482180293501049, "grad_norm": 12.549039640154193, "learning_rate": 6.511627906976745e-05, "loss": 21.9672, "num_tokens": 107649.0, "step": 15 }, { "epoch": 0.011180992313067784, "grad_norm": 6.65450263785363, "learning_rate": 6.976744186046513e-05, "loss": 21.4422, "num_tokens": 114327.0, "step": 16 }, { "epoch": 0.011879804332634521, "grad_norm": 26.60725050433419, "learning_rate": 7.441860465116279e-05, "loss": 21.7565, "num_tokens": 121566.0, "step": 17 }, { "epoch": 0.012578616352201259, "grad_norm": 14.909109212430055, "learning_rate": 7.906976744186047e-05, "loss": 21.4321, "num_tokens": 128982.0, "step": 18 }, { "epoch": 0.013277428371767994, "grad_norm": 8.850594729109151, "learning_rate": 8.372093023255814e-05, "loss": 21.2315, "num_tokens": 136536.0, "step": 19 }, { "epoch": 0.013976240391334731, "grad_norm": 4.64153452983114, "learning_rate": 8.837209302325582e-05, "loss": 20.9173, "num_tokens": 144292.0, "step": 20 }, { "epoch": 0.014675052410901468, "grad_norm": 7.1557479804922774, "learning_rate": 9.30232558139535e-05, "loss": 20.6413, "num_tokens": 151133.0, "step": 21 }, { "epoch": 0.015373864430468204, "grad_norm": 12.58835350216716, "learning_rate": 9.767441860465116e-05, "loss": 20.6017, "num_tokens": 158252.0, "step": 22 }, { "epoch": 0.01607267645003494, "grad_norm": 4.738505709948937, "learning_rate": 0.00010232558139534885, "loss": 20.2481, "num_tokens": 165457.0, "step": 23 }, { "epoch": 0.016771488469601678, "grad_norm": 3.7959176276381172, "learning_rate": 0.00010697674418604651, "loss": 20.0811, "num_tokens": 172785.0, "step": 24 }, { "epoch": 0.017470300489168415, "grad_norm": 11.694063836440657, "learning_rate": 0.00011162790697674419, "loss": 20.1969, "num_tokens": 179494.0, "step": 25 }, { "epoch": 0.01816911250873515, "grad_norm": 9.55869849357741, "learning_rate": 0.00011627906976744187, "loss": 20.2425, "num_tokens": 187501.0, "step": 26 }, { "epoch": 0.018867924528301886, "grad_norm": 4.6654736532723255, "learning_rate": 0.00012093023255813953, "loss": 20.1472, "num_tokens": 194486.0, "step": 27 }, { "epoch": 0.019566736547868623, "grad_norm": 3.543295429416166, "learning_rate": 0.0001255813953488372, "loss": 19.6634, "num_tokens": 201489.0, "step": 28 }, { "epoch": 0.02026554856743536, "grad_norm": 4.256858179676704, "learning_rate": 0.0001302325581395349, "loss": 19.6026, "num_tokens": 209354.0, "step": 29 }, { "epoch": 0.020964360587002098, "grad_norm": 6.097755497420057, "learning_rate": 0.00013488372093023256, "loss": 19.3001, "num_tokens": 216345.0, "step": 30 }, { "epoch": 0.02166317260656883, "grad_norm": 3.5691978934846538, "learning_rate": 0.00013953488372093025, "loss": 19.4532, "num_tokens": 223037.0, "step": 31 }, { "epoch": 0.02236198462613557, "grad_norm": 3.9279818551475314, "learning_rate": 0.00014418604651162791, "loss": 19.4291, "num_tokens": 230416.0, "step": 32 }, { "epoch": 0.023060796645702306, "grad_norm": 3.2960834272756525, "learning_rate": 0.00014883720930232558, "loss": 18.919, "num_tokens": 237115.0, "step": 33 }, { "epoch": 0.023759608665269043, "grad_norm": 3.334129318532873, "learning_rate": 0.00015348837209302327, "loss": 18.7862, "num_tokens": 244240.0, "step": 34 }, { "epoch": 0.02445842068483578, "grad_norm": 4.663518875645515, "learning_rate": 0.00015813953488372093, "loss": 18.9105, "num_tokens": 251723.0, "step": 35 }, { "epoch": 0.025157232704402517, "grad_norm": 3.0967131320916286, "learning_rate": 0.00016279069767441862, "loss": 18.3404, "num_tokens": 258362.0, "step": 36 }, { "epoch": 0.02585604472396925, "grad_norm": 3.47077766113365, "learning_rate": 0.00016744186046511629, "loss": 18.5825, "num_tokens": 265593.0, "step": 37 }, { "epoch": 0.026554856743535988, "grad_norm": 3.2117310830547514, "learning_rate": 0.00017209302325581395, "loss": 17.9212, "num_tokens": 272624.0, "step": 38 }, { "epoch": 0.027253668763102725, "grad_norm": 5.4619700154340665, "learning_rate": 0.00017674418604651164, "loss": 18.594, "num_tokens": 279181.0, "step": 39 }, { "epoch": 0.027952480782669462, "grad_norm": 3.404742927998481, "learning_rate": 0.0001813953488372093, "loss": 18.1244, "num_tokens": 285979.0, "step": 40 }, { "epoch": 0.0286512928022362, "grad_norm": 2.6009044708722584, "learning_rate": 0.000186046511627907, "loss": 18.0623, "num_tokens": 293070.0, "step": 41 }, { "epoch": 0.029350104821802937, "grad_norm": 3.109662691913911, "learning_rate": 0.00019069767441860466, "loss": 17.4661, "num_tokens": 299558.0, "step": 42 }, { "epoch": 0.03004891684136967, "grad_norm": 3.648517834649003, "learning_rate": 0.00019534883720930232, "loss": 17.6153, "num_tokens": 306733.0, "step": 43 }, { "epoch": 0.030747728860936407, "grad_norm": 4.72227718473326, "learning_rate": 0.0002, "loss": 17.4022, "num_tokens": 313625.0, "step": 44 }, { "epoch": 0.031446540880503145, "grad_norm": 3.624756224705375, "learning_rate": 0.00019999974385219888, "loss": 17.382, "num_tokens": 320835.0, "step": 45 }, { "epoch": 0.03214535290006988, "grad_norm": 3.4671960501465815, "learning_rate": 0.00019999897541010772, "loss": 16.9532, "num_tokens": 328348.0, "step": 46 }, { "epoch": 0.03284416491963662, "grad_norm": 3.103292868331405, "learning_rate": 0.00019999769467766323, "loss": 16.7449, "num_tokens": 335913.0, "step": 47 }, { "epoch": 0.033542976939203356, "grad_norm": 4.264418554780992, "learning_rate": 0.00019999590166142655, "loss": 16.7949, "num_tokens": 343205.0, "step": 48 }, { "epoch": 0.03424178895877009, "grad_norm": 2.9976124716737447, "learning_rate": 0.0001999935963705832, "loss": 16.4737, "num_tokens": 350261.0, "step": 49 }, { "epoch": 0.03494060097833683, "grad_norm": 5.4690748444855455, "learning_rate": 0.0001999907788169431, "loss": 16.8885, "num_tokens": 357367.0, "step": 50 }, { "epoch": 0.03563941299790356, "grad_norm": 3.093190984373506, "learning_rate": 0.00019998744901494049, "loss": 16.2166, "num_tokens": 364705.0, "step": 51 }, { "epoch": 0.0363382250174703, "grad_norm": 3.231011659915006, "learning_rate": 0.00019998360698163375, "loss": 16.3892, "num_tokens": 371422.0, "step": 52 }, { "epoch": 0.037037037037037035, "grad_norm": 3.267414032427895, "learning_rate": 0.00019997925273670543, "loss": 16.1599, "num_tokens": 378970.0, "step": 53 }, { "epoch": 0.03773584905660377, "grad_norm": 3.249781757856964, "learning_rate": 0.0001999743863024622, "loss": 16.0245, "num_tokens": 386189.0, "step": 54 }, { "epoch": 0.03843466107617051, "grad_norm": 2.7793521009997457, "learning_rate": 0.00019996900770383454, "loss": 16.2024, "num_tokens": 393815.0, "step": 55 }, { "epoch": 0.039133473095737246, "grad_norm": 2.3740220719738527, "learning_rate": 0.0001999631169683768, "loss": 15.9862, "num_tokens": 399923.0, "step": 56 }, { "epoch": 0.039832285115303984, "grad_norm": 2.504515937578137, "learning_rate": 0.0001999567141262669, "loss": 15.8262, "num_tokens": 406458.0, "step": 57 }, { "epoch": 0.04053109713487072, "grad_norm": 2.585577847048995, "learning_rate": 0.0001999497992103064, "loss": 15.8221, "num_tokens": 414710.0, "step": 58 }, { "epoch": 0.04122990915443746, "grad_norm": 4.077863543627188, "learning_rate": 0.00019994237225592012, "loss": 15.8001, "num_tokens": 422447.0, "step": 59 }, { "epoch": 0.041928721174004195, "grad_norm": 2.3612560688538227, "learning_rate": 0.00019993443330115592, "loss": 15.4257, "num_tokens": 428907.0, "step": 60 }, { "epoch": 0.04262753319357093, "grad_norm": 3.1342939019471863, "learning_rate": 0.0001999259823866848, "loss": 15.5804, "num_tokens": 436247.0, "step": 61 }, { "epoch": 0.04332634521313766, "grad_norm": 2.667474376403426, "learning_rate": 0.0001999170195558004, "loss": 15.2497, "num_tokens": 443409.0, "step": 62 }, { "epoch": 0.0440251572327044, "grad_norm": 2.94929141586954, "learning_rate": 0.0001999075448544189, "loss": 15.2836, "num_tokens": 450956.0, "step": 63 }, { "epoch": 0.04472396925227114, "grad_norm": 4.115184597341321, "learning_rate": 0.00019989755833107876, "loss": 15.0676, "num_tokens": 458417.0, "step": 64 }, { "epoch": 0.045422781271837874, "grad_norm": 2.2936726283661093, "learning_rate": 0.00019988706003694055, "loss": 15.1663, "num_tokens": 465196.0, "step": 65 }, { "epoch": 0.04612159329140461, "grad_norm": 3.263296116111796, "learning_rate": 0.00019987605002578653, "loss": 14.9832, "num_tokens": 472489.0, "step": 66 }, { "epoch": 0.04682040531097135, "grad_norm": 2.5788902845272443, "learning_rate": 0.0001998645283540205, "loss": 14.8966, "num_tokens": 479315.0, "step": 67 }, { "epoch": 0.047519217330538085, "grad_norm": 2.631994580040522, "learning_rate": 0.00019985249508066755, "loss": 15.1958, "num_tokens": 486906.0, "step": 68 }, { "epoch": 0.04821802935010482, "grad_norm": 2.5782733328645304, "learning_rate": 0.0001998399502673735, "loss": 14.796, "num_tokens": 493938.0, "step": 69 }, { "epoch": 0.04891684136967156, "grad_norm": 3.0213769951549816, "learning_rate": 0.00019982689397840496, "loss": 14.9338, "num_tokens": 500590.0, "step": 70 }, { "epoch": 0.0496156533892383, "grad_norm": 3.5678659064272087, "learning_rate": 0.00019981332628064865, "loss": 14.9063, "num_tokens": 508345.0, "step": 71 }, { "epoch": 0.050314465408805034, "grad_norm": 2.313425627629096, "learning_rate": 0.0001997992472436114, "loss": 14.7596, "num_tokens": 515272.0, "step": 72 }, { "epoch": 0.05101327742837177, "grad_norm": 2.4067823507135753, "learning_rate": 0.0001997846569394194, "loss": 14.7652, "num_tokens": 522104.0, "step": 73 }, { "epoch": 0.0517120894479385, "grad_norm": 2.2152061116996364, "learning_rate": 0.00019976955544281815, "loss": 14.6565, "num_tokens": 529458.0, "step": 74 }, { "epoch": 0.05241090146750524, "grad_norm": 3.2940129288988547, "learning_rate": 0.000199753942831172, "loss": 14.3745, "num_tokens": 537278.0, "step": 75 }, { "epoch": 0.053109713487071976, "grad_norm": 3.2872893577085933, "learning_rate": 0.0001997378191844636, "loss": 14.654, "num_tokens": 544549.0, "step": 76 }, { "epoch": 0.05380852550663871, "grad_norm": 3.4280215978452544, "learning_rate": 0.00019972118458529375, "loss": 14.2309, "num_tokens": 551068.0, "step": 77 }, { "epoch": 0.05450733752620545, "grad_norm": 2.064355698482165, "learning_rate": 0.00019970403911888078, "loss": 14.5744, "num_tokens": 558188.0, "step": 78 }, { "epoch": 0.05520614954577219, "grad_norm": 2.716813836457923, "learning_rate": 0.0001996863828730601, "loss": 14.4787, "num_tokens": 565805.0, "step": 79 }, { "epoch": 0.055904961565338925, "grad_norm": 2.4118756690009997, "learning_rate": 0.00019966821593828392, "loss": 14.2957, "num_tokens": 573381.0, "step": 80 }, { "epoch": 0.05660377358490566, "grad_norm": 2.4955272028998454, "learning_rate": 0.0001996495384076206, "loss": 14.1731, "num_tokens": 581350.0, "step": 81 }, { "epoch": 0.0573025856044724, "grad_norm": 2.3646224597467445, "learning_rate": 0.0001996303503767544, "loss": 14.1787, "num_tokens": 588473.0, "step": 82 }, { "epoch": 0.058001397624039136, "grad_norm": 2.9154207825942637, "learning_rate": 0.00019961065194398466, "loss": 14.0221, "num_tokens": 595623.0, "step": 83 }, { "epoch": 0.05870020964360587, "grad_norm": 2.144505504814399, "learning_rate": 0.00019959044321022563, "loss": 14.3507, "num_tokens": 602999.0, "step": 84 }, { "epoch": 0.0593990216631726, "grad_norm": 2.32919869196643, "learning_rate": 0.00019956972427900578, "loss": 13.9408, "num_tokens": 610543.0, "step": 85 }, { "epoch": 0.06009783368273934, "grad_norm": 2.744731912474738, "learning_rate": 0.00019954849525646726, "loss": 13.917, "num_tokens": 617203.0, "step": 86 }, { "epoch": 0.06079664570230608, "grad_norm": 3.0081451482892416, "learning_rate": 0.0001995267562513654, "loss": 14.4755, "num_tokens": 623602.0, "step": 87 }, { "epoch": 0.061495457721872815, "grad_norm": 2.0337097238095176, "learning_rate": 0.00019950450737506824, "loss": 14.0758, "num_tokens": 631160.0, "step": 88 }, { "epoch": 0.06219426974143955, "grad_norm": 1.872341236813854, "learning_rate": 0.00019948174874155573, "loss": 13.6134, "num_tokens": 638355.0, "step": 89 }, { "epoch": 0.06289308176100629, "grad_norm": 1.8676837839342457, "learning_rate": 0.00019945848046741934, "loss": 13.8425, "num_tokens": 645362.0, "step": 90 }, { "epoch": 0.06359189378057302, "grad_norm": 1.7615381904391172, "learning_rate": 0.00019943470267186144, "loss": 13.7478, "num_tokens": 652539.0, "step": 91 }, { "epoch": 0.06429070580013976, "grad_norm": 2.3029110752387307, "learning_rate": 0.00019941041547669465, "loss": 13.9631, "num_tokens": 659738.0, "step": 92 }, { "epoch": 0.0649895178197065, "grad_norm": 2.0875724707236114, "learning_rate": 0.0001993856190063412, "loss": 13.7651, "num_tokens": 667862.0, "step": 93 }, { "epoch": 0.06568832983927324, "grad_norm": 2.8116799014033287, "learning_rate": 0.00019936031338783225, "loss": 13.9233, "num_tokens": 675183.0, "step": 94 }, { "epoch": 0.06638714185883997, "grad_norm": 2.1242762146916303, "learning_rate": 0.00019933449875080746, "loss": 13.5429, "num_tokens": 682199.0, "step": 95 }, { "epoch": 0.06708595387840671, "grad_norm": 3.0916928086184865, "learning_rate": 0.00019930817522751401, "loss": 13.6249, "num_tokens": 689412.0, "step": 96 }, { "epoch": 0.06778476589797344, "grad_norm": 2.1525163646868517, "learning_rate": 0.0001992813429528062, "loss": 13.592, "num_tokens": 696608.0, "step": 97 }, { "epoch": 0.06848357791754019, "grad_norm": 2.5580695598792955, "learning_rate": 0.0001992540020641446, "loss": 13.4303, "num_tokens": 703838.0, "step": 98 }, { "epoch": 0.06918238993710692, "grad_norm": 2.2127195967055884, "learning_rate": 0.0001992261527015953, "loss": 13.6424, "num_tokens": 711432.0, "step": 99 }, { "epoch": 0.06988120195667366, "grad_norm": 2.092591387075435, "learning_rate": 0.00019919779500782948, "loss": 13.6159, "num_tokens": 717755.0, "step": 100 }, { "epoch": 0.07058001397624039, "grad_norm": 2.649063124260544, "learning_rate": 0.0001991689291281223, "loss": 13.7373, "num_tokens": 725381.0, "step": 101 }, { "epoch": 0.07127882599580712, "grad_norm": 2.058843564636829, "learning_rate": 0.00019913955521035234, "loss": 13.2791, "num_tokens": 732317.0, "step": 102 }, { "epoch": 0.07197763801537387, "grad_norm": 2.003927168402966, "learning_rate": 0.00019910967340500094, "loss": 13.5031, "num_tokens": 739043.0, "step": 103 }, { "epoch": 0.0726764500349406, "grad_norm": 1.7028539422821216, "learning_rate": 0.00019907928386515126, "loss": 13.4382, "num_tokens": 745729.0, "step": 104 }, { "epoch": 0.07337526205450734, "grad_norm": 2.2764500816492568, "learning_rate": 0.00019904838674648763, "loss": 13.3326, "num_tokens": 753195.0, "step": 105 }, { "epoch": 0.07407407407407407, "grad_norm": 2.09678454014234, "learning_rate": 0.00019901698220729458, "loss": 13.5748, "num_tokens": 759938.0, "step": 106 }, { "epoch": 0.07477288609364081, "grad_norm": 2.2370783473533984, "learning_rate": 0.00019898507040845616, "loss": 13.2876, "num_tokens": 767625.0, "step": 107 }, { "epoch": 0.07547169811320754, "grad_norm": 1.922904626284315, "learning_rate": 0.00019895265151345518, "loss": 13.1701, "num_tokens": 775131.0, "step": 108 }, { "epoch": 0.07617051013277429, "grad_norm": 1.6799896369866845, "learning_rate": 0.00019891972568837214, "loss": 13.2668, "num_tokens": 782395.0, "step": 109 }, { "epoch": 0.07686932215234102, "grad_norm": 2.3877041372388272, "learning_rate": 0.00019888629310188465, "loss": 13.3328, "num_tokens": 789064.0, "step": 110 }, { "epoch": 0.07756813417190776, "grad_norm": 1.9592142247636106, "learning_rate": 0.00019885235392526636, "loss": 13.2284, "num_tokens": 796698.0, "step": 111 }, { "epoch": 0.07826694619147449, "grad_norm": 1.9080524611085286, "learning_rate": 0.00019881790833238617, "loss": 13.2042, "num_tokens": 803919.0, "step": 112 }, { "epoch": 0.07896575821104122, "grad_norm": 2.24974818928377, "learning_rate": 0.00019878295649970734, "loss": 13.1838, "num_tokens": 810971.0, "step": 113 }, { "epoch": 0.07966457023060797, "grad_norm": 2.1506162194631515, "learning_rate": 0.0001987474986062866, "loss": 13.0353, "num_tokens": 817887.0, "step": 114 }, { "epoch": 0.0803633822501747, "grad_norm": 1.7308970262493222, "learning_rate": 0.00019871153483377315, "loss": 13.0944, "num_tokens": 824738.0, "step": 115 }, { "epoch": 0.08106219426974144, "grad_norm": 1.9315001985846878, "learning_rate": 0.0001986750653664078, "loss": 13.1709, "num_tokens": 832079.0, "step": 116 }, { "epoch": 0.08176100628930817, "grad_norm": 2.260956956052656, "learning_rate": 0.0001986380903910221, "loss": 13.1032, "num_tokens": 838908.0, "step": 117 }, { "epoch": 0.08245981830887492, "grad_norm": 1.5924533402089038, "learning_rate": 0.00019860061009703713, "loss": 13.1797, "num_tokens": 845348.0, "step": 118 }, { "epoch": 0.08315863032844165, "grad_norm": 2.1606192319469115, "learning_rate": 0.00019856262467646282, "loss": 13.2937, "num_tokens": 852162.0, "step": 119 }, { "epoch": 0.08385744234800839, "grad_norm": 1.5684395978061723, "learning_rate": 0.00019852413432389684, "loss": 13.061, "num_tokens": 860170.0, "step": 120 }, { "epoch": 0.08455625436757512, "grad_norm": 2.0070206310239476, "learning_rate": 0.00019848513923652358, "loss": 13.0771, "num_tokens": 867476.0, "step": 121 }, { "epoch": 0.08525506638714186, "grad_norm": 1.5728708315681226, "learning_rate": 0.00019844563961411309, "loss": 12.9597, "num_tokens": 874866.0, "step": 122 }, { "epoch": 0.0859538784067086, "grad_norm": 2.162003809520903, "learning_rate": 0.00019840563565902026, "loss": 13.1082, "num_tokens": 881774.0, "step": 123 }, { "epoch": 0.08665269042627533, "grad_norm": 1.829836068226425, "learning_rate": 0.00019836512757618355, "loss": 13.0226, "num_tokens": 888149.0, "step": 124 }, { "epoch": 0.08735150244584207, "grad_norm": 2.2023507815241103, "learning_rate": 0.00019832411557312414, "loss": 13.311, "num_tokens": 894693.0, "step": 125 }, { "epoch": 0.0880503144654088, "grad_norm": 2.0453068368799863, "learning_rate": 0.00019828259985994463, "loss": 13.273, "num_tokens": 901024.0, "step": 126 }, { "epoch": 0.08874912648497554, "grad_norm": 1.4939550415133291, "learning_rate": 0.00019824058064932831, "loss": 13.0516, "num_tokens": 908206.0, "step": 127 }, { "epoch": 0.08944793850454227, "grad_norm": 2.2979259766990654, "learning_rate": 0.00019819805815653768, "loss": 13.1368, "num_tokens": 914376.0, "step": 128 }, { "epoch": 0.09014675052410902, "grad_norm": 1.8419979568421898, "learning_rate": 0.00019815503259941358, "loss": 13.0085, "num_tokens": 921721.0, "step": 129 }, { "epoch": 0.09084556254367575, "grad_norm": 1.8302796577604992, "learning_rate": 0.0001981115041983741, "loss": 12.9987, "num_tokens": 928937.0, "step": 130 }, { "epoch": 0.09154437456324249, "grad_norm": 1.6088442418792948, "learning_rate": 0.0001980674731764133, "loss": 12.9823, "num_tokens": 936881.0, "step": 131 }, { "epoch": 0.09224318658280922, "grad_norm": 1.9971274822057858, "learning_rate": 0.00019802293975910016, "loss": 13.0976, "num_tokens": 943685.0, "step": 132 }, { "epoch": 0.09294199860237597, "grad_norm": 1.5298730715886686, "learning_rate": 0.00019797790417457742, "loss": 12.8484, "num_tokens": 950976.0, "step": 133 }, { "epoch": 0.0936408106219427, "grad_norm": 2.0586082579521334, "learning_rate": 0.0001979323666535604, "loss": 13.0251, "num_tokens": 957928.0, "step": 134 }, { "epoch": 0.09433962264150944, "grad_norm": 1.6761450760295233, "learning_rate": 0.00019788632742933585, "loss": 12.877, "num_tokens": 964412.0, "step": 135 }, { "epoch": 0.09503843466107617, "grad_norm": 1.6881768033105196, "learning_rate": 0.00019783978673776063, "loss": 12.8942, "num_tokens": 971468.0, "step": 136 }, { "epoch": 0.0957372466806429, "grad_norm": 1.7144893089190771, "learning_rate": 0.00019779274481726073, "loss": 12.9506, "num_tokens": 978459.0, "step": 137 }, { "epoch": 0.09643605870020965, "grad_norm": 1.4983030666694932, "learning_rate": 0.00019774520190882978, "loss": 12.9223, "num_tokens": 985920.0, "step": 138 }, { "epoch": 0.09713487071977638, "grad_norm": 1.6874127706486612, "learning_rate": 0.00019769715825602803, "loss": 12.8427, "num_tokens": 992764.0, "step": 139 }, { "epoch": 0.09783368273934312, "grad_norm": 1.6763957290933016, "learning_rate": 0.00019764861410498098, "loss": 12.9646, "num_tokens": 999854.0, "step": 140 }, { "epoch": 0.09853249475890985, "grad_norm": 1.9957992886089357, "learning_rate": 0.00019759956970437825, "loss": 12.8047, "num_tokens": 1006667.0, "step": 141 }, { "epoch": 0.0992313067784766, "grad_norm": 1.5867823770505232, "learning_rate": 0.00019755002530547208, "loss": 12.9937, "num_tokens": 1014089.0, "step": 142 }, { "epoch": 0.09993011879804332, "grad_norm": 1.6071217458716918, "learning_rate": 0.00019749998116207621, "loss": 13.0317, "num_tokens": 1020912.0, "step": 143 }, { "epoch": 0.10062893081761007, "grad_norm": 1.733309225841312, "learning_rate": 0.00019744943753056472, "loss": 12.6994, "num_tokens": 1028495.0, "step": 144 }, { "epoch": 0.1013277428371768, "grad_norm": 1.5096215182516033, "learning_rate": 0.0001973983946698703, "loss": 12.7572, "num_tokens": 1036332.0, "step": 145 }, { "epoch": 0.10202655485674354, "grad_norm": 1.683071966779659, "learning_rate": 0.0001973468528414833, "loss": 12.8509, "num_tokens": 1043895.0, "step": 146 }, { "epoch": 0.10272536687631027, "grad_norm": 1.7014370401459447, "learning_rate": 0.0001972948123094503, "loss": 12.8687, "num_tokens": 1051123.0, "step": 147 }, { "epoch": 0.103424178895877, "grad_norm": 1.3443197503129412, "learning_rate": 0.00019724227334037256, "loss": 12.8757, "num_tokens": 1058636.0, "step": 148 }, { "epoch": 0.10412299091544375, "grad_norm": 1.690543644382228, "learning_rate": 0.00019718923620340496, "loss": 12.8779, "num_tokens": 1065532.0, "step": 149 }, { "epoch": 0.10482180293501048, "grad_norm": 1.3035032802273938, "learning_rate": 0.00019713570117025443, "loss": 12.7007, "num_tokens": 1073092.0, "step": 150 }, { "epoch": 0.10552061495457722, "grad_norm": 1.5307507354350658, "learning_rate": 0.0001970816685151786, "loss": 12.6439, "num_tokens": 1081083.0, "step": 151 }, { "epoch": 0.10621942697414395, "grad_norm": 1.4929379554646116, "learning_rate": 0.00019702713851498435, "loss": 12.7074, "num_tokens": 1088405.0, "step": 152 }, { "epoch": 0.1069182389937107, "grad_norm": 1.4933612831212741, "learning_rate": 0.00019697211144902648, "loss": 12.6299, "num_tokens": 1095299.0, "step": 153 }, { "epoch": 0.10761705101327743, "grad_norm": 1.5975456969431756, "learning_rate": 0.00019691658759920624, "loss": 12.7551, "num_tokens": 1102272.0, "step": 154 }, { "epoch": 0.10831586303284417, "grad_norm": 1.6091046101499575, "learning_rate": 0.00019686056724996988, "loss": 12.8102, "num_tokens": 1108878.0, "step": 155 }, { "epoch": 0.1090146750524109, "grad_norm": 1.4718644833323113, "learning_rate": 0.00019680405068830717, "loss": 12.748, "num_tokens": 1116345.0, "step": 156 }, { "epoch": 0.10971348707197764, "grad_norm": 1.672327371288932, "learning_rate": 0.00019674703820374994, "loss": 12.7993, "num_tokens": 1123338.0, "step": 157 }, { "epoch": 0.11041229909154437, "grad_norm": 1.6851443027272985, "learning_rate": 0.0001966895300883707, "loss": 12.6513, "num_tokens": 1130116.0, "step": 158 }, { "epoch": 0.1111111111111111, "grad_norm": 1.3721921552177048, "learning_rate": 0.00019663152663678099, "loss": 12.4606, "num_tokens": 1137314.0, "step": 159 }, { "epoch": 0.11180992313067785, "grad_norm": 1.5450593769920604, "learning_rate": 0.0001965730281461299, "loss": 12.8374, "num_tokens": 1143586.0, "step": 160 }, { "epoch": 0.11250873515024458, "grad_norm": 1.4770180074283248, "learning_rate": 0.00019651403491610268, "loss": 12.6782, "num_tokens": 1150678.0, "step": 161 }, { "epoch": 0.11320754716981132, "grad_norm": 1.445408495118887, "learning_rate": 0.000196454547248919, "loss": 12.645, "num_tokens": 1158316.0, "step": 162 }, { "epoch": 0.11390635918937805, "grad_norm": 1.5275106147817101, "learning_rate": 0.00019639456544933155, "loss": 12.7599, "num_tokens": 1165236.0, "step": 163 }, { "epoch": 0.1146051712089448, "grad_norm": 1.7109304556766742, "learning_rate": 0.0001963340898246245, "loss": 12.838, "num_tokens": 1172589.0, "step": 164 }, { "epoch": 0.11530398322851153, "grad_norm": 1.3543533906158267, "learning_rate": 0.00019627312068461184, "loss": 12.7582, "num_tokens": 1179343.0, "step": 165 }, { "epoch": 0.11600279524807827, "grad_norm": 1.4744050532881612, "learning_rate": 0.00019621165834163572, "loss": 12.6345, "num_tokens": 1185779.0, "step": 166 }, { "epoch": 0.116701607267645, "grad_norm": 1.5081447035442486, "learning_rate": 0.00019614970311056503, "loss": 12.9426, "num_tokens": 1192364.0, "step": 167 }, { "epoch": 0.11740041928721175, "grad_norm": 1.3892660490129107, "learning_rate": 0.00019608725530879375, "loss": 12.66, "num_tokens": 1199385.0, "step": 168 }, { "epoch": 0.11809923130677848, "grad_norm": 1.5001836923777718, "learning_rate": 0.00019602431525623918, "loss": 12.8446, "num_tokens": 1206524.0, "step": 169 }, { "epoch": 0.1187980433263452, "grad_norm": 1.4450185187090752, "learning_rate": 0.00019596088327534047, "loss": 12.5973, "num_tokens": 1213487.0, "step": 170 }, { "epoch": 0.11949685534591195, "grad_norm": 1.3844082514866582, "learning_rate": 0.0001958969596910568, "loss": 12.6159, "num_tokens": 1220301.0, "step": 171 }, { "epoch": 0.12019566736547868, "grad_norm": 1.3862156732184288, "learning_rate": 0.000195832544830866, "loss": 12.5869, "num_tokens": 1227778.0, "step": 172 }, { "epoch": 0.12089447938504543, "grad_norm": 1.5410063676466346, "learning_rate": 0.00019576763902476242, "loss": 12.6891, "num_tokens": 1234261.0, "step": 173 }, { "epoch": 0.12159329140461216, "grad_norm": 1.5794782038580049, "learning_rate": 0.0001957022426052558, "loss": 12.5885, "num_tokens": 1241757.0, "step": 174 }, { "epoch": 0.1222921034241789, "grad_norm": 1.4130269812346519, "learning_rate": 0.00019563635590736901, "loss": 12.5449, "num_tokens": 1248424.0, "step": 175 }, { "epoch": 0.12299091544374563, "grad_norm": 1.4804916312885361, "learning_rate": 0.00019556997926863673, "loss": 12.8005, "num_tokens": 1255116.0, "step": 176 }, { "epoch": 0.12368972746331237, "grad_norm": 1.391666293716997, "learning_rate": 0.0001955031130291036, "loss": 12.6645, "num_tokens": 1262373.0, "step": 177 }, { "epoch": 0.1243885394828791, "grad_norm": 1.462524566865328, "learning_rate": 0.0001954357575313224, "loss": 12.7713, "num_tokens": 1268591.0, "step": 178 }, { "epoch": 0.12508735150244585, "grad_norm": 1.3533516804568375, "learning_rate": 0.0001953679131203524, "loss": 12.6721, "num_tokens": 1276309.0, "step": 179 }, { "epoch": 0.12578616352201258, "grad_norm": 1.6846564652870581, "learning_rate": 0.00019529958014375746, "loss": 12.581, "num_tokens": 1283604.0, "step": 180 }, { "epoch": 0.1264849755415793, "grad_norm": 1.3037737767137891, "learning_rate": 0.0001952307589516045, "loss": 12.6895, "num_tokens": 1290423.0, "step": 181 }, { "epoch": 0.12718378756114604, "grad_norm": 1.4763749163183653, "learning_rate": 0.00019516144989646143, "loss": 12.7782, "num_tokens": 1297162.0, "step": 182 }, { "epoch": 0.1278825995807128, "grad_norm": 1.198895326351823, "learning_rate": 0.00019509165333339551, "loss": 12.577, "num_tokens": 1304042.0, "step": 183 }, { "epoch": 0.12858141160027953, "grad_norm": 1.4329209092636808, "learning_rate": 0.0001950213696199714, "loss": 12.4852, "num_tokens": 1311266.0, "step": 184 }, { "epoch": 0.12928022361984626, "grad_norm": 1.6220658819801768, "learning_rate": 0.00019495059911624958, "loss": 12.7953, "num_tokens": 1317490.0, "step": 185 }, { "epoch": 0.129979035639413, "grad_norm": 1.36259479273472, "learning_rate": 0.00019487934218478413, "loss": 12.6933, "num_tokens": 1324708.0, "step": 186 }, { "epoch": 0.13067784765897975, "grad_norm": 1.5346998304400725, "learning_rate": 0.0001948075991906212, "loss": 12.46, "num_tokens": 1331506.0, "step": 187 }, { "epoch": 0.13137665967854648, "grad_norm": 1.5977678492934153, "learning_rate": 0.00019473537050129704, "loss": 12.5766, "num_tokens": 1338737.0, "step": 188 }, { "epoch": 0.1320754716981132, "grad_norm": 1.4504648749412852, "learning_rate": 0.00019466265648683602, "loss": 12.6044, "num_tokens": 1346238.0, "step": 189 }, { "epoch": 0.13277428371767994, "grad_norm": 1.6038070232816393, "learning_rate": 0.0001945894575197488, "loss": 12.5935, "num_tokens": 1353786.0, "step": 190 }, { "epoch": 0.1334730957372467, "grad_norm": 1.516022452074653, "learning_rate": 0.00019451577397503053, "loss": 12.6887, "num_tokens": 1360969.0, "step": 191 }, { "epoch": 0.13417190775681342, "grad_norm": 1.475254652217503, "learning_rate": 0.00019444160623015874, "loss": 12.7507, "num_tokens": 1368167.0, "step": 192 }, { "epoch": 0.13487071977638015, "grad_norm": 1.6428399957055764, "learning_rate": 0.00019436695466509152, "loss": 12.4319, "num_tokens": 1375092.0, "step": 193 }, { "epoch": 0.13556953179594688, "grad_norm": 1.8224392036810861, "learning_rate": 0.00019429181966226558, "loss": 12.3294, "num_tokens": 1383015.0, "step": 194 }, { "epoch": 0.13626834381551362, "grad_norm": 1.4142248070635162, "learning_rate": 0.00019421620160659417, "loss": 12.4263, "num_tokens": 1389785.0, "step": 195 }, { "epoch": 0.13696715583508037, "grad_norm": 1.7732304272767438, "learning_rate": 0.00019414010088546535, "loss": 12.4284, "num_tokens": 1397770.0, "step": 196 }, { "epoch": 0.1376659678546471, "grad_norm": 1.2887513644847417, "learning_rate": 0.00019406351788873972, "loss": 12.3058, "num_tokens": 1404674.0, "step": 197 }, { "epoch": 0.13836477987421383, "grad_norm": 1.915692503409536, "learning_rate": 0.00019398645300874865, "loss": 12.7271, "num_tokens": 1411618.0, "step": 198 }, { "epoch": 0.13906359189378056, "grad_norm": 1.5922007813255112, "learning_rate": 0.00019390890664029204, "loss": 12.4225, "num_tokens": 1418834.0, "step": 199 }, { "epoch": 0.13976240391334732, "grad_norm": 1.562568647357345, "learning_rate": 0.0001938308791806366, "loss": 12.587, "num_tokens": 1425877.0, "step": 200 }, { "epoch": 0.14046121593291405, "grad_norm": 1.6484034424059983, "learning_rate": 0.0001937523710295136, "loss": 12.5672, "num_tokens": 1432515.0, "step": 201 }, { "epoch": 0.14116002795248078, "grad_norm": 1.3489309672054464, "learning_rate": 0.00019367338258911675, "loss": 12.514, "num_tokens": 1439548.0, "step": 202 }, { "epoch": 0.1418588399720475, "grad_norm": 1.8825352386410237, "learning_rate": 0.0001935939142641004, "loss": 12.4288, "num_tokens": 1446322.0, "step": 203 }, { "epoch": 0.14255765199161424, "grad_norm": 1.3919561230568087, "learning_rate": 0.0001935139664615773, "loss": 12.6324, "num_tokens": 1453298.0, "step": 204 }, { "epoch": 0.143256464011181, "grad_norm": 1.6509748790652776, "learning_rate": 0.00019343353959111652, "loss": 12.4141, "num_tokens": 1460188.0, "step": 205 }, { "epoch": 0.14395527603074773, "grad_norm": 1.6488562651674301, "learning_rate": 0.00019335263406474137, "loss": 12.4702, "num_tokens": 1467199.0, "step": 206 }, { "epoch": 0.14465408805031446, "grad_norm": 1.5210417677317358, "learning_rate": 0.00019327125029692735, "loss": 12.4063, "num_tokens": 1474116.0, "step": 207 }, { "epoch": 0.1453529000698812, "grad_norm": 1.4513815903689755, "learning_rate": 0.00019318938870459984, "loss": 12.4471, "num_tokens": 1480988.0, "step": 208 }, { "epoch": 0.14605171208944795, "grad_norm": 1.478959568743595, "learning_rate": 0.00019310704970713224, "loss": 12.2195, "num_tokens": 1487900.0, "step": 209 }, { "epoch": 0.14675052410901468, "grad_norm": 1.3540930180495643, "learning_rate": 0.0001930242337263436, "loss": 12.4247, "num_tokens": 1495543.0, "step": 210 }, { "epoch": 0.1474493361285814, "grad_norm": 1.5214245330484042, "learning_rate": 0.00019294094118649653, "loss": 12.23, "num_tokens": 1502498.0, "step": 211 }, { "epoch": 0.14814814814814814, "grad_norm": 1.4560111174480097, "learning_rate": 0.00019285717251429506, "loss": 12.2885, "num_tokens": 1509580.0, "step": 212 }, { "epoch": 0.1488469601677149, "grad_norm": 1.4716461734559856, "learning_rate": 0.00019277292813888244, "loss": 12.3907, "num_tokens": 1516376.0, "step": 213 }, { "epoch": 0.14954577218728163, "grad_norm": 1.375026775365828, "learning_rate": 0.00019268820849183883, "loss": 12.4456, "num_tokens": 1523015.0, "step": 214 }, { "epoch": 0.15024458420684836, "grad_norm": 1.5686453771772464, "learning_rate": 0.00019260301400717938, "loss": 12.577, "num_tokens": 1530696.0, "step": 215 }, { "epoch": 0.1509433962264151, "grad_norm": 1.3279067290386655, "learning_rate": 0.00019251734512135157, "loss": 12.7059, "num_tokens": 1537893.0, "step": 216 }, { "epoch": 0.15164220824598182, "grad_norm": 1.2564753041744463, "learning_rate": 0.00019243120227323333, "loss": 12.2507, "num_tokens": 1545460.0, "step": 217 }, { "epoch": 0.15234102026554858, "grad_norm": 1.2500188377338248, "learning_rate": 0.00019234458590413077, "loss": 12.2926, "num_tokens": 1552764.0, "step": 218 }, { "epoch": 0.1530398322851153, "grad_norm": 1.4480919319809458, "learning_rate": 0.0001922574964577757, "loss": 12.4254, "num_tokens": 1559826.0, "step": 219 }, { "epoch": 0.15373864430468204, "grad_norm": 1.4659507852094718, "learning_rate": 0.0001921699343803235, "loss": 12.5645, "num_tokens": 1567575.0, "step": 220 }, { "epoch": 0.15443745632424877, "grad_norm": 1.3643705799081125, "learning_rate": 0.00019208190012035087, "loss": 12.3877, "num_tokens": 1574362.0, "step": 221 }, { "epoch": 0.15513626834381553, "grad_norm": 2.007483521518634, "learning_rate": 0.00019199339412885347, "loss": 12.2945, "num_tokens": 1581335.0, "step": 222 }, { "epoch": 0.15583508036338226, "grad_norm": 1.3421403438430553, "learning_rate": 0.00019190441685924353, "loss": 12.2394, "num_tokens": 1588536.0, "step": 223 }, { "epoch": 0.15653389238294899, "grad_norm": 1.619215765976526, "learning_rate": 0.00019181496876734776, "loss": 12.4262, "num_tokens": 1595480.0, "step": 224 }, { "epoch": 0.15723270440251572, "grad_norm": 1.2307496781017198, "learning_rate": 0.0001917250503114048, "loss": 12.4242, "num_tokens": 1602858.0, "step": 225 }, { "epoch": 0.15793151642208245, "grad_norm": 1.3116673091039746, "learning_rate": 0.0001916346619520629, "loss": 12.2034, "num_tokens": 1610558.0, "step": 226 }, { "epoch": 0.1586303284416492, "grad_norm": 1.247482468822277, "learning_rate": 0.00019154380415237768, "loss": 12.5231, "num_tokens": 1617490.0, "step": 227 }, { "epoch": 0.15932914046121593, "grad_norm": 1.2513001860850739, "learning_rate": 0.00019145247737780961, "loss": 12.3687, "num_tokens": 1624406.0, "step": 228 }, { "epoch": 0.16002795248078266, "grad_norm": 1.158869133135163, "learning_rate": 0.00019136068209622183, "loss": 12.3431, "num_tokens": 1631539.0, "step": 229 }, { "epoch": 0.1607267645003494, "grad_norm": 1.2794317571864204, "learning_rate": 0.00019126841877787745, "loss": 12.2967, "num_tokens": 1638417.0, "step": 230 }, { "epoch": 0.16142557651991615, "grad_norm": 1.147970460946765, "learning_rate": 0.00019117568789543742, "loss": 12.2909, "num_tokens": 1645769.0, "step": 231 }, { "epoch": 0.16212438853948288, "grad_norm": 1.2532108422981187, "learning_rate": 0.00019108248992395795, "loss": 12.3622, "num_tokens": 1653953.0, "step": 232 }, { "epoch": 0.1628232005590496, "grad_norm": 1.0938253419136699, "learning_rate": 0.0001909888253408882, "loss": 12.1525, "num_tokens": 1661632.0, "step": 233 }, { "epoch": 0.16352201257861634, "grad_norm": 1.2405546810649237, "learning_rate": 0.00019089469462606765, "loss": 12.2075, "num_tokens": 1668594.0, "step": 234 }, { "epoch": 0.1642208245981831, "grad_norm": 1.197216633614794, "learning_rate": 0.00019080009826172387, "loss": 12.257, "num_tokens": 1675137.0, "step": 235 }, { "epoch": 0.16491963661774983, "grad_norm": 1.309655057462355, "learning_rate": 0.00019070503673246982, "loss": 12.238, "num_tokens": 1682132.0, "step": 236 }, { "epoch": 0.16561844863731656, "grad_norm": 1.244523044571817, "learning_rate": 0.0001906095105253016, "loss": 12.5012, "num_tokens": 1688115.0, "step": 237 }, { "epoch": 0.1663172606568833, "grad_norm": 1.3805924558908977, "learning_rate": 0.00019051352012959568, "loss": 12.3363, "num_tokens": 1694968.0, "step": 238 }, { "epoch": 0.16701607267645002, "grad_norm": 1.3621136578810145, "learning_rate": 0.0001904170660371067, "loss": 12.5149, "num_tokens": 1702568.0, "step": 239 }, { "epoch": 0.16771488469601678, "grad_norm": 1.388178453568404, "learning_rate": 0.00019032014874196474, "loss": 12.4916, "num_tokens": 1709560.0, "step": 240 }, { "epoch": 0.1684136967155835, "grad_norm": 1.1942869510376484, "learning_rate": 0.0001902227687406728, "loss": 12.337, "num_tokens": 1717750.0, "step": 241 }, { "epoch": 0.16911250873515024, "grad_norm": 1.4017872903001107, "learning_rate": 0.0001901249265321044, "loss": 12.2873, "num_tokens": 1724736.0, "step": 242 }, { "epoch": 0.16981132075471697, "grad_norm": 1.2742005252701514, "learning_rate": 0.00019002662261750078, "loss": 12.2834, "num_tokens": 1732240.0, "step": 243 }, { "epoch": 0.17051013277428373, "grad_norm": 1.354069951288616, "learning_rate": 0.00018992785750046863, "loss": 12.1109, "num_tokens": 1739543.0, "step": 244 }, { "epoch": 0.17120894479385046, "grad_norm": 1.2292579561482964, "learning_rate": 0.00018982863168697734, "loss": 12.357, "num_tokens": 1746459.0, "step": 245 }, { "epoch": 0.1719077568134172, "grad_norm": 1.3113989556044794, "learning_rate": 0.00018972894568535634, "loss": 12.4115, "num_tokens": 1753478.0, "step": 246 }, { "epoch": 0.17260656883298392, "grad_norm": 1.2595357641289937, "learning_rate": 0.00018962880000629258, "loss": 12.0726, "num_tokens": 1760374.0, "step": 247 }, { "epoch": 0.17330538085255065, "grad_norm": 1.2704656650350132, "learning_rate": 0.0001895281951628281, "loss": 12.4336, "num_tokens": 1767809.0, "step": 248 }, { "epoch": 0.1740041928721174, "grad_norm": 1.244384651549712, "learning_rate": 0.000189427131670357, "loss": 12.4644, "num_tokens": 1775284.0, "step": 249 }, { "epoch": 0.17470300489168414, "grad_norm": 1.2916600144904333, "learning_rate": 0.00018932561004662312, "loss": 12.1551, "num_tokens": 1782896.0, "step": 250 }, { "epoch": 0.17540181691125087, "grad_norm": 1.323531415132343, "learning_rate": 0.00018922363081171723, "loss": 12.238, "num_tokens": 1790348.0, "step": 251 }, { "epoch": 0.1761006289308176, "grad_norm": 1.2454277559117337, "learning_rate": 0.0001891211944880746, "loss": 12.6069, "num_tokens": 1796663.0, "step": 252 }, { "epoch": 0.17679944095038436, "grad_norm": 1.155499185809833, "learning_rate": 0.00018901830160047184, "loss": 12.1616, "num_tokens": 1804575.0, "step": 253 }, { "epoch": 0.1774982529699511, "grad_norm": 1.49853696436634, "learning_rate": 0.0001889149526760248, "loss": 12.3159, "num_tokens": 1810818.0, "step": 254 }, { "epoch": 0.17819706498951782, "grad_norm": 1.2735723628571658, "learning_rate": 0.0001888111482441855, "loss": 12.1924, "num_tokens": 1817813.0, "step": 255 }, { "epoch": 0.17889587700908455, "grad_norm": 1.3127755468927709, "learning_rate": 0.00018870688883673936, "loss": 12.2746, "num_tokens": 1824365.0, "step": 256 }, { "epoch": 0.1795946890286513, "grad_norm": 1.4197695569247741, "learning_rate": 0.00018860217498780285, "loss": 12.3002, "num_tokens": 1831336.0, "step": 257 }, { "epoch": 0.18029350104821804, "grad_norm": 1.461090176993007, "learning_rate": 0.00018849700723382035, "loss": 12.1504, "num_tokens": 1838657.0, "step": 258 }, { "epoch": 0.18099231306778477, "grad_norm": 1.3059767534305093, "learning_rate": 0.0001883913861135617, "loss": 12.2029, "num_tokens": 1845965.0, "step": 259 }, { "epoch": 0.1816911250873515, "grad_norm": 1.4172408521615742, "learning_rate": 0.00018828531216811913, "loss": 12.479, "num_tokens": 1852638.0, "step": 260 }, { "epoch": 0.18238993710691823, "grad_norm": 1.4950969515194465, "learning_rate": 0.00018817878594090494, "loss": 12.3214, "num_tokens": 1859877.0, "step": 261 }, { "epoch": 0.18308874912648498, "grad_norm": 1.921065371477902, "learning_rate": 0.00018807180797764822, "loss": 12.2991, "num_tokens": 1866923.0, "step": 262 }, { "epoch": 0.18378756114605171, "grad_norm": 1.5413785930712542, "learning_rate": 0.00018796437882639242, "loss": 12.1631, "num_tokens": 1873292.0, "step": 263 }, { "epoch": 0.18448637316561844, "grad_norm": 2.058759919250719, "learning_rate": 0.00018785649903749234, "loss": 12.2534, "num_tokens": 1879744.0, "step": 264 }, { "epoch": 0.18518518518518517, "grad_norm": 1.3730183956055928, "learning_rate": 0.00018774816916361137, "loss": 12.2737, "num_tokens": 1886064.0, "step": 265 }, { "epoch": 0.18588399720475193, "grad_norm": 1.771295055388875, "learning_rate": 0.00018763938975971872, "loss": 12.0608, "num_tokens": 1893813.0, "step": 266 }, { "epoch": 0.18658280922431866, "grad_norm": 1.325037236296118, "learning_rate": 0.0001875301613830865, "loss": 12.6401, "num_tokens": 1901211.0, "step": 267 }, { "epoch": 0.1872816212438854, "grad_norm": 1.650441961527195, "learning_rate": 0.00018742048459328682, "loss": 12.253, "num_tokens": 1907987.0, "step": 268 }, { "epoch": 0.18798043326345212, "grad_norm": 1.3109663633913111, "learning_rate": 0.00018731035995218914, "loss": 12.4699, "num_tokens": 1915853.0, "step": 269 }, { "epoch": 0.18867924528301888, "grad_norm": 1.6175242871144049, "learning_rate": 0.00018719978802395705, "loss": 12.0339, "num_tokens": 1923310.0, "step": 270 }, { "epoch": 0.1893780573025856, "grad_norm": 1.5608096103087008, "learning_rate": 0.0001870887693750458, "loss": 12.0746, "num_tokens": 1930169.0, "step": 271 }, { "epoch": 0.19007686932215234, "grad_norm": 1.301318723039762, "learning_rate": 0.00018697730457419893, "loss": 12.2536, "num_tokens": 1937617.0, "step": 272 }, { "epoch": 0.19077568134171907, "grad_norm": 1.3298561548013452, "learning_rate": 0.00018686539419244578, "loss": 12.0688, "num_tokens": 1944358.0, "step": 273 }, { "epoch": 0.1914744933612858, "grad_norm": 1.3519869211393252, "learning_rate": 0.0001867530388030983, "loss": 12.1702, "num_tokens": 1951629.0, "step": 274 }, { "epoch": 0.19217330538085256, "grad_norm": 1.3057581412928303, "learning_rate": 0.00018664023898174817, "loss": 12.1388, "num_tokens": 1958779.0, "step": 275 }, { "epoch": 0.1928721174004193, "grad_norm": 1.4643939657595633, "learning_rate": 0.00018652699530626398, "loss": 12.0666, "num_tokens": 1966253.0, "step": 276 }, { "epoch": 0.19357092941998602, "grad_norm": 1.220622846026274, "learning_rate": 0.00018641330835678804, "loss": 12.2549, "num_tokens": 1973038.0, "step": 277 }, { "epoch": 0.19426974143955275, "grad_norm": 1.271130886700587, "learning_rate": 0.00018629917871573366, "loss": 12.3878, "num_tokens": 1980735.0, "step": 278 }, { "epoch": 0.1949685534591195, "grad_norm": 1.2558395545718568, "learning_rate": 0.0001861846069677819, "loss": 12.1311, "num_tokens": 1988250.0, "step": 279 }, { "epoch": 0.19566736547868624, "grad_norm": 1.286119199563044, "learning_rate": 0.00018606959369987883, "loss": 12.208, "num_tokens": 1995184.0, "step": 280 }, { "epoch": 0.19636617749825297, "grad_norm": 1.2007396708232814, "learning_rate": 0.00018595413950123235, "loss": 12.0446, "num_tokens": 2001901.0, "step": 281 }, { "epoch": 0.1970649895178197, "grad_norm": 1.2677570769488615, "learning_rate": 0.00018583824496330923, "loss": 12.3823, "num_tokens": 2009242.0, "step": 282 }, { "epoch": 0.19776380153738643, "grad_norm": 1.2317309257123488, "learning_rate": 0.00018572191067983216, "loss": 12.3145, "num_tokens": 2016167.0, "step": 283 }, { "epoch": 0.1984626135569532, "grad_norm": 1.4383245513671332, "learning_rate": 0.00018560513724677643, "loss": 12.2043, "num_tokens": 2023059.0, "step": 284 }, { "epoch": 0.19916142557651992, "grad_norm": 1.3973942304081446, "learning_rate": 0.00018548792526236732, "loss": 12.0703, "num_tokens": 2030297.0, "step": 285 }, { "epoch": 0.19986023759608665, "grad_norm": 1.6715503163582084, "learning_rate": 0.00018537027532707662, "loss": 12.2566, "num_tokens": 2036674.0, "step": 286 }, { "epoch": 0.20055904961565338, "grad_norm": 1.5807473618196455, "learning_rate": 0.00018525218804361977, "loss": 11.9821, "num_tokens": 2043766.0, "step": 287 }, { "epoch": 0.20125786163522014, "grad_norm": 1.3939701688797845, "learning_rate": 0.00018513366401695276, "loss": 12.2849, "num_tokens": 2051302.0, "step": 288 }, { "epoch": 0.20195667365478687, "grad_norm": 1.66288008309786, "learning_rate": 0.00018501470385426892, "loss": 12.1162, "num_tokens": 2058562.0, "step": 289 }, { "epoch": 0.2026554856743536, "grad_norm": 1.205066449864526, "learning_rate": 0.00018489530816499596, "loss": 12.1756, "num_tokens": 2065605.0, "step": 290 }, { "epoch": 0.20335429769392033, "grad_norm": 1.4415331634901984, "learning_rate": 0.00018477547756079276, "loss": 12.1293, "num_tokens": 2072590.0, "step": 291 }, { "epoch": 0.20405310971348709, "grad_norm": 1.2758727814346498, "learning_rate": 0.0001846552126555462, "loss": 12.2186, "num_tokens": 2080039.0, "step": 292 }, { "epoch": 0.20475192173305382, "grad_norm": 1.365806213254789, "learning_rate": 0.00018453451406536816, "loss": 12.3626, "num_tokens": 2086481.0, "step": 293 }, { "epoch": 0.20545073375262055, "grad_norm": 1.258317016689637, "learning_rate": 0.00018441338240859215, "loss": 12.2221, "num_tokens": 2093192.0, "step": 294 }, { "epoch": 0.20614954577218728, "grad_norm": 1.2394183274844288, "learning_rate": 0.00018429181830577034, "loss": 12.1013, "num_tokens": 2100572.0, "step": 295 }, { "epoch": 0.206848357791754, "grad_norm": 1.5003520290968377, "learning_rate": 0.00018416982237967028, "loss": 12.2991, "num_tokens": 2107911.0, "step": 296 }, { "epoch": 0.20754716981132076, "grad_norm": 1.3085255308847732, "learning_rate": 0.00018404739525527174, "loss": 12.0833, "num_tokens": 2115264.0, "step": 297 }, { "epoch": 0.2082459818308875, "grad_norm": 1.6056696953344411, "learning_rate": 0.0001839245375597635, "loss": 12.0227, "num_tokens": 2122114.0, "step": 298 }, { "epoch": 0.20894479385045422, "grad_norm": 1.306758409586964, "learning_rate": 0.0001838012499225401, "loss": 12.0232, "num_tokens": 2129186.0, "step": 299 }, { "epoch": 0.20964360587002095, "grad_norm": 1.5183083261581376, "learning_rate": 0.00018367753297519873, "loss": 12.3017, "num_tokens": 2136056.0, "step": 300 }, { "epoch": 0.2103424178895877, "grad_norm": 1.3471382535225356, "learning_rate": 0.00018355338735153587, "loss": 12.0467, "num_tokens": 2143135.0, "step": 301 }, { "epoch": 0.21104122990915444, "grad_norm": 1.2459910456351586, "learning_rate": 0.00018342881368754404, "loss": 12.2002, "num_tokens": 2149855.0, "step": 302 }, { "epoch": 0.21174004192872117, "grad_norm": 1.411018357188159, "learning_rate": 0.00018330381262140864, "loss": 12.4267, "num_tokens": 2156629.0, "step": 303 }, { "epoch": 0.2124388539482879, "grad_norm": 1.231215569953917, "learning_rate": 0.00018317838479350472, "loss": 12.3044, "num_tokens": 2163993.0, "step": 304 }, { "epoch": 0.21313766596785463, "grad_norm": 1.282732106655346, "learning_rate": 0.0001830525308463934, "loss": 12.1661, "num_tokens": 2170517.0, "step": 305 }, { "epoch": 0.2138364779874214, "grad_norm": 1.3735651142447436, "learning_rate": 0.00018292625142481906, "loss": 12.2508, "num_tokens": 2177805.0, "step": 306 }, { "epoch": 0.21453529000698812, "grad_norm": 1.1267279398187773, "learning_rate": 0.00018279954717570553, "loss": 12.246, "num_tokens": 2184824.0, "step": 307 }, { "epoch": 0.21523410202655485, "grad_norm": 1.297904898358193, "learning_rate": 0.00018267241874815314, "loss": 11.951, "num_tokens": 2192640.0, "step": 308 }, { "epoch": 0.21593291404612158, "grad_norm": 1.168111311692671, "learning_rate": 0.00018254486679343516, "loss": 12.1526, "num_tokens": 2199963.0, "step": 309 }, { "epoch": 0.21663172606568834, "grad_norm": 1.2350993727401407, "learning_rate": 0.00018241689196499475, "loss": 12.1094, "num_tokens": 2207388.0, "step": 310 }, { "epoch": 0.21733053808525507, "grad_norm": 1.2380916588446629, "learning_rate": 0.00018228849491844129, "loss": 11.9739, "num_tokens": 2214115.0, "step": 311 }, { "epoch": 0.2180293501048218, "grad_norm": 1.2229988857994327, "learning_rate": 0.00018215967631154717, "loss": 12.0549, "num_tokens": 2221801.0, "step": 312 }, { "epoch": 0.21872816212438853, "grad_norm": 1.4655555313500719, "learning_rate": 0.00018203043680424448, "loss": 12.127, "num_tokens": 2229449.0, "step": 313 }, { "epoch": 0.2194269741439553, "grad_norm": 1.2682868579019313, "learning_rate": 0.00018190077705862155, "loss": 12.2111, "num_tokens": 2236249.0, "step": 314 }, { "epoch": 0.22012578616352202, "grad_norm": 1.6151789147467688, "learning_rate": 0.00018177069773891953, "loss": 12.0597, "num_tokens": 2243354.0, "step": 315 }, { "epoch": 0.22082459818308875, "grad_norm": 1.3508337349304373, "learning_rate": 0.00018164019951152902, "loss": 12.1405, "num_tokens": 2249837.0, "step": 316 }, { "epoch": 0.22152341020265548, "grad_norm": 1.760603025999751, "learning_rate": 0.00018150928304498675, "loss": 12.0609, "num_tokens": 2256520.0, "step": 317 }, { "epoch": 0.2222222222222222, "grad_norm": 1.305934795669062, "learning_rate": 0.00018137794900997201, "loss": 12.2233, "num_tokens": 2263145.0, "step": 318 }, { "epoch": 0.22292103424178897, "grad_norm": 1.491378308966807, "learning_rate": 0.0001812461980793033, "loss": 12.1678, "num_tokens": 2269862.0, "step": 319 }, { "epoch": 0.2236198462613557, "grad_norm": 1.2632967458884938, "learning_rate": 0.0001811140309279348, "loss": 12.3329, "num_tokens": 2276820.0, "step": 320 }, { "epoch": 0.22431865828092243, "grad_norm": 1.1158838193265348, "learning_rate": 0.00018098144823295304, "loss": 11.9781, "num_tokens": 2284748.0, "step": 321 }, { "epoch": 0.22501747030048916, "grad_norm": 1.2505380919755098, "learning_rate": 0.00018084845067357336, "loss": 12.0788, "num_tokens": 2292140.0, "step": 322 }, { "epoch": 0.22571628232005592, "grad_norm": 1.0921267435771704, "learning_rate": 0.00018071503893113638, "loss": 12.2769, "num_tokens": 2300108.0, "step": 323 }, { "epoch": 0.22641509433962265, "grad_norm": 1.3414524658246134, "learning_rate": 0.00018058121368910458, "loss": 11.8736, "num_tokens": 2307811.0, "step": 324 }, { "epoch": 0.22711390635918938, "grad_norm": 1.3336139939898337, "learning_rate": 0.00018044697563305876, "loss": 11.9168, "num_tokens": 2315254.0, "step": 325 }, { "epoch": 0.2278127183787561, "grad_norm": 1.2424706438980715, "learning_rate": 0.00018031232545069468, "loss": 12.1282, "num_tokens": 2323116.0, "step": 326 }, { "epoch": 0.22851153039832284, "grad_norm": 1.189509985734354, "learning_rate": 0.00018017726383181925, "loss": 12.1013, "num_tokens": 2330812.0, "step": 327 }, { "epoch": 0.2292103424178896, "grad_norm": 1.201526626317861, "learning_rate": 0.0001800417914683471, "loss": 12.1022, "num_tokens": 2338451.0, "step": 328 }, { "epoch": 0.22990915443745633, "grad_norm": 1.0841395533719858, "learning_rate": 0.0001799059090542974, "loss": 12.1923, "num_tokens": 2346026.0, "step": 329 }, { "epoch": 0.23060796645702306, "grad_norm": 1.0543640932645288, "learning_rate": 0.00017976961728578963, "loss": 12.0118, "num_tokens": 2353605.0, "step": 330 }, { "epoch": 0.23130677847658979, "grad_norm": 1.0221140744809603, "learning_rate": 0.00017963291686104053, "loss": 12.0598, "num_tokens": 2360509.0, "step": 331 }, { "epoch": 0.23200559049615654, "grad_norm": 1.1077266853542385, "learning_rate": 0.00017949580848036046, "loss": 12.2089, "num_tokens": 2367035.0, "step": 332 }, { "epoch": 0.23270440251572327, "grad_norm": 1.1017231412164032, "learning_rate": 0.00017935829284614952, "loss": 12.0369, "num_tokens": 2373702.0, "step": 333 }, { "epoch": 0.23340321453529, "grad_norm": 1.156329815157838, "learning_rate": 0.00017922037066289432, "loss": 12.1458, "num_tokens": 2380174.0, "step": 334 }, { "epoch": 0.23410202655485673, "grad_norm": 1.1589595989908332, "learning_rate": 0.0001790820426371641, "loss": 12.0406, "num_tokens": 2387052.0, "step": 335 }, { "epoch": 0.2348008385744235, "grad_norm": 1.0708490484699373, "learning_rate": 0.00017894330947760726, "loss": 11.8914, "num_tokens": 2393866.0, "step": 336 }, { "epoch": 0.23549965059399022, "grad_norm": 1.2056352526942165, "learning_rate": 0.0001788041718949477, "loss": 12.0097, "num_tokens": 2401040.0, "step": 337 }, { "epoch": 0.23619846261355695, "grad_norm": 1.034857406947483, "learning_rate": 0.00017866463060198115, "loss": 12.2169, "num_tokens": 2408627.0, "step": 338 }, { "epoch": 0.23689727463312368, "grad_norm": 1.1237746507948383, "learning_rate": 0.00017852468631357146, "loss": 12.1159, "num_tokens": 2415390.0, "step": 339 }, { "epoch": 0.2375960866526904, "grad_norm": 1.0771012425887634, "learning_rate": 0.00017838433974664712, "loss": 11.9807, "num_tokens": 2422275.0, "step": 340 }, { "epoch": 0.23829489867225717, "grad_norm": 1.0775272381287269, "learning_rate": 0.00017824359162019738, "loss": 12.039, "num_tokens": 2429408.0, "step": 341 }, { "epoch": 0.2389937106918239, "grad_norm": 1.157562166665061, "learning_rate": 0.00017810244265526875, "loss": 12.0734, "num_tokens": 2436362.0, "step": 342 }, { "epoch": 0.23969252271139063, "grad_norm": 1.141745416472308, "learning_rate": 0.00017796089357496108, "loss": 12.1806, "num_tokens": 2442668.0, "step": 343 }, { "epoch": 0.24039133473095736, "grad_norm": 1.1565392135388017, "learning_rate": 0.0001778189451044242, "loss": 12.1268, "num_tokens": 2449013.0, "step": 344 }, { "epoch": 0.24109014675052412, "grad_norm": 1.2199413936237042, "learning_rate": 0.00017767659797085375, "loss": 12.0651, "num_tokens": 2455131.0, "step": 345 }, { "epoch": 0.24178895877009085, "grad_norm": 1.1260507638707287, "learning_rate": 0.000177533852903488, "loss": 12.1853, "num_tokens": 2462060.0, "step": 346 }, { "epoch": 0.24248777078965758, "grad_norm": 1.0316001161122876, "learning_rate": 0.0001773907106336035, "loss": 12.1543, "num_tokens": 2469590.0, "step": 347 }, { "epoch": 0.2431865828092243, "grad_norm": 1.0821146376156172, "learning_rate": 0.0001772471718945119, "loss": 11.9286, "num_tokens": 2476488.0, "step": 348 }, { "epoch": 0.24388539482879107, "grad_norm": 1.0314206907032888, "learning_rate": 0.0001771032374215558, "loss": 11.9985, "num_tokens": 2484657.0, "step": 349 }, { "epoch": 0.2445842068483578, "grad_norm": 1.1661975155295445, "learning_rate": 0.00017695890795210517, "loss": 12.2751, "num_tokens": 2491489.0, "step": 350 }, { "epoch": 0.24528301886792453, "grad_norm": 1.006621569495116, "learning_rate": 0.00017681418422555356, "loss": 12.1682, "num_tokens": 2498631.0, "step": 351 }, { "epoch": 0.24598183088749126, "grad_norm": 1.017213858202063, "learning_rate": 0.00017666906698331428, "loss": 11.8183, "num_tokens": 2506017.0, "step": 352 }, { "epoch": 0.246680642907058, "grad_norm": 1.1601560219424385, "learning_rate": 0.00017652355696881652, "loss": 12.0538, "num_tokens": 2513168.0, "step": 353 }, { "epoch": 0.24737945492662475, "grad_norm": 1.1274194045715678, "learning_rate": 0.0001763776549275017, "loss": 12.0499, "num_tokens": 2520301.0, "step": 354 }, { "epoch": 0.24807826694619148, "grad_norm": 1.3076210428356354, "learning_rate": 0.00017623136160681963, "loss": 12.0843, "num_tokens": 2527172.0, "step": 355 }, { "epoch": 0.2487770789657582, "grad_norm": 1.129509433616736, "learning_rate": 0.00017608467775622445, "loss": 12.1999, "num_tokens": 2534008.0, "step": 356 }, { "epoch": 0.24947589098532494, "grad_norm": 1.1713262910481799, "learning_rate": 0.00017593760412717117, "loss": 12.1523, "num_tokens": 2541533.0, "step": 357 }, { "epoch": 0.2501747030048917, "grad_norm": 1.26537646556665, "learning_rate": 0.0001757901414731115, "loss": 12.2059, "num_tokens": 2548063.0, "step": 358 }, { "epoch": 0.2508735150244584, "grad_norm": 1.0858488934936994, "learning_rate": 0.00017564229054949006, "loss": 11.9977, "num_tokens": 2555500.0, "step": 359 }, { "epoch": 0.25157232704402516, "grad_norm": 1.2437942191749563, "learning_rate": 0.0001754940521137407, "loss": 12.0343, "num_tokens": 2562431.0, "step": 360 }, { "epoch": 0.2522711390635919, "grad_norm": 1.1822227089400206, "learning_rate": 0.0001753454269252824, "loss": 12.1325, "num_tokens": 2569063.0, "step": 361 }, { "epoch": 0.2529699510831586, "grad_norm": 1.224962109373223, "learning_rate": 0.00017519641574551546, "loss": 12.2002, "num_tokens": 2576257.0, "step": 362 }, { "epoch": 0.25366876310272535, "grad_norm": 1.205685504194018, "learning_rate": 0.0001750470193378176, "loss": 12.1142, "num_tokens": 2582076.0, "step": 363 }, { "epoch": 0.2543675751222921, "grad_norm": 1.2821801879030204, "learning_rate": 0.00017489723846754002, "loss": 12.2827, "num_tokens": 2588634.0, "step": 364 }, { "epoch": 0.25506638714185886, "grad_norm": 1.1426775318667641, "learning_rate": 0.0001747470739020036, "loss": 11.867, "num_tokens": 2595988.0, "step": 365 }, { "epoch": 0.2557651991614256, "grad_norm": 1.1637204489941761, "learning_rate": 0.00017459652641049474, "loss": 12.1299, "num_tokens": 2602882.0, "step": 366 }, { "epoch": 0.2564640111809923, "grad_norm": 1.1749766201575815, "learning_rate": 0.0001744455967642616, "loss": 12.0572, "num_tokens": 2610428.0, "step": 367 }, { "epoch": 0.25716282320055905, "grad_norm": 1.161910310989543, "learning_rate": 0.00017429428573651024, "loss": 11.7402, "num_tokens": 2617338.0, "step": 368 }, { "epoch": 0.2578616352201258, "grad_norm": 1.1754220027439937, "learning_rate": 0.00017414259410240026, "loss": 11.9926, "num_tokens": 2624172.0, "step": 369 }, { "epoch": 0.2585604472396925, "grad_norm": 1.2221271919493757, "learning_rate": 0.0001739905226390413, "loss": 11.9574, "num_tokens": 2631185.0, "step": 370 }, { "epoch": 0.25925925925925924, "grad_norm": 1.121957071755484, "learning_rate": 0.0001738380721254888, "loss": 12.0634, "num_tokens": 2638925.0, "step": 371 }, { "epoch": 0.259958071278826, "grad_norm": 1.0439041189495724, "learning_rate": 0.00017368524334273998, "loss": 12.0201, "num_tokens": 2645812.0, "step": 372 }, { "epoch": 0.2606568832983927, "grad_norm": 1.1527899155597479, "learning_rate": 0.00017353203707373, "loss": 12.2003, "num_tokens": 2652935.0, "step": 373 }, { "epoch": 0.2613556953179595, "grad_norm": 1.1907694393429749, "learning_rate": 0.00017337845410332782, "loss": 12.0194, "num_tokens": 2659882.0, "step": 374 }, { "epoch": 0.2620545073375262, "grad_norm": 1.1925500897396009, "learning_rate": 0.0001732244952183323, "loss": 11.985, "num_tokens": 2667013.0, "step": 375 }, { "epoch": 0.26275331935709295, "grad_norm": 1.1065412669686558, "learning_rate": 0.000173070161207468, "loss": 11.9913, "num_tokens": 2675131.0, "step": 376 }, { "epoch": 0.2634521313766597, "grad_norm": 1.1800802673651714, "learning_rate": 0.00017291545286138126, "loss": 12.0599, "num_tokens": 2681743.0, "step": 377 }, { "epoch": 0.2641509433962264, "grad_norm": 1.1658749096956924, "learning_rate": 0.00017276037097263612, "loss": 12.0414, "num_tokens": 2688355.0, "step": 378 }, { "epoch": 0.26484975541579314, "grad_norm": 1.3106109156886867, "learning_rate": 0.00017260491633571033, "loss": 11.9744, "num_tokens": 2695315.0, "step": 379 }, { "epoch": 0.2655485674353599, "grad_norm": 1.025449480192329, "learning_rate": 0.0001724490897469911, "loss": 11.9174, "num_tokens": 2703258.0, "step": 380 }, { "epoch": 0.2662473794549266, "grad_norm": 1.2314285772644886, "learning_rate": 0.00017229289200477123, "loss": 11.9577, "num_tokens": 2710326.0, "step": 381 }, { "epoch": 0.2669461914744934, "grad_norm": 1.117575231956547, "learning_rate": 0.00017213632390924486, "loss": 12.0226, "num_tokens": 2716825.0, "step": 382 }, { "epoch": 0.2676450034940601, "grad_norm": 1.2030793581305907, "learning_rate": 0.00017197938626250348, "loss": 12.0668, "num_tokens": 2723868.0, "step": 383 }, { "epoch": 0.26834381551362685, "grad_norm": 1.1807906081601707, "learning_rate": 0.00017182207986853176, "loss": 12.1037, "num_tokens": 2730711.0, "step": 384 }, { "epoch": 0.2690426275331936, "grad_norm": 1.1626829031353831, "learning_rate": 0.00017166440553320337, "loss": 11.8767, "num_tokens": 2737540.0, "step": 385 }, { "epoch": 0.2697414395527603, "grad_norm": 1.1958111650969308, "learning_rate": 0.0001715063640642771, "loss": 11.9674, "num_tokens": 2744595.0, "step": 386 }, { "epoch": 0.27044025157232704, "grad_norm": 1.0938579012100453, "learning_rate": 0.00017134795627139236, "loss": 12.0369, "num_tokens": 2751663.0, "step": 387 }, { "epoch": 0.27113906359189377, "grad_norm": 1.1147811767994924, "learning_rate": 0.00017118918296606537, "loss": 11.9541, "num_tokens": 2759081.0, "step": 388 }, { "epoch": 0.2718378756114605, "grad_norm": 1.0946017427329167, "learning_rate": 0.00017103004496168473, "loss": 11.9995, "num_tokens": 2766249.0, "step": 389 }, { "epoch": 0.27253668763102723, "grad_norm": 1.1630648910235608, "learning_rate": 0.0001708705430735075, "loss": 12.1063, "num_tokens": 2773320.0, "step": 390 }, { "epoch": 0.273235499650594, "grad_norm": 0.9921673795864713, "learning_rate": 0.00017071067811865476, "loss": 12.0095, "num_tokens": 2780968.0, "step": 391 }, { "epoch": 0.27393431167016075, "grad_norm": 1.1524595718563635, "learning_rate": 0.0001705504509161077, "loss": 11.9414, "num_tokens": 2788319.0, "step": 392 }, { "epoch": 0.2746331236897275, "grad_norm": 1.0585603726235264, "learning_rate": 0.00017038986228670323, "loss": 12.0465, "num_tokens": 2796120.0, "step": 393 }, { "epoch": 0.2753319357092942, "grad_norm": 1.0331878827292054, "learning_rate": 0.00017022891305312987, "loss": 11.8823, "num_tokens": 2803198.0, "step": 394 }, { "epoch": 0.27603074772886094, "grad_norm": 1.1409979908427963, "learning_rate": 0.00017006760403992337, "loss": 11.9414, "num_tokens": 2809978.0, "step": 395 }, { "epoch": 0.27672955974842767, "grad_norm": 1.040919985039819, "learning_rate": 0.00016990593607346276, "loss": 11.9296, "num_tokens": 2816932.0, "step": 396 }, { "epoch": 0.2774283717679944, "grad_norm": 1.0716219841086227, "learning_rate": 0.00016974390998196595, "loss": 11.8656, "num_tokens": 2824857.0, "step": 397 }, { "epoch": 0.2781271837875611, "grad_norm": 0.9753445323621813, "learning_rate": 0.00016958152659548548, "loss": 11.8725, "num_tokens": 2832029.0, "step": 398 }, { "epoch": 0.27882599580712786, "grad_norm": 1.127571712658559, "learning_rate": 0.00016941878674590425, "loss": 11.9015, "num_tokens": 2839252.0, "step": 399 }, { "epoch": 0.27952480782669464, "grad_norm": 1.1137834248990268, "learning_rate": 0.00016925569126693136, "loss": 11.8403, "num_tokens": 2847177.0, "step": 400 }, { "epoch": 0.2802236198462614, "grad_norm": 1.1431199419756541, "learning_rate": 0.0001690922409940978, "loss": 11.9518, "num_tokens": 2854561.0, "step": 401 }, { "epoch": 0.2809224318658281, "grad_norm": 1.1270384163061071, "learning_rate": 0.00016892843676475212, "loss": 11.8864, "num_tokens": 2861654.0, "step": 402 }, { "epoch": 0.28162124388539483, "grad_norm": 0.9948092143767864, "learning_rate": 0.00016876427941805622, "loss": 11.9772, "num_tokens": 2868359.0, "step": 403 }, { "epoch": 0.28232005590496156, "grad_norm": 1.0080698098570233, "learning_rate": 0.00016859976979498092, "loss": 11.8542, "num_tokens": 2875713.0, "step": 404 }, { "epoch": 0.2830188679245283, "grad_norm": 1.06487609830136, "learning_rate": 0.00016843490873830178, "loss": 11.8649, "num_tokens": 2882164.0, "step": 405 }, { "epoch": 0.283717679944095, "grad_norm": 1.0694632227640828, "learning_rate": 0.00016826969709259477, "loss": 12.224, "num_tokens": 2888466.0, "step": 406 }, { "epoch": 0.28441649196366176, "grad_norm": 1.0329263366216357, "learning_rate": 0.0001681041357042319, "loss": 11.8856, "num_tokens": 2896054.0, "step": 407 }, { "epoch": 0.2851153039832285, "grad_norm": 0.9866874389611432, "learning_rate": 0.0001679382254213768, "loss": 12.0074, "num_tokens": 2903381.0, "step": 408 }, { "epoch": 0.28581411600279527, "grad_norm": 1.0547830834870289, "learning_rate": 0.00016777196709398065, "loss": 12.066, "num_tokens": 2910764.0, "step": 409 }, { "epoch": 0.286512928022362, "grad_norm": 1.1391779846002652, "learning_rate": 0.00016760536157377754, "loss": 11.895, "num_tokens": 2917852.0, "step": 410 }, { "epoch": 0.28721174004192873, "grad_norm": 1.0177173312443486, "learning_rate": 0.00016743840971428017, "loss": 11.837, "num_tokens": 2925426.0, "step": 411 }, { "epoch": 0.28791055206149546, "grad_norm": 1.2162307058502813, "learning_rate": 0.00016727111237077559, "loss": 11.9744, "num_tokens": 2932534.0, "step": 412 }, { "epoch": 0.2886093640810622, "grad_norm": 1.0264120107199801, "learning_rate": 0.00016710347040032076, "loss": 11.9857, "num_tokens": 2939544.0, "step": 413 }, { "epoch": 0.2893081761006289, "grad_norm": 1.3014425270226597, "learning_rate": 0.0001669354846617381, "loss": 11.8196, "num_tokens": 2946773.0, "step": 414 }, { "epoch": 0.29000698812019565, "grad_norm": 1.0796383036656527, "learning_rate": 0.00016676715601561117, "loss": 12.0303, "num_tokens": 2954251.0, "step": 415 }, { "epoch": 0.2907058001397624, "grad_norm": 1.1175059429614296, "learning_rate": 0.00016659848532428023, "loss": 12.1162, "num_tokens": 2961656.0, "step": 416 }, { "epoch": 0.2914046121593291, "grad_norm": 1.0443096335885806, "learning_rate": 0.00016642947345183774, "loss": 11.9358, "num_tokens": 2969059.0, "step": 417 }, { "epoch": 0.2921034241788959, "grad_norm": 1.1183836131543219, "learning_rate": 0.0001662601212641242, "loss": 11.8719, "num_tokens": 2977234.0, "step": 418 }, { "epoch": 0.29280223619846263, "grad_norm": 1.0227099098344798, "learning_rate": 0.00016609042962872333, "loss": 11.8904, "num_tokens": 2984854.0, "step": 419 }, { "epoch": 0.29350104821802936, "grad_norm": 1.245079947315459, "learning_rate": 0.00016592039941495804, "loss": 11.6806, "num_tokens": 2991537.0, "step": 420 }, { "epoch": 0.2941998602375961, "grad_norm": 1.1291214580898223, "learning_rate": 0.00016575003149388548, "loss": 11.9606, "num_tokens": 2998088.0, "step": 421 }, { "epoch": 0.2948986722571628, "grad_norm": 1.0741415576787732, "learning_rate": 0.00016557932673829311, "loss": 12.0354, "num_tokens": 3004876.0, "step": 422 }, { "epoch": 0.29559748427672955, "grad_norm": 1.1830288131183533, "learning_rate": 0.0001654082860226939, "loss": 11.9615, "num_tokens": 3011520.0, "step": 423 }, { "epoch": 0.2962962962962963, "grad_norm": 1.073871290170833, "learning_rate": 0.00016523691022332185, "loss": 11.9096, "num_tokens": 3018294.0, "step": 424 }, { "epoch": 0.296995108315863, "grad_norm": 1.1812984503364992, "learning_rate": 0.00016506520021812766, "loss": 11.6186, "num_tokens": 3026301.0, "step": 425 }, { "epoch": 0.2976939203354298, "grad_norm": 0.9602274072933003, "learning_rate": 0.00016489315688677416, "loss": 11.8616, "num_tokens": 3034166.0, "step": 426 }, { "epoch": 0.2983927323549965, "grad_norm": 1.000734269306996, "learning_rate": 0.00016472078111063175, "loss": 11.7692, "num_tokens": 3041930.0, "step": 427 }, { "epoch": 0.29909154437456326, "grad_norm": 1.1119226968838083, "learning_rate": 0.00016454807377277398, "loss": 12.0168, "num_tokens": 3048767.0, "step": 428 }, { "epoch": 0.29979035639413, "grad_norm": 1.1154210947106469, "learning_rate": 0.00016437503575797297, "loss": 11.7643, "num_tokens": 3055753.0, "step": 429 }, { "epoch": 0.3004891684136967, "grad_norm": 1.0295541363836302, "learning_rate": 0.00016420166795269475, "loss": 12.0597, "num_tokens": 3063120.0, "step": 430 }, { "epoch": 0.30118798043326345, "grad_norm": 1.040910257992528, "learning_rate": 0.00016402797124509508, "loss": 11.9731, "num_tokens": 3070742.0, "step": 431 }, { "epoch": 0.3018867924528302, "grad_norm": 1.0526952080867569, "learning_rate": 0.00016385394652501445, "loss": 11.8993, "num_tokens": 3078056.0, "step": 432 }, { "epoch": 0.3025856044723969, "grad_norm": 1.0896170663855624, "learning_rate": 0.00016367959468397393, "loss": 11.8905, "num_tokens": 3085214.0, "step": 433 }, { "epoch": 0.30328441649196364, "grad_norm": 0.9283859474345019, "learning_rate": 0.00016350491661517032, "loss": 11.9101, "num_tokens": 3092548.0, "step": 434 }, { "epoch": 0.3039832285115304, "grad_norm": 0.986304057530193, "learning_rate": 0.00016332991321347167, "loss": 11.8033, "num_tokens": 3100393.0, "step": 435 }, { "epoch": 0.30468204053109715, "grad_norm": 1.1543372858478307, "learning_rate": 0.0001631545853754127, "loss": 12.1051, "num_tokens": 3106325.0, "step": 436 }, { "epoch": 0.3053808525506639, "grad_norm": 1.1966377576562668, "learning_rate": 0.0001629789339991902, "loss": 12.0699, "num_tokens": 3113193.0, "step": 437 }, { "epoch": 0.3060796645702306, "grad_norm": 1.2855963226674763, "learning_rate": 0.0001628029599846585, "loss": 11.9884, "num_tokens": 3120358.0, "step": 438 }, { "epoch": 0.30677847658979734, "grad_norm": 1.1830063545170302, "learning_rate": 0.00016262666423332473, "loss": 11.8667, "num_tokens": 3127109.0, "step": 439 }, { "epoch": 0.3074772886093641, "grad_norm": 1.0943107370858722, "learning_rate": 0.00016245004764834422, "loss": 11.7229, "num_tokens": 3134426.0, "step": 440 }, { "epoch": 0.3081761006289308, "grad_norm": 1.1001181023873448, "learning_rate": 0.000162273111134516, "loss": 11.9723, "num_tokens": 3141188.0, "step": 441 }, { "epoch": 0.30887491264849753, "grad_norm": 1.238166007571023, "learning_rate": 0.00016209585559827806, "loss": 11.9439, "num_tokens": 3149034.0, "step": 442 }, { "epoch": 0.30957372466806427, "grad_norm": 1.0126345029924844, "learning_rate": 0.0001619182819477027, "loss": 12.0405, "num_tokens": 3156062.0, "step": 443 }, { "epoch": 0.31027253668763105, "grad_norm": 1.1863059832196097, "learning_rate": 0.0001617403910924919, "loss": 11.7742, "num_tokens": 3164141.0, "step": 444 }, { "epoch": 0.3109713487071978, "grad_norm": 0.9943392593170401, "learning_rate": 0.00016156218394397273, "loss": 11.862, "num_tokens": 3172103.0, "step": 445 }, { "epoch": 0.3116701607267645, "grad_norm": 1.1431830948622974, "learning_rate": 0.0001613836614150926, "loss": 11.9805, "num_tokens": 3178950.0, "step": 446 }, { "epoch": 0.31236897274633124, "grad_norm": 1.0570354015055983, "learning_rate": 0.00016120482442041447, "loss": 11.8098, "num_tokens": 3186094.0, "step": 447 }, { "epoch": 0.31306778476589797, "grad_norm": 1.1551238616087514, "learning_rate": 0.0001610256738761125, "loss": 12.0324, "num_tokens": 3193281.0, "step": 448 }, { "epoch": 0.3137665967854647, "grad_norm": 1.153264276075561, "learning_rate": 0.000160846210699967, "loss": 11.8432, "num_tokens": 3200265.0, "step": 449 }, { "epoch": 0.31446540880503143, "grad_norm": 1.0812760599945195, "learning_rate": 0.0001606664358113599, "loss": 11.7047, "num_tokens": 3207548.0, "step": 450 }, { "epoch": 0.31516422082459816, "grad_norm": 1.0695050138356879, "learning_rate": 0.00016048635013127016, "loss": 11.9745, "num_tokens": 3214372.0, "step": 451 }, { "epoch": 0.3158630328441649, "grad_norm": 1.1212081487478163, "learning_rate": 0.00016030595458226872, "loss": 11.7326, "num_tokens": 3221203.0, "step": 452 }, { "epoch": 0.3165618448637317, "grad_norm": 1.0615326542920032, "learning_rate": 0.00016012525008851403, "loss": 11.8234, "num_tokens": 3229249.0, "step": 453 }, { "epoch": 0.3172606568832984, "grad_norm": 1.0924058533685397, "learning_rate": 0.0001599442375757473, "loss": 11.8809, "num_tokens": 3236472.0, "step": 454 }, { "epoch": 0.31795946890286514, "grad_norm": 1.0054105224939138, "learning_rate": 0.00015976291797128767, "loss": 11.8353, "num_tokens": 3243897.0, "step": 455 }, { "epoch": 0.31865828092243187, "grad_norm": 1.0836743188532598, "learning_rate": 0.00015958129220402744, "loss": 11.7473, "num_tokens": 3250533.0, "step": 456 }, { "epoch": 0.3193570929419986, "grad_norm": 1.1331172126168478, "learning_rate": 0.00015939936120442752, "loss": 11.9577, "num_tokens": 3257466.0, "step": 457 }, { "epoch": 0.32005590496156533, "grad_norm": 1.0605723822201687, "learning_rate": 0.00015921712590451236, "loss": 11.7791, "num_tokens": 3263944.0, "step": 458 }, { "epoch": 0.32075471698113206, "grad_norm": 1.2056325385887365, "learning_rate": 0.00015903458723786544, "loss": 11.9819, "num_tokens": 3270543.0, "step": 459 }, { "epoch": 0.3214535290006988, "grad_norm": 1.023946718695128, "learning_rate": 0.00015885174613962426, "loss": 11.9992, "num_tokens": 3277894.0, "step": 460 }, { "epoch": 0.3221523410202656, "grad_norm": 1.164177419749573, "learning_rate": 0.00015866860354647576, "loss": 11.908, "num_tokens": 3284793.0, "step": 461 }, { "epoch": 0.3228511530398323, "grad_norm": 0.974564180236439, "learning_rate": 0.00015848516039665138, "loss": 11.9273, "num_tokens": 3292856.0, "step": 462 }, { "epoch": 0.32354996505939904, "grad_norm": 1.1325311662596682, "learning_rate": 0.0001583014176299223, "loss": 11.6866, "num_tokens": 3300134.0, "step": 463 }, { "epoch": 0.32424877707896577, "grad_norm": 1.0215292754718461, "learning_rate": 0.00015811737618759468, "loss": 11.8115, "num_tokens": 3307091.0, "step": 464 }, { "epoch": 0.3249475890985325, "grad_norm": 1.0991030583076216, "learning_rate": 0.00015793303701250468, "loss": 11.7049, "num_tokens": 3314380.0, "step": 465 }, { "epoch": 0.3256464011180992, "grad_norm": 0.9723239955300694, "learning_rate": 0.00015774840104901378, "loss": 12.1343, "num_tokens": 3322081.0, "step": 466 }, { "epoch": 0.32634521313766596, "grad_norm": 1.1363739747714252, "learning_rate": 0.000157563469243004, "loss": 11.9717, "num_tokens": 3329105.0, "step": 467 }, { "epoch": 0.3270440251572327, "grad_norm": 1.0583331524691464, "learning_rate": 0.00015737824254187275, "loss": 11.9133, "num_tokens": 3336405.0, "step": 468 }, { "epoch": 0.3277428371767994, "grad_norm": 1.0618582403691783, "learning_rate": 0.00015719272189452824, "loss": 11.761, "num_tokens": 3343087.0, "step": 469 }, { "epoch": 0.3284416491963662, "grad_norm": 1.0202628694807843, "learning_rate": 0.00015700690825138473, "loss": 12.0182, "num_tokens": 3350235.0, "step": 470 }, { "epoch": 0.32914046121593293, "grad_norm": 0.9436248037294794, "learning_rate": 0.00015682080256435724, "loss": 11.8759, "num_tokens": 3357485.0, "step": 471 }, { "epoch": 0.32983927323549966, "grad_norm": 1.0693973366929455, "learning_rate": 0.00015663440578685703, "loss": 11.9772, "num_tokens": 3364351.0, "step": 472 }, { "epoch": 0.3305380852550664, "grad_norm": 0.9637424441716854, "learning_rate": 0.00015644771887378663, "loss": 11.862, "num_tokens": 3372249.0, "step": 473 }, { "epoch": 0.3312368972746331, "grad_norm": 1.099099553856128, "learning_rate": 0.00015626074278153485, "loss": 11.8989, "num_tokens": 3379136.0, "step": 474 }, { "epoch": 0.33193570929419985, "grad_norm": 1.0257686815570153, "learning_rate": 0.000156073478467972, "loss": 11.8664, "num_tokens": 3386556.0, "step": 475 }, { "epoch": 0.3326345213137666, "grad_norm": 0.9981099828581506, "learning_rate": 0.0001558859268924449, "loss": 11.6664, "num_tokens": 3394436.0, "step": 476 }, { "epoch": 0.3333333333333333, "grad_norm": 0.9935985234202279, "learning_rate": 0.0001556980890157721, "loss": 11.8036, "num_tokens": 3401842.0, "step": 477 }, { "epoch": 0.33403214535290005, "grad_norm": 0.9904221093625248, "learning_rate": 0.00015550996580023868, "loss": 11.6766, "num_tokens": 3409180.0, "step": 478 }, { "epoch": 0.33473095737246683, "grad_norm": 0.9644588966129022, "learning_rate": 0.00015532155820959165, "loss": 11.9491, "num_tokens": 3416900.0, "step": 479 }, { "epoch": 0.33542976939203356, "grad_norm": 0.9681833145177854, "learning_rate": 0.00015513286720903485, "loss": 11.9831, "num_tokens": 3424074.0, "step": 480 }, { "epoch": 0.3361285814116003, "grad_norm": 1.0017211776208106, "learning_rate": 0.00015494389376522388, "loss": 12.0019, "num_tokens": 3431040.0, "step": 481 }, { "epoch": 0.336827393431167, "grad_norm": 1.13259905993011, "learning_rate": 0.0001547546388462615, "loss": 11.9244, "num_tokens": 3437949.0, "step": 482 }, { "epoch": 0.33752620545073375, "grad_norm": 1.0001327316281, "learning_rate": 0.00015456510342169225, "loss": 11.6157, "num_tokens": 3444767.0, "step": 483 }, { "epoch": 0.3382250174703005, "grad_norm": 1.1862346561208217, "learning_rate": 0.00015437528846249784, "loss": 11.7979, "num_tokens": 3452455.0, "step": 484 }, { "epoch": 0.3389238294898672, "grad_norm": 0.9964598534250564, "learning_rate": 0.00015418519494109185, "loss": 11.8249, "num_tokens": 3459075.0, "step": 485 }, { "epoch": 0.33962264150943394, "grad_norm": 1.0499976134478521, "learning_rate": 0.00015399482383131517, "loss": 11.8271, "num_tokens": 3466194.0, "step": 486 }, { "epoch": 0.3403214535290007, "grad_norm": 1.0238007052339033, "learning_rate": 0.0001538041761084305, "loss": 11.8863, "num_tokens": 3472861.0, "step": 487 }, { "epoch": 0.34102026554856746, "grad_norm": 1.031413218747241, "learning_rate": 0.00015361325274911779, "loss": 11.8285, "num_tokens": 3480127.0, "step": 488 }, { "epoch": 0.3417190775681342, "grad_norm": 0.9109038752777893, "learning_rate": 0.00015342205473146904, "loss": 11.8997, "num_tokens": 3487155.0, "step": 489 }, { "epoch": 0.3424178895877009, "grad_norm": 1.0512658553310719, "learning_rate": 0.00015323058303498324, "loss": 11.7961, "num_tokens": 3494773.0, "step": 490 }, { "epoch": 0.34311670160726765, "grad_norm": 0.919699650030833, "learning_rate": 0.00015303883864056154, "loss": 11.7389, "num_tokens": 3502169.0, "step": 491 }, { "epoch": 0.3438155136268344, "grad_norm": 0.9713451834075437, "learning_rate": 0.00015284682253050198, "loss": 12.1026, "num_tokens": 3509575.0, "step": 492 }, { "epoch": 0.3445143256464011, "grad_norm": 1.071656337376466, "learning_rate": 0.00015265453568849463, "loss": 12.1382, "num_tokens": 3516257.0, "step": 493 }, { "epoch": 0.34521313766596784, "grad_norm": 0.9753487437886109, "learning_rate": 0.0001524619790996166, "loss": 12.0307, "num_tokens": 3523550.0, "step": 494 }, { "epoch": 0.34591194968553457, "grad_norm": 1.0299715129309743, "learning_rate": 0.00015226915375032675, "loss": 11.8399, "num_tokens": 3530386.0, "step": 495 }, { "epoch": 0.3466107617051013, "grad_norm": 0.9544143910020928, "learning_rate": 0.00015207606062846092, "loss": 11.8551, "num_tokens": 3537432.0, "step": 496 }, { "epoch": 0.3473095737246681, "grad_norm": 1.0699574494766773, "learning_rate": 0.00015188270072322664, "loss": 11.9395, "num_tokens": 3544343.0, "step": 497 }, { "epoch": 0.3480083857442348, "grad_norm": 0.976543421173678, "learning_rate": 0.00015168907502519823, "loss": 11.757, "num_tokens": 3551437.0, "step": 498 }, { "epoch": 0.34870719776380155, "grad_norm": 1.1037596936096155, "learning_rate": 0.00015149518452631163, "loss": 11.9166, "num_tokens": 3558381.0, "step": 499 }, { "epoch": 0.3494060097833683, "grad_norm": 1.1919912995611013, "learning_rate": 0.00015130103021985928, "loss": 11.8007, "num_tokens": 3565537.0, "step": 500 }, { "epoch": 0.350104821802935, "grad_norm": 1.0537606470925758, "learning_rate": 0.00015110661310048523, "loss": 12.0405, "num_tokens": 3572664.0, "step": 501 }, { "epoch": 0.35080363382250174, "grad_norm": 1.1601489554122157, "learning_rate": 0.00015091193416417981, "loss": 11.6978, "num_tokens": 3579863.0, "step": 502 }, { "epoch": 0.35150244584206847, "grad_norm": 0.9961821792223636, "learning_rate": 0.00015071699440827462, "loss": 11.9178, "num_tokens": 3587973.0, "step": 503 }, { "epoch": 0.3522012578616352, "grad_norm": 1.0220216417014585, "learning_rate": 0.00015052179483143752, "loss": 11.8212, "num_tokens": 3594430.0, "step": 504 }, { "epoch": 0.352900069881202, "grad_norm": 1.0918643967727748, "learning_rate": 0.00015032633643366727, "loss": 11.6821, "num_tokens": 3600877.0, "step": 505 }, { "epoch": 0.3535988819007687, "grad_norm": 1.0053389281755476, "learning_rate": 0.0001501306202162887, "loss": 11.7887, "num_tokens": 3607903.0, "step": 506 }, { "epoch": 0.35429769392033544, "grad_norm": 1.00938096807553, "learning_rate": 0.0001499346471819474, "loss": 11.8449, "num_tokens": 3614967.0, "step": 507 }, { "epoch": 0.3549965059399022, "grad_norm": 0.934535161353447, "learning_rate": 0.00014973841833460457, "loss": 11.9196, "num_tokens": 3622644.0, "step": 508 }, { "epoch": 0.3556953179594689, "grad_norm": 1.0532188197414574, "learning_rate": 0.00014954193467953196, "loss": 11.7554, "num_tokens": 3629496.0, "step": 509 }, { "epoch": 0.35639412997903563, "grad_norm": 1.0320538184578378, "learning_rate": 0.0001493451972233067, "loss": 11.8978, "num_tokens": 3636251.0, "step": 510 }, { "epoch": 0.35709294199860236, "grad_norm": 1.0169160736665257, "learning_rate": 0.0001491482069738062, "loss": 11.8188, "num_tokens": 3643640.0, "step": 511 }, { "epoch": 0.3577917540181691, "grad_norm": 0.9553371806973806, "learning_rate": 0.00014895096494020274, "loss": 11.737, "num_tokens": 3651194.0, "step": 512 }, { "epoch": 0.3584905660377358, "grad_norm": 0.9843783334182804, "learning_rate": 0.00014875347213295863, "loss": 11.6681, "num_tokens": 3658441.0, "step": 513 }, { "epoch": 0.3591893780573026, "grad_norm": 0.9703948672844441, "learning_rate": 0.00014855572956382082, "loss": 11.7661, "num_tokens": 3665170.0, "step": 514 }, { "epoch": 0.35988819007686934, "grad_norm": 0.9634487040177081, "learning_rate": 0.0001483577382458158, "loss": 11.8308, "num_tokens": 3672474.0, "step": 515 }, { "epoch": 0.36058700209643607, "grad_norm": 0.9876860191525895, "learning_rate": 0.00014815949919324444, "loss": 11.6739, "num_tokens": 3679386.0, "step": 516 }, { "epoch": 0.3612858141160028, "grad_norm": 1.0064827868977257, "learning_rate": 0.00014796101342167664, "loss": 11.9353, "num_tokens": 3687308.0, "step": 517 }, { "epoch": 0.36198462613556953, "grad_norm": 1.000619260490717, "learning_rate": 0.00014776228194794623, "loss": 11.7515, "num_tokens": 3694073.0, "step": 518 }, { "epoch": 0.36268343815513626, "grad_norm": 0.9365883961538043, "learning_rate": 0.00014756330579014591, "loss": 11.9024, "num_tokens": 3701387.0, "step": 519 }, { "epoch": 0.363382250174703, "grad_norm": 1.338573083068865, "learning_rate": 0.0001473640859676217, "loss": 11.5576, "num_tokens": 3708600.0, "step": 520 }, { "epoch": 0.3640810621942697, "grad_norm": 1.1132400116662757, "learning_rate": 0.00014716462350096803, "loss": 11.8507, "num_tokens": 3716979.0, "step": 521 }, { "epoch": 0.36477987421383645, "grad_norm": 1.3193521575988498, "learning_rate": 0.0001469649194120224, "loss": 11.8097, "num_tokens": 3723012.0, "step": 522 }, { "epoch": 0.36547868623340324, "grad_norm": 1.0403268561853904, "learning_rate": 0.00014676497472385994, "loss": 11.6589, "num_tokens": 3730363.0, "step": 523 }, { "epoch": 0.36617749825296997, "grad_norm": 1.2151940133509085, "learning_rate": 0.0001465647904607886, "loss": 11.7648, "num_tokens": 3737054.0, "step": 524 }, { "epoch": 0.3668763102725367, "grad_norm": 1.1114834210539923, "learning_rate": 0.00014636436764834353, "loss": 11.7175, "num_tokens": 3743973.0, "step": 525 }, { "epoch": 0.36757512229210343, "grad_norm": 1.0182021038420608, "learning_rate": 0.000146163707313282, "loss": 12.0104, "num_tokens": 3750646.0, "step": 526 }, { "epoch": 0.36827393431167016, "grad_norm": 1.0658085747221677, "learning_rate": 0.00014596281048357806, "loss": 11.8781, "num_tokens": 3758294.0, "step": 527 }, { "epoch": 0.3689727463312369, "grad_norm": 0.963002880311189, "learning_rate": 0.0001457616781884173, "loss": 11.6855, "num_tokens": 3765428.0, "step": 528 }, { "epoch": 0.3696715583508036, "grad_norm": 1.1131967814264112, "learning_rate": 0.00014556031145819168, "loss": 11.8129, "num_tokens": 3772483.0, "step": 529 }, { "epoch": 0.37037037037037035, "grad_norm": 0.9692436092208317, "learning_rate": 0.0001453587113244941, "loss": 11.8568, "num_tokens": 3779599.0, "step": 530 }, { "epoch": 0.3710691823899371, "grad_norm": 1.101540880954833, "learning_rate": 0.00014515687882011313, "loss": 11.789, "num_tokens": 3786663.0, "step": 531 }, { "epoch": 0.37176799440950387, "grad_norm": 1.050559906664223, "learning_rate": 0.00014495481497902788, "loss": 11.7851, "num_tokens": 3793917.0, "step": 532 }, { "epoch": 0.3724668064290706, "grad_norm": 1.1741989565793984, "learning_rate": 0.00014475252083640246, "loss": 11.8387, "num_tokens": 3800383.0, "step": 533 }, { "epoch": 0.3731656184486373, "grad_norm": 1.1113664176342144, "learning_rate": 0.00014454999742858092, "loss": 11.7885, "num_tokens": 3807421.0, "step": 534 }, { "epoch": 0.37386443046820406, "grad_norm": 1.180064570726451, "learning_rate": 0.0001443472457930817, "loss": 11.8684, "num_tokens": 3814470.0, "step": 535 }, { "epoch": 0.3745632424877708, "grad_norm": 1.2084454645287401, "learning_rate": 0.0001441442669685926, "loss": 11.8665, "num_tokens": 3821117.0, "step": 536 }, { "epoch": 0.3752620545073375, "grad_norm": 1.0286785364308508, "learning_rate": 0.00014394106199496517, "loss": 11.94, "num_tokens": 3828050.0, "step": 537 }, { "epoch": 0.37596086652690425, "grad_norm": 0.9866126433136005, "learning_rate": 0.00014373763191320954, "loss": 11.8129, "num_tokens": 3835858.0, "step": 538 }, { "epoch": 0.376659678546471, "grad_norm": 0.968366405112258, "learning_rate": 0.00014353397776548912, "loss": 11.8883, "num_tokens": 3843141.0, "step": 539 }, { "epoch": 0.37735849056603776, "grad_norm": 0.9164242552730942, "learning_rate": 0.00014333010059511505, "loss": 11.8982, "num_tokens": 3850497.0, "step": 540 }, { "epoch": 0.3780573025856045, "grad_norm": 0.9068419500979179, "learning_rate": 0.0001431260014465412, "loss": 11.3929, "num_tokens": 3857621.0, "step": 541 }, { "epoch": 0.3787561146051712, "grad_norm": 1.0163110836355063, "learning_rate": 0.00014292168136535854, "loss": 11.824, "num_tokens": 3864403.0, "step": 542 }, { "epoch": 0.37945492662473795, "grad_norm": 0.9074023579901379, "learning_rate": 0.00014271714139828983, "loss": 11.6444, "num_tokens": 3871744.0, "step": 543 }, { "epoch": 0.3801537386443047, "grad_norm": 0.9866662139219797, "learning_rate": 0.0001425123825931843, "loss": 11.725, "num_tokens": 3879289.0, "step": 544 }, { "epoch": 0.3808525506638714, "grad_norm": 0.9073257309786021, "learning_rate": 0.00014230740599901231, "loss": 11.6273, "num_tokens": 3886810.0, "step": 545 }, { "epoch": 0.38155136268343814, "grad_norm": 0.9207144140267274, "learning_rate": 0.00014210221266585998, "loss": 11.8695, "num_tokens": 3894520.0, "step": 546 }, { "epoch": 0.3822501747030049, "grad_norm": 0.93767292493419, "learning_rate": 0.0001418968036449237, "loss": 11.7849, "num_tokens": 3901356.0, "step": 547 }, { "epoch": 0.3829489867225716, "grad_norm": 0.9004540028187058, "learning_rate": 0.0001416911799885049, "loss": 11.7171, "num_tokens": 3908231.0, "step": 548 }, { "epoch": 0.3836477987421384, "grad_norm": 0.9581250266093889, "learning_rate": 0.00014148534275000444, "loss": 12.0426, "num_tokens": 3915548.0, "step": 549 }, { "epoch": 0.3843466107617051, "grad_norm": 1.0146876136163, "learning_rate": 0.0001412792929839175, "loss": 11.7198, "num_tokens": 3922294.0, "step": 550 }, { "epoch": 0.38504542278127185, "grad_norm": 0.9216708322241084, "learning_rate": 0.00014107303174582794, "loss": 11.662, "num_tokens": 3929145.0, "step": 551 }, { "epoch": 0.3857442348008386, "grad_norm": 1.0481953881329724, "learning_rate": 0.00014086656009240306, "loss": 11.7744, "num_tokens": 3936061.0, "step": 552 }, { "epoch": 0.3864430468204053, "grad_norm": 0.9829096758175492, "learning_rate": 0.00014065987908138804, "loss": 11.9194, "num_tokens": 3942508.0, "step": 553 }, { "epoch": 0.38714185883997204, "grad_norm": 1.0030318786032055, "learning_rate": 0.00014045298977160057, "loss": 11.8821, "num_tokens": 3949004.0, "step": 554 }, { "epoch": 0.38784067085953877, "grad_norm": 0.850860339193017, "learning_rate": 0.00014024589322292555, "loss": 11.7397, "num_tokens": 3956901.0, "step": 555 }, { "epoch": 0.3885394828791055, "grad_norm": 1.0103040436308783, "learning_rate": 0.00014003859049630942, "loss": 11.8767, "num_tokens": 3964133.0, "step": 556 }, { "epoch": 0.38923829489867223, "grad_norm": 0.9987701612040082, "learning_rate": 0.000139831082653755, "loss": 12.0504, "num_tokens": 3971294.0, "step": 557 }, { "epoch": 0.389937106918239, "grad_norm": 1.1429673333072519, "learning_rate": 0.00013962337075831583, "loss": 11.8142, "num_tokens": 3977426.0, "step": 558 }, { "epoch": 0.39063591893780575, "grad_norm": 0.9688917144423852, "learning_rate": 0.00013941545587409075, "loss": 11.698, "num_tokens": 3984492.0, "step": 559 }, { "epoch": 0.3913347309573725, "grad_norm": 1.0308616199533485, "learning_rate": 0.00013920733906621862, "loss": 11.9656, "num_tokens": 3991120.0, "step": 560 }, { "epoch": 0.3920335429769392, "grad_norm": 1.013985978635021, "learning_rate": 0.00013899902140087272, "loss": 11.8287, "num_tokens": 3997971.0, "step": 561 }, { "epoch": 0.39273235499650594, "grad_norm": 0.9071478462521866, "learning_rate": 0.00013879050394525523, "loss": 11.7164, "num_tokens": 4005376.0, "step": 562 }, { "epoch": 0.39343116701607267, "grad_norm": 1.0624818582024294, "learning_rate": 0.00013858178776759197, "loss": 11.8942, "num_tokens": 4012471.0, "step": 563 }, { "epoch": 0.3941299790356394, "grad_norm": 0.8936125162275714, "learning_rate": 0.00013837287393712666, "loss": 11.7496, "num_tokens": 4019054.0, "step": 564 }, { "epoch": 0.39482879105520613, "grad_norm": 1.0912085369465605, "learning_rate": 0.00013816376352411574, "loss": 11.846, "num_tokens": 4025454.0, "step": 565 }, { "epoch": 0.39552760307477286, "grad_norm": 0.9032002517173796, "learning_rate": 0.00013795445759982262, "loss": 11.5647, "num_tokens": 4032992.0, "step": 566 }, { "epoch": 0.39622641509433965, "grad_norm": 0.9257844804709106, "learning_rate": 0.00013774495723651236, "loss": 11.8157, "num_tokens": 4040064.0, "step": 567 }, { "epoch": 0.3969252271139064, "grad_norm": 1.0334706226849835, "learning_rate": 0.0001375352635074461, "loss": 11.6075, "num_tokens": 4046967.0, "step": 568 }, { "epoch": 0.3976240391334731, "grad_norm": 0.8363862917027438, "learning_rate": 0.0001373253774868756, "loss": 11.7502, "num_tokens": 4055301.0, "step": 569 }, { "epoch": 0.39832285115303984, "grad_norm": 0.9385148033516746, "learning_rate": 0.00013711530025003766, "loss": 11.7727, "num_tokens": 4062431.0, "step": 570 }, { "epoch": 0.39902166317260657, "grad_norm": 0.8299416376395171, "learning_rate": 0.00013690503287314883, "loss": 11.519, "num_tokens": 4070230.0, "step": 571 }, { "epoch": 0.3997204751921733, "grad_norm": 0.9976023645792326, "learning_rate": 0.00013669457643339955, "loss": 11.6711, "num_tokens": 4077148.0, "step": 572 }, { "epoch": 0.40041928721174, "grad_norm": 0.8855228788782702, "learning_rate": 0.00013648393200894893, "loss": 11.5901, "num_tokens": 4084168.0, "step": 573 }, { "epoch": 0.40111809923130676, "grad_norm": 0.9355806400309514, "learning_rate": 0.00013627310067891913, "loss": 11.7706, "num_tokens": 4092313.0, "step": 574 }, { "epoch": 0.4018169112508735, "grad_norm": 0.8887017159796101, "learning_rate": 0.00013606208352338973, "loss": 11.6336, "num_tokens": 4099578.0, "step": 575 }, { "epoch": 0.4025157232704403, "grad_norm": 0.896143619832332, "learning_rate": 0.00013585088162339231, "loss": 11.8035, "num_tokens": 4106534.0, "step": 576 }, { "epoch": 0.403214535290007, "grad_norm": 0.9717296350589316, "learning_rate": 0.00013563949606090503, "loss": 11.6552, "num_tokens": 4113134.0, "step": 577 }, { "epoch": 0.40391334730957373, "grad_norm": 0.8799694598345534, "learning_rate": 0.00013542792791884674, "loss": 11.6327, "num_tokens": 4120863.0, "step": 578 }, { "epoch": 0.40461215932914046, "grad_norm": 0.8714214123608119, "learning_rate": 0.00013521617828107175, "loss": 11.6445, "num_tokens": 4127780.0, "step": 579 }, { "epoch": 0.4053109713487072, "grad_norm": 0.9368995617188041, "learning_rate": 0.00013500424823236412, "loss": 11.7067, "num_tokens": 4134829.0, "step": 580 }, { "epoch": 0.4060097833682739, "grad_norm": 0.8220877834169437, "learning_rate": 0.0001347921388584322, "loss": 11.4554, "num_tokens": 4142834.0, "step": 581 }, { "epoch": 0.40670859538784065, "grad_norm": 0.9223723153491369, "learning_rate": 0.000134579851245903, "loss": 11.8592, "num_tokens": 4149774.0, "step": 582 }, { "epoch": 0.4074074074074074, "grad_norm": 0.9344353780152859, "learning_rate": 0.00013436738648231656, "loss": 11.7621, "num_tokens": 4156412.0, "step": 583 }, { "epoch": 0.40810621942697417, "grad_norm": 0.9224628889971425, "learning_rate": 0.00013415474565612058, "loss": 11.8511, "num_tokens": 4163132.0, "step": 584 }, { "epoch": 0.4088050314465409, "grad_norm": 0.9265931390375924, "learning_rate": 0.00013394192985666465, "loss": 11.7475, "num_tokens": 4170145.0, "step": 585 }, { "epoch": 0.40950384346610763, "grad_norm": 0.9535182759992712, "learning_rate": 0.0001337289401741947, "loss": 11.6528, "num_tokens": 4177826.0, "step": 586 }, { "epoch": 0.41020265548567436, "grad_norm": 0.9443710116876387, "learning_rate": 0.0001335157776998476, "loss": 11.6145, "num_tokens": 4184556.0, "step": 587 }, { "epoch": 0.4109014675052411, "grad_norm": 1.004187267294352, "learning_rate": 0.00013330244352564527, "loss": 11.6601, "num_tokens": 4191042.0, "step": 588 }, { "epoch": 0.4116002795248078, "grad_norm": 1.0157069700491397, "learning_rate": 0.0001330889387444893, "loss": 11.8948, "num_tokens": 4197862.0, "step": 589 }, { "epoch": 0.41229909154437455, "grad_norm": 0.9120585645887671, "learning_rate": 0.00013287526445015531, "loss": 11.824, "num_tokens": 4205404.0, "step": 590 }, { "epoch": 0.4129979035639413, "grad_norm": 0.8937750402751362, "learning_rate": 0.0001326614217372873, "loss": 11.6915, "num_tokens": 4212599.0, "step": 591 }, { "epoch": 0.413696715583508, "grad_norm": 0.9924820650387385, "learning_rate": 0.0001324474117013921, "loss": 11.9223, "num_tokens": 4219704.0, "step": 592 }, { "epoch": 0.4143955276030748, "grad_norm": 1.0041177228053997, "learning_rate": 0.00013223323543883373, "loss": 11.8224, "num_tokens": 4226784.0, "step": 593 }, { "epoch": 0.41509433962264153, "grad_norm": 0.9741629017640787, "learning_rate": 0.0001320188940468277, "loss": 11.6504, "num_tokens": 4233717.0, "step": 594 }, { "epoch": 0.41579315164220826, "grad_norm": 0.9601854433244981, "learning_rate": 0.0001318043886234356, "loss": 11.8218, "num_tokens": 4240647.0, "step": 595 }, { "epoch": 0.416491963661775, "grad_norm": 0.9645423908468603, "learning_rate": 0.00013158972026755926, "loss": 11.7377, "num_tokens": 4247672.0, "step": 596 }, { "epoch": 0.4171907756813417, "grad_norm": 0.9316349929157852, "learning_rate": 0.0001313748900789352, "loss": 11.6241, "num_tokens": 4255152.0, "step": 597 }, { "epoch": 0.41788958770090845, "grad_norm": 0.9537689079354016, "learning_rate": 0.0001311598991581291, "loss": 11.8703, "num_tokens": 4261453.0, "step": 598 }, { "epoch": 0.4185883997204752, "grad_norm": 0.9289919471319109, "learning_rate": 0.00013094474860652987, "loss": 11.6782, "num_tokens": 4268508.0, "step": 599 }, { "epoch": 0.4192872117400419, "grad_norm": 0.8914635789877776, "learning_rate": 0.00013072943952634447, "loss": 11.7243, "num_tokens": 4275543.0, "step": 600 }, { "epoch": 0.41998602375960864, "grad_norm": 0.9440659795291676, "learning_rate": 0.00013051397302059171, "loss": 11.8755, "num_tokens": 4282191.0, "step": 601 }, { "epoch": 0.4206848357791754, "grad_norm": 0.9585209926703763, "learning_rate": 0.00013029835019309714, "loss": 11.8258, "num_tokens": 4289593.0, "step": 602 }, { "epoch": 0.42138364779874216, "grad_norm": 0.7867669798656702, "learning_rate": 0.000130082572148487, "loss": 11.4998, "num_tokens": 4297297.0, "step": 603 }, { "epoch": 0.4220824598183089, "grad_norm": 0.8949794126524566, "learning_rate": 0.00012986663999218261, "loss": 11.7613, "num_tokens": 4304161.0, "step": 604 }, { "epoch": 0.4227812718378756, "grad_norm": 0.893864879487913, "learning_rate": 0.00012965055483039507, "loss": 11.5257, "num_tokens": 4311640.0, "step": 605 }, { "epoch": 0.42348008385744235, "grad_norm": 0.9586722537494932, "learning_rate": 0.00012943431777011902, "loss": 11.839, "num_tokens": 4318619.0, "step": 606 }, { "epoch": 0.4241788958770091, "grad_norm": 0.9537730403425981, "learning_rate": 0.00012921792991912753, "loss": 11.8218, "num_tokens": 4325488.0, "step": 607 }, { "epoch": 0.4248777078965758, "grad_norm": 0.8262921885502388, "learning_rate": 0.00012900139238596598, "loss": 11.4973, "num_tokens": 4332936.0, "step": 608 }, { "epoch": 0.42557651991614254, "grad_norm": 1.0157982223498707, "learning_rate": 0.00012878470627994664, "loss": 11.5374, "num_tokens": 4339915.0, "step": 609 }, { "epoch": 0.42627533193570927, "grad_norm": 0.8575390048878171, "learning_rate": 0.0001285678727111429, "loss": 11.5607, "num_tokens": 4347052.0, "step": 610 }, { "epoch": 0.42697414395527605, "grad_norm": 0.9730947204204807, "learning_rate": 0.00012835089279038362, "loss": 11.7061, "num_tokens": 4353752.0, "step": 611 }, { "epoch": 0.4276729559748428, "grad_norm": 0.9483802234693168, "learning_rate": 0.00012813376762924733, "loss": 11.836, "num_tokens": 4361038.0, "step": 612 }, { "epoch": 0.4283717679944095, "grad_norm": 0.9216292136491858, "learning_rate": 0.0001279164983400568, "loss": 11.7437, "num_tokens": 4368560.0, "step": 613 }, { "epoch": 0.42907058001397624, "grad_norm": 0.8425238285285168, "learning_rate": 0.00012769908603587292, "loss": 11.5207, "num_tokens": 4376222.0, "step": 614 }, { "epoch": 0.429769392033543, "grad_norm": 0.8896306896847135, "learning_rate": 0.0001274815318304894, "loss": 11.6286, "num_tokens": 4383884.0, "step": 615 }, { "epoch": 0.4304682040531097, "grad_norm": 0.9354680205418325, "learning_rate": 0.0001272638368384269, "loss": 11.6862, "num_tokens": 4390978.0, "step": 616 }, { "epoch": 0.43116701607267643, "grad_norm": 0.9479829150578614, "learning_rate": 0.00012704600217492725, "loss": 11.9, "num_tokens": 4398693.0, "step": 617 }, { "epoch": 0.43186582809224316, "grad_norm": 0.9515538105951693, "learning_rate": 0.0001268280289559479, "loss": 11.7401, "num_tokens": 4405911.0, "step": 618 }, { "epoch": 0.43256464011180995, "grad_norm": 0.8605687632190838, "learning_rate": 0.00012660991829815602, "loss": 11.6708, "num_tokens": 4413468.0, "step": 619 }, { "epoch": 0.4332634521313767, "grad_norm": 0.9579766386486308, "learning_rate": 0.00012639167131892293, "loss": 11.8158, "num_tokens": 4420628.0, "step": 620 }, { "epoch": 0.4339622641509434, "grad_norm": 0.9305350787646466, "learning_rate": 0.0001261732891363183, "loss": 11.7188, "num_tokens": 4427900.0, "step": 621 }, { "epoch": 0.43466107617051014, "grad_norm": 0.9798863355563663, "learning_rate": 0.0001259547728691045, "loss": 11.774, "num_tokens": 4434789.0, "step": 622 }, { "epoch": 0.43535988819007687, "grad_norm": 0.9247977118101478, "learning_rate": 0.00012573612363673067, "loss": 11.8669, "num_tokens": 4442355.0, "step": 623 }, { "epoch": 0.4360587002096436, "grad_norm": 0.9072492786959844, "learning_rate": 0.00012551734255932727, "loss": 11.5485, "num_tokens": 4449354.0, "step": 624 }, { "epoch": 0.43675751222921033, "grad_norm": 0.9280486905034977, "learning_rate": 0.0001252984307577001, "loss": 11.4915, "num_tokens": 4456996.0, "step": 625 }, { "epoch": 0.43745632424877706, "grad_norm": 0.9451836457137859, "learning_rate": 0.00012507938935332478, "loss": 11.6188, "num_tokens": 4464108.0, "step": 626 }, { "epoch": 0.4381551362683438, "grad_norm": 0.8990536388212861, "learning_rate": 0.00012486021946834068, "loss": 11.4984, "num_tokens": 4471782.0, "step": 627 }, { "epoch": 0.4388539482879106, "grad_norm": 0.9750832409444402, "learning_rate": 0.00012464092222554552, "loss": 11.4983, "num_tokens": 4478870.0, "step": 628 }, { "epoch": 0.4395527603074773, "grad_norm": 0.8506726198904068, "learning_rate": 0.00012442149874838948, "loss": 11.4597, "num_tokens": 4486006.0, "step": 629 }, { "epoch": 0.44025157232704404, "grad_norm": 0.9199807285091958, "learning_rate": 0.00012420195016096933, "loss": 11.8458, "num_tokens": 4493433.0, "step": 630 }, { "epoch": 0.44095038434661077, "grad_norm": 0.8020311533791471, "learning_rate": 0.00012398227758802285, "loss": 11.4522, "num_tokens": 4501615.0, "step": 631 }, { "epoch": 0.4416491963661775, "grad_norm": 0.9335899963926214, "learning_rate": 0.00012376248215492297, "loss": 11.7816, "num_tokens": 4508453.0, "step": 632 }, { "epoch": 0.44234800838574423, "grad_norm": 0.9347628048111192, "learning_rate": 0.000123542564987672, "loss": 11.6779, "num_tokens": 4515645.0, "step": 633 }, { "epoch": 0.44304682040531096, "grad_norm": 0.8555235370359207, "learning_rate": 0.00012332252721289594, "loss": 11.3559, "num_tokens": 4522914.0, "step": 634 }, { "epoch": 0.4437456324248777, "grad_norm": 0.9434725175049635, "learning_rate": 0.00012310236995783866, "loss": 11.8245, "num_tokens": 4530012.0, "step": 635 }, { "epoch": 0.4444444444444444, "grad_norm": 0.894759929105657, "learning_rate": 0.00012288209435035605, "loss": 11.633, "num_tokens": 4536666.0, "step": 636 }, { "epoch": 0.4451432564640112, "grad_norm": 0.8939776930969006, "learning_rate": 0.00012266170151891036, "loss": 11.6174, "num_tokens": 4543673.0, "step": 637 }, { "epoch": 0.44584206848357794, "grad_norm": 0.9419395016806058, "learning_rate": 0.00012244119259256442, "loss": 11.6986, "num_tokens": 4550285.0, "step": 638 }, { "epoch": 0.44654088050314467, "grad_norm": 0.900919444140853, "learning_rate": 0.00012222056870097572, "loss": 11.8951, "num_tokens": 4557630.0, "step": 639 }, { "epoch": 0.4472396925227114, "grad_norm": 0.9368150858158782, "learning_rate": 0.00012199983097439079, "loss": 11.7818, "num_tokens": 4564552.0, "step": 640 }, { "epoch": 0.4479385045422781, "grad_norm": 0.8542741429298996, "learning_rate": 0.00012177898054363923, "loss": 11.6147, "num_tokens": 4572109.0, "step": 641 }, { "epoch": 0.44863731656184486, "grad_norm": 0.9096968317268118, "learning_rate": 0.00012155801854012816, "loss": 11.6874, "num_tokens": 4579365.0, "step": 642 }, { "epoch": 0.4493361285814116, "grad_norm": 1.0188620411597427, "learning_rate": 0.00012133694609583615, "loss": 11.6009, "num_tokens": 4585759.0, "step": 643 }, { "epoch": 0.4500349406009783, "grad_norm": 0.9031994994109169, "learning_rate": 0.00012111576434330766, "loss": 11.7123, "num_tokens": 4592247.0, "step": 644 }, { "epoch": 0.45073375262054505, "grad_norm": 0.9149162319112071, "learning_rate": 0.00012089447441564705, "loss": 11.6632, "num_tokens": 4599348.0, "step": 645 }, { "epoch": 0.45143256464011183, "grad_norm": 0.9094933293195947, "learning_rate": 0.00012067307744651288, "loss": 11.4616, "num_tokens": 4606162.0, "step": 646 }, { "epoch": 0.45213137665967856, "grad_norm": 0.8663113470163256, "learning_rate": 0.00012045157457011211, "loss": 11.4333, "num_tokens": 4612972.0, "step": 647 }, { "epoch": 0.4528301886792453, "grad_norm": 0.8567265177969963, "learning_rate": 0.00012022996692119424, "loss": 11.647, "num_tokens": 4620976.0, "step": 648 }, { "epoch": 0.453529000698812, "grad_norm": 0.8601920989369755, "learning_rate": 0.00012000825563504547, "loss": 11.768, "num_tokens": 4629303.0, "step": 649 }, { "epoch": 0.45422781271837875, "grad_norm": 0.8865942670819554, "learning_rate": 0.000119786441847483, "loss": 11.5788, "num_tokens": 4636638.0, "step": 650 }, { "epoch": 0.4549266247379455, "grad_norm": 0.8777317880659796, "learning_rate": 0.00011956452669484908, "loss": 11.5788, "num_tokens": 4643800.0, "step": 651 }, { "epoch": 0.4556254367575122, "grad_norm": 0.9022730838918819, "learning_rate": 0.0001193425113140053, "loss": 11.7337, "num_tokens": 4650901.0, "step": 652 }, { "epoch": 0.45632424877707894, "grad_norm": 0.9200091529677351, "learning_rate": 0.00011912039684232674, "loss": 11.7567, "num_tokens": 4657604.0, "step": 653 }, { "epoch": 0.4570230607966457, "grad_norm": 0.9853157147946073, "learning_rate": 0.000118898184417696, "loss": 11.5672, "num_tokens": 4664504.0, "step": 654 }, { "epoch": 0.45772187281621246, "grad_norm": 0.9357526749090548, "learning_rate": 0.00011867587517849757, "loss": 11.6281, "num_tokens": 4670477.0, "step": 655 }, { "epoch": 0.4584206848357792, "grad_norm": 0.8505438973909345, "learning_rate": 0.0001184534702636119, "loss": 11.5583, "num_tokens": 4677858.0, "step": 656 }, { "epoch": 0.4591194968553459, "grad_norm": 1.0023298426002707, "learning_rate": 0.00011823097081240964, "loss": 11.5818, "num_tokens": 4684626.0, "step": 657 }, { "epoch": 0.45981830887491265, "grad_norm": 0.9108337238330952, "learning_rate": 0.00011800837796474561, "loss": 11.837, "num_tokens": 4691582.0, "step": 658 }, { "epoch": 0.4605171208944794, "grad_norm": 0.8958214114567762, "learning_rate": 0.00011778569286095329, "loss": 11.6986, "num_tokens": 4699398.0, "step": 659 }, { "epoch": 0.4612159329140461, "grad_norm": 0.8418306799615523, "learning_rate": 0.00011756291664183859, "loss": 11.6767, "num_tokens": 4707448.0, "step": 660 }, { "epoch": 0.46191474493361284, "grad_norm": 0.9201936424777087, "learning_rate": 0.00011734005044867426, "loss": 11.6697, "num_tokens": 4714120.0, "step": 661 }, { "epoch": 0.46261355695317957, "grad_norm": 0.9474217118708662, "learning_rate": 0.00011711709542319411, "loss": 11.6511, "num_tokens": 4721137.0, "step": 662 }, { "epoch": 0.46331236897274636, "grad_norm": 1.0911097821663276, "learning_rate": 0.00011689405270758684, "loss": 11.8961, "num_tokens": 4727881.0, "step": 663 }, { "epoch": 0.4640111809923131, "grad_norm": 0.9131509898854648, "learning_rate": 0.00011667092344449053, "loss": 11.7066, "num_tokens": 4735809.0, "step": 664 }, { "epoch": 0.4647099930118798, "grad_norm": 1.008550314106262, "learning_rate": 0.00011644770877698654, "loss": 11.8672, "num_tokens": 4742094.0, "step": 665 }, { "epoch": 0.46540880503144655, "grad_norm": 0.8283668414360017, "learning_rate": 0.00011622440984859384, "loss": 11.537, "num_tokens": 4749810.0, "step": 666 }, { "epoch": 0.4661076170510133, "grad_norm": 1.046661693335407, "learning_rate": 0.00011600102780326296, "loss": 11.6646, "num_tokens": 4756916.0, "step": 667 }, { "epoch": 0.46680642907058, "grad_norm": 0.8517203229245645, "learning_rate": 0.00011577756378537033, "loss": 11.6898, "num_tokens": 4764365.0, "step": 668 }, { "epoch": 0.46750524109014674, "grad_norm": 0.9243441693404036, "learning_rate": 0.00011555401893971229, "loss": 11.5513, "num_tokens": 4771335.0, "step": 669 }, { "epoch": 0.46820405310971347, "grad_norm": 0.8779927585010323, "learning_rate": 0.00011533039441149926, "loss": 11.5647, "num_tokens": 4778789.0, "step": 670 }, { "epoch": 0.4689028651292802, "grad_norm": 0.8629256515595302, "learning_rate": 0.00011510669134634984, "loss": 11.6128, "num_tokens": 4786533.0, "step": 671 }, { "epoch": 0.469601677148847, "grad_norm": 0.919436534596015, "learning_rate": 0.000114882910890285, "loss": 11.7525, "num_tokens": 4793395.0, "step": 672 }, { "epoch": 0.4703004891684137, "grad_norm": 0.8679940499104837, "learning_rate": 0.00011465905418972216, "loss": 11.6152, "num_tokens": 4800501.0, "step": 673 }, { "epoch": 0.47099930118798045, "grad_norm": 0.8595368570624692, "learning_rate": 0.00011443512239146941, "loss": 11.4401, "num_tokens": 4807730.0, "step": 674 }, { "epoch": 0.4716981132075472, "grad_norm": 0.9418065073409608, "learning_rate": 0.00011421111664271946, "loss": 11.9013, "num_tokens": 4814631.0, "step": 675 }, { "epoch": 0.4723969252271139, "grad_norm": 0.8698638525941518, "learning_rate": 0.00011398703809104391, "loss": 11.7185, "num_tokens": 4821859.0, "step": 676 }, { "epoch": 0.47309573724668064, "grad_norm": 0.9484728369720604, "learning_rate": 0.00011376288788438734, "loss": 11.783, "num_tokens": 4829001.0, "step": 677 }, { "epoch": 0.47379454926624737, "grad_norm": 0.8223477112780467, "learning_rate": 0.00011353866717106137, "loss": 11.8393, "num_tokens": 4836233.0, "step": 678 }, { "epoch": 0.4744933612858141, "grad_norm": 0.831097262102507, "learning_rate": 0.0001133143770997389, "loss": 11.6262, "num_tokens": 4844216.0, "step": 679 }, { "epoch": 0.4751921733053808, "grad_norm": 0.8328282276106426, "learning_rate": 0.00011309001881944809, "loss": 11.6812, "num_tokens": 4851705.0, "step": 680 }, { "epoch": 0.4758909853249476, "grad_norm": 0.9257538169159817, "learning_rate": 0.00011286559347956651, "loss": 11.8385, "num_tokens": 4858279.0, "step": 681 }, { "epoch": 0.47658979734451434, "grad_norm": 0.8413337207403613, "learning_rate": 0.00011264110222981535, "loss": 11.6961, "num_tokens": 4866344.0, "step": 682 }, { "epoch": 0.4772886093640811, "grad_norm": 0.8649763291413212, "learning_rate": 0.00011241654622025334, "loss": 11.5717, "num_tokens": 4873494.0, "step": 683 }, { "epoch": 0.4779874213836478, "grad_norm": 0.8267894686349591, "learning_rate": 0.00011219192660127116, "loss": 11.5825, "num_tokens": 4880904.0, "step": 684 }, { "epoch": 0.47868623340321453, "grad_norm": 0.7711166963102474, "learning_rate": 0.00011196724452358516, "loss": 11.5718, "num_tokens": 4888663.0, "step": 685 }, { "epoch": 0.47938504542278126, "grad_norm": 0.9464532926989868, "learning_rate": 0.00011174250113823173, "loss": 11.5705, "num_tokens": 4895524.0, "step": 686 }, { "epoch": 0.480083857442348, "grad_norm": 0.8496053713043877, "learning_rate": 0.00011151769759656136, "loss": 11.5792, "num_tokens": 4902875.0, "step": 687 }, { "epoch": 0.4807826694619147, "grad_norm": 0.9687523885064169, "learning_rate": 0.00011129283505023274, "loss": 11.6743, "num_tokens": 4910339.0, "step": 688 }, { "epoch": 0.48148148148148145, "grad_norm": 0.8936773368209414, "learning_rate": 0.00011106791465120678, "loss": 11.5858, "num_tokens": 4917859.0, "step": 689 }, { "epoch": 0.48218029350104824, "grad_norm": 0.8387839584310475, "learning_rate": 0.00011084293755174083, "loss": 11.529, "num_tokens": 4924845.0, "step": 690 }, { "epoch": 0.48287910552061497, "grad_norm": 0.8689568841128783, "learning_rate": 0.0001106179049043826, "loss": 11.6106, "num_tokens": 4932581.0, "step": 691 }, { "epoch": 0.4835779175401817, "grad_norm": 0.9581051097093195, "learning_rate": 0.00011039281786196454, "loss": 11.5746, "num_tokens": 4938840.0, "step": 692 }, { "epoch": 0.48427672955974843, "grad_norm": 0.8586009856955368, "learning_rate": 0.00011016767757759758, "loss": 11.5862, "num_tokens": 4946122.0, "step": 693 }, { "epoch": 0.48497554157931516, "grad_norm": 0.8457064008914603, "learning_rate": 0.00010994248520466555, "loss": 11.6757, "num_tokens": 4953325.0, "step": 694 }, { "epoch": 0.4856743535988819, "grad_norm": 0.8780641814348458, "learning_rate": 0.00010971724189681907, "loss": 11.4986, "num_tokens": 4960210.0, "step": 695 }, { "epoch": 0.4863731656184486, "grad_norm": 0.7763581662404005, "learning_rate": 0.00010949194880796966, "loss": 11.4152, "num_tokens": 4968396.0, "step": 696 }, { "epoch": 0.48707197763801535, "grad_norm": 0.8525009632274685, "learning_rate": 0.000109266607092284, "loss": 11.6096, "num_tokens": 4975812.0, "step": 697 }, { "epoch": 0.48777078965758214, "grad_norm": 0.8011788073233526, "learning_rate": 0.00010904121790417767, "loss": 11.5614, "num_tokens": 4983615.0, "step": 698 }, { "epoch": 0.48846960167714887, "grad_norm": 0.8819826065813017, "learning_rate": 0.00010881578239830965, "loss": 11.4381, "num_tokens": 4990664.0, "step": 699 }, { "epoch": 0.4891684136967156, "grad_norm": 0.7917077959661855, "learning_rate": 0.0001085903017295761, "loss": 11.576, "num_tokens": 4997991.0, "step": 700 }, { "epoch": 0.48986722571628233, "grad_norm": 0.8045632066937026, "learning_rate": 0.00010836477705310457, "loss": 11.4527, "num_tokens": 5005535.0, "step": 701 }, { "epoch": 0.49056603773584906, "grad_norm": 0.8324889010068139, "learning_rate": 0.00010813920952424805, "loss": 11.4868, "num_tokens": 5012725.0, "step": 702 }, { "epoch": 0.4912648497554158, "grad_norm": 0.8737336153739056, "learning_rate": 0.00010791360029857908, "loss": 11.5057, "num_tokens": 5020226.0, "step": 703 }, { "epoch": 0.4919636617749825, "grad_norm": 0.8651144067674705, "learning_rate": 0.00010768795053188378, "loss": 11.6567, "num_tokens": 5027841.0, "step": 704 }, { "epoch": 0.49266247379454925, "grad_norm": 0.8803712717375257, "learning_rate": 0.00010746226138015605, "loss": 11.5934, "num_tokens": 5034672.0, "step": 705 }, { "epoch": 0.493361285814116, "grad_norm": 0.9111814847188424, "learning_rate": 0.00010723653399959141, "loss": 11.5609, "num_tokens": 5041763.0, "step": 706 }, { "epoch": 0.49406009783368277, "grad_norm": 0.836271613979484, "learning_rate": 0.00010701076954658133, "loss": 11.7879, "num_tokens": 5048922.0, "step": 707 }, { "epoch": 0.4947589098532495, "grad_norm": 0.8749158004381932, "learning_rate": 0.00010678496917770719, "loss": 11.6949, "num_tokens": 5056008.0, "step": 708 }, { "epoch": 0.4954577218728162, "grad_norm": 0.9232202210604769, "learning_rate": 0.00010655913404973432, "loss": 11.5986, "num_tokens": 5062729.0, "step": 709 }, { "epoch": 0.49615653389238296, "grad_norm": 0.813868938711914, "learning_rate": 0.0001063332653196062, "loss": 11.6453, "num_tokens": 5069602.0, "step": 710 }, { "epoch": 0.4968553459119497, "grad_norm": 0.9242126754852555, "learning_rate": 0.00010610736414443836, "loss": 11.5618, "num_tokens": 5076146.0, "step": 711 }, { "epoch": 0.4975541579315164, "grad_norm": 0.8136052479087154, "learning_rate": 0.00010588143168151257, "loss": 11.4639, "num_tokens": 5083612.0, "step": 712 }, { "epoch": 0.49825296995108315, "grad_norm": 0.8852326396173759, "learning_rate": 0.00010565546908827093, "loss": 11.4881, "num_tokens": 5090353.0, "step": 713 }, { "epoch": 0.4989517819706499, "grad_norm": 0.8821183773577141, "learning_rate": 0.00010542947752230987, "loss": 11.483, "num_tokens": 5098640.0, "step": 714 }, { "epoch": 0.4996505939902166, "grad_norm": 0.785007875134882, "learning_rate": 0.00010520345814137422, "loss": 11.5312, "num_tokens": 5106336.0, "step": 715 }, { "epoch": 0.5003494060097834, "grad_norm": 0.9266326616202625, "learning_rate": 0.0001049774121033514, "loss": 11.5449, "num_tokens": 5113949.0, "step": 716 }, { "epoch": 0.5010482180293501, "grad_norm": 0.9045490686422497, "learning_rate": 0.00010475134056626521, "loss": 11.3111, "num_tokens": 5120936.0, "step": 717 }, { "epoch": 0.5017470300489169, "grad_norm": 0.8682422987772898, "learning_rate": 0.00010452524468827028, "loss": 11.5875, "num_tokens": 5127936.0, "step": 718 }, { "epoch": 0.5024458420684835, "grad_norm": 0.9430827830505436, "learning_rate": 0.00010429912562764582, "loss": 11.5213, "num_tokens": 5135181.0, "step": 719 }, { "epoch": 0.5031446540880503, "grad_norm": 0.9079946006860932, "learning_rate": 0.00010407298454278983, "loss": 11.668, "num_tokens": 5141581.0, "step": 720 }, { "epoch": 0.5038434661076171, "grad_norm": 0.8275366610829485, "learning_rate": 0.00010384682259221314, "loss": 11.5375, "num_tokens": 5149210.0, "step": 721 }, { "epoch": 0.5045422781271838, "grad_norm": 0.8396155615656928, "learning_rate": 0.00010362064093453347, "loss": 11.714, "num_tokens": 5156153.0, "step": 722 }, { "epoch": 0.5052410901467506, "grad_norm": 0.896827897280713, "learning_rate": 0.00010339444072846955, "loss": 11.5958, "num_tokens": 5163697.0, "step": 723 }, { "epoch": 0.5059399021663172, "grad_norm": 0.8796645326364783, "learning_rate": 0.00010316822313283503, "loss": 11.5598, "num_tokens": 5170939.0, "step": 724 }, { "epoch": 0.506638714185884, "grad_norm": 0.8530027365550926, "learning_rate": 0.00010294198930653273, "loss": 11.5458, "num_tokens": 5178058.0, "step": 725 }, { "epoch": 0.5073375262054507, "grad_norm": 0.9210847197223277, "learning_rate": 0.00010271574040854863, "loss": 11.4003, "num_tokens": 5185093.0, "step": 726 }, { "epoch": 0.5080363382250175, "grad_norm": 1.0009241150831982, "learning_rate": 0.00010248947759794583, "loss": 11.7382, "num_tokens": 5191726.0, "step": 727 }, { "epoch": 0.5087351502445842, "grad_norm": 0.9845840878696487, "learning_rate": 0.00010226320203385878, "loss": 11.4472, "num_tokens": 5198609.0, "step": 728 }, { "epoch": 0.5094339622641509, "grad_norm": 0.9751424145383517, "learning_rate": 0.00010203691487548721, "loss": 11.6166, "num_tokens": 5206060.0, "step": 729 }, { "epoch": 0.5101327742837177, "grad_norm": 0.9203888539094582, "learning_rate": 0.00010181061728209034, "loss": 11.6607, "num_tokens": 5213009.0, "step": 730 }, { "epoch": 0.5108315863032844, "grad_norm": 0.8788149722321699, "learning_rate": 0.00010158431041298076, "loss": 11.7453, "num_tokens": 5220145.0, "step": 731 }, { "epoch": 0.5115303983228512, "grad_norm": 0.903757013720806, "learning_rate": 0.00010135799542751861, "loss": 11.597, "num_tokens": 5226851.0, "step": 732 }, { "epoch": 0.5122292103424179, "grad_norm": 1.155119292678417, "learning_rate": 0.0001011316734851056, "loss": 11.4747, "num_tokens": 5234276.0, "step": 733 }, { "epoch": 0.5129280223619846, "grad_norm": 0.8100458853841396, "learning_rate": 0.00010090534574517907, "loss": 11.419, "num_tokens": 5241284.0, "step": 734 }, { "epoch": 0.5136268343815513, "grad_norm": 1.1407647261079725, "learning_rate": 0.00010067901336720611, "loss": 11.391, "num_tokens": 5248568.0, "step": 735 }, { "epoch": 0.5143256464011181, "grad_norm": 0.8989248775749288, "learning_rate": 0.00010045267751067757, "loss": 11.818, "num_tokens": 5255337.0, "step": 736 }, { "epoch": 0.5150244584206848, "grad_norm": 0.9297104357321131, "learning_rate": 0.00010022633933510201, "loss": 11.4153, "num_tokens": 5262391.0, "step": 737 }, { "epoch": 0.5157232704402516, "grad_norm": 0.8678797973500714, "learning_rate": 0.0001, "loss": 11.628, "num_tokens": 5270611.0, "step": 738 }, { "epoch": 0.5164220824598184, "grad_norm": 0.826651401983544, "learning_rate": 9.977366066489801e-05, "loss": 11.4746, "num_tokens": 5278249.0, "step": 739 }, { "epoch": 0.517120894479385, "grad_norm": 0.9125187235908128, "learning_rate": 9.954732248932244e-05, "loss": 11.6169, "num_tokens": 5285271.0, "step": 740 }, { "epoch": 0.5178197064989518, "grad_norm": 0.8816292587170192, "learning_rate": 9.932098663279392e-05, "loss": 11.4734, "num_tokens": 5292168.0, "step": 741 }, { "epoch": 0.5185185185185185, "grad_norm": 0.9512003221847414, "learning_rate": 9.909465425482093e-05, "loss": 11.638, "num_tokens": 5298892.0, "step": 742 }, { "epoch": 0.5192173305380853, "grad_norm": 0.9084938360187235, "learning_rate": 9.886832651489444e-05, "loss": 11.5939, "num_tokens": 5305743.0, "step": 743 }, { "epoch": 0.519916142557652, "grad_norm": 0.8670145766679175, "learning_rate": 9.864200457248144e-05, "loss": 11.4224, "num_tokens": 5313196.0, "step": 744 }, { "epoch": 0.5206149545772187, "grad_norm": 0.8331174116098162, "learning_rate": 9.841568958701924e-05, "loss": 11.4973, "num_tokens": 5320688.0, "step": 745 }, { "epoch": 0.5213137665967854, "grad_norm": 0.8997751578998335, "learning_rate": 9.81893827179097e-05, "loss": 11.4589, "num_tokens": 5328457.0, "step": 746 }, { "epoch": 0.5220125786163522, "grad_norm": 0.8720648157152265, "learning_rate": 9.796308512451284e-05, "loss": 11.6434, "num_tokens": 5335909.0, "step": 747 }, { "epoch": 0.522711390635919, "grad_norm": 0.973933585834344, "learning_rate": 9.773679796614124e-05, "loss": 11.3723, "num_tokens": 5343377.0, "step": 748 }, { "epoch": 0.5234102026554857, "grad_norm": 0.8303294482427562, "learning_rate": 9.751052240205421e-05, "loss": 11.5721, "num_tokens": 5350725.0, "step": 749 }, { "epoch": 0.5241090146750524, "grad_norm": 0.9573042241935518, "learning_rate": 9.728425959145139e-05, "loss": 11.5018, "num_tokens": 5358235.0, "step": 750 }, { "epoch": 0.5248078266946191, "grad_norm": 0.8319564056885251, "learning_rate": 9.705801069346729e-05, "loss": 11.613, "num_tokens": 5365590.0, "step": 751 }, { "epoch": 0.5255066387141859, "grad_norm": 0.8938099740380406, "learning_rate": 9.683177686716501e-05, "loss": 11.4718, "num_tokens": 5371915.0, "step": 752 }, { "epoch": 0.5262054507337526, "grad_norm": 1.0236425836011982, "learning_rate": 9.660555927153047e-05, "loss": 11.6484, "num_tokens": 5378290.0, "step": 753 }, { "epoch": 0.5269042627533194, "grad_norm": 0.7790090582597233, "learning_rate": 9.637935906546655e-05, "loss": 11.4802, "num_tokens": 5385294.0, "step": 754 }, { "epoch": 0.527603074772886, "grad_norm": 0.9227942573531065, "learning_rate": 9.615317740778689e-05, "loss": 11.6279, "num_tokens": 5392707.0, "step": 755 }, { "epoch": 0.5283018867924528, "grad_norm": 0.8441266244096841, "learning_rate": 9.592701545721021e-05, "loss": 11.5781, "num_tokens": 5400588.0, "step": 756 }, { "epoch": 0.5290006988120196, "grad_norm": 0.8729177064492472, "learning_rate": 9.570087437235423e-05, "loss": 11.6521, "num_tokens": 5407337.0, "step": 757 }, { "epoch": 0.5296995108315863, "grad_norm": 0.8659344216472323, "learning_rate": 9.547475531172973e-05, "loss": 11.5359, "num_tokens": 5414577.0, "step": 758 }, { "epoch": 0.5303983228511531, "grad_norm": 0.8714852832156825, "learning_rate": 9.524865943373481e-05, "loss": 11.5915, "num_tokens": 5421211.0, "step": 759 }, { "epoch": 0.5310971348707197, "grad_norm": 0.9244623269796611, "learning_rate": 9.502258789664865e-05, "loss": 11.6818, "num_tokens": 5427910.0, "step": 760 }, { "epoch": 0.5317959468902865, "grad_norm": 0.9155444602382034, "learning_rate": 9.479654185862579e-05, "loss": 11.6422, "num_tokens": 5434742.0, "step": 761 }, { "epoch": 0.5324947589098532, "grad_norm": 0.8219424880205499, "learning_rate": 9.457052247769017e-05, "loss": 11.5446, "num_tokens": 5441794.0, "step": 762 }, { "epoch": 0.53319357092942, "grad_norm": 0.7811646495161321, "learning_rate": 9.434453091172908e-05, "loss": 11.4499, "num_tokens": 5449915.0, "step": 763 }, { "epoch": 0.5338923829489868, "grad_norm": 0.8952879445607002, "learning_rate": 9.411856831848745e-05, "loss": 11.4182, "num_tokens": 5456843.0, "step": 764 }, { "epoch": 0.5345911949685535, "grad_norm": 0.8151057656551013, "learning_rate": 9.38926358555617e-05, "loss": 11.5544, "num_tokens": 5464023.0, "step": 765 }, { "epoch": 0.5352900069881202, "grad_norm": 0.8620924745150299, "learning_rate": 9.366673468039383e-05, "loss": 11.7123, "num_tokens": 5471406.0, "step": 766 }, { "epoch": 0.5359888190076869, "grad_norm": 0.9188647725830821, "learning_rate": 9.34408659502657e-05, "loss": 11.5564, "num_tokens": 5478339.0, "step": 767 }, { "epoch": 0.5366876310272537, "grad_norm": 0.8321100069995644, "learning_rate": 9.321503082229282e-05, "loss": 11.4913, "num_tokens": 5485853.0, "step": 768 }, { "epoch": 0.5373864430468204, "grad_norm": 0.8562467427898608, "learning_rate": 9.298923045341869e-05, "loss": 11.453, "num_tokens": 5493547.0, "step": 769 }, { "epoch": 0.5380852550663872, "grad_norm": 0.9138670376872613, "learning_rate": 9.276346600040862e-05, "loss": 11.2109, "num_tokens": 5500515.0, "step": 770 }, { "epoch": 0.5387840670859538, "grad_norm": 0.9006556795724268, "learning_rate": 9.253773861984397e-05, "loss": 11.4139, "num_tokens": 5507599.0, "step": 771 }, { "epoch": 0.5394828791055206, "grad_norm": 0.8553530215170654, "learning_rate": 9.231204946811624e-05, "loss": 11.5513, "num_tokens": 5514593.0, "step": 772 }, { "epoch": 0.5401816911250874, "grad_norm": 0.9148080781609712, "learning_rate": 9.208639970142093e-05, "loss": 11.5843, "num_tokens": 5521763.0, "step": 773 }, { "epoch": 0.5408805031446541, "grad_norm": 0.7554532212539814, "learning_rate": 9.186079047575197e-05, "loss": 11.5373, "num_tokens": 5529580.0, "step": 774 }, { "epoch": 0.5415793151642209, "grad_norm": 0.8307938551299585, "learning_rate": 9.163522294689546e-05, "loss": 11.461, "num_tokens": 5536873.0, "step": 775 }, { "epoch": 0.5422781271837875, "grad_norm": 0.9281257673464066, "learning_rate": 9.140969827042391e-05, "loss": 11.5544, "num_tokens": 5543550.0, "step": 776 }, { "epoch": 0.5429769392033543, "grad_norm": 0.8120164780002868, "learning_rate": 9.118421760169038e-05, "loss": 11.7136, "num_tokens": 5550884.0, "step": 777 }, { "epoch": 0.543675751222921, "grad_norm": 0.8836373379698687, "learning_rate": 9.095878209582237e-05, "loss": 11.386, "num_tokens": 5557807.0, "step": 778 }, { "epoch": 0.5443745632424878, "grad_norm": 0.9297380134031583, "learning_rate": 9.073339290771603e-05, "loss": 11.5867, "num_tokens": 5564576.0, "step": 779 }, { "epoch": 0.5450733752620545, "grad_norm": 0.7995838496582022, "learning_rate": 9.050805119203035e-05, "loss": 11.4059, "num_tokens": 5572504.0, "step": 780 }, { "epoch": 0.5457721872816212, "grad_norm": 0.8264976985889633, "learning_rate": 9.028275810318095e-05, "loss": 11.3345, "num_tokens": 5579720.0, "step": 781 }, { "epoch": 0.546470999301188, "grad_norm": 0.8302780963555162, "learning_rate": 9.005751479533449e-05, "loss": 11.5461, "num_tokens": 5586866.0, "step": 782 }, { "epoch": 0.5471698113207547, "grad_norm": 0.8842323310444606, "learning_rate": 8.983232242240247e-05, "loss": 11.4853, "num_tokens": 5593273.0, "step": 783 }, { "epoch": 0.5478686233403215, "grad_norm": 0.8079761026322886, "learning_rate": 8.96071821380355e-05, "loss": 11.6445, "num_tokens": 5600825.0, "step": 784 }, { "epoch": 0.5485674353598882, "grad_norm": 0.7976472224058269, "learning_rate": 8.938209509561741e-05, "loss": 11.5162, "num_tokens": 5608068.0, "step": 785 }, { "epoch": 0.549266247379455, "grad_norm": 0.7844336006627589, "learning_rate": 8.91570624482592e-05, "loss": 11.5521, "num_tokens": 5615637.0, "step": 786 }, { "epoch": 0.5499650593990216, "grad_norm": 0.8546739107820156, "learning_rate": 8.893208534879324e-05, "loss": 11.4099, "num_tokens": 5622822.0, "step": 787 }, { "epoch": 0.5506638714185884, "grad_norm": 0.7935052239619794, "learning_rate": 8.87071649497673e-05, "loss": 11.4709, "num_tokens": 5629785.0, "step": 788 }, { "epoch": 0.5513626834381551, "grad_norm": 0.8364823960249244, "learning_rate": 8.848230240343865e-05, "loss": 11.5328, "num_tokens": 5636551.0, "step": 789 }, { "epoch": 0.5520614954577219, "grad_norm": 0.8504654444826972, "learning_rate": 8.82574988617683e-05, "loss": 11.4773, "num_tokens": 5643210.0, "step": 790 }, { "epoch": 0.5527603074772887, "grad_norm": 0.8185493182608572, "learning_rate": 8.803275547641488e-05, "loss": 11.3797, "num_tokens": 5650927.0, "step": 791 }, { "epoch": 0.5534591194968553, "grad_norm": 0.7875124927027887, "learning_rate": 8.780807339872886e-05, "loss": 11.6004, "num_tokens": 5658354.0, "step": 792 }, { "epoch": 0.5541579315164221, "grad_norm": 0.9390755326975335, "learning_rate": 8.758345377974667e-05, "loss": 11.7106, "num_tokens": 5664499.0, "step": 793 }, { "epoch": 0.5548567435359888, "grad_norm": 0.8744733379949224, "learning_rate": 8.735889777018465e-05, "loss": 11.5987, "num_tokens": 5670823.0, "step": 794 }, { "epoch": 0.5555555555555556, "grad_norm": 0.7936257103943727, "learning_rate": 8.71344065204335e-05, "loss": 11.3932, "num_tokens": 5678464.0, "step": 795 }, { "epoch": 0.5562543675751223, "grad_norm": 0.9012746970901263, "learning_rate": 8.690998118055193e-05, "loss": 11.4493, "num_tokens": 5685313.0, "step": 796 }, { "epoch": 0.556953179594689, "grad_norm": 0.7949308089769119, "learning_rate": 8.66856229002611e-05, "loss": 11.6037, "num_tokens": 5692972.0, "step": 797 }, { "epoch": 0.5576519916142557, "grad_norm": 0.8986243273374567, "learning_rate": 8.646133282893864e-05, "loss": 11.4562, "num_tokens": 5699327.0, "step": 798 }, { "epoch": 0.5583508036338225, "grad_norm": 0.8149788916712181, "learning_rate": 8.623711211561267e-05, "loss": 11.4127, "num_tokens": 5706702.0, "step": 799 }, { "epoch": 0.5590496156533893, "grad_norm": 0.8205025674176395, "learning_rate": 8.601296190895611e-05, "loss": 11.5586, "num_tokens": 5713918.0, "step": 800 }, { "epoch": 0.559748427672956, "grad_norm": 0.9188648076919179, "learning_rate": 8.578888335728057e-05, "loss": 11.4676, "num_tokens": 5721220.0, "step": 801 }, { "epoch": 0.5604472396925227, "grad_norm": 0.8501915802973017, "learning_rate": 8.55648776085306e-05, "loss": 11.4786, "num_tokens": 5728492.0, "step": 802 }, { "epoch": 0.5611460517120894, "grad_norm": 0.8285958870852815, "learning_rate": 8.534094581027785e-05, "loss": 11.6043, "num_tokens": 5735983.0, "step": 803 }, { "epoch": 0.5618448637316562, "grad_norm": 0.944523071088074, "learning_rate": 8.511708910971505e-05, "loss": 11.5251, "num_tokens": 5742678.0, "step": 804 }, { "epoch": 0.5625436757512229, "grad_norm": 0.8081306301004181, "learning_rate": 8.489330865365018e-05, "loss": 11.4547, "num_tokens": 5750553.0, "step": 805 }, { "epoch": 0.5632424877707897, "grad_norm": 0.7754025315598624, "learning_rate": 8.466960558850077e-05, "loss": 11.2548, "num_tokens": 5758022.0, "step": 806 }, { "epoch": 0.5639412997903563, "grad_norm": 0.8196770260099395, "learning_rate": 8.444598106028773e-05, "loss": 11.4266, "num_tokens": 5765255.0, "step": 807 }, { "epoch": 0.5646401118099231, "grad_norm": 0.8107362785471051, "learning_rate": 8.422243621462969e-05, "loss": 11.4811, "num_tokens": 5772746.0, "step": 808 }, { "epoch": 0.5653389238294899, "grad_norm": 0.7690551203928506, "learning_rate": 8.399897219673709e-05, "loss": 11.3748, "num_tokens": 5779819.0, "step": 809 }, { "epoch": 0.5660377358490566, "grad_norm": 0.7912532894956382, "learning_rate": 8.37755901514062e-05, "loss": 11.4268, "num_tokens": 5787293.0, "step": 810 }, { "epoch": 0.5667365478686234, "grad_norm": 0.7530699743645662, "learning_rate": 8.355229122301348e-05, "loss": 11.4364, "num_tokens": 5795358.0, "step": 811 }, { "epoch": 0.56743535988819, "grad_norm": 0.8641916957536385, "learning_rate": 8.332907655550948e-05, "loss": 11.565, "num_tokens": 5802192.0, "step": 812 }, { "epoch": 0.5681341719077568, "grad_norm": 0.874534688203229, "learning_rate": 8.310594729241317e-05, "loss": 11.6021, "num_tokens": 5809107.0, "step": 813 }, { "epoch": 0.5688329839273235, "grad_norm": 0.7922582476273456, "learning_rate": 8.288290457680591e-05, "loss": 11.3441, "num_tokens": 5815896.0, "step": 814 }, { "epoch": 0.5695317959468903, "grad_norm": 0.8349253188255037, "learning_rate": 8.265994955132572e-05, "loss": 11.6314, "num_tokens": 5823124.0, "step": 815 }, { "epoch": 0.570230607966457, "grad_norm": 0.8699624382696525, "learning_rate": 8.243708335816145e-05, "loss": 11.6838, "num_tokens": 5829951.0, "step": 816 }, { "epoch": 0.5709294199860238, "grad_norm": 0.8100979857916469, "learning_rate": 8.221430713904672e-05, "loss": 11.3424, "num_tokens": 5836842.0, "step": 817 }, { "epoch": 0.5716282320055905, "grad_norm": 0.8477977602214554, "learning_rate": 8.19916220352544e-05, "loss": 11.5169, "num_tokens": 5844342.0, "step": 818 }, { "epoch": 0.5723270440251572, "grad_norm": 0.7768417118848057, "learning_rate": 8.176902918759041e-05, "loss": 11.4596, "num_tokens": 5851608.0, "step": 819 }, { "epoch": 0.573025856044724, "grad_norm": 0.8145926096488002, "learning_rate": 8.15465297363881e-05, "loss": 11.3234, "num_tokens": 5858938.0, "step": 820 }, { "epoch": 0.5737246680642907, "grad_norm": 0.8306120383140485, "learning_rate": 8.132412482150245e-05, "loss": 11.4715, "num_tokens": 5865983.0, "step": 821 }, { "epoch": 0.5744234800838575, "grad_norm": 0.8432187258645947, "learning_rate": 8.110181558230404e-05, "loss": 11.5188, "num_tokens": 5872600.0, "step": 822 }, { "epoch": 0.5751222921034241, "grad_norm": 0.913675498107121, "learning_rate": 8.087960315767328e-05, "loss": 11.4379, "num_tokens": 5878398.0, "step": 823 }, { "epoch": 0.5758211041229909, "grad_norm": 0.7767964555554504, "learning_rate": 8.06574886859947e-05, "loss": 11.5975, "num_tokens": 5886022.0, "step": 824 }, { "epoch": 0.5765199161425576, "grad_norm": 0.8067890059558731, "learning_rate": 8.043547330515092e-05, "loss": 11.417, "num_tokens": 5893141.0, "step": 825 }, { "epoch": 0.5772187281621244, "grad_norm": 0.7778204138902519, "learning_rate": 8.021355815251703e-05, "loss": 11.4109, "num_tokens": 5900162.0, "step": 826 }, { "epoch": 0.5779175401816912, "grad_norm": 0.8581915102507534, "learning_rate": 7.999174436495456e-05, "loss": 11.3281, "num_tokens": 5907169.0, "step": 827 }, { "epoch": 0.5786163522012578, "grad_norm": 0.7943071251832075, "learning_rate": 7.97700330788058e-05, "loss": 11.5825, "num_tokens": 5914949.0, "step": 828 }, { "epoch": 0.5793151642208246, "grad_norm": 0.8307994124468097, "learning_rate": 7.954842542988792e-05, "loss": 11.5205, "num_tokens": 5921813.0, "step": 829 }, { "epoch": 0.5800139762403913, "grad_norm": 0.7956011180143502, "learning_rate": 7.932692255348711e-05, "loss": 11.6589, "num_tokens": 5929467.0, "step": 830 }, { "epoch": 0.5807127882599581, "grad_norm": 0.751578867815494, "learning_rate": 7.910552558435297e-05, "loss": 11.3462, "num_tokens": 5936756.0, "step": 831 }, { "epoch": 0.5814116002795248, "grad_norm": 0.7268812168656755, "learning_rate": 7.888423565669236e-05, "loss": 11.4221, "num_tokens": 5944825.0, "step": 832 }, { "epoch": 0.5821104122990916, "grad_norm": 0.8006177593002789, "learning_rate": 7.866305390416385e-05, "loss": 11.4256, "num_tokens": 5951987.0, "step": 833 }, { "epoch": 0.5828092243186582, "grad_norm": 0.7402596515700245, "learning_rate": 7.844198145987187e-05, "loss": 11.3692, "num_tokens": 5959236.0, "step": 834 }, { "epoch": 0.583508036338225, "grad_norm": 0.8602548653263015, "learning_rate": 7.82210194563608e-05, "loss": 11.7315, "num_tokens": 5966125.0, "step": 835 }, { "epoch": 0.5842068483577918, "grad_norm": 0.7749004598919897, "learning_rate": 7.800016902560924e-05, "loss": 11.4858, "num_tokens": 5974067.0, "step": 836 }, { "epoch": 0.5849056603773585, "grad_norm": 0.833537255516446, "learning_rate": 7.77794312990243e-05, "loss": 11.4515, "num_tokens": 5980876.0, "step": 837 }, { "epoch": 0.5856044723969253, "grad_norm": 0.8541885917387239, "learning_rate": 7.755880740743559e-05, "loss": 11.6651, "num_tokens": 5988494.0, "step": 838 }, { "epoch": 0.5863032844164919, "grad_norm": 0.8342950784145347, "learning_rate": 7.733829848108965e-05, "loss": 11.3994, "num_tokens": 5995555.0, "step": 839 }, { "epoch": 0.5870020964360587, "grad_norm": 0.7862902039549512, "learning_rate": 7.7117905649644e-05, "loss": 11.3371, "num_tokens": 6002655.0, "step": 840 }, { "epoch": 0.5877009084556254, "grad_norm": 0.8543777698570504, "learning_rate": 7.689763004216135e-05, "loss": 11.5781, "num_tokens": 6009185.0, "step": 841 }, { "epoch": 0.5883997204751922, "grad_norm": 0.9108297671558842, "learning_rate": 7.667747278710406e-05, "loss": 11.2922, "num_tokens": 6016823.0, "step": 842 }, { "epoch": 0.589098532494759, "grad_norm": 0.7511611223234017, "learning_rate": 7.6457435012328e-05, "loss": 11.4936, "num_tokens": 6024771.0, "step": 843 }, { "epoch": 0.5897973445143256, "grad_norm": 0.8464284988171904, "learning_rate": 7.623751784507706e-05, "loss": 11.4268, "num_tokens": 6031975.0, "step": 844 }, { "epoch": 0.5904961565338924, "grad_norm": 0.8292138849138506, "learning_rate": 7.601772241197719e-05, "loss": 11.6141, "num_tokens": 6039124.0, "step": 845 }, { "epoch": 0.5911949685534591, "grad_norm": 0.8644997646527984, "learning_rate": 7.579804983903067e-05, "loss": 11.6254, "num_tokens": 6045752.0, "step": 846 }, { "epoch": 0.5918937805730259, "grad_norm": 0.9065613000680274, "learning_rate": 7.557850125161053e-05, "loss": 11.5652, "num_tokens": 6052231.0, "step": 847 }, { "epoch": 0.5925925925925926, "grad_norm": 0.8567835900522591, "learning_rate": 7.535907777445449e-05, "loss": 11.3823, "num_tokens": 6059886.0, "step": 848 }, { "epoch": 0.5932914046121593, "grad_norm": 0.8345066461262305, "learning_rate": 7.513978053165934e-05, "loss": 11.6205, "num_tokens": 6066548.0, "step": 849 }, { "epoch": 0.593990216631726, "grad_norm": 0.8056119012251634, "learning_rate": 7.492061064667526e-05, "loss": 11.4517, "num_tokens": 6074097.0, "step": 850 }, { "epoch": 0.5946890286512928, "grad_norm": 0.9106299776518808, "learning_rate": 7.470156924229988e-05, "loss": 11.524, "num_tokens": 6080717.0, "step": 851 }, { "epoch": 0.5953878406708596, "grad_norm": 0.7853436652395662, "learning_rate": 7.448265744067275e-05, "loss": 11.3629, "num_tokens": 6088678.0, "step": 852 }, { "epoch": 0.5960866526904263, "grad_norm": 0.8446726817091805, "learning_rate": 7.426387636326936e-05, "loss": 11.5285, "num_tokens": 6095534.0, "step": 853 }, { "epoch": 0.596785464709993, "grad_norm": 0.8096739560163518, "learning_rate": 7.404522713089554e-05, "loss": 11.5661, "num_tokens": 6102965.0, "step": 854 }, { "epoch": 0.5974842767295597, "grad_norm": 0.8368927033929309, "learning_rate": 7.382671086368172e-05, "loss": 11.2703, "num_tokens": 6109857.0, "step": 855 }, { "epoch": 0.5981830887491265, "grad_norm": 0.9545515909179914, "learning_rate": 7.360832868107708e-05, "loss": 11.8182, "num_tokens": 6116670.0, "step": 856 }, { "epoch": 0.5988819007686932, "grad_norm": 0.8061669184109826, "learning_rate": 7.3390081701844e-05, "loss": 11.2728, "num_tokens": 6123896.0, "step": 857 }, { "epoch": 0.59958071278826, "grad_norm": 0.8533029486109188, "learning_rate": 7.317197104405213e-05, "loss": 11.5543, "num_tokens": 6130750.0, "step": 858 }, { "epoch": 0.6002795248078266, "grad_norm": 0.8969968982568871, "learning_rate": 7.295399782507275e-05, "loss": 11.3407, "num_tokens": 6137850.0, "step": 859 }, { "epoch": 0.6009783368273934, "grad_norm": 0.8544932603225329, "learning_rate": 7.273616316157312e-05, "loss": 11.3906, "num_tokens": 6144967.0, "step": 860 }, { "epoch": 0.6016771488469602, "grad_norm": 0.9575871493500785, "learning_rate": 7.251846816951063e-05, "loss": 11.3528, "num_tokens": 6151858.0, "step": 861 }, { "epoch": 0.6023759608665269, "grad_norm": 0.9034387927043012, "learning_rate": 7.23009139641271e-05, "loss": 11.4893, "num_tokens": 6158407.0, "step": 862 }, { "epoch": 0.6030747728860937, "grad_norm": 0.8300032020729585, "learning_rate": 7.208350165994325e-05, "loss": 11.6586, "num_tokens": 6165454.0, "step": 863 }, { "epoch": 0.6037735849056604, "grad_norm": 0.9613981820130882, "learning_rate": 7.186623237075265e-05, "loss": 11.4186, "num_tokens": 6172278.0, "step": 864 }, { "epoch": 0.6044723969252271, "grad_norm": 0.8216153509086527, "learning_rate": 7.16491072096164e-05, "loss": 11.5343, "num_tokens": 6179334.0, "step": 865 }, { "epoch": 0.6051712089447938, "grad_norm": 0.8120991028042943, "learning_rate": 7.143212728885714e-05, "loss": 11.4413, "num_tokens": 6186949.0, "step": 866 }, { "epoch": 0.6058700209643606, "grad_norm": 0.8725448199985772, "learning_rate": 7.121529372005335e-05, "loss": 11.5006, "num_tokens": 6194084.0, "step": 867 }, { "epoch": 0.6065688329839273, "grad_norm": 0.7857788374163461, "learning_rate": 7.099860761403403e-05, "loss": 11.5642, "num_tokens": 6201312.0, "step": 868 }, { "epoch": 0.6072676450034941, "grad_norm": 0.749546501349823, "learning_rate": 7.078207008087248e-05, "loss": 11.5207, "num_tokens": 6208547.0, "step": 869 }, { "epoch": 0.6079664570230608, "grad_norm": 0.807549031552495, "learning_rate": 7.056568222988099e-05, "loss": 11.5615, "num_tokens": 6215891.0, "step": 870 }, { "epoch": 0.6086652690426275, "grad_norm": 0.7753534051289811, "learning_rate": 7.034944516960498e-05, "loss": 11.4658, "num_tokens": 6223503.0, "step": 871 }, { "epoch": 0.6093640810621943, "grad_norm": 0.7732259838681556, "learning_rate": 7.013336000781738e-05, "loss": 11.5801, "num_tokens": 6230239.0, "step": 872 }, { "epoch": 0.610062893081761, "grad_norm": 0.7574853840801833, "learning_rate": 6.991742785151305e-05, "loss": 11.3924, "num_tokens": 6237658.0, "step": 873 }, { "epoch": 0.6107617051013278, "grad_norm": 0.8360871413852982, "learning_rate": 6.970164980690285e-05, "loss": 11.5716, "num_tokens": 6244558.0, "step": 874 }, { "epoch": 0.6114605171208944, "grad_norm": 0.8199482824692478, "learning_rate": 6.94860269794083e-05, "loss": 11.5512, "num_tokens": 6251785.0, "step": 875 }, { "epoch": 0.6121593291404612, "grad_norm": 0.7837266750011456, "learning_rate": 6.927056047365557e-05, "loss": 11.3226, "num_tokens": 6258748.0, "step": 876 }, { "epoch": 0.6128581411600279, "grad_norm": 0.7577779156383097, "learning_rate": 6.905525139347011e-05, "loss": 11.494, "num_tokens": 6266062.0, "step": 877 }, { "epoch": 0.6135569531795947, "grad_norm": 0.8052344965652813, "learning_rate": 6.884010084187093e-05, "loss": 11.427, "num_tokens": 6272639.0, "step": 878 }, { "epoch": 0.6142557651991615, "grad_norm": 0.8325818177093272, "learning_rate": 6.86251099210648e-05, "loss": 11.5747, "num_tokens": 6279204.0, "step": 879 }, { "epoch": 0.6149545772187281, "grad_norm": 0.7754376173157712, "learning_rate": 6.841027973244076e-05, "loss": 11.4026, "num_tokens": 6286372.0, "step": 880 }, { "epoch": 0.6156533892382949, "grad_norm": 0.798756729619339, "learning_rate": 6.819561137656443e-05, "loss": 11.3876, "num_tokens": 6292809.0, "step": 881 }, { "epoch": 0.6163522012578616, "grad_norm": 0.7719865569386571, "learning_rate": 6.798110595317229e-05, "loss": 11.4415, "num_tokens": 6300059.0, "step": 882 }, { "epoch": 0.6170510132774284, "grad_norm": 0.8195039075085477, "learning_rate": 6.776676456116629e-05, "loss": 11.5411, "num_tokens": 6306992.0, "step": 883 }, { "epoch": 0.6177498252969951, "grad_norm": 0.8027317976901461, "learning_rate": 6.755258829860791e-05, "loss": 11.3557, "num_tokens": 6313854.0, "step": 884 }, { "epoch": 0.6184486373165619, "grad_norm": 0.771851720603674, "learning_rate": 6.733857826271271e-05, "loss": 11.369, "num_tokens": 6320866.0, "step": 885 }, { "epoch": 0.6191474493361285, "grad_norm": 0.7674659928459595, "learning_rate": 6.712473554984472e-05, "loss": 11.3876, "num_tokens": 6328614.0, "step": 886 }, { "epoch": 0.6198462613556953, "grad_norm": 0.7600859890865244, "learning_rate": 6.69110612555107e-05, "loss": 11.4883, "num_tokens": 6336010.0, "step": 887 }, { "epoch": 0.6205450733752621, "grad_norm": 0.8243419809913102, "learning_rate": 6.669755647435474e-05, "loss": 11.3781, "num_tokens": 6342374.0, "step": 888 }, { "epoch": 0.6212438853948288, "grad_norm": 0.7829044387907746, "learning_rate": 6.648422230015242e-05, "loss": 11.3094, "num_tokens": 6349581.0, "step": 889 }, { "epoch": 0.6219426974143956, "grad_norm": 0.8205152326848861, "learning_rate": 6.627105982580528e-05, "loss": 11.4789, "num_tokens": 6356441.0, "step": 890 }, { "epoch": 0.6226415094339622, "grad_norm": 0.8121937689718386, "learning_rate": 6.605807014333538e-05, "loss": 11.3885, "num_tokens": 6363148.0, "step": 891 }, { "epoch": 0.623340321453529, "grad_norm": 0.8187543353581098, "learning_rate": 6.584525434387944e-05, "loss": 11.367, "num_tokens": 6370176.0, "step": 892 }, { "epoch": 0.6240391334730957, "grad_norm": 0.8189881007250471, "learning_rate": 6.563261351768345e-05, "loss": 11.6037, "num_tokens": 6377384.0, "step": 893 }, { "epoch": 0.6247379454926625, "grad_norm": 0.7403030747434303, "learning_rate": 6.542014875409703e-05, "loss": 11.3652, "num_tokens": 6384876.0, "step": 894 }, { "epoch": 0.6254367575122292, "grad_norm": 0.8179439653721886, "learning_rate": 6.52078611415678e-05, "loss": 11.3852, "num_tokens": 6391664.0, "step": 895 }, { "epoch": 0.6261355695317959, "grad_norm": 0.8515340307528421, "learning_rate": 6.49957517676359e-05, "loss": 11.3703, "num_tokens": 6398231.0, "step": 896 }, { "epoch": 0.6268343815513627, "grad_norm": 0.8288077296988552, "learning_rate": 6.47838217189283e-05, "loss": 11.353, "num_tokens": 6405010.0, "step": 897 }, { "epoch": 0.6275331935709294, "grad_norm": 0.730129626547827, "learning_rate": 6.457207208115328e-05, "loss": 11.5185, "num_tokens": 6412293.0, "step": 898 }, { "epoch": 0.6282320055904962, "grad_norm": 0.8202972201931359, "learning_rate": 6.436050393909499e-05, "loss": 11.5474, "num_tokens": 6419190.0, "step": 899 }, { "epoch": 0.6289308176100629, "grad_norm": 0.7830504737205021, "learning_rate": 6.414911837660768e-05, "loss": 11.1973, "num_tokens": 6426719.0, "step": 900 }, { "epoch": 0.6296296296296297, "grad_norm": 0.7530937085031525, "learning_rate": 6.393791647661032e-05, "loss": 11.5101, "num_tokens": 6434060.0, "step": 901 }, { "epoch": 0.6303284416491963, "grad_norm": 0.7906021221246778, "learning_rate": 6.372689932108091e-05, "loss": 11.5279, "num_tokens": 6441144.0, "step": 902 }, { "epoch": 0.6310272536687631, "grad_norm": 0.8223449747267925, "learning_rate": 6.351606799105107e-05, "loss": 11.2108, "num_tokens": 6448000.0, "step": 903 }, { "epoch": 0.6317260656883298, "grad_norm": 0.7078365936333572, "learning_rate": 6.330542356660046e-05, "loss": 11.4398, "num_tokens": 6455854.0, "step": 904 }, { "epoch": 0.6324248777078966, "grad_norm": 0.8055321988221168, "learning_rate": 6.309496712685122e-05, "loss": 11.3534, "num_tokens": 6462687.0, "step": 905 }, { "epoch": 0.6331236897274634, "grad_norm": 0.8080504596422721, "learning_rate": 6.288469974996234e-05, "loss": 11.4418, "num_tokens": 6469302.0, "step": 906 }, { "epoch": 0.63382250174703, "grad_norm": 0.8806667746234638, "learning_rate": 6.267462251312445e-05, "loss": 11.4176, "num_tokens": 6475494.0, "step": 907 }, { "epoch": 0.6345213137665968, "grad_norm": 0.7642307341098185, "learning_rate": 6.24647364925539e-05, "loss": 11.399, "num_tokens": 6483132.0, "step": 908 }, { "epoch": 0.6352201257861635, "grad_norm": 0.7753132086477652, "learning_rate": 6.225504276348766e-05, "loss": 11.2906, "num_tokens": 6490796.0, "step": 909 }, { "epoch": 0.6359189378057303, "grad_norm": 0.7486646132302851, "learning_rate": 6.204554240017742e-05, "loss": 11.352, "num_tokens": 6498706.0, "step": 910 }, { "epoch": 0.636617749825297, "grad_norm": 0.778566234919517, "learning_rate": 6.183623647588427e-05, "loss": 11.5678, "num_tokens": 6505674.0, "step": 911 }, { "epoch": 0.6373165618448637, "grad_norm": 0.9101437765550461, "learning_rate": 6.162712606287335e-05, "loss": 11.384, "num_tokens": 6512122.0, "step": 912 }, { "epoch": 0.6380153738644304, "grad_norm": 0.791821933199024, "learning_rate": 6.141821223240804e-05, "loss": 11.3918, "num_tokens": 6519491.0, "step": 913 }, { "epoch": 0.6387141858839972, "grad_norm": 0.8037666912694112, "learning_rate": 6.120949605474478e-05, "loss": 11.5467, "num_tokens": 6526251.0, "step": 914 }, { "epoch": 0.639412997903564, "grad_norm": 0.8390371657278727, "learning_rate": 6.100097859912732e-05, "loss": 11.4776, "num_tokens": 6532851.0, "step": 915 }, { "epoch": 0.6401118099231307, "grad_norm": 0.8884662450896984, "learning_rate": 6.0792660933781375e-05, "loss": 11.3619, "num_tokens": 6539983.0, "step": 916 }, { "epoch": 0.6408106219426974, "grad_norm": 0.7726014803170672, "learning_rate": 6.058454412590928e-05, "loss": 11.3864, "num_tokens": 6547267.0, "step": 917 }, { "epoch": 0.6415094339622641, "grad_norm": 0.8533702770606022, "learning_rate": 6.037662924168419e-05, "loss": 11.2928, "num_tokens": 6554111.0, "step": 918 }, { "epoch": 0.6422082459818309, "grad_norm": 0.8468889356156204, "learning_rate": 6.016891734624501e-05, "loss": 11.4525, "num_tokens": 6561069.0, "step": 919 }, { "epoch": 0.6429070580013976, "grad_norm": 0.8612184509336287, "learning_rate": 5.9961409503690605e-05, "loss": 11.6099, "num_tokens": 6568238.0, "step": 920 }, { "epoch": 0.6436058700209644, "grad_norm": 0.7434062790975529, "learning_rate": 5.975410677707447e-05, "loss": 11.4411, "num_tokens": 6575880.0, "step": 921 }, { "epoch": 0.6443046820405312, "grad_norm": 0.8174217364264119, "learning_rate": 5.954701022839944e-05, "loss": 11.4125, "num_tokens": 6582676.0, "step": 922 }, { "epoch": 0.6450034940600978, "grad_norm": 0.7558016655031929, "learning_rate": 5.9340120918611994e-05, "loss": 11.6664, "num_tokens": 6590130.0, "step": 923 }, { "epoch": 0.6457023060796646, "grad_norm": 0.7940056450454165, "learning_rate": 5.913343990759695e-05, "loss": 11.5493, "num_tokens": 6596815.0, "step": 924 }, { "epoch": 0.6464011180992313, "grad_norm": 0.7726837389032853, "learning_rate": 5.8926968254172076e-05, "loss": 11.1489, "num_tokens": 6604041.0, "step": 925 }, { "epoch": 0.6470999301187981, "grad_norm": 0.823514945514921, "learning_rate": 5.872070701608251e-05, "loss": 11.4563, "num_tokens": 6611449.0, "step": 926 }, { "epoch": 0.6477987421383647, "grad_norm": 0.7998018031164993, "learning_rate": 5.851465724999559e-05, "loss": 11.55, "num_tokens": 6618417.0, "step": 927 }, { "epoch": 0.6484975541579315, "grad_norm": 0.7462225767654096, "learning_rate": 5.830882001149517e-05, "loss": 11.3976, "num_tokens": 6626409.0, "step": 928 }, { "epoch": 0.6491963661774982, "grad_norm": 0.7942705495379757, "learning_rate": 5.8103196355076305e-05, "loss": 11.464, "num_tokens": 6633421.0, "step": 929 }, { "epoch": 0.649895178197065, "grad_norm": 0.7533581744429543, "learning_rate": 5.789778733414004e-05, "loss": 11.4489, "num_tokens": 6641387.0, "step": 930 }, { "epoch": 0.6505939902166318, "grad_norm": 0.7765742186173611, "learning_rate": 5.769259400098769e-05, "loss": 11.2764, "num_tokens": 6648880.0, "step": 931 }, { "epoch": 0.6512928022361985, "grad_norm": 0.7416483184978369, "learning_rate": 5.748761740681573e-05, "loss": 11.4409, "num_tokens": 6656518.0, "step": 932 }, { "epoch": 0.6519916142557652, "grad_norm": 0.7462623775574179, "learning_rate": 5.728285860171021e-05, "loss": 11.315, "num_tokens": 6663873.0, "step": 933 }, { "epoch": 0.6526904262753319, "grad_norm": 0.7926028186762867, "learning_rate": 5.7078318634641456e-05, "loss": 11.4408, "num_tokens": 6671021.0, "step": 934 }, { "epoch": 0.6533892382948987, "grad_norm": 0.8186251647673696, "learning_rate": 5.687399855345879e-05, "loss": 11.4383, "num_tokens": 6677620.0, "step": 935 }, { "epoch": 0.6540880503144654, "grad_norm": 0.7280289619725115, "learning_rate": 5.666989940488496e-05, "loss": 11.417, "num_tokens": 6685101.0, "step": 936 }, { "epoch": 0.6547868623340322, "grad_norm": 0.7915454831387722, "learning_rate": 5.646602223451094e-05, "loss": 11.2694, "num_tokens": 6692207.0, "step": 937 }, { "epoch": 0.6554856743535988, "grad_norm": 0.7279570365472292, "learning_rate": 5.6262368086790504e-05, "loss": 11.313, "num_tokens": 6699759.0, "step": 938 }, { "epoch": 0.6561844863731656, "grad_norm": 0.7968453443976489, "learning_rate": 5.605893800503484e-05, "loss": 11.3699, "num_tokens": 6706906.0, "step": 939 }, { "epoch": 0.6568832983927324, "grad_norm": 0.8290722608319229, "learning_rate": 5.585573303140741e-05, "loss": 11.4912, "num_tokens": 6713394.0, "step": 940 }, { "epoch": 0.6575821104122991, "grad_norm": 0.7823549978752478, "learning_rate": 5.565275420691831e-05, "loss": 11.3901, "num_tokens": 6720211.0, "step": 941 }, { "epoch": 0.6582809224318659, "grad_norm": 0.7517377792179483, "learning_rate": 5.5450002571419104e-05, "loss": 11.5227, "num_tokens": 6727718.0, "step": 942 }, { "epoch": 0.6589797344514325, "grad_norm": 0.7578102737888668, "learning_rate": 5.524747916359756e-05, "loss": 11.3185, "num_tokens": 6735027.0, "step": 943 }, { "epoch": 0.6596785464709993, "grad_norm": 0.7294348627795622, "learning_rate": 5.504518502097212e-05, "loss": 11.4193, "num_tokens": 6742667.0, "step": 944 }, { "epoch": 0.660377358490566, "grad_norm": 0.7333256228607329, "learning_rate": 5.484312117988687e-05, "loss": 11.415, "num_tokens": 6750129.0, "step": 945 }, { "epoch": 0.6610761705101328, "grad_norm": 0.7629669936154013, "learning_rate": 5.464128867550593e-05, "loss": 11.3356, "num_tokens": 6756898.0, "step": 946 }, { "epoch": 0.6617749825296995, "grad_norm": 0.7752564686857137, "learning_rate": 5.4439688541808345e-05, "loss": 11.5905, "num_tokens": 6763921.0, "step": 947 }, { "epoch": 0.6624737945492662, "grad_norm": 0.755404720417447, "learning_rate": 5.423832181158274e-05, "loss": 11.3755, "num_tokens": 6771091.0, "step": 948 }, { "epoch": 0.663172606568833, "grad_norm": 0.72492465071565, "learning_rate": 5.4037189516422e-05, "loss": 11.2847, "num_tokens": 6778488.0, "step": 949 }, { "epoch": 0.6638714185883997, "grad_norm": 0.7336264900623639, "learning_rate": 5.383629268671804e-05, "loss": 11.2539, "num_tokens": 6785464.0, "step": 950 }, { "epoch": 0.6645702306079665, "grad_norm": 0.8327633064762886, "learning_rate": 5.3635632351656495e-05, "loss": 11.5402, "num_tokens": 6792243.0, "step": 951 }, { "epoch": 0.6652690426275332, "grad_norm": 0.7237982353291018, "learning_rate": 5.3435209539211394e-05, "loss": 11.416, "num_tokens": 6799493.0, "step": 952 }, { "epoch": 0.6659678546471, "grad_norm": 0.7303663246452419, "learning_rate": 5.323502527614007e-05, "loss": 11.3044, "num_tokens": 6806467.0, "step": 953 }, { "epoch": 0.6666666666666666, "grad_norm": 0.8181397264078049, "learning_rate": 5.303508058797766e-05, "loss": 11.4364, "num_tokens": 6813306.0, "step": 954 }, { "epoch": 0.6673654786862334, "grad_norm": 0.7355370180850539, "learning_rate": 5.2835376499031955e-05, "loss": 11.1692, "num_tokens": 6820518.0, "step": 955 }, { "epoch": 0.6680642907058001, "grad_norm": 0.7609137303788498, "learning_rate": 5.263591403237831e-05, "loss": 11.3905, "num_tokens": 6827100.0, "step": 956 }, { "epoch": 0.6687631027253669, "grad_norm": 0.8203786627444394, "learning_rate": 5.243669420985413e-05, "loss": 11.359, "num_tokens": 6833940.0, "step": 957 }, { "epoch": 0.6694619147449337, "grad_norm": 0.752993836488806, "learning_rate": 5.22377180520538e-05, "loss": 11.5737, "num_tokens": 6841696.0, "step": 958 }, { "epoch": 0.6701607267645003, "grad_norm": 0.7162845303848641, "learning_rate": 5.2038986578323437e-05, "loss": 11.5206, "num_tokens": 6849308.0, "step": 959 }, { "epoch": 0.6708595387840671, "grad_norm": 0.7601708724578571, "learning_rate": 5.1840500806755575e-05, "loss": 11.2212, "num_tokens": 6856816.0, "step": 960 }, { "epoch": 0.6715583508036338, "grad_norm": 0.8158602127134996, "learning_rate": 5.164226175418421e-05, "loss": 11.4374, "num_tokens": 6863209.0, "step": 961 }, { "epoch": 0.6722571628232006, "grad_norm": 0.7759684172814786, "learning_rate": 5.1444270436179185e-05, "loss": 11.1977, "num_tokens": 6869856.0, "step": 962 }, { "epoch": 0.6729559748427673, "grad_norm": 0.7307099197973029, "learning_rate": 5.12465278670414e-05, "loss": 11.3729, "num_tokens": 6877774.0, "step": 963 }, { "epoch": 0.673654786862334, "grad_norm": 0.7998954262132016, "learning_rate": 5.10490350597973e-05, "loss": 11.3021, "num_tokens": 6884547.0, "step": 964 }, { "epoch": 0.6743535988819007, "grad_norm": 0.829376113016866, "learning_rate": 5.085179302619383e-05, "loss": 11.4544, "num_tokens": 6892114.0, "step": 965 }, { "epoch": 0.6750524109014675, "grad_norm": 0.8124137234626652, "learning_rate": 5.06548027766933e-05, "loss": 11.5562, "num_tokens": 6899318.0, "step": 966 }, { "epoch": 0.6757512229210343, "grad_norm": 0.731877974221252, "learning_rate": 5.045806532046806e-05, "loss": 11.3625, "num_tokens": 6906566.0, "step": 967 }, { "epoch": 0.676450034940601, "grad_norm": 0.7960249114523722, "learning_rate": 5.0261581665395475e-05, "loss": 11.394, "num_tokens": 6913114.0, "step": 968 }, { "epoch": 0.6771488469601677, "grad_norm": 0.7471261094794464, "learning_rate": 5.006535281805265e-05, "loss": 11.2724, "num_tokens": 6920642.0, "step": 969 }, { "epoch": 0.6778476589797344, "grad_norm": 0.7394163224711746, "learning_rate": 4.9869379783711315e-05, "loss": 11.2185, "num_tokens": 6927787.0, "step": 970 }, { "epoch": 0.6785464709993012, "grad_norm": 0.7482808846351563, "learning_rate": 4.967366356633275e-05, "loss": 11.2588, "num_tokens": 6935406.0, "step": 971 }, { "epoch": 0.6792452830188679, "grad_norm": 0.7449895829598722, "learning_rate": 4.947820516856253e-05, "loss": 11.4155, "num_tokens": 6942968.0, "step": 972 }, { "epoch": 0.6799440950384347, "grad_norm": 0.7866416855660652, "learning_rate": 4.9283005591725375e-05, "loss": 11.4755, "num_tokens": 6949948.0, "step": 973 }, { "epoch": 0.6806429070580013, "grad_norm": 0.7398246001909144, "learning_rate": 4.908806583582021e-05, "loss": 11.4443, "num_tokens": 6957454.0, "step": 974 }, { "epoch": 0.6813417190775681, "grad_norm": 0.7809576116461072, "learning_rate": 4.8893386899514746e-05, "loss": 11.3265, "num_tokens": 6964979.0, "step": 975 }, { "epoch": 0.6820405310971349, "grad_norm": 0.7263811286969053, "learning_rate": 4.869896978014071e-05, "loss": 11.4017, "num_tokens": 6972329.0, "step": 976 }, { "epoch": 0.6827393431167016, "grad_norm": 0.7922288974239042, "learning_rate": 4.85048154736884e-05, "loss": 11.4244, "num_tokens": 6978773.0, "step": 977 }, { "epoch": 0.6834381551362684, "grad_norm": 0.7347442860022921, "learning_rate": 4.831092497480179e-05, "loss": 11.3205, "num_tokens": 6986336.0, "step": 978 }, { "epoch": 0.684136967155835, "grad_norm": 0.8820151378232439, "learning_rate": 4.81172992767734e-05, "loss": 11.3669, "num_tokens": 6992944.0, "step": 979 }, { "epoch": 0.6848357791754018, "grad_norm": 0.8114183972043456, "learning_rate": 4.792393937153914e-05, "loss": 11.4598, "num_tokens": 6999683.0, "step": 980 }, { "epoch": 0.6855345911949685, "grad_norm": 0.7694550494205694, "learning_rate": 4.773084624967327e-05, "loss": 11.2862, "num_tokens": 7006810.0, "step": 981 }, { "epoch": 0.6862334032145353, "grad_norm": 0.8035810017403291, "learning_rate": 4.753802090038344e-05, "loss": 11.6007, "num_tokens": 7013925.0, "step": 982 }, { "epoch": 0.686932215234102, "grad_norm": 0.8779508449006704, "learning_rate": 4.734546431150536e-05, "loss": 11.5836, "num_tokens": 7020571.0, "step": 983 }, { "epoch": 0.6876310272536688, "grad_norm": 0.7782965367791496, "learning_rate": 4.715317746949804e-05, "loss": 11.5124, "num_tokens": 7027401.0, "step": 984 }, { "epoch": 0.6883298392732355, "grad_norm": 0.7429245384849146, "learning_rate": 4.6961161359438486e-05, "loss": 11.2208, "num_tokens": 7034682.0, "step": 985 }, { "epoch": 0.6890286512928022, "grad_norm": 0.768180423011357, "learning_rate": 4.676941696501673e-05, "loss": 11.3495, "num_tokens": 7042175.0, "step": 986 }, { "epoch": 0.689727463312369, "grad_norm": 0.8158507454164047, "learning_rate": 4.657794526853096e-05, "loss": 11.4224, "num_tokens": 7048976.0, "step": 987 }, { "epoch": 0.6904262753319357, "grad_norm": 0.7416723415602191, "learning_rate": 4.6386747250882224e-05, "loss": 11.3724, "num_tokens": 7056210.0, "step": 988 }, { "epoch": 0.6911250873515025, "grad_norm": 0.7862281657651232, "learning_rate": 4.6195823891569545e-05, "loss": 11.5203, "num_tokens": 7063291.0, "step": 989 }, { "epoch": 0.6918238993710691, "grad_norm": 0.7947232569258293, "learning_rate": 4.60051761686849e-05, "loss": 11.3728, "num_tokens": 7070074.0, "step": 990 }, { "epoch": 0.6925227113906359, "grad_norm": 0.7564910310163546, "learning_rate": 4.581480505890816e-05, "loss": 11.2835, "num_tokens": 7077328.0, "step": 991 }, { "epoch": 0.6932215234102026, "grad_norm": 0.7559569974710254, "learning_rate": 4.5624711537502206e-05, "loss": 11.4323, "num_tokens": 7084251.0, "step": 992 }, { "epoch": 0.6939203354297694, "grad_norm": 0.7701109539670761, "learning_rate": 4.543489657830777e-05, "loss": 11.3439, "num_tokens": 7091370.0, "step": 993 }, { "epoch": 0.6946191474493362, "grad_norm": 0.8687082504202258, "learning_rate": 4.52453611537385e-05, "loss": 11.418, "num_tokens": 7097992.0, "step": 994 }, { "epoch": 0.6953179594689028, "grad_norm": 0.8075508267512269, "learning_rate": 4.505610623477611e-05, "loss": 11.3894, "num_tokens": 7104219.0, "step": 995 }, { "epoch": 0.6960167714884696, "grad_norm": 0.7584503225018251, "learning_rate": 4.486713279096515e-05, "loss": 11.2692, "num_tokens": 7110986.0, "step": 996 }, { "epoch": 0.6967155835080363, "grad_norm": 0.7316503352080553, "learning_rate": 4.4678441790408335e-05, "loss": 11.431, "num_tokens": 7118553.0, "step": 997 }, { "epoch": 0.6974143955276031, "grad_norm": 0.7804898512960603, "learning_rate": 4.449003419976133e-05, "loss": 11.2494, "num_tokens": 7125671.0, "step": 998 }, { "epoch": 0.6981132075471698, "grad_norm": 0.7665956804053953, "learning_rate": 4.430191098422795e-05, "loss": 11.172, "num_tokens": 7132247.0, "step": 999 }, { "epoch": 0.6988120195667366, "grad_norm": 0.8340556176005602, "learning_rate": 4.411407310755513e-05, "loss": 11.4609, "num_tokens": 7138298.0, "step": 1000 }, { "epoch": 0.6995108315863033, "grad_norm": 0.7808514429539747, "learning_rate": 4.392652153202802e-05, "loss": 11.485, "num_tokens": 7144756.0, "step": 1001 }, { "epoch": 0.70020964360587, "grad_norm": 0.7536778557900022, "learning_rate": 4.373925721846519e-05, "loss": 11.3155, "num_tokens": 7152146.0, "step": 1002 }, { "epoch": 0.7009084556254368, "grad_norm": 0.7235512526836008, "learning_rate": 4.355228112621341e-05, "loss": 11.3711, "num_tokens": 7159343.0, "step": 1003 }, { "epoch": 0.7016072676450035, "grad_norm": 0.7842398648638949, "learning_rate": 4.336559421314298e-05, "loss": 11.3397, "num_tokens": 7165846.0, "step": 1004 }, { "epoch": 0.7023060796645703, "grad_norm": 0.7675919005184618, "learning_rate": 4.317919743564278e-05, "loss": 11.4522, "num_tokens": 7173032.0, "step": 1005 }, { "epoch": 0.7030048916841369, "grad_norm": 0.7870096182872951, "learning_rate": 4.29930917486153e-05, "loss": 11.4789, "num_tokens": 7179676.0, "step": 1006 }, { "epoch": 0.7037037037037037, "grad_norm": 0.7135708920198698, "learning_rate": 4.2807278105471735e-05, "loss": 11.2755, "num_tokens": 7187102.0, "step": 1007 }, { "epoch": 0.7044025157232704, "grad_norm": 0.8003412427834519, "learning_rate": 4.2621757458127285e-05, "loss": 11.3892, "num_tokens": 7194233.0, "step": 1008 }, { "epoch": 0.7051013277428372, "grad_norm": 0.7585197255574855, "learning_rate": 4.243653075699604e-05, "loss": 11.3803, "num_tokens": 7200796.0, "step": 1009 }, { "epoch": 0.705800139762404, "grad_norm": 0.8452156813049073, "learning_rate": 4.2251598950986226e-05, "loss": 11.4087, "num_tokens": 7207142.0, "step": 1010 }, { "epoch": 0.7064989517819706, "grad_norm": 0.7835457389050836, "learning_rate": 4.2066962987495376e-05, "loss": 11.4664, "num_tokens": 7213920.0, "step": 1011 }, { "epoch": 0.7071977638015374, "grad_norm": 0.7451177469095189, "learning_rate": 4.188262381240534e-05, "loss": 11.5746, "num_tokens": 7221271.0, "step": 1012 }, { "epoch": 0.7078965758211041, "grad_norm": 0.6745508455944015, "learning_rate": 4.169858237007772e-05, "loss": 11.3237, "num_tokens": 7229149.0, "step": 1013 }, { "epoch": 0.7085953878406709, "grad_norm": 0.7325516422452724, "learning_rate": 4.151483960334862e-05, "loss": 11.2666, "num_tokens": 7236889.0, "step": 1014 }, { "epoch": 0.7092941998602376, "grad_norm": 0.7597756270035179, "learning_rate": 4.133139645352425e-05, "loss": 11.4504, "num_tokens": 7243535.0, "step": 1015 }, { "epoch": 0.7099930118798043, "grad_norm": 0.802411771746198, "learning_rate": 4.114825386037576e-05, "loss": 11.5466, "num_tokens": 7249939.0, "step": 1016 }, { "epoch": 0.710691823899371, "grad_norm": 0.6745688733699289, "learning_rate": 4.0965412762134556e-05, "loss": 11.3435, "num_tokens": 7257993.0, "step": 1017 }, { "epoch": 0.7113906359189378, "grad_norm": 0.8172015168604911, "learning_rate": 4.078287409548763e-05, "loss": 11.489, "num_tokens": 7264252.0, "step": 1018 }, { "epoch": 0.7120894479385046, "grad_norm": 0.7313370944739739, "learning_rate": 4.060063879557249e-05, "loss": 11.1882, "num_tokens": 7271420.0, "step": 1019 }, { "epoch": 0.7127882599580713, "grad_norm": 0.7413343223804779, "learning_rate": 4.0418707795972574e-05, "loss": 11.5518, "num_tokens": 7278894.0, "step": 1020 }, { "epoch": 0.713487071977638, "grad_norm": 0.70036680794593, "learning_rate": 4.023708202871239e-05, "loss": 11.2365, "num_tokens": 7286940.0, "step": 1021 }, { "epoch": 0.7141858839972047, "grad_norm": 0.6917658234021815, "learning_rate": 4.005576242425272e-05, "loss": 11.0293, "num_tokens": 7294745.0, "step": 1022 }, { "epoch": 0.7148846960167715, "grad_norm": 0.7723810701467926, "learning_rate": 3.9874749911485995e-05, "loss": 11.3387, "num_tokens": 7301959.0, "step": 1023 }, { "epoch": 0.7155835080363382, "grad_norm": 0.7972689072311142, "learning_rate": 3.969404541773132e-05, "loss": 11.3007, "num_tokens": 7308592.0, "step": 1024 }, { "epoch": 0.716282320055905, "grad_norm": 0.7665594285270318, "learning_rate": 3.951364986872984e-05, "loss": 11.2227, "num_tokens": 7315715.0, "step": 1025 }, { "epoch": 0.7169811320754716, "grad_norm": 0.8147865188294503, "learning_rate": 3.933356418864008e-05, "loss": 11.3663, "num_tokens": 7322229.0, "step": 1026 }, { "epoch": 0.7176799440950384, "grad_norm": 0.7563572784068692, "learning_rate": 3.9153789300033e-05, "loss": 11.4636, "num_tokens": 7329104.0, "step": 1027 }, { "epoch": 0.7183787561146052, "grad_norm": 0.6925283206897185, "learning_rate": 3.8974326123887515e-05, "loss": 11.1833, "num_tokens": 7336700.0, "step": 1028 }, { "epoch": 0.7190775681341719, "grad_norm": 0.7362192730624995, "learning_rate": 3.879517557958554e-05, "loss": 11.5596, "num_tokens": 7343801.0, "step": 1029 }, { "epoch": 0.7197763801537387, "grad_norm": 0.8067914900843238, "learning_rate": 3.861633858490745e-05, "loss": 11.4494, "num_tokens": 7350746.0, "step": 1030 }, { "epoch": 0.7204751921733054, "grad_norm": 0.7039344440821224, "learning_rate": 3.8437816056027296e-05, "loss": 11.2428, "num_tokens": 7358560.0, "step": 1031 }, { "epoch": 0.7211740041928721, "grad_norm": 0.7856827734869525, "learning_rate": 3.82596089075081e-05, "loss": 11.6543, "num_tokens": 7365191.0, "step": 1032 }, { "epoch": 0.7218728162124388, "grad_norm": 0.7235482096088951, "learning_rate": 3.808171805229733e-05, "loss": 11.3282, "num_tokens": 7372368.0, "step": 1033 }, { "epoch": 0.7225716282320056, "grad_norm": 0.7387334991329841, "learning_rate": 3.790414440172197e-05, "loss": 11.3636, "num_tokens": 7379881.0, "step": 1034 }, { "epoch": 0.7232704402515723, "grad_norm": 0.7126819433630927, "learning_rate": 3.7726888865484e-05, "loss": 11.2277, "num_tokens": 7386744.0, "step": 1035 }, { "epoch": 0.7239692522711391, "grad_norm": 0.662827736472648, "learning_rate": 3.754995235165579e-05, "loss": 11.1434, "num_tokens": 7394571.0, "step": 1036 }, { "epoch": 0.7246680642907058, "grad_norm": 0.7051999387799784, "learning_rate": 3.73733357666753e-05, "loss": 11.2589, "num_tokens": 7402101.0, "step": 1037 }, { "epoch": 0.7253668763102725, "grad_norm": 0.7341999680956166, "learning_rate": 3.719704001534149e-05, "loss": 11.3894, "num_tokens": 7409120.0, "step": 1038 }, { "epoch": 0.7260656883298393, "grad_norm": 0.7281969314553532, "learning_rate": 3.702106600080979e-05, "loss": 11.4819, "num_tokens": 7416318.0, "step": 1039 }, { "epoch": 0.726764500349406, "grad_norm": 0.7315483076557149, "learning_rate": 3.6845414624587326e-05, "loss": 11.2987, "num_tokens": 7423580.0, "step": 1040 }, { "epoch": 0.7274633123689728, "grad_norm": 0.6970166205503779, "learning_rate": 3.667008678652837e-05, "loss": 11.3018, "num_tokens": 7431059.0, "step": 1041 }, { "epoch": 0.7281621243885394, "grad_norm": 0.6885063172154732, "learning_rate": 3.6495083384829723e-05, "loss": 11.2188, "num_tokens": 7438722.0, "step": 1042 }, { "epoch": 0.7288609364081062, "grad_norm": 0.7822956671181562, "learning_rate": 3.6320405316026074e-05, "loss": 11.6013, "num_tokens": 7445317.0, "step": 1043 }, { "epoch": 0.7295597484276729, "grad_norm": 0.7646739355090845, "learning_rate": 3.6146053474985564e-05, "loss": 11.5241, "num_tokens": 7452194.0, "step": 1044 }, { "epoch": 0.7302585604472397, "grad_norm": 0.6713250992120214, "learning_rate": 3.597202875490494e-05, "loss": 11.2791, "num_tokens": 7460052.0, "step": 1045 }, { "epoch": 0.7309573724668065, "grad_norm": 0.745173558135304, "learning_rate": 3.579833204730525e-05, "loss": 11.3639, "num_tokens": 7467357.0, "step": 1046 }, { "epoch": 0.7316561844863732, "grad_norm": 0.8031651896658117, "learning_rate": 3.562496424202707e-05, "loss": 11.3606, "num_tokens": 7474027.0, "step": 1047 }, { "epoch": 0.7323549965059399, "grad_norm": 0.7166095564634701, "learning_rate": 3.5451926227225997e-05, "loss": 11.2742, "num_tokens": 7481232.0, "step": 1048 }, { "epoch": 0.7330538085255066, "grad_norm": 0.7326672859023466, "learning_rate": 3.5279218889368225e-05, "loss": 11.3805, "num_tokens": 7487946.0, "step": 1049 }, { "epoch": 0.7337526205450734, "grad_norm": 0.7416236009384238, "learning_rate": 3.5106843113225854e-05, "loss": 11.3072, "num_tokens": 7494713.0, "step": 1050 }, { "epoch": 0.7344514325646401, "grad_norm": 0.7012124715014632, "learning_rate": 3.493479978187236e-05, "loss": 11.3789, "num_tokens": 7502002.0, "step": 1051 }, { "epoch": 0.7351502445842069, "grad_norm": 0.7221924272062566, "learning_rate": 3.4763089776678203e-05, "loss": 11.3354, "num_tokens": 7509025.0, "step": 1052 }, { "epoch": 0.7358490566037735, "grad_norm": 0.7724816602138137, "learning_rate": 3.459171397730614e-05, "loss": 11.3475, "num_tokens": 7515456.0, "step": 1053 }, { "epoch": 0.7365478686233403, "grad_norm": 0.7475946730617843, "learning_rate": 3.44206732617069e-05, "loss": 11.3963, "num_tokens": 7522493.0, "step": 1054 }, { "epoch": 0.7372466806429071, "grad_norm": 0.7683500970071147, "learning_rate": 3.424996850611455e-05, "loss": 11.2392, "num_tokens": 7529387.0, "step": 1055 }, { "epoch": 0.7379454926624738, "grad_norm": 0.6733763063940768, "learning_rate": 3.4079600585041996e-05, "loss": 10.9176, "num_tokens": 7537726.0, "step": 1056 }, { "epoch": 0.7386443046820406, "grad_norm": 0.697410181297856, "learning_rate": 3.3909570371276654e-05, "loss": 11.4406, "num_tokens": 7545113.0, "step": 1057 }, { "epoch": 0.7393431167016072, "grad_norm": 0.8385269941717998, "learning_rate": 3.3739878735875796e-05, "loss": 11.4465, "num_tokens": 7552056.0, "step": 1058 }, { "epoch": 0.740041928721174, "grad_norm": 0.6766701266960187, "learning_rate": 3.357052654816225e-05, "loss": 11.2494, "num_tokens": 7559863.0, "step": 1059 }, { "epoch": 0.7407407407407407, "grad_norm": 0.7308228283594846, "learning_rate": 3.3401514675719816e-05, "loss": 11.4521, "num_tokens": 7567323.0, "step": 1060 }, { "epoch": 0.7414395527603075, "grad_norm": 0.7646099741455271, "learning_rate": 3.323284398438886e-05, "loss": 11.242, "num_tokens": 7573959.0, "step": 1061 }, { "epoch": 0.7421383647798742, "grad_norm": 0.7427632832724356, "learning_rate": 3.306451533826194e-05, "loss": 11.4877, "num_tokens": 7580996.0, "step": 1062 }, { "epoch": 0.742837176799441, "grad_norm": 0.7581322530475002, "learning_rate": 3.289652959967925e-05, "loss": 11.5307, "num_tokens": 7587525.0, "step": 1063 }, { "epoch": 0.7435359888190077, "grad_norm": 0.7028610260607215, "learning_rate": 3.272888762922442e-05, "loss": 11.4367, "num_tokens": 7595094.0, "step": 1064 }, { "epoch": 0.7442348008385744, "grad_norm": 0.7017221063986278, "learning_rate": 3.2561590285719856e-05, "loss": 11.276, "num_tokens": 7602618.0, "step": 1065 }, { "epoch": 0.7449336128581412, "grad_norm": 0.6865354079002892, "learning_rate": 3.2394638426222467e-05, "loss": 11.1785, "num_tokens": 7610143.0, "step": 1066 }, { "epoch": 0.7456324248777079, "grad_norm": 0.6740360060533553, "learning_rate": 3.222803290601934e-05, "loss": 11.3203, "num_tokens": 7617786.0, "step": 1067 }, { "epoch": 0.7463312368972747, "grad_norm": 0.7110904217960554, "learning_rate": 3.20617745786232e-05, "loss": 11.1755, "num_tokens": 7624874.0, "step": 1068 }, { "epoch": 0.7470300489168413, "grad_norm": 0.7377273985684303, "learning_rate": 3.189586429576812e-05, "loss": 11.3264, "num_tokens": 7631644.0, "step": 1069 }, { "epoch": 0.7477288609364081, "grad_norm": 0.8518215847668774, "learning_rate": 3.173030290740524e-05, "loss": 11.4216, "num_tokens": 7637610.0, "step": 1070 }, { "epoch": 0.7484276729559748, "grad_norm": 0.731189174579239, "learning_rate": 3.1565091261698245e-05, "loss": 11.2452, "num_tokens": 7644746.0, "step": 1071 }, { "epoch": 0.7491264849755416, "grad_norm": 0.7307695312815669, "learning_rate": 3.140023020501912e-05, "loss": 11.1864, "num_tokens": 7651980.0, "step": 1072 }, { "epoch": 0.7498252969951084, "grad_norm": 0.7470129712613078, "learning_rate": 3.1235720581943827e-05, "loss": 11.341, "num_tokens": 7658633.0, "step": 1073 }, { "epoch": 0.750524109014675, "grad_norm": 0.6974467676001982, "learning_rate": 3.107156323524788e-05, "loss": 11.131, "num_tokens": 7666388.0, "step": 1074 }, { "epoch": 0.7512229210342418, "grad_norm": 0.7710427604926284, "learning_rate": 3.0907759005902224e-05, "loss": 11.3886, "num_tokens": 7673401.0, "step": 1075 }, { "epoch": 0.7519217330538085, "grad_norm": 0.7722971345364247, "learning_rate": 3.074430873306865e-05, "loss": 11.4449, "num_tokens": 7680465.0, "step": 1076 }, { "epoch": 0.7526205450733753, "grad_norm": 0.7226343668653663, "learning_rate": 3.058121325409579e-05, "loss": 11.3678, "num_tokens": 7687681.0, "step": 1077 }, { "epoch": 0.753319357092942, "grad_norm": 0.7770625505796288, "learning_rate": 3.041847340451456e-05, "loss": 11.3737, "num_tokens": 7694753.0, "step": 1078 }, { "epoch": 0.7540181691125087, "grad_norm": 0.7999608445071387, "learning_rate": 3.0256090018034046e-05, "loss": 11.3074, "num_tokens": 7700964.0, "step": 1079 }, { "epoch": 0.7547169811320755, "grad_norm": 0.7507601730126232, "learning_rate": 3.009406392653723e-05, "loss": 11.2428, "num_tokens": 7707592.0, "step": 1080 }, { "epoch": 0.7554157931516422, "grad_norm": 0.6810868569742109, "learning_rate": 2.993239596007669e-05, "loss": 11.2451, "num_tokens": 7715404.0, "step": 1081 }, { "epoch": 0.756114605171209, "grad_norm": 0.6849850433555468, "learning_rate": 2.9771086946870175e-05, "loss": 11.172, "num_tokens": 7722495.0, "step": 1082 }, { "epoch": 0.7568134171907757, "grad_norm": 0.8728657634726412, "learning_rate": 2.9610137713296783e-05, "loss": 11.5636, "num_tokens": 7728379.0, "step": 1083 }, { "epoch": 0.7575122292103424, "grad_norm": 0.739038187201558, "learning_rate": 2.9449549083892292e-05, "loss": 11.4312, "num_tokens": 7735201.0, "step": 1084 }, { "epoch": 0.7582110412299091, "grad_norm": 0.8444559834520016, "learning_rate": 2.9289321881345254e-05, "loss": 11.6626, "num_tokens": 7741618.0, "step": 1085 }, { "epoch": 0.7589098532494759, "grad_norm": 0.7334827192137122, "learning_rate": 2.9129456926492548e-05, "loss": 11.2497, "num_tokens": 7748854.0, "step": 1086 }, { "epoch": 0.7596086652690426, "grad_norm": 0.7214074829950128, "learning_rate": 2.8969955038315277e-05, "loss": 11.153, "num_tokens": 7756291.0, "step": 1087 }, { "epoch": 0.7603074772886094, "grad_norm": 0.7558791668864552, "learning_rate": 2.8810817033934656e-05, "loss": 11.1789, "num_tokens": 7763508.0, "step": 1088 }, { "epoch": 0.7610062893081762, "grad_norm": 0.7137740371199445, "learning_rate": 2.8652043728607625e-05, "loss": 11.3707, "num_tokens": 7771093.0, "step": 1089 }, { "epoch": 0.7617051013277428, "grad_norm": 0.7894834299698231, "learning_rate": 2.8493635935722928e-05, "loss": 11.2646, "num_tokens": 7777723.0, "step": 1090 }, { "epoch": 0.7624039133473096, "grad_norm": 0.7321387669937152, "learning_rate": 2.8335594466796656e-05, "loss": 11.3176, "num_tokens": 7784753.0, "step": 1091 }, { "epoch": 0.7631027253668763, "grad_norm": 0.783647730922316, "learning_rate": 2.8177920131468273e-05, "loss": 11.5208, "num_tokens": 7791905.0, "step": 1092 }, { "epoch": 0.7638015373864431, "grad_norm": 0.7429571159412155, "learning_rate": 2.8020613737496547e-05, "loss": 11.3798, "num_tokens": 7799579.0, "step": 1093 }, { "epoch": 0.7645003494060097, "grad_norm": 0.7472473035800477, "learning_rate": 2.7863676090755176e-05, "loss": 11.4021, "num_tokens": 7806386.0, "step": 1094 }, { "epoch": 0.7651991614255765, "grad_norm": 0.6754659129161072, "learning_rate": 2.770710799522879e-05, "loss": 11.2082, "num_tokens": 7814116.0, "step": 1095 }, { "epoch": 0.7658979734451432, "grad_norm": 0.7693144724539596, "learning_rate": 2.7550910253008933e-05, "loss": 11.3185, "num_tokens": 7820763.0, "step": 1096 }, { "epoch": 0.76659678546471, "grad_norm": 0.7091084838894971, "learning_rate": 2.739508366428969e-05, "loss": 11.2438, "num_tokens": 7828321.0, "step": 1097 }, { "epoch": 0.7672955974842768, "grad_norm": 0.705869316522346, "learning_rate": 2.723962902736389e-05, "loss": 11.2762, "num_tokens": 7835798.0, "step": 1098 }, { "epoch": 0.7679944095038435, "grad_norm": 0.7138100544611918, "learning_rate": 2.7084547138618778e-05, "loss": 11.3148, "num_tokens": 7842754.0, "step": 1099 }, { "epoch": 0.7686932215234102, "grad_norm": 0.7151217684539595, "learning_rate": 2.6929838792532037e-05, "loss": 11.217, "num_tokens": 7849613.0, "step": 1100 }, { "epoch": 0.7693920335429769, "grad_norm": 0.7188728094758737, "learning_rate": 2.6775504781667725e-05, "loss": 11.3475, "num_tokens": 7856848.0, "step": 1101 }, { "epoch": 0.7700908455625437, "grad_norm": 0.7052028694843995, "learning_rate": 2.6621545896672174e-05, "loss": 11.3796, "num_tokens": 7863992.0, "step": 1102 }, { "epoch": 0.7707896575821104, "grad_norm": 0.6914477298742575, "learning_rate": 2.6467962926270017e-05, "loss": 11.3365, "num_tokens": 7872390.0, "step": 1103 }, { "epoch": 0.7714884696016772, "grad_norm": 0.7137530093694683, "learning_rate": 2.6314756657260054e-05, "loss": 11.2455, "num_tokens": 7879165.0, "step": 1104 }, { "epoch": 0.7721872816212438, "grad_norm": 0.7247684771863809, "learning_rate": 2.6161927874511216e-05, "loss": 11.2368, "num_tokens": 7886459.0, "step": 1105 }, { "epoch": 0.7728860936408106, "grad_norm": 0.7194841517185353, "learning_rate": 2.6009477360958712e-05, "loss": 11.3713, "num_tokens": 7893611.0, "step": 1106 }, { "epoch": 0.7735849056603774, "grad_norm": 0.7026960816920488, "learning_rate": 2.585740589759976e-05, "loss": 11.2143, "num_tokens": 7900850.0, "step": 1107 }, { "epoch": 0.7742837176799441, "grad_norm": 0.7434038696739714, "learning_rate": 2.5705714263489776e-05, "loss": 11.2671, "num_tokens": 7907796.0, "step": 1108 }, { "epoch": 0.7749825296995109, "grad_norm": 0.7487747923849204, "learning_rate": 2.555440323573839e-05, "loss": 11.4015, "num_tokens": 7915024.0, "step": 1109 }, { "epoch": 0.7756813417190775, "grad_norm": 0.79745216396254, "learning_rate": 2.540347358950529e-05, "loss": 11.1625, "num_tokens": 7921430.0, "step": 1110 }, { "epoch": 0.7763801537386443, "grad_norm": 0.7596177805629818, "learning_rate": 2.5252926097996445e-05, "loss": 11.4533, "num_tokens": 7928272.0, "step": 1111 }, { "epoch": 0.777078965758211, "grad_norm": 0.7350154935054022, "learning_rate": 2.5102761532460008e-05, "loss": 11.4029, "num_tokens": 7934944.0, "step": 1112 }, { "epoch": 0.7777777777777778, "grad_norm": 0.7499238754252547, "learning_rate": 2.4952980662182425e-05, "loss": 11.1187, "num_tokens": 7941339.0, "step": 1113 }, { "epoch": 0.7784765897973445, "grad_norm": 0.7453807382953898, "learning_rate": 2.4803584254484568e-05, "loss": 11.3362, "num_tokens": 7948332.0, "step": 1114 }, { "epoch": 0.7791754018169113, "grad_norm": 0.746935094009976, "learning_rate": 2.4654573074717602e-05, "loss": 11.1033, "num_tokens": 7955689.0, "step": 1115 }, { "epoch": 0.779874213836478, "grad_norm": 0.7132201817546339, "learning_rate": 2.4505947886259318e-05, "loss": 11.4412, "num_tokens": 7962958.0, "step": 1116 }, { "epoch": 0.7805730258560447, "grad_norm": 0.6636018994598554, "learning_rate": 2.435770945050997e-05, "loss": 11.1383, "num_tokens": 7970832.0, "step": 1117 }, { "epoch": 0.7812718378756115, "grad_norm": 0.7607115224553903, "learning_rate": 2.420985852688854e-05, "loss": 11.4823, "num_tokens": 7977881.0, "step": 1118 }, { "epoch": 0.7819706498951782, "grad_norm": 0.6604593093823747, "learning_rate": 2.4062395872828846e-05, "loss": 11.2511, "num_tokens": 7985659.0, "step": 1119 }, { "epoch": 0.782669461914745, "grad_norm": 0.7004472719825999, "learning_rate": 2.3915322243775562e-05, "loss": 11.1951, "num_tokens": 7993219.0, "step": 1120 }, { "epoch": 0.7833682739343116, "grad_norm": 0.7154398128015275, "learning_rate": 2.3768638393180407e-05, "loss": 11.2984, "num_tokens": 8000242.0, "step": 1121 }, { "epoch": 0.7840670859538784, "grad_norm": 0.7194741079202231, "learning_rate": 2.362234507249832e-05, "loss": 11.3538, "num_tokens": 8007589.0, "step": 1122 }, { "epoch": 0.7847658979734451, "grad_norm": 0.665405485222391, "learning_rate": 2.3476443031183503e-05, "loss": 11.3998, "num_tokens": 8015113.0, "step": 1123 }, { "epoch": 0.7854647099930119, "grad_norm": 0.7497006916629403, "learning_rate": 2.3330933016685754e-05, "loss": 11.3447, "num_tokens": 8021703.0, "step": 1124 }, { "epoch": 0.7861635220125787, "grad_norm": 0.724069609414905, "learning_rate": 2.318581577444646e-05, "loss": 11.2892, "num_tokens": 8028563.0, "step": 1125 }, { "epoch": 0.7868623340321453, "grad_norm": 0.7369772430147077, "learning_rate": 2.304109204789484e-05, "loss": 11.3706, "num_tokens": 8035481.0, "step": 1126 }, { "epoch": 0.7875611460517121, "grad_norm": 0.7210670035313614, "learning_rate": 2.289676257844423e-05, "loss": 11.4219, "num_tokens": 8042374.0, "step": 1127 }, { "epoch": 0.7882599580712788, "grad_norm": 0.6806882221443891, "learning_rate": 2.275282810548811e-05, "loss": 11.2311, "num_tokens": 8050149.0, "step": 1128 }, { "epoch": 0.7889587700908456, "grad_norm": 0.7336078475191532, "learning_rate": 2.2609289366396502e-05, "loss": 11.3744, "num_tokens": 8056810.0, "step": 1129 }, { "epoch": 0.7896575821104123, "grad_norm": 0.7276364501652224, "learning_rate": 2.2466147096512035e-05, "loss": 11.5052, "num_tokens": 8063712.0, "step": 1130 }, { "epoch": 0.790356394129979, "grad_norm": 0.707614090504406, "learning_rate": 2.2323402029146244e-05, "loss": 11.2308, "num_tokens": 8070844.0, "step": 1131 }, { "epoch": 0.7910552061495457, "grad_norm": 0.7077004079623015, "learning_rate": 2.2181054895575847e-05, "loss": 11.1824, "num_tokens": 8078320.0, "step": 1132 }, { "epoch": 0.7917540181691125, "grad_norm": 0.6953790426100739, "learning_rate": 2.2039106425038924e-05, "loss": 11.4368, "num_tokens": 8085826.0, "step": 1133 }, { "epoch": 0.7924528301886793, "grad_norm": 0.7209711307337222, "learning_rate": 2.189755734473129e-05, "loss": 11.3935, "num_tokens": 8092777.0, "step": 1134 }, { "epoch": 0.793151642208246, "grad_norm": 0.808854769776892, "learning_rate": 2.175640837980265e-05, "loss": 11.5428, "num_tokens": 8099130.0, "step": 1135 }, { "epoch": 0.7938504542278128, "grad_norm": 0.801908412177368, "learning_rate": 2.161566025335289e-05, "loss": 11.3935, "num_tokens": 8105524.0, "step": 1136 }, { "epoch": 0.7945492662473794, "grad_norm": 0.7150555268911781, "learning_rate": 2.1475313686428544e-05, "loss": 11.2527, "num_tokens": 8112500.0, "step": 1137 }, { "epoch": 0.7952480782669462, "grad_norm": 0.7357614818712042, "learning_rate": 2.133536939801888e-05, "loss": 11.3546, "num_tokens": 8119456.0, "step": 1138 }, { "epoch": 0.7959468902865129, "grad_norm": 0.7370468321948003, "learning_rate": 2.1195828105052283e-05, "loss": 11.2675, "num_tokens": 8126211.0, "step": 1139 }, { "epoch": 0.7966457023060797, "grad_norm": 0.7794774618969516, "learning_rate": 2.105669052239274e-05, "loss": 11.2933, "num_tokens": 8132659.0, "step": 1140 }, { "epoch": 0.7973445143256463, "grad_norm": 0.7733675605498702, "learning_rate": 2.091795736283593e-05, "loss": 11.4373, "num_tokens": 8139414.0, "step": 1141 }, { "epoch": 0.7980433263452131, "grad_norm": 0.711124356307087, "learning_rate": 2.0779629337105722e-05, "loss": 11.1578, "num_tokens": 8146787.0, "step": 1142 }, { "epoch": 0.7987421383647799, "grad_norm": 0.7872909388951395, "learning_rate": 2.064170715385052e-05, "loss": 11.2581, "num_tokens": 8153506.0, "step": 1143 }, { "epoch": 0.7994409503843466, "grad_norm": 0.6555684032466877, "learning_rate": 2.050419151963957e-05, "loss": 11.4168, "num_tokens": 8161093.0, "step": 1144 }, { "epoch": 0.8001397624039134, "grad_norm": 0.7378265255419366, "learning_rate": 2.0367083138959476e-05, "loss": 11.3239, "num_tokens": 8167975.0, "step": 1145 }, { "epoch": 0.80083857442348, "grad_norm": 0.7314500876097451, "learning_rate": 2.0230382714210384e-05, "loss": 11.3205, "num_tokens": 8174799.0, "step": 1146 }, { "epoch": 0.8015373864430468, "grad_norm": 0.7415518531134959, "learning_rate": 2.0094090945702616e-05, "loss": 11.4, "num_tokens": 8181864.0, "step": 1147 }, { "epoch": 0.8022361984626135, "grad_norm": 0.7101043302442525, "learning_rate": 1.9958208531652877e-05, "loss": 11.3687, "num_tokens": 8188903.0, "step": 1148 }, { "epoch": 0.8029350104821803, "grad_norm": 0.6457420702696471, "learning_rate": 1.9822736168180778e-05, "loss": 11.3409, "num_tokens": 8196709.0, "step": 1149 }, { "epoch": 0.803633822501747, "grad_norm": 0.7225646893500235, "learning_rate": 1.9687674549305335e-05, "loss": 11.431, "num_tokens": 8204133.0, "step": 1150 }, { "epoch": 0.8043326345213138, "grad_norm": 0.6803983158734834, "learning_rate": 1.9553024366941242e-05, "loss": 11.2953, "num_tokens": 8211596.0, "step": 1151 }, { "epoch": 0.8050314465408805, "grad_norm": 0.7218921095299768, "learning_rate": 1.9418786310895464e-05, "loss": 11.432, "num_tokens": 8218648.0, "step": 1152 }, { "epoch": 0.8057302585604472, "grad_norm": 0.64913746065341, "learning_rate": 1.9284961068863673e-05, "loss": 11.3047, "num_tokens": 8226878.0, "step": 1153 }, { "epoch": 0.806429070580014, "grad_norm": 0.7074942179242839, "learning_rate": 1.9151549326426656e-05, "loss": 11.3206, "num_tokens": 8234332.0, "step": 1154 }, { "epoch": 0.8071278825995807, "grad_norm": 0.7527787088886351, "learning_rate": 1.9018551767046966e-05, "loss": 11.2193, "num_tokens": 8241077.0, "step": 1155 }, { "epoch": 0.8078266946191475, "grad_norm": 0.7062668084460995, "learning_rate": 1.8885969072065225e-05, "loss": 11.1638, "num_tokens": 8248516.0, "step": 1156 }, { "epoch": 0.8085255066387141, "grad_norm": 0.7424957637297211, "learning_rate": 1.8753801920696712e-05, "loss": 11.316, "num_tokens": 8255076.0, "step": 1157 }, { "epoch": 0.8092243186582809, "grad_norm": 0.6823678136180803, "learning_rate": 1.8622050990027995e-05, "loss": 11.2536, "num_tokens": 8262629.0, "step": 1158 }, { "epoch": 0.8099231306778477, "grad_norm": 0.750023065492982, "learning_rate": 1.8490716955013232e-05, "loss": 11.2706, "num_tokens": 8269455.0, "step": 1159 }, { "epoch": 0.8106219426974144, "grad_norm": 0.7704625910645484, "learning_rate": 1.8359800488470978e-05, "loss": 11.3387, "num_tokens": 8276214.0, "step": 1160 }, { "epoch": 0.8113207547169812, "grad_norm": 0.6826518153130348, "learning_rate": 1.8229302261080495e-05, "loss": 11.3443, "num_tokens": 8283578.0, "step": 1161 }, { "epoch": 0.8120195667365478, "grad_norm": 0.7423036450637122, "learning_rate": 1.809922294137847e-05, "loss": 11.4065, "num_tokens": 8290424.0, "step": 1162 }, { "epoch": 0.8127183787561146, "grad_norm": 0.6793892788712328, "learning_rate": 1.7969563195755535e-05, "loss": 11.1861, "num_tokens": 8298504.0, "step": 1163 }, { "epoch": 0.8134171907756813, "grad_norm": 0.7214794343659785, "learning_rate": 1.784032368845283e-05, "loss": 11.4229, "num_tokens": 8305815.0, "step": 1164 }, { "epoch": 0.8141160027952481, "grad_norm": 0.7485617746043014, "learning_rate": 1.7711505081558734e-05, "loss": 11.2375, "num_tokens": 8312752.0, "step": 1165 }, { "epoch": 0.8148148148148148, "grad_norm": 0.6748388760967671, "learning_rate": 1.758310803500527e-05, "loss": 11.3151, "num_tokens": 8320180.0, "step": 1166 }, { "epoch": 0.8155136268343816, "grad_norm": 0.7313883011335084, "learning_rate": 1.7455133206564832e-05, "loss": 11.1296, "num_tokens": 8327115.0, "step": 1167 }, { "epoch": 0.8162124388539483, "grad_norm": 0.6691961538450998, "learning_rate": 1.73275812518469e-05, "loss": 11.2173, "num_tokens": 8334667.0, "step": 1168 }, { "epoch": 0.816911250873515, "grad_norm": 0.7672032209833499, "learning_rate": 1.7200452824294498e-05, "loss": 11.2806, "num_tokens": 8340918.0, "step": 1169 }, { "epoch": 0.8176100628930818, "grad_norm": 0.7333853304823384, "learning_rate": 1.707374857518094e-05, "loss": 11.3178, "num_tokens": 8347718.0, "step": 1170 }, { "epoch": 0.8183088749126485, "grad_norm": 0.7083154662968923, "learning_rate": 1.6947469153606577e-05, "loss": 11.4407, "num_tokens": 8355388.0, "step": 1171 }, { "epoch": 0.8190076869322153, "grad_norm": 0.7605932556421211, "learning_rate": 1.6821615206495312e-05, "loss": 11.4539, "num_tokens": 8362353.0, "step": 1172 }, { "epoch": 0.8197064989517819, "grad_norm": 0.7196641926932061, "learning_rate": 1.6696187378591376e-05, "loss": 11.2854, "num_tokens": 8369668.0, "step": 1173 }, { "epoch": 0.8204053109713487, "grad_norm": 0.7607679332819676, "learning_rate": 1.657118631245601e-05, "loss": 11.5675, "num_tokens": 8376453.0, "step": 1174 }, { "epoch": 0.8211041229909154, "grad_norm": 0.6781286235128161, "learning_rate": 1.6446612648464164e-05, "loss": 11.2414, "num_tokens": 8384073.0, "step": 1175 }, { "epoch": 0.8218029350104822, "grad_norm": 0.6575078980008394, "learning_rate": 1.632246702480128e-05, "loss": 11.0235, "num_tokens": 8391615.0, "step": 1176 }, { "epoch": 0.822501747030049, "grad_norm": 0.6936210648421214, "learning_rate": 1.619875007745989e-05, "loss": 11.2754, "num_tokens": 8399117.0, "step": 1177 }, { "epoch": 0.8232005590496156, "grad_norm": 0.7861112908302533, "learning_rate": 1.607546244023651e-05, "loss": 11.3219, "num_tokens": 8405508.0, "step": 1178 }, { "epoch": 0.8238993710691824, "grad_norm": 0.7415539090414199, "learning_rate": 1.5952604744728272e-05, "loss": 11.4339, "num_tokens": 8412474.0, "step": 1179 }, { "epoch": 0.8245981830887491, "grad_norm": 0.7613238748546601, "learning_rate": 1.5830177620329712e-05, "loss": 11.4025, "num_tokens": 8419536.0, "step": 1180 }, { "epoch": 0.8252969951083159, "grad_norm": 0.7285477512094749, "learning_rate": 1.570818169422966e-05, "loss": 11.3531, "num_tokens": 8426032.0, "step": 1181 }, { "epoch": 0.8259958071278826, "grad_norm": 0.7939048597348616, "learning_rate": 1.558661759140786e-05, "loss": 11.3648, "num_tokens": 8432351.0, "step": 1182 }, { "epoch": 0.8266946191474493, "grad_norm": 0.7015799677247678, "learning_rate": 1.5465485934631853e-05, "loss": 11.237, "num_tokens": 8439781.0, "step": 1183 }, { "epoch": 0.827393431167016, "grad_norm": 0.6963852994738172, "learning_rate": 1.5344787344453805e-05, "loss": 11.3215, "num_tokens": 8446877.0, "step": 1184 }, { "epoch": 0.8280922431865828, "grad_norm": 0.7447257257726353, "learning_rate": 1.5224522439207246e-05, "loss": 11.1872, "num_tokens": 8453354.0, "step": 1185 }, { "epoch": 0.8287910552061496, "grad_norm": 0.745356152433122, "learning_rate": 1.5104691835004048e-05, "loss": 11.3101, "num_tokens": 8460350.0, "step": 1186 }, { "epoch": 0.8294898672257163, "grad_norm": 0.6723282265681545, "learning_rate": 1.498529614573111e-05, "loss": 11.231, "num_tokens": 8467882.0, "step": 1187 }, { "epoch": 0.8301886792452831, "grad_norm": 0.7694903453602582, "learning_rate": 1.4866335983047264e-05, "loss": 11.2441, "num_tokens": 8474603.0, "step": 1188 }, { "epoch": 0.8308874912648497, "grad_norm": 0.727022997755655, "learning_rate": 1.4747811956380242e-05, "loss": 11.363, "num_tokens": 8481543.0, "step": 1189 }, { "epoch": 0.8315863032844165, "grad_norm": 0.7022103489694916, "learning_rate": 1.4629724672923384e-05, "loss": 11.2517, "num_tokens": 8488957.0, "step": 1190 }, { "epoch": 0.8322851153039832, "grad_norm": 0.6861179535933343, "learning_rate": 1.4512074737632686e-05, "loss": 11.122, "num_tokens": 8496305.0, "step": 1191 }, { "epoch": 0.83298392732355, "grad_norm": 0.91578231716021, "learning_rate": 1.439486275322357e-05, "loss": 11.2723, "num_tokens": 8503384.0, "step": 1192 }, { "epoch": 0.8336827393431167, "grad_norm": 0.696492319983534, "learning_rate": 1.4278089320167876e-05, "loss": 11.182, "num_tokens": 8510477.0, "step": 1193 }, { "epoch": 0.8343815513626834, "grad_norm": 0.7326081564411686, "learning_rate": 1.4161755036690771e-05, "loss": 11.2417, "num_tokens": 8517882.0, "step": 1194 }, { "epoch": 0.8350803633822502, "grad_norm": 0.7429335339057346, "learning_rate": 1.4045860498767671e-05, "loss": 11.4592, "num_tokens": 8524376.0, "step": 1195 }, { "epoch": 0.8357791754018169, "grad_norm": 0.7041470336747101, "learning_rate": 1.3930406300121179e-05, "loss": 11.3957, "num_tokens": 8531630.0, "step": 1196 }, { "epoch": 0.8364779874213837, "grad_norm": 0.7287088760207717, "learning_rate": 1.3815393032218115e-05, "loss": 11.179, "num_tokens": 8538262.0, "step": 1197 }, { "epoch": 0.8371767994409504, "grad_norm": 0.7001293925089493, "learning_rate": 1.3700821284266351e-05, "loss": 11.165, "num_tokens": 8545378.0, "step": 1198 }, { "epoch": 0.8378756114605171, "grad_norm": 0.6577452042600329, "learning_rate": 1.3586691643211957e-05, "loss": 11.3174, "num_tokens": 8553221.0, "step": 1199 }, { "epoch": 0.8385744234800838, "grad_norm": 0.8325890793471512, "learning_rate": 1.3473004693736036e-05, "loss": 11.3585, "num_tokens": 8559107.0, "step": 1200 }, { "epoch": 0.8392732354996506, "grad_norm": 0.747352363910281, "learning_rate": 1.3359761018251826e-05, "loss": 11.1498, "num_tokens": 8565926.0, "step": 1201 }, { "epoch": 0.8399720475192173, "grad_norm": 0.7032494382852825, "learning_rate": 1.324696119690173e-05, "loss": 11.4202, "num_tokens": 8573174.0, "step": 1202 }, { "epoch": 0.8406708595387841, "grad_norm": 0.7081165719952596, "learning_rate": 1.3134605807554246e-05, "loss": 10.8787, "num_tokens": 8579902.0, "step": 1203 }, { "epoch": 0.8413696715583509, "grad_norm": 0.7334322609733372, "learning_rate": 1.302269542580109e-05, "loss": 11.5137, "num_tokens": 8586798.0, "step": 1204 }, { "epoch": 0.8420684835779175, "grad_norm": 0.6820085936877066, "learning_rate": 1.291123062495424e-05, "loss": 11.0216, "num_tokens": 8593890.0, "step": 1205 }, { "epoch": 0.8427672955974843, "grad_norm": 0.790988424419638, "learning_rate": 1.2800211976042941e-05, "loss": 11.4354, "num_tokens": 8600460.0, "step": 1206 }, { "epoch": 0.843466107617051, "grad_norm": 0.7027098919987455, "learning_rate": 1.268964004781089e-05, "loss": 11.2323, "num_tokens": 8607572.0, "step": 1207 }, { "epoch": 0.8441649196366178, "grad_norm": 0.7192825937138662, "learning_rate": 1.2579515406713193e-05, "loss": 11.3833, "num_tokens": 8614609.0, "step": 1208 }, { "epoch": 0.8448637316561844, "grad_norm": 0.7134129075930509, "learning_rate": 1.246983861691352e-05, "loss": 11.2464, "num_tokens": 8621756.0, "step": 1209 }, { "epoch": 0.8455625436757512, "grad_norm": 0.7343490934175165, "learning_rate": 1.236061024028129e-05, "loss": 11.1625, "num_tokens": 8628716.0, "step": 1210 }, { "epoch": 0.8462613556953179, "grad_norm": 0.6619816183303324, "learning_rate": 1.2251830836388622e-05, "loss": 11.2272, "num_tokens": 8636472.0, "step": 1211 }, { "epoch": 0.8469601677148847, "grad_norm": 0.6990600383266643, "learning_rate": 1.214350096250767e-05, "loss": 11.3979, "num_tokens": 8643444.0, "step": 1212 }, { "epoch": 0.8476589797344515, "grad_norm": 0.6847030430264577, "learning_rate": 1.2035621173607581e-05, "loss": 11.3022, "num_tokens": 8650747.0, "step": 1213 }, { "epoch": 0.8483577917540182, "grad_norm": 0.71252099329853, "learning_rate": 1.192819202235178e-05, "loss": 11.1707, "num_tokens": 8657676.0, "step": 1214 }, { "epoch": 0.8490566037735849, "grad_norm": 0.6677411887169162, "learning_rate": 1.1821214059095088e-05, "loss": 11.1983, "num_tokens": 8665448.0, "step": 1215 }, { "epoch": 0.8497554157931516, "grad_norm": 0.7048673577714665, "learning_rate": 1.1714687831880865e-05, "loss": 11.3894, "num_tokens": 8672624.0, "step": 1216 }, { "epoch": 0.8504542278127184, "grad_norm": 0.7184006740909511, "learning_rate": 1.1608613886438346e-05, "loss": 11.1164, "num_tokens": 8679416.0, "step": 1217 }, { "epoch": 0.8511530398322851, "grad_norm": 0.701659143428556, "learning_rate": 1.1502992766179666e-05, "loss": 11.3558, "num_tokens": 8686327.0, "step": 1218 }, { "epoch": 0.8518518518518519, "grad_norm": 0.7509863977103985, "learning_rate": 1.139782501219715e-05, "loss": 11.2614, "num_tokens": 8693164.0, "step": 1219 }, { "epoch": 0.8525506638714185, "grad_norm": 0.7465295676120681, "learning_rate": 1.1293111163260639e-05, "loss": 11.1938, "num_tokens": 8700212.0, "step": 1220 }, { "epoch": 0.8532494758909853, "grad_norm": 0.670212726845832, "learning_rate": 1.118885175581451e-05, "loss": 11.3063, "num_tokens": 8707503.0, "step": 1221 }, { "epoch": 0.8539482879105521, "grad_norm": 0.682152300598347, "learning_rate": 1.1085047323975173e-05, "loss": 11.2127, "num_tokens": 8715062.0, "step": 1222 }, { "epoch": 0.8546470999301188, "grad_norm": 0.695908771897953, "learning_rate": 1.0981698399528151e-05, "loss": 11.3079, "num_tokens": 8722494.0, "step": 1223 }, { "epoch": 0.8553459119496856, "grad_norm": 0.6457103617835629, "learning_rate": 1.0878805511925438e-05, "loss": 11.2127, "num_tokens": 8730292.0, "step": 1224 }, { "epoch": 0.8560447239692522, "grad_norm": 0.6716738001376975, "learning_rate": 1.0776369188282775e-05, "loss": 11.4094, "num_tokens": 8737792.0, "step": 1225 }, { "epoch": 0.856743535988819, "grad_norm": 0.651764381193494, "learning_rate": 1.0674389953376928e-05, "loss": 11.2051, "num_tokens": 8745484.0, "step": 1226 }, { "epoch": 0.8574423480083857, "grad_norm": 0.6740530141860219, "learning_rate": 1.0572868329643027e-05, "loss": 11.1321, "num_tokens": 8752746.0, "step": 1227 }, { "epoch": 0.8581411600279525, "grad_norm": 0.7681446924106794, "learning_rate": 1.0471804837171916e-05, "loss": 11.4142, "num_tokens": 8759307.0, "step": 1228 }, { "epoch": 0.8588399720475192, "grad_norm": 0.6639268473439438, "learning_rate": 1.0371199993707392e-05, "loss": 11.1017, "num_tokens": 8766894.0, "step": 1229 }, { "epoch": 0.859538784067086, "grad_norm": 0.7151411365264585, "learning_rate": 1.027105431464368e-05, "loss": 11.2614, "num_tokens": 8773818.0, "step": 1230 }, { "epoch": 0.8602375960866527, "grad_norm": 0.680537812110416, "learning_rate": 1.0171368313022677e-05, "loss": 11.1653, "num_tokens": 8781325.0, "step": 1231 }, { "epoch": 0.8609364081062194, "grad_norm": 0.7086824988570711, "learning_rate": 1.0072142499531344e-05, "loss": 11.396, "num_tokens": 8788576.0, "step": 1232 }, { "epoch": 0.8616352201257862, "grad_norm": 0.7276519344708898, "learning_rate": 9.973377382499227e-06, "loss": 11.3805, "num_tokens": 8794997.0, "step": 1233 }, { "epoch": 0.8623340321453529, "grad_norm": 0.6616315480835158, "learning_rate": 9.875073467895634e-06, "loss": 11.2051, "num_tokens": 8802443.0, "step": 1234 }, { "epoch": 0.8630328441649197, "grad_norm": 0.6304509355340976, "learning_rate": 9.777231259327212e-06, "loss": 11.1513, "num_tokens": 8810645.0, "step": 1235 }, { "epoch": 0.8637316561844863, "grad_norm": 0.6426270503509127, "learning_rate": 9.679851258035277e-06, "loss": 11.1122, "num_tokens": 8818550.0, "step": 1236 }, { "epoch": 0.8644304682040531, "grad_norm": 0.6854294622483896, "learning_rate": 9.582933962893293e-06, "loss": 11.1728, "num_tokens": 8825989.0, "step": 1237 }, { "epoch": 0.8651292802236199, "grad_norm": 0.6466231934487748, "learning_rate": 9.48647987040433e-06, "loss": 11.2237, "num_tokens": 8833884.0, "step": 1238 }, { "epoch": 0.8658280922431866, "grad_norm": 0.7866666947672692, "learning_rate": 9.390489474698439e-06, "loss": 11.3251, "num_tokens": 8840327.0, "step": 1239 }, { "epoch": 0.8665269042627534, "grad_norm": 0.6904961943465432, "learning_rate": 9.294963267530176e-06, "loss": 11.3618, "num_tokens": 8847713.0, "step": 1240 }, { "epoch": 0.86722571628232, "grad_norm": 0.658618408429087, "learning_rate": 9.19990173827615e-06, "loss": 11.1388, "num_tokens": 8855278.0, "step": 1241 }, { "epoch": 0.8679245283018868, "grad_norm": 0.7534943781371122, "learning_rate": 9.105305373932338e-06, "loss": 11.1924, "num_tokens": 8862156.0, "step": 1242 }, { "epoch": 0.8686233403214535, "grad_norm": 0.6787156336949429, "learning_rate": 9.01117465911181e-06, "loss": 11.2441, "num_tokens": 8869458.0, "step": 1243 }, { "epoch": 0.8693221523410203, "grad_norm": 0.7501279068444608, "learning_rate": 8.917510076042057e-06, "loss": 11.3719, "num_tokens": 8875858.0, "step": 1244 }, { "epoch": 0.870020964360587, "grad_norm": 0.7379674425125939, "learning_rate": 8.824312104562615e-06, "loss": 11.2014, "num_tokens": 8882524.0, "step": 1245 }, { "epoch": 0.8707197763801537, "grad_norm": 0.6693741919269287, "learning_rate": 8.731581222122587e-06, "loss": 11.3032, "num_tokens": 8890194.0, "step": 1246 }, { "epoch": 0.8714185883997205, "grad_norm": 0.6794304721197457, "learning_rate": 8.639317903778189e-06, "loss": 11.2264, "num_tokens": 8897718.0, "step": 1247 }, { "epoch": 0.8721174004192872, "grad_norm": 0.6983883664214291, "learning_rate": 8.547522622190385e-06, "loss": 11.112, "num_tokens": 8904932.0, "step": 1248 }, { "epoch": 0.872816212438854, "grad_norm": 0.7184755993502536, "learning_rate": 8.45619584762235e-06, "loss": 11.4004, "num_tokens": 8912001.0, "step": 1249 }, { "epoch": 0.8735150244584207, "grad_norm": 0.685772655591201, "learning_rate": 8.365338047937121e-06, "loss": 11.0864, "num_tokens": 8919388.0, "step": 1250 }, { "epoch": 0.8742138364779874, "grad_norm": 0.713433613357652, "learning_rate": 8.274949688595224e-06, "loss": 11.4927, "num_tokens": 8926115.0, "step": 1251 }, { "epoch": 0.8749126484975541, "grad_norm": 0.796288755617905, "learning_rate": 8.185031232652251e-06, "loss": 11.3539, "num_tokens": 8932498.0, "step": 1252 }, { "epoch": 0.8756114605171209, "grad_norm": 0.7317766316505643, "learning_rate": 8.095583140756468e-06, "loss": 11.3538, "num_tokens": 8939059.0, "step": 1253 }, { "epoch": 0.8763102725366876, "grad_norm": 0.7196340925972718, "learning_rate": 8.006605871146577e-06, "loss": 11.2812, "num_tokens": 8946032.0, "step": 1254 }, { "epoch": 0.8770090845562544, "grad_norm": 0.6933864422218876, "learning_rate": 7.918099879649144e-06, "loss": 11.3292, "num_tokens": 8953182.0, "step": 1255 }, { "epoch": 0.8777078965758212, "grad_norm": 0.7351379035469031, "learning_rate": 7.830065619676518e-06, "loss": 11.1246, "num_tokens": 8959948.0, "step": 1256 }, { "epoch": 0.8784067085953878, "grad_norm": 0.6544570994881753, "learning_rate": 7.742503542224334e-06, "loss": 11.2429, "num_tokens": 8967644.0, "step": 1257 }, { "epoch": 0.8791055206149546, "grad_norm": 0.6641255330980453, "learning_rate": 7.65541409586924e-06, "loss": 11.267, "num_tokens": 8975384.0, "step": 1258 }, { "epoch": 0.8798043326345213, "grad_norm": 0.8045043710675989, "learning_rate": 7.568797726766686e-06, "loss": 11.1368, "num_tokens": 8981552.0, "step": 1259 }, { "epoch": 0.8805031446540881, "grad_norm": 0.6704571245484808, "learning_rate": 7.482654878648465e-06, "loss": 11.3728, "num_tokens": 8989062.0, "step": 1260 }, { "epoch": 0.8812019566736548, "grad_norm": 0.8369669539346279, "learning_rate": 7.396985992820648e-06, "loss": 11.2803, "num_tokens": 8996294.0, "step": 1261 }, { "epoch": 0.8819007686932215, "grad_norm": 0.6847794044512928, "learning_rate": 7.311791508161159e-06, "loss": 11.3536, "num_tokens": 9003435.0, "step": 1262 }, { "epoch": 0.8825995807127882, "grad_norm": 0.6571735057337356, "learning_rate": 7.227071861117562e-06, "loss": 11.4174, "num_tokens": 9011341.0, "step": 1263 }, { "epoch": 0.883298392732355, "grad_norm": 0.7245916491525788, "learning_rate": 7.14282748570495e-06, "loss": 11.2201, "num_tokens": 9017863.0, "step": 1264 }, { "epoch": 0.8839972047519218, "grad_norm": 0.7310401125961942, "learning_rate": 7.059058813503483e-06, "loss": 11.4414, "num_tokens": 9024474.0, "step": 1265 }, { "epoch": 0.8846960167714885, "grad_norm": 0.6450501711377958, "learning_rate": 6.975766273656425e-06, "loss": 11.2476, "num_tokens": 9032345.0, "step": 1266 }, { "epoch": 0.8853948287910552, "grad_norm": 0.7556755113831584, "learning_rate": 6.892950292867784e-06, "loss": 11.3764, "num_tokens": 9038850.0, "step": 1267 }, { "epoch": 0.8860936408106219, "grad_norm": 0.7125713611966051, "learning_rate": 6.810611295400171e-06, "loss": 11.3624, "num_tokens": 9045840.0, "step": 1268 }, { "epoch": 0.8867924528301887, "grad_norm": 0.6642280207143011, "learning_rate": 6.728749703072679e-06, "loss": 11.2514, "num_tokens": 9053258.0, "step": 1269 }, { "epoch": 0.8874912648497554, "grad_norm": 0.6771501155957508, "learning_rate": 6.647365935258642e-06, "loss": 11.2015, "num_tokens": 9060449.0, "step": 1270 }, { "epoch": 0.8881900768693222, "grad_norm": 0.7024388591874622, "learning_rate": 6.56646040888349e-06, "loss": 11.3443, "num_tokens": 9067366.0, "step": 1271 }, { "epoch": 0.8888888888888888, "grad_norm": 0.6765742708153049, "learning_rate": 6.48603353842272e-06, "loss": 11.2577, "num_tokens": 9074722.0, "step": 1272 }, { "epoch": 0.8895877009084556, "grad_norm": 0.7261272982152684, "learning_rate": 6.406085735899625e-06, "loss": 11.2727, "num_tokens": 9082068.0, "step": 1273 }, { "epoch": 0.8902865129280224, "grad_norm": 0.6735110971588754, "learning_rate": 6.326617410883295e-06, "loss": 11.4339, "num_tokens": 9089269.0, "step": 1274 }, { "epoch": 0.8909853249475891, "grad_norm": 0.7182901637403895, "learning_rate": 6.247628970486463e-06, "loss": 11.085, "num_tokens": 9096281.0, "step": 1275 }, { "epoch": 0.8916841369671559, "grad_norm": 0.6411503497109853, "learning_rate": 6.169120819363405e-06, "loss": 11.3368, "num_tokens": 9103954.0, "step": 1276 }, { "epoch": 0.8923829489867225, "grad_norm": 0.7214606675707624, "learning_rate": 6.091093359707977e-06, "loss": 11.26, "num_tokens": 9110403.0, "step": 1277 }, { "epoch": 0.8930817610062893, "grad_norm": 0.6884291008923099, "learning_rate": 6.013546991251373e-06, "loss": 11.0697, "num_tokens": 9117361.0, "step": 1278 }, { "epoch": 0.893780573025856, "grad_norm": 0.7410343770266834, "learning_rate": 5.936482111260278e-06, "loss": 11.4048, "num_tokens": 9123889.0, "step": 1279 }, { "epoch": 0.8944793850454228, "grad_norm": 0.6910189149126138, "learning_rate": 5.859899114534661e-06, "loss": 11.0736, "num_tokens": 9130750.0, "step": 1280 }, { "epoch": 0.8951781970649895, "grad_norm": 0.6997111425088132, "learning_rate": 5.783798393405826e-06, "loss": 11.4646, "num_tokens": 9137972.0, "step": 1281 }, { "epoch": 0.8958770090845563, "grad_norm": 0.7381270749567366, "learning_rate": 5.708180337734448e-06, "loss": 11.3544, "num_tokens": 9144975.0, "step": 1282 }, { "epoch": 0.896575821104123, "grad_norm": 0.7016469624483329, "learning_rate": 5.633045334908493e-06, "loss": 11.416, "num_tokens": 9152166.0, "step": 1283 }, { "epoch": 0.8972746331236897, "grad_norm": 0.7555515485608866, "learning_rate": 5.5583937698412856e-06, "loss": 11.3033, "num_tokens": 9158905.0, "step": 1284 }, { "epoch": 0.8979734451432565, "grad_norm": 0.754593708430779, "learning_rate": 5.4842260249694964e-06, "loss": 11.5423, "num_tokens": 9165569.0, "step": 1285 }, { "epoch": 0.8986722571628232, "grad_norm": 0.721083897447478, "learning_rate": 5.410542480251202e-06, "loss": 11.3485, "num_tokens": 9172201.0, "step": 1286 }, { "epoch": 0.89937106918239, "grad_norm": 0.7403874761653653, "learning_rate": 5.337343513164006e-06, "loss": 11.1843, "num_tokens": 9179084.0, "step": 1287 }, { "epoch": 0.9000698812019566, "grad_norm": 0.6781928166122254, "learning_rate": 5.264629498702967e-06, "loss": 11.2702, "num_tokens": 9186520.0, "step": 1288 }, { "epoch": 0.9007686932215234, "grad_norm": 0.64789553108461, "learning_rate": 5.192400809378783e-06, "loss": 11.1842, "num_tokens": 9193927.0, "step": 1289 }, { "epoch": 0.9014675052410901, "grad_norm": 0.638576282772883, "learning_rate": 5.120657815215879e-06, "loss": 11.2144, "num_tokens": 9201756.0, "step": 1290 }, { "epoch": 0.9021663172606569, "grad_norm": 0.6516712972261183, "learning_rate": 5.0494008837504214e-06, "loss": 11.1786, "num_tokens": 9209398.0, "step": 1291 }, { "epoch": 0.9028651292802237, "grad_norm": 0.7567436281029121, "learning_rate": 4.978630380028582e-06, "loss": 11.2843, "num_tokens": 9215890.0, "step": 1292 }, { "epoch": 0.9035639412997903, "grad_norm": 0.7830577964462887, "learning_rate": 4.908346666604502e-06, "loss": 11.4464, "num_tokens": 9222461.0, "step": 1293 }, { "epoch": 0.9042627533193571, "grad_norm": 0.7351201872984475, "learning_rate": 4.8385501035385746e-06, "loss": 11.3842, "num_tokens": 9229085.0, "step": 1294 }, { "epoch": 0.9049615653389238, "grad_norm": 0.6510585165795112, "learning_rate": 4.769241048395512e-06, "loss": 11.1701, "num_tokens": 9236669.0, "step": 1295 }, { "epoch": 0.9056603773584906, "grad_norm": 0.664145384036073, "learning_rate": 4.700419856242555e-06, "loss": 11.1899, "num_tokens": 9243791.0, "step": 1296 }, { "epoch": 0.9063591893780573, "grad_norm": 0.7537789034846496, "learning_rate": 4.632086879647635e-06, "loss": 11.4093, "num_tokens": 9250468.0, "step": 1297 }, { "epoch": 0.907058001397624, "grad_norm": 0.6644394086425081, "learning_rate": 4.564242468677615e-06, "loss": 11.1755, "num_tokens": 9257850.0, "step": 1298 }, { "epoch": 0.9077568134171907, "grad_norm": 0.6585007654131836, "learning_rate": 4.496886970896396e-06, "loss": 11.304, "num_tokens": 9265236.0, "step": 1299 }, { "epoch": 0.9084556254367575, "grad_norm": 0.6909557159895615, "learning_rate": 4.430020731363271e-06, "loss": 11.2188, "num_tokens": 9272210.0, "step": 1300 }, { "epoch": 0.9091544374563243, "grad_norm": 0.7195362441131871, "learning_rate": 4.3636440926310144e-06, "loss": 11.5416, "num_tokens": 9279337.0, "step": 1301 }, { "epoch": 0.909853249475891, "grad_norm": 0.7125110991664391, "learning_rate": 4.2977573947442175e-06, "loss": 11.2543, "num_tokens": 9286092.0, "step": 1302 }, { "epoch": 0.9105520614954578, "grad_norm": 0.6935508566906836, "learning_rate": 4.232360975237571e-06, "loss": 11.2606, "num_tokens": 9292888.0, "step": 1303 }, { "epoch": 0.9112508735150244, "grad_norm": 0.695239157260203, "learning_rate": 4.167455169134027e-06, "loss": 11.3306, "num_tokens": 9299710.0, "step": 1304 }, { "epoch": 0.9119496855345912, "grad_norm": 0.7177219647398039, "learning_rate": 4.103040308943195e-06, "loss": 11.3787, "num_tokens": 9306349.0, "step": 1305 }, { "epoch": 0.9126484975541579, "grad_norm": 0.6631627285340865, "learning_rate": 4.039116724659564e-06, "loss": 11.3888, "num_tokens": 9313892.0, "step": 1306 }, { "epoch": 0.9133473095737247, "grad_norm": 0.7192009063137447, "learning_rate": 3.975684743760832e-06, "loss": 11.3041, "num_tokens": 9320784.0, "step": 1307 }, { "epoch": 0.9140461215932913, "grad_norm": 0.6206047815154435, "learning_rate": 3.91274469120626e-06, "loss": 11.075, "num_tokens": 9328759.0, "step": 1308 }, { "epoch": 0.9147449336128581, "grad_norm": 0.7157002783746432, "learning_rate": 3.850296889434968e-06, "loss": 11.6899, "num_tokens": 9335584.0, "step": 1309 }, { "epoch": 0.9154437456324249, "grad_norm": 0.63380899392185, "learning_rate": 3.788341658364314e-06, "loss": 11.2001, "num_tokens": 9343277.0, "step": 1310 }, { "epoch": 0.9161425576519916, "grad_norm": 0.6594255092883604, "learning_rate": 3.726879315388199e-06, "loss": 11.4582, "num_tokens": 9350951.0, "step": 1311 }, { "epoch": 0.9168413696715584, "grad_norm": 0.7156352255325785, "learning_rate": 3.665910175375498e-06, "loss": 11.2162, "num_tokens": 9357659.0, "step": 1312 }, { "epoch": 0.9175401816911251, "grad_norm": 0.6425872585900699, "learning_rate": 3.6054345506684627e-06, "loss": 11.2544, "num_tokens": 9365388.0, "step": 1313 }, { "epoch": 0.9182389937106918, "grad_norm": 0.6714524114625421, "learning_rate": 3.5454527510810352e-06, "loss": 11.1915, "num_tokens": 9372586.0, "step": 1314 }, { "epoch": 0.9189378057302585, "grad_norm": 0.6582728677450521, "learning_rate": 3.485965083897347e-06, "loss": 11.1907, "num_tokens": 9380241.0, "step": 1315 }, { "epoch": 0.9196366177498253, "grad_norm": 0.7159032392269566, "learning_rate": 3.426971853870109e-06, "loss": 11.4437, "num_tokens": 9387292.0, "step": 1316 }, { "epoch": 0.9203354297693921, "grad_norm": 0.6942298939290196, "learning_rate": 3.3684733632190157e-06, "loss": 11.1795, "num_tokens": 9394171.0, "step": 1317 }, { "epoch": 0.9210342417889588, "grad_norm": 0.6692411875362031, "learning_rate": 3.310469911629288e-06, "loss": 11.2257, "num_tokens": 9401400.0, "step": 1318 }, { "epoch": 0.9217330538085255, "grad_norm": 0.728672155961115, "learning_rate": 3.252961796250054e-06, "loss": 11.1689, "num_tokens": 9408486.0, "step": 1319 }, { "epoch": 0.9224318658280922, "grad_norm": 0.7355419519065545, "learning_rate": 3.1959493116928476e-06, "loss": 11.4436, "num_tokens": 9415285.0, "step": 1320 }, { "epoch": 0.923130677847659, "grad_norm": 0.7030206494953251, "learning_rate": 3.1394327500301357e-06, "loss": 11.3679, "num_tokens": 9422270.0, "step": 1321 }, { "epoch": 0.9238294898672257, "grad_norm": 0.629823205533704, "learning_rate": 3.0834124007937614e-06, "loss": 11.3484, "num_tokens": 9430223.0, "step": 1322 }, { "epoch": 0.9245283018867925, "grad_norm": 0.6789253643905201, "learning_rate": 3.0278885509735234e-06, "loss": 11.2971, "num_tokens": 9437454.0, "step": 1323 }, { "epoch": 0.9252271139063591, "grad_norm": 0.6636854562456178, "learning_rate": 2.9728614850156653e-06, "loss": 11.1163, "num_tokens": 9444734.0, "step": 1324 }, { "epoch": 0.9259259259259259, "grad_norm": 0.7344693956693098, "learning_rate": 2.9183314848214127e-06, "loss": 11.3255, "num_tokens": 9451233.0, "step": 1325 }, { "epoch": 0.9266247379454927, "grad_norm": 0.6770150273794775, "learning_rate": 2.864298829745571e-06, "loss": 11.0587, "num_tokens": 9458292.0, "step": 1326 }, { "epoch": 0.9273235499650594, "grad_norm": 0.6755839488912377, "learning_rate": 2.8107637965950506e-06, "loss": 11.1855, "num_tokens": 9465760.0, "step": 1327 }, { "epoch": 0.9280223619846262, "grad_norm": 0.6907883258293367, "learning_rate": 2.7577266596274576e-06, "loss": 11.2504, "num_tokens": 9473090.0, "step": 1328 }, { "epoch": 0.9287211740041929, "grad_norm": 0.6894014385780622, "learning_rate": 2.7051876905497375e-06, "loss": 11.3714, "num_tokens": 9480091.0, "step": 1329 }, { "epoch": 0.9294199860237596, "grad_norm": 0.6507226401997432, "learning_rate": 2.6531471585167e-06, "loss": 11.3168, "num_tokens": 9487848.0, "step": 1330 }, { "epoch": 0.9301187980433263, "grad_norm": 0.6634040035460618, "learning_rate": 2.6016053301297196e-06, "loss": 11.1478, "num_tokens": 9495032.0, "step": 1331 }, { "epoch": 0.9308176100628931, "grad_norm": 0.6248126259198482, "learning_rate": 2.5505624694353024e-06, "loss": 11.1628, "num_tokens": 9502796.0, "step": 1332 }, { "epoch": 0.9315164220824598, "grad_norm": 0.7223781912313146, "learning_rate": 2.5000188379237786e-06, "loss": 11.2473, "num_tokens": 9509344.0, "step": 1333 }, { "epoch": 0.9322152341020266, "grad_norm": 0.6968748402634183, "learning_rate": 2.4499746945279566e-06, "loss": 11.227, "num_tokens": 9516077.0, "step": 1334 }, { "epoch": 0.9329140461215933, "grad_norm": 0.6518545278329891, "learning_rate": 2.4004302956217804e-06, "loss": 11.2451, "num_tokens": 9523359.0, "step": 1335 }, { "epoch": 0.93361285814116, "grad_norm": 0.7049583629463331, "learning_rate": 2.3513858950190204e-06, "loss": 11.2872, "num_tokens": 9530186.0, "step": 1336 }, { "epoch": 0.9343116701607268, "grad_norm": 0.7118301258107756, "learning_rate": 2.302841743971995e-06, "loss": 11.0207, "num_tokens": 9537136.0, "step": 1337 }, { "epoch": 0.9350104821802935, "grad_norm": 0.6928653342414067, "learning_rate": 2.2547980911702404e-06, "loss": 11.3115, "num_tokens": 9544223.0, "step": 1338 }, { "epoch": 0.9357092941998603, "grad_norm": 0.7236335982996023, "learning_rate": 2.2072551827392983e-06, "loss": 11.3392, "num_tokens": 9550920.0, "step": 1339 }, { "epoch": 0.9364081062194269, "grad_norm": 0.6465425844125828, "learning_rate": 2.1602132622393746e-06, "loss": 11.2663, "num_tokens": 9558586.0, "step": 1340 }, { "epoch": 0.9371069182389937, "grad_norm": 0.658991959593526, "learning_rate": 2.1136725706641712e-06, "loss": 11.3773, "num_tokens": 9566214.0, "step": 1341 }, { "epoch": 0.9378057302585604, "grad_norm": 0.6948550756585189, "learning_rate": 2.0676333464396126e-06, "loss": 11.2736, "num_tokens": 9573631.0, "step": 1342 }, { "epoch": 0.9385045422781272, "grad_norm": 0.7206740338733898, "learning_rate": 2.0220958254225984e-06, "loss": 11.3452, "num_tokens": 9580814.0, "step": 1343 }, { "epoch": 0.939203354297694, "grad_norm": 0.7268069168403607, "learning_rate": 1.977060240899864e-06, "loss": 11.2923, "num_tokens": 9587261.0, "step": 1344 }, { "epoch": 0.9399021663172606, "grad_norm": 0.7172014463716784, "learning_rate": 1.932526823586722e-06, "loss": 11.3546, "num_tokens": 9593973.0, "step": 1345 }, { "epoch": 0.9406009783368274, "grad_norm": 0.623418404084086, "learning_rate": 1.8884958016259113e-06, "loss": 11.2887, "num_tokens": 9601821.0, "step": 1346 }, { "epoch": 0.9412997903563941, "grad_norm": 0.6616585012666008, "learning_rate": 1.844967400586428e-06, "loss": 11.2776, "num_tokens": 9609343.0, "step": 1347 }, { "epoch": 0.9419986023759609, "grad_norm": 0.6433007452971647, "learning_rate": 1.8019418434623404e-06, "loss": 11.1876, "num_tokens": 9616602.0, "step": 1348 }, { "epoch": 0.9426974143955276, "grad_norm": 0.6822594858991043, "learning_rate": 1.7594193506716983e-06, "loss": 11.2809, "num_tokens": 9623622.0, "step": 1349 }, { "epoch": 0.9433962264150944, "grad_norm": 0.6684722637362679, "learning_rate": 1.7174001400553586e-06, "loss": 11.3421, "num_tokens": 9631105.0, "step": 1350 }, { "epoch": 0.944095038434661, "grad_norm": 0.6429394633148133, "learning_rate": 1.6758844268758843e-06, "loss": 11.3945, "num_tokens": 9638663.0, "step": 1351 }, { "epoch": 0.9447938504542278, "grad_norm": 0.5926480868121347, "learning_rate": 1.634872423816458e-06, "loss": 11.1168, "num_tokens": 9646957.0, "step": 1352 }, { "epoch": 0.9454926624737946, "grad_norm": 0.7066366283958508, "learning_rate": 1.5943643409797594e-06, "loss": 11.2229, "num_tokens": 9653670.0, "step": 1353 }, { "epoch": 0.9461914744933613, "grad_norm": 0.6611735635710152, "learning_rate": 1.5543603858869215e-06, "loss": 11.2078, "num_tokens": 9660888.0, "step": 1354 }, { "epoch": 0.9468902865129281, "grad_norm": 0.6742560705489566, "learning_rate": 1.5148607634764446e-06, "loss": 11.2851, "num_tokens": 9668331.0, "step": 1355 }, { "epoch": 0.9475890985324947, "grad_norm": 0.702592812073591, "learning_rate": 1.475865676103161e-06, "loss": 11.2943, "num_tokens": 9675259.0, "step": 1356 }, { "epoch": 0.9482879105520615, "grad_norm": 0.6570155584828766, "learning_rate": 1.4373753235371823e-06, "loss": 10.9385, "num_tokens": 9682674.0, "step": 1357 }, { "epoch": 0.9489867225716282, "grad_norm": 0.7784712864060985, "learning_rate": 1.3993899029628997e-06, "loss": 11.3884, "num_tokens": 9689182.0, "step": 1358 }, { "epoch": 0.949685534591195, "grad_norm": 0.6920868766511811, "learning_rate": 1.3619096089779293e-06, "loss": 11.4583, "num_tokens": 9696437.0, "step": 1359 }, { "epoch": 0.9503843466107617, "grad_norm": 0.7247008722668327, "learning_rate": 1.3249346335922007e-06, "loss": 11.4813, "num_tokens": 9703396.0, "step": 1360 }, { "epoch": 0.9510831586303284, "grad_norm": 0.6632602910524646, "learning_rate": 1.2884651662268709e-06, "loss": 11.3547, "num_tokens": 9711059.0, "step": 1361 }, { "epoch": 0.9517819706498952, "grad_norm": 0.6710435983883922, "learning_rate": 1.2525013937134122e-06, "loss": 11.3387, "num_tokens": 9718400.0, "step": 1362 }, { "epoch": 0.9524807826694619, "grad_norm": 0.664792473697077, "learning_rate": 1.2170435002926694e-06, "loss": 11.1944, "num_tokens": 9725645.0, "step": 1363 }, { "epoch": 0.9531795946890287, "grad_norm": 0.6707698174727578, "learning_rate": 1.1820916676138382e-06, "loss": 11.2262, "num_tokens": 9733005.0, "step": 1364 }, { "epoch": 0.9538784067085954, "grad_norm": 0.6688979482961926, "learning_rate": 1.147646074733655e-06, "loss": 11.2677, "num_tokens": 9740218.0, "step": 1365 }, { "epoch": 0.9545772187281621, "grad_norm": 0.7433455767451992, "learning_rate": 1.1137068981153632e-06, "loss": 11.4951, "num_tokens": 9746821.0, "step": 1366 }, { "epoch": 0.9552760307477288, "grad_norm": 0.7175584827821752, "learning_rate": 1.0802743116278714e-06, "loss": 11.2647, "num_tokens": 9753374.0, "step": 1367 }, { "epoch": 0.9559748427672956, "grad_norm": 0.6933785565534558, "learning_rate": 1.0473484865448525e-06, "loss": 11.387, "num_tokens": 9760505.0, "step": 1368 }, { "epoch": 0.9566736547868623, "grad_norm": 0.6647102154219586, "learning_rate": 1.014929591543845e-06, "loss": 11.3239, "num_tokens": 9767972.0, "step": 1369 }, { "epoch": 0.9573724668064291, "grad_norm": 0.7171114285326375, "learning_rate": 9.830177927054428e-07, "loss": 11.2421, "num_tokens": 9774744.0, "step": 1370 }, { "epoch": 0.9580712788259959, "grad_norm": 0.6753844797667017, "learning_rate": 9.516132535123846e-07, "loss": 11.0581, "num_tokens": 9782050.0, "step": 1371 }, { "epoch": 0.9587700908455625, "grad_norm": 0.6594805435739066, "learning_rate": 9.207161348487315e-07, "loss": 11.2986, "num_tokens": 9789435.0, "step": 1372 }, { "epoch": 0.9594689028651293, "grad_norm": 0.6219305497890216, "learning_rate": 8.903265949990691e-07, "loss": 11.2472, "num_tokens": 9797498.0, "step": 1373 }, { "epoch": 0.960167714884696, "grad_norm": 0.6797824852084828, "learning_rate": 8.604447896476852e-07, "loss": 11.2204, "num_tokens": 9805014.0, "step": 1374 }, { "epoch": 0.9608665269042628, "grad_norm": 0.637072288456489, "learning_rate": 8.310708718777371e-07, "loss": 11.0226, "num_tokens": 9812890.0, "step": 1375 }, { "epoch": 0.9615653389238294, "grad_norm": 0.6883492395222, "learning_rate": 8.022049921705299e-07, "loss": 11.1737, "num_tokens": 9819758.0, "step": 1376 }, { "epoch": 0.9622641509433962, "grad_norm": 0.6679558355010241, "learning_rate": 7.73847298404684e-07, "loss": 11.0855, "num_tokens": 9826786.0, "step": 1377 }, { "epoch": 0.9629629629629629, "grad_norm": 0.6523416128412312, "learning_rate": 7.459979358554248e-07, "loss": 11.2854, "num_tokens": 9834179.0, "step": 1378 }, { "epoch": 0.9636617749825297, "grad_norm": 0.6761703126788413, "learning_rate": 7.186570471937937e-07, "loss": 11.4241, "num_tokens": 9841706.0, "step": 1379 }, { "epoch": 0.9643605870020965, "grad_norm": 0.6539490861725904, "learning_rate": 6.918247724859939e-07, "loss": 11.0869, "num_tokens": 9849180.0, "step": 1380 }, { "epoch": 0.9650593990216632, "grad_norm": 0.6627942977748735, "learning_rate": 6.655012491925683e-07, "loss": 11.3332, "num_tokens": 9856546.0, "step": 1381 }, { "epoch": 0.9657582110412299, "grad_norm": 0.7525079583067344, "learning_rate": 6.396866121677559e-07, "loss": 11.485, "num_tokens": 9863096.0, "step": 1382 }, { "epoch": 0.9664570230607966, "grad_norm": 0.666826182150616, "learning_rate": 6.143809936588363e-07, "loss": 11.1335, "num_tokens": 9870255.0, "step": 1383 }, { "epoch": 0.9671558350803634, "grad_norm": 0.6242640875889792, "learning_rate": 5.895845233053643e-07, "loss": 11.2859, "num_tokens": 9878236.0, "step": 1384 }, { "epoch": 0.9678546470999301, "grad_norm": 0.7272158788230525, "learning_rate": 5.652973281385588e-07, "loss": 11.3917, "num_tokens": 9885146.0, "step": 1385 }, { "epoch": 0.9685534591194969, "grad_norm": 0.6972412481254419, "learning_rate": 5.415195325806699e-07, "loss": 11.2604, "num_tokens": 9892008.0, "step": 1386 }, { "epoch": 0.9692522711390635, "grad_norm": 0.6840355027864292, "learning_rate": 5.182512584443022e-07, "loss": 11.3559, "num_tokens": 9899023.0, "step": 1387 }, { "epoch": 0.9699510831586303, "grad_norm": 0.6604927934689941, "learning_rate": 4.954926249317815e-07, "loss": 11.0894, "num_tokens": 9906287.0, "step": 1388 }, { "epoch": 0.9706498951781971, "grad_norm": 0.6550659928475684, "learning_rate": 4.732437486345886e-07, "loss": 10.9861, "num_tokens": 9913471.0, "step": 1389 }, { "epoch": 0.9713487071977638, "grad_norm": 0.6731044015412156, "learning_rate": 4.515047435327491e-07, "loss": 11.3375, "num_tokens": 9920776.0, "step": 1390 }, { "epoch": 0.9720475192173306, "grad_norm": 0.6886371291373153, "learning_rate": 4.3027572099422207e-07, "loss": 11.3041, "num_tokens": 9927914.0, "step": 1391 }, { "epoch": 0.9727463312368972, "grad_norm": 0.7315035720036384, "learning_rate": 4.0955678977436797e-07, "loss": 11.3239, "num_tokens": 9934401.0, "step": 1392 }, { "epoch": 0.973445143256464, "grad_norm": 0.6731364179858964, "learning_rate": 3.893480560153484e-07, "loss": 11.1288, "num_tokens": 9941705.0, "step": 1393 }, { "epoch": 0.9741439552760307, "grad_norm": 0.6600863194244329, "learning_rate": 3.6964962324561593e-07, "loss": 11.2683, "num_tokens": 9949413.0, "step": 1394 }, { "epoch": 0.9748427672955975, "grad_norm": 0.6737648893244508, "learning_rate": 3.504615923793919e-07, "loss": 11.0837, "num_tokens": 9956435.0, "step": 1395 }, { "epoch": 0.9755415793151643, "grad_norm": 0.7145328987881118, "learning_rate": 3.317840617160894e-07, "loss": 11.2672, "num_tokens": 9963267.0, "step": 1396 }, { "epoch": 0.976240391334731, "grad_norm": 0.6822610524050463, "learning_rate": 3.136171269399024e-07, "loss": 10.9777, "num_tokens": 9970795.0, "step": 1397 }, { "epoch": 0.9769392033542977, "grad_norm": 0.7139750346598414, "learning_rate": 2.959608811192283e-07, "loss": 11.2859, "num_tokens": 9977599.0, "step": 1398 }, { "epoch": 0.9776380153738644, "grad_norm": 0.6629186760012954, "learning_rate": 2.7881541470623494e-07, "loss": 11.3439, "num_tokens": 9985126.0, "step": 1399 }, { "epoch": 0.9783368273934312, "grad_norm": 0.7021437040550742, "learning_rate": 2.6218081553638364e-07, "loss": 11.074, "num_tokens": 9991636.0, "step": 1400 }, { "epoch": 0.9790356394129979, "grad_norm": 0.651326390715108, "learning_rate": 2.4605716882801776e-07, "loss": 11.3278, "num_tokens": 9999333.0, "step": 1401 }, { "epoch": 0.9797344514325647, "grad_norm": 0.7077963783884662, "learning_rate": 2.3044455718185253e-07, "loss": 11.279, "num_tokens": 10006127.0, "step": 1402 }, { "epoch": 0.9804332634521313, "grad_norm": 0.7275599228420562, "learning_rate": 2.153430605806195e-07, "loss": 11.2103, "num_tokens": 10012832.0, "step": 1403 }, { "epoch": 0.9811320754716981, "grad_norm": 0.6445139570135618, "learning_rate": 2.0075275638862246e-07, "loss": 10.9713, "num_tokens": 10020535.0, "step": 1404 }, { "epoch": 0.9818308874912649, "grad_norm": 0.7116706126407417, "learning_rate": 1.8667371935133792e-07, "loss": 11.4616, "num_tokens": 10027624.0, "step": 1405 }, { "epoch": 0.9825296995108316, "grad_norm": 0.6841858958472258, "learning_rate": 1.7310602159505974e-07, "loss": 11.3032, "num_tokens": 10034683.0, "step": 1406 }, { "epoch": 0.9832285115303984, "grad_norm": 0.6705286305197254, "learning_rate": 1.6004973262651047e-07, "loss": 11.3338, "num_tokens": 10042053.0, "step": 1407 }, { "epoch": 0.983927323549965, "grad_norm": 0.6730415740783933, "learning_rate": 1.4750491933247512e-07, "loss": 11.3956, "num_tokens": 10049229.0, "step": 1408 }, { "epoch": 0.9846261355695318, "grad_norm": 0.7674497304328863, "learning_rate": 1.3547164597949026e-07, "loss": 11.5897, "num_tokens": 10055821.0, "step": 1409 }, { "epoch": 0.9853249475890985, "grad_norm": 0.6146955427792996, "learning_rate": 1.2394997421347753e-07, "loss": 11.3468, "num_tokens": 10064206.0, "step": 1410 }, { "epoch": 0.9860237596086653, "grad_norm": 0.7186141338407501, "learning_rate": 1.1293996305946631e-07, "loss": 11.2113, "num_tokens": 10070918.0, "step": 1411 }, { "epoch": 0.986722571628232, "grad_norm": 0.6471488770051232, "learning_rate": 1.0244166892124928e-07, "loss": 11.2701, "num_tokens": 10078257.0, "step": 1412 }, { "epoch": 0.9874213836477987, "grad_norm": 0.6966594622997705, "learning_rate": 9.245514558112733e-08, "loss": 11.3059, "num_tokens": 10085142.0, "step": 1413 }, { "epoch": 0.9881201956673655, "grad_norm": 0.7492634726939886, "learning_rate": 8.298044419962069e-08, "loss": 11.2983, "num_tokens": 10091532.0, "step": 1414 }, { "epoch": 0.9888190076869322, "grad_norm": 0.67547464453374, "learning_rate": 7.401761331521372e-08, "loss": 11.2074, "num_tokens": 10098630.0, "step": 1415 }, { "epoch": 0.989517819706499, "grad_norm": 0.6997394087199268, "learning_rate": 6.556669884408839e-08, "loss": 11.2439, "num_tokens": 10105381.0, "step": 1416 }, { "epoch": 0.9902166317260657, "grad_norm": 0.6784044100485935, "learning_rate": 5.7627744079902235e-08, "loss": 11.2719, "num_tokens": 10112328.0, "step": 1417 }, { "epoch": 0.9909154437456325, "grad_norm": 0.7376730133275492, "learning_rate": 5.0200789693588544e-08, "loss": 11.3005, "num_tokens": 10118964.0, "step": 1418 }, { "epoch": 0.9916142557651991, "grad_norm": 0.674112638355877, "learning_rate": 4.32858737330899e-08, "loss": 11.0974, "num_tokens": 10125874.0, "step": 1419 }, { "epoch": 0.9923130677847659, "grad_norm": 0.7551764155567859, "learning_rate": 3.6883031623224926e-08, "loss": 11.5891, "num_tokens": 10132274.0, "step": 1420 }, { "epoch": 0.9930118798043326, "grad_norm": 0.7272923742000015, "learning_rate": 3.099229616547739e-08, "loss": 11.3233, "num_tokens": 10138845.0, "step": 1421 }, { "epoch": 0.9937106918238994, "grad_norm": 0.7015750748072532, "learning_rate": 2.5613697537818504e-08, "loss": 11.3946, "num_tokens": 10145801.0, "step": 1422 }, { "epoch": 0.9944095038434662, "grad_norm": 0.6722209770345767, "learning_rate": 2.074726329457377e-08, "loss": 11.2842, "num_tokens": 10152749.0, "step": 1423 }, { "epoch": 0.9951083158630328, "grad_norm": 0.6390873103140037, "learning_rate": 1.6393018366278602e-08, "loss": 11.1965, "num_tokens": 10160312.0, "step": 1424 }, { "epoch": 0.9958071278825996, "grad_norm": 0.7038247897264213, "learning_rate": 1.2550985059522902e-08, "loss": 11.2494, "num_tokens": 10167099.0, "step": 1425 }, { "epoch": 0.9965059399021663, "grad_norm": 0.6666127532974914, "learning_rate": 9.221183056895566e-09, "loss": 11.2175, "num_tokens": 10174489.0, "step": 1426 }, { "epoch": 0.9972047519217331, "grad_norm": 0.6765466784943449, "learning_rate": 6.4036294168068335e-09, "loss": 11.3275, "num_tokens": 10182102.0, "step": 1427 }, { "epoch": 0.9979035639412998, "grad_norm": 0.6964009190338591, "learning_rate": 4.0983385734660875e-09, "loss": 11.1313, "num_tokens": 10188883.0, "step": 1428 }, { "epoch": 0.9986023759608665, "grad_norm": 0.7265152003293984, "learning_rate": 2.305322336781934e-09, "loss": 11.3144, "num_tokens": 10195672.0, "step": 1429 }, { "epoch": 0.9993011879804332, "grad_norm": 0.6683920838823425, "learning_rate": 1.0245898922844888e-09, "loss": 11.445, "num_tokens": 10203012.0, "step": 1430 }, { "epoch": 1.0, "grad_norm": 0.6789763311122045, "learning_rate": 2.561478011253726e-10, "loss": 11.2073, "num_tokens": 10210317.0, "step": 1431 }, { "epoch": 1.0, "step": 1431, "total_flos": 1443880778727424.0, "train_loss": 12.112521223552072, "train_runtime": 11701.6904, "train_samples_per_second": 7.829, "train_steps_per_second": 0.122 } ], "logging_steps": 1.0, "max_steps": 1431, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1443880778727424.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }