{ "best_global_step": 17172, "best_metric": 0.4832555055618286, "best_model_checkpoint": "saves/lntuning/llama-3-8b-instruct/train_codealpacapy_1754507521/checkpoint-17172", "epoch": 10.0, "eval_steps": 954, "global_step": 19080, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002620545073375262, "grad_norm": 1.7992247343063354, "learning_rate": 1.048218029350105e-07, "loss": 1.2559, "num_input_tokens_seen": 2944, "step": 5 }, { "epoch": 0.005241090146750524, "grad_norm": 0.8840388655662537, "learning_rate": 2.3584905660377358e-07, "loss": 1.1142, "num_input_tokens_seen": 6816, "step": 10 }, { "epoch": 0.007861635220125786, "grad_norm": 1.1742973327636719, "learning_rate": 3.6687631027253674e-07, "loss": 1.223, "num_input_tokens_seen": 9760, "step": 15 }, { "epoch": 0.010482180293501049, "grad_norm": 0.7600769400596619, "learning_rate": 4.979035639412998e-07, "loss": 1.1628, "num_input_tokens_seen": 13472, "step": 20 }, { "epoch": 0.01310272536687631, "grad_norm": 2.1344194412231445, "learning_rate": 6.28930817610063e-07, "loss": 1.2331, "num_input_tokens_seen": 16864, "step": 25 }, { "epoch": 0.015723270440251572, "grad_norm": 1.1733638048171997, "learning_rate": 7.59958071278826e-07, "loss": 1.357, "num_input_tokens_seen": 20352, "step": 30 }, { "epoch": 0.018343815513626835, "grad_norm": 1.2997647523880005, "learning_rate": 8.90985324947589e-07, "loss": 1.33, "num_input_tokens_seen": 23008, "step": 35 }, { "epoch": 0.020964360587002098, "grad_norm": 1.3185588121414185, "learning_rate": 1.0220125786163522e-06, "loss": 1.0737, "num_input_tokens_seen": 25920, "step": 40 }, { "epoch": 0.02358490566037736, "grad_norm": 1.7854304313659668, "learning_rate": 1.1530398322851154e-06, "loss": 1.2306, "num_input_tokens_seen": 28672, "step": 45 }, { "epoch": 0.02620545073375262, "grad_norm": 1.0071042776107788, "learning_rate": 1.2840670859538784e-06, "loss": 1.0765, "num_input_tokens_seen": 31488, "step": 50 }, { "epoch": 0.028825995807127882, "grad_norm": 1.145833134651184, "learning_rate": 1.4150943396226415e-06, "loss": 1.2129, "num_input_tokens_seen": 34496, "step": 55 }, { "epoch": 0.031446540880503145, "grad_norm": 0.7332286238670349, "learning_rate": 1.5461215932914047e-06, "loss": 1.291, "num_input_tokens_seen": 38112, "step": 60 }, { "epoch": 0.034067085953878404, "grad_norm": 1.308964490890503, "learning_rate": 1.677148846960168e-06, "loss": 1.3668, "num_input_tokens_seen": 40864, "step": 65 }, { "epoch": 0.03668763102725367, "grad_norm": 0.7027747631072998, "learning_rate": 1.8081761006289309e-06, "loss": 1.2761, "num_input_tokens_seen": 44192, "step": 70 }, { "epoch": 0.03930817610062893, "grad_norm": 1.738516092300415, "learning_rate": 1.939203354297694e-06, "loss": 1.4024, "num_input_tokens_seen": 46784, "step": 75 }, { "epoch": 0.041928721174004195, "grad_norm": 3.2979280948638916, "learning_rate": 2.0702306079664572e-06, "loss": 1.3883, "num_input_tokens_seen": 50176, "step": 80 }, { "epoch": 0.044549266247379454, "grad_norm": 0.8180651068687439, "learning_rate": 2.20125786163522e-06, "loss": 1.0861, "num_input_tokens_seen": 54400, "step": 85 }, { "epoch": 0.04716981132075472, "grad_norm": 1.6968659162521362, "learning_rate": 2.3322851153039836e-06, "loss": 1.4327, "num_input_tokens_seen": 56992, "step": 90 }, { "epoch": 0.04979035639412998, "grad_norm": 1.5224425792694092, "learning_rate": 2.4633123689727464e-06, "loss": 1.0124, "num_input_tokens_seen": 60896, "step": 95 }, { "epoch": 0.05241090146750524, "grad_norm": 0.6967443227767944, "learning_rate": 2.5943396226415095e-06, "loss": 1.1461, "num_input_tokens_seen": 64576, "step": 100 }, { "epoch": 0.055031446540880505, "grad_norm": 2.3950300216674805, "learning_rate": 2.7253668763102727e-06, "loss": 1.3752, "num_input_tokens_seen": 67616, "step": 105 }, { "epoch": 0.057651991614255764, "grad_norm": 0.9431440234184265, "learning_rate": 2.8563941299790355e-06, "loss": 1.0212, "num_input_tokens_seen": 72544, "step": 110 }, { "epoch": 0.06027253668763103, "grad_norm": 1.1005886793136597, "learning_rate": 2.987421383647799e-06, "loss": 1.2906, "num_input_tokens_seen": 75648, "step": 115 }, { "epoch": 0.06289308176100629, "grad_norm": 1.3984357118606567, "learning_rate": 3.118448637316562e-06, "loss": 1.3783, "num_input_tokens_seen": 78368, "step": 120 }, { "epoch": 0.06551362683438156, "grad_norm": 1.4639437198638916, "learning_rate": 3.249475890985325e-06, "loss": 1.2324, "num_input_tokens_seen": 81952, "step": 125 }, { "epoch": 0.06813417190775681, "grad_norm": 1.2373332977294922, "learning_rate": 3.380503144654088e-06, "loss": 1.1821, "num_input_tokens_seen": 84960, "step": 130 }, { "epoch": 0.07075471698113207, "grad_norm": 1.7597533464431763, "learning_rate": 3.5115303983228514e-06, "loss": 1.3003, "num_input_tokens_seen": 88576, "step": 135 }, { "epoch": 0.07337526205450734, "grad_norm": 1.289990782737732, "learning_rate": 3.642557651991614e-06, "loss": 1.0102, "num_input_tokens_seen": 92160, "step": 140 }, { "epoch": 0.0759958071278826, "grad_norm": 0.9363468885421753, "learning_rate": 3.7735849056603773e-06, "loss": 1.1006, "num_input_tokens_seen": 95840, "step": 145 }, { "epoch": 0.07861635220125786, "grad_norm": 1.6384217739105225, "learning_rate": 3.9046121593291405e-06, "loss": 1.1738, "num_input_tokens_seen": 98816, "step": 150 }, { "epoch": 0.08123689727463312, "grad_norm": 0.8482155799865723, "learning_rate": 4.035639412997904e-06, "loss": 0.9776, "num_input_tokens_seen": 102880, "step": 155 }, { "epoch": 0.08385744234800839, "grad_norm": 1.5524969100952148, "learning_rate": 4.166666666666667e-06, "loss": 1.1535, "num_input_tokens_seen": 105920, "step": 160 }, { "epoch": 0.08647798742138364, "grad_norm": 0.8594745993614197, "learning_rate": 4.29769392033543e-06, "loss": 1.3669, "num_input_tokens_seen": 108800, "step": 165 }, { "epoch": 0.08909853249475891, "grad_norm": 1.1186102628707886, "learning_rate": 4.428721174004193e-06, "loss": 1.0766, "num_input_tokens_seen": 111968, "step": 170 }, { "epoch": 0.09171907756813417, "grad_norm": 1.4487866163253784, "learning_rate": 4.559748427672956e-06, "loss": 1.1564, "num_input_tokens_seen": 114976, "step": 175 }, { "epoch": 0.09433962264150944, "grad_norm": 3.6846606731414795, "learning_rate": 4.69077568134172e-06, "loss": 1.4043, "num_input_tokens_seen": 117568, "step": 180 }, { "epoch": 0.09696016771488469, "grad_norm": 1.2929426431655884, "learning_rate": 4.821802935010482e-06, "loss": 1.0219, "num_input_tokens_seen": 120384, "step": 185 }, { "epoch": 0.09958071278825996, "grad_norm": 0.7642637491226196, "learning_rate": 4.952830188679246e-06, "loss": 0.9904, "num_input_tokens_seen": 123680, "step": 190 }, { "epoch": 0.10220125786163523, "grad_norm": 1.223405122756958, "learning_rate": 5.083857442348009e-06, "loss": 1.2228, "num_input_tokens_seen": 127168, "step": 195 }, { "epoch": 0.10482180293501048, "grad_norm": 1.347133994102478, "learning_rate": 5.2148846960167715e-06, "loss": 0.96, "num_input_tokens_seen": 130176, "step": 200 }, { "epoch": 0.10744234800838574, "grad_norm": 1.131230115890503, "learning_rate": 5.345911949685535e-06, "loss": 1.1251, "num_input_tokens_seen": 133056, "step": 205 }, { "epoch": 0.11006289308176101, "grad_norm": 1.1063246726989746, "learning_rate": 5.476939203354298e-06, "loss": 1.0373, "num_input_tokens_seen": 136864, "step": 210 }, { "epoch": 0.11268343815513626, "grad_norm": 1.3608777523040771, "learning_rate": 5.607966457023061e-06, "loss": 1.4549, "num_input_tokens_seen": 140064, "step": 215 }, { "epoch": 0.11530398322851153, "grad_norm": 1.365909457206726, "learning_rate": 5.738993710691824e-06, "loss": 1.0363, "num_input_tokens_seen": 143712, "step": 220 }, { "epoch": 0.1179245283018868, "grad_norm": 1.370688557624817, "learning_rate": 5.870020964360588e-06, "loss": 1.1322, "num_input_tokens_seen": 146880, "step": 225 }, { "epoch": 0.12054507337526206, "grad_norm": 1.1439714431762695, "learning_rate": 6.0010482180293506e-06, "loss": 1.0062, "num_input_tokens_seen": 150560, "step": 230 }, { "epoch": 0.12316561844863731, "grad_norm": 1.0369666814804077, "learning_rate": 6.132075471698113e-06, "loss": 1.2466, "num_input_tokens_seen": 153664, "step": 235 }, { "epoch": 0.12578616352201258, "grad_norm": 1.864905834197998, "learning_rate": 6.263102725366876e-06, "loss": 0.9513, "num_input_tokens_seen": 157568, "step": 240 }, { "epoch": 0.12840670859538783, "grad_norm": 1.3295080661773682, "learning_rate": 6.3941299790356405e-06, "loss": 1.0548, "num_input_tokens_seen": 160512, "step": 245 }, { "epoch": 0.1310272536687631, "grad_norm": 0.5349722504615784, "learning_rate": 6.5251572327044024e-06, "loss": 0.9695, "num_input_tokens_seen": 164512, "step": 250 }, { "epoch": 0.13364779874213836, "grad_norm": 2.1651926040649414, "learning_rate": 6.656184486373165e-06, "loss": 1.1725, "num_input_tokens_seen": 167200, "step": 255 }, { "epoch": 0.13626834381551362, "grad_norm": 1.9152958393096924, "learning_rate": 6.78721174004193e-06, "loss": 1.1931, "num_input_tokens_seen": 169984, "step": 260 }, { "epoch": 0.1388888888888889, "grad_norm": 1.1223480701446533, "learning_rate": 6.918238993710692e-06, "loss": 1.0408, "num_input_tokens_seen": 173856, "step": 265 }, { "epoch": 0.14150943396226415, "grad_norm": 1.5492643117904663, "learning_rate": 7.049266247379454e-06, "loss": 1.3722, "num_input_tokens_seen": 176192, "step": 270 }, { "epoch": 0.1441299790356394, "grad_norm": 0.7313210964202881, "learning_rate": 7.180293501048219e-06, "loss": 0.8449, "num_input_tokens_seen": 179840, "step": 275 }, { "epoch": 0.14675052410901468, "grad_norm": 1.154045581817627, "learning_rate": 7.3113207547169815e-06, "loss": 0.965, "num_input_tokens_seen": 182976, "step": 280 }, { "epoch": 0.14937106918238993, "grad_norm": 1.1218169927597046, "learning_rate": 7.442348008385745e-06, "loss": 0.9885, "num_input_tokens_seen": 186400, "step": 285 }, { "epoch": 0.1519916142557652, "grad_norm": 1.1598070859909058, "learning_rate": 7.573375262054508e-06, "loss": 1.036, "num_input_tokens_seen": 190144, "step": 290 }, { "epoch": 0.15461215932914046, "grad_norm": 1.4991387128829956, "learning_rate": 7.70440251572327e-06, "loss": 1.2609, "num_input_tokens_seen": 193344, "step": 295 }, { "epoch": 0.15723270440251572, "grad_norm": 1.7046396732330322, "learning_rate": 7.835429769392034e-06, "loss": 0.9461, "num_input_tokens_seen": 195904, "step": 300 }, { "epoch": 0.159853249475891, "grad_norm": 1.4975050687789917, "learning_rate": 7.966457023060797e-06, "loss": 1.0941, "num_input_tokens_seen": 198944, "step": 305 }, { "epoch": 0.16247379454926625, "grad_norm": 2.132274627685547, "learning_rate": 8.09748427672956e-06, "loss": 1.2746, "num_input_tokens_seen": 201792, "step": 310 }, { "epoch": 0.1650943396226415, "grad_norm": 0.9286038875579834, "learning_rate": 8.228511530398324e-06, "loss": 0.9329, "num_input_tokens_seen": 206080, "step": 315 }, { "epoch": 0.16771488469601678, "grad_norm": 1.3184648752212524, "learning_rate": 8.359538784067087e-06, "loss": 1.1011, "num_input_tokens_seen": 209376, "step": 320 }, { "epoch": 0.17033542976939203, "grad_norm": 1.2758986949920654, "learning_rate": 8.49056603773585e-06, "loss": 1.2416, "num_input_tokens_seen": 212160, "step": 325 }, { "epoch": 0.17295597484276728, "grad_norm": 0.823065459728241, "learning_rate": 8.621593291404612e-06, "loss": 0.8963, "num_input_tokens_seen": 216128, "step": 330 }, { "epoch": 0.17557651991614256, "grad_norm": 1.1908247470855713, "learning_rate": 8.752620545073375e-06, "loss": 1.9211, "num_input_tokens_seen": 219232, "step": 335 }, { "epoch": 0.17819706498951782, "grad_norm": 1.010787010192871, "learning_rate": 8.883647798742138e-06, "loss": 0.9744, "num_input_tokens_seen": 222464, "step": 340 }, { "epoch": 0.18081761006289307, "grad_norm": 1.3557424545288086, "learning_rate": 9.014675052410902e-06, "loss": 1.3483, "num_input_tokens_seen": 225184, "step": 345 }, { "epoch": 0.18343815513626835, "grad_norm": 1.4151729345321655, "learning_rate": 9.145702306079665e-06, "loss": 1.1394, "num_input_tokens_seen": 228000, "step": 350 }, { "epoch": 0.1860587002096436, "grad_norm": 0.8666908740997314, "learning_rate": 9.276729559748428e-06, "loss": 0.9822, "num_input_tokens_seen": 230944, "step": 355 }, { "epoch": 0.18867924528301888, "grad_norm": 1.8694266080856323, "learning_rate": 9.40775681341719e-06, "loss": 1.0163, "num_input_tokens_seen": 233728, "step": 360 }, { "epoch": 0.19129979035639413, "grad_norm": 1.449751853942871, "learning_rate": 9.538784067085953e-06, "loss": 0.9524, "num_input_tokens_seen": 236480, "step": 365 }, { "epoch": 0.19392033542976939, "grad_norm": 2.037097454071045, "learning_rate": 9.669811320754718e-06, "loss": 1.1336, "num_input_tokens_seen": 239488, "step": 370 }, { "epoch": 0.19654088050314467, "grad_norm": 1.3292194604873657, "learning_rate": 9.80083857442348e-06, "loss": 1.1487, "num_input_tokens_seen": 242144, "step": 375 }, { "epoch": 0.19916142557651992, "grad_norm": 1.0996702909469604, "learning_rate": 9.931865828092243e-06, "loss": 1.0173, "num_input_tokens_seen": 245216, "step": 380 }, { "epoch": 0.20178197064989517, "grad_norm": 1.2654340267181396, "learning_rate": 1.0062893081761008e-05, "loss": 0.9678, "num_input_tokens_seen": 249632, "step": 385 }, { "epoch": 0.20440251572327045, "grad_norm": 0.7788991928100586, "learning_rate": 1.019392033542977e-05, "loss": 1.0679, "num_input_tokens_seen": 253312, "step": 390 }, { "epoch": 0.2070230607966457, "grad_norm": 1.3950775861740112, "learning_rate": 1.0324947589098532e-05, "loss": 1.0214, "num_input_tokens_seen": 256384, "step": 395 }, { "epoch": 0.20964360587002095, "grad_norm": 2.7373485565185547, "learning_rate": 1.0455974842767296e-05, "loss": 1.2251, "num_input_tokens_seen": 259744, "step": 400 }, { "epoch": 0.21226415094339623, "grad_norm": 1.848958969116211, "learning_rate": 1.0587002096436059e-05, "loss": 1.2692, "num_input_tokens_seen": 263456, "step": 405 }, { "epoch": 0.2148846960167715, "grad_norm": 1.2408130168914795, "learning_rate": 1.0718029350104822e-05, "loss": 0.968, "num_input_tokens_seen": 266496, "step": 410 }, { "epoch": 0.21750524109014674, "grad_norm": 1.3775478601455688, "learning_rate": 1.0849056603773586e-05, "loss": 1.0973, "num_input_tokens_seen": 269600, "step": 415 }, { "epoch": 0.22012578616352202, "grad_norm": 1.4160349369049072, "learning_rate": 1.0980083857442349e-05, "loss": 1.1648, "num_input_tokens_seen": 273152, "step": 420 }, { "epoch": 0.22274633123689727, "grad_norm": 1.261025071144104, "learning_rate": 1.1111111111111112e-05, "loss": 1.3947, "num_input_tokens_seen": 276992, "step": 425 }, { "epoch": 0.22536687631027252, "grad_norm": 1.338903546333313, "learning_rate": 1.1242138364779874e-05, "loss": 1.2857, "num_input_tokens_seen": 279296, "step": 430 }, { "epoch": 0.2279874213836478, "grad_norm": 0.722646951675415, "learning_rate": 1.1373165618448637e-05, "loss": 0.8463, "num_input_tokens_seen": 283296, "step": 435 }, { "epoch": 0.23060796645702306, "grad_norm": 2.1584556102752686, "learning_rate": 1.1504192872117402e-05, "loss": 1.3791, "num_input_tokens_seen": 286016, "step": 440 }, { "epoch": 0.23322851153039834, "grad_norm": 1.5198266506195068, "learning_rate": 1.1635220125786164e-05, "loss": 0.9738, "num_input_tokens_seen": 289248, "step": 445 }, { "epoch": 0.2358490566037736, "grad_norm": 0.6625983715057373, "learning_rate": 1.1766247379454927e-05, "loss": 1.2767, "num_input_tokens_seen": 292160, "step": 450 }, { "epoch": 0.23846960167714884, "grad_norm": 1.4233908653259277, "learning_rate": 1.1897274633123692e-05, "loss": 1.0111, "num_input_tokens_seen": 295296, "step": 455 }, { "epoch": 0.24109014675052412, "grad_norm": 0.7290078401565552, "learning_rate": 1.2028301886792454e-05, "loss": 0.9696, "num_input_tokens_seen": 299360, "step": 460 }, { "epoch": 0.24371069182389937, "grad_norm": 1.0026768445968628, "learning_rate": 1.2159329140461215e-05, "loss": 0.8392, "num_input_tokens_seen": 303072, "step": 465 }, { "epoch": 0.24633123689727462, "grad_norm": 0.8622721433639526, "learning_rate": 1.229035639412998e-05, "loss": 0.8001, "num_input_tokens_seen": 307616, "step": 470 }, { "epoch": 0.2489517819706499, "grad_norm": 0.9228618741035461, "learning_rate": 1.2421383647798743e-05, "loss": 1.1359, "num_input_tokens_seen": 310112, "step": 475 }, { "epoch": 0.25157232704402516, "grad_norm": 1.406606912612915, "learning_rate": 1.2552410901467507e-05, "loss": 0.9037, "num_input_tokens_seen": 313472, "step": 480 }, { "epoch": 0.25419287211740044, "grad_norm": 0.9839485287666321, "learning_rate": 1.2683438155136268e-05, "loss": 1.0369, "num_input_tokens_seen": 316128, "step": 485 }, { "epoch": 0.25681341719077566, "grad_norm": 1.072838544845581, "learning_rate": 1.2814465408805033e-05, "loss": 0.8132, "num_input_tokens_seen": 319584, "step": 490 }, { "epoch": 0.25943396226415094, "grad_norm": 0.8594610095024109, "learning_rate": 1.2945492662473795e-05, "loss": 0.871, "num_input_tokens_seen": 322496, "step": 495 }, { "epoch": 0.2620545073375262, "grad_norm": 1.2082688808441162, "learning_rate": 1.3076519916142556e-05, "loss": 1.3648, "num_input_tokens_seen": 325120, "step": 500 }, { "epoch": 0.26467505241090145, "grad_norm": 1.2511188983917236, "learning_rate": 1.320754716981132e-05, "loss": 1.0639, "num_input_tokens_seen": 327840, "step": 505 }, { "epoch": 0.2672955974842767, "grad_norm": 1.226073980331421, "learning_rate": 1.3338574423480085e-05, "loss": 0.6794, "num_input_tokens_seen": 331776, "step": 510 }, { "epoch": 0.269916142557652, "grad_norm": 2.0196211338043213, "learning_rate": 1.3469601677148846e-05, "loss": 0.872, "num_input_tokens_seen": 336544, "step": 515 }, { "epoch": 0.27253668763102723, "grad_norm": 0.7312788367271423, "learning_rate": 1.360062893081761e-05, "loss": 0.9685, "num_input_tokens_seen": 340768, "step": 520 }, { "epoch": 0.2751572327044025, "grad_norm": 1.4625129699707031, "learning_rate": 1.3731656184486375e-05, "loss": 1.1667, "num_input_tokens_seen": 343296, "step": 525 }, { "epoch": 0.2777777777777778, "grad_norm": 0.9751467108726501, "learning_rate": 1.3862683438155136e-05, "loss": 0.7863, "num_input_tokens_seen": 346848, "step": 530 }, { "epoch": 0.280398322851153, "grad_norm": 0.774558961391449, "learning_rate": 1.3993710691823899e-05, "loss": 1.0197, "num_input_tokens_seen": 349632, "step": 535 }, { "epoch": 0.2830188679245283, "grad_norm": 0.878465473651886, "learning_rate": 1.4124737945492664e-05, "loss": 0.805, "num_input_tokens_seen": 353024, "step": 540 }, { "epoch": 0.2856394129979036, "grad_norm": 1.0614235401153564, "learning_rate": 1.4255765199161425e-05, "loss": 0.9743, "num_input_tokens_seen": 355552, "step": 545 }, { "epoch": 0.2882599580712788, "grad_norm": 1.2745479345321655, "learning_rate": 1.4386792452830189e-05, "loss": 0.7099, "num_input_tokens_seen": 360128, "step": 550 }, { "epoch": 0.2908805031446541, "grad_norm": 0.6864871382713318, "learning_rate": 1.4517819706498954e-05, "loss": 0.815, "num_input_tokens_seen": 363360, "step": 555 }, { "epoch": 0.29350104821802936, "grad_norm": 1.185288906097412, "learning_rate": 1.4648846960167716e-05, "loss": 0.9456, "num_input_tokens_seen": 366144, "step": 560 }, { "epoch": 0.29612159329140464, "grad_norm": 1.236533284187317, "learning_rate": 1.4779874213836479e-05, "loss": 0.8335, "num_input_tokens_seen": 369216, "step": 565 }, { "epoch": 0.29874213836477986, "grad_norm": 1.6934411525726318, "learning_rate": 1.4910901467505242e-05, "loss": 0.8643, "num_input_tokens_seen": 372512, "step": 570 }, { "epoch": 0.30136268343815514, "grad_norm": 1.2627981901168823, "learning_rate": 1.5041928721174006e-05, "loss": 0.9764, "num_input_tokens_seen": 374976, "step": 575 }, { "epoch": 0.3039832285115304, "grad_norm": 2.248666286468506, "learning_rate": 1.5172955974842767e-05, "loss": 0.8441, "num_input_tokens_seen": 377632, "step": 580 }, { "epoch": 0.30660377358490565, "grad_norm": 0.7787745594978333, "learning_rate": 1.530398322851153e-05, "loss": 1.002, "num_input_tokens_seen": 380768, "step": 585 }, { "epoch": 0.30922431865828093, "grad_norm": 0.9418494701385498, "learning_rate": 1.5435010482180296e-05, "loss": 0.821, "num_input_tokens_seen": 383520, "step": 590 }, { "epoch": 0.3118448637316562, "grad_norm": 2.1940321922302246, "learning_rate": 1.5566037735849056e-05, "loss": 1.0822, "num_input_tokens_seen": 386496, "step": 595 }, { "epoch": 0.31446540880503143, "grad_norm": 1.276610016822815, "learning_rate": 1.5697064989517822e-05, "loss": 0.845, "num_input_tokens_seen": 390464, "step": 600 }, { "epoch": 0.3170859538784067, "grad_norm": 1.109817624092102, "learning_rate": 1.5828092243186584e-05, "loss": 0.6852, "num_input_tokens_seen": 393440, "step": 605 }, { "epoch": 0.319706498951782, "grad_norm": 0.9866474270820618, "learning_rate": 1.5959119496855347e-05, "loss": 0.692, "num_input_tokens_seen": 398336, "step": 610 }, { "epoch": 0.3223270440251572, "grad_norm": 1.163825273513794, "learning_rate": 1.609014675052411e-05, "loss": 0.6985, "num_input_tokens_seen": 401504, "step": 615 }, { "epoch": 0.3249475890985325, "grad_norm": 1.6499478816986084, "learning_rate": 1.6221174004192873e-05, "loss": 0.7244, "num_input_tokens_seen": 404864, "step": 620 }, { "epoch": 0.3275681341719078, "grad_norm": 0.6231809854507446, "learning_rate": 1.6352201257861635e-05, "loss": 0.8191, "num_input_tokens_seen": 409056, "step": 625 }, { "epoch": 0.330188679245283, "grad_norm": 1.1174907684326172, "learning_rate": 1.6483228511530398e-05, "loss": 0.7973, "num_input_tokens_seen": 412544, "step": 630 }, { "epoch": 0.3328092243186583, "grad_norm": 1.0812493562698364, "learning_rate": 1.6614255765199164e-05, "loss": 0.803, "num_input_tokens_seen": 415520, "step": 635 }, { "epoch": 0.33542976939203356, "grad_norm": 1.0331084728240967, "learning_rate": 1.6745283018867924e-05, "loss": 1.3702, "num_input_tokens_seen": 417760, "step": 640 }, { "epoch": 0.3380503144654088, "grad_norm": 0.5639844536781311, "learning_rate": 1.687631027253669e-05, "loss": 0.7488, "num_input_tokens_seen": 421728, "step": 645 }, { "epoch": 0.34067085953878407, "grad_norm": 3.896151542663574, "learning_rate": 1.7007337526205453e-05, "loss": 0.8601, "num_input_tokens_seen": 425664, "step": 650 }, { "epoch": 0.34329140461215935, "grad_norm": 1.4503062963485718, "learning_rate": 1.7138364779874212e-05, "loss": 0.6816, "num_input_tokens_seen": 428896, "step": 655 }, { "epoch": 0.34591194968553457, "grad_norm": 1.1440620422363281, "learning_rate": 1.7269392033542978e-05, "loss": 0.7785, "num_input_tokens_seen": 431744, "step": 660 }, { "epoch": 0.34853249475890985, "grad_norm": 1.0137196779251099, "learning_rate": 1.740041928721174e-05, "loss": 0.7293, "num_input_tokens_seen": 435520, "step": 665 }, { "epoch": 0.35115303983228513, "grad_norm": 0.717258632183075, "learning_rate": 1.7531446540880504e-05, "loss": 0.6739, "num_input_tokens_seen": 438592, "step": 670 }, { "epoch": 0.35377358490566035, "grad_norm": 0.8844522833824158, "learning_rate": 1.7662473794549266e-05, "loss": 0.7009, "num_input_tokens_seen": 441664, "step": 675 }, { "epoch": 0.35639412997903563, "grad_norm": 1.6868358850479126, "learning_rate": 1.779350104821803e-05, "loss": 0.8869, "num_input_tokens_seen": 444832, "step": 680 }, { "epoch": 0.3590146750524109, "grad_norm": 0.8615331053733826, "learning_rate": 1.7924528301886792e-05, "loss": 0.9227, "num_input_tokens_seen": 447392, "step": 685 }, { "epoch": 0.36163522012578614, "grad_norm": 1.446609377861023, "learning_rate": 1.8055555555555555e-05, "loss": 0.7702, "num_input_tokens_seen": 450528, "step": 690 }, { "epoch": 0.3642557651991614, "grad_norm": 0.9539512991905212, "learning_rate": 1.818658280922432e-05, "loss": 0.6522, "num_input_tokens_seen": 453504, "step": 695 }, { "epoch": 0.3668763102725367, "grad_norm": 1.536681056022644, "learning_rate": 1.831761006289308e-05, "loss": 1.3315, "num_input_tokens_seen": 456288, "step": 700 }, { "epoch": 0.3694968553459119, "grad_norm": 1.2888503074645996, "learning_rate": 1.8448637316561846e-05, "loss": 0.5672, "num_input_tokens_seen": 459840, "step": 705 }, { "epoch": 0.3721174004192872, "grad_norm": 0.6195198893547058, "learning_rate": 1.857966457023061e-05, "loss": 0.6556, "num_input_tokens_seen": 463168, "step": 710 }, { "epoch": 0.3747379454926625, "grad_norm": 1.0797394514083862, "learning_rate": 1.8710691823899372e-05, "loss": 0.6924, "num_input_tokens_seen": 466112, "step": 715 }, { "epoch": 0.37735849056603776, "grad_norm": 1.230659008026123, "learning_rate": 1.8841719077568135e-05, "loss": 0.8224, "num_input_tokens_seen": 469568, "step": 720 }, { "epoch": 0.379979035639413, "grad_norm": 0.897929847240448, "learning_rate": 1.8972746331236897e-05, "loss": 0.588, "num_input_tokens_seen": 472896, "step": 725 }, { "epoch": 0.38259958071278827, "grad_norm": 1.3094613552093506, "learning_rate": 1.9103773584905664e-05, "loss": 0.5736, "num_input_tokens_seen": 475744, "step": 730 }, { "epoch": 0.38522012578616355, "grad_norm": 0.3559638559818268, "learning_rate": 1.9234800838574423e-05, "loss": 0.6811, "num_input_tokens_seen": 481216, "step": 735 }, { "epoch": 0.38784067085953877, "grad_norm": 0.6804262399673462, "learning_rate": 1.936582809224319e-05, "loss": 0.7014, "num_input_tokens_seen": 483488, "step": 740 }, { "epoch": 0.39046121593291405, "grad_norm": 0.846321702003479, "learning_rate": 1.9496855345911952e-05, "loss": 0.7478, "num_input_tokens_seen": 487072, "step": 745 }, { "epoch": 0.39308176100628933, "grad_norm": 0.7349975109100342, "learning_rate": 1.9627882599580715e-05, "loss": 0.6759, "num_input_tokens_seen": 490496, "step": 750 }, { "epoch": 0.39570230607966456, "grad_norm": 0.7891698479652405, "learning_rate": 1.9758909853249477e-05, "loss": 0.634, "num_input_tokens_seen": 493824, "step": 755 }, { "epoch": 0.39832285115303984, "grad_norm": 1.0618149042129517, "learning_rate": 1.988993710691824e-05, "loss": 0.9253, "num_input_tokens_seen": 496192, "step": 760 }, { "epoch": 0.4009433962264151, "grad_norm": 0.7118943333625793, "learning_rate": 2.0020964360587003e-05, "loss": 0.7723, "num_input_tokens_seen": 498880, "step": 765 }, { "epoch": 0.40356394129979034, "grad_norm": 0.7049108743667603, "learning_rate": 2.0151991614255766e-05, "loss": 0.6795, "num_input_tokens_seen": 502016, "step": 770 }, { "epoch": 0.4061844863731656, "grad_norm": 0.3776947259902954, "learning_rate": 2.0283018867924532e-05, "loss": 0.7143, "num_input_tokens_seen": 505120, "step": 775 }, { "epoch": 0.4088050314465409, "grad_norm": 0.765929639339447, "learning_rate": 2.041404612159329e-05, "loss": 0.6571, "num_input_tokens_seen": 508256, "step": 780 }, { "epoch": 0.4114255765199161, "grad_norm": 0.7426820397377014, "learning_rate": 2.0545073375262054e-05, "loss": 0.6686, "num_input_tokens_seen": 511776, "step": 785 }, { "epoch": 0.4140461215932914, "grad_norm": 1.0526981353759766, "learning_rate": 2.067610062893082e-05, "loss": 0.7485, "num_input_tokens_seen": 514464, "step": 790 }, { "epoch": 0.4166666666666667, "grad_norm": 0.7280814051628113, "learning_rate": 2.080712788259958e-05, "loss": 0.6462, "num_input_tokens_seen": 518144, "step": 795 }, { "epoch": 0.4192872117400419, "grad_norm": 0.9274876713752747, "learning_rate": 2.0938155136268346e-05, "loss": 0.7514, "num_input_tokens_seen": 520768, "step": 800 }, { "epoch": 0.4219077568134172, "grad_norm": 0.7540377378463745, "learning_rate": 2.106918238993711e-05, "loss": 0.7792, "num_input_tokens_seen": 523808, "step": 805 }, { "epoch": 0.42452830188679247, "grad_norm": 0.9238664507865906, "learning_rate": 2.120020964360587e-05, "loss": 0.4926, "num_input_tokens_seen": 527136, "step": 810 }, { "epoch": 0.4271488469601677, "grad_norm": 0.8880646824836731, "learning_rate": 2.1331236897274634e-05, "loss": 0.7759, "num_input_tokens_seen": 530240, "step": 815 }, { "epoch": 0.429769392033543, "grad_norm": 1.0727427005767822, "learning_rate": 2.1462264150943397e-05, "loss": 0.6569, "num_input_tokens_seen": 534496, "step": 820 }, { "epoch": 0.43238993710691825, "grad_norm": 1.5853917598724365, "learning_rate": 2.159329140461216e-05, "loss": 0.722, "num_input_tokens_seen": 538624, "step": 825 }, { "epoch": 0.4350104821802935, "grad_norm": 0.9889712333679199, "learning_rate": 2.1724318658280922e-05, "loss": 0.6703, "num_input_tokens_seen": 542112, "step": 830 }, { "epoch": 0.43763102725366876, "grad_norm": 0.7276128530502319, "learning_rate": 2.1855345911949688e-05, "loss": 0.7779, "num_input_tokens_seen": 545408, "step": 835 }, { "epoch": 0.44025157232704404, "grad_norm": 0.4611942172050476, "learning_rate": 2.1986373165618448e-05, "loss": 0.6052, "num_input_tokens_seen": 548416, "step": 840 }, { "epoch": 0.44287211740041926, "grad_norm": 0.7090238332748413, "learning_rate": 2.2117400419287214e-05, "loss": 0.6435, "num_input_tokens_seen": 551264, "step": 845 }, { "epoch": 0.44549266247379454, "grad_norm": 0.48046645522117615, "learning_rate": 2.2248427672955977e-05, "loss": 0.5399, "num_input_tokens_seen": 554592, "step": 850 }, { "epoch": 0.4481132075471698, "grad_norm": 0.9261488914489746, "learning_rate": 2.237945492662474e-05, "loss": 0.4638, "num_input_tokens_seen": 558304, "step": 855 }, { "epoch": 0.45073375262054505, "grad_norm": 0.7011964917182922, "learning_rate": 2.2510482180293502e-05, "loss": 0.6784, "num_input_tokens_seen": 561248, "step": 860 }, { "epoch": 0.4533542976939203, "grad_norm": 0.6349368095397949, "learning_rate": 2.2641509433962265e-05, "loss": 0.4814, "num_input_tokens_seen": 564352, "step": 865 }, { "epoch": 0.4559748427672956, "grad_norm": 1.1659247875213623, "learning_rate": 2.2772536687631028e-05, "loss": 0.5978, "num_input_tokens_seen": 567712, "step": 870 }, { "epoch": 0.4585953878406709, "grad_norm": 1.0578056573867798, "learning_rate": 2.290356394129979e-05, "loss": 0.6886, "num_input_tokens_seen": 570240, "step": 875 }, { "epoch": 0.4612159329140461, "grad_norm": 1.324271559715271, "learning_rate": 2.3034591194968556e-05, "loss": 0.7356, "num_input_tokens_seen": 572768, "step": 880 }, { "epoch": 0.4638364779874214, "grad_norm": 0.6794723868370056, "learning_rate": 2.316561844863732e-05, "loss": 0.6784, "num_input_tokens_seen": 576512, "step": 885 }, { "epoch": 0.46645702306079667, "grad_norm": 0.7098039388656616, "learning_rate": 2.329664570230608e-05, "loss": 0.6888, "num_input_tokens_seen": 579264, "step": 890 }, { "epoch": 0.4690775681341719, "grad_norm": 0.4916303753852844, "learning_rate": 2.3427672955974845e-05, "loss": 0.6972, "num_input_tokens_seen": 581440, "step": 895 }, { "epoch": 0.4716981132075472, "grad_norm": 0.4219779372215271, "learning_rate": 2.3558700209643607e-05, "loss": 0.4396, "num_input_tokens_seen": 584544, "step": 900 }, { "epoch": 0.47431865828092246, "grad_norm": 0.7790907621383667, "learning_rate": 2.368972746331237e-05, "loss": 0.6777, "num_input_tokens_seen": 588160, "step": 905 }, { "epoch": 0.4769392033542977, "grad_norm": 1.277363657951355, "learning_rate": 2.3820754716981133e-05, "loss": 0.66, "num_input_tokens_seen": 590400, "step": 910 }, { "epoch": 0.47955974842767296, "grad_norm": 0.7132774591445923, "learning_rate": 2.39517819706499e-05, "loss": 0.5474, "num_input_tokens_seen": 593440, "step": 915 }, { "epoch": 0.48218029350104824, "grad_norm": 0.7276756763458252, "learning_rate": 2.408280922431866e-05, "loss": 0.6239, "num_input_tokens_seen": 596480, "step": 920 }, { "epoch": 0.48480083857442346, "grad_norm": 0.6013426780700684, "learning_rate": 2.421383647798742e-05, "loss": 0.5993, "num_input_tokens_seen": 599392, "step": 925 }, { "epoch": 0.48742138364779874, "grad_norm": 0.8034766316413879, "learning_rate": 2.4344863731656187e-05, "loss": 0.5016, "num_input_tokens_seen": 602432, "step": 930 }, { "epoch": 0.490041928721174, "grad_norm": 0.5819337964057922, "learning_rate": 2.4475890985324947e-05, "loss": 0.631, "num_input_tokens_seen": 604512, "step": 935 }, { "epoch": 0.49266247379454925, "grad_norm": 0.3543080985546112, "learning_rate": 2.4606918238993713e-05, "loss": 0.5609, "num_input_tokens_seen": 608864, "step": 940 }, { "epoch": 0.49528301886792453, "grad_norm": 0.6343904733657837, "learning_rate": 2.4737945492662476e-05, "loss": 0.5764, "num_input_tokens_seen": 612448, "step": 945 }, { "epoch": 0.4979035639412998, "grad_norm": 4.001172065734863, "learning_rate": 2.486897274633124e-05, "loss": 0.6575, "num_input_tokens_seen": 614944, "step": 950 }, { "epoch": 0.5, "eval_loss": 0.6278152465820312, "eval_runtime": 14.5492, "eval_samples_per_second": 58.285, "eval_steps_per_second": 14.571, "num_input_tokens_seen": 616992, "step": 954 }, { "epoch": 0.500524109014675, "grad_norm": 0.9471813440322876, "learning_rate": 2.5e-05, "loss": 0.6183, "num_input_tokens_seen": 617472, "step": 955 }, { "epoch": 0.5031446540880503, "grad_norm": 0.9817367196083069, "learning_rate": 2.5131027253668764e-05, "loss": 0.5414, "num_input_tokens_seen": 620192, "step": 960 }, { "epoch": 0.5057651991614256, "grad_norm": 0.530161440372467, "learning_rate": 2.526205450733753e-05, "loss": 0.6436, "num_input_tokens_seen": 623232, "step": 965 }, { "epoch": 0.5083857442348009, "grad_norm": 0.5066660642623901, "learning_rate": 2.5393081761006293e-05, "loss": 0.7549, "num_input_tokens_seen": 626016, "step": 970 }, { "epoch": 0.5110062893081762, "grad_norm": 0.6713323593139648, "learning_rate": 2.5524109014675052e-05, "loss": 0.5528, "num_input_tokens_seen": 629472, "step": 975 }, { "epoch": 0.5136268343815513, "grad_norm": 1.0115481615066528, "learning_rate": 2.5655136268343815e-05, "loss": 0.6866, "num_input_tokens_seen": 632608, "step": 980 }, { "epoch": 0.5162473794549266, "grad_norm": 0.8512790203094482, "learning_rate": 2.578616352201258e-05, "loss": 0.5038, "num_input_tokens_seen": 636192, "step": 985 }, { "epoch": 0.5188679245283019, "grad_norm": 0.5555885434150696, "learning_rate": 2.5917190775681344e-05, "loss": 0.5631, "num_input_tokens_seen": 640544, "step": 990 }, { "epoch": 0.5214884696016772, "grad_norm": 0.6243686676025391, "learning_rate": 2.6048218029350107e-05, "loss": 0.5354, "num_input_tokens_seen": 644448, "step": 995 }, { "epoch": 0.5241090146750524, "grad_norm": 0.42946845293045044, "learning_rate": 2.6179245283018873e-05, "loss": 0.5524, "num_input_tokens_seen": 648288, "step": 1000 }, { "epoch": 0.5267295597484277, "grad_norm": 0.8081832528114319, "learning_rate": 2.631027253668763e-05, "loss": 0.7332, "num_input_tokens_seen": 651296, "step": 1005 }, { "epoch": 0.5293501048218029, "grad_norm": 0.3930015563964844, "learning_rate": 2.6441299790356395e-05, "loss": 0.549, "num_input_tokens_seen": 654176, "step": 1010 }, { "epoch": 0.5319706498951782, "grad_norm": 0.49029213190078735, "learning_rate": 2.6572327044025158e-05, "loss": 0.5847, "num_input_tokens_seen": 657664, "step": 1015 }, { "epoch": 0.5345911949685535, "grad_norm": 1.2149559259414673, "learning_rate": 2.6703354297693924e-05, "loss": 0.7082, "num_input_tokens_seen": 660064, "step": 1020 }, { "epoch": 0.5372117400419287, "grad_norm": 0.9247342348098755, "learning_rate": 2.6834381551362687e-05, "loss": 0.6938, "num_input_tokens_seen": 662880, "step": 1025 }, { "epoch": 0.539832285115304, "grad_norm": 0.6140971779823303, "learning_rate": 2.696540880503145e-05, "loss": 0.5389, "num_input_tokens_seen": 665920, "step": 1030 }, { "epoch": 0.5424528301886793, "grad_norm": 0.7639221549034119, "learning_rate": 2.709643605870021e-05, "loss": 0.5389, "num_input_tokens_seen": 668864, "step": 1035 }, { "epoch": 0.5450733752620545, "grad_norm": 0.733273446559906, "learning_rate": 2.722746331236897e-05, "loss": 0.5166, "num_input_tokens_seen": 671776, "step": 1040 }, { "epoch": 0.5476939203354297, "grad_norm": 0.6812888383865356, "learning_rate": 2.7358490566037738e-05, "loss": 0.6054, "num_input_tokens_seen": 674912, "step": 1045 }, { "epoch": 0.550314465408805, "grad_norm": 0.7785666584968567, "learning_rate": 2.74895178197065e-05, "loss": 0.453, "num_input_tokens_seen": 678368, "step": 1050 }, { "epoch": 0.5529350104821803, "grad_norm": 0.44093772768974304, "learning_rate": 2.7620545073375263e-05, "loss": 0.6597, "num_input_tokens_seen": 682336, "step": 1055 }, { "epoch": 0.5555555555555556, "grad_norm": 1.4687477350234985, "learning_rate": 2.775157232704403e-05, "loss": 0.5572, "num_input_tokens_seen": 685760, "step": 1060 }, { "epoch": 0.5581761006289309, "grad_norm": 0.8561159372329712, "learning_rate": 2.788259958071279e-05, "loss": 0.4969, "num_input_tokens_seen": 688480, "step": 1065 }, { "epoch": 0.560796645702306, "grad_norm": 0.6249496340751648, "learning_rate": 2.801362683438155e-05, "loss": 0.6964, "num_input_tokens_seen": 692064, "step": 1070 }, { "epoch": 0.5634171907756813, "grad_norm": 1.0836408138275146, "learning_rate": 2.8144654088050314e-05, "loss": 0.6726, "num_input_tokens_seen": 694464, "step": 1075 }, { "epoch": 0.5660377358490566, "grad_norm": 0.8967437744140625, "learning_rate": 2.827568134171908e-05, "loss": 0.601, "num_input_tokens_seen": 697344, "step": 1080 }, { "epoch": 0.5686582809224319, "grad_norm": 0.7842375040054321, "learning_rate": 2.8406708595387843e-05, "loss": 0.605, "num_input_tokens_seen": 700256, "step": 1085 }, { "epoch": 0.5712788259958071, "grad_norm": 0.577279269695282, "learning_rate": 2.8537735849056606e-05, "loss": 0.5805, "num_input_tokens_seen": 703968, "step": 1090 }, { "epoch": 0.5738993710691824, "grad_norm": 0.5168815851211548, "learning_rate": 2.8668763102725365e-05, "loss": 0.5803, "num_input_tokens_seen": 708160, "step": 1095 }, { "epoch": 0.5765199161425576, "grad_norm": 0.5478891730308533, "learning_rate": 2.8799790356394128e-05, "loss": 0.7925, "num_input_tokens_seen": 710976, "step": 1100 }, { "epoch": 0.5791404612159329, "grad_norm": 2.2423715591430664, "learning_rate": 2.8930817610062894e-05, "loss": 0.7436, "num_input_tokens_seen": 713696, "step": 1105 }, { "epoch": 0.5817610062893082, "grad_norm": 0.9839460849761963, "learning_rate": 2.9061844863731657e-05, "loss": 0.5664, "num_input_tokens_seen": 716608, "step": 1110 }, { "epoch": 0.5843815513626834, "grad_norm": 0.620229184627533, "learning_rate": 2.9192872117400423e-05, "loss": 0.5548, "num_input_tokens_seen": 719488, "step": 1115 }, { "epoch": 0.5870020964360587, "grad_norm": 0.5705578923225403, "learning_rate": 2.9323899371069186e-05, "loss": 0.4705, "num_input_tokens_seen": 722304, "step": 1120 }, { "epoch": 0.589622641509434, "grad_norm": 1.0039438009262085, "learning_rate": 2.945492662473795e-05, "loss": 0.5127, "num_input_tokens_seen": 725472, "step": 1125 }, { "epoch": 0.5922431865828093, "grad_norm": 0.6039586663246155, "learning_rate": 2.9585953878406708e-05, "loss": 0.4231, "num_input_tokens_seen": 728896, "step": 1130 }, { "epoch": 0.5948637316561844, "grad_norm": 0.5248985290527344, "learning_rate": 2.971698113207547e-05, "loss": 0.5189, "num_input_tokens_seen": 732000, "step": 1135 }, { "epoch": 0.5974842767295597, "grad_norm": 0.8106800317764282, "learning_rate": 2.9848008385744237e-05, "loss": 0.6382, "num_input_tokens_seen": 735328, "step": 1140 }, { "epoch": 0.600104821802935, "grad_norm": 0.6874638199806213, "learning_rate": 2.9979035639413e-05, "loss": 0.4577, "num_input_tokens_seen": 738528, "step": 1145 }, { "epoch": 0.6027253668763103, "grad_norm": 0.5246922373771667, "learning_rate": 3.0110062893081766e-05, "loss": 0.6896, "num_input_tokens_seen": 741376, "step": 1150 }, { "epoch": 0.6053459119496856, "grad_norm": 0.7812998294830322, "learning_rate": 3.024109014675053e-05, "loss": 0.6021, "num_input_tokens_seen": 743872, "step": 1155 }, { "epoch": 0.6079664570230608, "grad_norm": 0.8267132639884949, "learning_rate": 3.0372117400419288e-05, "loss": 0.6348, "num_input_tokens_seen": 746720, "step": 1160 }, { "epoch": 0.610587002096436, "grad_norm": 0.7040634751319885, "learning_rate": 3.050314465408805e-05, "loss": 0.6483, "num_input_tokens_seen": 750240, "step": 1165 }, { "epoch": 0.6132075471698113, "grad_norm": 0.5692875385284424, "learning_rate": 3.063417190775681e-05, "loss": 0.5666, "num_input_tokens_seen": 753344, "step": 1170 }, { "epoch": 0.6158280922431866, "grad_norm": 0.7365718483924866, "learning_rate": 3.076519916142558e-05, "loss": 0.6121, "num_input_tokens_seen": 755968, "step": 1175 }, { "epoch": 0.6184486373165619, "grad_norm": 0.3801325857639313, "learning_rate": 3.0896226415094346e-05, "loss": 0.6306, "num_input_tokens_seen": 759520, "step": 1180 }, { "epoch": 0.6210691823899371, "grad_norm": 0.6412213444709778, "learning_rate": 3.1027253668763105e-05, "loss": 0.5541, "num_input_tokens_seen": 762048, "step": 1185 }, { "epoch": 0.6236897274633124, "grad_norm": 0.8419643044471741, "learning_rate": 3.1158280922431864e-05, "loss": 0.7285, "num_input_tokens_seen": 765504, "step": 1190 }, { "epoch": 0.6263102725366876, "grad_norm": 0.3711233139038086, "learning_rate": 3.128930817610063e-05, "loss": 0.5408, "num_input_tokens_seen": 768192, "step": 1195 }, { "epoch": 0.6289308176100629, "grad_norm": 0.5255857706069946, "learning_rate": 3.142033542976939e-05, "loss": 0.5916, "num_input_tokens_seen": 771168, "step": 1200 }, { "epoch": 0.6315513626834381, "grad_norm": 0.44710761308670044, "learning_rate": 3.1551362683438156e-05, "loss": 0.3517, "num_input_tokens_seen": 777504, "step": 1205 }, { "epoch": 0.6341719077568134, "grad_norm": 0.8805302977561951, "learning_rate": 3.168238993710692e-05, "loss": 0.5453, "num_input_tokens_seen": 780512, "step": 1210 }, { "epoch": 0.6367924528301887, "grad_norm": 0.6647432446479797, "learning_rate": 3.181341719077569e-05, "loss": 0.6035, "num_input_tokens_seen": 784192, "step": 1215 }, { "epoch": 0.639412997903564, "grad_norm": 1.1205782890319824, "learning_rate": 3.194444444444444e-05, "loss": 0.6488, "num_input_tokens_seen": 787552, "step": 1220 }, { "epoch": 0.6420335429769392, "grad_norm": 0.3906605839729309, "learning_rate": 3.207547169811321e-05, "loss": 0.731, "num_input_tokens_seen": 790720, "step": 1225 }, { "epoch": 0.6446540880503144, "grad_norm": 0.8831583857536316, "learning_rate": 3.220649895178197e-05, "loss": 0.6455, "num_input_tokens_seen": 793696, "step": 1230 }, { "epoch": 0.6472746331236897, "grad_norm": 0.7287240624427795, "learning_rate": 3.233752620545073e-05, "loss": 0.653, "num_input_tokens_seen": 796640, "step": 1235 }, { "epoch": 0.649895178197065, "grad_norm": 0.6299283504486084, "learning_rate": 3.24685534591195e-05, "loss": 0.5672, "num_input_tokens_seen": 799648, "step": 1240 }, { "epoch": 0.6525157232704403, "grad_norm": 0.7847397327423096, "learning_rate": 3.2599580712788265e-05, "loss": 0.4039, "num_input_tokens_seen": 802496, "step": 1245 }, { "epoch": 0.6551362683438156, "grad_norm": 0.7174577713012695, "learning_rate": 3.2730607966457024e-05, "loss": 0.6222, "num_input_tokens_seen": 805760, "step": 1250 }, { "epoch": 0.6577568134171907, "grad_norm": 1.045624852180481, "learning_rate": 3.2861635220125784e-05, "loss": 0.6451, "num_input_tokens_seen": 808832, "step": 1255 }, { "epoch": 0.660377358490566, "grad_norm": 0.73774254322052, "learning_rate": 3.299266247379455e-05, "loss": 0.5181, "num_input_tokens_seen": 812256, "step": 1260 }, { "epoch": 0.6629979035639413, "grad_norm": 0.6651224493980408, "learning_rate": 3.3123689727463316e-05, "loss": 0.532, "num_input_tokens_seen": 815680, "step": 1265 }, { "epoch": 0.6656184486373166, "grad_norm": 0.43102386593818665, "learning_rate": 3.3254716981132075e-05, "loss": 0.4596, "num_input_tokens_seen": 818400, "step": 1270 }, { "epoch": 0.6682389937106918, "grad_norm": 0.6134279370307922, "learning_rate": 3.338574423480084e-05, "loss": 0.594, "num_input_tokens_seen": 821536, "step": 1275 }, { "epoch": 0.6708595387840671, "grad_norm": 0.8011902570724487, "learning_rate": 3.351677148846961e-05, "loss": 0.5139, "num_input_tokens_seen": 825024, "step": 1280 }, { "epoch": 0.6734800838574424, "grad_norm": 0.3426799476146698, "learning_rate": 3.364779874213837e-05, "loss": 0.499, "num_input_tokens_seen": 829248, "step": 1285 }, { "epoch": 0.6761006289308176, "grad_norm": 0.4970197081565857, "learning_rate": 3.3778825995807126e-05, "loss": 0.5192, "num_input_tokens_seen": 831872, "step": 1290 }, { "epoch": 0.6787211740041929, "grad_norm": 0.4787292182445526, "learning_rate": 3.390985324947589e-05, "loss": 0.6872, "num_input_tokens_seen": 834880, "step": 1295 }, { "epoch": 0.6813417190775681, "grad_norm": 0.4686105251312256, "learning_rate": 3.404088050314466e-05, "loss": 0.6668, "num_input_tokens_seen": 837824, "step": 1300 }, { "epoch": 0.6839622641509434, "grad_norm": 0.594571590423584, "learning_rate": 3.417190775681342e-05, "loss": 0.6411, "num_input_tokens_seen": 840896, "step": 1305 }, { "epoch": 0.6865828092243187, "grad_norm": 0.6037874817848206, "learning_rate": 3.4302935010482184e-05, "loss": 0.7338, "num_input_tokens_seen": 844000, "step": 1310 }, { "epoch": 0.689203354297694, "grad_norm": 0.21269598603248596, "learning_rate": 3.4433962264150943e-05, "loss": 0.5242, "num_input_tokens_seen": 847840, "step": 1315 }, { "epoch": 0.6918238993710691, "grad_norm": 0.647485077381134, "learning_rate": 3.456498951781971e-05, "loss": 0.6815, "num_input_tokens_seen": 850976, "step": 1320 }, { "epoch": 0.6944444444444444, "grad_norm": 0.5588605999946594, "learning_rate": 3.469601677148847e-05, "loss": 0.5143, "num_input_tokens_seen": 855616, "step": 1325 }, { "epoch": 0.6970649895178197, "grad_norm": 0.4682219922542572, "learning_rate": 3.4827044025157235e-05, "loss": 0.3704, "num_input_tokens_seen": 858528, "step": 1330 }, { "epoch": 0.699685534591195, "grad_norm": 0.7844645380973816, "learning_rate": 3.4958071278826e-05, "loss": 0.5783, "num_input_tokens_seen": 862144, "step": 1335 }, { "epoch": 0.7023060796645703, "grad_norm": 0.3845345079898834, "learning_rate": 3.508909853249476e-05, "loss": 0.6309, "num_input_tokens_seen": 865440, "step": 1340 }, { "epoch": 0.7049266247379455, "grad_norm": 0.4203493893146515, "learning_rate": 3.522012578616352e-05, "loss": 0.5995, "num_input_tokens_seen": 868992, "step": 1345 }, { "epoch": 0.7075471698113207, "grad_norm": 0.9449232816696167, "learning_rate": 3.5351153039832286e-05, "loss": 0.5508, "num_input_tokens_seen": 871968, "step": 1350 }, { "epoch": 0.710167714884696, "grad_norm": 0.6679859757423401, "learning_rate": 3.548218029350105e-05, "loss": 0.5591, "num_input_tokens_seen": 874912, "step": 1355 }, { "epoch": 0.7127882599580713, "grad_norm": 0.4005996882915497, "learning_rate": 3.561320754716981e-05, "loss": 0.584, "num_input_tokens_seen": 878112, "step": 1360 }, { "epoch": 0.7154088050314465, "grad_norm": 0.7610798478126526, "learning_rate": 3.574423480083858e-05, "loss": 0.5548, "num_input_tokens_seen": 880928, "step": 1365 }, { "epoch": 0.7180293501048218, "grad_norm": 0.5369072556495667, "learning_rate": 3.5875262054507344e-05, "loss": 0.5674, "num_input_tokens_seen": 884832, "step": 1370 }, { "epoch": 0.7206498951781971, "grad_norm": 0.39752867817878723, "learning_rate": 3.6006289308176097e-05, "loss": 0.5303, "num_input_tokens_seen": 888800, "step": 1375 }, { "epoch": 0.7232704402515723, "grad_norm": 0.5447391271591187, "learning_rate": 3.613731656184486e-05, "loss": 0.5481, "num_input_tokens_seen": 892416, "step": 1380 }, { "epoch": 0.7258909853249476, "grad_norm": 0.6966484189033508, "learning_rate": 3.626834381551363e-05, "loss": 0.6586, "num_input_tokens_seen": 895840, "step": 1385 }, { "epoch": 0.7285115303983228, "grad_norm": 0.45338210463523865, "learning_rate": 3.6399371069182395e-05, "loss": 0.588, "num_input_tokens_seen": 899392, "step": 1390 }, { "epoch": 0.7311320754716981, "grad_norm": 0.44898852705955505, "learning_rate": 3.6530398322851154e-05, "loss": 0.5209, "num_input_tokens_seen": 903264, "step": 1395 }, { "epoch": 0.7337526205450734, "grad_norm": 0.48714813590049744, "learning_rate": 3.666142557651992e-05, "loss": 0.5849, "num_input_tokens_seen": 906016, "step": 1400 }, { "epoch": 0.7363731656184487, "grad_norm": 0.9300102591514587, "learning_rate": 3.679245283018868e-05, "loss": 0.6601, "num_input_tokens_seen": 909152, "step": 1405 }, { "epoch": 0.7389937106918238, "grad_norm": 0.4774525761604309, "learning_rate": 3.692348008385744e-05, "loss": 0.6278, "num_input_tokens_seen": 912416, "step": 1410 }, { "epoch": 0.7416142557651991, "grad_norm": 0.6069843173027039, "learning_rate": 3.7054507337526205e-05, "loss": 0.7144, "num_input_tokens_seen": 915616, "step": 1415 }, { "epoch": 0.7442348008385744, "grad_norm": 0.8977115750312805, "learning_rate": 3.718553459119497e-05, "loss": 0.5979, "num_input_tokens_seen": 921568, "step": 1420 }, { "epoch": 0.7468553459119497, "grad_norm": 0.38874050974845886, "learning_rate": 3.731656184486374e-05, "loss": 0.5074, "num_input_tokens_seen": 924864, "step": 1425 }, { "epoch": 0.749475890985325, "grad_norm": 0.7666509747505188, "learning_rate": 3.74475890985325e-05, "loss": 0.5433, "num_input_tokens_seen": 927968, "step": 1430 }, { "epoch": 0.7520964360587002, "grad_norm": 0.4994032084941864, "learning_rate": 3.757861635220126e-05, "loss": 0.561, "num_input_tokens_seen": 931584, "step": 1435 }, { "epoch": 0.7547169811320755, "grad_norm": 0.7527554035186768, "learning_rate": 3.770964360587002e-05, "loss": 0.541, "num_input_tokens_seen": 934816, "step": 1440 }, { "epoch": 0.7573375262054507, "grad_norm": 0.7286273241043091, "learning_rate": 3.784067085953878e-05, "loss": 0.6073, "num_input_tokens_seen": 937856, "step": 1445 }, { "epoch": 0.759958071278826, "grad_norm": 0.7228736877441406, "learning_rate": 3.797169811320755e-05, "loss": 0.5862, "num_input_tokens_seen": 945632, "step": 1450 }, { "epoch": 0.7625786163522013, "grad_norm": 0.6321483254432678, "learning_rate": 3.8102725366876314e-05, "loss": 0.6612, "num_input_tokens_seen": 948608, "step": 1455 }, { "epoch": 0.7651991614255765, "grad_norm": 0.5263976454734802, "learning_rate": 3.8233752620545074e-05, "loss": 0.5477, "num_input_tokens_seen": 951488, "step": 1460 }, { "epoch": 0.7678197064989518, "grad_norm": 0.6145146489143372, "learning_rate": 3.836477987421384e-05, "loss": 0.6026, "num_input_tokens_seen": 955072, "step": 1465 }, { "epoch": 0.7704402515723271, "grad_norm": 0.8468273878097534, "learning_rate": 3.84958071278826e-05, "loss": 0.6156, "num_input_tokens_seen": 958272, "step": 1470 }, { "epoch": 0.7730607966457023, "grad_norm": 0.4553072452545166, "learning_rate": 3.8626834381551365e-05, "loss": 0.7014, "num_input_tokens_seen": 961888, "step": 1475 }, { "epoch": 0.7756813417190775, "grad_norm": 1.1358648538589478, "learning_rate": 3.8757861635220125e-05, "loss": 0.772, "num_input_tokens_seen": 964256, "step": 1480 }, { "epoch": 0.7783018867924528, "grad_norm": 0.6559430360794067, "learning_rate": 3.888888888888889e-05, "loss": 0.5739, "num_input_tokens_seen": 968224, "step": 1485 }, { "epoch": 0.7809224318658281, "grad_norm": 0.43443799018859863, "learning_rate": 3.901991614255766e-05, "loss": 0.5404, "num_input_tokens_seen": 971136, "step": 1490 }, { "epoch": 0.7835429769392034, "grad_norm": 0.5430450439453125, "learning_rate": 3.9150943396226416e-05, "loss": 0.5668, "num_input_tokens_seen": 975936, "step": 1495 }, { "epoch": 0.7861635220125787, "grad_norm": 0.5874946713447571, "learning_rate": 3.9281970649895176e-05, "loss": 0.5962, "num_input_tokens_seen": 979360, "step": 1500 }, { "epoch": 0.7887840670859538, "grad_norm": 0.2353808879852295, "learning_rate": 3.941299790356394e-05, "loss": 0.5142, "num_input_tokens_seen": 983616, "step": 1505 }, { "epoch": 0.7914046121593291, "grad_norm": 0.5022816061973572, "learning_rate": 3.954402515723271e-05, "loss": 0.5585, "num_input_tokens_seen": 986944, "step": 1510 }, { "epoch": 0.7940251572327044, "grad_norm": 0.7295569777488708, "learning_rate": 3.967505241090147e-05, "loss": 0.5807, "num_input_tokens_seen": 989888, "step": 1515 }, { "epoch": 0.7966457023060797, "grad_norm": 0.5181534886360168, "learning_rate": 3.9806079664570233e-05, "loss": 0.5744, "num_input_tokens_seen": 992768, "step": 1520 }, { "epoch": 0.799266247379455, "grad_norm": 0.45893797278404236, "learning_rate": 3.9937106918239e-05, "loss": 0.4601, "num_input_tokens_seen": 996896, "step": 1525 }, { "epoch": 0.8018867924528302, "grad_norm": 0.42643842101097107, "learning_rate": 4.006813417190776e-05, "loss": 0.6013, "num_input_tokens_seen": 999840, "step": 1530 }, { "epoch": 0.8045073375262054, "grad_norm": 0.4358137845993042, "learning_rate": 4.019916142557652e-05, "loss": 0.5097, "num_input_tokens_seen": 1003680, "step": 1535 }, { "epoch": 0.8071278825995807, "grad_norm": 0.52120441198349, "learning_rate": 4.0330188679245284e-05, "loss": 0.5061, "num_input_tokens_seen": 1006656, "step": 1540 }, { "epoch": 0.809748427672956, "grad_norm": 1.2906550168991089, "learning_rate": 4.046121593291405e-05, "loss": 0.44, "num_input_tokens_seen": 1008768, "step": 1545 }, { "epoch": 0.8123689727463312, "grad_norm": 0.35363706946372986, "learning_rate": 4.059224318658281e-05, "loss": 0.5367, "num_input_tokens_seen": 1011968, "step": 1550 }, { "epoch": 0.8149895178197065, "grad_norm": 0.9847720265388489, "learning_rate": 4.0723270440251576e-05, "loss": 0.6015, "num_input_tokens_seen": 1018784, "step": 1555 }, { "epoch": 0.8176100628930818, "grad_norm": 0.32934921979904175, "learning_rate": 4.0854297693920336e-05, "loss": 0.3808, "num_input_tokens_seen": 1023520, "step": 1560 }, { "epoch": 0.820230607966457, "grad_norm": 0.6365347504615784, "learning_rate": 4.09853249475891e-05, "loss": 0.5009, "num_input_tokens_seen": 1027200, "step": 1565 }, { "epoch": 0.8228511530398323, "grad_norm": 0.29569166898727417, "learning_rate": 4.111635220125786e-05, "loss": 0.8658, "num_input_tokens_seen": 1030848, "step": 1570 }, { "epoch": 0.8254716981132075, "grad_norm": 0.5301792025566101, "learning_rate": 4.124737945492663e-05, "loss": 0.596, "num_input_tokens_seen": 1035232, "step": 1575 }, { "epoch": 0.8280922431865828, "grad_norm": 0.7776142954826355, "learning_rate": 4.137840670859539e-05, "loss": 0.5546, "num_input_tokens_seen": 1038176, "step": 1580 }, { "epoch": 0.8307127882599581, "grad_norm": 1.0511094331741333, "learning_rate": 4.150943396226415e-05, "loss": 0.5197, "num_input_tokens_seen": 1041216, "step": 1585 }, { "epoch": 0.8333333333333334, "grad_norm": 0.4908311367034912, "learning_rate": 4.164046121593291e-05, "loss": 0.5802, "num_input_tokens_seen": 1043840, "step": 1590 }, { "epoch": 0.8359538784067087, "grad_norm": 0.4208201766014099, "learning_rate": 4.177148846960168e-05, "loss": 0.4585, "num_input_tokens_seen": 1046656, "step": 1595 }, { "epoch": 0.8385744234800838, "grad_norm": 0.5173287987709045, "learning_rate": 4.1902515723270444e-05, "loss": 0.4823, "num_input_tokens_seen": 1051616, "step": 1600 }, { "epoch": 0.8411949685534591, "grad_norm": 0.6983963847160339, "learning_rate": 4.2033542976939204e-05, "loss": 0.703, "num_input_tokens_seen": 1053824, "step": 1605 }, { "epoch": 0.8438155136268344, "grad_norm": 0.6305585503578186, "learning_rate": 4.216457023060797e-05, "loss": 0.537, "num_input_tokens_seen": 1056512, "step": 1610 }, { "epoch": 0.8464360587002097, "grad_norm": 0.3073955774307251, "learning_rate": 4.2295597484276736e-05, "loss": 0.5349, "num_input_tokens_seen": 1060160, "step": 1615 }, { "epoch": 0.8490566037735849, "grad_norm": 0.6677712798118591, "learning_rate": 4.2426624737945495e-05, "loss": 0.7002, "num_input_tokens_seen": 1062880, "step": 1620 }, { "epoch": 0.8516771488469602, "grad_norm": 0.4843500256538391, "learning_rate": 4.2557651991614255e-05, "loss": 0.6494, "num_input_tokens_seen": 1066528, "step": 1625 }, { "epoch": 0.8542976939203354, "grad_norm": 0.5640972852706909, "learning_rate": 4.268867924528302e-05, "loss": 0.4495, "num_input_tokens_seen": 1069472, "step": 1630 }, { "epoch": 0.8569182389937107, "grad_norm": 0.4987848699092865, "learning_rate": 4.281970649895179e-05, "loss": 0.5506, "num_input_tokens_seen": 1074304, "step": 1635 }, { "epoch": 0.859538784067086, "grad_norm": 1.0020112991333008, "learning_rate": 4.2950733752620546e-05, "loss": 0.4992, "num_input_tokens_seen": 1077376, "step": 1640 }, { "epoch": 0.8621593291404612, "grad_norm": 0.4737785756587982, "learning_rate": 4.308176100628931e-05, "loss": 0.5566, "num_input_tokens_seen": 1080672, "step": 1645 }, { "epoch": 0.8647798742138365, "grad_norm": 0.5034939050674438, "learning_rate": 4.321278825995808e-05, "loss": 0.5988, "num_input_tokens_seen": 1084224, "step": 1650 }, { "epoch": 0.8674004192872118, "grad_norm": 0.607521116733551, "learning_rate": 4.334381551362683e-05, "loss": 0.751, "num_input_tokens_seen": 1086912, "step": 1655 }, { "epoch": 0.870020964360587, "grad_norm": 0.5174294114112854, "learning_rate": 4.34748427672956e-05, "loss": 0.6403, "num_input_tokens_seen": 1089696, "step": 1660 }, { "epoch": 0.8726415094339622, "grad_norm": 0.6405696272850037, "learning_rate": 4.3605870020964364e-05, "loss": 0.4981, "num_input_tokens_seen": 1092608, "step": 1665 }, { "epoch": 0.8752620545073375, "grad_norm": 0.312179833650589, "learning_rate": 4.373689727463312e-05, "loss": 0.7354, "num_input_tokens_seen": 1095808, "step": 1670 }, { "epoch": 0.8778825995807128, "grad_norm": 0.4397551715373993, "learning_rate": 4.386792452830189e-05, "loss": 0.6214, "num_input_tokens_seen": 1098816, "step": 1675 }, { "epoch": 0.8805031446540881, "grad_norm": 0.5614981651306152, "learning_rate": 4.3998951781970655e-05, "loss": 0.6165, "num_input_tokens_seen": 1103136, "step": 1680 }, { "epoch": 0.8831236897274634, "grad_norm": 0.38898324966430664, "learning_rate": 4.4129979035639415e-05, "loss": 0.5117, "num_input_tokens_seen": 1107872, "step": 1685 }, { "epoch": 0.8857442348008385, "grad_norm": 0.5529558658599854, "learning_rate": 4.4261006289308174e-05, "loss": 0.6631, "num_input_tokens_seen": 1111072, "step": 1690 }, { "epoch": 0.8883647798742138, "grad_norm": 1.270159125328064, "learning_rate": 4.439203354297694e-05, "loss": 0.6312, "num_input_tokens_seen": 1113792, "step": 1695 }, { "epoch": 0.8909853249475891, "grad_norm": 0.7984760403633118, "learning_rate": 4.4523060796645706e-05, "loss": 0.553, "num_input_tokens_seen": 1116384, "step": 1700 }, { "epoch": 0.8936058700209644, "grad_norm": 0.3527769446372986, "learning_rate": 4.4654088050314466e-05, "loss": 0.5262, "num_input_tokens_seen": 1119296, "step": 1705 }, { "epoch": 0.8962264150943396, "grad_norm": 0.4520549476146698, "learning_rate": 4.478511530398323e-05, "loss": 0.6501, "num_input_tokens_seen": 1122720, "step": 1710 }, { "epoch": 0.8988469601677149, "grad_norm": 1.542492151260376, "learning_rate": 4.491614255765199e-05, "loss": 0.7045, "num_input_tokens_seen": 1125088, "step": 1715 }, { "epoch": 0.9014675052410901, "grad_norm": 0.5538150072097778, "learning_rate": 4.504716981132076e-05, "loss": 0.6988, "num_input_tokens_seen": 1128288, "step": 1720 }, { "epoch": 0.9040880503144654, "grad_norm": 1.0549224615097046, "learning_rate": 4.517819706498952e-05, "loss": 0.6604, "num_input_tokens_seen": 1131040, "step": 1725 }, { "epoch": 0.9067085953878407, "grad_norm": 0.306679368019104, "learning_rate": 4.530922431865828e-05, "loss": 0.565, "num_input_tokens_seen": 1135136, "step": 1730 }, { "epoch": 0.9093291404612159, "grad_norm": 0.47552964091300964, "learning_rate": 4.544025157232705e-05, "loss": 0.7072, "num_input_tokens_seen": 1139008, "step": 1735 }, { "epoch": 0.9119496855345912, "grad_norm": 0.4989221692085266, "learning_rate": 4.557127882599581e-05, "loss": 0.5145, "num_input_tokens_seen": 1141920, "step": 1740 }, { "epoch": 0.9145702306079665, "grad_norm": 0.6842064261436462, "learning_rate": 4.570230607966457e-05, "loss": 0.5343, "num_input_tokens_seen": 1144960, "step": 1745 }, { "epoch": 0.9171907756813418, "grad_norm": 0.5094396471977234, "learning_rate": 4.5833333333333334e-05, "loss": 0.6131, "num_input_tokens_seen": 1147712, "step": 1750 }, { "epoch": 0.9198113207547169, "grad_norm": 0.4891991913318634, "learning_rate": 4.59643605870021e-05, "loss": 0.5841, "num_input_tokens_seen": 1150528, "step": 1755 }, { "epoch": 0.9224318658280922, "grad_norm": 2.730097770690918, "learning_rate": 4.609538784067086e-05, "loss": 0.6221, "num_input_tokens_seen": 1154272, "step": 1760 }, { "epoch": 0.9250524109014675, "grad_norm": 0.9352796673774719, "learning_rate": 4.6226415094339625e-05, "loss": 0.5437, "num_input_tokens_seen": 1157088, "step": 1765 }, { "epoch": 0.9276729559748428, "grad_norm": 0.40597930550575256, "learning_rate": 4.635744234800839e-05, "loss": 0.6903, "num_input_tokens_seen": 1160192, "step": 1770 }, { "epoch": 0.9302935010482181, "grad_norm": 0.48425528407096863, "learning_rate": 4.648846960167715e-05, "loss": 0.5869, "num_input_tokens_seen": 1163424, "step": 1775 }, { "epoch": 0.9329140461215933, "grad_norm": 0.37191665172576904, "learning_rate": 4.661949685534591e-05, "loss": 0.6287, "num_input_tokens_seen": 1167104, "step": 1780 }, { "epoch": 0.9355345911949685, "grad_norm": 0.8180853128433228, "learning_rate": 4.6750524109014677e-05, "loss": 0.5678, "num_input_tokens_seen": 1170368, "step": 1785 }, { "epoch": 0.9381551362683438, "grad_norm": 0.4892943799495697, "learning_rate": 4.688155136268344e-05, "loss": 0.6749, "num_input_tokens_seen": 1173888, "step": 1790 }, { "epoch": 0.9407756813417191, "grad_norm": 0.39440101385116577, "learning_rate": 4.70125786163522e-05, "loss": 0.5278, "num_input_tokens_seen": 1177408, "step": 1795 }, { "epoch": 0.9433962264150944, "grad_norm": 0.6425689458847046, "learning_rate": 4.714360587002097e-05, "loss": 0.5359, "num_input_tokens_seen": 1181056, "step": 1800 }, { "epoch": 0.9460167714884696, "grad_norm": 0.5546563267707825, "learning_rate": 4.7274633123689734e-05, "loss": 0.7413, "num_input_tokens_seen": 1184256, "step": 1805 }, { "epoch": 0.9486373165618449, "grad_norm": 0.683784544467926, "learning_rate": 4.7405660377358494e-05, "loss": 0.4884, "num_input_tokens_seen": 1187168, "step": 1810 }, { "epoch": 0.9512578616352201, "grad_norm": 0.32827168703079224, "learning_rate": 4.753668763102725e-05, "loss": 0.4924, "num_input_tokens_seen": 1190784, "step": 1815 }, { "epoch": 0.9538784067085954, "grad_norm": 0.5919519662857056, "learning_rate": 4.766771488469602e-05, "loss": 0.6385, "num_input_tokens_seen": 1193312, "step": 1820 }, { "epoch": 0.9564989517819706, "grad_norm": 0.4357806146144867, "learning_rate": 4.7798742138364785e-05, "loss": 0.5717, "num_input_tokens_seen": 1196608, "step": 1825 }, { "epoch": 0.9591194968553459, "grad_norm": 0.609332263469696, "learning_rate": 4.7929769392033545e-05, "loss": 0.6149, "num_input_tokens_seen": 1200160, "step": 1830 }, { "epoch": 0.9617400419287212, "grad_norm": 0.8378402590751648, "learning_rate": 4.806079664570231e-05, "loss": 0.468, "num_input_tokens_seen": 1203040, "step": 1835 }, { "epoch": 0.9643605870020965, "grad_norm": 0.3810708224773407, "learning_rate": 4.819182389937107e-05, "loss": 0.4227, "num_input_tokens_seen": 1205408, "step": 1840 }, { "epoch": 0.9669811320754716, "grad_norm": 1.502177119255066, "learning_rate": 4.8322851153039836e-05, "loss": 0.4154, "num_input_tokens_seen": 1208896, "step": 1845 }, { "epoch": 0.9696016771488469, "grad_norm": 0.3638145327568054, "learning_rate": 4.8453878406708596e-05, "loss": 0.5422, "num_input_tokens_seen": 1212576, "step": 1850 }, { "epoch": 0.9722222222222222, "grad_norm": 0.35817283391952515, "learning_rate": 4.858490566037736e-05, "loss": 0.5334, "num_input_tokens_seen": 1215872, "step": 1855 }, { "epoch": 0.9748427672955975, "grad_norm": 0.6571997404098511, "learning_rate": 4.871593291404613e-05, "loss": 0.5207, "num_input_tokens_seen": 1219968, "step": 1860 }, { "epoch": 0.9774633123689728, "grad_norm": 0.3670083284378052, "learning_rate": 4.884696016771489e-05, "loss": 0.5759, "num_input_tokens_seen": 1222048, "step": 1865 }, { "epoch": 0.980083857442348, "grad_norm": 0.629447340965271, "learning_rate": 4.897798742138365e-05, "loss": 0.5988, "num_input_tokens_seen": 1225120, "step": 1870 }, { "epoch": 0.9827044025157232, "grad_norm": 0.3874313533306122, "learning_rate": 4.910901467505241e-05, "loss": 0.565, "num_input_tokens_seen": 1229056, "step": 1875 }, { "epoch": 0.9853249475890985, "grad_norm": 0.6902626752853394, "learning_rate": 4.924004192872117e-05, "loss": 0.4398, "num_input_tokens_seen": 1231776, "step": 1880 }, { "epoch": 0.9879454926624738, "grad_norm": 0.5371819734573364, "learning_rate": 4.937106918238994e-05, "loss": 0.5289, "num_input_tokens_seen": 1235040, "step": 1885 }, { "epoch": 0.9905660377358491, "grad_norm": 0.6145362854003906, "learning_rate": 4.9502096436058705e-05, "loss": 0.4915, "num_input_tokens_seen": 1238048, "step": 1890 }, { "epoch": 0.9931865828092243, "grad_norm": 0.3926757872104645, "learning_rate": 4.963312368972747e-05, "loss": 0.5874, "num_input_tokens_seen": 1241344, "step": 1895 }, { "epoch": 0.9958071278825996, "grad_norm": 0.5208534598350525, "learning_rate": 4.976415094339622e-05, "loss": 0.4305, "num_input_tokens_seen": 1243840, "step": 1900 }, { "epoch": 0.9984276729559748, "grad_norm": 0.4232868254184723, "learning_rate": 4.989517819706499e-05, "loss": 0.6347, "num_input_tokens_seen": 1246816, "step": 1905 }, { "epoch": 1.0, "eval_loss": 0.5513554215431213, "eval_runtime": 14.5416, "eval_samples_per_second": 58.315, "eval_steps_per_second": 14.579, "num_input_tokens_seen": 1248304, "step": 1908 }, { "epoch": 1.00104821802935, "grad_norm": 0.6163535714149475, "learning_rate": 4.9999999581622816e-05, "loss": 0.5443, "num_input_tokens_seen": 1249200, "step": 1910 }, { "epoch": 1.0036687631027255, "grad_norm": 1.2550445795059204, "learning_rate": 4.999998493842267e-05, "loss": 0.544, "num_input_tokens_seen": 1251664, "step": 1915 }, { "epoch": 1.0062893081761006, "grad_norm": 0.3217064440250397, "learning_rate": 4.999994937637709e-05, "loss": 0.4834, "num_input_tokens_seen": 1254928, "step": 1920 }, { "epoch": 1.0089098532494758, "grad_norm": 0.841200590133667, "learning_rate": 4.999989289551581e-05, "loss": 0.4344, "num_input_tokens_seen": 1257744, "step": 1925 }, { "epoch": 1.0115303983228512, "grad_norm": 0.7548254728317261, "learning_rate": 4.999981549588612e-05, "loss": 0.4847, "num_input_tokens_seen": 1261904, "step": 1930 }, { "epoch": 1.0141509433962264, "grad_norm": 0.42920443415641785, "learning_rate": 4.9999717177552764e-05, "loss": 0.4322, "num_input_tokens_seen": 1264432, "step": 1935 }, { "epoch": 1.0167714884696017, "grad_norm": 0.7182300090789795, "learning_rate": 4.999959794059801e-05, "loss": 0.6157, "num_input_tokens_seen": 1268400, "step": 1940 }, { "epoch": 1.019392033542977, "grad_norm": 0.4919317662715912, "learning_rate": 4.999945778512164e-05, "loss": 0.4774, "num_input_tokens_seen": 1272752, "step": 1945 }, { "epoch": 1.0220125786163523, "grad_norm": 0.5247616171836853, "learning_rate": 4.999929671124093e-05, "loss": 0.6278, "num_input_tokens_seen": 1275568, "step": 1950 }, { "epoch": 1.0246331236897275, "grad_norm": 0.41907086968421936, "learning_rate": 4.9999114719090645e-05, "loss": 0.5953, "num_input_tokens_seen": 1278160, "step": 1955 }, { "epoch": 1.0272536687631026, "grad_norm": 0.5695929527282715, "learning_rate": 4.999891180882308e-05, "loss": 0.5268, "num_input_tokens_seen": 1281424, "step": 1960 }, { "epoch": 1.029874213836478, "grad_norm": 0.4340973198413849, "learning_rate": 4.9998687980608014e-05, "loss": 0.5274, "num_input_tokens_seen": 1283920, "step": 1965 }, { "epoch": 1.0324947589098532, "grad_norm": 0.6605707406997681, "learning_rate": 4.9998443234632744e-05, "loss": 0.4965, "num_input_tokens_seen": 1287920, "step": 1970 }, { "epoch": 1.0351153039832286, "grad_norm": 1.826466679573059, "learning_rate": 4.999817757110206e-05, "loss": 0.5619, "num_input_tokens_seen": 1290960, "step": 1975 }, { "epoch": 1.0377358490566038, "grad_norm": 0.4888373613357544, "learning_rate": 4.999789099023826e-05, "loss": 0.4197, "num_input_tokens_seen": 1294544, "step": 1980 }, { "epoch": 1.040356394129979, "grad_norm": 0.3377688229084015, "learning_rate": 4.9997583492281126e-05, "loss": 0.6794, "num_input_tokens_seen": 1298160, "step": 1985 }, { "epoch": 1.0429769392033543, "grad_norm": 0.5094422698020935, "learning_rate": 4.999725507748798e-05, "loss": 0.5515, "num_input_tokens_seen": 1300368, "step": 1990 }, { "epoch": 1.0455974842767295, "grad_norm": 0.2613098621368408, "learning_rate": 4.9996905746133606e-05, "loss": 0.457, "num_input_tokens_seen": 1307184, "step": 1995 }, { "epoch": 1.0482180293501049, "grad_norm": 0.5481349229812622, "learning_rate": 4.999653549851032e-05, "loss": 0.8311, "num_input_tokens_seen": 1309840, "step": 2000 }, { "epoch": 1.05083857442348, "grad_norm": 0.6606416702270508, "learning_rate": 4.999614433492792e-05, "loss": 0.4988, "num_input_tokens_seen": 1313648, "step": 2005 }, { "epoch": 1.0534591194968554, "grad_norm": 0.48651671409606934, "learning_rate": 4.9995732255713725e-05, "loss": 0.5156, "num_input_tokens_seen": 1316496, "step": 2010 }, { "epoch": 1.0560796645702306, "grad_norm": 1.0027391910552979, "learning_rate": 4.9995299261212536e-05, "loss": 0.5443, "num_input_tokens_seen": 1319952, "step": 2015 }, { "epoch": 1.0587002096436058, "grad_norm": 0.4322044253349304, "learning_rate": 4.999484535178667e-05, "loss": 0.4795, "num_input_tokens_seen": 1323344, "step": 2020 }, { "epoch": 1.0613207547169812, "grad_norm": 0.5271179676055908, "learning_rate": 4.9994370527815925e-05, "loss": 0.5695, "num_input_tokens_seen": 1326704, "step": 2025 }, { "epoch": 1.0639412997903563, "grad_norm": 0.45615991950035095, "learning_rate": 4.999387478969762e-05, "loss": 0.5232, "num_input_tokens_seen": 1330128, "step": 2030 }, { "epoch": 1.0665618448637317, "grad_norm": 0.48930373787879944, "learning_rate": 4.999335813784657e-05, "loss": 0.4662, "num_input_tokens_seen": 1333040, "step": 2035 }, { "epoch": 1.069182389937107, "grad_norm": 0.5437284111976624, "learning_rate": 4.999282057269508e-05, "loss": 0.5302, "num_input_tokens_seen": 1335504, "step": 2040 }, { "epoch": 1.0718029350104823, "grad_norm": 0.834537148475647, "learning_rate": 4.999226209469295e-05, "loss": 0.5613, "num_input_tokens_seen": 1339280, "step": 2045 }, { "epoch": 1.0744234800838575, "grad_norm": 0.7194111943244934, "learning_rate": 4.999168270430752e-05, "loss": 0.597, "num_input_tokens_seen": 1342576, "step": 2050 }, { "epoch": 1.0770440251572326, "grad_norm": 0.4977926015853882, "learning_rate": 4.999108240202356e-05, "loss": 0.5025, "num_input_tokens_seen": 1346256, "step": 2055 }, { "epoch": 1.079664570230608, "grad_norm": 0.758943498134613, "learning_rate": 4.999046118834341e-05, "loss": 0.6224, "num_input_tokens_seen": 1349008, "step": 2060 }, { "epoch": 1.0822851153039832, "grad_norm": 0.5003901124000549, "learning_rate": 4.998981906378684e-05, "loss": 0.574, "num_input_tokens_seen": 1353552, "step": 2065 }, { "epoch": 1.0849056603773586, "grad_norm": 0.6382290720939636, "learning_rate": 4.998915602889117e-05, "loss": 0.5023, "num_input_tokens_seen": 1356400, "step": 2070 }, { "epoch": 1.0875262054507338, "grad_norm": 0.7033169865608215, "learning_rate": 4.9988472084211203e-05, "loss": 0.564, "num_input_tokens_seen": 1361008, "step": 2075 }, { "epoch": 1.090146750524109, "grad_norm": 1.4291706085205078, "learning_rate": 4.9987767230319215e-05, "loss": 0.625, "num_input_tokens_seen": 1363184, "step": 2080 }, { "epoch": 1.0927672955974843, "grad_norm": 0.8402513861656189, "learning_rate": 4.998704146780501e-05, "loss": 0.5127, "num_input_tokens_seen": 1365872, "step": 2085 }, { "epoch": 1.0953878406708595, "grad_norm": 0.4538615942001343, "learning_rate": 4.9986294797275857e-05, "loss": 0.5889, "num_input_tokens_seen": 1369168, "step": 2090 }, { "epoch": 1.0980083857442349, "grad_norm": 0.4194008708000183, "learning_rate": 4.9985527219356554e-05, "loss": 0.5044, "num_input_tokens_seen": 1372208, "step": 2095 }, { "epoch": 1.10062893081761, "grad_norm": 0.4823490083217621, "learning_rate": 4.998473873468937e-05, "loss": 0.4152, "num_input_tokens_seen": 1377360, "step": 2100 }, { "epoch": 1.1032494758909852, "grad_norm": 0.6674350500106812, "learning_rate": 4.998392934393407e-05, "loss": 0.4572, "num_input_tokens_seen": 1380400, "step": 2105 }, { "epoch": 1.1058700209643606, "grad_norm": 0.7129555940628052, "learning_rate": 4.9983099047767905e-05, "loss": 0.516, "num_input_tokens_seen": 1383056, "step": 2110 }, { "epoch": 1.1084905660377358, "grad_norm": 0.5313023924827576, "learning_rate": 4.9982247846885644e-05, "loss": 0.6197, "num_input_tokens_seen": 1386160, "step": 2115 }, { "epoch": 1.1111111111111112, "grad_norm": 0.5617659091949463, "learning_rate": 4.9981375741999534e-05, "loss": 0.4759, "num_input_tokens_seen": 1389232, "step": 2120 }, { "epoch": 1.1137316561844863, "grad_norm": 0.41282889246940613, "learning_rate": 4.99804827338393e-05, "loss": 0.6262, "num_input_tokens_seen": 1393328, "step": 2125 }, { "epoch": 1.1163522012578617, "grad_norm": 0.4381900727748871, "learning_rate": 4.997956882315218e-05, "loss": 0.4625, "num_input_tokens_seen": 1396816, "step": 2130 }, { "epoch": 1.118972746331237, "grad_norm": 0.3869813084602356, "learning_rate": 4.997863401070289e-05, "loss": 0.4461, "num_input_tokens_seen": 1400176, "step": 2135 }, { "epoch": 1.121593291404612, "grad_norm": 0.5406429171562195, "learning_rate": 4.9977678297273634e-05, "loss": 0.6212, "num_input_tokens_seen": 1403408, "step": 2140 }, { "epoch": 1.1242138364779874, "grad_norm": 0.44504985213279724, "learning_rate": 4.997670168366412e-05, "loss": 0.4945, "num_input_tokens_seen": 1406448, "step": 2145 }, { "epoch": 1.1268343815513626, "grad_norm": 1.5524051189422607, "learning_rate": 4.997570417069152e-05, "loss": 0.569, "num_input_tokens_seen": 1409584, "step": 2150 }, { "epoch": 1.129454926624738, "grad_norm": 0.6517429947853088, "learning_rate": 4.997468575919052e-05, "loss": 0.5022, "num_input_tokens_seen": 1413872, "step": 2155 }, { "epoch": 1.1320754716981132, "grad_norm": 0.4997948110103607, "learning_rate": 4.9973646450013275e-05, "loss": 0.6824, "num_input_tokens_seen": 1416528, "step": 2160 }, { "epoch": 1.1346960167714886, "grad_norm": 0.6950650811195374, "learning_rate": 4.997258624402943e-05, "loss": 0.516, "num_input_tokens_seen": 1418896, "step": 2165 }, { "epoch": 1.1373165618448637, "grad_norm": 0.6713169813156128, "learning_rate": 4.997150514212611e-05, "loss": 0.5809, "num_input_tokens_seen": 1421712, "step": 2170 }, { "epoch": 1.139937106918239, "grad_norm": 0.4223501682281494, "learning_rate": 4.997040314520795e-05, "loss": 0.5066, "num_input_tokens_seen": 1425072, "step": 2175 }, { "epoch": 1.1425576519916143, "grad_norm": 0.3252376317977905, "learning_rate": 4.9969280254197035e-05, "loss": 0.693, "num_input_tokens_seen": 1428848, "step": 2180 }, { "epoch": 1.1451781970649895, "grad_norm": 0.7856203317642212, "learning_rate": 4.996813647003296e-05, "loss": 0.5961, "num_input_tokens_seen": 1432432, "step": 2185 }, { "epoch": 1.1477987421383649, "grad_norm": 0.552338719367981, "learning_rate": 4.9966971793672784e-05, "loss": 0.6969, "num_input_tokens_seen": 1436048, "step": 2190 }, { "epoch": 1.15041928721174, "grad_norm": 0.8672226071357727, "learning_rate": 4.9965786226091054e-05, "loss": 0.6777, "num_input_tokens_seen": 1440144, "step": 2195 }, { "epoch": 1.1530398322851152, "grad_norm": 0.5213975310325623, "learning_rate": 4.9964579768279803e-05, "loss": 0.6217, "num_input_tokens_seen": 1443760, "step": 2200 }, { "epoch": 1.1556603773584906, "grad_norm": 0.5117209553718567, "learning_rate": 4.996335242124854e-05, "loss": 0.8145, "num_input_tokens_seen": 1446864, "step": 2205 }, { "epoch": 1.1582809224318658, "grad_norm": 0.6119126081466675, "learning_rate": 4.996210418602425e-05, "loss": 0.3574, "num_input_tokens_seen": 1449456, "step": 2210 }, { "epoch": 1.1609014675052411, "grad_norm": 0.5385763049125671, "learning_rate": 4.99608350636514e-05, "loss": 0.6041, "num_input_tokens_seen": 1452944, "step": 2215 }, { "epoch": 1.1635220125786163, "grad_norm": 0.3152080178260803, "learning_rate": 4.995954505519193e-05, "loss": 0.5023, "num_input_tokens_seen": 1455952, "step": 2220 }, { "epoch": 1.1661425576519917, "grad_norm": 0.4456504285335541, "learning_rate": 4.995823416172527e-05, "loss": 0.6383, "num_input_tokens_seen": 1459376, "step": 2225 }, { "epoch": 1.1687631027253669, "grad_norm": 0.49261006712913513, "learning_rate": 4.995690238434831e-05, "loss": 0.5149, "num_input_tokens_seen": 1462352, "step": 2230 }, { "epoch": 1.171383647798742, "grad_norm": 0.5769947171211243, "learning_rate": 4.995554972417541e-05, "loss": 0.6122, "num_input_tokens_seen": 1465104, "step": 2235 }, { "epoch": 1.1740041928721174, "grad_norm": 0.42984211444854736, "learning_rate": 4.995417618233844e-05, "loss": 0.5959, "num_input_tokens_seen": 1468624, "step": 2240 }, { "epoch": 1.1766247379454926, "grad_norm": 0.46592721343040466, "learning_rate": 4.9952781759986694e-05, "loss": 0.4564, "num_input_tokens_seen": 1475184, "step": 2245 }, { "epoch": 1.179245283018868, "grad_norm": 0.24314485490322113, "learning_rate": 4.995136645828697e-05, "loss": 0.4366, "num_input_tokens_seen": 1478640, "step": 2250 }, { "epoch": 1.1818658280922432, "grad_norm": 0.4012669622898102, "learning_rate": 4.994993027842353e-05, "loss": 0.6071, "num_input_tokens_seen": 1481552, "step": 2255 }, { "epoch": 1.1844863731656186, "grad_norm": 0.8085028529167175, "learning_rate": 4.9948473221598094e-05, "loss": 0.5259, "num_input_tokens_seen": 1485104, "step": 2260 }, { "epoch": 1.1871069182389937, "grad_norm": 0.48007872700691223, "learning_rate": 4.994699528902987e-05, "loss": 0.6045, "num_input_tokens_seen": 1487728, "step": 2265 }, { "epoch": 1.189727463312369, "grad_norm": 0.4241069257259369, "learning_rate": 4.994549648195552e-05, "loss": 0.4701, "num_input_tokens_seen": 1491312, "step": 2270 }, { "epoch": 1.1923480083857443, "grad_norm": 0.6587438583374023, "learning_rate": 4.994397680162918e-05, "loss": 0.6044, "num_input_tokens_seen": 1494192, "step": 2275 }, { "epoch": 1.1949685534591195, "grad_norm": 0.6190853118896484, "learning_rate": 4.9942436249322444e-05, "loss": 0.6825, "num_input_tokens_seen": 1497168, "step": 2280 }, { "epoch": 1.1975890985324948, "grad_norm": 0.48449885845184326, "learning_rate": 4.994087482632438e-05, "loss": 0.5343, "num_input_tokens_seen": 1501936, "step": 2285 }, { "epoch": 1.20020964360587, "grad_norm": 0.41240254044532776, "learning_rate": 4.993929253394152e-05, "loss": 0.4502, "num_input_tokens_seen": 1505040, "step": 2290 }, { "epoch": 1.2028301886792452, "grad_norm": 0.829541802406311, "learning_rate": 4.993768937349784e-05, "loss": 0.6821, "num_input_tokens_seen": 1507856, "step": 2295 }, { "epoch": 1.2054507337526206, "grad_norm": 0.5907192826271057, "learning_rate": 4.993606534633481e-05, "loss": 0.5379, "num_input_tokens_seen": 1510768, "step": 2300 }, { "epoch": 1.2080712788259957, "grad_norm": 0.5630987286567688, "learning_rate": 4.9934420453811334e-05, "loss": 0.6541, "num_input_tokens_seen": 1513808, "step": 2305 }, { "epoch": 1.2106918238993711, "grad_norm": 0.4093603789806366, "learning_rate": 4.993275469730377e-05, "loss": 0.5141, "num_input_tokens_seen": 1517392, "step": 2310 }, { "epoch": 1.2133123689727463, "grad_norm": 0.33304569125175476, "learning_rate": 4.993106807820597e-05, "loss": 0.5484, "num_input_tokens_seen": 1521648, "step": 2315 }, { "epoch": 1.2159329140461215, "grad_norm": 0.5322422385215759, "learning_rate": 4.99293605979292e-05, "loss": 0.4646, "num_input_tokens_seen": 1524496, "step": 2320 }, { "epoch": 1.2185534591194969, "grad_norm": 0.49418842792510986, "learning_rate": 4.992763225790221e-05, "loss": 0.5285, "num_input_tokens_seen": 1528144, "step": 2325 }, { "epoch": 1.221174004192872, "grad_norm": 0.5916735529899597, "learning_rate": 4.992588305957119e-05, "loss": 0.5112, "num_input_tokens_seen": 1531664, "step": 2330 }, { "epoch": 1.2237945492662474, "grad_norm": 0.57837975025177, "learning_rate": 4.99241130043998e-05, "loss": 0.6336, "num_input_tokens_seen": 1535408, "step": 2335 }, { "epoch": 1.2264150943396226, "grad_norm": 0.35766392946243286, "learning_rate": 4.992232209386914e-05, "loss": 0.5452, "num_input_tokens_seen": 1538448, "step": 2340 }, { "epoch": 1.229035639412998, "grad_norm": 0.4932836890220642, "learning_rate": 4.9920510329477756e-05, "loss": 0.5962, "num_input_tokens_seen": 1541968, "step": 2345 }, { "epoch": 1.2316561844863732, "grad_norm": 0.3998194932937622, "learning_rate": 4.9918677712741644e-05, "loss": 0.5744, "num_input_tokens_seen": 1544560, "step": 2350 }, { "epoch": 1.2342767295597485, "grad_norm": 0.6719552874565125, "learning_rate": 4.991682424519427e-05, "loss": 0.5444, "num_input_tokens_seen": 1548080, "step": 2355 }, { "epoch": 1.2368972746331237, "grad_norm": 0.7123143076896667, "learning_rate": 4.9914949928386524e-05, "loss": 0.4654, "num_input_tokens_seen": 1551472, "step": 2360 }, { "epoch": 1.2395178197064989, "grad_norm": 0.3747619092464447, "learning_rate": 4.991305476388673e-05, "loss": 0.6218, "num_input_tokens_seen": 1553904, "step": 2365 }, { "epoch": 1.2421383647798743, "grad_norm": 0.34223851561546326, "learning_rate": 4.991113875328072e-05, "loss": 0.4894, "num_input_tokens_seen": 1558352, "step": 2370 }, { "epoch": 1.2447589098532494, "grad_norm": 0.619814395904541, "learning_rate": 4.9909201898171676e-05, "loss": 0.418, "num_input_tokens_seen": 1561616, "step": 2375 }, { "epoch": 1.2473794549266248, "grad_norm": 0.40527743101119995, "learning_rate": 4.9907244200180295e-05, "loss": 0.4102, "num_input_tokens_seen": 1566000, "step": 2380 }, { "epoch": 1.25, "grad_norm": 0.4880008399486542, "learning_rate": 4.990526566094469e-05, "loss": 0.5422, "num_input_tokens_seen": 1569776, "step": 2385 }, { "epoch": 1.2526205450733752, "grad_norm": 0.5487090945243835, "learning_rate": 4.99032662821204e-05, "loss": 0.6188, "num_input_tokens_seen": 1572112, "step": 2390 }, { "epoch": 1.2552410901467506, "grad_norm": 0.4082305133342743, "learning_rate": 4.990124606538042e-05, "loss": 0.5532, "num_input_tokens_seen": 1575312, "step": 2395 }, { "epoch": 1.2578616352201257, "grad_norm": 0.4252965748310089, "learning_rate": 4.9899205012415184e-05, "loss": 0.5998, "num_input_tokens_seen": 1578256, "step": 2400 }, { "epoch": 1.2604821802935011, "grad_norm": 0.5023346543312073, "learning_rate": 4.9897143124932547e-05, "loss": 0.7214, "num_input_tokens_seen": 1581328, "step": 2405 }, { "epoch": 1.2631027253668763, "grad_norm": 0.5107813477516174, "learning_rate": 4.9895060404657786e-05, "loss": 0.586, "num_input_tokens_seen": 1583952, "step": 2410 }, { "epoch": 1.2657232704402515, "grad_norm": 0.7700883150100708, "learning_rate": 4.9892956853333644e-05, "loss": 0.5549, "num_input_tokens_seen": 1587472, "step": 2415 }, { "epoch": 1.2683438155136268, "grad_norm": 0.7720652222633362, "learning_rate": 4.989083247272027e-05, "loss": 0.474, "num_input_tokens_seen": 1590384, "step": 2420 }, { "epoch": 1.270964360587002, "grad_norm": 0.5486740469932556, "learning_rate": 4.988868726459526e-05, "loss": 0.4974, "num_input_tokens_seen": 1593264, "step": 2425 }, { "epoch": 1.2735849056603774, "grad_norm": 0.5583911538124084, "learning_rate": 4.988652123075361e-05, "loss": 0.483, "num_input_tokens_seen": 1597680, "step": 2430 }, { "epoch": 1.2762054507337526, "grad_norm": 0.7433040142059326, "learning_rate": 4.988433437300776e-05, "loss": 0.5554, "num_input_tokens_seen": 1600912, "step": 2435 }, { "epoch": 1.2788259958071277, "grad_norm": 0.5572810769081116, "learning_rate": 4.988212669318758e-05, "loss": 0.4702, "num_input_tokens_seen": 1603472, "step": 2440 }, { "epoch": 1.2814465408805031, "grad_norm": 1.3449318408966064, "learning_rate": 4.987989819314036e-05, "loss": 0.5081, "num_input_tokens_seen": 1606384, "step": 2445 }, { "epoch": 1.2840670859538785, "grad_norm": 0.4201442301273346, "learning_rate": 4.98776488747308e-05, "loss": 0.5217, "num_input_tokens_seen": 1609360, "step": 2450 }, { "epoch": 1.2866876310272537, "grad_norm": 0.9446012377738953, "learning_rate": 4.9875378739841016e-05, "loss": 0.5452, "num_input_tokens_seen": 1612336, "step": 2455 }, { "epoch": 1.2893081761006289, "grad_norm": 0.4049665629863739, "learning_rate": 4.9873087790370576e-05, "loss": 0.4479, "num_input_tokens_seen": 1616048, "step": 2460 }, { "epoch": 1.2919287211740043, "grad_norm": 0.8382037281990051, "learning_rate": 4.9870776028236424e-05, "loss": 0.4434, "num_input_tokens_seen": 1618960, "step": 2465 }, { "epoch": 1.2945492662473794, "grad_norm": 0.3287566602230072, "learning_rate": 4.9868443455372945e-05, "loss": 0.3925, "num_input_tokens_seen": 1622544, "step": 2470 }, { "epoch": 1.2971698113207548, "grad_norm": 0.4001702666282654, "learning_rate": 4.986609007373193e-05, "loss": 0.5039, "num_input_tokens_seen": 1625872, "step": 2475 }, { "epoch": 1.29979035639413, "grad_norm": 0.41151511669158936, "learning_rate": 4.986371588528257e-05, "loss": 0.5264, "num_input_tokens_seen": 1629456, "step": 2480 }, { "epoch": 1.3024109014675052, "grad_norm": 0.322173148393631, "learning_rate": 4.98613208920115e-05, "loss": 0.6668, "num_input_tokens_seen": 1632432, "step": 2485 }, { "epoch": 1.3050314465408805, "grad_norm": 0.3945515751838684, "learning_rate": 4.985890509592271e-05, "loss": 0.4985, "num_input_tokens_seen": 1636080, "step": 2490 }, { "epoch": 1.3076519916142557, "grad_norm": 0.35470423102378845, "learning_rate": 4.985646849903766e-05, "loss": 0.5354, "num_input_tokens_seen": 1638960, "step": 2495 }, { "epoch": 1.310272536687631, "grad_norm": 0.20713107287883759, "learning_rate": 4.985401110339517e-05, "loss": 0.5138, "num_input_tokens_seen": 1643120, "step": 2500 }, { "epoch": 1.3128930817610063, "grad_norm": 0.5854973793029785, "learning_rate": 4.985153291105146e-05, "loss": 0.4357, "num_input_tokens_seen": 1646768, "step": 2505 }, { "epoch": 1.3155136268343814, "grad_norm": 0.3859921097755432, "learning_rate": 4.984903392408019e-05, "loss": 0.5127, "num_input_tokens_seen": 1650256, "step": 2510 }, { "epoch": 1.3181341719077568, "grad_norm": 0.38251155614852905, "learning_rate": 4.984651414457239e-05, "loss": 0.573, "num_input_tokens_seen": 1652912, "step": 2515 }, { "epoch": 1.320754716981132, "grad_norm": 0.22981657087802887, "learning_rate": 4.98439735746365e-05, "loss": 0.5273, "num_input_tokens_seen": 1656432, "step": 2520 }, { "epoch": 1.3233752620545074, "grad_norm": 0.45612621307373047, "learning_rate": 4.984141221639835e-05, "loss": 0.4368, "num_input_tokens_seen": 1659472, "step": 2525 }, { "epoch": 1.3259958071278826, "grad_norm": 0.8474671840667725, "learning_rate": 4.9838830072001165e-05, "loss": 0.5375, "num_input_tokens_seen": 1662480, "step": 2530 }, { "epoch": 1.3286163522012577, "grad_norm": 0.527665376663208, "learning_rate": 4.983622714360557e-05, "loss": 0.5688, "num_input_tokens_seen": 1665424, "step": 2535 }, { "epoch": 1.3312368972746331, "grad_norm": 0.4228573143482208, "learning_rate": 4.9833603433389576e-05, "loss": 0.5781, "num_input_tokens_seen": 1667760, "step": 2540 }, { "epoch": 1.3338574423480085, "grad_norm": 0.4836251139640808, "learning_rate": 4.983095894354858e-05, "loss": 0.4524, "num_input_tokens_seen": 1670800, "step": 2545 }, { "epoch": 1.3364779874213837, "grad_norm": 0.3997974991798401, "learning_rate": 4.982829367629537e-05, "loss": 0.5751, "num_input_tokens_seen": 1675760, "step": 2550 }, { "epoch": 1.3390985324947589, "grad_norm": 1.041587471961975, "learning_rate": 4.982560763386013e-05, "loss": 0.5853, "num_input_tokens_seen": 1677936, "step": 2555 }, { "epoch": 1.3417190775681342, "grad_norm": 0.32707679271698, "learning_rate": 4.9822900818490404e-05, "loss": 0.6538, "num_input_tokens_seen": 1685008, "step": 2560 }, { "epoch": 1.3443396226415094, "grad_norm": 0.49454593658447266, "learning_rate": 4.982017323245114e-05, "loss": 0.6703, "num_input_tokens_seen": 1688240, "step": 2565 }, { "epoch": 1.3469601677148848, "grad_norm": 0.5074874758720398, "learning_rate": 4.981742487802466e-05, "loss": 0.7097, "num_input_tokens_seen": 1690832, "step": 2570 }, { "epoch": 1.34958071278826, "grad_norm": 0.4860459268093109, "learning_rate": 4.9814655757510644e-05, "loss": 0.6539, "num_input_tokens_seen": 1695248, "step": 2575 }, { "epoch": 1.3522012578616351, "grad_norm": 0.6943042278289795, "learning_rate": 4.981186587322619e-05, "loss": 0.5516, "num_input_tokens_seen": 1698896, "step": 2580 }, { "epoch": 1.3548218029350105, "grad_norm": 0.669049084186554, "learning_rate": 4.980905522750573e-05, "loss": 0.6739, "num_input_tokens_seen": 1701296, "step": 2585 }, { "epoch": 1.3574423480083857, "grad_norm": 0.7142858505249023, "learning_rate": 4.980622382270108e-05, "loss": 0.3884, "num_input_tokens_seen": 1705136, "step": 2590 }, { "epoch": 1.360062893081761, "grad_norm": 0.46429452300071716, "learning_rate": 4.9803371661181456e-05, "loss": 0.5026, "num_input_tokens_seen": 1708016, "step": 2595 }, { "epoch": 1.3626834381551363, "grad_norm": 2.474259614944458, "learning_rate": 4.980049874533338e-05, "loss": 0.7104, "num_input_tokens_seen": 1710864, "step": 2600 }, { "epoch": 1.3653039832285114, "grad_norm": 0.45309677720069885, "learning_rate": 4.979760507756081e-05, "loss": 0.4651, "num_input_tokens_seen": 1713552, "step": 2605 }, { "epoch": 1.3679245283018868, "grad_norm": 0.5276386141777039, "learning_rate": 4.979469066028502e-05, "loss": 0.4507, "num_input_tokens_seen": 1716528, "step": 2610 }, { "epoch": 1.370545073375262, "grad_norm": 1.6822408437728882, "learning_rate": 4.9791755495944645e-05, "loss": 0.5007, "num_input_tokens_seen": 1719216, "step": 2615 }, { "epoch": 1.3731656184486374, "grad_norm": 0.42932772636413574, "learning_rate": 4.978879958699573e-05, "loss": 0.5566, "num_input_tokens_seen": 1722384, "step": 2620 }, { "epoch": 1.3757861635220126, "grad_norm": 0.36574655771255493, "learning_rate": 4.978582293591162e-05, "loss": 0.462, "num_input_tokens_seen": 1725104, "step": 2625 }, { "epoch": 1.3784067085953877, "grad_norm": 0.8556284308433533, "learning_rate": 4.978282554518305e-05, "loss": 0.4618, "num_input_tokens_seen": 1728112, "step": 2630 }, { "epoch": 1.381027253668763, "grad_norm": 0.580269992351532, "learning_rate": 4.9779807417318096e-05, "loss": 0.5486, "num_input_tokens_seen": 1731536, "step": 2635 }, { "epoch": 1.3836477987421385, "grad_norm": 0.36853232979774475, "learning_rate": 4.977676855484219e-05, "loss": 0.5924, "num_input_tokens_seen": 1734896, "step": 2640 }, { "epoch": 1.3862683438155137, "grad_norm": 0.6232555508613586, "learning_rate": 4.977370896029812e-05, "loss": 0.555, "num_input_tokens_seen": 1737584, "step": 2645 }, { "epoch": 1.3888888888888888, "grad_norm": 0.48538872599601746, "learning_rate": 4.977062863624601e-05, "loss": 0.6109, "num_input_tokens_seen": 1740720, "step": 2650 }, { "epoch": 1.3915094339622642, "grad_norm": 0.4580588936805725, "learning_rate": 4.976752758526333e-05, "loss": 0.4051, "num_input_tokens_seen": 1743216, "step": 2655 }, { "epoch": 1.3941299790356394, "grad_norm": 0.68052077293396, "learning_rate": 4.9764405809944906e-05, "loss": 0.6009, "num_input_tokens_seen": 1746000, "step": 2660 }, { "epoch": 1.3967505241090148, "grad_norm": 0.8041396141052246, "learning_rate": 4.9761263312902895e-05, "loss": 0.4581, "num_input_tokens_seen": 1748560, "step": 2665 }, { "epoch": 1.39937106918239, "grad_norm": 0.4175306558609009, "learning_rate": 4.9758100096766786e-05, "loss": 0.6041, "num_input_tokens_seen": 1751504, "step": 2670 }, { "epoch": 1.4019916142557651, "grad_norm": 0.7727677226066589, "learning_rate": 4.975491616418342e-05, "loss": 0.4946, "num_input_tokens_seen": 1754864, "step": 2675 }, { "epoch": 1.4046121593291405, "grad_norm": 1.2312335968017578, "learning_rate": 4.975171151781698e-05, "loss": 0.5697, "num_input_tokens_seen": 1757648, "step": 2680 }, { "epoch": 1.4072327044025157, "grad_norm": 0.39356788992881775, "learning_rate": 4.974848616034894e-05, "loss": 0.5193, "num_input_tokens_seen": 1760432, "step": 2685 }, { "epoch": 1.409853249475891, "grad_norm": 0.28062674403190613, "learning_rate": 4.974524009447815e-05, "loss": 0.5377, "num_input_tokens_seen": 1763664, "step": 2690 }, { "epoch": 1.4124737945492662, "grad_norm": 0.37650373578071594, "learning_rate": 4.974197332292078e-05, "loss": 0.4364, "num_input_tokens_seen": 1766640, "step": 2695 }, { "epoch": 1.4150943396226414, "grad_norm": 0.5268535017967224, "learning_rate": 4.973868584841028e-05, "loss": 0.5353, "num_input_tokens_seen": 1769776, "step": 2700 }, { "epoch": 1.4177148846960168, "grad_norm": 0.9085237979888916, "learning_rate": 4.973537767369749e-05, "loss": 0.5533, "num_input_tokens_seen": 1772112, "step": 2705 }, { "epoch": 1.420335429769392, "grad_norm": 0.37329667806625366, "learning_rate": 4.973204880155053e-05, "loss": 0.4898, "num_input_tokens_seen": 1775312, "step": 2710 }, { "epoch": 1.4229559748427674, "grad_norm": 0.3977273404598236, "learning_rate": 4.972869923475485e-05, "loss": 0.6312, "num_input_tokens_seen": 1779504, "step": 2715 }, { "epoch": 1.4255765199161425, "grad_norm": 0.3897983133792877, "learning_rate": 4.972532897611321e-05, "loss": 0.4633, "num_input_tokens_seen": 1783248, "step": 2720 }, { "epoch": 1.4281970649895177, "grad_norm": 0.8118927478790283, "learning_rate": 4.972193802844569e-05, "loss": 0.3752, "num_input_tokens_seen": 1789168, "step": 2725 }, { "epoch": 1.430817610062893, "grad_norm": 0.4991753101348877, "learning_rate": 4.971852639458968e-05, "loss": 0.4972, "num_input_tokens_seen": 1792592, "step": 2730 }, { "epoch": 1.4334381551362683, "grad_norm": 0.9497215151786804, "learning_rate": 4.971509407739988e-05, "loss": 0.5483, "num_input_tokens_seen": 1795536, "step": 2735 }, { "epoch": 1.4360587002096437, "grad_norm": 0.3999401330947876, "learning_rate": 4.971164107974831e-05, "loss": 0.4531, "num_input_tokens_seen": 1798064, "step": 2740 }, { "epoch": 1.4386792452830188, "grad_norm": 0.4135047197341919, "learning_rate": 4.970816740452425e-05, "loss": 0.5361, "num_input_tokens_seen": 1801392, "step": 2745 }, { "epoch": 1.441299790356394, "grad_norm": 0.6220123767852783, "learning_rate": 4.9704673054634335e-05, "loss": 0.5568, "num_input_tokens_seen": 1804368, "step": 2750 }, { "epoch": 1.4439203354297694, "grad_norm": 0.6315510869026184, "learning_rate": 4.970115803300247e-05, "loss": 0.5802, "num_input_tokens_seen": 1807504, "step": 2755 }, { "epoch": 1.4465408805031448, "grad_norm": 0.5694937109947205, "learning_rate": 4.969762234256987e-05, "loss": 0.3954, "num_input_tokens_seen": 1810160, "step": 2760 }, { "epoch": 1.44916142557652, "grad_norm": 0.5386505722999573, "learning_rate": 4.969406598629503e-05, "loss": 0.5248, "num_input_tokens_seen": 1813616, "step": 2765 }, { "epoch": 1.4517819706498951, "grad_norm": 0.4276624619960785, "learning_rate": 4.969048896715376e-05, "loss": 0.5061, "num_input_tokens_seen": 1816752, "step": 2770 }, { "epoch": 1.4544025157232705, "grad_norm": 0.5479277968406677, "learning_rate": 4.968689128813914e-05, "loss": 0.4903, "num_input_tokens_seen": 1820784, "step": 2775 }, { "epoch": 1.4570230607966457, "grad_norm": 0.611650824546814, "learning_rate": 4.968327295226153e-05, "loss": 0.425, "num_input_tokens_seen": 1823024, "step": 2780 }, { "epoch": 1.459643605870021, "grad_norm": 0.6325893998146057, "learning_rate": 4.967963396254861e-05, "loss": 0.5743, "num_input_tokens_seen": 1828016, "step": 2785 }, { "epoch": 1.4622641509433962, "grad_norm": 0.31148582696914673, "learning_rate": 4.967597432204531e-05, "loss": 0.5534, "num_input_tokens_seen": 1831280, "step": 2790 }, { "epoch": 1.4648846960167714, "grad_norm": 0.3160398602485657, "learning_rate": 4.9672294033813846e-05, "loss": 0.4396, "num_input_tokens_seen": 1834896, "step": 2795 }, { "epoch": 1.4675052410901468, "grad_norm": 0.469874769449234, "learning_rate": 4.966859310093372e-05, "loss": 0.5173, "num_input_tokens_seen": 1837488, "step": 2800 }, { "epoch": 1.470125786163522, "grad_norm": 0.4423501491546631, "learning_rate": 4.966487152650171e-05, "loss": 0.5756, "num_input_tokens_seen": 1841584, "step": 2805 }, { "epoch": 1.4727463312368974, "grad_norm": 0.5459968447685242, "learning_rate": 4.966112931363185e-05, "loss": 0.5765, "num_input_tokens_seen": 1844688, "step": 2810 }, { "epoch": 1.4753668763102725, "grad_norm": 0.3796689808368683, "learning_rate": 4.965736646545546e-05, "loss": 0.5178, "num_input_tokens_seen": 1847472, "step": 2815 }, { "epoch": 1.4779874213836477, "grad_norm": 0.6437380909919739, "learning_rate": 4.96535829851211e-05, "loss": 0.7735, "num_input_tokens_seen": 1851088, "step": 2820 }, { "epoch": 1.480607966457023, "grad_norm": 0.41598159074783325, "learning_rate": 4.964977887579464e-05, "loss": 0.5192, "num_input_tokens_seen": 1855216, "step": 2825 }, { "epoch": 1.4832285115303983, "grad_norm": 0.42335906624794006, "learning_rate": 4.964595414065918e-05, "loss": 0.4453, "num_input_tokens_seen": 1857744, "step": 2830 }, { "epoch": 1.4858490566037736, "grad_norm": 0.46275001764297485, "learning_rate": 4.9642108782915066e-05, "loss": 0.4706, "num_input_tokens_seen": 1860400, "step": 2835 }, { "epoch": 1.4884696016771488, "grad_norm": 0.5639103055000305, "learning_rate": 4.963824280577993e-05, "loss": 0.564, "num_input_tokens_seen": 1863248, "step": 2840 }, { "epoch": 1.491090146750524, "grad_norm": 0.44006064534187317, "learning_rate": 4.963435621248865e-05, "loss": 0.5075, "num_input_tokens_seen": 1866576, "step": 2845 }, { "epoch": 1.4937106918238994, "grad_norm": 0.43963420391082764, "learning_rate": 4.9630449006293345e-05, "loss": 0.4794, "num_input_tokens_seen": 1869840, "step": 2850 }, { "epoch": 1.4963312368972748, "grad_norm": 0.3872293531894684, "learning_rate": 4.9626521190463375e-05, "loss": 0.6316, "num_input_tokens_seen": 1872624, "step": 2855 }, { "epoch": 1.49895178197065, "grad_norm": 0.332090824842453, "learning_rate": 4.9622572768285377e-05, "loss": 0.5265, "num_input_tokens_seen": 1876016, "step": 2860 }, { "epoch": 1.5, "eval_loss": 0.5262528657913208, "eval_runtime": 14.5146, "eval_samples_per_second": 58.424, "eval_steps_per_second": 14.606, "num_input_tokens_seen": 1877040, "step": 2862 }, { "epoch": 1.501572327044025, "grad_norm": 0.4007701277732849, "learning_rate": 4.96186037430632e-05, "loss": 0.4085, "num_input_tokens_seen": 1880976, "step": 2865 }, { "epoch": 1.5041928721174003, "grad_norm": 1.0993424654006958, "learning_rate": 4.9614614118117934e-05, "loss": 0.5975, "num_input_tokens_seen": 1883568, "step": 2870 }, { "epoch": 1.5068134171907757, "grad_norm": 0.6750695705413818, "learning_rate": 4.961060389678793e-05, "loss": 0.4903, "num_input_tokens_seen": 1886800, "step": 2875 }, { "epoch": 1.509433962264151, "grad_norm": 0.5730173587799072, "learning_rate": 4.9606573082428754e-05, "loss": 0.4932, "num_input_tokens_seen": 1890672, "step": 2880 }, { "epoch": 1.5120545073375262, "grad_norm": 0.5998068451881409, "learning_rate": 4.9602521678413206e-05, "loss": 0.3846, "num_input_tokens_seen": 1893456, "step": 2885 }, { "epoch": 1.5146750524109014, "grad_norm": 0.6097415685653687, "learning_rate": 4.959844968813132e-05, "loss": 0.5341, "num_input_tokens_seen": 1896752, "step": 2890 }, { "epoch": 1.5172955974842768, "grad_norm": 0.2844610810279846, "learning_rate": 4.959435711499034e-05, "loss": 0.4548, "num_input_tokens_seen": 1899664, "step": 2895 }, { "epoch": 1.519916142557652, "grad_norm": 0.3367796838283539, "learning_rate": 4.959024396241475e-05, "loss": 0.4114, "num_input_tokens_seen": 1902320, "step": 2900 }, { "epoch": 1.5225366876310273, "grad_norm": 0.37319469451904297, "learning_rate": 4.958611023384626e-05, "loss": 0.5614, "num_input_tokens_seen": 1905488, "step": 2905 }, { "epoch": 1.5251572327044025, "grad_norm": 0.4210143983364105, "learning_rate": 4.958195593274376e-05, "loss": 0.4922, "num_input_tokens_seen": 1908144, "step": 2910 }, { "epoch": 1.5277777777777777, "grad_norm": 0.30830177664756775, "learning_rate": 4.957778106258341e-05, "loss": 0.5037, "num_input_tokens_seen": 1911536, "step": 2915 }, { "epoch": 1.530398322851153, "grad_norm": 0.3065758943557739, "learning_rate": 4.957358562685852e-05, "loss": 0.4991, "num_input_tokens_seen": 1916176, "step": 2920 }, { "epoch": 1.5330188679245285, "grad_norm": 0.44745922088623047, "learning_rate": 4.956936962907966e-05, "loss": 0.5063, "num_input_tokens_seen": 1920592, "step": 2925 }, { "epoch": 1.5356394129979036, "grad_norm": 0.474324494600296, "learning_rate": 4.9565133072774585e-05, "loss": 0.4656, "num_input_tokens_seen": 1923152, "step": 2930 }, { "epoch": 1.5382599580712788, "grad_norm": 0.5591490864753723, "learning_rate": 4.956087596148824e-05, "loss": 0.5411, "num_input_tokens_seen": 1926000, "step": 2935 }, { "epoch": 1.540880503144654, "grad_norm": 0.40030184388160706, "learning_rate": 4.955659829878279e-05, "loss": 0.5954, "num_input_tokens_seen": 1929072, "step": 2940 }, { "epoch": 1.5435010482180294, "grad_norm": 0.24507169425487518, "learning_rate": 4.955230008823758e-05, "loss": 0.3798, "num_input_tokens_seen": 1932208, "step": 2945 }, { "epoch": 1.5461215932914047, "grad_norm": 0.4643065631389618, "learning_rate": 4.954798133344916e-05, "loss": 0.6788, "num_input_tokens_seen": 1935152, "step": 2950 }, { "epoch": 1.54874213836478, "grad_norm": 0.5530327558517456, "learning_rate": 4.954364203803127e-05, "loss": 0.5515, "num_input_tokens_seen": 1938704, "step": 2955 }, { "epoch": 1.551362683438155, "grad_norm": 0.3667726516723633, "learning_rate": 4.953928220561482e-05, "loss": 0.6003, "num_input_tokens_seen": 1942192, "step": 2960 }, { "epoch": 1.5539832285115303, "grad_norm": 0.6153308749198914, "learning_rate": 4.953490183984795e-05, "loss": 0.4688, "num_input_tokens_seen": 1945744, "step": 2965 }, { "epoch": 1.5566037735849056, "grad_norm": 0.40885069966316223, "learning_rate": 4.953050094439591e-05, "loss": 0.5043, "num_input_tokens_seen": 1948944, "step": 2970 }, { "epoch": 1.559224318658281, "grad_norm": 0.5039407014846802, "learning_rate": 4.95260795229412e-05, "loss": 0.4706, "num_input_tokens_seen": 1951792, "step": 2975 }, { "epoch": 1.5618448637316562, "grad_norm": 0.4655413329601288, "learning_rate": 4.952163757918344e-05, "loss": 0.5785, "num_input_tokens_seen": 1955376, "step": 2980 }, { "epoch": 1.5644654088050314, "grad_norm": 0.6459707617759705, "learning_rate": 4.951717511683947e-05, "loss": 0.5247, "num_input_tokens_seen": 1959120, "step": 2985 }, { "epoch": 1.5670859538784065, "grad_norm": 0.25272005796432495, "learning_rate": 4.9512692139643264e-05, "loss": 0.407, "num_input_tokens_seen": 1964304, "step": 2990 }, { "epoch": 1.569706498951782, "grad_norm": 0.5146021842956543, "learning_rate": 4.950818865134596e-05, "loss": 0.6461, "num_input_tokens_seen": 1966832, "step": 2995 }, { "epoch": 1.5723270440251573, "grad_norm": 0.4709148705005646, "learning_rate": 4.9503664655715885e-05, "loss": 0.603, "num_input_tokens_seen": 1970224, "step": 3000 }, { "epoch": 1.5749475890985325, "grad_norm": 0.2729114294052124, "learning_rate": 4.9499120156538516e-05, "loss": 0.533, "num_input_tokens_seen": 1974288, "step": 3005 }, { "epoch": 1.5775681341719077, "grad_norm": 0.5352027416229248, "learning_rate": 4.949455515761647e-05, "loss": 0.4497, "num_input_tokens_seen": 1978064, "step": 3010 }, { "epoch": 1.580188679245283, "grad_norm": 0.5488559007644653, "learning_rate": 4.948996966276953e-05, "loss": 0.5433, "num_input_tokens_seen": 1981744, "step": 3015 }, { "epoch": 1.5828092243186582, "grad_norm": 0.8167805075645447, "learning_rate": 4.948536367583464e-05, "loss": 0.5449, "num_input_tokens_seen": 1984016, "step": 3020 }, { "epoch": 1.5854297693920336, "grad_norm": 0.5341197848320007, "learning_rate": 4.948073720066587e-05, "loss": 0.3954, "num_input_tokens_seen": 1987056, "step": 3025 }, { "epoch": 1.5880503144654088, "grad_norm": 0.3826766014099121, "learning_rate": 4.947609024113444e-05, "loss": 0.5506, "num_input_tokens_seen": 1992048, "step": 3030 }, { "epoch": 1.590670859538784, "grad_norm": 0.3436478078365326, "learning_rate": 4.947142280112873e-05, "loss": 0.681, "num_input_tokens_seen": 1995472, "step": 3035 }, { "epoch": 1.5932914046121593, "grad_norm": 0.38398730754852295, "learning_rate": 4.946673488455422e-05, "loss": 0.3963, "num_input_tokens_seen": 1999152, "step": 3040 }, { "epoch": 1.5959119496855347, "grad_norm": 0.520606279373169, "learning_rate": 4.946202649533356e-05, "loss": 0.5654, "num_input_tokens_seen": 2001744, "step": 3045 }, { "epoch": 1.59853249475891, "grad_norm": 0.4367597699165344, "learning_rate": 4.9457297637406506e-05, "loss": 0.6256, "num_input_tokens_seen": 2004848, "step": 3050 }, { "epoch": 1.601153039832285, "grad_norm": 0.7321252822875977, "learning_rate": 4.9452548314729965e-05, "loss": 0.5513, "num_input_tokens_seen": 2007920, "step": 3055 }, { "epoch": 1.6037735849056602, "grad_norm": 0.43471789360046387, "learning_rate": 4.944777853127793e-05, "loss": 0.4895, "num_input_tokens_seen": 2011152, "step": 3060 }, { "epoch": 1.6063941299790356, "grad_norm": 0.6035179495811462, "learning_rate": 4.9442988291041545e-05, "loss": 0.6043, "num_input_tokens_seen": 2013968, "step": 3065 }, { "epoch": 1.609014675052411, "grad_norm": 0.6997206807136536, "learning_rate": 4.943817759802908e-05, "loss": 0.5344, "num_input_tokens_seen": 2017936, "step": 3070 }, { "epoch": 1.6116352201257862, "grad_norm": 0.2714653015136719, "learning_rate": 4.94333464562659e-05, "loss": 0.5113, "num_input_tokens_seen": 2020528, "step": 3075 }, { "epoch": 1.6142557651991614, "grad_norm": 0.600290834903717, "learning_rate": 4.942849486979446e-05, "loss": 0.6772, "num_input_tokens_seen": 2023792, "step": 3080 }, { "epoch": 1.6168763102725365, "grad_norm": 0.40745270252227783, "learning_rate": 4.9423622842674366e-05, "loss": 0.5719, "num_input_tokens_seen": 2027152, "step": 3085 }, { "epoch": 1.619496855345912, "grad_norm": 0.3500750660896301, "learning_rate": 4.9418730378982304e-05, "loss": 0.4169, "num_input_tokens_seen": 2029968, "step": 3090 }, { "epoch": 1.6221174004192873, "grad_norm": 0.6198293566703796, "learning_rate": 4.9413817482812064e-05, "loss": 0.4642, "num_input_tokens_seen": 2033488, "step": 3095 }, { "epoch": 1.6247379454926625, "grad_norm": 1.0396119356155396, "learning_rate": 4.9408884158274534e-05, "loss": 0.6001, "num_input_tokens_seen": 2037136, "step": 3100 }, { "epoch": 1.6273584905660377, "grad_norm": 0.394243448972702, "learning_rate": 4.940393040949769e-05, "loss": 0.5239, "num_input_tokens_seen": 2039888, "step": 3105 }, { "epoch": 1.629979035639413, "grad_norm": 0.419204980134964, "learning_rate": 4.939895624062661e-05, "loss": 0.6938, "num_input_tokens_seen": 2043024, "step": 3110 }, { "epoch": 1.6325995807127882, "grad_norm": 0.4284510016441345, "learning_rate": 4.9393961655823454e-05, "loss": 0.4132, "num_input_tokens_seen": 2047056, "step": 3115 }, { "epoch": 1.6352201257861636, "grad_norm": 0.9528070092201233, "learning_rate": 4.9388946659267444e-05, "loss": 0.5739, "num_input_tokens_seen": 2050000, "step": 3120 }, { "epoch": 1.6378406708595388, "grad_norm": 0.3914155960083008, "learning_rate": 4.9383911255154916e-05, "loss": 0.5169, "num_input_tokens_seen": 2054032, "step": 3125 }, { "epoch": 1.640461215932914, "grad_norm": 0.49133044481277466, "learning_rate": 4.9378855447699264e-05, "loss": 0.5112, "num_input_tokens_seen": 2057072, "step": 3130 }, { "epoch": 1.6430817610062893, "grad_norm": 0.32748305797576904, "learning_rate": 4.9373779241130955e-05, "loss": 0.4615, "num_input_tokens_seen": 2060656, "step": 3135 }, { "epoch": 1.6457023060796647, "grad_norm": 1.1354202032089233, "learning_rate": 4.936868263969752e-05, "loss": 0.518, "num_input_tokens_seen": 2063568, "step": 3140 }, { "epoch": 1.64832285115304, "grad_norm": 0.48287153244018555, "learning_rate": 4.936356564766358e-05, "loss": 0.4374, "num_input_tokens_seen": 2066960, "step": 3145 }, { "epoch": 1.650943396226415, "grad_norm": 0.4048994183540344, "learning_rate": 4.935842826931078e-05, "loss": 0.5102, "num_input_tokens_seen": 2070096, "step": 3150 }, { "epoch": 1.6535639412997902, "grad_norm": 0.3583712875843048, "learning_rate": 4.9353270508937854e-05, "loss": 0.5087, "num_input_tokens_seen": 2073072, "step": 3155 }, { "epoch": 1.6561844863731656, "grad_norm": 0.38228899240493774, "learning_rate": 4.934809237086059e-05, "loss": 0.4655, "num_input_tokens_seen": 2075632, "step": 3160 }, { "epoch": 1.658805031446541, "grad_norm": 0.7656595706939697, "learning_rate": 4.934289385941179e-05, "loss": 0.4614, "num_input_tokens_seen": 2079024, "step": 3165 }, { "epoch": 1.6614255765199162, "grad_norm": 0.7057949304580688, "learning_rate": 4.9337674978941364e-05, "loss": 0.6607, "num_input_tokens_seen": 2081744, "step": 3170 }, { "epoch": 1.6640461215932913, "grad_norm": 0.29648473858833313, "learning_rate": 4.9332435733816204e-05, "loss": 0.5584, "num_input_tokens_seen": 2084784, "step": 3175 }, { "epoch": 1.6666666666666665, "grad_norm": 0.3652234673500061, "learning_rate": 4.932717612842028e-05, "loss": 0.4475, "num_input_tokens_seen": 2087984, "step": 3180 }, { "epoch": 1.669287211740042, "grad_norm": 0.32418113946914673, "learning_rate": 4.93218961671546e-05, "loss": 0.5069, "num_input_tokens_seen": 2091344, "step": 3185 }, { "epoch": 1.6719077568134173, "grad_norm": 0.3618309497833252, "learning_rate": 4.931659585443719e-05, "loss": 0.435, "num_input_tokens_seen": 2095312, "step": 3190 }, { "epoch": 1.6745283018867925, "grad_norm": 0.485213965177536, "learning_rate": 4.931127519470311e-05, "loss": 0.4239, "num_input_tokens_seen": 2098832, "step": 3195 }, { "epoch": 1.6771488469601676, "grad_norm": 0.6654406189918518, "learning_rate": 4.9305934192404426e-05, "loss": 0.6537, "num_input_tokens_seen": 2101168, "step": 3200 }, { "epoch": 1.679769392033543, "grad_norm": 0.659778356552124, "learning_rate": 4.930057285201027e-05, "loss": 0.7418, "num_input_tokens_seen": 2104816, "step": 3205 }, { "epoch": 1.6823899371069182, "grad_norm": 0.6889087557792664, "learning_rate": 4.929519117800676e-05, "loss": 0.5064, "num_input_tokens_seen": 2107728, "step": 3210 }, { "epoch": 1.6850104821802936, "grad_norm": 1.0791599750518799, "learning_rate": 4.928978917489703e-05, "loss": 0.7491, "num_input_tokens_seen": 2110992, "step": 3215 }, { "epoch": 1.6876310272536688, "grad_norm": 0.6321695446968079, "learning_rate": 4.928436684720122e-05, "loss": 0.4941, "num_input_tokens_seen": 2113840, "step": 3220 }, { "epoch": 1.690251572327044, "grad_norm": 0.5242596864700317, "learning_rate": 4.927892419945651e-05, "loss": 0.5002, "num_input_tokens_seen": 2116784, "step": 3225 }, { "epoch": 1.6928721174004193, "grad_norm": 0.29644426703453064, "learning_rate": 4.927346123621705e-05, "loss": 0.6506, "num_input_tokens_seen": 2119696, "step": 3230 }, { "epoch": 1.6954926624737947, "grad_norm": 0.5105762481689453, "learning_rate": 4.926797796205399e-05, "loss": 0.5615, "num_input_tokens_seen": 2122608, "step": 3235 }, { "epoch": 1.6981132075471699, "grad_norm": 0.4934845268726349, "learning_rate": 4.926247438155549e-05, "loss": 0.527, "num_input_tokens_seen": 2125808, "step": 3240 }, { "epoch": 1.700733752620545, "grad_norm": 0.5487411022186279, "learning_rate": 4.9256950499326684e-05, "loss": 0.4326, "num_input_tokens_seen": 2129584, "step": 3245 }, { "epoch": 1.7033542976939202, "grad_norm": 0.45831578969955444, "learning_rate": 4.9251406319989725e-05, "loss": 0.4849, "num_input_tokens_seen": 2132208, "step": 3250 }, { "epoch": 1.7059748427672956, "grad_norm": 0.32939863204956055, "learning_rate": 4.9245841848183714e-05, "loss": 0.4818, "num_input_tokens_seen": 2135632, "step": 3255 }, { "epoch": 1.708595387840671, "grad_norm": 0.6363638639450073, "learning_rate": 4.924025708856475e-05, "loss": 0.717, "num_input_tokens_seen": 2139760, "step": 3260 }, { "epoch": 1.7112159329140462, "grad_norm": 0.554495632648468, "learning_rate": 4.9234652045805895e-05, "loss": 0.5075, "num_input_tokens_seen": 2143280, "step": 3265 }, { "epoch": 1.7138364779874213, "grad_norm": 0.4136936068534851, "learning_rate": 4.922902672459722e-05, "loss": 0.452, "num_input_tokens_seen": 2145648, "step": 3270 }, { "epoch": 1.7164570230607965, "grad_norm": 0.5675715208053589, "learning_rate": 4.9223381129645706e-05, "loss": 0.496, "num_input_tokens_seen": 2149040, "step": 3275 }, { "epoch": 1.719077568134172, "grad_norm": 0.3474430441856384, "learning_rate": 4.921771526567535e-05, "loss": 0.5975, "num_input_tokens_seen": 2151792, "step": 3280 }, { "epoch": 1.7216981132075473, "grad_norm": 0.5961593389511108, "learning_rate": 4.921202913742707e-05, "loss": 0.4683, "num_input_tokens_seen": 2155600, "step": 3285 }, { "epoch": 1.7243186582809225, "grad_norm": 0.3846529424190521, "learning_rate": 4.920632274965878e-05, "loss": 0.4347, "num_input_tokens_seen": 2159088, "step": 3290 }, { "epoch": 1.7269392033542976, "grad_norm": 0.5431245565414429, "learning_rate": 4.920059610714531e-05, "loss": 0.4119, "num_input_tokens_seen": 2162800, "step": 3295 }, { "epoch": 1.7295597484276728, "grad_norm": 0.6113054752349854, "learning_rate": 4.919484921467846e-05, "loss": 0.5673, "num_input_tokens_seen": 2165872, "step": 3300 }, { "epoch": 1.7321802935010482, "grad_norm": 0.4531255066394806, "learning_rate": 4.9189082077066965e-05, "loss": 0.6034, "num_input_tokens_seen": 2169456, "step": 3305 }, { "epoch": 1.7348008385744236, "grad_norm": 0.3585594594478607, "learning_rate": 4.918329469913649e-05, "loss": 0.4942, "num_input_tokens_seen": 2172368, "step": 3310 }, { "epoch": 1.7374213836477987, "grad_norm": 0.29914698004722595, "learning_rate": 4.917748708572967e-05, "loss": 0.6196, "num_input_tokens_seen": 2175504, "step": 3315 }, { "epoch": 1.740041928721174, "grad_norm": 0.6363564729690552, "learning_rate": 4.917165924170604e-05, "loss": 0.5151, "num_input_tokens_seen": 2179440, "step": 3320 }, { "epoch": 1.7426624737945493, "grad_norm": 0.6382607221603394, "learning_rate": 4.9165811171942064e-05, "loss": 0.6139, "num_input_tokens_seen": 2181904, "step": 3325 }, { "epoch": 1.7452830188679245, "grad_norm": 0.3972032070159912, "learning_rate": 4.915994288133115e-05, "loss": 0.4198, "num_input_tokens_seen": 2184816, "step": 3330 }, { "epoch": 1.7479035639412999, "grad_norm": 0.8182671666145325, "learning_rate": 4.9154054374783624e-05, "loss": 0.6068, "num_input_tokens_seen": 2187920, "step": 3335 }, { "epoch": 1.750524109014675, "grad_norm": 0.23075734078884125, "learning_rate": 4.914814565722671e-05, "loss": 0.5299, "num_input_tokens_seen": 2191216, "step": 3340 }, { "epoch": 1.7531446540880502, "grad_norm": 0.5725029706954956, "learning_rate": 4.914221673360455e-05, "loss": 0.4879, "num_input_tokens_seen": 2193744, "step": 3345 }, { "epoch": 1.7557651991614256, "grad_norm": 0.5079519152641296, "learning_rate": 4.91362676088782e-05, "loss": 0.4661, "num_input_tokens_seen": 2196368, "step": 3350 }, { "epoch": 1.758385744234801, "grad_norm": 0.35343021154403687, "learning_rate": 4.913029828802561e-05, "loss": 0.3856, "num_input_tokens_seen": 2199632, "step": 3355 }, { "epoch": 1.7610062893081762, "grad_norm": 0.3133793771266937, "learning_rate": 4.912430877604165e-05, "loss": 0.519, "num_input_tokens_seen": 2202704, "step": 3360 }, { "epoch": 1.7636268343815513, "grad_norm": 0.48261916637420654, "learning_rate": 4.9118299077938054e-05, "loss": 0.6955, "num_input_tokens_seen": 2205616, "step": 3365 }, { "epoch": 1.7662473794549265, "grad_norm": 0.3118814527988434, "learning_rate": 4.911226919874347e-05, "loss": 0.4676, "num_input_tokens_seen": 2209008, "step": 3370 }, { "epoch": 1.7688679245283019, "grad_norm": 0.4248847961425781, "learning_rate": 4.910621914350343e-05, "loss": 0.4518, "num_input_tokens_seen": 2211824, "step": 3375 }, { "epoch": 1.7714884696016773, "grad_norm": 0.3499216139316559, "learning_rate": 4.910014891728033e-05, "loss": 0.4953, "num_input_tokens_seen": 2215632, "step": 3380 }, { "epoch": 1.7741090146750524, "grad_norm": 0.5594117641448975, "learning_rate": 4.9094058525153475e-05, "loss": 0.4615, "num_input_tokens_seen": 2218512, "step": 3385 }, { "epoch": 1.7767295597484276, "grad_norm": 0.37053951621055603, "learning_rate": 4.908794797221902e-05, "loss": 0.4151, "num_input_tokens_seen": 2221776, "step": 3390 }, { "epoch": 1.7793501048218028, "grad_norm": 0.26576563715934753, "learning_rate": 4.908181726358999e-05, "loss": 0.4401, "num_input_tokens_seen": 2224784, "step": 3395 }, { "epoch": 1.7819706498951782, "grad_norm": 0.8138899207115173, "learning_rate": 4.907566640439628e-05, "loss": 0.5064, "num_input_tokens_seen": 2227408, "step": 3400 }, { "epoch": 1.7845911949685536, "grad_norm": 0.6083232164382935, "learning_rate": 4.906949539978467e-05, "loss": 0.4807, "num_input_tokens_seen": 2230064, "step": 3405 }, { "epoch": 1.7872117400419287, "grad_norm": 0.5600356459617615, "learning_rate": 4.906330425491875e-05, "loss": 0.3954, "num_input_tokens_seen": 2234288, "step": 3410 }, { "epoch": 1.789832285115304, "grad_norm": 0.2624227702617645, "learning_rate": 4.9057092974979e-05, "loss": 0.4163, "num_input_tokens_seen": 2237072, "step": 3415 }, { "epoch": 1.7924528301886793, "grad_norm": 0.3208979368209839, "learning_rate": 4.905086156516273e-05, "loss": 0.3953, "num_input_tokens_seen": 2240016, "step": 3420 }, { "epoch": 1.7950733752620545, "grad_norm": 0.35615307092666626, "learning_rate": 4.904461003068411e-05, "loss": 0.4518, "num_input_tokens_seen": 2242800, "step": 3425 }, { "epoch": 1.7976939203354299, "grad_norm": 0.6867821216583252, "learning_rate": 4.9038338376774124e-05, "loss": 0.5491, "num_input_tokens_seen": 2245680, "step": 3430 }, { "epoch": 1.800314465408805, "grad_norm": 0.5759240984916687, "learning_rate": 4.9032046608680613e-05, "loss": 0.5358, "num_input_tokens_seen": 2248272, "step": 3435 }, { "epoch": 1.8029350104821802, "grad_norm": 0.7835785150527954, "learning_rate": 4.902573473166824e-05, "loss": 0.357, "num_input_tokens_seen": 2251376, "step": 3440 }, { "epoch": 1.8055555555555556, "grad_norm": 0.7268351316452026, "learning_rate": 4.9019402751018496e-05, "loss": 0.5572, "num_input_tokens_seen": 2254256, "step": 3445 }, { "epoch": 1.808176100628931, "grad_norm": 0.6696903705596924, "learning_rate": 4.901305067202969e-05, "loss": 0.585, "num_input_tokens_seen": 2257104, "step": 3450 }, { "epoch": 1.8107966457023061, "grad_norm": 0.45470890402793884, "learning_rate": 4.900667850001696e-05, "loss": 0.5695, "num_input_tokens_seen": 2259920, "step": 3455 }, { "epoch": 1.8134171907756813, "grad_norm": 0.35532239079475403, "learning_rate": 4.900028624031223e-05, "loss": 0.4034, "num_input_tokens_seen": 2262736, "step": 3460 }, { "epoch": 1.8160377358490565, "grad_norm": 0.21534936130046844, "learning_rate": 4.899387389826427e-05, "loss": 0.4466, "num_input_tokens_seen": 2266864, "step": 3465 }, { "epoch": 1.8186582809224319, "grad_norm": 0.9284306168556213, "learning_rate": 4.898744147923863e-05, "loss": 0.707, "num_input_tokens_seen": 2269584, "step": 3470 }, { "epoch": 1.8212788259958073, "grad_norm": 0.25250524282455444, "learning_rate": 4.898098898861766e-05, "loss": 0.4718, "num_input_tokens_seen": 2272720, "step": 3475 }, { "epoch": 1.8238993710691824, "grad_norm": 0.33694618940353394, "learning_rate": 4.897451643180051e-05, "loss": 0.5978, "num_input_tokens_seen": 2276880, "step": 3480 }, { "epoch": 1.8265199161425576, "grad_norm": 1.194644808769226, "learning_rate": 4.896802381420313e-05, "loss": 0.4259, "num_input_tokens_seen": 2280016, "step": 3485 }, { "epoch": 1.8291404612159328, "grad_norm": 0.43835514783859253, "learning_rate": 4.896151114125823e-05, "loss": 0.4661, "num_input_tokens_seen": 2282640, "step": 3490 }, { "epoch": 1.8317610062893082, "grad_norm": 0.36317136883735657, "learning_rate": 4.895497841841533e-05, "loss": 0.4494, "num_input_tokens_seen": 2286032, "step": 3495 }, { "epoch": 1.8343815513626835, "grad_norm": 0.42648351192474365, "learning_rate": 4.8948425651140704e-05, "loss": 0.5843, "num_input_tokens_seen": 2289136, "step": 3500 }, { "epoch": 1.8370020964360587, "grad_norm": 0.5958765745162964, "learning_rate": 4.894185284491742e-05, "loss": 0.4069, "num_input_tokens_seen": 2292944, "step": 3505 }, { "epoch": 1.8396226415094339, "grad_norm": 0.7873892784118652, "learning_rate": 4.893526000524529e-05, "loss": 0.487, "num_input_tokens_seen": 2296240, "step": 3510 }, { "epoch": 1.8422431865828093, "grad_norm": 0.7525749802589417, "learning_rate": 4.892864713764091e-05, "loss": 0.4779, "num_input_tokens_seen": 2299088, "step": 3515 }, { "epoch": 1.8448637316561844, "grad_norm": 0.5394968390464783, "learning_rate": 4.892201424763762e-05, "loss": 0.5134, "num_input_tokens_seen": 2302736, "step": 3520 }, { "epoch": 1.8474842767295598, "grad_norm": 0.44719189405441284, "learning_rate": 4.891536134078553e-05, "loss": 0.583, "num_input_tokens_seen": 2306640, "step": 3525 }, { "epoch": 1.850104821802935, "grad_norm": 0.4580692648887634, "learning_rate": 4.8908688422651465e-05, "loss": 0.5552, "num_input_tokens_seen": 2309840, "step": 3530 }, { "epoch": 1.8527253668763102, "grad_norm": 0.8302388787269592, "learning_rate": 4.8901995498819044e-05, "loss": 0.5414, "num_input_tokens_seen": 2312208, "step": 3535 }, { "epoch": 1.8553459119496856, "grad_norm": 0.9673183560371399, "learning_rate": 4.8895282574888576e-05, "loss": 0.643, "num_input_tokens_seen": 2315088, "step": 3540 }, { "epoch": 1.857966457023061, "grad_norm": 0.7130209803581238, "learning_rate": 4.888854965647716e-05, "loss": 0.5247, "num_input_tokens_seen": 2318224, "step": 3545 }, { "epoch": 1.8605870020964361, "grad_norm": 0.2554667890071869, "learning_rate": 4.8881796749218564e-05, "loss": 0.4224, "num_input_tokens_seen": 2322800, "step": 3550 }, { "epoch": 1.8632075471698113, "grad_norm": 0.6043434143066406, "learning_rate": 4.8875023858763335e-05, "loss": 0.3471, "num_input_tokens_seen": 2329008, "step": 3555 }, { "epoch": 1.8658280922431865, "grad_norm": 0.6596816182136536, "learning_rate": 4.88682309907787e-05, "loss": 0.652, "num_input_tokens_seen": 2331344, "step": 3560 }, { "epoch": 1.8684486373165619, "grad_norm": 0.39816009998321533, "learning_rate": 4.886141815094863e-05, "loss": 0.4895, "num_input_tokens_seen": 2334352, "step": 3565 }, { "epoch": 1.8710691823899372, "grad_norm": 0.31219804286956787, "learning_rate": 4.88545853449738e-05, "loss": 0.5742, "num_input_tokens_seen": 2338512, "step": 3570 }, { "epoch": 1.8736897274633124, "grad_norm": 0.3925531506538391, "learning_rate": 4.8847732578571585e-05, "loss": 0.5874, "num_input_tokens_seen": 2342224, "step": 3575 }, { "epoch": 1.8763102725366876, "grad_norm": 1.0497177839279175, "learning_rate": 4.8840859857476074e-05, "loss": 0.4755, "num_input_tokens_seen": 2345808, "step": 3580 }, { "epoch": 1.8789308176100628, "grad_norm": 0.3005824685096741, "learning_rate": 4.8833967187438034e-05, "loss": 0.5652, "num_input_tokens_seen": 2348560, "step": 3585 }, { "epoch": 1.8815513626834381, "grad_norm": 0.41804859042167664, "learning_rate": 4.882705457422495e-05, "loss": 0.3955, "num_input_tokens_seen": 2351824, "step": 3590 }, { "epoch": 1.8841719077568135, "grad_norm": 0.7873826026916504, "learning_rate": 4.8820122023620975e-05, "loss": 0.3916, "num_input_tokens_seen": 2355696, "step": 3595 }, { "epoch": 1.8867924528301887, "grad_norm": 0.45030075311660767, "learning_rate": 4.881316954142694e-05, "loss": 0.4293, "num_input_tokens_seen": 2358352, "step": 3600 }, { "epoch": 1.8894129979035639, "grad_norm": 0.327679306268692, "learning_rate": 4.880619713346039e-05, "loss": 0.4318, "num_input_tokens_seen": 2361776, "step": 3605 }, { "epoch": 1.892033542976939, "grad_norm": 0.5322070121765137, "learning_rate": 4.879920480555549e-05, "loss": 0.5741, "num_input_tokens_seen": 2365264, "step": 3610 }, { "epoch": 1.8946540880503144, "grad_norm": 0.6552875638008118, "learning_rate": 4.8792192563563114e-05, "loss": 0.7196, "num_input_tokens_seen": 2368080, "step": 3615 }, { "epoch": 1.8972746331236898, "grad_norm": 0.3985002934932709, "learning_rate": 4.8785160413350797e-05, "loss": 0.5174, "num_input_tokens_seen": 2371696, "step": 3620 }, { "epoch": 1.899895178197065, "grad_norm": 0.4565732181072235, "learning_rate": 4.877810836080269e-05, "loss": 0.4807, "num_input_tokens_seen": 2374160, "step": 3625 }, { "epoch": 1.9025157232704402, "grad_norm": 0.4726003110408783, "learning_rate": 4.8771036411819656e-05, "loss": 0.4392, "num_input_tokens_seen": 2376656, "step": 3630 }, { "epoch": 1.9051362683438156, "grad_norm": 0.45254749059677124, "learning_rate": 4.876394457231917e-05, "loss": 0.4778, "num_input_tokens_seen": 2379856, "step": 3635 }, { "epoch": 1.9077568134171907, "grad_norm": 0.5409723520278931, "learning_rate": 4.875683284823537e-05, "loss": 0.5985, "num_input_tokens_seen": 2382736, "step": 3640 }, { "epoch": 1.9103773584905661, "grad_norm": 0.6566748023033142, "learning_rate": 4.8749701245519e-05, "loss": 0.4899, "num_input_tokens_seen": 2386672, "step": 3645 }, { "epoch": 1.9129979035639413, "grad_norm": 0.34518495202064514, "learning_rate": 4.874254977013747e-05, "loss": 0.4077, "num_input_tokens_seen": 2389296, "step": 3650 }, { "epoch": 1.9156184486373165, "grad_norm": 0.43102577328681946, "learning_rate": 4.8735378428074806e-05, "loss": 0.5872, "num_input_tokens_seen": 2392080, "step": 3655 }, { "epoch": 1.9182389937106918, "grad_norm": 0.2782866060733795, "learning_rate": 4.8728187225331665e-05, "loss": 0.4045, "num_input_tokens_seen": 2395952, "step": 3660 }, { "epoch": 1.9208595387840672, "grad_norm": 0.26185935735702515, "learning_rate": 4.872097616792532e-05, "loss": 0.4295, "num_input_tokens_seen": 2398512, "step": 3665 }, { "epoch": 1.9234800838574424, "grad_norm": 0.422760009765625, "learning_rate": 4.871374526188964e-05, "loss": 0.5714, "num_input_tokens_seen": 2401904, "step": 3670 }, { "epoch": 1.9261006289308176, "grad_norm": 0.34353601932525635, "learning_rate": 4.8706494513275134e-05, "loss": 0.5254, "num_input_tokens_seen": 2405584, "step": 3675 }, { "epoch": 1.9287211740041927, "grad_norm": 0.28602349758148193, "learning_rate": 4.869922392814889e-05, "loss": 0.6, "num_input_tokens_seen": 2409712, "step": 3680 }, { "epoch": 1.9313417190775681, "grad_norm": 0.37014833092689514, "learning_rate": 4.869193351259459e-05, "loss": 0.6432, "num_input_tokens_seen": 2412240, "step": 3685 }, { "epoch": 1.9339622641509435, "grad_norm": 0.3976798951625824, "learning_rate": 4.868462327271254e-05, "loss": 0.4503, "num_input_tokens_seen": 2415056, "step": 3690 }, { "epoch": 1.9365828092243187, "grad_norm": 0.765496551990509, "learning_rate": 4.86772932146196e-05, "loss": 0.4738, "num_input_tokens_seen": 2419184, "step": 3695 }, { "epoch": 1.9392033542976939, "grad_norm": 0.7306535840034485, "learning_rate": 4.866994334444923e-05, "loss": 0.5264, "num_input_tokens_seen": 2422512, "step": 3700 }, { "epoch": 1.941823899371069, "grad_norm": 0.5506483316421509, "learning_rate": 4.866257366835147e-05, "loss": 0.4786, "num_input_tokens_seen": 2425936, "step": 3705 }, { "epoch": 1.9444444444444444, "grad_norm": 0.8083242177963257, "learning_rate": 4.865518419249294e-05, "loss": 0.6165, "num_input_tokens_seen": 2428944, "step": 3710 }, { "epoch": 1.9470649895178198, "grad_norm": 0.46390262246131897, "learning_rate": 4.864777492305679e-05, "loss": 0.4062, "num_input_tokens_seen": 2432080, "step": 3715 }, { "epoch": 1.949685534591195, "grad_norm": 0.47987794876098633, "learning_rate": 4.864034586624277e-05, "loss": 0.4611, "num_input_tokens_seen": 2434864, "step": 3720 }, { "epoch": 1.9523060796645701, "grad_norm": 0.5343963503837585, "learning_rate": 4.863289702826719e-05, "loss": 0.4682, "num_input_tokens_seen": 2437104, "step": 3725 }, { "epoch": 1.9549266247379455, "grad_norm": 0.5652403235435486, "learning_rate": 4.862542841536288e-05, "loss": 0.6668, "num_input_tokens_seen": 2439792, "step": 3730 }, { "epoch": 1.9575471698113207, "grad_norm": 0.32877078652381897, "learning_rate": 4.861794003377923e-05, "loss": 0.6471, "num_input_tokens_seen": 2442320, "step": 3735 }, { "epoch": 1.960167714884696, "grad_norm": 0.41446757316589355, "learning_rate": 4.8610431889782195e-05, "loss": 0.4325, "num_input_tokens_seen": 2445136, "step": 3740 }, { "epoch": 1.9627882599580713, "grad_norm": 0.35702189803123474, "learning_rate": 4.8602903989654224e-05, "loss": 0.5029, "num_input_tokens_seen": 2448080, "step": 3745 }, { "epoch": 1.9654088050314464, "grad_norm": 0.392965704202652, "learning_rate": 4.859535633969434e-05, "loss": 0.5275, "num_input_tokens_seen": 2451376, "step": 3750 }, { "epoch": 1.9680293501048218, "grad_norm": 0.3766601085662842, "learning_rate": 4.858778894621807e-05, "loss": 0.5968, "num_input_tokens_seen": 2454672, "step": 3755 }, { "epoch": 1.9706498951781972, "grad_norm": 0.5450807809829712, "learning_rate": 4.858020181555745e-05, "loss": 0.4594, "num_input_tokens_seen": 2457712, "step": 3760 }, { "epoch": 1.9732704402515724, "grad_norm": 0.44137635827064514, "learning_rate": 4.857259495406105e-05, "loss": 0.4888, "num_input_tokens_seen": 2461936, "step": 3765 }, { "epoch": 1.9758909853249476, "grad_norm": 0.39887404441833496, "learning_rate": 4.856496836809394e-05, "loss": 0.4351, "num_input_tokens_seen": 2465296, "step": 3770 }, { "epoch": 1.9785115303983227, "grad_norm": 0.5767850875854492, "learning_rate": 4.8557322064037714e-05, "loss": 0.5025, "num_input_tokens_seen": 2469872, "step": 3775 }, { "epoch": 1.9811320754716981, "grad_norm": 0.5210093259811401, "learning_rate": 4.854965604829044e-05, "loss": 0.388, "num_input_tokens_seen": 2473840, "step": 3780 }, { "epoch": 1.9837526205450735, "grad_norm": 0.5791919827461243, "learning_rate": 4.8541970327266685e-05, "loss": 0.5184, "num_input_tokens_seen": 2476944, "step": 3785 }, { "epoch": 1.9863731656184487, "grad_norm": 0.4172225892543793, "learning_rate": 4.853426490739751e-05, "loss": 0.4951, "num_input_tokens_seen": 2479792, "step": 3790 }, { "epoch": 1.9889937106918238, "grad_norm": 0.5181266665458679, "learning_rate": 4.852653979513047e-05, "loss": 0.6096, "num_input_tokens_seen": 2484464, "step": 3795 }, { "epoch": 1.991614255765199, "grad_norm": 0.6160832643508911, "learning_rate": 4.851879499692958e-05, "loss": 0.4629, "num_input_tokens_seen": 2487312, "step": 3800 }, { "epoch": 1.9942348008385744, "grad_norm": 0.49809592962265015, "learning_rate": 4.851103051927532e-05, "loss": 0.4901, "num_input_tokens_seen": 2490288, "step": 3805 }, { "epoch": 1.9968553459119498, "grad_norm": 0.5169609785079956, "learning_rate": 4.850324636866468e-05, "loss": 0.5241, "num_input_tokens_seen": 2494128, "step": 3810 }, { "epoch": 1.999475890985325, "grad_norm": 0.3992151916027069, "learning_rate": 4.849544255161106e-05, "loss": 0.4717, "num_input_tokens_seen": 2496848, "step": 3815 }, { "epoch": 2.0, "eval_loss": 0.5127993226051331, "eval_runtime": 14.5394, "eval_samples_per_second": 58.324, "eval_steps_per_second": 14.581, "num_input_tokens_seen": 2497016, "step": 3816 }, { "epoch": 2.0020964360587, "grad_norm": 0.46166089177131653, "learning_rate": 4.848761907464433e-05, "loss": 0.3994, "num_input_tokens_seen": 2499256, "step": 3820 }, { "epoch": 2.0047169811320753, "grad_norm": 0.5400064587593079, "learning_rate": 4.847977594431084e-05, "loss": 0.4952, "num_input_tokens_seen": 2502200, "step": 3825 }, { "epoch": 2.007337526205451, "grad_norm": 0.6577861309051514, "learning_rate": 4.847191316717335e-05, "loss": 0.6588, "num_input_tokens_seen": 2505080, "step": 3830 }, { "epoch": 2.009958071278826, "grad_norm": 0.5703439712524414, "learning_rate": 4.846403074981107e-05, "loss": 0.513, "num_input_tokens_seen": 2508216, "step": 3835 }, { "epoch": 2.0125786163522013, "grad_norm": 0.37165528535842896, "learning_rate": 4.845612869881967e-05, "loss": 0.3745, "num_input_tokens_seen": 2511128, "step": 3840 }, { "epoch": 2.0151991614255764, "grad_norm": 0.4487060308456421, "learning_rate": 4.8448207020811194e-05, "loss": 0.5106, "num_input_tokens_seen": 2514776, "step": 3845 }, { "epoch": 2.0178197064989516, "grad_norm": 0.770725667476654, "learning_rate": 4.8440265722414155e-05, "loss": 0.6492, "num_input_tokens_seen": 2517528, "step": 3850 }, { "epoch": 2.020440251572327, "grad_norm": 0.2859501540660858, "learning_rate": 4.843230481027347e-05, "loss": 0.5493, "num_input_tokens_seen": 2523032, "step": 3855 }, { "epoch": 2.0230607966457024, "grad_norm": 0.28220227360725403, "learning_rate": 4.8424324291050464e-05, "loss": 0.4783, "num_input_tokens_seen": 2526200, "step": 3860 }, { "epoch": 2.0256813417190775, "grad_norm": 0.5270119905471802, "learning_rate": 4.841632417142287e-05, "loss": 0.5932, "num_input_tokens_seen": 2532760, "step": 3865 }, { "epoch": 2.0283018867924527, "grad_norm": 0.32782313227653503, "learning_rate": 4.840830445808483e-05, "loss": 0.4216, "num_input_tokens_seen": 2535640, "step": 3870 }, { "epoch": 2.030922431865828, "grad_norm": 0.31158366799354553, "learning_rate": 4.840026515774686e-05, "loss": 0.4767, "num_input_tokens_seen": 2539288, "step": 3875 }, { "epoch": 2.0335429769392035, "grad_norm": 0.8265517950057983, "learning_rate": 4.8392206277135896e-05, "loss": 0.5774, "num_input_tokens_seen": 2541880, "step": 3880 }, { "epoch": 2.0361635220125787, "grad_norm": 0.39637336134910583, "learning_rate": 4.8384127822995227e-05, "loss": 0.4884, "num_input_tokens_seen": 2545080, "step": 3885 }, { "epoch": 2.038784067085954, "grad_norm": 0.3770711123943329, "learning_rate": 4.8376029802084546e-05, "loss": 0.3766, "num_input_tokens_seen": 2548376, "step": 3890 }, { "epoch": 2.041404612159329, "grad_norm": 0.460449755191803, "learning_rate": 4.836791222117989e-05, "loss": 0.4366, "num_input_tokens_seen": 2550616, "step": 3895 }, { "epoch": 2.0440251572327046, "grad_norm": 0.38252392411231995, "learning_rate": 4.83597750870737e-05, "loss": 0.594, "num_input_tokens_seen": 2555000, "step": 3900 }, { "epoch": 2.04664570230608, "grad_norm": 0.6079077124595642, "learning_rate": 4.8351618406574746e-05, "loss": 0.5099, "num_input_tokens_seen": 2558328, "step": 3905 }, { "epoch": 2.049266247379455, "grad_norm": 0.37456175684928894, "learning_rate": 4.834344218650817e-05, "loss": 0.5838, "num_input_tokens_seen": 2561592, "step": 3910 }, { "epoch": 2.05188679245283, "grad_norm": 0.3673248291015625, "learning_rate": 4.833524643371545e-05, "loss": 0.363, "num_input_tokens_seen": 2564472, "step": 3915 }, { "epoch": 2.0545073375262053, "grad_norm": 0.7165189385414124, "learning_rate": 4.8327031155054434e-05, "loss": 0.4711, "num_input_tokens_seen": 2567352, "step": 3920 }, { "epoch": 2.057127882599581, "grad_norm": 0.32098016142845154, "learning_rate": 4.831879635739929e-05, "loss": 0.3709, "num_input_tokens_seen": 2571448, "step": 3925 }, { "epoch": 2.059748427672956, "grad_norm": 0.5695472955703735, "learning_rate": 4.83105420476405e-05, "loss": 0.4224, "num_input_tokens_seen": 2574264, "step": 3930 }, { "epoch": 2.0623689727463312, "grad_norm": 0.5085157752037048, "learning_rate": 4.830226823268491e-05, "loss": 0.6144, "num_input_tokens_seen": 2577592, "step": 3935 }, { "epoch": 2.0649895178197064, "grad_norm": 0.6330558657646179, "learning_rate": 4.829397491945568e-05, "loss": 0.5145, "num_input_tokens_seen": 2581624, "step": 3940 }, { "epoch": 2.0676100628930816, "grad_norm": 0.5599704384803772, "learning_rate": 4.828566211489225e-05, "loss": 0.5131, "num_input_tokens_seen": 2584888, "step": 3945 }, { "epoch": 2.070230607966457, "grad_norm": 0.7744221687316895, "learning_rate": 4.827732982595041e-05, "loss": 0.5077, "num_input_tokens_seen": 2587672, "step": 3950 }, { "epoch": 2.0728511530398324, "grad_norm": 0.4454687237739563, "learning_rate": 4.826897805960224e-05, "loss": 0.482, "num_input_tokens_seen": 2590136, "step": 3955 }, { "epoch": 2.0754716981132075, "grad_norm": 0.43887943029403687, "learning_rate": 4.8260606822836116e-05, "loss": 0.4828, "num_input_tokens_seen": 2593976, "step": 3960 }, { "epoch": 2.0780922431865827, "grad_norm": 0.5858749747276306, "learning_rate": 4.8252216122656716e-05, "loss": 0.9978, "num_input_tokens_seen": 2597208, "step": 3965 }, { "epoch": 2.080712788259958, "grad_norm": 0.535872220993042, "learning_rate": 4.824380596608497e-05, "loss": 0.5134, "num_input_tokens_seen": 2600536, "step": 3970 }, { "epoch": 2.0833333333333335, "grad_norm": 0.41468942165374756, "learning_rate": 4.823537636015812e-05, "loss": 0.4269, "num_input_tokens_seen": 2603608, "step": 3975 }, { "epoch": 2.0859538784067087, "grad_norm": 0.5533286929130554, "learning_rate": 4.822692731192969e-05, "loss": 0.5043, "num_input_tokens_seen": 2607288, "step": 3980 }, { "epoch": 2.088574423480084, "grad_norm": 0.40902745723724365, "learning_rate": 4.8218458828469445e-05, "loss": 0.6055, "num_input_tokens_seen": 2610392, "step": 3985 }, { "epoch": 2.091194968553459, "grad_norm": 0.3874704837799072, "learning_rate": 4.820997091686343e-05, "loss": 0.6318, "num_input_tokens_seen": 2613208, "step": 3990 }, { "epoch": 2.0938155136268346, "grad_norm": 0.3823348581790924, "learning_rate": 4.8201463584213946e-05, "loss": 0.6275, "num_input_tokens_seen": 2616472, "step": 3995 }, { "epoch": 2.0964360587002098, "grad_norm": 0.3699704110622406, "learning_rate": 4.819293683763954e-05, "loss": 0.5012, "num_input_tokens_seen": 2620504, "step": 4000 }, { "epoch": 2.099056603773585, "grad_norm": 0.44042935967445374, "learning_rate": 4.818439068427498e-05, "loss": 0.5506, "num_input_tokens_seen": 2624632, "step": 4005 }, { "epoch": 2.10167714884696, "grad_norm": 0.5852722525596619, "learning_rate": 4.817582513127133e-05, "loss": 0.4964, "num_input_tokens_seen": 2627608, "step": 4010 }, { "epoch": 2.1042976939203353, "grad_norm": 0.25581279397010803, "learning_rate": 4.8167240185795835e-05, "loss": 0.4341, "num_input_tokens_seen": 2630232, "step": 4015 }, { "epoch": 2.106918238993711, "grad_norm": 0.32503095269203186, "learning_rate": 4.8158635855032e-05, "loss": 0.4353, "num_input_tokens_seen": 2633624, "step": 4020 }, { "epoch": 2.109538784067086, "grad_norm": 1.1699451208114624, "learning_rate": 4.8150012146179514e-05, "loss": 0.5357, "num_input_tokens_seen": 2636280, "step": 4025 }, { "epoch": 2.1121593291404612, "grad_norm": 0.4686707556247711, "learning_rate": 4.814136906645431e-05, "loss": 0.5325, "num_input_tokens_seen": 2640152, "step": 4030 }, { "epoch": 2.1147798742138364, "grad_norm": 0.28176456689834595, "learning_rate": 4.813270662308854e-05, "loss": 0.4951, "num_input_tokens_seen": 2643800, "step": 4035 }, { "epoch": 2.1174004192872116, "grad_norm": 0.4197565019130707, "learning_rate": 4.812402482333052e-05, "loss": 0.4598, "num_input_tokens_seen": 2647608, "step": 4040 }, { "epoch": 2.120020964360587, "grad_norm": 0.4379793405532837, "learning_rate": 4.811532367444479e-05, "loss": 0.5819, "num_input_tokens_seen": 2650200, "step": 4045 }, { "epoch": 2.1226415094339623, "grad_norm": 0.3620466887950897, "learning_rate": 4.810660318371208e-05, "loss": 0.6186, "num_input_tokens_seen": 2653784, "step": 4050 }, { "epoch": 2.1252620545073375, "grad_norm": 0.23785912990570068, "learning_rate": 4.809786335842929e-05, "loss": 0.5146, "num_input_tokens_seen": 2658104, "step": 4055 }, { "epoch": 2.1278825995807127, "grad_norm": 0.6000411510467529, "learning_rate": 4.8089104205909506e-05, "loss": 0.4693, "num_input_tokens_seen": 2661080, "step": 4060 }, { "epoch": 2.130503144654088, "grad_norm": 0.3746677041053772, "learning_rate": 4.8080325733482004e-05, "loss": 0.4216, "num_input_tokens_seen": 2664312, "step": 4065 }, { "epoch": 2.1331236897274635, "grad_norm": 1.017557144165039, "learning_rate": 4.8071527948492176e-05, "loss": 0.5719, "num_input_tokens_seen": 2667704, "step": 4070 }, { "epoch": 2.1357442348008386, "grad_norm": 0.3773357570171356, "learning_rate": 4.806271085830164e-05, "loss": 0.4065, "num_input_tokens_seen": 2671160, "step": 4075 }, { "epoch": 2.138364779874214, "grad_norm": 0.29770728945732117, "learning_rate": 4.805387447028812e-05, "loss": 0.5371, "num_input_tokens_seen": 2675544, "step": 4080 }, { "epoch": 2.140985324947589, "grad_norm": 0.5880931615829468, "learning_rate": 4.80450187918455e-05, "loss": 0.4829, "num_input_tokens_seen": 2678264, "step": 4085 }, { "epoch": 2.1436058700209646, "grad_norm": 0.5239116549491882, "learning_rate": 4.8036143830383807e-05, "loss": 0.4391, "num_input_tokens_seen": 2684120, "step": 4090 }, { "epoch": 2.1462264150943398, "grad_norm": 0.8837340474128723, "learning_rate": 4.8027249593329206e-05, "loss": 0.6715, "num_input_tokens_seen": 2686872, "step": 4095 }, { "epoch": 2.148846960167715, "grad_norm": 0.3671552538871765, "learning_rate": 4.8018336088123986e-05, "loss": 0.4487, "num_input_tokens_seen": 2690456, "step": 4100 }, { "epoch": 2.15146750524109, "grad_norm": 0.3937775194644928, "learning_rate": 4.800940332222656e-05, "loss": 0.5043, "num_input_tokens_seen": 2692824, "step": 4105 }, { "epoch": 2.1540880503144653, "grad_norm": 0.514653742313385, "learning_rate": 4.8000451303111474e-05, "loss": 0.5986, "num_input_tokens_seen": 2695704, "step": 4110 }, { "epoch": 2.156708595387841, "grad_norm": 0.3377317190170288, "learning_rate": 4.799148003826936e-05, "loss": 0.4755, "num_input_tokens_seen": 2699480, "step": 4115 }, { "epoch": 2.159329140461216, "grad_norm": 0.44766688346862793, "learning_rate": 4.798248953520694e-05, "loss": 0.5587, "num_input_tokens_seen": 2703960, "step": 4120 }, { "epoch": 2.161949685534591, "grad_norm": 0.4245861768722534, "learning_rate": 4.7973479801447084e-05, "loss": 0.4613, "num_input_tokens_seen": 2706552, "step": 4125 }, { "epoch": 2.1645702306079664, "grad_norm": 0.3838205933570862, "learning_rate": 4.796445084452871e-05, "loss": 0.5489, "num_input_tokens_seen": 2709368, "step": 4130 }, { "epoch": 2.1671907756813416, "grad_norm": 0.5607044100761414, "learning_rate": 4.7955402672006854e-05, "loss": 0.5484, "num_input_tokens_seen": 2712248, "step": 4135 }, { "epoch": 2.169811320754717, "grad_norm": 0.42286592721939087, "learning_rate": 4.794633529145259e-05, "loss": 0.4753, "num_input_tokens_seen": 2715448, "step": 4140 }, { "epoch": 2.1724318658280923, "grad_norm": 0.331548810005188, "learning_rate": 4.793724871045312e-05, "loss": 0.3915, "num_input_tokens_seen": 2719064, "step": 4145 }, { "epoch": 2.1750524109014675, "grad_norm": 0.314654141664505, "learning_rate": 4.792814293661164e-05, "loss": 0.4542, "num_input_tokens_seen": 2722488, "step": 4150 }, { "epoch": 2.1776729559748427, "grad_norm": 0.5784279704093933, "learning_rate": 4.791901797754748e-05, "loss": 0.4111, "num_input_tokens_seen": 2725528, "step": 4155 }, { "epoch": 2.180293501048218, "grad_norm": 0.371193528175354, "learning_rate": 4.790987384089597e-05, "loss": 0.4312, "num_input_tokens_seen": 2728824, "step": 4160 }, { "epoch": 2.1829140461215935, "grad_norm": 0.3244892656803131, "learning_rate": 4.790071053430851e-05, "loss": 0.6132, "num_input_tokens_seen": 2732472, "step": 4165 }, { "epoch": 2.1855345911949686, "grad_norm": 0.28266042470932007, "learning_rate": 4.7891528065452544e-05, "loss": 0.5015, "num_input_tokens_seen": 2735544, "step": 4170 }, { "epoch": 2.188155136268344, "grad_norm": 0.3169524371623993, "learning_rate": 4.788232644201153e-05, "loss": 0.5086, "num_input_tokens_seen": 2739032, "step": 4175 }, { "epoch": 2.190775681341719, "grad_norm": 0.5029659271240234, "learning_rate": 4.787310567168498e-05, "loss": 0.6311, "num_input_tokens_seen": 2742456, "step": 4180 }, { "epoch": 2.1933962264150946, "grad_norm": 0.9073389172554016, "learning_rate": 4.78638657621884e-05, "loss": 0.4395, "num_input_tokens_seen": 2744888, "step": 4185 }, { "epoch": 2.1960167714884697, "grad_norm": 0.5226964950561523, "learning_rate": 4.785460672125332e-05, "loss": 0.598, "num_input_tokens_seen": 2747512, "step": 4190 }, { "epoch": 2.198637316561845, "grad_norm": 0.4810740649700165, "learning_rate": 4.7845328556627306e-05, "loss": 0.5197, "num_input_tokens_seen": 2750648, "step": 4195 }, { "epoch": 2.20125786163522, "grad_norm": 0.421490341424942, "learning_rate": 4.783603127607388e-05, "loss": 0.3961, "num_input_tokens_seen": 2753368, "step": 4200 }, { "epoch": 2.2038784067085953, "grad_norm": 0.41319340467453003, "learning_rate": 4.78267148873726e-05, "loss": 0.4565, "num_input_tokens_seen": 2756696, "step": 4205 }, { "epoch": 2.2064989517819704, "grad_norm": 0.7076919674873352, "learning_rate": 4.781737939831898e-05, "loss": 0.5229, "num_input_tokens_seen": 2759256, "step": 4210 }, { "epoch": 2.209119496855346, "grad_norm": 0.5779746174812317, "learning_rate": 4.7808024816724536e-05, "loss": 0.5406, "num_input_tokens_seen": 2762296, "step": 4215 }, { "epoch": 2.211740041928721, "grad_norm": 0.6683434844017029, "learning_rate": 4.7798651150416754e-05, "loss": 0.4688, "num_input_tokens_seen": 2765400, "step": 4220 }, { "epoch": 2.2143605870020964, "grad_norm": 1.168829321861267, "learning_rate": 4.778925840723909e-05, "loss": 0.4867, "num_input_tokens_seen": 2768120, "step": 4225 }, { "epoch": 2.2169811320754715, "grad_norm": 0.3993188440799713, "learning_rate": 4.777984659505096e-05, "loss": 0.5284, "num_input_tokens_seen": 2771096, "step": 4230 }, { "epoch": 2.219601677148847, "grad_norm": 0.5641477108001709, "learning_rate": 4.777041572172774e-05, "loss": 0.5698, "num_input_tokens_seen": 2774328, "step": 4235 }, { "epoch": 2.2222222222222223, "grad_norm": 0.3566245138645172, "learning_rate": 4.776096579516076e-05, "loss": 0.5552, "num_input_tokens_seen": 2778008, "step": 4240 }, { "epoch": 2.2248427672955975, "grad_norm": 2.145235300064087, "learning_rate": 4.775149682325728e-05, "loss": 0.5471, "num_input_tokens_seen": 2780216, "step": 4245 }, { "epoch": 2.2274633123689727, "grad_norm": 0.39947614073753357, "learning_rate": 4.77420088139405e-05, "loss": 0.4818, "num_input_tokens_seen": 2784088, "step": 4250 }, { "epoch": 2.230083857442348, "grad_norm": 0.3328837752342224, "learning_rate": 4.7732501775149564e-05, "loss": 0.4106, "num_input_tokens_seen": 2786488, "step": 4255 }, { "epoch": 2.2327044025157234, "grad_norm": 0.5231049656867981, "learning_rate": 4.7722975714839526e-05, "loss": 0.4201, "num_input_tokens_seen": 2788920, "step": 4260 }, { "epoch": 2.2353249475890986, "grad_norm": 0.4870285987854004, "learning_rate": 4.7713430640981346e-05, "loss": 0.6668, "num_input_tokens_seen": 2791640, "step": 4265 }, { "epoch": 2.237945492662474, "grad_norm": 0.45645871758461, "learning_rate": 4.7703866561561915e-05, "loss": 0.4045, "num_input_tokens_seen": 2794904, "step": 4270 }, { "epoch": 2.240566037735849, "grad_norm": 0.39592882990837097, "learning_rate": 4.769428348458402e-05, "loss": 0.4228, "num_input_tokens_seen": 2798840, "step": 4275 }, { "epoch": 2.243186582809224, "grad_norm": 0.39523768424987793, "learning_rate": 4.7684681418066334e-05, "loss": 0.5461, "num_input_tokens_seen": 2802616, "step": 4280 }, { "epoch": 2.2458071278825997, "grad_norm": 0.5710734724998474, "learning_rate": 4.767506037004344e-05, "loss": 0.4736, "num_input_tokens_seen": 2805432, "step": 4285 }, { "epoch": 2.248427672955975, "grad_norm": 0.7520540356636047, "learning_rate": 4.766542034856577e-05, "loss": 0.5302, "num_input_tokens_seen": 2808696, "step": 4290 }, { "epoch": 2.25104821802935, "grad_norm": 0.35064697265625, "learning_rate": 4.7655761361699676e-05, "loss": 0.4361, "num_input_tokens_seen": 2811640, "step": 4295 }, { "epoch": 2.2536687631027252, "grad_norm": 0.45956286787986755, "learning_rate": 4.7646083417527345e-05, "loss": 0.4542, "num_input_tokens_seen": 2815224, "step": 4300 }, { "epoch": 2.2562893081761004, "grad_norm": 0.4796851873397827, "learning_rate": 4.7636386524146846e-05, "loss": 0.4994, "num_input_tokens_seen": 2817688, "step": 4305 }, { "epoch": 2.258909853249476, "grad_norm": 0.3109770119190216, "learning_rate": 4.7626670689672095e-05, "loss": 0.4143, "num_input_tokens_seen": 2821016, "step": 4310 }, { "epoch": 2.261530398322851, "grad_norm": 0.44280534982681274, "learning_rate": 4.761693592223285e-05, "loss": 0.5856, "num_input_tokens_seen": 2823608, "step": 4315 }, { "epoch": 2.2641509433962264, "grad_norm": 0.41485968232154846, "learning_rate": 4.760718222997472e-05, "loss": 0.4283, "num_input_tokens_seen": 2828248, "step": 4320 }, { "epoch": 2.2667714884696015, "grad_norm": 0.4806077182292938, "learning_rate": 4.7597409621059164e-05, "loss": 0.5178, "num_input_tokens_seen": 2832344, "step": 4325 }, { "epoch": 2.269392033542977, "grad_norm": 0.4998723268508911, "learning_rate": 4.7587618103663444e-05, "loss": 0.4446, "num_input_tokens_seen": 2836408, "step": 4330 }, { "epoch": 2.2720125786163523, "grad_norm": 0.3099648058414459, "learning_rate": 4.757780768598066e-05, "loss": 0.5139, "num_input_tokens_seen": 2839320, "step": 4335 }, { "epoch": 2.2746331236897275, "grad_norm": 0.8045790195465088, "learning_rate": 4.756797837621971e-05, "loss": 0.5678, "num_input_tokens_seen": 2842456, "step": 4340 }, { "epoch": 2.2772536687631026, "grad_norm": 0.7584251165390015, "learning_rate": 4.755813018260532e-05, "loss": 0.644, "num_input_tokens_seen": 2845432, "step": 4345 }, { "epoch": 2.279874213836478, "grad_norm": 0.34374645352363586, "learning_rate": 4.754826311337801e-05, "loss": 0.5408, "num_input_tokens_seen": 2848536, "step": 4350 }, { "epoch": 2.2824947589098534, "grad_norm": 0.36587047576904297, "learning_rate": 4.753837717679409e-05, "loss": 0.5695, "num_input_tokens_seen": 2851544, "step": 4355 }, { "epoch": 2.2851153039832286, "grad_norm": 0.4165107309818268, "learning_rate": 4.7528472381125653e-05, "loss": 0.4783, "num_input_tokens_seen": 2854392, "step": 4360 }, { "epoch": 2.2877358490566038, "grad_norm": 0.2761901319026947, "learning_rate": 4.75185487346606e-05, "loss": 0.4274, "num_input_tokens_seen": 2858424, "step": 4365 }, { "epoch": 2.290356394129979, "grad_norm": 0.3858884572982788, "learning_rate": 4.750860624570256e-05, "loss": 0.5171, "num_input_tokens_seen": 2861368, "step": 4370 }, { "epoch": 2.2929769392033545, "grad_norm": 0.703233540058136, "learning_rate": 4.7498644922570966e-05, "loss": 0.5288, "num_input_tokens_seen": 2863960, "step": 4375 }, { "epoch": 2.2955974842767297, "grad_norm": 0.8157623410224915, "learning_rate": 4.7488664773601004e-05, "loss": 0.5726, "num_input_tokens_seen": 2867352, "step": 4380 }, { "epoch": 2.298218029350105, "grad_norm": 0.6271132826805115, "learning_rate": 4.7478665807143605e-05, "loss": 0.5469, "num_input_tokens_seen": 2871992, "step": 4385 }, { "epoch": 2.30083857442348, "grad_norm": 0.4167092740535736, "learning_rate": 4.7468648031565434e-05, "loss": 0.4482, "num_input_tokens_seen": 2874680, "step": 4390 }, { "epoch": 2.3034591194968552, "grad_norm": 0.8266074657440186, "learning_rate": 4.745861145524892e-05, "loss": 0.3686, "num_input_tokens_seen": 2877688, "step": 4395 }, { "epoch": 2.3060796645702304, "grad_norm": 0.39731645584106445, "learning_rate": 4.74485560865922e-05, "loss": 0.568, "num_input_tokens_seen": 2885528, "step": 4400 }, { "epoch": 2.308700209643606, "grad_norm": 0.34674885869026184, "learning_rate": 4.743848193400917e-05, "loss": 0.6136, "num_input_tokens_seen": 2889464, "step": 4405 }, { "epoch": 2.311320754716981, "grad_norm": 0.49499404430389404, "learning_rate": 4.7428389005929405e-05, "loss": 0.3904, "num_input_tokens_seen": 2893464, "step": 4410 }, { "epoch": 2.3139412997903563, "grad_norm": 0.5014910101890564, "learning_rate": 4.74182773107982e-05, "loss": 0.5378, "num_input_tokens_seen": 2896792, "step": 4415 }, { "epoch": 2.3165618448637315, "grad_norm": 0.597936749458313, "learning_rate": 4.7408146857076566e-05, "loss": 0.4951, "num_input_tokens_seen": 2900664, "step": 4420 }, { "epoch": 2.319182389937107, "grad_norm": 0.5240599513053894, "learning_rate": 4.739799765324121e-05, "loss": 0.5542, "num_input_tokens_seen": 2904152, "step": 4425 }, { "epoch": 2.3218029350104823, "grad_norm": 0.3808321952819824, "learning_rate": 4.738782970778452e-05, "loss": 0.505, "num_input_tokens_seen": 2906776, "step": 4430 }, { "epoch": 2.3244234800838575, "grad_norm": 0.3908189535140991, "learning_rate": 4.737764302921456e-05, "loss": 0.5219, "num_input_tokens_seen": 2909720, "step": 4435 }, { "epoch": 2.3270440251572326, "grad_norm": 0.6814975142478943, "learning_rate": 4.7367437626055087e-05, "loss": 0.3576, "num_input_tokens_seen": 2916536, "step": 4440 }, { "epoch": 2.329664570230608, "grad_norm": 0.5492028594017029, "learning_rate": 4.735721350684551e-05, "loss": 0.5569, "num_input_tokens_seen": 2919608, "step": 4445 }, { "epoch": 2.3322851153039834, "grad_norm": 0.3398478627204895, "learning_rate": 4.734697068014091e-05, "loss": 0.632, "num_input_tokens_seen": 2923352, "step": 4450 }, { "epoch": 2.3349056603773586, "grad_norm": 0.5199428796768188, "learning_rate": 4.733670915451202e-05, "loss": 0.4871, "num_input_tokens_seen": 2926232, "step": 4455 }, { "epoch": 2.3375262054507338, "grad_norm": 0.5563454627990723, "learning_rate": 4.732642893854519e-05, "loss": 0.4315, "num_input_tokens_seen": 2929656, "step": 4460 }, { "epoch": 2.340146750524109, "grad_norm": 0.5584842562675476, "learning_rate": 4.7316130040842466e-05, "loss": 0.5667, "num_input_tokens_seen": 2932344, "step": 4465 }, { "epoch": 2.342767295597484, "grad_norm": 0.2902621030807495, "learning_rate": 4.730581247002148e-05, "loss": 0.4129, "num_input_tokens_seen": 2936152, "step": 4470 }, { "epoch": 2.3453878406708597, "grad_norm": 0.4541375935077667, "learning_rate": 4.7295476234715516e-05, "loss": 0.4525, "num_input_tokens_seen": 2939224, "step": 4475 }, { "epoch": 2.348008385744235, "grad_norm": 0.2919915020465851, "learning_rate": 4.728512134357345e-05, "loss": 0.4107, "num_input_tokens_seen": 2941912, "step": 4480 }, { "epoch": 2.35062893081761, "grad_norm": 0.30535417795181274, "learning_rate": 4.727474780525979e-05, "loss": 0.423, "num_input_tokens_seen": 2944568, "step": 4485 }, { "epoch": 2.353249475890985, "grad_norm": 0.38074395060539246, "learning_rate": 4.7264355628454636e-05, "loss": 0.403, "num_input_tokens_seen": 2947480, "step": 4490 }, { "epoch": 2.3558700209643604, "grad_norm": 0.5086488723754883, "learning_rate": 4.7253944821853685e-05, "loss": 0.5351, "num_input_tokens_seen": 2951192, "step": 4495 }, { "epoch": 2.358490566037736, "grad_norm": 0.7418559193611145, "learning_rate": 4.724351539416822e-05, "loss": 0.5099, "num_input_tokens_seen": 2954648, "step": 4500 }, { "epoch": 2.361111111111111, "grad_norm": 0.30166012048721313, "learning_rate": 4.7233067354125125e-05, "loss": 0.4255, "num_input_tokens_seen": 2958552, "step": 4505 }, { "epoch": 2.3637316561844863, "grad_norm": 0.38315266370773315, "learning_rate": 4.722260071046683e-05, "loss": 0.4462, "num_input_tokens_seen": 2962040, "step": 4510 }, { "epoch": 2.3663522012578615, "grad_norm": 0.45656445622444153, "learning_rate": 4.721211547195136e-05, "loss": 0.4735, "num_input_tokens_seen": 2964824, "step": 4515 }, { "epoch": 2.368972746331237, "grad_norm": 0.32834097743034363, "learning_rate": 4.7201611647352264e-05, "loss": 0.4534, "num_input_tokens_seen": 2967448, "step": 4520 }, { "epoch": 2.3715932914046123, "grad_norm": 0.480000376701355, "learning_rate": 4.719108924545866e-05, "loss": 0.5795, "num_input_tokens_seen": 2971000, "step": 4525 }, { "epoch": 2.3742138364779874, "grad_norm": 0.6955885887145996, "learning_rate": 4.718054827507524e-05, "loss": 0.5042, "num_input_tokens_seen": 2973592, "step": 4530 }, { "epoch": 2.3768343815513626, "grad_norm": 0.35563862323760986, "learning_rate": 4.716998874502218e-05, "loss": 0.5678, "num_input_tokens_seen": 2977560, "step": 4535 }, { "epoch": 2.379454926624738, "grad_norm": 0.43254366517066956, "learning_rate": 4.7159410664135225e-05, "loss": 0.4862, "num_input_tokens_seen": 2980280, "step": 4540 }, { "epoch": 2.3820754716981134, "grad_norm": 0.5416499376296997, "learning_rate": 4.714881404126563e-05, "loss": 0.4719, "num_input_tokens_seen": 2983352, "step": 4545 }, { "epoch": 2.3846960167714886, "grad_norm": 0.4983150064945221, "learning_rate": 4.713819888528016e-05, "loss": 0.5386, "num_input_tokens_seen": 2986648, "step": 4550 }, { "epoch": 2.3873165618448637, "grad_norm": 0.7617013454437256, "learning_rate": 4.7127565205061096e-05, "loss": 0.4785, "num_input_tokens_seen": 2989880, "step": 4555 }, { "epoch": 2.389937106918239, "grad_norm": 0.44090136885643005, "learning_rate": 4.711691300950622e-05, "loss": 0.4593, "num_input_tokens_seen": 2992536, "step": 4560 }, { "epoch": 2.392557651991614, "grad_norm": 1.1041603088378906, "learning_rate": 4.710624230752879e-05, "loss": 0.5129, "num_input_tokens_seen": 2996152, "step": 4565 }, { "epoch": 2.3951781970649897, "grad_norm": 1.0771291255950928, "learning_rate": 4.709555310805758e-05, "loss": 0.4611, "num_input_tokens_seen": 2999288, "step": 4570 }, { "epoch": 2.397798742138365, "grad_norm": 0.5470988154411316, "learning_rate": 4.7084845420036805e-05, "loss": 0.3814, "num_input_tokens_seen": 3002520, "step": 4575 }, { "epoch": 2.40041928721174, "grad_norm": 0.4215276539325714, "learning_rate": 4.7074119252426175e-05, "loss": 0.5688, "num_input_tokens_seen": 3005752, "step": 4580 }, { "epoch": 2.403039832285115, "grad_norm": 0.51725834608078, "learning_rate": 4.7063374614200866e-05, "loss": 0.5068, "num_input_tokens_seen": 3009432, "step": 4585 }, { "epoch": 2.4056603773584904, "grad_norm": 0.3772942125797272, "learning_rate": 4.7052611514351495e-05, "loss": 0.5204, "num_input_tokens_seen": 3014136, "step": 4590 }, { "epoch": 2.408280922431866, "grad_norm": 0.42544734477996826, "learning_rate": 4.704182996188413e-05, "loss": 0.5333, "num_input_tokens_seen": 3017144, "step": 4595 }, { "epoch": 2.410901467505241, "grad_norm": 5.630637168884277, "learning_rate": 4.703102996582028e-05, "loss": 0.5759, "num_input_tokens_seen": 3019448, "step": 4600 }, { "epoch": 2.4135220125786163, "grad_norm": 0.46727314591407776, "learning_rate": 4.70202115351969e-05, "loss": 0.3901, "num_input_tokens_seen": 3022104, "step": 4605 }, { "epoch": 2.4161425576519915, "grad_norm": 0.41766786575317383, "learning_rate": 4.700937467906634e-05, "loss": 0.5421, "num_input_tokens_seen": 3025688, "step": 4610 }, { "epoch": 2.418763102725367, "grad_norm": 0.36204347014427185, "learning_rate": 4.69985194064964e-05, "loss": 0.3621, "num_input_tokens_seen": 3029816, "step": 4615 }, { "epoch": 2.4213836477987423, "grad_norm": 0.3046499490737915, "learning_rate": 4.698764572657029e-05, "loss": 0.3506, "num_input_tokens_seen": 3032824, "step": 4620 }, { "epoch": 2.4240041928721174, "grad_norm": 0.4489087760448456, "learning_rate": 4.697675364838657e-05, "loss": 0.6172, "num_input_tokens_seen": 3036632, "step": 4625 }, { "epoch": 2.4266247379454926, "grad_norm": 0.6891305446624756, "learning_rate": 4.6965843181059264e-05, "loss": 0.5267, "num_input_tokens_seen": 3039384, "step": 4630 }, { "epoch": 2.4292452830188678, "grad_norm": 0.45551246404647827, "learning_rate": 4.695491433371774e-05, "loss": 0.4971, "num_input_tokens_seen": 3042296, "step": 4635 }, { "epoch": 2.431865828092243, "grad_norm": 0.446414977312088, "learning_rate": 4.694396711550676e-05, "loss": 0.4424, "num_input_tokens_seen": 3045304, "step": 4640 }, { "epoch": 2.4344863731656186, "grad_norm": 0.521632969379425, "learning_rate": 4.693300153558646e-05, "loss": 0.3969, "num_input_tokens_seen": 3048824, "step": 4645 }, { "epoch": 2.4371069182389937, "grad_norm": 0.3272760808467865, "learning_rate": 4.692201760313233e-05, "loss": 0.5485, "num_input_tokens_seen": 3052120, "step": 4650 }, { "epoch": 2.439727463312369, "grad_norm": 0.3149610161781311, "learning_rate": 4.691101532733524e-05, "loss": 0.4969, "num_input_tokens_seen": 3055384, "step": 4655 }, { "epoch": 2.442348008385744, "grad_norm": 0.40797480940818787, "learning_rate": 4.689999471740137e-05, "loss": 0.3929, "num_input_tokens_seen": 3058328, "step": 4660 }, { "epoch": 2.4449685534591197, "grad_norm": 0.364224910736084, "learning_rate": 4.6888955782552274e-05, "loss": 0.5423, "num_input_tokens_seen": 3063512, "step": 4665 }, { "epoch": 2.447589098532495, "grad_norm": 0.4615791141986847, "learning_rate": 4.6877898532024825e-05, "loss": 0.5239, "num_input_tokens_seen": 3068056, "step": 4670 }, { "epoch": 2.45020964360587, "grad_norm": 0.4101943373680115, "learning_rate": 4.686682297507123e-05, "loss": 0.4446, "num_input_tokens_seen": 3071608, "step": 4675 }, { "epoch": 2.452830188679245, "grad_norm": 0.5853837132453918, "learning_rate": 4.6855729120959e-05, "loss": 0.5097, "num_input_tokens_seen": 3074872, "step": 4680 }, { "epoch": 2.4554507337526204, "grad_norm": 0.42309680581092834, "learning_rate": 4.684461697897098e-05, "loss": 0.4372, "num_input_tokens_seen": 3077720, "step": 4685 }, { "epoch": 2.458071278825996, "grad_norm": 0.5388785004615784, "learning_rate": 4.683348655840529e-05, "loss": 0.5878, "num_input_tokens_seen": 3080664, "step": 4690 }, { "epoch": 2.460691823899371, "grad_norm": 0.6537149548530579, "learning_rate": 4.682233786857536e-05, "loss": 0.5045, "num_input_tokens_seen": 3084408, "step": 4695 }, { "epoch": 2.4633123689727463, "grad_norm": 0.33927854895591736, "learning_rate": 4.681117091880991e-05, "loss": 0.5413, "num_input_tokens_seen": 3087640, "step": 4700 }, { "epoch": 2.4659329140461215, "grad_norm": 0.4715810716152191, "learning_rate": 4.679998571845293e-05, "loss": 0.5736, "num_input_tokens_seen": 3090520, "step": 4705 }, { "epoch": 2.468553459119497, "grad_norm": 0.36066555976867676, "learning_rate": 4.678878227686368e-05, "loss": 0.4262, "num_input_tokens_seen": 3093624, "step": 4710 }, { "epoch": 2.4711740041928723, "grad_norm": 0.39921993017196655, "learning_rate": 4.677756060341669e-05, "loss": 0.4369, "num_input_tokens_seen": 3096184, "step": 4715 }, { "epoch": 2.4737945492662474, "grad_norm": 0.4757072329521179, "learning_rate": 4.676632070750175e-05, "loss": 0.5831, "num_input_tokens_seen": 3099928, "step": 4720 }, { "epoch": 2.4764150943396226, "grad_norm": 0.7270557880401611, "learning_rate": 4.6755062598523894e-05, "loss": 0.5053, "num_input_tokens_seen": 3102456, "step": 4725 }, { "epoch": 2.4790356394129978, "grad_norm": 1.804840087890625, "learning_rate": 4.674378628590338e-05, "loss": 0.6089, "num_input_tokens_seen": 3104728, "step": 4730 }, { "epoch": 2.481656184486373, "grad_norm": 0.7906310558319092, "learning_rate": 4.673249177907571e-05, "loss": 0.4476, "num_input_tokens_seen": 3107704, "step": 4735 }, { "epoch": 2.4842767295597485, "grad_norm": 0.3902539312839508, "learning_rate": 4.672117908749164e-05, "loss": 0.4323, "num_input_tokens_seen": 3110328, "step": 4740 }, { "epoch": 2.4868972746331237, "grad_norm": 0.39475977420806885, "learning_rate": 4.670984822061708e-05, "loss": 0.5222, "num_input_tokens_seen": 3113464, "step": 4745 }, { "epoch": 2.489517819706499, "grad_norm": 0.26760369539260864, "learning_rate": 4.6698499187933196e-05, "loss": 0.4178, "num_input_tokens_seen": 3117112, "step": 4750 }, { "epoch": 2.492138364779874, "grad_norm": 0.31082817912101746, "learning_rate": 4.668713199893635e-05, "loss": 0.4462, "num_input_tokens_seen": 3120280, "step": 4755 }, { "epoch": 2.4947589098532497, "grad_norm": 0.3512860834598541, "learning_rate": 4.6675746663138066e-05, "loss": 0.4821, "num_input_tokens_seen": 3123928, "step": 4760 }, { "epoch": 2.497379454926625, "grad_norm": 0.38651999831199646, "learning_rate": 4.666434319006508e-05, "loss": 0.403, "num_input_tokens_seen": 3126584, "step": 4765 }, { "epoch": 2.5, "grad_norm": 0.6352567672729492, "learning_rate": 4.66529215892593e-05, "loss": 0.6225, "num_input_tokens_seen": 3129368, "step": 4770 }, { "epoch": 2.5, "eval_loss": 0.5045660734176636, "eval_runtime": 14.5606, "eval_samples_per_second": 58.24, "eval_steps_per_second": 14.56, "num_input_tokens_seen": 3129368, "step": 4770 }, { "epoch": 2.502620545073375, "grad_norm": 0.9288726449012756, "learning_rate": 4.664148187027781e-05, "loss": 0.5659, "num_input_tokens_seen": 3132248, "step": 4775 }, { "epoch": 2.5052410901467503, "grad_norm": 0.3497016429901123, "learning_rate": 4.663002404269283e-05, "loss": 0.3674, "num_input_tokens_seen": 3135320, "step": 4780 }, { "epoch": 2.507861635220126, "grad_norm": 0.5864827036857605, "learning_rate": 4.661854811609174e-05, "loss": 0.6993, "num_input_tokens_seen": 3139032, "step": 4785 }, { "epoch": 2.510482180293501, "grad_norm": 0.4990938901901245, "learning_rate": 4.6607054100077096e-05, "loss": 0.656, "num_input_tokens_seen": 3141944, "step": 4790 }, { "epoch": 2.5131027253668763, "grad_norm": 0.8863980174064636, "learning_rate": 4.6595542004266545e-05, "loss": 0.5268, "num_input_tokens_seen": 3145400, "step": 4795 }, { "epoch": 2.5157232704402515, "grad_norm": 0.32470524311065674, "learning_rate": 4.65840118382929e-05, "loss": 0.5874, "num_input_tokens_seen": 3149496, "step": 4800 }, { "epoch": 2.518343815513627, "grad_norm": 0.4024010896682739, "learning_rate": 4.657246361180405e-05, "loss": 0.603, "num_input_tokens_seen": 3152344, "step": 4805 }, { "epoch": 2.5209643605870022, "grad_norm": 0.4593154191970825, "learning_rate": 4.656089733446305e-05, "loss": 0.4098, "num_input_tokens_seen": 3155192, "step": 4810 }, { "epoch": 2.5235849056603774, "grad_norm": 0.49150970578193665, "learning_rate": 4.6549313015948025e-05, "loss": 0.3912, "num_input_tokens_seen": 3157688, "step": 4815 }, { "epoch": 2.5262054507337526, "grad_norm": 0.26410549879074097, "learning_rate": 4.653771066595219e-05, "loss": 0.4873, "num_input_tokens_seen": 3161592, "step": 4820 }, { "epoch": 2.5288259958071277, "grad_norm": 0.6683566570281982, "learning_rate": 4.652609029418389e-05, "loss": 0.4748, "num_input_tokens_seen": 3164024, "step": 4825 }, { "epoch": 2.531446540880503, "grad_norm": 0.5684312582015991, "learning_rate": 4.65144519103665e-05, "loss": 0.412, "num_input_tokens_seen": 3166424, "step": 4830 }, { "epoch": 2.5340670859538785, "grad_norm": 0.564618706703186, "learning_rate": 4.650279552423849e-05, "loss": 0.6747, "num_input_tokens_seen": 3169240, "step": 4835 }, { "epoch": 2.5366876310272537, "grad_norm": 0.6792442202568054, "learning_rate": 4.6491121145553386e-05, "loss": 0.4996, "num_input_tokens_seen": 3172088, "step": 4840 }, { "epoch": 2.539308176100629, "grad_norm": 0.5832188129425049, "learning_rate": 4.6479428784079796e-05, "loss": 0.5168, "num_input_tokens_seen": 3175480, "step": 4845 }, { "epoch": 2.541928721174004, "grad_norm": 0.4236965477466583, "learning_rate": 4.6467718449601326e-05, "loss": 0.5283, "num_input_tokens_seen": 3179480, "step": 4850 }, { "epoch": 2.5445492662473796, "grad_norm": 0.30900856852531433, "learning_rate": 4.645599015191667e-05, "loss": 0.4662, "num_input_tokens_seen": 3182520, "step": 4855 }, { "epoch": 2.547169811320755, "grad_norm": 0.5633318424224854, "learning_rate": 4.6444243900839525e-05, "loss": 0.5254, "num_input_tokens_seen": 3185560, "step": 4860 }, { "epoch": 2.54979035639413, "grad_norm": 0.34681329131126404, "learning_rate": 4.643247970619862e-05, "loss": 0.4973, "num_input_tokens_seen": 3189592, "step": 4865 }, { "epoch": 2.552410901467505, "grad_norm": 0.8346880674362183, "learning_rate": 4.642069757783769e-05, "loss": 0.4195, "num_input_tokens_seen": 3192760, "step": 4870 }, { "epoch": 2.5550314465408803, "grad_norm": 0.8070312142372131, "learning_rate": 4.640889752561549e-05, "loss": 0.4201, "num_input_tokens_seen": 3195992, "step": 4875 }, { "epoch": 2.5576519916142555, "grad_norm": 0.8036596775054932, "learning_rate": 4.639707955940575e-05, "loss": 0.5086, "num_input_tokens_seen": 3198200, "step": 4880 }, { "epoch": 2.560272536687631, "grad_norm": 0.3403420150279999, "learning_rate": 4.6385243689097226e-05, "loss": 0.4693, "num_input_tokens_seen": 3201848, "step": 4885 }, { "epoch": 2.5628930817610063, "grad_norm": 1.3153293132781982, "learning_rate": 4.6373389924593615e-05, "loss": 0.6597, "num_input_tokens_seen": 3204504, "step": 4890 }, { "epoch": 2.5655136268343814, "grad_norm": 0.6733996868133545, "learning_rate": 4.6361518275813615e-05, "loss": 0.6332, "num_input_tokens_seen": 3207320, "step": 4895 }, { "epoch": 2.568134171907757, "grad_norm": 0.4792604148387909, "learning_rate": 4.6349628752690876e-05, "loss": 0.4557, "num_input_tokens_seen": 3210584, "step": 4900 }, { "epoch": 2.5707547169811322, "grad_norm": 0.34972113370895386, "learning_rate": 4.633772136517401e-05, "loss": 0.3812, "num_input_tokens_seen": 3213496, "step": 4905 }, { "epoch": 2.5733752620545074, "grad_norm": 0.2911551594734192, "learning_rate": 4.6325796123226575e-05, "loss": 0.4764, "num_input_tokens_seen": 3217016, "step": 4910 }, { "epoch": 2.5759958071278826, "grad_norm": 0.38682085275650024, "learning_rate": 4.6313853036827057e-05, "loss": 0.6195, "num_input_tokens_seen": 3219800, "step": 4915 }, { "epoch": 2.5786163522012577, "grad_norm": 0.6850119829177856, "learning_rate": 4.630189211596891e-05, "loss": 0.4794, "num_input_tokens_seen": 3222424, "step": 4920 }, { "epoch": 2.581236897274633, "grad_norm": 0.5080254077911377, "learning_rate": 4.628991337066047e-05, "loss": 0.4969, "num_input_tokens_seen": 3225272, "step": 4925 }, { "epoch": 2.5838574423480085, "grad_norm": 0.2732101082801819, "learning_rate": 4.627791681092499e-05, "loss": 0.4633, "num_input_tokens_seen": 3228088, "step": 4930 }, { "epoch": 2.5864779874213837, "grad_norm": 0.3874872326850891, "learning_rate": 4.626590244680068e-05, "loss": 0.4853, "num_input_tokens_seen": 3231096, "step": 4935 }, { "epoch": 2.589098532494759, "grad_norm": 0.495447039604187, "learning_rate": 4.625387028834057e-05, "loss": 0.5853, "num_input_tokens_seen": 3234552, "step": 4940 }, { "epoch": 2.591719077568134, "grad_norm": 0.7495498657226562, "learning_rate": 4.6241820345612654e-05, "loss": 0.3946, "num_input_tokens_seen": 3238072, "step": 4945 }, { "epoch": 2.5943396226415096, "grad_norm": 0.7112509608268738, "learning_rate": 4.622975262869976e-05, "loss": 0.5565, "num_input_tokens_seen": 3241624, "step": 4950 }, { "epoch": 2.596960167714885, "grad_norm": 0.39728882908821106, "learning_rate": 4.62176671476996e-05, "loss": 0.7744, "num_input_tokens_seen": 3245080, "step": 4955 }, { "epoch": 2.59958071278826, "grad_norm": 0.4031524360179901, "learning_rate": 4.620556391272476e-05, "loss": 0.525, "num_input_tokens_seen": 3247800, "step": 4960 }, { "epoch": 2.602201257861635, "grad_norm": 0.30467844009399414, "learning_rate": 4.619344293390266e-05, "loss": 0.3751, "num_input_tokens_seen": 3251000, "step": 4965 }, { "epoch": 2.6048218029350103, "grad_norm": 0.4271923899650574, "learning_rate": 4.61813042213756e-05, "loss": 0.4094, "num_input_tokens_seen": 3254872, "step": 4970 }, { "epoch": 2.6074423480083855, "grad_norm": 0.5699502229690552, "learning_rate": 4.6169147785300685e-05, "loss": 0.5188, "num_input_tokens_seen": 3258520, "step": 4975 }, { "epoch": 2.610062893081761, "grad_norm": 0.4120541512966156, "learning_rate": 4.6156973635849864e-05, "loss": 0.52, "num_input_tokens_seen": 3261944, "step": 4980 }, { "epoch": 2.6126834381551363, "grad_norm": 0.5006177425384521, "learning_rate": 4.614478178320993e-05, "loss": 0.581, "num_input_tokens_seen": 3265432, "step": 4985 }, { "epoch": 2.6153039832285114, "grad_norm": 0.6946495771408081, "learning_rate": 4.613257223758245e-05, "loss": 0.5292, "num_input_tokens_seen": 3268632, "step": 4990 }, { "epoch": 2.617924528301887, "grad_norm": 0.8691190481185913, "learning_rate": 4.612034500918381e-05, "loss": 0.4632, "num_input_tokens_seen": 3271416, "step": 4995 }, { "epoch": 2.620545073375262, "grad_norm": 0.5207632780075073, "learning_rate": 4.610810010824522e-05, "loss": 0.3816, "num_input_tokens_seen": 3274424, "step": 5000 }, { "epoch": 2.6231656184486374, "grad_norm": 0.447966605424881, "learning_rate": 4.609583754501263e-05, "loss": 0.4838, "num_input_tokens_seen": 3277688, "step": 5005 }, { "epoch": 2.6257861635220126, "grad_norm": 0.6725216507911682, "learning_rate": 4.6083557329746805e-05, "loss": 0.3811, "num_input_tokens_seen": 3280440, "step": 5010 }, { "epoch": 2.6284067085953877, "grad_norm": 0.47712764143943787, "learning_rate": 4.607125947272326e-05, "loss": 0.6636, "num_input_tokens_seen": 3283704, "step": 5015 }, { "epoch": 2.631027253668763, "grad_norm": 0.4299681484699249, "learning_rate": 4.6058943984232286e-05, "loss": 0.4462, "num_input_tokens_seen": 3286808, "step": 5020 }, { "epoch": 2.6336477987421385, "grad_norm": 0.32195478677749634, "learning_rate": 4.604661087457893e-05, "loss": 0.5872, "num_input_tokens_seen": 3289592, "step": 5025 }, { "epoch": 2.6362683438155137, "grad_norm": 0.32166430354118347, "learning_rate": 4.6034260154082955e-05, "loss": 0.4925, "num_input_tokens_seen": 3292696, "step": 5030 }, { "epoch": 2.638888888888889, "grad_norm": 0.5013790130615234, "learning_rate": 4.602189183307889e-05, "loss": 0.3467, "num_input_tokens_seen": 3296632, "step": 5035 }, { "epoch": 2.641509433962264, "grad_norm": 0.5010368824005127, "learning_rate": 4.600950592191599e-05, "loss": 0.4174, "num_input_tokens_seen": 3300536, "step": 5040 }, { "epoch": 2.6441299790356396, "grad_norm": 0.4724542200565338, "learning_rate": 4.599710243095819e-05, "loss": 0.4807, "num_input_tokens_seen": 3304728, "step": 5045 }, { "epoch": 2.646750524109015, "grad_norm": 0.5609769225120544, "learning_rate": 4.59846813705842e-05, "loss": 0.5449, "num_input_tokens_seen": 3307928, "step": 5050 }, { "epoch": 2.64937106918239, "grad_norm": 0.6966667175292969, "learning_rate": 4.597224275118738e-05, "loss": 0.5733, "num_input_tokens_seen": 3310552, "step": 5055 }, { "epoch": 2.651991614255765, "grad_norm": 0.4247312545776367, "learning_rate": 4.59597865831758e-05, "loss": 0.6144, "num_input_tokens_seen": 3314904, "step": 5060 }, { "epoch": 2.6546121593291403, "grad_norm": 0.41546863317489624, "learning_rate": 4.5947312876972214e-05, "loss": 0.4846, "num_input_tokens_seen": 3320184, "step": 5065 }, { "epoch": 2.6572327044025155, "grad_norm": 0.6077926158905029, "learning_rate": 4.5934821643014034e-05, "loss": 0.4363, "num_input_tokens_seen": 3322904, "step": 5070 }, { "epoch": 2.659853249475891, "grad_norm": 0.3390769064426422, "learning_rate": 4.5922312891753385e-05, "loss": 0.427, "num_input_tokens_seen": 3325784, "step": 5075 }, { "epoch": 2.6624737945492662, "grad_norm": 0.37221553921699524, "learning_rate": 4.590978663365699e-05, "loss": 0.5521, "num_input_tokens_seen": 3328760, "step": 5080 }, { "epoch": 2.6650943396226414, "grad_norm": 0.636896550655365, "learning_rate": 4.589724287920627e-05, "loss": 0.5239, "num_input_tokens_seen": 3331576, "step": 5085 }, { "epoch": 2.667714884696017, "grad_norm": 0.8610575199127197, "learning_rate": 4.5884681638897246e-05, "loss": 0.5596, "num_input_tokens_seen": 3333880, "step": 5090 }, { "epoch": 2.670335429769392, "grad_norm": 0.7014044523239136, "learning_rate": 4.587210292324061e-05, "loss": 0.495, "num_input_tokens_seen": 3337880, "step": 5095 }, { "epoch": 2.6729559748427674, "grad_norm": 0.29673099517822266, "learning_rate": 4.585950674276164e-05, "loss": 0.4816, "num_input_tokens_seen": 3340920, "step": 5100 }, { "epoch": 2.6755765199161425, "grad_norm": 0.5762149095535278, "learning_rate": 4.5846893108000256e-05, "loss": 0.4608, "num_input_tokens_seen": 3344312, "step": 5105 }, { "epoch": 2.6781970649895177, "grad_norm": 0.5592777729034424, "learning_rate": 4.5834262029510965e-05, "loss": 0.4302, "num_input_tokens_seen": 3347672, "step": 5110 }, { "epoch": 2.680817610062893, "grad_norm": 0.3469628095626831, "learning_rate": 4.5821613517862883e-05, "loss": 0.5055, "num_input_tokens_seen": 3351032, "step": 5115 }, { "epoch": 2.6834381551362685, "grad_norm": 0.7285401821136475, "learning_rate": 4.5808947583639693e-05, "loss": 0.653, "num_input_tokens_seen": 3353752, "step": 5120 }, { "epoch": 2.6860587002096437, "grad_norm": 0.5854938626289368, "learning_rate": 4.579626423743969e-05, "loss": 0.4454, "num_input_tokens_seen": 3356600, "step": 5125 }, { "epoch": 2.688679245283019, "grad_norm": 0.9641707539558411, "learning_rate": 4.57835634898757e-05, "loss": 0.5442, "num_input_tokens_seen": 3360280, "step": 5130 }, { "epoch": 2.691299790356394, "grad_norm": 0.35258957743644714, "learning_rate": 4.577084535157514e-05, "loss": 0.6088, "num_input_tokens_seen": 3363768, "step": 5135 }, { "epoch": 2.6939203354297696, "grad_norm": 0.39209866523742676, "learning_rate": 4.5758109833179963e-05, "loss": 0.4317, "num_input_tokens_seen": 3367480, "step": 5140 }, { "epoch": 2.6965408805031448, "grad_norm": 0.680245041847229, "learning_rate": 4.5745356945346676e-05, "loss": 0.4949, "num_input_tokens_seen": 3370520, "step": 5145 }, { "epoch": 2.69916142557652, "grad_norm": 0.7840683460235596, "learning_rate": 4.57325866987463e-05, "loss": 0.4461, "num_input_tokens_seen": 3372984, "step": 5150 }, { "epoch": 2.701781970649895, "grad_norm": 1.3270522356033325, "learning_rate": 4.571979910406441e-05, "loss": 0.4805, "num_input_tokens_seen": 3375288, "step": 5155 }, { "epoch": 2.7044025157232703, "grad_norm": 0.5493078827857971, "learning_rate": 4.570699417200106e-05, "loss": 0.4041, "num_input_tokens_seen": 3378488, "step": 5160 }, { "epoch": 2.7070230607966455, "grad_norm": 0.37325319647789, "learning_rate": 4.569417191327086e-05, "loss": 0.4415, "num_input_tokens_seen": 3381368, "step": 5165 }, { "epoch": 2.709643605870021, "grad_norm": 1.0574960708618164, "learning_rate": 4.5681332338602864e-05, "loss": 0.4955, "num_input_tokens_seen": 3385016, "step": 5170 }, { "epoch": 2.7122641509433962, "grad_norm": 0.33048173785209656, "learning_rate": 4.5668475458740654e-05, "loss": 0.5355, "num_input_tokens_seen": 3387992, "step": 5175 }, { "epoch": 2.7148846960167714, "grad_norm": 0.4280594289302826, "learning_rate": 4.5655601284442276e-05, "loss": 0.5257, "num_input_tokens_seen": 3390872, "step": 5180 }, { "epoch": 2.717505241090147, "grad_norm": 0.4295627474784851, "learning_rate": 4.5642709826480256e-05, "loss": 0.5656, "num_input_tokens_seen": 3393496, "step": 5185 }, { "epoch": 2.720125786163522, "grad_norm": 0.7042911052703857, "learning_rate": 4.562980109564158e-05, "loss": 0.73, "num_input_tokens_seen": 3395992, "step": 5190 }, { "epoch": 2.7227463312368974, "grad_norm": 0.4673435389995575, "learning_rate": 4.561687510272767e-05, "loss": 0.5944, "num_input_tokens_seen": 3399160, "step": 5195 }, { "epoch": 2.7253668763102725, "grad_norm": 0.507491946220398, "learning_rate": 4.5603931858554415e-05, "loss": 0.4882, "num_input_tokens_seen": 3402072, "step": 5200 }, { "epoch": 2.7279874213836477, "grad_norm": 0.659877598285675, "learning_rate": 4.559097137395214e-05, "loss": 0.4891, "num_input_tokens_seen": 3404344, "step": 5205 }, { "epoch": 2.730607966457023, "grad_norm": 0.36732345819473267, "learning_rate": 4.5577993659765574e-05, "loss": 0.4771, "num_input_tokens_seen": 3407640, "step": 5210 }, { "epoch": 2.7332285115303985, "grad_norm": 0.7247602343559265, "learning_rate": 4.556499872685387e-05, "loss": 0.3846, "num_input_tokens_seen": 3410200, "step": 5215 }, { "epoch": 2.7358490566037736, "grad_norm": 0.4635840356349945, "learning_rate": 4.555198658609061e-05, "loss": 0.4275, "num_input_tokens_seen": 3412856, "step": 5220 }, { "epoch": 2.738469601677149, "grad_norm": 0.2510216534137726, "learning_rate": 4.5538957248363756e-05, "loss": 0.4602, "num_input_tokens_seen": 3415864, "step": 5225 }, { "epoch": 2.741090146750524, "grad_norm": 0.28294748067855835, "learning_rate": 4.552591072457565e-05, "loss": 0.5148, "num_input_tokens_seen": 3419224, "step": 5230 }, { "epoch": 2.7437106918238996, "grad_norm": 0.5773622989654541, "learning_rate": 4.551284702564304e-05, "loss": 0.4361, "num_input_tokens_seen": 3422520, "step": 5235 }, { "epoch": 2.7463312368972748, "grad_norm": 0.5640606880187988, "learning_rate": 4.5499766162497025e-05, "loss": 0.4581, "num_input_tokens_seen": 3425976, "step": 5240 }, { "epoch": 2.74895178197065, "grad_norm": 0.4595955014228821, "learning_rate": 4.548666814608308e-05, "loss": 0.4949, "num_input_tokens_seen": 3429368, "step": 5245 }, { "epoch": 2.751572327044025, "grad_norm": 0.7090050578117371, "learning_rate": 4.5473552987361024e-05, "loss": 0.5369, "num_input_tokens_seen": 3434680, "step": 5250 }, { "epoch": 2.7541928721174003, "grad_norm": 0.9436038732528687, "learning_rate": 4.5460420697305024e-05, "loss": 0.5777, "num_input_tokens_seen": 3437752, "step": 5255 }, { "epoch": 2.7568134171907754, "grad_norm": 0.358523428440094, "learning_rate": 4.544727128690358e-05, "loss": 0.6052, "num_input_tokens_seen": 3440760, "step": 5260 }, { "epoch": 2.759433962264151, "grad_norm": 0.5440515279769897, "learning_rate": 4.543410476715951e-05, "loss": 0.5169, "num_input_tokens_seen": 3443736, "step": 5265 }, { "epoch": 2.762054507337526, "grad_norm": 0.8857433795928955, "learning_rate": 4.542092114908997e-05, "loss": 0.62, "num_input_tokens_seen": 3446904, "step": 5270 }, { "epoch": 2.7646750524109014, "grad_norm": 0.45853206515312195, "learning_rate": 4.54077204437264e-05, "loss": 0.4311, "num_input_tokens_seen": 3450104, "step": 5275 }, { "epoch": 2.767295597484277, "grad_norm": 0.3560716211795807, "learning_rate": 4.5394502662114555e-05, "loss": 0.6547, "num_input_tokens_seen": 3453592, "step": 5280 }, { "epoch": 2.769916142557652, "grad_norm": 0.847180187702179, "learning_rate": 4.538126781531446e-05, "loss": 0.4879, "num_input_tokens_seen": 3456824, "step": 5285 }, { "epoch": 2.7725366876310273, "grad_norm": 0.7688814997673035, "learning_rate": 4.536801591440044e-05, "loss": 0.5199, "num_input_tokens_seen": 3460792, "step": 5290 }, { "epoch": 2.7751572327044025, "grad_norm": 0.3764491081237793, "learning_rate": 4.535474697046107e-05, "loss": 0.551, "num_input_tokens_seen": 3464280, "step": 5295 }, { "epoch": 2.7777777777777777, "grad_norm": 1.0471577644348145, "learning_rate": 4.534146099459921e-05, "loss": 0.6286, "num_input_tokens_seen": 3467320, "step": 5300 }, { "epoch": 2.780398322851153, "grad_norm": 0.49251508712768555, "learning_rate": 4.5328157997931955e-05, "loss": 0.4883, "num_input_tokens_seen": 3470520, "step": 5305 }, { "epoch": 2.7830188679245285, "grad_norm": 0.5053970813751221, "learning_rate": 4.531483799159062e-05, "loss": 0.5064, "num_input_tokens_seen": 3474008, "step": 5310 }, { "epoch": 2.7856394129979036, "grad_norm": 1.0072863101959229, "learning_rate": 4.5301500986720816e-05, "loss": 0.4494, "num_input_tokens_seen": 3477208, "step": 5315 }, { "epoch": 2.788259958071279, "grad_norm": 0.4086436331272125, "learning_rate": 4.528814699448232e-05, "loss": 0.5306, "num_input_tokens_seen": 3480088, "step": 5320 }, { "epoch": 2.790880503144654, "grad_norm": 0.24413055181503296, "learning_rate": 4.527477602604914e-05, "loss": 0.4843, "num_input_tokens_seen": 3484504, "step": 5325 }, { "epoch": 2.7935010482180296, "grad_norm": 0.45332807302474976, "learning_rate": 4.52613880926095e-05, "loss": 0.4376, "num_input_tokens_seen": 3487000, "step": 5330 }, { "epoch": 2.7961215932914047, "grad_norm": 0.4020591676235199, "learning_rate": 4.5247983205365806e-05, "loss": 0.4545, "num_input_tokens_seen": 3490712, "step": 5335 }, { "epoch": 2.79874213836478, "grad_norm": 0.34940722584724426, "learning_rate": 4.5234561375534655e-05, "loss": 0.5444, "num_input_tokens_seen": 3494296, "step": 5340 }, { "epoch": 2.801362683438155, "grad_norm": 0.35142114758491516, "learning_rate": 4.5221122614346823e-05, "loss": 0.4366, "num_input_tokens_seen": 3498232, "step": 5345 }, { "epoch": 2.8039832285115303, "grad_norm": 0.6866607666015625, "learning_rate": 4.520766693304725e-05, "loss": 0.4648, "num_input_tokens_seen": 3501080, "step": 5350 }, { "epoch": 2.8066037735849054, "grad_norm": 0.5499856472015381, "learning_rate": 4.519419434289504e-05, "loss": 0.4115, "num_input_tokens_seen": 3504024, "step": 5355 }, { "epoch": 2.809224318658281, "grad_norm": 0.4744854271411896, "learning_rate": 4.518070485516344e-05, "loss": 0.3953, "num_input_tokens_seen": 3507256, "step": 5360 }, { "epoch": 2.811844863731656, "grad_norm": 0.29081812500953674, "learning_rate": 4.5167198481139825e-05, "loss": 0.5356, "num_input_tokens_seen": 3510392, "step": 5365 }, { "epoch": 2.8144654088050314, "grad_norm": 0.45363765954971313, "learning_rate": 4.515367523212573e-05, "loss": 0.538, "num_input_tokens_seen": 3513272, "step": 5370 }, { "epoch": 2.8170859538784065, "grad_norm": 0.3108077347278595, "learning_rate": 4.5140135119436776e-05, "loss": 0.5275, "num_input_tokens_seen": 3516312, "step": 5375 }, { "epoch": 2.819706498951782, "grad_norm": 0.6202510595321655, "learning_rate": 4.512657815440273e-05, "loss": 0.5002, "num_input_tokens_seen": 3519032, "step": 5380 }, { "epoch": 2.8223270440251573, "grad_norm": 0.5457552671432495, "learning_rate": 4.511300434836743e-05, "loss": 0.3753, "num_input_tokens_seen": 3521720, "step": 5385 }, { "epoch": 2.8249475890985325, "grad_norm": 0.7013052105903625, "learning_rate": 4.5099413712688805e-05, "loss": 0.4603, "num_input_tokens_seen": 3524888, "step": 5390 }, { "epoch": 2.8275681341719077, "grad_norm": 0.30787134170532227, "learning_rate": 4.5085806258738896e-05, "loss": 0.4169, "num_input_tokens_seen": 3529688, "step": 5395 }, { "epoch": 2.830188679245283, "grad_norm": 0.4137200713157654, "learning_rate": 4.507218199790379e-05, "loss": 0.5061, "num_input_tokens_seen": 3532440, "step": 5400 }, { "epoch": 2.832809224318658, "grad_norm": 0.5255808234214783, "learning_rate": 4.505854094158365e-05, "loss": 0.4922, "num_input_tokens_seen": 3535288, "step": 5405 }, { "epoch": 2.8354297693920336, "grad_norm": 0.6712587475776672, "learning_rate": 4.5044883101192695e-05, "loss": 0.5253, "num_input_tokens_seen": 3538840, "step": 5410 }, { "epoch": 2.838050314465409, "grad_norm": 0.325411856174469, "learning_rate": 4.503120848815916e-05, "loss": 0.3733, "num_input_tokens_seen": 3541880, "step": 5415 }, { "epoch": 2.840670859538784, "grad_norm": 0.4726559817790985, "learning_rate": 4.501751711392536e-05, "loss": 0.5253, "num_input_tokens_seen": 3545208, "step": 5420 }, { "epoch": 2.8432914046121596, "grad_norm": 0.7315889596939087, "learning_rate": 4.5003808989947605e-05, "loss": 0.4403, "num_input_tokens_seen": 3548120, "step": 5425 }, { "epoch": 2.8459119496855347, "grad_norm": 0.5780505537986755, "learning_rate": 4.499008412769622e-05, "loss": 0.5118, "num_input_tokens_seen": 3551512, "step": 5430 }, { "epoch": 2.84853249475891, "grad_norm": 0.556800127029419, "learning_rate": 4.4976342538655546e-05, "loss": 0.3901, "num_input_tokens_seen": 3554520, "step": 5435 }, { "epoch": 2.851153039832285, "grad_norm": 0.36605989933013916, "learning_rate": 4.4962584234323925e-05, "loss": 0.5743, "num_input_tokens_seen": 3557208, "step": 5440 }, { "epoch": 2.8537735849056602, "grad_norm": 0.801299512386322, "learning_rate": 4.4948809226213664e-05, "loss": 0.6468, "num_input_tokens_seen": 3560600, "step": 5445 }, { "epoch": 2.8563941299790354, "grad_norm": 0.3130406141281128, "learning_rate": 4.4935017525851067e-05, "loss": 0.4679, "num_input_tokens_seen": 3565752, "step": 5450 }, { "epoch": 2.859014675052411, "grad_norm": 0.30336523056030273, "learning_rate": 4.4921209144776414e-05, "loss": 0.4009, "num_input_tokens_seen": 3569400, "step": 5455 }, { "epoch": 2.861635220125786, "grad_norm": 1.108046293258667, "learning_rate": 4.490738409454389e-05, "loss": 0.4709, "num_input_tokens_seen": 3572088, "step": 5460 }, { "epoch": 2.8642557651991614, "grad_norm": 0.280838280916214, "learning_rate": 4.48935423867217e-05, "loss": 0.5819, "num_input_tokens_seen": 3575032, "step": 5465 }, { "epoch": 2.8668763102725365, "grad_norm": 0.5409507751464844, "learning_rate": 4.487968403289195e-05, "loss": 0.474, "num_input_tokens_seen": 3578232, "step": 5470 }, { "epoch": 2.869496855345912, "grad_norm": 0.395729124546051, "learning_rate": 4.4865809044650655e-05, "loss": 0.5181, "num_input_tokens_seen": 3581304, "step": 5475 }, { "epoch": 2.8721174004192873, "grad_norm": 0.4747457504272461, "learning_rate": 4.48519174336078e-05, "loss": 0.395, "num_input_tokens_seen": 3584664, "step": 5480 }, { "epoch": 2.8747379454926625, "grad_norm": 0.5593960285186768, "learning_rate": 4.483800921138722e-05, "loss": 0.5779, "num_input_tokens_seen": 3587544, "step": 5485 }, { "epoch": 2.8773584905660377, "grad_norm": 0.5823366641998291, "learning_rate": 4.4824084389626705e-05, "loss": 0.5258, "num_input_tokens_seen": 3590360, "step": 5490 }, { "epoch": 2.879979035639413, "grad_norm": 0.408566951751709, "learning_rate": 4.48101429799779e-05, "loss": 0.4097, "num_input_tokens_seen": 3597208, "step": 5495 }, { "epoch": 2.882599580712788, "grad_norm": 0.643938422203064, "learning_rate": 4.479618499410634e-05, "loss": 0.3987, "num_input_tokens_seen": 3600568, "step": 5500 }, { "epoch": 2.8852201257861636, "grad_norm": 1.114526391029358, "learning_rate": 4.478221044369143e-05, "loss": 0.5096, "num_input_tokens_seen": 3603480, "step": 5505 }, { "epoch": 2.8878406708595388, "grad_norm": 0.4443126916885376, "learning_rate": 4.476821934042644e-05, "loss": 0.5555, "num_input_tokens_seen": 3606360, "step": 5510 }, { "epoch": 2.890461215932914, "grad_norm": 0.8430497646331787, "learning_rate": 4.4754211696018475e-05, "loss": 0.4475, "num_input_tokens_seen": 3608856, "step": 5515 }, { "epoch": 2.8930817610062896, "grad_norm": 0.37894487380981445, "learning_rate": 4.47401875221885e-05, "loss": 0.3981, "num_input_tokens_seen": 3611384, "step": 5520 }, { "epoch": 2.8957023060796647, "grad_norm": 0.4787212014198303, "learning_rate": 4.4726146830671304e-05, "loss": 0.7094, "num_input_tokens_seen": 3614488, "step": 5525 }, { "epoch": 2.89832285115304, "grad_norm": 0.4898988902568817, "learning_rate": 4.47120896332155e-05, "loss": 0.4771, "num_input_tokens_seen": 3617080, "step": 5530 }, { "epoch": 2.900943396226415, "grad_norm": 0.8842406272888184, "learning_rate": 4.4698015941583494e-05, "loss": 0.6245, "num_input_tokens_seen": 3620728, "step": 5535 }, { "epoch": 2.9035639412997902, "grad_norm": 0.4020688235759735, "learning_rate": 4.4683925767551525e-05, "loss": 0.5434, "num_input_tokens_seen": 3624184, "step": 5540 }, { "epoch": 2.9061844863731654, "grad_norm": 0.25158989429473877, "learning_rate": 4.466981912290959e-05, "loss": 0.4057, "num_input_tokens_seen": 3628856, "step": 5545 }, { "epoch": 2.908805031446541, "grad_norm": 0.33767467737197876, "learning_rate": 4.46556960194615e-05, "loss": 0.5772, "num_input_tokens_seen": 3631800, "step": 5550 }, { "epoch": 2.911425576519916, "grad_norm": 1.173230767250061, "learning_rate": 4.464155646902482e-05, "loss": 0.4726, "num_input_tokens_seen": 3634680, "step": 5555 }, { "epoch": 2.9140461215932913, "grad_norm": 0.4063594341278076, "learning_rate": 4.462740048343087e-05, "loss": 0.4201, "num_input_tokens_seen": 3638264, "step": 5560 }, { "epoch": 2.9166666666666665, "grad_norm": 0.7305771112442017, "learning_rate": 4.461322807452475e-05, "loss": 0.4383, "num_input_tokens_seen": 3641240, "step": 5565 }, { "epoch": 2.919287211740042, "grad_norm": 0.43539300560951233, "learning_rate": 4.4599039254165264e-05, "loss": 0.5668, "num_input_tokens_seen": 3643640, "step": 5570 }, { "epoch": 2.9219077568134173, "grad_norm": 0.282667338848114, "learning_rate": 4.458483403422498e-05, "loss": 0.4987, "num_input_tokens_seen": 3648088, "step": 5575 }, { "epoch": 2.9245283018867925, "grad_norm": 0.3041967749595642, "learning_rate": 4.457061242659018e-05, "loss": 0.4483, "num_input_tokens_seen": 3651352, "step": 5580 }, { "epoch": 2.9271488469601676, "grad_norm": 0.3202519714832306, "learning_rate": 4.455637444316085e-05, "loss": 0.5202, "num_input_tokens_seen": 3655288, "step": 5585 }, { "epoch": 2.929769392033543, "grad_norm": 0.5150640606880188, "learning_rate": 4.454212009585068e-05, "loss": 0.4435, "num_input_tokens_seen": 3657624, "step": 5590 }, { "epoch": 2.932389937106918, "grad_norm": 0.42773163318634033, "learning_rate": 4.4527849396587065e-05, "loss": 0.3735, "num_input_tokens_seen": 3660472, "step": 5595 }, { "epoch": 2.9350104821802936, "grad_norm": 0.34649229049682617, "learning_rate": 4.4513562357311074e-05, "loss": 0.578, "num_input_tokens_seen": 3663992, "step": 5600 }, { "epoch": 2.9376310272536688, "grad_norm": 0.4964124262332916, "learning_rate": 4.449925898997744e-05, "loss": 0.5288, "num_input_tokens_seen": 3666424, "step": 5605 }, { "epoch": 2.940251572327044, "grad_norm": 0.48374128341674805, "learning_rate": 4.4484939306554585e-05, "loss": 0.5501, "num_input_tokens_seen": 3669400, "step": 5610 }, { "epoch": 2.9428721174004195, "grad_norm": 0.49993187189102173, "learning_rate": 4.4470603319024554e-05, "loss": 0.633, "num_input_tokens_seen": 3672440, "step": 5615 }, { "epoch": 2.9454926624737947, "grad_norm": 0.8811215758323669, "learning_rate": 4.445625103938304e-05, "loss": 0.3787, "num_input_tokens_seen": 3675704, "step": 5620 }, { "epoch": 2.94811320754717, "grad_norm": 0.5724438428878784, "learning_rate": 4.4441882479639375e-05, "loss": 0.6467, "num_input_tokens_seen": 3679160, "step": 5625 }, { "epoch": 2.950733752620545, "grad_norm": 0.34247952699661255, "learning_rate": 4.442749765181653e-05, "loss": 0.4697, "num_input_tokens_seen": 3682328, "step": 5630 }, { "epoch": 2.95335429769392, "grad_norm": 0.500796377658844, "learning_rate": 4.441309656795106e-05, "loss": 0.4518, "num_input_tokens_seen": 3685240, "step": 5635 }, { "epoch": 2.9559748427672954, "grad_norm": 0.7242765426635742, "learning_rate": 4.4398679240093144e-05, "loss": 0.6377, "num_input_tokens_seen": 3687960, "step": 5640 }, { "epoch": 2.958595387840671, "grad_norm": 0.5468534827232361, "learning_rate": 4.438424568030652e-05, "loss": 0.4061, "num_input_tokens_seen": 3691704, "step": 5645 }, { "epoch": 2.961215932914046, "grad_norm": 0.49603307247161865, "learning_rate": 4.436979590066857e-05, "loss": 0.5388, "num_input_tokens_seen": 3695416, "step": 5650 }, { "epoch": 2.9638364779874213, "grad_norm": 0.5947315096855164, "learning_rate": 4.435532991327017e-05, "loss": 0.4678, "num_input_tokens_seen": 3698168, "step": 5655 }, { "epoch": 2.9664570230607965, "grad_norm": 0.6685165166854858, "learning_rate": 4.434084773021582e-05, "loss": 0.6287, "num_input_tokens_seen": 3700984, "step": 5660 }, { "epoch": 2.969077568134172, "grad_norm": 0.4989367425441742, "learning_rate": 4.432634936362354e-05, "loss": 0.6777, "num_input_tokens_seen": 3704472, "step": 5665 }, { "epoch": 2.9716981132075473, "grad_norm": 0.40446850657463074, "learning_rate": 4.431183482562491e-05, "loss": 0.3973, "num_input_tokens_seen": 3707960, "step": 5670 }, { "epoch": 2.9743186582809225, "grad_norm": 1.457783818244934, "learning_rate": 4.429730412836503e-05, "loss": 0.4804, "num_input_tokens_seen": 3710520, "step": 5675 }, { "epoch": 2.9769392033542976, "grad_norm": 0.5933915972709656, "learning_rate": 4.4282757284002515e-05, "loss": 0.5099, "num_input_tokens_seen": 3713688, "step": 5680 }, { "epoch": 2.979559748427673, "grad_norm": 0.4644440710544586, "learning_rate": 4.426819430470951e-05, "loss": 0.4567, "num_input_tokens_seen": 3717304, "step": 5685 }, { "epoch": 2.982180293501048, "grad_norm": 0.373129278421402, "learning_rate": 4.425361520267165e-05, "loss": 0.56, "num_input_tokens_seen": 3720888, "step": 5690 }, { "epoch": 2.9848008385744236, "grad_norm": 0.43977752327919006, "learning_rate": 4.423901999008805e-05, "loss": 0.5461, "num_input_tokens_seen": 3724888, "step": 5695 }, { "epoch": 2.9874213836477987, "grad_norm": 0.7008122801780701, "learning_rate": 4.4224408679171324e-05, "loss": 0.4055, "num_input_tokens_seen": 3727288, "step": 5700 }, { "epoch": 2.990041928721174, "grad_norm": 0.7595100998878479, "learning_rate": 4.4209781282147555e-05, "loss": 0.5723, "num_input_tokens_seen": 3730328, "step": 5705 }, { "epoch": 2.9926624737945495, "grad_norm": 0.5671548247337341, "learning_rate": 4.419513781125628e-05, "loss": 0.4341, "num_input_tokens_seen": 3733912, "step": 5710 }, { "epoch": 2.9952830188679247, "grad_norm": 0.3683570623397827, "learning_rate": 4.418047827875048e-05, "loss": 0.4503, "num_input_tokens_seen": 3737464, "step": 5715 }, { "epoch": 2.9979035639413, "grad_norm": 0.38859862089157104, "learning_rate": 4.416580269689658e-05, "loss": 0.4337, "num_input_tokens_seen": 3740152, "step": 5720 }, { "epoch": 3.0, "eval_loss": 0.4988580644130707, "eval_runtime": 14.549, "eval_samples_per_second": 58.286, "eval_steps_per_second": 14.571, "num_input_tokens_seen": 3742552, "step": 5724 }, { "epoch": 3.000524109014675, "grad_norm": 0.7495233416557312, "learning_rate": 4.415111107797445e-05, "loss": 0.4469, "num_input_tokens_seen": 3742968, "step": 5725 }, { "epoch": 3.00314465408805, "grad_norm": 0.3408883512020111, "learning_rate": 4.4136403434277364e-05, "loss": 0.4951, "num_input_tokens_seen": 3745880, "step": 5730 }, { "epoch": 3.0057651991614254, "grad_norm": 0.3119620680809021, "learning_rate": 4.412167977811199e-05, "loss": 0.5523, "num_input_tokens_seen": 3749464, "step": 5735 }, { "epoch": 3.008385744234801, "grad_norm": 1.2972697019577026, "learning_rate": 4.4106940121798424e-05, "loss": 0.4264, "num_input_tokens_seen": 3752248, "step": 5740 }, { "epoch": 3.011006289308176, "grad_norm": 0.61074298620224, "learning_rate": 4.409218447767013e-05, "loss": 0.4885, "num_input_tokens_seen": 3755480, "step": 5745 }, { "epoch": 3.0136268343815513, "grad_norm": 0.34445813298225403, "learning_rate": 4.4077412858073966e-05, "loss": 0.4394, "num_input_tokens_seen": 3758040, "step": 5750 }, { "epoch": 3.0162473794549265, "grad_norm": 1.023378610610962, "learning_rate": 4.406262527537014e-05, "loss": 0.4728, "num_input_tokens_seen": 3761368, "step": 5755 }, { "epoch": 3.018867924528302, "grad_norm": 0.45430171489715576, "learning_rate": 4.404782174193223e-05, "loss": 0.4884, "num_input_tokens_seen": 3764824, "step": 5760 }, { "epoch": 3.0214884696016773, "grad_norm": 0.35912591218948364, "learning_rate": 4.403300227014716e-05, "loss": 0.4047, "num_input_tokens_seen": 3768056, "step": 5765 }, { "epoch": 3.0241090146750524, "grad_norm": 0.5498862862586975, "learning_rate": 4.4018166872415176e-05, "loss": 0.5511, "num_input_tokens_seen": 3770168, "step": 5770 }, { "epoch": 3.0267295597484276, "grad_norm": 0.2969541549682617, "learning_rate": 4.4003315561149875e-05, "loss": 0.5028, "num_input_tokens_seen": 3773112, "step": 5775 }, { "epoch": 3.029350104821803, "grad_norm": 0.35088562965393066, "learning_rate": 4.398844834877815e-05, "loss": 0.5387, "num_input_tokens_seen": 3776568, "step": 5780 }, { "epoch": 3.0319706498951784, "grad_norm": 0.2766354978084564, "learning_rate": 4.39735652477402e-05, "loss": 0.6651, "num_input_tokens_seen": 3780920, "step": 5785 }, { "epoch": 3.0345911949685536, "grad_norm": 0.4306619465351105, "learning_rate": 4.395866627048953e-05, "loss": 0.554, "num_input_tokens_seen": 3783960, "step": 5790 }, { "epoch": 3.0372117400419287, "grad_norm": 0.5081653594970703, "learning_rate": 4.3943751429492925e-05, "loss": 0.4546, "num_input_tokens_seen": 3786968, "step": 5795 }, { "epoch": 3.039832285115304, "grad_norm": 0.34467968344688416, "learning_rate": 4.392882073723043e-05, "loss": 0.5505, "num_input_tokens_seen": 3790808, "step": 5800 }, { "epoch": 3.042452830188679, "grad_norm": 0.3391650319099426, "learning_rate": 4.391387420619539e-05, "loss": 0.4842, "num_input_tokens_seen": 3794264, "step": 5805 }, { "epoch": 3.0450733752620547, "grad_norm": 0.37966281175613403, "learning_rate": 4.389891184889435e-05, "loss": 0.5425, "num_input_tokens_seen": 3797496, "step": 5810 }, { "epoch": 3.04769392033543, "grad_norm": 0.5300936102867126, "learning_rate": 4.3883933677847154e-05, "loss": 0.6267, "num_input_tokens_seen": 3800888, "step": 5815 }, { "epoch": 3.050314465408805, "grad_norm": 0.3492027521133423, "learning_rate": 4.3868939705586844e-05, "loss": 0.4346, "num_input_tokens_seen": 3803512, "step": 5820 }, { "epoch": 3.05293501048218, "grad_norm": 0.8204612135887146, "learning_rate": 4.385392994465968e-05, "loss": 0.5786, "num_input_tokens_seen": 3806552, "step": 5825 }, { "epoch": 3.0555555555555554, "grad_norm": 0.23991426825523376, "learning_rate": 4.383890440762515e-05, "loss": 0.5555, "num_input_tokens_seen": 3810648, "step": 5830 }, { "epoch": 3.058176100628931, "grad_norm": 0.32097771763801575, "learning_rate": 4.3823863107055936e-05, "loss": 0.4019, "num_input_tokens_seen": 3813976, "step": 5835 }, { "epoch": 3.060796645702306, "grad_norm": 1.2539031505584717, "learning_rate": 4.380880605553792e-05, "loss": 0.544, "num_input_tokens_seen": 3816824, "step": 5840 }, { "epoch": 3.0634171907756813, "grad_norm": 0.2720237076282501, "learning_rate": 4.3793733265670147e-05, "loss": 0.6062, "num_input_tokens_seen": 3820376, "step": 5845 }, { "epoch": 3.0660377358490565, "grad_norm": 0.5550243258476257, "learning_rate": 4.3778644750064834e-05, "loss": 0.4286, "num_input_tokens_seen": 3823736, "step": 5850 }, { "epoch": 3.068658280922432, "grad_norm": 0.5364109873771667, "learning_rate": 4.376354052134738e-05, "loss": 0.6357, "num_input_tokens_seen": 3826104, "step": 5855 }, { "epoch": 3.0712788259958073, "grad_norm": 0.40476688742637634, "learning_rate": 4.374842059215629e-05, "loss": 0.553, "num_input_tokens_seen": 3828632, "step": 5860 }, { "epoch": 3.0738993710691824, "grad_norm": 0.33410272002220154, "learning_rate": 4.373328497514325e-05, "loss": 0.3289, "num_input_tokens_seen": 3831576, "step": 5865 }, { "epoch": 3.0765199161425576, "grad_norm": 0.3822379410266876, "learning_rate": 4.371813368297304e-05, "loss": 0.358, "num_input_tokens_seen": 3834392, "step": 5870 }, { "epoch": 3.0791404612159328, "grad_norm": 0.6149437427520752, "learning_rate": 4.370296672832358e-05, "loss": 0.5484, "num_input_tokens_seen": 3837144, "step": 5875 }, { "epoch": 3.0817610062893084, "grad_norm": 0.45633137226104736, "learning_rate": 4.3687784123885875e-05, "loss": 0.5496, "num_input_tokens_seen": 3840504, "step": 5880 }, { "epoch": 3.0843815513626835, "grad_norm": 0.33585283160209656, "learning_rate": 4.3672585882364045e-05, "loss": 0.5319, "num_input_tokens_seen": 3844408, "step": 5885 }, { "epoch": 3.0870020964360587, "grad_norm": 0.4057057797908783, "learning_rate": 4.3657372016475275e-05, "loss": 0.5327, "num_input_tokens_seen": 3847736, "step": 5890 }, { "epoch": 3.089622641509434, "grad_norm": 0.3605087697505951, "learning_rate": 4.364214253894983e-05, "loss": 0.5265, "num_input_tokens_seen": 3850360, "step": 5895 }, { "epoch": 3.092243186582809, "grad_norm": 0.37315094470977783, "learning_rate": 4.3626897462531054e-05, "loss": 0.4725, "num_input_tokens_seen": 3853624, "step": 5900 }, { "epoch": 3.0948637316561847, "grad_norm": 0.46885693073272705, "learning_rate": 4.361163679997532e-05, "loss": 0.3879, "num_input_tokens_seen": 3856664, "step": 5905 }, { "epoch": 3.09748427672956, "grad_norm": 0.49167400598526, "learning_rate": 4.359636056405206e-05, "loss": 0.5426, "num_input_tokens_seen": 3859640, "step": 5910 }, { "epoch": 3.100104821802935, "grad_norm": 0.822730302810669, "learning_rate": 4.3581068767543724e-05, "loss": 0.4045, "num_input_tokens_seen": 3862712, "step": 5915 }, { "epoch": 3.10272536687631, "grad_norm": 0.347318172454834, "learning_rate": 4.35657614232458e-05, "loss": 0.4832, "num_input_tokens_seen": 3866328, "step": 5920 }, { "epoch": 3.1053459119496853, "grad_norm": 0.33524808287620544, "learning_rate": 4.355043854396677e-05, "loss": 0.47, "num_input_tokens_seen": 3869592, "step": 5925 }, { "epoch": 3.107966457023061, "grad_norm": 0.49015751481056213, "learning_rate": 4.35351001425281e-05, "loss": 0.3875, "num_input_tokens_seen": 3875032, "step": 5930 }, { "epoch": 3.110587002096436, "grad_norm": 0.2517223358154297, "learning_rate": 4.351974623176429e-05, "loss": 0.5651, "num_input_tokens_seen": 3879288, "step": 5935 }, { "epoch": 3.1132075471698113, "grad_norm": 0.43810567259788513, "learning_rate": 4.3504376824522787e-05, "loss": 0.389, "num_input_tokens_seen": 3882936, "step": 5940 }, { "epoch": 3.1158280922431865, "grad_norm": 0.3156496584415436, "learning_rate": 4.348899193366399e-05, "loss": 0.5281, "num_input_tokens_seen": 3886296, "step": 5945 }, { "epoch": 3.1184486373165616, "grad_norm": 0.8854103088378906, "learning_rate": 4.34735915720613e-05, "loss": 0.5354, "num_input_tokens_seen": 3889816, "step": 5950 }, { "epoch": 3.1210691823899372, "grad_norm": 0.5472932457923889, "learning_rate": 4.345817575260101e-05, "loss": 0.3918, "num_input_tokens_seen": 3892888, "step": 5955 }, { "epoch": 3.1236897274633124, "grad_norm": 0.9848613739013672, "learning_rate": 4.3442744488182395e-05, "loss": 0.5625, "num_input_tokens_seen": 3896152, "step": 5960 }, { "epoch": 3.1263102725366876, "grad_norm": 0.40323883295059204, "learning_rate": 4.342729779171761e-05, "loss": 0.3487, "num_input_tokens_seen": 3898776, "step": 5965 }, { "epoch": 3.1289308176100628, "grad_norm": 0.4172825217247009, "learning_rate": 4.341183567613177e-05, "loss": 0.5174, "num_input_tokens_seen": 3901912, "step": 5970 }, { "epoch": 3.131551362683438, "grad_norm": 0.3172367215156555, "learning_rate": 4.339635815436286e-05, "loss": 0.5721, "num_input_tokens_seen": 3905688, "step": 5975 }, { "epoch": 3.1341719077568135, "grad_norm": 0.4255950450897217, "learning_rate": 4.3380865239361754e-05, "loss": 0.5041, "num_input_tokens_seen": 3908888, "step": 5980 }, { "epoch": 3.1367924528301887, "grad_norm": 0.5919405221939087, "learning_rate": 4.336535694409222e-05, "loss": 0.427, "num_input_tokens_seen": 3912280, "step": 5985 }, { "epoch": 3.139412997903564, "grad_norm": 0.5596356391906738, "learning_rate": 4.334983328153088e-05, "loss": 0.4451, "num_input_tokens_seen": 3915256, "step": 5990 }, { "epoch": 3.142033542976939, "grad_norm": 0.35899075865745544, "learning_rate": 4.3334294264667255e-05, "loss": 0.4547, "num_input_tokens_seen": 3917912, "step": 5995 }, { "epoch": 3.1446540880503147, "grad_norm": 0.4364791512489319, "learning_rate": 4.3318739906503655e-05, "loss": 0.5505, "num_input_tokens_seen": 3920824, "step": 6000 }, { "epoch": 3.14727463312369, "grad_norm": 0.6232336759567261, "learning_rate": 4.3303170220055264e-05, "loss": 0.4082, "num_input_tokens_seen": 3923736, "step": 6005 }, { "epoch": 3.149895178197065, "grad_norm": 0.33910834789276123, "learning_rate": 4.32875852183501e-05, "loss": 0.5353, "num_input_tokens_seen": 3926584, "step": 6010 }, { "epoch": 3.15251572327044, "grad_norm": 0.6237621307373047, "learning_rate": 4.3271984914428965e-05, "loss": 0.6412, "num_input_tokens_seen": 3929432, "step": 6015 }, { "epoch": 3.1551362683438153, "grad_norm": 0.6116554737091064, "learning_rate": 4.325636932134548e-05, "loss": 0.5253, "num_input_tokens_seen": 3932088, "step": 6020 }, { "epoch": 3.157756813417191, "grad_norm": 0.39617881178855896, "learning_rate": 4.324073845216606e-05, "loss": 0.4381, "num_input_tokens_seen": 3935384, "step": 6025 }, { "epoch": 3.160377358490566, "grad_norm": 0.38625067472457886, "learning_rate": 4.322509231996992e-05, "loss": 0.4564, "num_input_tokens_seen": 3938424, "step": 6030 }, { "epoch": 3.1629979035639413, "grad_norm": 0.4646790623664856, "learning_rate": 4.320943093784901e-05, "loss": 0.5137, "num_input_tokens_seen": 3940984, "step": 6035 }, { "epoch": 3.1656184486373165, "grad_norm": 0.39951688051223755, "learning_rate": 4.319375431890806e-05, "loss": 0.4761, "num_input_tokens_seen": 3945144, "step": 6040 }, { "epoch": 3.1682389937106916, "grad_norm": 0.2516535222530365, "learning_rate": 4.317806247626456e-05, "loss": 0.4991, "num_input_tokens_seen": 3948344, "step": 6045 }, { "epoch": 3.1708595387840672, "grad_norm": 0.4344863295555115, "learning_rate": 4.316235542304872e-05, "loss": 0.549, "num_input_tokens_seen": 3951000, "step": 6050 }, { "epoch": 3.1734800838574424, "grad_norm": 0.5423178672790527, "learning_rate": 4.314663317240348e-05, "loss": 0.4717, "num_input_tokens_seen": 3954584, "step": 6055 }, { "epoch": 3.1761006289308176, "grad_norm": 0.446416437625885, "learning_rate": 4.313089573748451e-05, "loss": 0.3508, "num_input_tokens_seen": 3957944, "step": 6060 }, { "epoch": 3.1787211740041927, "grad_norm": 0.3110542297363281, "learning_rate": 4.311514313146018e-05, "loss": 0.4455, "num_input_tokens_seen": 3961816, "step": 6065 }, { "epoch": 3.181341719077568, "grad_norm": 0.4200058579444885, "learning_rate": 4.309937536751153e-05, "loss": 0.5304, "num_input_tokens_seen": 3964824, "step": 6070 }, { "epoch": 3.1839622641509435, "grad_norm": 0.4246608316898346, "learning_rate": 4.3083592458832327e-05, "loss": 0.3777, "num_input_tokens_seen": 3968056, "step": 6075 }, { "epoch": 3.1865828092243187, "grad_norm": 0.4198152720928192, "learning_rate": 4.3067794418628976e-05, "loss": 0.4719, "num_input_tokens_seen": 3970424, "step": 6080 }, { "epoch": 3.189203354297694, "grad_norm": 0.7329698204994202, "learning_rate": 4.305198126012057e-05, "loss": 0.4425, "num_input_tokens_seen": 3973208, "step": 6085 }, { "epoch": 3.191823899371069, "grad_norm": 0.44448956847190857, "learning_rate": 4.303615299653881e-05, "loss": 0.5266, "num_input_tokens_seen": 3976216, "step": 6090 }, { "epoch": 3.1944444444444446, "grad_norm": 0.6390990018844604, "learning_rate": 4.30203096411281e-05, "loss": 0.5467, "num_input_tokens_seen": 3980824, "step": 6095 }, { "epoch": 3.19706498951782, "grad_norm": 0.43379127979278564, "learning_rate": 4.30044512071454e-05, "loss": 0.5551, "num_input_tokens_seen": 3987480, "step": 6100 }, { "epoch": 3.199685534591195, "grad_norm": 0.40572240948677063, "learning_rate": 4.2988577707860346e-05, "loss": 0.4617, "num_input_tokens_seen": 3991288, "step": 6105 }, { "epoch": 3.20230607966457, "grad_norm": 0.38706013560295105, "learning_rate": 4.2972689156555154e-05, "loss": 0.5618, "num_input_tokens_seen": 3993848, "step": 6110 }, { "epoch": 3.2049266247379453, "grad_norm": 0.4318825602531433, "learning_rate": 4.295678556652464e-05, "loss": 0.498, "num_input_tokens_seen": 3997848, "step": 6115 }, { "epoch": 3.207547169811321, "grad_norm": 0.6327967047691345, "learning_rate": 4.294086695107619e-05, "loss": 0.422, "num_input_tokens_seen": 4001784, "step": 6120 }, { "epoch": 3.210167714884696, "grad_norm": 0.31849756836891174, "learning_rate": 4.292493332352978e-05, "loss": 0.5026, "num_input_tokens_seen": 4005048, "step": 6125 }, { "epoch": 3.2127882599580713, "grad_norm": 0.8332760334014893, "learning_rate": 4.290898469721795e-05, "loss": 0.5159, "num_input_tokens_seen": 4007864, "step": 6130 }, { "epoch": 3.2154088050314464, "grad_norm": 0.4132106304168701, "learning_rate": 4.2893021085485765e-05, "loss": 0.5384, "num_input_tokens_seen": 4011832, "step": 6135 }, { "epoch": 3.2180293501048216, "grad_norm": 0.3783264458179474, "learning_rate": 4.287704250169086e-05, "loss": 0.4735, "num_input_tokens_seen": 4015032, "step": 6140 }, { "epoch": 3.220649895178197, "grad_norm": 0.46614864468574524, "learning_rate": 4.2861048959203386e-05, "loss": 0.6017, "num_input_tokens_seen": 4017432, "step": 6145 }, { "epoch": 3.2232704402515724, "grad_norm": 0.6895266771316528, "learning_rate": 4.284504047140599e-05, "loss": 0.4144, "num_input_tokens_seen": 4020216, "step": 6150 }, { "epoch": 3.2258909853249476, "grad_norm": 0.5178583860397339, "learning_rate": 4.282901705169387e-05, "loss": 0.5481, "num_input_tokens_seen": 4023096, "step": 6155 }, { "epoch": 3.2285115303983227, "grad_norm": 0.9219483733177185, "learning_rate": 4.281297871347468e-05, "loss": 0.388, "num_input_tokens_seen": 4026712, "step": 6160 }, { "epoch": 3.231132075471698, "grad_norm": 0.8817790150642395, "learning_rate": 4.279692547016856e-05, "loss": 0.5373, "num_input_tokens_seen": 4030872, "step": 6165 }, { "epoch": 3.2337526205450735, "grad_norm": 0.4198060929775238, "learning_rate": 4.278085733520814e-05, "loss": 0.4462, "num_input_tokens_seen": 4034744, "step": 6170 }, { "epoch": 3.2363731656184487, "grad_norm": 0.5445418953895569, "learning_rate": 4.2764774322038494e-05, "loss": 0.4638, "num_input_tokens_seen": 4037752, "step": 6175 }, { "epoch": 3.238993710691824, "grad_norm": 1.0788522958755493, "learning_rate": 4.2748676444117156e-05, "loss": 0.4373, "num_input_tokens_seen": 4041048, "step": 6180 }, { "epoch": 3.241614255765199, "grad_norm": 0.667168140411377, "learning_rate": 4.27325637149141e-05, "loss": 0.5771, "num_input_tokens_seen": 4043832, "step": 6185 }, { "epoch": 3.2442348008385746, "grad_norm": 0.467608243227005, "learning_rate": 4.271643614791172e-05, "loss": 0.4241, "num_input_tokens_seen": 4047512, "step": 6190 }, { "epoch": 3.24685534591195, "grad_norm": 0.38842546939849854, "learning_rate": 4.2700293756604824e-05, "loss": 0.4914, "num_input_tokens_seen": 4050584, "step": 6195 }, { "epoch": 3.249475890985325, "grad_norm": 0.46355167031288147, "learning_rate": 4.268413655450064e-05, "loss": 0.3886, "num_input_tokens_seen": 4053752, "step": 6200 }, { "epoch": 3.2520964360587, "grad_norm": 0.7812026739120483, "learning_rate": 4.266796455511875e-05, "loss": 0.5062, "num_input_tokens_seen": 4056824, "step": 6205 }, { "epoch": 3.2547169811320753, "grad_norm": 1.0835638046264648, "learning_rate": 4.2651777771991176e-05, "loss": 0.6335, "num_input_tokens_seen": 4059864, "step": 6210 }, { "epoch": 3.257337526205451, "grad_norm": 0.6285890340805054, "learning_rate": 4.2635576218662257e-05, "loss": 0.4643, "num_input_tokens_seen": 4063416, "step": 6215 }, { "epoch": 3.259958071278826, "grad_norm": 0.33078429102897644, "learning_rate": 4.261935990868871e-05, "loss": 0.4181, "num_input_tokens_seen": 4066264, "step": 6220 }, { "epoch": 3.2625786163522013, "grad_norm": 0.4334056079387665, "learning_rate": 4.260312885563962e-05, "loss": 0.5132, "num_input_tokens_seen": 4068664, "step": 6225 }, { "epoch": 3.2651991614255764, "grad_norm": 0.5062128305435181, "learning_rate": 4.2586883073096386e-05, "loss": 0.6945, "num_input_tokens_seen": 4072088, "step": 6230 }, { "epoch": 3.2678197064989516, "grad_norm": 1.1056205034255981, "learning_rate": 4.257062257465272e-05, "loss": 0.6609, "num_input_tokens_seen": 4075064, "step": 6235 }, { "epoch": 3.270440251572327, "grad_norm": 0.552893340587616, "learning_rate": 4.255434737391469e-05, "loss": 0.5155, "num_input_tokens_seen": 4077752, "step": 6240 }, { "epoch": 3.2730607966457024, "grad_norm": 0.4987391233444214, "learning_rate": 4.2538057484500624e-05, "loss": 0.5151, "num_input_tokens_seen": 4080472, "step": 6245 }, { "epoch": 3.2756813417190775, "grad_norm": 0.5688360929489136, "learning_rate": 4.2521752920041155e-05, "loss": 0.3995, "num_input_tokens_seen": 4083384, "step": 6250 }, { "epoch": 3.2783018867924527, "grad_norm": 0.3658784329891205, "learning_rate": 4.2505433694179216e-05, "loss": 0.5304, "num_input_tokens_seen": 4086488, "step": 6255 }, { "epoch": 3.280922431865828, "grad_norm": 0.42724835872650146, "learning_rate": 4.2489099820569974e-05, "loss": 0.3425, "num_input_tokens_seen": 4089464, "step": 6260 }, { "epoch": 3.2835429769392035, "grad_norm": 0.5278037190437317, "learning_rate": 4.247275131288086e-05, "loss": 0.5403, "num_input_tokens_seen": 4092088, "step": 6265 }, { "epoch": 3.2861635220125787, "grad_norm": 0.5666576623916626, "learning_rate": 4.2456388184791584e-05, "loss": 0.4781, "num_input_tokens_seen": 4095544, "step": 6270 }, { "epoch": 3.288784067085954, "grad_norm": 0.26247647404670715, "learning_rate": 4.2440010449994054e-05, "loss": 0.484, "num_input_tokens_seen": 4098264, "step": 6275 }, { "epoch": 3.291404612159329, "grad_norm": 0.4381787180900574, "learning_rate": 4.24236181221924e-05, "loss": 0.4562, "num_input_tokens_seen": 4100792, "step": 6280 }, { "epoch": 3.2940251572327046, "grad_norm": 0.34342896938323975, "learning_rate": 4.240721121510298e-05, "loss": 0.4468, "num_input_tokens_seen": 4103896, "step": 6285 }, { "epoch": 3.29664570230608, "grad_norm": 0.36919716000556946, "learning_rate": 4.2390789742454354e-05, "loss": 0.3821, "num_input_tokens_seen": 4106968, "step": 6290 }, { "epoch": 3.299266247379455, "grad_norm": 0.41189154982566833, "learning_rate": 4.2374353717987244e-05, "loss": 0.5535, "num_input_tokens_seen": 4110040, "step": 6295 }, { "epoch": 3.30188679245283, "grad_norm": 0.415651798248291, "learning_rate": 4.235790315545457e-05, "loss": 0.4429, "num_input_tokens_seen": 4112664, "step": 6300 }, { "epoch": 3.3045073375262053, "grad_norm": 0.858488917350769, "learning_rate": 4.234143806862141e-05, "loss": 0.49, "num_input_tokens_seen": 4115832, "step": 6305 }, { "epoch": 3.307127882599581, "grad_norm": 0.5166499018669128, "learning_rate": 4.2324958471265006e-05, "loss": 0.459, "num_input_tokens_seen": 4119160, "step": 6310 }, { "epoch": 3.309748427672956, "grad_norm": 0.713454008102417, "learning_rate": 4.230846437717472e-05, "loss": 0.8439, "num_input_tokens_seen": 4122520, "step": 6315 }, { "epoch": 3.3123689727463312, "grad_norm": 0.6084604263305664, "learning_rate": 4.2291955800152063e-05, "loss": 0.4602, "num_input_tokens_seen": 4124888, "step": 6320 }, { "epoch": 3.3149895178197064, "grad_norm": 0.4415496289730072, "learning_rate": 4.2275432754010663e-05, "loss": 0.5227, "num_input_tokens_seen": 4127832, "step": 6325 }, { "epoch": 3.3176100628930816, "grad_norm": 0.5047445297241211, "learning_rate": 4.225889525257624e-05, "loss": 0.6122, "num_input_tokens_seen": 4130232, "step": 6330 }, { "epoch": 3.320230607966457, "grad_norm": 0.37738892436027527, "learning_rate": 4.224234330968663e-05, "loss": 0.4224, "num_input_tokens_seen": 4133080, "step": 6335 }, { "epoch": 3.3228511530398324, "grad_norm": 0.2626029849052429, "learning_rate": 4.222577693919173e-05, "loss": 0.4275, "num_input_tokens_seen": 4136536, "step": 6340 }, { "epoch": 3.3254716981132075, "grad_norm": 0.32495763897895813, "learning_rate": 4.2209196154953536e-05, "loss": 0.44, "num_input_tokens_seen": 4139608, "step": 6345 }, { "epoch": 3.3280922431865827, "grad_norm": 0.7593201398849487, "learning_rate": 4.219260097084608e-05, "loss": 0.6045, "num_input_tokens_seen": 4142200, "step": 6350 }, { "epoch": 3.330712788259958, "grad_norm": 0.6565366983413696, "learning_rate": 4.217599140075546e-05, "loss": 0.8252, "num_input_tokens_seen": 4146200, "step": 6355 }, { "epoch": 3.3333333333333335, "grad_norm": 0.38672569394111633, "learning_rate": 4.2159367458579793e-05, "loss": 0.4065, "num_input_tokens_seen": 4149464, "step": 6360 }, { "epoch": 3.3359538784067087, "grad_norm": 0.2899843752384186, "learning_rate": 4.2142729158229256e-05, "loss": 0.5545, "num_input_tokens_seen": 4151992, "step": 6365 }, { "epoch": 3.338574423480084, "grad_norm": 0.357254296541214, "learning_rate": 4.2126076513626004e-05, "loss": 0.4941, "num_input_tokens_seen": 4155576, "step": 6370 }, { "epoch": 3.341194968553459, "grad_norm": 0.5422326326370239, "learning_rate": 4.210940953870422e-05, "loss": 0.5982, "num_input_tokens_seen": 4159672, "step": 6375 }, { "epoch": 3.3438155136268346, "grad_norm": 0.4195804297924042, "learning_rate": 4.209272824741005e-05, "loss": 0.4395, "num_input_tokens_seen": 4162360, "step": 6380 }, { "epoch": 3.3464360587002098, "grad_norm": 0.4225913882255554, "learning_rate": 4.207603265370166e-05, "loss": 0.4714, "num_input_tokens_seen": 4165304, "step": 6385 }, { "epoch": 3.349056603773585, "grad_norm": 0.3249273896217346, "learning_rate": 4.205932277154914e-05, "loss": 0.4496, "num_input_tokens_seen": 4168728, "step": 6390 }, { "epoch": 3.35167714884696, "grad_norm": 0.6730080246925354, "learning_rate": 4.204259861493457e-05, "loss": 0.3802, "num_input_tokens_seen": 4171960, "step": 6395 }, { "epoch": 3.3542976939203353, "grad_norm": 0.3246231973171234, "learning_rate": 4.202586019785194e-05, "loss": 0.474, "num_input_tokens_seen": 4175544, "step": 6400 }, { "epoch": 3.3569182389937104, "grad_norm": 0.3343845009803772, "learning_rate": 4.2009107534307214e-05, "loss": 0.3969, "num_input_tokens_seen": 4178264, "step": 6405 }, { "epoch": 3.359538784067086, "grad_norm": 0.5170446038246155, "learning_rate": 4.199234063831825e-05, "loss": 0.4832, "num_input_tokens_seen": 4180984, "step": 6410 }, { "epoch": 3.3621593291404612, "grad_norm": 0.47222232818603516, "learning_rate": 4.197555952391482e-05, "loss": 0.5867, "num_input_tokens_seen": 4183768, "step": 6415 }, { "epoch": 3.3647798742138364, "grad_norm": 0.37562528252601624, "learning_rate": 4.195876420513859e-05, "loss": 0.4298, "num_input_tokens_seen": 4187000, "step": 6420 }, { "epoch": 3.3674004192872116, "grad_norm": 0.8304052948951721, "learning_rate": 4.194195469604312e-05, "loss": 0.4903, "num_input_tokens_seen": 4189304, "step": 6425 }, { "epoch": 3.370020964360587, "grad_norm": 0.2985229790210724, "learning_rate": 4.192513101069383e-05, "loss": 0.4951, "num_input_tokens_seen": 4193080, "step": 6430 }, { "epoch": 3.3726415094339623, "grad_norm": 0.5733998417854309, "learning_rate": 4.190829316316803e-05, "loss": 0.4922, "num_input_tokens_seen": 4195608, "step": 6435 }, { "epoch": 3.3752620545073375, "grad_norm": 0.6092982888221741, "learning_rate": 4.189144116755485e-05, "loss": 0.3819, "num_input_tokens_seen": 4198744, "step": 6440 }, { "epoch": 3.3778825995807127, "grad_norm": 0.7475786209106445, "learning_rate": 4.187457503795527e-05, "loss": 0.733, "num_input_tokens_seen": 4202680, "step": 6445 }, { "epoch": 3.380503144654088, "grad_norm": 0.6132342219352722, "learning_rate": 4.1857694788482094e-05, "loss": 0.4742, "num_input_tokens_seen": 4205144, "step": 6450 }, { "epoch": 3.3831236897274635, "grad_norm": 0.5623950362205505, "learning_rate": 4.184080043325995e-05, "loss": 0.4696, "num_input_tokens_seen": 4209112, "step": 6455 }, { "epoch": 3.3857442348008386, "grad_norm": 0.4493253529071808, "learning_rate": 4.1823891986425256e-05, "loss": 0.4776, "num_input_tokens_seen": 4213048, "step": 6460 }, { "epoch": 3.388364779874214, "grad_norm": 0.285150408744812, "learning_rate": 4.180696946212624e-05, "loss": 0.4351, "num_input_tokens_seen": 4216472, "step": 6465 }, { "epoch": 3.390985324947589, "grad_norm": 0.43938809633255005, "learning_rate": 4.1790032874522885e-05, "loss": 0.4155, "num_input_tokens_seen": 4219672, "step": 6470 }, { "epoch": 3.3936058700209646, "grad_norm": 0.6322056651115417, "learning_rate": 4.177308223778696e-05, "loss": 0.5192, "num_input_tokens_seen": 4222872, "step": 6475 }, { "epoch": 3.3962264150943398, "grad_norm": 0.31632956862449646, "learning_rate": 4.175611756610198e-05, "loss": 0.5207, "num_input_tokens_seen": 4225528, "step": 6480 }, { "epoch": 3.398846960167715, "grad_norm": 0.5437989830970764, "learning_rate": 4.173913887366322e-05, "loss": 0.4056, "num_input_tokens_seen": 4229144, "step": 6485 }, { "epoch": 3.40146750524109, "grad_norm": 0.4051440954208374, "learning_rate": 4.172214617467765e-05, "loss": 0.5022, "num_input_tokens_seen": 4232792, "step": 6490 }, { "epoch": 3.4040880503144653, "grad_norm": 0.36790210008621216, "learning_rate": 4.1705139483364e-05, "loss": 0.4351, "num_input_tokens_seen": 4236600, "step": 6495 }, { "epoch": 3.4067085953878404, "grad_norm": 0.5184499025344849, "learning_rate": 4.1688118813952706e-05, "loss": 0.6402, "num_input_tokens_seen": 4239896, "step": 6500 }, { "epoch": 3.409329140461216, "grad_norm": 0.30582812428474426, "learning_rate": 4.167108418068585e-05, "loss": 0.507, "num_input_tokens_seen": 4243640, "step": 6505 }, { "epoch": 3.411949685534591, "grad_norm": 0.42587319016456604, "learning_rate": 4.165403559781727e-05, "loss": 0.5696, "num_input_tokens_seen": 4247160, "step": 6510 }, { "epoch": 3.4145702306079664, "grad_norm": 0.49119439721107483, "learning_rate": 4.163697307961242e-05, "loss": 0.5891, "num_input_tokens_seen": 4251032, "step": 6515 }, { "epoch": 3.4171907756813416, "grad_norm": 0.5864958763122559, "learning_rate": 4.1619896640348445e-05, "loss": 0.5548, "num_input_tokens_seen": 4254680, "step": 6520 }, { "epoch": 3.419811320754717, "grad_norm": 0.4158414304256439, "learning_rate": 4.160280629431413e-05, "loss": 0.4246, "num_input_tokens_seen": 4257720, "step": 6525 }, { "epoch": 3.4224318658280923, "grad_norm": 0.3166358470916748, "learning_rate": 4.158570205580989e-05, "loss": 0.5139, "num_input_tokens_seen": 4260696, "step": 6530 }, { "epoch": 3.4250524109014675, "grad_norm": 0.46377095580101013, "learning_rate": 4.156858393914779e-05, "loss": 0.4516, "num_input_tokens_seen": 4263416, "step": 6535 }, { "epoch": 3.4276729559748427, "grad_norm": 0.6273624300956726, "learning_rate": 4.1551451958651455e-05, "loss": 0.5954, "num_input_tokens_seen": 4266232, "step": 6540 }, { "epoch": 3.430293501048218, "grad_norm": 1.0190849304199219, "learning_rate": 4.153430612865616e-05, "loss": 0.4245, "num_input_tokens_seen": 4269400, "step": 6545 }, { "epoch": 3.4329140461215935, "grad_norm": 0.5481160879135132, "learning_rate": 4.151714646350876e-05, "loss": 0.4817, "num_input_tokens_seen": 4271896, "step": 6550 }, { "epoch": 3.4355345911949686, "grad_norm": 0.48584285378456116, "learning_rate": 4.149997297756767e-05, "loss": 0.6767, "num_input_tokens_seen": 4275416, "step": 6555 }, { "epoch": 3.438155136268344, "grad_norm": 0.29123398661613464, "learning_rate": 4.148278568520289e-05, "loss": 0.4386, "num_input_tokens_seen": 4278936, "step": 6560 }, { "epoch": 3.440775681341719, "grad_norm": 0.46528270840644836, "learning_rate": 4.146558460079595e-05, "loss": 0.4878, "num_input_tokens_seen": 4282808, "step": 6565 }, { "epoch": 3.4433962264150946, "grad_norm": 0.4606470763683319, "learning_rate": 4.1448369738739923e-05, "loss": 0.5144, "num_input_tokens_seen": 4285400, "step": 6570 }, { "epoch": 3.4460167714884697, "grad_norm": 0.7323496341705322, "learning_rate": 4.143114111343944e-05, "loss": 0.3973, "num_input_tokens_seen": 4288280, "step": 6575 }, { "epoch": 3.448637316561845, "grad_norm": 0.32432183623313904, "learning_rate": 4.1413898739310605e-05, "loss": 0.4874, "num_input_tokens_seen": 4291576, "step": 6580 }, { "epoch": 3.45125786163522, "grad_norm": 0.545707643032074, "learning_rate": 4.1396642630781076e-05, "loss": 0.4866, "num_input_tokens_seen": 4295288, "step": 6585 }, { "epoch": 3.4538784067085953, "grad_norm": 0.3439484238624573, "learning_rate": 4.137937280228996e-05, "loss": 0.5363, "num_input_tokens_seen": 4298808, "step": 6590 }, { "epoch": 3.4564989517819704, "grad_norm": 0.6648797988891602, "learning_rate": 4.136208926828786e-05, "loss": 0.3836, "num_input_tokens_seen": 4304280, "step": 6595 }, { "epoch": 3.459119496855346, "grad_norm": 0.7976412177085876, "learning_rate": 4.134479204323685e-05, "loss": 0.6877, "num_input_tokens_seen": 4306616, "step": 6600 }, { "epoch": 3.461740041928721, "grad_norm": 0.6732871532440186, "learning_rate": 4.132748114161046e-05, "loss": 0.4846, "num_input_tokens_seen": 4309112, "step": 6605 }, { "epoch": 3.4643605870020964, "grad_norm": 0.9955762624740601, "learning_rate": 4.131015657789365e-05, "loss": 0.5689, "num_input_tokens_seen": 4311832, "step": 6610 }, { "epoch": 3.4669811320754715, "grad_norm": 0.11212341487407684, "learning_rate": 4.129281836658285e-05, "loss": 0.4664, "num_input_tokens_seen": 4318328, "step": 6615 }, { "epoch": 3.469601677148847, "grad_norm": 0.5738621950149536, "learning_rate": 4.127546652218586e-05, "loss": 0.4522, "num_input_tokens_seen": 4321848, "step": 6620 }, { "epoch": 3.4722222222222223, "grad_norm": 0.23600426316261292, "learning_rate": 4.1258101059221914e-05, "loss": 0.3411, "num_input_tokens_seen": 4324984, "step": 6625 }, { "epoch": 3.4748427672955975, "grad_norm": 0.5461429357528687, "learning_rate": 4.124072199222165e-05, "loss": 0.4745, "num_input_tokens_seen": 4327704, "step": 6630 }, { "epoch": 3.4774633123689727, "grad_norm": 0.3459562063217163, "learning_rate": 4.122332933572707e-05, "loss": 0.4209, "num_input_tokens_seen": 4330904, "step": 6635 }, { "epoch": 3.480083857442348, "grad_norm": 0.6757499575614929, "learning_rate": 4.120592310429154e-05, "loss": 0.5306, "num_input_tokens_seen": 4333592, "step": 6640 }, { "epoch": 3.4827044025157234, "grad_norm": 0.5925214886665344, "learning_rate": 4.118850331247982e-05, "loss": 0.4468, "num_input_tokens_seen": 4336696, "step": 6645 }, { "epoch": 3.4853249475890986, "grad_norm": 0.9932631254196167, "learning_rate": 4.1171069974868e-05, "loss": 0.7335, "num_input_tokens_seen": 4340088, "step": 6650 }, { "epoch": 3.487945492662474, "grad_norm": 0.5551575422286987, "learning_rate": 4.115362310604347e-05, "loss": 0.5467, "num_input_tokens_seen": 4343640, "step": 6655 }, { "epoch": 3.490566037735849, "grad_norm": 0.40128418803215027, "learning_rate": 4.113616272060501e-05, "loss": 0.4564, "num_input_tokens_seen": 4348568, "step": 6660 }, { "epoch": 3.4931865828092246, "grad_norm": 0.6669957041740417, "learning_rate": 4.111868883316266e-05, "loss": 0.4204, "num_input_tokens_seen": 4352376, "step": 6665 }, { "epoch": 3.4958071278825997, "grad_norm": 0.35790789127349854, "learning_rate": 4.110120145833775e-05, "loss": 0.5221, "num_input_tokens_seen": 4356120, "step": 6670 }, { "epoch": 3.498427672955975, "grad_norm": 0.5286223292350769, "learning_rate": 4.108370061076294e-05, "loss": 0.4868, "num_input_tokens_seen": 4359448, "step": 6675 }, { "epoch": 3.5, "eval_loss": 0.49469223618507385, "eval_runtime": 14.5466, "eval_samples_per_second": 58.295, "eval_steps_per_second": 14.574, "num_input_tokens_seen": 4361944, "step": 6678 }, { "epoch": 3.50104821802935, "grad_norm": 0.42925769090652466, "learning_rate": 4.106618630508213e-05, "loss": 0.363, "num_input_tokens_seen": 4362936, "step": 6680 }, { "epoch": 3.5036687631027252, "grad_norm": 0.37006866931915283, "learning_rate": 4.10486585559505e-05, "loss": 0.4285, "num_input_tokens_seen": 4366968, "step": 6685 }, { "epoch": 3.5062893081761004, "grad_norm": 0.3181057274341583, "learning_rate": 4.103111737803446e-05, "loss": 0.3694, "num_input_tokens_seen": 4370264, "step": 6690 }, { "epoch": 3.508909853249476, "grad_norm": 0.5779807567596436, "learning_rate": 4.101356278601167e-05, "loss": 0.3456, "num_input_tokens_seen": 4373432, "step": 6695 }, { "epoch": 3.511530398322851, "grad_norm": 0.40401268005371094, "learning_rate": 4.0995994794571015e-05, "loss": 0.6198, "num_input_tokens_seen": 4376824, "step": 6700 }, { "epoch": 3.5141509433962264, "grad_norm": 0.35093140602111816, "learning_rate": 4.0978413418412574e-05, "loss": 0.5482, "num_input_tokens_seen": 4380504, "step": 6705 }, { "epoch": 3.5167714884696015, "grad_norm": 0.3707839548587799, "learning_rate": 4.0960818672247656e-05, "loss": 0.4668, "num_input_tokens_seen": 4383704, "step": 6710 }, { "epoch": 3.519392033542977, "grad_norm": 0.6124925017356873, "learning_rate": 4.094321057079874e-05, "loss": 0.603, "num_input_tokens_seen": 4387128, "step": 6715 }, { "epoch": 3.5220125786163523, "grad_norm": 0.47177398204803467, "learning_rate": 4.092558912879947e-05, "loss": 0.4991, "num_input_tokens_seen": 4390296, "step": 6720 }, { "epoch": 3.5246331236897275, "grad_norm": 0.46864813566207886, "learning_rate": 4.090795436099466e-05, "loss": 0.539, "num_input_tokens_seen": 4393624, "step": 6725 }, { "epoch": 3.5272536687631026, "grad_norm": 0.38350245356559753, "learning_rate": 4.089030628214029e-05, "loss": 0.4213, "num_input_tokens_seen": 4397944, "step": 6730 }, { "epoch": 3.529874213836478, "grad_norm": 0.36182236671447754, "learning_rate": 4.0872644907003476e-05, "loss": 0.4802, "num_input_tokens_seen": 4400696, "step": 6735 }, { "epoch": 3.532494758909853, "grad_norm": 0.6183091402053833, "learning_rate": 4.0854970250362426e-05, "loss": 0.4795, "num_input_tokens_seen": 4403416, "step": 6740 }, { "epoch": 3.5351153039832286, "grad_norm": 0.3343009352684021, "learning_rate": 4.0837282327006495e-05, "loss": 0.4207, "num_input_tokens_seen": 4407320, "step": 6745 }, { "epoch": 3.5377358490566038, "grad_norm": 0.4818149507045746, "learning_rate": 4.081958115173614e-05, "loss": 0.4457, "num_input_tokens_seen": 4409848, "step": 6750 }, { "epoch": 3.540356394129979, "grad_norm": 0.45267340540885925, "learning_rate": 4.080186673936288e-05, "loss": 0.483, "num_input_tokens_seen": 4412920, "step": 6755 }, { "epoch": 3.5429769392033545, "grad_norm": 0.7106027603149414, "learning_rate": 4.078413910470934e-05, "loss": 0.4096, "num_input_tokens_seen": 4415384, "step": 6760 }, { "epoch": 3.5455974842767297, "grad_norm": 0.5626880526542664, "learning_rate": 4.076639826260919e-05, "loss": 0.3891, "num_input_tokens_seen": 4418040, "step": 6765 }, { "epoch": 3.548218029350105, "grad_norm": 0.3147718012332916, "learning_rate": 4.074864422790714e-05, "loss": 0.4738, "num_input_tokens_seen": 4421464, "step": 6770 }, { "epoch": 3.55083857442348, "grad_norm": 0.4430132508277893, "learning_rate": 4.073087701545897e-05, "loss": 0.5034, "num_input_tokens_seen": 4423672, "step": 6775 }, { "epoch": 3.5534591194968552, "grad_norm": 0.47095292806625366, "learning_rate": 4.071309664013148e-05, "loss": 0.4321, "num_input_tokens_seen": 4426648, "step": 6780 }, { "epoch": 3.5560796645702304, "grad_norm": 0.6246224045753479, "learning_rate": 4.069530311680247e-05, "loss": 0.5144, "num_input_tokens_seen": 4429752, "step": 6785 }, { "epoch": 3.558700209643606, "grad_norm": 0.41312167048454285, "learning_rate": 4.0677496460360734e-05, "loss": 0.6028, "num_input_tokens_seen": 4433784, "step": 6790 }, { "epoch": 3.561320754716981, "grad_norm": 0.5905318856239319, "learning_rate": 4.0659676685706084e-05, "loss": 0.4979, "num_input_tokens_seen": 4436376, "step": 6795 }, { "epoch": 3.5639412997903563, "grad_norm": 0.6242421269416809, "learning_rate": 4.064184380774929e-05, "loss": 0.4237, "num_input_tokens_seen": 4439384, "step": 6800 }, { "epoch": 3.5665618448637315, "grad_norm": 1.001686930656433, "learning_rate": 4.062399784141209e-05, "loss": 0.5081, "num_input_tokens_seen": 4442712, "step": 6805 }, { "epoch": 3.569182389937107, "grad_norm": 0.6119611859321594, "learning_rate": 4.060613880162717e-05, "loss": 0.5155, "num_input_tokens_seen": 4446936, "step": 6810 }, { "epoch": 3.5718029350104823, "grad_norm": 0.5175453424453735, "learning_rate": 4.0588266703338164e-05, "loss": 0.5392, "num_input_tokens_seen": 4450328, "step": 6815 }, { "epoch": 3.5744234800838575, "grad_norm": 0.40224412083625793, "learning_rate": 4.057038156149961e-05, "loss": 0.4331, "num_input_tokens_seen": 4453816, "step": 6820 }, { "epoch": 3.5770440251572326, "grad_norm": 0.3537302017211914, "learning_rate": 4.055248339107701e-05, "loss": 0.4005, "num_input_tokens_seen": 4458072, "step": 6825 }, { "epoch": 3.579664570230608, "grad_norm": 0.270938903093338, "learning_rate": 4.053457220704671e-05, "loss": 0.4253, "num_input_tokens_seen": 4461144, "step": 6830 }, { "epoch": 3.582285115303983, "grad_norm": 0.3923065662384033, "learning_rate": 4.0516648024395974e-05, "loss": 0.3758, "num_input_tokens_seen": 4463928, "step": 6835 }, { "epoch": 3.5849056603773586, "grad_norm": 0.6844568252563477, "learning_rate": 4.049871085812295e-05, "loss": 0.47, "num_input_tokens_seen": 4466488, "step": 6840 }, { "epoch": 3.5875262054507338, "grad_norm": 0.2548898160457611, "learning_rate": 4.0480760723236633e-05, "loss": 0.412, "num_input_tokens_seen": 4470104, "step": 6845 }, { "epoch": 3.590146750524109, "grad_norm": 0.591019332408905, "learning_rate": 4.046279763475687e-05, "loss": 0.5772, "num_input_tokens_seen": 4472184, "step": 6850 }, { "epoch": 3.5927672955974845, "grad_norm": 0.35999053716659546, "learning_rate": 4.0444821607714366e-05, "loss": 0.4873, "num_input_tokens_seen": 4474840, "step": 6855 }, { "epoch": 3.5953878406708597, "grad_norm": 0.8944082856178284, "learning_rate": 4.042683265715063e-05, "loss": 0.556, "num_input_tokens_seen": 4477816, "step": 6860 }, { "epoch": 3.598008385744235, "grad_norm": 0.40375444293022156, "learning_rate": 4.040883079811799e-05, "loss": 0.553, "num_input_tokens_seen": 4481816, "step": 6865 }, { "epoch": 3.60062893081761, "grad_norm": 0.5387933850288391, "learning_rate": 4.039081604567959e-05, "loss": 0.3826, "num_input_tokens_seen": 4485464, "step": 6870 }, { "epoch": 3.603249475890985, "grad_norm": 0.3897915780544281, "learning_rate": 4.037278841490933e-05, "loss": 0.3902, "num_input_tokens_seen": 4490392, "step": 6875 }, { "epoch": 3.6058700209643604, "grad_norm": 0.5833226442337036, "learning_rate": 4.0354747920891954e-05, "loss": 0.5, "num_input_tokens_seen": 4494712, "step": 6880 }, { "epoch": 3.608490566037736, "grad_norm": 0.37756264209747314, "learning_rate": 4.033669457872288e-05, "loss": 0.5368, "num_input_tokens_seen": 4497528, "step": 6885 }, { "epoch": 3.611111111111111, "grad_norm": 0.368028461933136, "learning_rate": 4.0318628403508336e-05, "loss": 0.3961, "num_input_tokens_seen": 4500536, "step": 6890 }, { "epoch": 3.6137316561844863, "grad_norm": 0.4230254590511322, "learning_rate": 4.0300549410365276e-05, "loss": 0.512, "num_input_tokens_seen": 4504056, "step": 6895 }, { "epoch": 3.6163522012578615, "grad_norm": 0.40599188208580017, "learning_rate": 4.0282457614421364e-05, "loss": 0.4578, "num_input_tokens_seen": 4506936, "step": 6900 }, { "epoch": 3.618972746331237, "grad_norm": 0.41491568088531494, "learning_rate": 4.0264353030814996e-05, "loss": 0.3787, "num_input_tokens_seen": 4509560, "step": 6905 }, { "epoch": 3.6215932914046123, "grad_norm": 0.539682149887085, "learning_rate": 4.0246235674695255e-05, "loss": 0.4798, "num_input_tokens_seen": 4513240, "step": 6910 }, { "epoch": 3.6242138364779874, "grad_norm": 0.46847379207611084, "learning_rate": 4.022810556122193e-05, "loss": 0.4772, "num_input_tokens_seen": 4515640, "step": 6915 }, { "epoch": 3.6268343815513626, "grad_norm": 0.2904217541217804, "learning_rate": 4.020996270556546e-05, "loss": 0.412, "num_input_tokens_seen": 4520824, "step": 6920 }, { "epoch": 3.629454926624738, "grad_norm": 0.4152389466762543, "learning_rate": 4.0191807122906964e-05, "loss": 0.5028, "num_input_tokens_seen": 4523896, "step": 6925 }, { "epoch": 3.632075471698113, "grad_norm": 0.5272411704063416, "learning_rate": 4.01736388284382e-05, "loss": 0.6164, "num_input_tokens_seen": 4527000, "step": 6930 }, { "epoch": 3.6346960167714886, "grad_norm": 0.6245113611221313, "learning_rate": 4.015545783736157e-05, "loss": 0.5937, "num_input_tokens_seen": 4529912, "step": 6935 }, { "epoch": 3.6373165618448637, "grad_norm": 0.6942453980445862, "learning_rate": 4.013726416489009e-05, "loss": 0.434, "num_input_tokens_seen": 4533336, "step": 6940 }, { "epoch": 3.639937106918239, "grad_norm": 0.3901107609272003, "learning_rate": 4.01190578262474e-05, "loss": 0.4791, "num_input_tokens_seen": 4536344, "step": 6945 }, { "epoch": 3.6425576519916145, "grad_norm": 0.3145482540130615, "learning_rate": 4.0100838836667735e-05, "loss": 0.4213, "num_input_tokens_seen": 4540152, "step": 6950 }, { "epoch": 3.6451781970649897, "grad_norm": 0.4906693994998932, "learning_rate": 4.0082607211395904e-05, "loss": 0.3556, "num_input_tokens_seen": 4543032, "step": 6955 }, { "epoch": 3.647798742138365, "grad_norm": 0.71284419298172, "learning_rate": 4.006436296568731e-05, "loss": 0.4841, "num_input_tokens_seen": 4547064, "step": 6960 }, { "epoch": 3.65041928721174, "grad_norm": 0.40839922428131104, "learning_rate": 4.00461061148079e-05, "loss": 0.5187, "num_input_tokens_seen": 4551000, "step": 6965 }, { "epoch": 3.653039832285115, "grad_norm": 0.2786627411842346, "learning_rate": 4.0027836674034174e-05, "loss": 0.543, "num_input_tokens_seen": 4555000, "step": 6970 }, { "epoch": 3.6556603773584904, "grad_norm": 0.23181140422821045, "learning_rate": 4.000955465865316e-05, "loss": 0.4279, "num_input_tokens_seen": 4558680, "step": 6975 }, { "epoch": 3.658280922431866, "grad_norm": 0.4376983642578125, "learning_rate": 3.999126008396242e-05, "loss": 0.4504, "num_input_tokens_seen": 4561336, "step": 6980 }, { "epoch": 3.660901467505241, "grad_norm": 0.3383617401123047, "learning_rate": 3.9972952965270006e-05, "loss": 0.4842, "num_input_tokens_seen": 4564568, "step": 6985 }, { "epoch": 3.6635220125786163, "grad_norm": 0.48502275347709656, "learning_rate": 3.9954633317894496e-05, "loss": 0.4528, "num_input_tokens_seen": 4567448, "step": 6990 }, { "epoch": 3.6661425576519915, "grad_norm": 0.2855909466743469, "learning_rate": 3.9936301157164926e-05, "loss": 0.5741, "num_input_tokens_seen": 4570136, "step": 6995 }, { "epoch": 3.668763102725367, "grad_norm": 0.5027920603752136, "learning_rate": 3.99179564984208e-05, "loss": 0.4905, "num_input_tokens_seen": 4573624, "step": 7000 }, { "epoch": 3.6713836477987423, "grad_norm": 1.381174087524414, "learning_rate": 3.989959935701211e-05, "loss": 0.4299, "num_input_tokens_seen": 4577048, "step": 7005 }, { "epoch": 3.6740041928721174, "grad_norm": 0.4042738378047943, "learning_rate": 3.988122974829926e-05, "loss": 0.5198, "num_input_tokens_seen": 4580088, "step": 7010 }, { "epoch": 3.6766247379454926, "grad_norm": 0.38226816058158875, "learning_rate": 3.9862847687653116e-05, "loss": 0.4087, "num_input_tokens_seen": 4583480, "step": 7015 }, { "epoch": 3.6792452830188678, "grad_norm": 0.5025838017463684, "learning_rate": 3.9844453190454924e-05, "loss": 0.648, "num_input_tokens_seen": 4586168, "step": 7020 }, { "epoch": 3.681865828092243, "grad_norm": 0.35194966197013855, "learning_rate": 3.982604627209637e-05, "loss": 0.3937, "num_input_tokens_seen": 4589592, "step": 7025 }, { "epoch": 3.6844863731656186, "grad_norm": 0.7606292963027954, "learning_rate": 3.980762694797953e-05, "loss": 0.5573, "num_input_tokens_seen": 4592440, "step": 7030 }, { "epoch": 3.6871069182389937, "grad_norm": 0.4922422468662262, "learning_rate": 3.978919523351684e-05, "loss": 0.3612, "num_input_tokens_seen": 4594904, "step": 7035 }, { "epoch": 3.689727463312369, "grad_norm": 0.42131075263023376, "learning_rate": 3.977075114413112e-05, "loss": 0.5346, "num_input_tokens_seen": 4597624, "step": 7040 }, { "epoch": 3.6923480083857445, "grad_norm": 0.721359133720398, "learning_rate": 3.9752294695255545e-05, "loss": 0.5608, "num_input_tokens_seen": 4599928, "step": 7045 }, { "epoch": 3.6949685534591197, "grad_norm": 1.0772240161895752, "learning_rate": 3.973382590233362e-05, "loss": 0.4927, "num_input_tokens_seen": 4602328, "step": 7050 }, { "epoch": 3.697589098532495, "grad_norm": 0.4539746642112732, "learning_rate": 3.9715344780819205e-05, "loss": 0.5034, "num_input_tokens_seen": 4606328, "step": 7055 }, { "epoch": 3.70020964360587, "grad_norm": 0.39273300766944885, "learning_rate": 3.9696851346176445e-05, "loss": 0.3907, "num_input_tokens_seen": 4609176, "step": 7060 }, { "epoch": 3.702830188679245, "grad_norm": 0.34806060791015625, "learning_rate": 3.9678345613879796e-05, "loss": 0.5601, "num_input_tokens_seen": 4612056, "step": 7065 }, { "epoch": 3.7054507337526204, "grad_norm": 0.5711743831634521, "learning_rate": 3.965982759941403e-05, "loss": 0.6011, "num_input_tokens_seen": 4615320, "step": 7070 }, { "epoch": 3.708071278825996, "grad_norm": 0.4270710051059723, "learning_rate": 3.964129731827415e-05, "loss": 0.5531, "num_input_tokens_seen": 4618200, "step": 7075 }, { "epoch": 3.710691823899371, "grad_norm": 0.6139219999313354, "learning_rate": 3.9622754785965474e-05, "loss": 0.3492, "num_input_tokens_seen": 4621368, "step": 7080 }, { "epoch": 3.7133123689727463, "grad_norm": 0.4384901523590088, "learning_rate": 3.9604200018003525e-05, "loss": 0.5054, "num_input_tokens_seen": 4624088, "step": 7085 }, { "epoch": 3.7159329140461215, "grad_norm": 0.3649217188358307, "learning_rate": 3.95856330299141e-05, "loss": 0.6184, "num_input_tokens_seen": 4628568, "step": 7090 }, { "epoch": 3.718553459119497, "grad_norm": 0.43102139234542847, "learning_rate": 3.956705383723319e-05, "loss": 0.5294, "num_input_tokens_seen": 4631288, "step": 7095 }, { "epoch": 3.7211740041928723, "grad_norm": 0.29041752219200134, "learning_rate": 3.954846245550704e-05, "loss": 0.3915, "num_input_tokens_seen": 4635224, "step": 7100 }, { "epoch": 3.7237945492662474, "grad_norm": 0.4094657301902771, "learning_rate": 3.952985890029205e-05, "loss": 0.7005, "num_input_tokens_seen": 4638648, "step": 7105 }, { "epoch": 3.7264150943396226, "grad_norm": 0.7532150745391846, "learning_rate": 3.951124318715482e-05, "loss": 0.3885, "num_input_tokens_seen": 4641848, "step": 7110 }, { "epoch": 3.7290356394129978, "grad_norm": 0.41749459505081177, "learning_rate": 3.9492615331672145e-05, "loss": 0.4977, "num_input_tokens_seen": 4644408, "step": 7115 }, { "epoch": 3.731656184486373, "grad_norm": 0.5230793952941895, "learning_rate": 3.947397534943096e-05, "loss": 0.6296, "num_input_tokens_seen": 4647096, "step": 7120 }, { "epoch": 3.7342767295597485, "grad_norm": 0.44254377484321594, "learning_rate": 3.9455323256028344e-05, "loss": 0.4891, "num_input_tokens_seen": 4649176, "step": 7125 }, { "epoch": 3.7368972746331237, "grad_norm": 0.4670647382736206, "learning_rate": 3.943665906707153e-05, "loss": 0.5103, "num_input_tokens_seen": 4653144, "step": 7130 }, { "epoch": 3.739517819706499, "grad_norm": 0.38601401448249817, "learning_rate": 3.9417982798177834e-05, "loss": 0.3593, "num_input_tokens_seen": 4655864, "step": 7135 }, { "epoch": 3.742138364779874, "grad_norm": 0.38420161604881287, "learning_rate": 3.939929446497472e-05, "loss": 0.4965, "num_input_tokens_seen": 4659000, "step": 7140 }, { "epoch": 3.7447589098532497, "grad_norm": 0.28017666935920715, "learning_rate": 3.938059408309974e-05, "loss": 0.5339, "num_input_tokens_seen": 4661720, "step": 7145 }, { "epoch": 3.747379454926625, "grad_norm": 0.7930901050567627, "learning_rate": 3.936188166820051e-05, "loss": 0.4663, "num_input_tokens_seen": 4666584, "step": 7150 }, { "epoch": 3.75, "grad_norm": 0.568840742111206, "learning_rate": 3.9343157235934714e-05, "loss": 0.4681, "num_input_tokens_seen": 4669272, "step": 7155 }, { "epoch": 3.752620545073375, "grad_norm": 0.6293945908546448, "learning_rate": 3.932442080197012e-05, "loss": 0.49, "num_input_tokens_seen": 4672280, "step": 7160 }, { "epoch": 3.7552410901467503, "grad_norm": 0.4549958407878876, "learning_rate": 3.930567238198451e-05, "loss": 0.3646, "num_input_tokens_seen": 4675032, "step": 7165 }, { "epoch": 3.757861635220126, "grad_norm": 0.45000848174095154, "learning_rate": 3.928691199166571e-05, "loss": 0.4651, "num_input_tokens_seen": 4678360, "step": 7170 }, { "epoch": 3.760482180293501, "grad_norm": 1.2857269048690796, "learning_rate": 3.926813964671156e-05, "loss": 0.5054, "num_input_tokens_seen": 4681208, "step": 7175 }, { "epoch": 3.7631027253668763, "grad_norm": 0.31270021200180054, "learning_rate": 3.9249355362829884e-05, "loss": 0.4734, "num_input_tokens_seen": 4683768, "step": 7180 }, { "epoch": 3.7657232704402515, "grad_norm": 0.7328094244003296, "learning_rate": 3.923055915573853e-05, "loss": 0.4852, "num_input_tokens_seen": 4686872, "step": 7185 }, { "epoch": 3.768343815513627, "grad_norm": 0.32172900438308716, "learning_rate": 3.921175104116531e-05, "loss": 0.4739, "num_input_tokens_seen": 4690424, "step": 7190 }, { "epoch": 3.7709643605870022, "grad_norm": 0.6473044753074646, "learning_rate": 3.9192931034847966e-05, "loss": 0.5571, "num_input_tokens_seen": 4692824, "step": 7195 }, { "epoch": 3.7735849056603774, "grad_norm": 0.44445303082466125, "learning_rate": 3.917409915253426e-05, "loss": 0.5317, "num_input_tokens_seen": 4696440, "step": 7200 }, { "epoch": 3.7762054507337526, "grad_norm": 0.46590474247932434, "learning_rate": 3.915525540998182e-05, "loss": 0.4316, "num_input_tokens_seen": 4700440, "step": 7205 }, { "epoch": 3.7788259958071277, "grad_norm": 0.3672661781311035, "learning_rate": 3.9136399822958235e-05, "loss": 0.4467, "num_input_tokens_seen": 4703320, "step": 7210 }, { "epoch": 3.781446540880503, "grad_norm": 0.4089609682559967, "learning_rate": 3.911753240724101e-05, "loss": 0.5706, "num_input_tokens_seen": 4706552, "step": 7215 }, { "epoch": 3.7840670859538785, "grad_norm": 0.2406141310930252, "learning_rate": 3.909865317861753e-05, "loss": 0.4073, "num_input_tokens_seen": 4709592, "step": 7220 }, { "epoch": 3.7866876310272537, "grad_norm": 0.5140736103057861, "learning_rate": 3.907976215288507e-05, "loss": 0.5781, "num_input_tokens_seen": 4712344, "step": 7225 }, { "epoch": 3.789308176100629, "grad_norm": 0.7091354131698608, "learning_rate": 3.9060859345850774e-05, "loss": 0.5393, "num_input_tokens_seen": 4715000, "step": 7230 }, { "epoch": 3.791928721174004, "grad_norm": 0.4576745331287384, "learning_rate": 3.904194477333166e-05, "loss": 0.4408, "num_input_tokens_seen": 4718904, "step": 7235 }, { "epoch": 3.7945492662473796, "grad_norm": 0.6483544707298279, "learning_rate": 3.902301845115456e-05, "loss": 0.577, "num_input_tokens_seen": 4721656, "step": 7240 }, { "epoch": 3.797169811320755, "grad_norm": 1.6312750577926636, "learning_rate": 3.900408039515617e-05, "loss": 0.5692, "num_input_tokens_seen": 4726264, "step": 7245 }, { "epoch": 3.79979035639413, "grad_norm": 0.4567606747150421, "learning_rate": 3.8985130621182985e-05, "loss": 0.4275, "num_input_tokens_seen": 4729112, "step": 7250 }, { "epoch": 3.802410901467505, "grad_norm": 0.42230603098869324, "learning_rate": 3.896616914509131e-05, "loss": 0.5446, "num_input_tokens_seen": 4731832, "step": 7255 }, { "epoch": 3.8050314465408803, "grad_norm": 0.49097511172294617, "learning_rate": 3.894719598274725e-05, "loss": 0.5867, "num_input_tokens_seen": 4738136, "step": 7260 }, { "epoch": 3.8076519916142555, "grad_norm": 0.3805791437625885, "learning_rate": 3.892821115002667e-05, "loss": 0.4203, "num_input_tokens_seen": 4741080, "step": 7265 }, { "epoch": 3.810272536687631, "grad_norm": 0.3842325508594513, "learning_rate": 3.8909214662815216e-05, "loss": 0.3845, "num_input_tokens_seen": 4744472, "step": 7270 }, { "epoch": 3.8128930817610063, "grad_norm": 0.3682522773742676, "learning_rate": 3.889020653700828e-05, "loss": 0.3909, "num_input_tokens_seen": 4748664, "step": 7275 }, { "epoch": 3.8155136268343814, "grad_norm": 0.7752379179000854, "learning_rate": 3.8871186788511e-05, "loss": 0.4999, "num_input_tokens_seen": 4751704, "step": 7280 }, { "epoch": 3.818134171907757, "grad_norm": 2.157536268234253, "learning_rate": 3.8852155433238214e-05, "loss": 0.5475, "num_input_tokens_seen": 4754104, "step": 7285 }, { "epoch": 3.8207547169811322, "grad_norm": 0.41389673948287964, "learning_rate": 3.8833112487114505e-05, "loss": 0.4382, "num_input_tokens_seen": 4757528, "step": 7290 }, { "epoch": 3.8233752620545074, "grad_norm": 0.3146992027759552, "learning_rate": 3.881405796607414e-05, "loss": 0.619, "num_input_tokens_seen": 4761208, "step": 7295 }, { "epoch": 3.8259958071278826, "grad_norm": 0.4538338780403137, "learning_rate": 3.879499188606107e-05, "loss": 0.4657, "num_input_tokens_seen": 4764088, "step": 7300 }, { "epoch": 3.8286163522012577, "grad_norm": 0.2991197407245636, "learning_rate": 3.877591426302892e-05, "loss": 0.5032, "num_input_tokens_seen": 4767352, "step": 7305 }, { "epoch": 3.831236897274633, "grad_norm": 0.7185759544372559, "learning_rate": 3.8756825112940964e-05, "loss": 0.488, "num_input_tokens_seen": 4770648, "step": 7310 }, { "epoch": 3.8338574423480085, "grad_norm": 0.32269856333732605, "learning_rate": 3.873772445177015e-05, "loss": 0.4758, "num_input_tokens_seen": 4777624, "step": 7315 }, { "epoch": 3.8364779874213837, "grad_norm": 0.6068574786186218, "learning_rate": 3.8718612295499036e-05, "loss": 0.5588, "num_input_tokens_seen": 4780376, "step": 7320 }, { "epoch": 3.839098532494759, "grad_norm": 0.2983579933643341, "learning_rate": 3.8699488660119784e-05, "loss": 0.4871, "num_input_tokens_seen": 4783096, "step": 7325 }, { "epoch": 3.841719077568134, "grad_norm": 0.3198521137237549, "learning_rate": 3.868035356163419e-05, "loss": 0.3436, "num_input_tokens_seen": 4787000, "step": 7330 }, { "epoch": 3.8443396226415096, "grad_norm": 0.8056184649467468, "learning_rate": 3.866120701605363e-05, "loss": 0.4908, "num_input_tokens_seen": 4790168, "step": 7335 }, { "epoch": 3.846960167714885, "grad_norm": 0.3335869014263153, "learning_rate": 3.8642049039399054e-05, "loss": 0.5356, "num_input_tokens_seen": 4793944, "step": 7340 }, { "epoch": 3.84958071278826, "grad_norm": 0.44904813170433044, "learning_rate": 3.862287964770099e-05, "loss": 0.4318, "num_input_tokens_seen": 4797016, "step": 7345 }, { "epoch": 3.852201257861635, "grad_norm": 0.2744474411010742, "learning_rate": 3.86036988569995e-05, "loss": 0.4807, "num_input_tokens_seen": 4800216, "step": 7350 }, { "epoch": 3.8548218029350103, "grad_norm": 0.38718217611312866, "learning_rate": 3.8584506683344216e-05, "loss": 0.3915, "num_input_tokens_seen": 4803992, "step": 7355 }, { "epoch": 3.8574423480083855, "grad_norm": 0.6240994930267334, "learning_rate": 3.8565303142794234e-05, "loss": 0.4618, "num_input_tokens_seen": 4806872, "step": 7360 }, { "epoch": 3.860062893081761, "grad_norm": 0.4352092444896698, "learning_rate": 3.8546088251418224e-05, "loss": 0.2997, "num_input_tokens_seen": 4809912, "step": 7365 }, { "epoch": 3.8626834381551363, "grad_norm": 0.31542980670928955, "learning_rate": 3.8526862025294336e-05, "loss": 0.3371, "num_input_tokens_seen": 4816984, "step": 7370 }, { "epoch": 3.8653039832285114, "grad_norm": 0.3562620282173157, "learning_rate": 3.8507624480510186e-05, "loss": 0.5413, "num_input_tokens_seen": 4819608, "step": 7375 }, { "epoch": 3.867924528301887, "grad_norm": 0.19870442152023315, "learning_rate": 3.848837563316287e-05, "loss": 0.397, "num_input_tokens_seen": 4823448, "step": 7380 }, { "epoch": 3.870545073375262, "grad_norm": 0.5354375839233398, "learning_rate": 3.8469115499358945e-05, "loss": 0.5064, "num_input_tokens_seen": 4826328, "step": 7385 }, { "epoch": 3.8731656184486374, "grad_norm": 0.5445772409439087, "learning_rate": 3.844984409521442e-05, "loss": 0.4907, "num_input_tokens_seen": 4829048, "step": 7390 }, { "epoch": 3.8757861635220126, "grad_norm": 0.43396908044815063, "learning_rate": 3.843056143685472e-05, "loss": 0.4149, "num_input_tokens_seen": 4833048, "step": 7395 }, { "epoch": 3.8784067085953877, "grad_norm": 0.31063154339790344, "learning_rate": 3.841126754041468e-05, "loss": 0.4098, "num_input_tokens_seen": 4836184, "step": 7400 }, { "epoch": 3.881027253668763, "grad_norm": 0.8490452170372009, "learning_rate": 3.839196242203859e-05, "loss": 0.4899, "num_input_tokens_seen": 4839896, "step": 7405 }, { "epoch": 3.8836477987421385, "grad_norm": 0.9614840745925903, "learning_rate": 3.837264609788005e-05, "loss": 0.4339, "num_input_tokens_seen": 4842936, "step": 7410 }, { "epoch": 3.8862683438155137, "grad_norm": 0.5617616176605225, "learning_rate": 3.8353318584102096e-05, "loss": 0.6292, "num_input_tokens_seen": 4846200, "step": 7415 }, { "epoch": 3.888888888888889, "grad_norm": 0.4302453100681305, "learning_rate": 3.83339798968771e-05, "loss": 0.477, "num_input_tokens_seen": 4849336, "step": 7420 }, { "epoch": 3.891509433962264, "grad_norm": 0.5508154034614563, "learning_rate": 3.83146300523868e-05, "loss": 0.4885, "num_input_tokens_seen": 4852632, "step": 7425 }, { "epoch": 3.8941299790356396, "grad_norm": 0.5710510015487671, "learning_rate": 3.829526906682227e-05, "loss": 0.5094, "num_input_tokens_seen": 4855384, "step": 7430 }, { "epoch": 3.896750524109015, "grad_norm": 0.4648663401603699, "learning_rate": 3.827589695638388e-05, "loss": 0.5158, "num_input_tokens_seen": 4858936, "step": 7435 }, { "epoch": 3.89937106918239, "grad_norm": 0.5771051049232483, "learning_rate": 3.825651373728133e-05, "loss": 0.5049, "num_input_tokens_seen": 4862488, "step": 7440 }, { "epoch": 3.901991614255765, "grad_norm": 0.38692212104797363, "learning_rate": 3.8237119425733625e-05, "loss": 0.4308, "num_input_tokens_seen": 4865240, "step": 7445 }, { "epoch": 3.9046121593291403, "grad_norm": 0.3183213770389557, "learning_rate": 3.8217714037969035e-05, "loss": 0.4569, "num_input_tokens_seen": 4868152, "step": 7450 }, { "epoch": 3.9072327044025155, "grad_norm": 0.5224676728248596, "learning_rate": 3.8198297590225095e-05, "loss": 0.3648, "num_input_tokens_seen": 4870808, "step": 7455 }, { "epoch": 3.909853249475891, "grad_norm": 0.3078959286212921, "learning_rate": 3.817887009874861e-05, "loss": 0.4637, "num_input_tokens_seen": 4873944, "step": 7460 }, { "epoch": 3.9124737945492662, "grad_norm": 0.5442308187484741, "learning_rate": 3.815943157979561e-05, "loss": 0.547, "num_input_tokens_seen": 4877528, "step": 7465 }, { "epoch": 3.9150943396226414, "grad_norm": 0.4645673930644989, "learning_rate": 3.813998204963136e-05, "loss": 0.4319, "num_input_tokens_seen": 4880504, "step": 7470 }, { "epoch": 3.917714884696017, "grad_norm": 0.48310917615890503, "learning_rate": 3.812052152453035e-05, "loss": 0.4644, "num_input_tokens_seen": 4884312, "step": 7475 }, { "epoch": 3.920335429769392, "grad_norm": 0.5575697422027588, "learning_rate": 3.8101050020776244e-05, "loss": 0.3782, "num_input_tokens_seen": 4887032, "step": 7480 }, { "epoch": 3.9229559748427674, "grad_norm": 0.45512980222702026, "learning_rate": 3.808156755466191e-05, "loss": 0.5246, "num_input_tokens_seen": 4889976, "step": 7485 }, { "epoch": 3.9255765199161425, "grad_norm": 0.2825585603713989, "learning_rate": 3.806207414248939e-05, "loss": 0.5586, "num_input_tokens_seen": 4892600, "step": 7490 }, { "epoch": 3.9281970649895177, "grad_norm": 0.8479301333427429, "learning_rate": 3.804256980056988e-05, "loss": 0.4641, "num_input_tokens_seen": 4895640, "step": 7495 }, { "epoch": 3.930817610062893, "grad_norm": 0.40551793575286865, "learning_rate": 3.8023054545223723e-05, "loss": 0.3976, "num_input_tokens_seen": 4898200, "step": 7500 }, { "epoch": 3.9334381551362685, "grad_norm": 0.7706896066665649, "learning_rate": 3.8003528392780385e-05, "loss": 0.442, "num_input_tokens_seen": 4901080, "step": 7505 }, { "epoch": 3.9360587002096437, "grad_norm": 0.38673722743988037, "learning_rate": 3.798399135957847e-05, "loss": 0.5175, "num_input_tokens_seen": 4904280, "step": 7510 }, { "epoch": 3.938679245283019, "grad_norm": 0.5243788957595825, "learning_rate": 3.7964443461965674e-05, "loss": 0.702, "num_input_tokens_seen": 4907832, "step": 7515 }, { "epoch": 3.941299790356394, "grad_norm": 0.6818978190422058, "learning_rate": 3.794488471629878e-05, "loss": 0.6279, "num_input_tokens_seen": 4911320, "step": 7520 }, { "epoch": 3.9439203354297696, "grad_norm": 0.2604924738407135, "learning_rate": 3.7925315138943655e-05, "loss": 0.4626, "num_input_tokens_seen": 4914072, "step": 7525 }, { "epoch": 3.9465408805031448, "grad_norm": 0.5104311108589172, "learning_rate": 3.790573474627522e-05, "loss": 0.5097, "num_input_tokens_seen": 4918584, "step": 7530 }, { "epoch": 3.94916142557652, "grad_norm": 0.4706178605556488, "learning_rate": 3.7886143554677466e-05, "loss": 0.467, "num_input_tokens_seen": 4921496, "step": 7535 }, { "epoch": 3.951781970649895, "grad_norm": 0.5105645060539246, "learning_rate": 3.7866541580543405e-05, "loss": 0.5078, "num_input_tokens_seen": 4924600, "step": 7540 }, { "epoch": 3.9544025157232703, "grad_norm": 0.3029099106788635, "learning_rate": 3.7846928840275056e-05, "loss": 0.4628, "num_input_tokens_seen": 4927512, "step": 7545 }, { "epoch": 3.9570230607966455, "grad_norm": 0.49309465289115906, "learning_rate": 3.782730535028348e-05, "loss": 0.4499, "num_input_tokens_seen": 4930360, "step": 7550 }, { "epoch": 3.959643605870021, "grad_norm": 0.36434876918792725, "learning_rate": 3.780767112698872e-05, "loss": 0.3879, "num_input_tokens_seen": 4933496, "step": 7555 }, { "epoch": 3.9622641509433962, "grad_norm": 0.3754570484161377, "learning_rate": 3.77880261868198e-05, "loss": 0.3925, "num_input_tokens_seen": 4936152, "step": 7560 }, { "epoch": 3.9648846960167714, "grad_norm": 0.7413024306297302, "learning_rate": 3.7768370546214685e-05, "loss": 0.6869, "num_input_tokens_seen": 4939832, "step": 7565 }, { "epoch": 3.967505241090147, "grad_norm": 0.3900372385978699, "learning_rate": 3.774870422162034e-05, "loss": 0.518, "num_input_tokens_seen": 4943192, "step": 7570 }, { "epoch": 3.970125786163522, "grad_norm": 0.6484267711639404, "learning_rate": 3.7729027229492645e-05, "loss": 0.4528, "num_input_tokens_seen": 4946392, "step": 7575 }, { "epoch": 3.9727463312368974, "grad_norm": 0.32755202054977417, "learning_rate": 3.770933958629639e-05, "loss": 0.3815, "num_input_tokens_seen": 4949464, "step": 7580 }, { "epoch": 3.9753668763102725, "grad_norm": 0.43002545833587646, "learning_rate": 3.768964130850532e-05, "loss": 0.3934, "num_input_tokens_seen": 4955288, "step": 7585 }, { "epoch": 3.9779874213836477, "grad_norm": 0.4591645300388336, "learning_rate": 3.766993241260204e-05, "loss": 0.392, "num_input_tokens_seen": 4958008, "step": 7590 }, { "epoch": 3.980607966457023, "grad_norm": 0.3009031414985657, "learning_rate": 3.765021291507805e-05, "loss": 0.3618, "num_input_tokens_seen": 4961400, "step": 7595 }, { "epoch": 3.9832285115303985, "grad_norm": 0.629816472530365, "learning_rate": 3.763048283243374e-05, "loss": 0.5805, "num_input_tokens_seen": 4964536, "step": 7600 }, { "epoch": 3.9858490566037736, "grad_norm": 0.4483979642391205, "learning_rate": 3.7610742181178325e-05, "loss": 0.595, "num_input_tokens_seen": 4968280, "step": 7605 }, { "epoch": 3.988469601677149, "grad_norm": 0.3279598653316498, "learning_rate": 3.75909909778299e-05, "loss": 0.5709, "num_input_tokens_seen": 4971640, "step": 7610 }, { "epoch": 3.991090146750524, "grad_norm": 0.3516525626182556, "learning_rate": 3.757122923891534e-05, "loss": 0.4998, "num_input_tokens_seen": 4975512, "step": 7615 }, { "epoch": 3.9937106918238996, "grad_norm": 0.44046440720558167, "learning_rate": 3.75514569809704e-05, "loss": 0.5744, "num_input_tokens_seen": 4978520, "step": 7620 }, { "epoch": 3.9963312368972748, "grad_norm": 0.47485780715942383, "learning_rate": 3.7531674220539584e-05, "loss": 0.4046, "num_input_tokens_seen": 4981944, "step": 7625 }, { "epoch": 3.99895178197065, "grad_norm": 0.6508476734161377, "learning_rate": 3.751188097417619e-05, "loss": 0.4298, "num_input_tokens_seen": 4984632, "step": 7630 }, { "epoch": 4.0, "eval_loss": 0.49160492420196533, "eval_runtime": 14.501, "eval_samples_per_second": 58.479, "eval_steps_per_second": 14.62, "num_input_tokens_seen": 4985200, "step": 7632 }, { "epoch": 4.001572327044025, "grad_norm": 0.3895523250102997, "learning_rate": 3.749207725844234e-05, "loss": 0.4479, "num_input_tokens_seen": 4987024, "step": 7635 }, { "epoch": 4.0041928721174, "grad_norm": 0.7016222476959229, "learning_rate": 3.747226308990884e-05, "loss": 0.4237, "num_input_tokens_seen": 4989552, "step": 7640 }, { "epoch": 4.006813417190775, "grad_norm": 0.4818178117275238, "learning_rate": 3.74524384851553e-05, "loss": 0.6822, "num_input_tokens_seen": 4992080, "step": 7645 }, { "epoch": 4.009433962264151, "grad_norm": 0.8810732364654541, "learning_rate": 3.743260346077004e-05, "loss": 0.6768, "num_input_tokens_seen": 4994512, "step": 7650 }, { "epoch": 4.012054507337526, "grad_norm": 0.3430149555206299, "learning_rate": 3.741275803335011e-05, "loss": 0.3079, "num_input_tokens_seen": 4998000, "step": 7655 }, { "epoch": 4.014675052410902, "grad_norm": 0.6619883179664612, "learning_rate": 3.7392902219501234e-05, "loss": 0.4581, "num_input_tokens_seen": 5002000, "step": 7660 }, { "epoch": 4.017295597484277, "grad_norm": 0.4003136157989502, "learning_rate": 3.737303603583788e-05, "loss": 0.4579, "num_input_tokens_seen": 5005648, "step": 7665 }, { "epoch": 4.019916142557652, "grad_norm": 0.5841847062110901, "learning_rate": 3.735315949898314e-05, "loss": 0.6141, "num_input_tokens_seen": 5009104, "step": 7670 }, { "epoch": 4.022536687631027, "grad_norm": 0.32347211241722107, "learning_rate": 3.7333272625568804e-05, "loss": 0.3854, "num_input_tokens_seen": 5012304, "step": 7675 }, { "epoch": 4.0251572327044025, "grad_norm": 1.5209163427352905, "learning_rate": 3.7313375432235295e-05, "loss": 0.5249, "num_input_tokens_seen": 5014608, "step": 7680 }, { "epoch": 4.027777777777778, "grad_norm": 0.8862865567207336, "learning_rate": 3.729346793563167e-05, "loss": 0.5078, "num_input_tokens_seen": 5017776, "step": 7685 }, { "epoch": 4.030398322851153, "grad_norm": 0.4765058159828186, "learning_rate": 3.7273550152415635e-05, "loss": 0.5941, "num_input_tokens_seen": 5020432, "step": 7690 }, { "epoch": 4.033018867924528, "grad_norm": 0.9565091133117676, "learning_rate": 3.725362209925346e-05, "loss": 0.3393, "num_input_tokens_seen": 5023824, "step": 7695 }, { "epoch": 4.035639412997903, "grad_norm": 0.3301360607147217, "learning_rate": 3.7233683792820036e-05, "loss": 0.5139, "num_input_tokens_seen": 5027344, "step": 7700 }, { "epoch": 4.038259958071279, "grad_norm": 0.7697629928588867, "learning_rate": 3.721373524979883e-05, "loss": 0.5054, "num_input_tokens_seen": 5030928, "step": 7705 }, { "epoch": 4.040880503144654, "grad_norm": 0.30931907892227173, "learning_rate": 3.7193776486881854e-05, "loss": 0.3868, "num_input_tokens_seen": 5034320, "step": 7710 }, { "epoch": 4.04350104821803, "grad_norm": 0.253598690032959, "learning_rate": 3.717380752076971e-05, "loss": 0.4428, "num_input_tokens_seen": 5037456, "step": 7715 }, { "epoch": 4.046121593291405, "grad_norm": 0.3195744752883911, "learning_rate": 3.715382836817152e-05, "loss": 0.4196, "num_input_tokens_seen": 5040784, "step": 7720 }, { "epoch": 4.04874213836478, "grad_norm": 0.6125550270080566, "learning_rate": 3.7133839045804906e-05, "loss": 0.5845, "num_input_tokens_seen": 5044176, "step": 7725 }, { "epoch": 4.051362683438155, "grad_norm": 0.4001171290874481, "learning_rate": 3.711383957039602e-05, "loss": 0.5728, "num_input_tokens_seen": 5047376, "step": 7730 }, { "epoch": 4.05398322851153, "grad_norm": 0.3652064800262451, "learning_rate": 3.709382995867954e-05, "loss": 0.4754, "num_input_tokens_seen": 5053712, "step": 7735 }, { "epoch": 4.056603773584905, "grad_norm": 0.6472243666648865, "learning_rate": 3.707381022739856e-05, "loss": 0.4794, "num_input_tokens_seen": 5057968, "step": 7740 }, { "epoch": 4.059224318658281, "grad_norm": 1.0311546325683594, "learning_rate": 3.7053780393304705e-05, "loss": 0.6601, "num_input_tokens_seen": 5061328, "step": 7745 }, { "epoch": 4.061844863731656, "grad_norm": 0.6050152778625488, "learning_rate": 3.7033740473158e-05, "loss": 0.5106, "num_input_tokens_seen": 5064464, "step": 7750 }, { "epoch": 4.064465408805032, "grad_norm": 0.42401647567749023, "learning_rate": 3.701369048372695e-05, "loss": 0.4156, "num_input_tokens_seen": 5067344, "step": 7755 }, { "epoch": 4.067085953878407, "grad_norm": 0.37443405389785767, "learning_rate": 3.699363044178847e-05, "loss": 0.4577, "num_input_tokens_seen": 5070512, "step": 7760 }, { "epoch": 4.069706498951782, "grad_norm": 0.8242135643959045, "learning_rate": 3.697356036412788e-05, "loss": 0.3974, "num_input_tokens_seen": 5073488, "step": 7765 }, { "epoch": 4.072327044025157, "grad_norm": 0.5334346294403076, "learning_rate": 3.695348026753891e-05, "loss": 0.4584, "num_input_tokens_seen": 5079984, "step": 7770 }, { "epoch": 4.0749475890985325, "grad_norm": 0.2943386435508728, "learning_rate": 3.6933390168823655e-05, "loss": 0.482, "num_input_tokens_seen": 5083056, "step": 7775 }, { "epoch": 4.077568134171908, "grad_norm": 0.6489611864089966, "learning_rate": 3.6913290084792616e-05, "loss": 0.529, "num_input_tokens_seen": 5085904, "step": 7780 }, { "epoch": 4.080188679245283, "grad_norm": 0.49931496381759644, "learning_rate": 3.689318003226461e-05, "loss": 0.4599, "num_input_tokens_seen": 5088368, "step": 7785 }, { "epoch": 4.082809224318658, "grad_norm": 0.3635769486427307, "learning_rate": 3.687306002806681e-05, "loss": 0.4297, "num_input_tokens_seen": 5091088, "step": 7790 }, { "epoch": 4.085429769392033, "grad_norm": 0.30780500173568726, "learning_rate": 3.685293008903471e-05, "loss": 0.4969, "num_input_tokens_seen": 5094960, "step": 7795 }, { "epoch": 4.088050314465409, "grad_norm": 0.4168412685394287, "learning_rate": 3.683279023201213e-05, "loss": 0.4874, "num_input_tokens_seen": 5098352, "step": 7800 }, { "epoch": 4.090670859538784, "grad_norm": 1.0829331874847412, "learning_rate": 3.681264047385119e-05, "loss": 0.4932, "num_input_tokens_seen": 5102416, "step": 7805 }, { "epoch": 4.09329140461216, "grad_norm": 0.3450270891189575, "learning_rate": 3.6792480831412293e-05, "loss": 0.6015, "num_input_tokens_seen": 5105072, "step": 7810 }, { "epoch": 4.095911949685535, "grad_norm": 0.8285292387008667, "learning_rate": 3.677231132156408e-05, "loss": 0.5187, "num_input_tokens_seen": 5108528, "step": 7815 }, { "epoch": 4.09853249475891, "grad_norm": 0.4095541536808014, "learning_rate": 3.675213196118349e-05, "loss": 0.5971, "num_input_tokens_seen": 5112176, "step": 7820 }, { "epoch": 4.101153039832285, "grad_norm": 0.25120991468429565, "learning_rate": 3.67319427671557e-05, "loss": 0.4723, "num_input_tokens_seen": 5116848, "step": 7825 }, { "epoch": 4.10377358490566, "grad_norm": 0.3178897500038147, "learning_rate": 3.6711743756374103e-05, "loss": 0.4663, "num_input_tokens_seen": 5120464, "step": 7830 }, { "epoch": 4.106394129979035, "grad_norm": 0.3059529662132263, "learning_rate": 3.6691534945740284e-05, "loss": 0.4118, "num_input_tokens_seen": 5123312, "step": 7835 }, { "epoch": 4.109014675052411, "grad_norm": 0.3574826121330261, "learning_rate": 3.667131635216408e-05, "loss": 0.4646, "num_input_tokens_seen": 5127088, "step": 7840 }, { "epoch": 4.111635220125786, "grad_norm": 0.300574392080307, "learning_rate": 3.665108799256348e-05, "loss": 0.4631, "num_input_tokens_seen": 5130384, "step": 7845 }, { "epoch": 4.114255765199162, "grad_norm": 0.34813639521598816, "learning_rate": 3.663084988386464e-05, "loss": 0.4439, "num_input_tokens_seen": 5134288, "step": 7850 }, { "epoch": 4.116876310272537, "grad_norm": 1.6453256607055664, "learning_rate": 3.6610602043001894e-05, "loss": 0.6052, "num_input_tokens_seen": 5137392, "step": 7855 }, { "epoch": 4.119496855345912, "grad_norm": 0.6259981989860535, "learning_rate": 3.659034448691771e-05, "loss": 0.4686, "num_input_tokens_seen": 5139824, "step": 7860 }, { "epoch": 4.122117400419287, "grad_norm": 0.3082396388053894, "learning_rate": 3.657007723256268e-05, "loss": 0.5563, "num_input_tokens_seen": 5143280, "step": 7865 }, { "epoch": 4.1247379454926625, "grad_norm": 0.5175389051437378, "learning_rate": 3.654980029689553e-05, "loss": 0.5505, "num_input_tokens_seen": 5145584, "step": 7870 }, { "epoch": 4.127358490566038, "grad_norm": 0.42719244956970215, "learning_rate": 3.6529513696883075e-05, "loss": 0.4364, "num_input_tokens_seen": 5148240, "step": 7875 }, { "epoch": 4.129979035639413, "grad_norm": 0.6522115468978882, "learning_rate": 3.650921744950019e-05, "loss": 0.5725, "num_input_tokens_seen": 5151632, "step": 7880 }, { "epoch": 4.132599580712788, "grad_norm": 1.2614928483963013, "learning_rate": 3.6488911571729864e-05, "loss": 0.5624, "num_input_tokens_seen": 5154576, "step": 7885 }, { "epoch": 4.135220125786163, "grad_norm": 0.44612786173820496, "learning_rate": 3.6468596080563134e-05, "loss": 0.4327, "num_input_tokens_seen": 5158224, "step": 7890 }, { "epoch": 4.137840670859539, "grad_norm": 0.3917388617992401, "learning_rate": 3.6448270992999065e-05, "loss": 0.3774, "num_input_tokens_seen": 5161360, "step": 7895 }, { "epoch": 4.140461215932914, "grad_norm": 0.5855047106742859, "learning_rate": 3.6427936326044756e-05, "loss": 0.3918, "num_input_tokens_seen": 5164656, "step": 7900 }, { "epoch": 4.1430817610062896, "grad_norm": 0.415431410074234, "learning_rate": 3.6407592096715345e-05, "loss": 0.4619, "num_input_tokens_seen": 5168336, "step": 7905 }, { "epoch": 4.145702306079665, "grad_norm": 0.46863946318626404, "learning_rate": 3.638723832203396e-05, "loss": 0.4923, "num_input_tokens_seen": 5170736, "step": 7910 }, { "epoch": 4.14832285115304, "grad_norm": 0.41325268149375916, "learning_rate": 3.6366875019031676e-05, "loss": 0.2896, "num_input_tokens_seen": 5173424, "step": 7915 }, { "epoch": 4.150943396226415, "grad_norm": 0.4661963880062103, "learning_rate": 3.6346502204747596e-05, "loss": 0.3636, "num_input_tokens_seen": 5175856, "step": 7920 }, { "epoch": 4.15356394129979, "grad_norm": 0.5368335843086243, "learning_rate": 3.6326119896228766e-05, "loss": 0.4245, "num_input_tokens_seen": 5178672, "step": 7925 }, { "epoch": 4.156184486373165, "grad_norm": 0.5006029009819031, "learning_rate": 3.630572811053016e-05, "loss": 0.4487, "num_input_tokens_seen": 5184496, "step": 7930 }, { "epoch": 4.158805031446541, "grad_norm": 0.23208372294902802, "learning_rate": 3.62853268647147e-05, "loss": 0.5209, "num_input_tokens_seen": 5188144, "step": 7935 }, { "epoch": 4.161425576519916, "grad_norm": 0.7527299523353577, "learning_rate": 3.6264916175853204e-05, "loss": 0.4621, "num_input_tokens_seen": 5192112, "step": 7940 }, { "epoch": 4.164046121593292, "grad_norm": 0.6306975483894348, "learning_rate": 3.624449606102441e-05, "loss": 0.5121, "num_input_tokens_seen": 5194320, "step": 7945 }, { "epoch": 4.166666666666667, "grad_norm": 0.9352222681045532, "learning_rate": 3.622406653731495e-05, "loss": 0.4978, "num_input_tokens_seen": 5197456, "step": 7950 }, { "epoch": 4.169287211740042, "grad_norm": 0.3317630887031555, "learning_rate": 3.620362762181931e-05, "loss": 0.5056, "num_input_tokens_seen": 5199792, "step": 7955 }, { "epoch": 4.171907756813417, "grad_norm": 0.5451275110244751, "learning_rate": 3.6183179331639825e-05, "loss": 0.6072, "num_input_tokens_seen": 5202672, "step": 7960 }, { "epoch": 4.1745283018867925, "grad_norm": 0.7302827835083008, "learning_rate": 3.616272168388671e-05, "loss": 0.581, "num_input_tokens_seen": 5205808, "step": 7965 }, { "epoch": 4.177148846960168, "grad_norm": 0.626469612121582, "learning_rate": 3.614225469567798e-05, "loss": 0.3427, "num_input_tokens_seen": 5208176, "step": 7970 }, { "epoch": 4.179769392033543, "grad_norm": 0.33470991253852844, "learning_rate": 3.612177838413948e-05, "loss": 0.4756, "num_input_tokens_seen": 5215376, "step": 7975 }, { "epoch": 4.182389937106918, "grad_norm": 0.3917901813983917, "learning_rate": 3.6101292766404854e-05, "loss": 0.5089, "num_input_tokens_seen": 5218064, "step": 7980 }, { "epoch": 4.185010482180293, "grad_norm": 0.8295184373855591, "learning_rate": 3.608079785961552e-05, "loss": 0.5086, "num_input_tokens_seen": 5221328, "step": 7985 }, { "epoch": 4.187631027253669, "grad_norm": 0.4048633277416229, "learning_rate": 3.60602936809207e-05, "loss": 0.4814, "num_input_tokens_seen": 5224432, "step": 7990 }, { "epoch": 4.190251572327044, "grad_norm": 0.5442971587181091, "learning_rate": 3.603978024747733e-05, "loss": 0.4855, "num_input_tokens_seen": 5226896, "step": 7995 }, { "epoch": 4.1928721174004195, "grad_norm": 0.4314836859703064, "learning_rate": 3.601925757645013e-05, "loss": 0.4565, "num_input_tokens_seen": 5231056, "step": 8000 }, { "epoch": 4.195492662473795, "grad_norm": 0.5626641511917114, "learning_rate": 3.599872568501152e-05, "loss": 0.3985, "num_input_tokens_seen": 5234480, "step": 8005 }, { "epoch": 4.19811320754717, "grad_norm": 1.9997247457504272, "learning_rate": 3.5978184590341676e-05, "loss": 0.643, "num_input_tokens_seen": 5236816, "step": 8010 }, { "epoch": 4.200733752620545, "grad_norm": 1.4977012872695923, "learning_rate": 3.5957634309628424e-05, "loss": 0.4708, "num_input_tokens_seen": 5239600, "step": 8015 }, { "epoch": 4.20335429769392, "grad_norm": 0.48774591088294983, "learning_rate": 3.59370748600673e-05, "loss": 0.5067, "num_input_tokens_seen": 5242896, "step": 8020 }, { "epoch": 4.205974842767295, "grad_norm": 0.5390903353691101, "learning_rate": 3.591650625886152e-05, "loss": 0.5528, "num_input_tokens_seen": 5245904, "step": 8025 }, { "epoch": 4.2085953878406706, "grad_norm": 0.56259685754776, "learning_rate": 3.5895928523221955e-05, "loss": 0.5694, "num_input_tokens_seen": 5249968, "step": 8030 }, { "epoch": 4.211215932914046, "grad_norm": 0.4599670171737671, "learning_rate": 3.58753416703671e-05, "loss": 0.4373, "num_input_tokens_seen": 5253680, "step": 8035 }, { "epoch": 4.213836477987422, "grad_norm": 0.5799304246902466, "learning_rate": 3.585474571752311e-05, "loss": 0.6473, "num_input_tokens_seen": 5256144, "step": 8040 }, { "epoch": 4.216457023060797, "grad_norm": 0.3659586012363434, "learning_rate": 3.583414068192372e-05, "loss": 0.3735, "num_input_tokens_seen": 5261456, "step": 8045 }, { "epoch": 4.219077568134172, "grad_norm": 0.4840075969696045, "learning_rate": 3.58135265808103e-05, "loss": 0.5567, "num_input_tokens_seen": 5264688, "step": 8050 }, { "epoch": 4.221698113207547, "grad_norm": 0.3778652846813202, "learning_rate": 3.5792903431431775e-05, "loss": 0.6371, "num_input_tokens_seen": 5267184, "step": 8055 }, { "epoch": 4.2243186582809225, "grad_norm": 0.9104519486427307, "learning_rate": 3.577227125104466e-05, "loss": 0.5229, "num_input_tokens_seen": 5269776, "step": 8060 }, { "epoch": 4.226939203354298, "grad_norm": 0.3045504689216614, "learning_rate": 3.575163005691302e-05, "loss": 0.4668, "num_input_tokens_seen": 5273296, "step": 8065 }, { "epoch": 4.229559748427673, "grad_norm": 0.5170242786407471, "learning_rate": 3.573097986630845e-05, "loss": 0.4678, "num_input_tokens_seen": 5276432, "step": 8070 }, { "epoch": 4.232180293501048, "grad_norm": 0.44842296838760376, "learning_rate": 3.5710320696510114e-05, "loss": 0.4014, "num_input_tokens_seen": 5279696, "step": 8075 }, { "epoch": 4.234800838574423, "grad_norm": 0.43693360686302185, "learning_rate": 3.5689652564804646e-05, "loss": 0.4326, "num_input_tokens_seen": 5283312, "step": 8080 }, { "epoch": 4.237421383647799, "grad_norm": 0.5526670813560486, "learning_rate": 3.566897548848619e-05, "loss": 0.3809, "num_input_tokens_seen": 5285712, "step": 8085 }, { "epoch": 4.240041928721174, "grad_norm": 0.33597052097320557, "learning_rate": 3.564828948485639e-05, "loss": 0.5131, "num_input_tokens_seen": 5289136, "step": 8090 }, { "epoch": 4.2426624737945495, "grad_norm": 0.41417357325553894, "learning_rate": 3.562759457122434e-05, "loss": 0.311, "num_input_tokens_seen": 5292528, "step": 8095 }, { "epoch": 4.245283018867925, "grad_norm": 0.34383514523506165, "learning_rate": 3.5606890764906603e-05, "loss": 0.5254, "num_input_tokens_seen": 5295984, "step": 8100 }, { "epoch": 4.2479035639413, "grad_norm": 0.4796389937400818, "learning_rate": 3.5586178083227175e-05, "loss": 0.4389, "num_input_tokens_seen": 5299120, "step": 8105 }, { "epoch": 4.250524109014675, "grad_norm": 0.46515315771102905, "learning_rate": 3.556545654351749e-05, "loss": 0.4204, "num_input_tokens_seen": 5302192, "step": 8110 }, { "epoch": 4.25314465408805, "grad_norm": 0.8523973822593689, "learning_rate": 3.554472616311638e-05, "loss": 0.8111, "num_input_tokens_seen": 5304880, "step": 8115 }, { "epoch": 4.255765199161425, "grad_norm": 0.4204578697681427, "learning_rate": 3.552398695937007e-05, "loss": 0.5481, "num_input_tokens_seen": 5308304, "step": 8120 }, { "epoch": 4.2583857442348005, "grad_norm": 0.5454210042953491, "learning_rate": 3.55032389496322e-05, "loss": 0.5414, "num_input_tokens_seen": 5310960, "step": 8125 }, { "epoch": 4.261006289308176, "grad_norm": 0.4074622094631195, "learning_rate": 3.548248215126374e-05, "loss": 0.4422, "num_input_tokens_seen": 5313264, "step": 8130 }, { "epoch": 4.263626834381552, "grad_norm": 0.4408065974712372, "learning_rate": 3.546171658163304e-05, "loss": 0.5562, "num_input_tokens_seen": 5315600, "step": 8135 }, { "epoch": 4.266247379454927, "grad_norm": 0.372793048620224, "learning_rate": 3.544094225811577e-05, "loss": 0.3743, "num_input_tokens_seen": 5319024, "step": 8140 }, { "epoch": 4.268867924528302, "grad_norm": 0.552099883556366, "learning_rate": 3.542015919809495e-05, "loss": 0.469, "num_input_tokens_seen": 5322160, "step": 8145 }, { "epoch": 4.271488469601677, "grad_norm": 0.3801279664039612, "learning_rate": 3.539936741896088e-05, "loss": 0.4208, "num_input_tokens_seen": 5327152, "step": 8150 }, { "epoch": 4.274109014675052, "grad_norm": 0.4656292796134949, "learning_rate": 3.537856693811118e-05, "loss": 0.3661, "num_input_tokens_seen": 5330256, "step": 8155 }, { "epoch": 4.276729559748428, "grad_norm": 0.7069362998008728, "learning_rate": 3.5357757772950746e-05, "loss": 0.5223, "num_input_tokens_seen": 5333392, "step": 8160 }, { "epoch": 4.279350104821803, "grad_norm": 0.45000988245010376, "learning_rate": 3.533693994089173e-05, "loss": 0.4574, "num_input_tokens_seen": 5336720, "step": 8165 }, { "epoch": 4.281970649895178, "grad_norm": 0.3250468373298645, "learning_rate": 3.531611345935353e-05, "loss": 0.3067, "num_input_tokens_seen": 5339696, "step": 8170 }, { "epoch": 4.284591194968553, "grad_norm": 0.529296875, "learning_rate": 3.529527834576282e-05, "loss": 0.56, "num_input_tokens_seen": 5342896, "step": 8175 }, { "epoch": 4.287211740041929, "grad_norm": 0.7618327736854553, "learning_rate": 3.527443461755346e-05, "loss": 0.5069, "num_input_tokens_seen": 5345392, "step": 8180 }, { "epoch": 4.289832285115304, "grad_norm": 0.3717785179615021, "learning_rate": 3.525358229216653e-05, "loss": 0.4397, "num_input_tokens_seen": 5349392, "step": 8185 }, { "epoch": 4.2924528301886795, "grad_norm": 0.4963497817516327, "learning_rate": 3.52327213870503e-05, "loss": 0.4576, "num_input_tokens_seen": 5353008, "step": 8190 }, { "epoch": 4.295073375262055, "grad_norm": 0.7221416234970093, "learning_rate": 3.521185191966022e-05, "loss": 0.7309, "num_input_tokens_seen": 5356464, "step": 8195 }, { "epoch": 4.29769392033543, "grad_norm": 0.3741227388381958, "learning_rate": 3.5190973907458924e-05, "loss": 0.407, "num_input_tokens_seen": 5359440, "step": 8200 }, { "epoch": 4.300314465408805, "grad_norm": 0.6113585233688354, "learning_rate": 3.517008736791616e-05, "loss": 0.6116, "num_input_tokens_seen": 5361968, "step": 8205 }, { "epoch": 4.30293501048218, "grad_norm": 0.35088708996772766, "learning_rate": 3.514919231850885e-05, "loss": 0.65, "num_input_tokens_seen": 5365744, "step": 8210 }, { "epoch": 4.305555555555555, "grad_norm": 0.528949499130249, "learning_rate": 3.512828877672099e-05, "loss": 0.3808, "num_input_tokens_seen": 5368720, "step": 8215 }, { "epoch": 4.3081761006289305, "grad_norm": 1.2173380851745605, "learning_rate": 3.510737676004372e-05, "loss": 0.5166, "num_input_tokens_seen": 5371408, "step": 8220 }, { "epoch": 4.310796645702306, "grad_norm": 0.899817943572998, "learning_rate": 3.5086456285975274e-05, "loss": 0.562, "num_input_tokens_seen": 5374256, "step": 8225 }, { "epoch": 4.313417190775682, "grad_norm": 0.9055739641189575, "learning_rate": 3.5065527372020935e-05, "loss": 0.5556, "num_input_tokens_seen": 5376624, "step": 8230 }, { "epoch": 4.316037735849057, "grad_norm": 0.6112168431282043, "learning_rate": 3.504459003569306e-05, "loss": 0.4797, "num_input_tokens_seen": 5380048, "step": 8235 }, { "epoch": 4.318658280922432, "grad_norm": 0.4543270468711853, "learning_rate": 3.5023644294511074e-05, "loss": 0.3945, "num_input_tokens_seen": 5382416, "step": 8240 }, { "epoch": 4.321278825995807, "grad_norm": 0.3822976350784302, "learning_rate": 3.50026901660014e-05, "loss": 0.6272, "num_input_tokens_seen": 5385520, "step": 8245 }, { "epoch": 4.323899371069182, "grad_norm": 0.3753868639469147, "learning_rate": 3.4981727667697497e-05, "loss": 0.6847, "num_input_tokens_seen": 5388368, "step": 8250 }, { "epoch": 4.326519916142558, "grad_norm": 0.41257184743881226, "learning_rate": 3.4960756817139825e-05, "loss": 0.4206, "num_input_tokens_seen": 5391920, "step": 8255 }, { "epoch": 4.329140461215933, "grad_norm": 0.42137640714645386, "learning_rate": 3.493977763187584e-05, "loss": 0.3716, "num_input_tokens_seen": 5394864, "step": 8260 }, { "epoch": 4.331761006289308, "grad_norm": 0.4268534779548645, "learning_rate": 3.4918790129459975e-05, "loss": 0.4506, "num_input_tokens_seen": 5398448, "step": 8265 }, { "epoch": 4.334381551362683, "grad_norm": 0.4848392605781555, "learning_rate": 3.4897794327453586e-05, "loss": 0.4262, "num_input_tokens_seen": 5401904, "step": 8270 }, { "epoch": 4.337002096436059, "grad_norm": 0.48878711462020874, "learning_rate": 3.487679024342502e-05, "loss": 0.3953, "num_input_tokens_seen": 5404432, "step": 8275 }, { "epoch": 4.339622641509434, "grad_norm": 0.666090190410614, "learning_rate": 3.4855777894949536e-05, "loss": 0.4855, "num_input_tokens_seen": 5407376, "step": 8280 }, { "epoch": 4.3422431865828095, "grad_norm": 1.0034018754959106, "learning_rate": 3.4834757299609306e-05, "loss": 0.506, "num_input_tokens_seen": 5410768, "step": 8285 }, { "epoch": 4.344863731656185, "grad_norm": 0.3163137435913086, "learning_rate": 3.48137284749934e-05, "loss": 0.5233, "num_input_tokens_seen": 5414256, "step": 8290 }, { "epoch": 4.34748427672956, "grad_norm": 0.534454345703125, "learning_rate": 3.479269143869777e-05, "loss": 0.4119, "num_input_tokens_seen": 5417296, "step": 8295 }, { "epoch": 4.350104821802935, "grad_norm": 0.6646528840065002, "learning_rate": 3.477164620832527e-05, "loss": 0.4801, "num_input_tokens_seen": 5420560, "step": 8300 }, { "epoch": 4.35272536687631, "grad_norm": 0.6436362862586975, "learning_rate": 3.4750592801485564e-05, "loss": 0.4425, "num_input_tokens_seen": 5424496, "step": 8305 }, { "epoch": 4.355345911949685, "grad_norm": 0.3160201907157898, "learning_rate": 3.47295312357952e-05, "loss": 0.4754, "num_input_tokens_seen": 5428944, "step": 8310 }, { "epoch": 4.3579664570230605, "grad_norm": 0.29446372389793396, "learning_rate": 3.4708461528877514e-05, "loss": 0.5296, "num_input_tokens_seen": 5432624, "step": 8315 }, { "epoch": 4.360587002096436, "grad_norm": 0.726830780506134, "learning_rate": 3.468738369836269e-05, "loss": 0.4914, "num_input_tokens_seen": 5435152, "step": 8320 }, { "epoch": 4.363207547169811, "grad_norm": 0.6214956641197205, "learning_rate": 3.466629776188769e-05, "loss": 0.5088, "num_input_tokens_seen": 5438640, "step": 8325 }, { "epoch": 4.365828092243187, "grad_norm": 0.46073582768440247, "learning_rate": 3.464520373709627e-05, "loss": 0.438, "num_input_tokens_seen": 5441264, "step": 8330 }, { "epoch": 4.368448637316562, "grad_norm": 0.30064594745635986, "learning_rate": 3.462410164163893e-05, "loss": 0.4329, "num_input_tokens_seen": 5444752, "step": 8335 }, { "epoch": 4.371069182389937, "grad_norm": 0.6336135864257812, "learning_rate": 3.460299149317294e-05, "loss": 0.4874, "num_input_tokens_seen": 5447632, "step": 8340 }, { "epoch": 4.373689727463312, "grad_norm": 0.4679481089115143, "learning_rate": 3.4581873309362326e-05, "loss": 0.5997, "num_input_tokens_seen": 5451344, "step": 8345 }, { "epoch": 4.376310272536688, "grad_norm": 0.4843898117542267, "learning_rate": 3.456074710787781e-05, "loss": 0.5632, "num_input_tokens_seen": 5454736, "step": 8350 }, { "epoch": 4.378930817610063, "grad_norm": 0.30562058091163635, "learning_rate": 3.453961290639683e-05, "loss": 0.5124, "num_input_tokens_seen": 5458096, "step": 8355 }, { "epoch": 4.381551362683438, "grad_norm": 0.44692376255989075, "learning_rate": 3.451847072260351e-05, "loss": 0.4789, "num_input_tokens_seen": 5460560, "step": 8360 }, { "epoch": 4.384171907756813, "grad_norm": 0.5554906129837036, "learning_rate": 3.4497320574188694e-05, "loss": 0.4553, "num_input_tokens_seen": 5463792, "step": 8365 }, { "epoch": 4.386792452830189, "grad_norm": 0.3902348279953003, "learning_rate": 3.447616247884983e-05, "loss": 0.448, "num_input_tokens_seen": 5467216, "step": 8370 }, { "epoch": 4.389412997903564, "grad_norm": 0.25181132555007935, "learning_rate": 3.445499645429107e-05, "loss": 0.4486, "num_input_tokens_seen": 5470320, "step": 8375 }, { "epoch": 4.3920335429769395, "grad_norm": 0.39213329553604126, "learning_rate": 3.443382251822315e-05, "loss": 0.4222, "num_input_tokens_seen": 5473232, "step": 8380 }, { "epoch": 4.394654088050315, "grad_norm": 0.6026153564453125, "learning_rate": 3.4412640688363475e-05, "loss": 0.6075, "num_input_tokens_seen": 5476432, "step": 8385 }, { "epoch": 4.39727463312369, "grad_norm": 0.7788292765617371, "learning_rate": 3.439145098243601e-05, "loss": 0.5636, "num_input_tokens_seen": 5478928, "step": 8390 }, { "epoch": 4.399895178197065, "grad_norm": 0.3564252257347107, "learning_rate": 3.437025341817137e-05, "loss": 0.3769, "num_input_tokens_seen": 5481776, "step": 8395 }, { "epoch": 4.40251572327044, "grad_norm": 0.31572476029396057, "learning_rate": 3.434904801330667e-05, "loss": 0.4705, "num_input_tokens_seen": 5485008, "step": 8400 }, { "epoch": 4.405136268343815, "grad_norm": 0.2666553556919098, "learning_rate": 3.432783478558564e-05, "loss": 0.4894, "num_input_tokens_seen": 5492496, "step": 8405 }, { "epoch": 4.4077568134171905, "grad_norm": 0.5464043617248535, "learning_rate": 3.430661375275854e-05, "loss": 0.4665, "num_input_tokens_seen": 5495664, "step": 8410 }, { "epoch": 4.410377358490566, "grad_norm": 0.5698633193969727, "learning_rate": 3.4285384932582175e-05, "loss": 0.4402, "num_input_tokens_seen": 5499792, "step": 8415 }, { "epoch": 4.412997903563941, "grad_norm": 0.32377755641937256, "learning_rate": 3.426414834281982e-05, "loss": 0.4299, "num_input_tokens_seen": 5502736, "step": 8420 }, { "epoch": 4.415618448637317, "grad_norm": 0.5233461856842041, "learning_rate": 3.424290400124131e-05, "loss": 0.4344, "num_input_tokens_seen": 5505328, "step": 8425 }, { "epoch": 4.418238993710692, "grad_norm": 0.6041557192802429, "learning_rate": 3.422165192562293e-05, "loss": 0.5054, "num_input_tokens_seen": 5507952, "step": 8430 }, { "epoch": 4.420859538784067, "grad_norm": 1.1574034690856934, "learning_rate": 3.420039213374745e-05, "loss": 0.3925, "num_input_tokens_seen": 5510544, "step": 8435 }, { "epoch": 4.423480083857442, "grad_norm": 0.485673725605011, "learning_rate": 3.4179124643404084e-05, "loss": 0.5514, "num_input_tokens_seen": 5512976, "step": 8440 }, { "epoch": 4.426100628930818, "grad_norm": 0.3466070890426636, "learning_rate": 3.41578494723885e-05, "loss": 0.5393, "num_input_tokens_seen": 5516496, "step": 8445 }, { "epoch": 4.428721174004193, "grad_norm": 0.4382999539375305, "learning_rate": 3.4136566638502795e-05, "loss": 0.3646, "num_input_tokens_seen": 5520720, "step": 8450 }, { "epoch": 4.431341719077568, "grad_norm": 0.26501840353012085, "learning_rate": 3.4115276159555464e-05, "loss": 0.4703, "num_input_tokens_seen": 5524336, "step": 8455 }, { "epoch": 4.433962264150943, "grad_norm": 0.229367196559906, "learning_rate": 3.409397805336142e-05, "loss": 0.4539, "num_input_tokens_seen": 5528112, "step": 8460 }, { "epoch": 4.436582809224318, "grad_norm": 0.3291518986225128, "learning_rate": 3.407267233774193e-05, "loss": 0.5269, "num_input_tokens_seen": 5531664, "step": 8465 }, { "epoch": 4.439203354297694, "grad_norm": 0.7694632411003113, "learning_rate": 3.4051359030524654e-05, "loss": 0.4033, "num_input_tokens_seen": 5536368, "step": 8470 }, { "epoch": 4.4418238993710695, "grad_norm": 0.38330599665641785, "learning_rate": 3.4030038149543594e-05, "loss": 0.4585, "num_input_tokens_seen": 5539600, "step": 8475 }, { "epoch": 4.444444444444445, "grad_norm": 0.36478012800216675, "learning_rate": 3.4008709712639084e-05, "loss": 0.4131, "num_input_tokens_seen": 5542544, "step": 8480 }, { "epoch": 4.44706498951782, "grad_norm": 0.3314960300922394, "learning_rate": 3.398737373765779e-05, "loss": 0.4604, "num_input_tokens_seen": 5546704, "step": 8485 }, { "epoch": 4.449685534591195, "grad_norm": 0.3502393066883087, "learning_rate": 3.396603024245267e-05, "loss": 0.417, "num_input_tokens_seen": 5549520, "step": 8490 }, { "epoch": 4.45230607966457, "grad_norm": 0.7716391682624817, "learning_rate": 3.3944679244883e-05, "loss": 0.4789, "num_input_tokens_seen": 5553040, "step": 8495 }, { "epoch": 4.454926624737945, "grad_norm": 0.34137940406799316, "learning_rate": 3.392332076281433e-05, "loss": 0.4926, "num_input_tokens_seen": 5556880, "step": 8500 }, { "epoch": 4.4575471698113205, "grad_norm": 0.4445749521255493, "learning_rate": 3.390195481411842e-05, "loss": 0.3895, "num_input_tokens_seen": 5559760, "step": 8505 }, { "epoch": 4.460167714884696, "grad_norm": 0.30663567781448364, "learning_rate": 3.3880581416673366e-05, "loss": 0.4466, "num_input_tokens_seen": 5564688, "step": 8510 }, { "epoch": 4.462788259958071, "grad_norm": 0.5879607796669006, "learning_rate": 3.385920058836342e-05, "loss": 0.5214, "num_input_tokens_seen": 5568112, "step": 8515 }, { "epoch": 4.465408805031447, "grad_norm": 0.5910243391990662, "learning_rate": 3.38378123470791e-05, "loss": 0.5786, "num_input_tokens_seen": 5570928, "step": 8520 }, { "epoch": 4.468029350104822, "grad_norm": 0.371358186006546, "learning_rate": 3.381641671071709e-05, "loss": 0.4771, "num_input_tokens_seen": 5574480, "step": 8525 }, { "epoch": 4.470649895178197, "grad_norm": 0.7383023500442505, "learning_rate": 3.379501369718031e-05, "loss": 0.4863, "num_input_tokens_seen": 5577296, "step": 8530 }, { "epoch": 4.473270440251572, "grad_norm": 0.27015116810798645, "learning_rate": 3.377360332437781e-05, "loss": 0.3738, "num_input_tokens_seen": 5580784, "step": 8535 }, { "epoch": 4.475890985324948, "grad_norm": 0.3517860174179077, "learning_rate": 3.37521856102248e-05, "loss": 0.4281, "num_input_tokens_seen": 5583856, "step": 8540 }, { "epoch": 4.478511530398323, "grad_norm": 0.45354926586151123, "learning_rate": 3.373076057264266e-05, "loss": 0.4612, "num_input_tokens_seen": 5586256, "step": 8545 }, { "epoch": 4.481132075471698, "grad_norm": 0.4755817949771881, "learning_rate": 3.370932822955888e-05, "loss": 0.4482, "num_input_tokens_seen": 5589200, "step": 8550 }, { "epoch": 4.483752620545073, "grad_norm": 0.37634965777397156, "learning_rate": 3.368788859890706e-05, "loss": 0.545, "num_input_tokens_seen": 5592976, "step": 8555 }, { "epoch": 4.486373165618448, "grad_norm": 0.5778486132621765, "learning_rate": 3.3666441698626906e-05, "loss": 0.513, "num_input_tokens_seen": 5595696, "step": 8560 }, { "epoch": 4.488993710691824, "grad_norm": 0.5040127635002136, "learning_rate": 3.364498754666421e-05, "loss": 0.508, "num_input_tokens_seen": 5598288, "step": 8565 }, { "epoch": 4.4916142557651995, "grad_norm": 0.40755513310432434, "learning_rate": 3.362352616097082e-05, "loss": 0.543, "num_input_tokens_seen": 5601296, "step": 8570 }, { "epoch": 4.494234800838575, "grad_norm": 1.258349895477295, "learning_rate": 3.360205755950464e-05, "loss": 0.3872, "num_input_tokens_seen": 5604752, "step": 8575 }, { "epoch": 4.49685534591195, "grad_norm": 0.46700724959373474, "learning_rate": 3.358058176022963e-05, "loss": 0.5285, "num_input_tokens_seen": 5607824, "step": 8580 }, { "epoch": 4.499475890985325, "grad_norm": 0.4868948757648468, "learning_rate": 3.355909878111574e-05, "loss": 0.5116, "num_input_tokens_seen": 5611248, "step": 8585 }, { "epoch": 4.5, "eval_loss": 0.4893417954444885, "eval_runtime": 14.5391, "eval_samples_per_second": 58.325, "eval_steps_per_second": 14.581, "num_input_tokens_seen": 5611760, "step": 8586 }, { "epoch": 4.5020964360587, "grad_norm": 0.6345266103744507, "learning_rate": 3.3537608640138954e-05, "loss": 0.4275, "num_input_tokens_seen": 5614192, "step": 8590 }, { "epoch": 4.504716981132075, "grad_norm": 0.5752541422843933, "learning_rate": 3.351611135528125e-05, "loss": 0.5363, "num_input_tokens_seen": 5616944, "step": 8595 }, { "epoch": 4.5073375262054505, "grad_norm": 0.6151546835899353, "learning_rate": 3.349460694453056e-05, "loss": 0.5047, "num_input_tokens_seen": 5619344, "step": 8600 }, { "epoch": 4.509958071278826, "grad_norm": 0.3492070436477661, "learning_rate": 3.3473095425880796e-05, "loss": 0.4049, "num_input_tokens_seen": 5622608, "step": 8605 }, { "epoch": 4.512578616352201, "grad_norm": 1.0362377166748047, "learning_rate": 3.345157681733181e-05, "loss": 0.4945, "num_input_tokens_seen": 5625296, "step": 8610 }, { "epoch": 4.515199161425577, "grad_norm": 0.4714951515197754, "learning_rate": 3.3430051136889404e-05, "loss": 0.3804, "num_input_tokens_seen": 5628272, "step": 8615 }, { "epoch": 4.517819706498952, "grad_norm": 0.34291717410087585, "learning_rate": 3.3408518402565276e-05, "loss": 0.4153, "num_input_tokens_seen": 5632944, "step": 8620 }, { "epoch": 4.520440251572327, "grad_norm": 0.5785251259803772, "learning_rate": 3.338697863237703e-05, "loss": 0.4482, "num_input_tokens_seen": 5636016, "step": 8625 }, { "epoch": 4.523060796645702, "grad_norm": 0.6072019934654236, "learning_rate": 3.336543184434817e-05, "loss": 0.4151, "num_input_tokens_seen": 5642064, "step": 8630 }, { "epoch": 4.5256813417190775, "grad_norm": 0.4820755422115326, "learning_rate": 3.334387805650805e-05, "loss": 0.5383, "num_input_tokens_seen": 5645072, "step": 8635 }, { "epoch": 4.528301886792453, "grad_norm": 0.358138769865036, "learning_rate": 3.3322317286891913e-05, "loss": 0.5309, "num_input_tokens_seen": 5648304, "step": 8640 }, { "epoch": 4.530922431865828, "grad_norm": 0.4544195830821991, "learning_rate": 3.330074955354082e-05, "loss": 0.5909, "num_input_tokens_seen": 5651120, "step": 8645 }, { "epoch": 4.533542976939203, "grad_norm": 1.032765507698059, "learning_rate": 3.3279174874501664e-05, "loss": 0.398, "num_input_tokens_seen": 5653712, "step": 8650 }, { "epoch": 4.536163522012579, "grad_norm": 0.41664746403694153, "learning_rate": 3.325759326782715e-05, "loss": 0.4762, "num_input_tokens_seen": 5656624, "step": 8655 }, { "epoch": 4.538784067085954, "grad_norm": 0.49179938435554504, "learning_rate": 3.323600475157578e-05, "loss": 0.4308, "num_input_tokens_seen": 5658960, "step": 8660 }, { "epoch": 4.5414046121593294, "grad_norm": 0.5948353409767151, "learning_rate": 3.321440934381184e-05, "loss": 0.4762, "num_input_tokens_seen": 5661872, "step": 8665 }, { "epoch": 4.544025157232705, "grad_norm": 0.34355592727661133, "learning_rate": 3.319280706260538e-05, "loss": 0.5717, "num_input_tokens_seen": 5664560, "step": 8670 }, { "epoch": 4.54664570230608, "grad_norm": 0.4280647039413452, "learning_rate": 3.31711979260322e-05, "loss": 0.4155, "num_input_tokens_seen": 5667664, "step": 8675 }, { "epoch": 4.549266247379455, "grad_norm": 0.5538745522499084, "learning_rate": 3.3149581952173846e-05, "loss": 0.4073, "num_input_tokens_seen": 5670128, "step": 8680 }, { "epoch": 4.55188679245283, "grad_norm": 0.47618287801742554, "learning_rate": 3.312795915911757e-05, "loss": 0.5473, "num_input_tokens_seen": 5673296, "step": 8685 }, { "epoch": 4.554507337526205, "grad_norm": 0.46106773614883423, "learning_rate": 3.310632956495634e-05, "loss": 0.5153, "num_input_tokens_seen": 5675920, "step": 8690 }, { "epoch": 4.5571278825995805, "grad_norm": 1.0205609798431396, "learning_rate": 3.308469318778881e-05, "loss": 0.4375, "num_input_tokens_seen": 5678896, "step": 8695 }, { "epoch": 4.559748427672956, "grad_norm": 0.8473626375198364, "learning_rate": 3.306305004571932e-05, "loss": 0.5199, "num_input_tokens_seen": 5681488, "step": 8700 }, { "epoch": 4.562368972746331, "grad_norm": 0.8841246962547302, "learning_rate": 3.304140015685785e-05, "loss": 0.5089, "num_input_tokens_seen": 5684432, "step": 8705 }, { "epoch": 4.564989517819707, "grad_norm": 0.3566700518131256, "learning_rate": 3.301974353932005e-05, "loss": 0.4697, "num_input_tokens_seen": 5688112, "step": 8710 }, { "epoch": 4.567610062893082, "grad_norm": 0.43364855647087097, "learning_rate": 3.2998080211227185e-05, "loss": 0.5231, "num_input_tokens_seen": 5690416, "step": 8715 }, { "epoch": 4.570230607966457, "grad_norm": 0.44695407152175903, "learning_rate": 3.297641019070613e-05, "loss": 0.5329, "num_input_tokens_seen": 5693744, "step": 8720 }, { "epoch": 4.572851153039832, "grad_norm": 0.28730854392051697, "learning_rate": 3.2954733495889376e-05, "loss": 0.4982, "num_input_tokens_seen": 5697776, "step": 8725 }, { "epoch": 4.5754716981132075, "grad_norm": 0.8062750101089478, "learning_rate": 3.2933050144915e-05, "loss": 0.7569, "num_input_tokens_seen": 5700240, "step": 8730 }, { "epoch": 4.578092243186583, "grad_norm": 0.49814119935035706, "learning_rate": 3.2911360155926624e-05, "loss": 0.4284, "num_input_tokens_seen": 5702992, "step": 8735 }, { "epoch": 4.580712788259958, "grad_norm": 0.9802262187004089, "learning_rate": 3.2889663547073444e-05, "loss": 0.4328, "num_input_tokens_seen": 5706544, "step": 8740 }, { "epoch": 4.583333333333333, "grad_norm": 0.5427287220954895, "learning_rate": 3.286796033651019e-05, "loss": 0.4723, "num_input_tokens_seen": 5709328, "step": 8745 }, { "epoch": 4.585953878406709, "grad_norm": 0.2858721911907196, "learning_rate": 3.284625054239714e-05, "loss": 0.4052, "num_input_tokens_seen": 5712720, "step": 8750 }, { "epoch": 4.588574423480084, "grad_norm": 0.4495197534561157, "learning_rate": 3.282453418290002e-05, "loss": 0.5794, "num_input_tokens_seen": 5715536, "step": 8755 }, { "epoch": 4.591194968553459, "grad_norm": 0.5364432334899902, "learning_rate": 3.28028112761901e-05, "loss": 0.502, "num_input_tokens_seen": 5718352, "step": 8760 }, { "epoch": 4.593815513626835, "grad_norm": 0.6014461517333984, "learning_rate": 3.278108184044414e-05, "loss": 0.4719, "num_input_tokens_seen": 5722096, "step": 8765 }, { "epoch": 4.59643605870021, "grad_norm": 0.8177543878555298, "learning_rate": 3.275934589384432e-05, "loss": 0.4576, "num_input_tokens_seen": 5724816, "step": 8770 }, { "epoch": 4.599056603773585, "grad_norm": 0.359554648399353, "learning_rate": 3.273760345457828e-05, "loss": 0.4731, "num_input_tokens_seen": 5729040, "step": 8775 }, { "epoch": 4.60167714884696, "grad_norm": 0.3692406117916107, "learning_rate": 3.2715854540839106e-05, "loss": 0.453, "num_input_tokens_seen": 5732464, "step": 8780 }, { "epoch": 4.604297693920335, "grad_norm": 0.39620643854141235, "learning_rate": 3.269409917082531e-05, "loss": 0.335, "num_input_tokens_seen": 5735472, "step": 8785 }, { "epoch": 4.6069182389937104, "grad_norm": 0.36913880705833435, "learning_rate": 3.2672337362740765e-05, "loss": 0.4669, "num_input_tokens_seen": 5738896, "step": 8790 }, { "epoch": 4.609538784067086, "grad_norm": 0.6311262249946594, "learning_rate": 3.265056913479479e-05, "loss": 0.392, "num_input_tokens_seen": 5742032, "step": 8795 }, { "epoch": 4.612159329140461, "grad_norm": 0.3315156400203705, "learning_rate": 3.262879450520201e-05, "loss": 0.501, "num_input_tokens_seen": 5745168, "step": 8800 }, { "epoch": 4.614779874213837, "grad_norm": 0.46593302488327026, "learning_rate": 3.260701349218248e-05, "loss": 0.5383, "num_input_tokens_seen": 5747920, "step": 8805 }, { "epoch": 4.617400419287212, "grad_norm": 0.5114781856536865, "learning_rate": 3.258522611396151e-05, "loss": 0.4917, "num_input_tokens_seen": 5751120, "step": 8810 }, { "epoch": 4.620020964360587, "grad_norm": 0.4771624803543091, "learning_rate": 3.256343238876983e-05, "loss": 0.4516, "num_input_tokens_seen": 5754352, "step": 8815 }, { "epoch": 4.622641509433962, "grad_norm": 2.505239248275757, "learning_rate": 3.2541632334843394e-05, "loss": 0.4515, "num_input_tokens_seen": 5757200, "step": 8820 }, { "epoch": 4.6252620545073375, "grad_norm": 0.5120879411697388, "learning_rate": 3.251982597042351e-05, "loss": 0.358, "num_input_tokens_seen": 5760944, "step": 8825 }, { "epoch": 4.627882599580713, "grad_norm": 0.609200119972229, "learning_rate": 3.249801331375675e-05, "loss": 0.506, "num_input_tokens_seen": 5764848, "step": 8830 }, { "epoch": 4.630503144654088, "grad_norm": 0.5027180314064026, "learning_rate": 3.2476194383094946e-05, "loss": 0.3979, "num_input_tokens_seen": 5767984, "step": 8835 }, { "epoch": 4.633123689727463, "grad_norm": 0.48437562584877014, "learning_rate": 3.245436919669517e-05, "loss": 0.4238, "num_input_tokens_seen": 5771472, "step": 8840 }, { "epoch": 4.635744234800838, "grad_norm": 0.7801414728164673, "learning_rate": 3.243253777281977e-05, "loss": 0.5308, "num_input_tokens_seen": 5775536, "step": 8845 }, { "epoch": 4.638364779874214, "grad_norm": 0.3464021384716034, "learning_rate": 3.241070012973625e-05, "loss": 0.4342, "num_input_tokens_seen": 5778704, "step": 8850 }, { "epoch": 4.640985324947589, "grad_norm": 0.5922744870185852, "learning_rate": 3.238885628571738e-05, "loss": 0.4845, "num_input_tokens_seen": 5781104, "step": 8855 }, { "epoch": 4.643605870020965, "grad_norm": 0.5072488188743591, "learning_rate": 3.236700625904107e-05, "loss": 0.4996, "num_input_tokens_seen": 5784528, "step": 8860 }, { "epoch": 4.64622641509434, "grad_norm": 0.45058390498161316, "learning_rate": 3.234515006799045e-05, "loss": 0.7289, "num_input_tokens_seen": 5787184, "step": 8865 }, { "epoch": 4.648846960167715, "grad_norm": 0.8675150871276855, "learning_rate": 3.232328773085375e-05, "loss": 0.5166, "num_input_tokens_seen": 5790032, "step": 8870 }, { "epoch": 4.65146750524109, "grad_norm": 0.4480062425136566, "learning_rate": 3.2301419265924395e-05, "loss": 0.4772, "num_input_tokens_seen": 5793104, "step": 8875 }, { "epoch": 4.654088050314465, "grad_norm": 0.7561204433441162, "learning_rate": 3.2279544691500915e-05, "loss": 0.5633, "num_input_tokens_seen": 5796368, "step": 8880 }, { "epoch": 4.65670859538784, "grad_norm": 0.5095909833908081, "learning_rate": 3.2257664025886956e-05, "loss": 0.622, "num_input_tokens_seen": 5799248, "step": 8885 }, { "epoch": 4.659329140461216, "grad_norm": 0.35310542583465576, "learning_rate": 3.2235777287391256e-05, "loss": 0.551, "num_input_tokens_seen": 5802448, "step": 8890 }, { "epoch": 4.661949685534591, "grad_norm": 0.38375324010849, "learning_rate": 3.221388449432764e-05, "loss": 0.5092, "num_input_tokens_seen": 5805936, "step": 8895 }, { "epoch": 4.664570230607967, "grad_norm": 0.6977710127830505, "learning_rate": 3.219198566501499e-05, "loss": 0.4236, "num_input_tokens_seen": 5809744, "step": 8900 }, { "epoch": 4.667190775681342, "grad_norm": 0.48536959290504456, "learning_rate": 3.217008081777726e-05, "loss": 0.5731, "num_input_tokens_seen": 5812656, "step": 8905 }, { "epoch": 4.669811320754717, "grad_norm": 0.7424578666687012, "learning_rate": 3.214816997094341e-05, "loss": 0.6082, "num_input_tokens_seen": 5816048, "step": 8910 }, { "epoch": 4.672431865828092, "grad_norm": 0.29069575667381287, "learning_rate": 3.2126253142847454e-05, "loss": 0.5827, "num_input_tokens_seen": 5820080, "step": 8915 }, { "epoch": 4.6750524109014675, "grad_norm": 0.36145156621932983, "learning_rate": 3.2104330351828374e-05, "loss": 0.5171, "num_input_tokens_seen": 5823344, "step": 8920 }, { "epoch": 4.677672955974843, "grad_norm": 0.4011367857456207, "learning_rate": 3.208240161623017e-05, "loss": 0.4789, "num_input_tokens_seen": 5826448, "step": 8925 }, { "epoch": 4.680293501048218, "grad_norm": 0.7429378628730774, "learning_rate": 3.20604669544018e-05, "loss": 0.4337, "num_input_tokens_seen": 5830000, "step": 8930 }, { "epoch": 4.682914046121593, "grad_norm": 0.3241747319698334, "learning_rate": 3.2038526384697204e-05, "loss": 0.3583, "num_input_tokens_seen": 5832464, "step": 8935 }, { "epoch": 4.685534591194968, "grad_norm": 0.5026168823242188, "learning_rate": 3.201657992547523e-05, "loss": 0.4755, "num_input_tokens_seen": 5835568, "step": 8940 }, { "epoch": 4.688155136268344, "grad_norm": 0.31656861305236816, "learning_rate": 3.1994627595099674e-05, "loss": 0.4715, "num_input_tokens_seen": 5841360, "step": 8945 }, { "epoch": 4.690775681341719, "grad_norm": 0.5189852714538574, "learning_rate": 3.1972669411939256e-05, "loss": 0.3679, "num_input_tokens_seen": 5844240, "step": 8950 }, { "epoch": 4.693396226415095, "grad_norm": 0.4636079668998718, "learning_rate": 3.195070539436757e-05, "loss": 0.4124, "num_input_tokens_seen": 5848144, "step": 8955 }, { "epoch": 4.69601677148847, "grad_norm": 0.5797479748725891, "learning_rate": 3.19287355607631e-05, "loss": 0.536, "num_input_tokens_seen": 5850800, "step": 8960 }, { "epoch": 4.698637316561845, "grad_norm": 0.40497109293937683, "learning_rate": 3.190675992950921e-05, "loss": 0.4604, "num_input_tokens_seen": 5853712, "step": 8965 }, { "epoch": 4.70125786163522, "grad_norm": 0.4582970142364502, "learning_rate": 3.18847785189941e-05, "loss": 0.4181, "num_input_tokens_seen": 5856400, "step": 8970 }, { "epoch": 4.703878406708595, "grad_norm": 0.6415480971336365, "learning_rate": 3.186279134761081e-05, "loss": 0.5172, "num_input_tokens_seen": 5859760, "step": 8975 }, { "epoch": 4.70649895178197, "grad_norm": 0.31012627482414246, "learning_rate": 3.18407984337572e-05, "loss": 0.3869, "num_input_tokens_seen": 5862672, "step": 8980 }, { "epoch": 4.709119496855346, "grad_norm": 0.4601421356201172, "learning_rate": 3.181879979583593e-05, "loss": 0.4989, "num_input_tokens_seen": 5866288, "step": 8985 }, { "epoch": 4.711740041928721, "grad_norm": 0.39770400524139404, "learning_rate": 3.179679545225447e-05, "loss": 0.5504, "num_input_tokens_seen": 5869424, "step": 8990 }, { "epoch": 4.714360587002097, "grad_norm": 0.23897571861743927, "learning_rate": 3.177478542142503e-05, "loss": 0.3591, "num_input_tokens_seen": 5872240, "step": 8995 }, { "epoch": 4.716981132075472, "grad_norm": 0.5246466398239136, "learning_rate": 3.175276972176462e-05, "loss": 0.3907, "num_input_tokens_seen": 5874672, "step": 9000 }, { "epoch": 4.719601677148847, "grad_norm": 0.3747934401035309, "learning_rate": 3.173074837169495e-05, "loss": 0.4191, "num_input_tokens_seen": 5878544, "step": 9005 }, { "epoch": 4.722222222222222, "grad_norm": 0.4123523235321045, "learning_rate": 3.1708721389642495e-05, "loss": 0.5302, "num_input_tokens_seen": 5882096, "step": 9010 }, { "epoch": 4.7248427672955975, "grad_norm": 0.7321045994758606, "learning_rate": 3.1686688794038436e-05, "loss": 0.4933, "num_input_tokens_seen": 5886512, "step": 9015 }, { "epoch": 4.727463312368973, "grad_norm": 0.4038061201572418, "learning_rate": 3.1664650603318616e-05, "loss": 0.4192, "num_input_tokens_seen": 5889648, "step": 9020 }, { "epoch": 4.730083857442348, "grad_norm": 0.5774592757225037, "learning_rate": 3.1642606835923606e-05, "loss": 0.4685, "num_input_tokens_seen": 5892752, "step": 9025 }, { "epoch": 4.732704402515723, "grad_norm": 0.2570076286792755, "learning_rate": 3.1620557510298607e-05, "loss": 0.6099, "num_input_tokens_seen": 5896112, "step": 9030 }, { "epoch": 4.735324947589098, "grad_norm": 0.392730176448822, "learning_rate": 3.159850264489351e-05, "loss": 0.461, "num_input_tokens_seen": 5899440, "step": 9035 }, { "epoch": 4.737945492662474, "grad_norm": 0.5082439184188843, "learning_rate": 3.157644225816281e-05, "loss": 0.5739, "num_input_tokens_seen": 5902096, "step": 9040 }, { "epoch": 4.740566037735849, "grad_norm": 0.33980193734169006, "learning_rate": 3.1554376368565616e-05, "loss": 0.434, "num_input_tokens_seen": 5904880, "step": 9045 }, { "epoch": 4.743186582809225, "grad_norm": 0.5682492852210999, "learning_rate": 3.153230499456568e-05, "loss": 0.5021, "num_input_tokens_seen": 5907632, "step": 9050 }, { "epoch": 4.7458071278826, "grad_norm": 1.1442339420318604, "learning_rate": 3.15102281546313e-05, "loss": 0.4372, "num_input_tokens_seen": 5911120, "step": 9055 }, { "epoch": 4.748427672955975, "grad_norm": 0.4468970000743866, "learning_rate": 3.148814586723537e-05, "loss": 0.4234, "num_input_tokens_seen": 5914928, "step": 9060 }, { "epoch": 4.75104821802935, "grad_norm": 0.4429374933242798, "learning_rate": 3.146605815085536e-05, "loss": 0.3951, "num_input_tokens_seen": 5918192, "step": 9065 }, { "epoch": 4.753668763102725, "grad_norm": 0.38377314805984497, "learning_rate": 3.1443965023973245e-05, "loss": 0.3656, "num_input_tokens_seen": 5921232, "step": 9070 }, { "epoch": 4.7562893081761, "grad_norm": 0.2910180687904358, "learning_rate": 3.142186650507554e-05, "loss": 0.4432, "num_input_tokens_seen": 5925040, "step": 9075 }, { "epoch": 4.758909853249476, "grad_norm": 0.6747817397117615, "learning_rate": 3.1399762612653286e-05, "loss": 0.5, "num_input_tokens_seen": 5927376, "step": 9080 }, { "epoch": 4.761530398322851, "grad_norm": 0.295411616563797, "learning_rate": 3.137765336520201e-05, "loss": 0.4107, "num_input_tokens_seen": 5930576, "step": 9085 }, { "epoch": 4.764150943396227, "grad_norm": 1.9080991744995117, "learning_rate": 3.1355538781221705e-05, "loss": 0.3843, "num_input_tokens_seen": 5933328, "step": 9090 }, { "epoch": 4.766771488469602, "grad_norm": 0.35855039954185486, "learning_rate": 3.133341887921687e-05, "loss": 0.4739, "num_input_tokens_seen": 5936208, "step": 9095 }, { "epoch": 4.769392033542977, "grad_norm": 0.4287266433238983, "learning_rate": 3.1311293677696404e-05, "loss": 0.6152, "num_input_tokens_seen": 5938832, "step": 9100 }, { "epoch": 4.772012578616352, "grad_norm": 0.5685389637947083, "learning_rate": 3.1289163195173695e-05, "loss": 0.3808, "num_input_tokens_seen": 5942128, "step": 9105 }, { "epoch": 4.7746331236897275, "grad_norm": 0.3315616846084595, "learning_rate": 3.126702745016648e-05, "loss": 0.2973, "num_input_tokens_seen": 5945648, "step": 9110 }, { "epoch": 4.777253668763103, "grad_norm": 0.2841363847255707, "learning_rate": 3.1244886461196976e-05, "loss": 0.4009, "num_input_tokens_seen": 5950352, "step": 9115 }, { "epoch": 4.779874213836478, "grad_norm": 0.6281412243843079, "learning_rate": 3.1222740246791734e-05, "loss": 0.594, "num_input_tokens_seen": 5953488, "step": 9120 }, { "epoch": 4.782494758909853, "grad_norm": 0.395367830991745, "learning_rate": 3.12005888254817e-05, "loss": 0.4487, "num_input_tokens_seen": 5956784, "step": 9125 }, { "epoch": 4.785115303983228, "grad_norm": 0.4911406636238098, "learning_rate": 3.1178432215802155e-05, "loss": 0.5174, "num_input_tokens_seen": 5960080, "step": 9130 }, { "epoch": 4.787735849056604, "grad_norm": 0.3515752851963043, "learning_rate": 3.115627043629277e-05, "loss": 0.3742, "num_input_tokens_seen": 5962864, "step": 9135 }, { "epoch": 4.790356394129979, "grad_norm": 0.3346322476863861, "learning_rate": 3.113410350549748e-05, "loss": 0.4609, "num_input_tokens_seen": 5966000, "step": 9140 }, { "epoch": 4.7929769392033545, "grad_norm": 0.4029335677623749, "learning_rate": 3.111193144196457e-05, "loss": 0.5204, "num_input_tokens_seen": 5969168, "step": 9145 }, { "epoch": 4.79559748427673, "grad_norm": 0.3227789103984833, "learning_rate": 3.1089754264246615e-05, "loss": 0.5071, "num_input_tokens_seen": 5972720, "step": 9150 }, { "epoch": 4.798218029350105, "grad_norm": 0.28024807572364807, "learning_rate": 3.106757199090046e-05, "loss": 0.3548, "num_input_tokens_seen": 5975760, "step": 9155 }, { "epoch": 4.80083857442348, "grad_norm": 0.48075559735298157, "learning_rate": 3.104538464048721e-05, "loss": 0.3557, "num_input_tokens_seen": 5979120, "step": 9160 }, { "epoch": 4.803459119496855, "grad_norm": 0.38854023814201355, "learning_rate": 3.102319223157225e-05, "loss": 0.5042, "num_input_tokens_seen": 5983376, "step": 9165 }, { "epoch": 4.80607966457023, "grad_norm": 0.4310731589794159, "learning_rate": 3.100099478272515e-05, "loss": 0.5559, "num_input_tokens_seen": 5986416, "step": 9170 }, { "epoch": 4.808700209643606, "grad_norm": 0.5386779308319092, "learning_rate": 3.097879231251973e-05, "loss": 0.3924, "num_input_tokens_seen": 5989136, "step": 9175 }, { "epoch": 4.811320754716981, "grad_norm": 0.6986268758773804, "learning_rate": 3.0956584839534006e-05, "loss": 0.6541, "num_input_tokens_seen": 5992112, "step": 9180 }, { "epoch": 4.813941299790356, "grad_norm": 0.3730703294277191, "learning_rate": 3.093437238235018e-05, "loss": 0.4141, "num_input_tokens_seen": 5995472, "step": 9185 }, { "epoch": 4.816561844863732, "grad_norm": 0.5066808462142944, "learning_rate": 3.0912154959554606e-05, "loss": 0.4329, "num_input_tokens_seen": 5998736, "step": 9190 }, { "epoch": 4.819182389937107, "grad_norm": 0.4197409749031067, "learning_rate": 3.088993258973782e-05, "loss": 0.6583, "num_input_tokens_seen": 6001552, "step": 9195 }, { "epoch": 4.821802935010482, "grad_norm": 0.8743690252304077, "learning_rate": 3.0867705291494486e-05, "loss": 0.518, "num_input_tokens_seen": 6005520, "step": 9200 }, { "epoch": 4.8244234800838575, "grad_norm": 0.33514055609703064, "learning_rate": 3.0845473083423395e-05, "loss": 0.4308, "num_input_tokens_seen": 6009200, "step": 9205 }, { "epoch": 4.827044025157233, "grad_norm": 0.36953285336494446, "learning_rate": 3.082323598412743e-05, "loss": 0.3963, "num_input_tokens_seen": 6012912, "step": 9210 }, { "epoch": 4.829664570230608, "grad_norm": 0.3419128954410553, "learning_rate": 3.080099401221359e-05, "loss": 0.557, "num_input_tokens_seen": 6017744, "step": 9215 }, { "epoch": 4.832285115303983, "grad_norm": 0.7701703906059265, "learning_rate": 3.0778747186292936e-05, "loss": 0.4375, "num_input_tokens_seen": 6020624, "step": 9220 }, { "epoch": 4.834905660377358, "grad_norm": 0.3135415315628052, "learning_rate": 3.075649552498061e-05, "loss": 0.4403, "num_input_tokens_seen": 6023312, "step": 9225 }, { "epoch": 4.837526205450734, "grad_norm": 0.9941319823265076, "learning_rate": 3.073423904689577e-05, "loss": 0.478, "num_input_tokens_seen": 6026032, "step": 9230 }, { "epoch": 4.840146750524109, "grad_norm": 0.5781282186508179, "learning_rate": 3.071197777066162e-05, "loss": 0.4009, "num_input_tokens_seen": 6029872, "step": 9235 }, { "epoch": 4.8427672955974845, "grad_norm": 0.20285072922706604, "learning_rate": 3.068971171490539e-05, "loss": 0.4701, "num_input_tokens_seen": 6033232, "step": 9240 }, { "epoch": 4.84538784067086, "grad_norm": 1.5433902740478516, "learning_rate": 3.066744089825829e-05, "loss": 0.6571, "num_input_tokens_seen": 6035888, "step": 9245 }, { "epoch": 4.848008385744235, "grad_norm": 0.2867349088191986, "learning_rate": 3.064516533935553e-05, "loss": 0.4644, "num_input_tokens_seen": 6039408, "step": 9250 }, { "epoch": 4.85062893081761, "grad_norm": 0.3503234088420868, "learning_rate": 3.062288505683626e-05, "loss": 0.5428, "num_input_tokens_seen": 6042736, "step": 9255 }, { "epoch": 4.853249475890985, "grad_norm": 0.48264187574386597, "learning_rate": 3.060060006934363e-05, "loss": 0.4791, "num_input_tokens_seen": 6045296, "step": 9260 }, { "epoch": 4.85587002096436, "grad_norm": 0.5658817887306213, "learning_rate": 3.057831039552469e-05, "loss": 0.4768, "num_input_tokens_seen": 6047920, "step": 9265 }, { "epoch": 4.8584905660377355, "grad_norm": 0.2983451783657074, "learning_rate": 3.0556016054030416e-05, "loss": 0.3668, "num_input_tokens_seen": 6052432, "step": 9270 }, { "epoch": 4.861111111111111, "grad_norm": 0.7134292721748352, "learning_rate": 3.053371706351569e-05, "loss": 0.4397, "num_input_tokens_seen": 6055472, "step": 9275 }, { "epoch": 4.863731656184486, "grad_norm": 0.5529367923736572, "learning_rate": 3.0511413442639296e-05, "loss": 0.5813, "num_input_tokens_seen": 6059184, "step": 9280 }, { "epoch": 4.866352201257862, "grad_norm": 0.5396789908409119, "learning_rate": 3.048910521006389e-05, "loss": 0.5977, "num_input_tokens_seen": 6062192, "step": 9285 }, { "epoch": 4.868972746331237, "grad_norm": 0.5828445553779602, "learning_rate": 3.046679238445598e-05, "loss": 0.4704, "num_input_tokens_seen": 6065264, "step": 9290 }, { "epoch": 4.871593291404612, "grad_norm": 0.3536330461502075, "learning_rate": 3.0444474984485905e-05, "loss": 0.5344, "num_input_tokens_seen": 6069008, "step": 9295 }, { "epoch": 4.8742138364779874, "grad_norm": 0.31432652473449707, "learning_rate": 3.042215302882786e-05, "loss": 0.4122, "num_input_tokens_seen": 6072624, "step": 9300 }, { "epoch": 4.876834381551363, "grad_norm": 0.4118805527687073, "learning_rate": 3.0399826536159836e-05, "loss": 0.5822, "num_input_tokens_seen": 6075280, "step": 9305 }, { "epoch": 4.879454926624738, "grad_norm": 0.7918564081192017, "learning_rate": 3.0377495525163624e-05, "loss": 0.4847, "num_input_tokens_seen": 6078448, "step": 9310 }, { "epoch": 4.882075471698113, "grad_norm": 0.6467188596725464, "learning_rate": 3.0355160014524786e-05, "loss": 0.3059, "num_input_tokens_seen": 6081008, "step": 9315 }, { "epoch": 4.884696016771488, "grad_norm": 0.345951646566391, "learning_rate": 3.033282002293266e-05, "loss": 0.4837, "num_input_tokens_seen": 6085296, "step": 9320 }, { "epoch": 4.887316561844864, "grad_norm": 0.4475436508655548, "learning_rate": 3.0310475569080345e-05, "loss": 0.4779, "num_input_tokens_seen": 6087664, "step": 9325 }, { "epoch": 4.889937106918239, "grad_norm": 0.528296709060669, "learning_rate": 3.0288126671664628e-05, "loss": 0.4746, "num_input_tokens_seen": 6090832, "step": 9330 }, { "epoch": 4.8925576519916145, "grad_norm": 0.6548992991447449, "learning_rate": 3.0265773349386078e-05, "loss": 0.5323, "num_input_tokens_seen": 6093616, "step": 9335 }, { "epoch": 4.89517819706499, "grad_norm": 0.449481338262558, "learning_rate": 3.024341562094891e-05, "loss": 0.3715, "num_input_tokens_seen": 6096592, "step": 9340 }, { "epoch": 4.897798742138365, "grad_norm": 0.23047912120819092, "learning_rate": 3.0221053505061063e-05, "loss": 0.332, "num_input_tokens_seen": 6100144, "step": 9345 }, { "epoch": 4.90041928721174, "grad_norm": 0.535720705986023, "learning_rate": 3.0198687020434142e-05, "loss": 0.5229, "num_input_tokens_seen": 6103536, "step": 9350 }, { "epoch": 4.903039832285115, "grad_norm": 0.3053257167339325, "learning_rate": 3.0176316185783383e-05, "loss": 0.48, "num_input_tokens_seen": 6107408, "step": 9355 }, { "epoch": 4.90566037735849, "grad_norm": 0.40045398473739624, "learning_rate": 3.015394101982768e-05, "loss": 0.4879, "num_input_tokens_seen": 6110480, "step": 9360 }, { "epoch": 4.9082809224318655, "grad_norm": 0.7042742371559143, "learning_rate": 3.013156154128955e-05, "loss": 0.4068, "num_input_tokens_seen": 6113488, "step": 9365 }, { "epoch": 4.910901467505241, "grad_norm": 0.39384725689888, "learning_rate": 3.010917776889513e-05, "loss": 0.4742, "num_input_tokens_seen": 6117648, "step": 9370 }, { "epoch": 4.913522012578616, "grad_norm": 0.722275972366333, "learning_rate": 3.0086789721374137e-05, "loss": 0.4653, "num_input_tokens_seen": 6120592, "step": 9375 }, { "epoch": 4.916142557651992, "grad_norm": 0.5488741993904114, "learning_rate": 3.006439741745985e-05, "loss": 0.525, "num_input_tokens_seen": 6124400, "step": 9380 }, { "epoch": 4.918763102725367, "grad_norm": 0.38751527667045593, "learning_rate": 3.004200087588914e-05, "loss": 0.5092, "num_input_tokens_seen": 6128400, "step": 9385 }, { "epoch": 4.921383647798742, "grad_norm": 0.316802978515625, "learning_rate": 3.00196001154024e-05, "loss": 0.4578, "num_input_tokens_seen": 6131536, "step": 9390 }, { "epoch": 4.924004192872117, "grad_norm": 0.3293273150920868, "learning_rate": 2.999719515474358e-05, "loss": 0.3294, "num_input_tokens_seen": 6135280, "step": 9395 }, { "epoch": 4.926624737945493, "grad_norm": 0.7787837386131287, "learning_rate": 2.997478601266011e-05, "loss": 0.5053, "num_input_tokens_seen": 6137680, "step": 9400 }, { "epoch": 4.929245283018868, "grad_norm": 0.550394594669342, "learning_rate": 2.995237270790295e-05, "loss": 0.6974, "num_input_tokens_seen": 6141264, "step": 9405 }, { "epoch": 4.931865828092243, "grad_norm": 0.7723302841186523, "learning_rate": 2.9929955259226515e-05, "loss": 0.4166, "num_input_tokens_seen": 6145104, "step": 9410 }, { "epoch": 4.934486373165618, "grad_norm": 0.3061046004295349, "learning_rate": 2.990753368538872e-05, "loss": 0.4514, "num_input_tokens_seen": 6148976, "step": 9415 }, { "epoch": 4.937106918238994, "grad_norm": 0.594552755355835, "learning_rate": 2.9885108005150897e-05, "loss": 0.6298, "num_input_tokens_seen": 6152560, "step": 9420 }, { "epoch": 4.939727463312369, "grad_norm": 0.5899253487586975, "learning_rate": 2.986267823727784e-05, "loss": 0.5725, "num_input_tokens_seen": 6155024, "step": 9425 }, { "epoch": 4.9423480083857445, "grad_norm": 0.3733568489551544, "learning_rate": 2.9840244400537754e-05, "loss": 0.4999, "num_input_tokens_seen": 6157584, "step": 9430 }, { "epoch": 4.94496855345912, "grad_norm": 0.5708548426628113, "learning_rate": 2.9817806513702244e-05, "loss": 0.5135, "num_input_tokens_seen": 6160240, "step": 9435 }, { "epoch": 4.947589098532495, "grad_norm": 0.37130022048950195, "learning_rate": 2.9795364595546315e-05, "loss": 0.5163, "num_input_tokens_seen": 6163696, "step": 9440 }, { "epoch": 4.95020964360587, "grad_norm": 0.3192526400089264, "learning_rate": 2.977291866484833e-05, "loss": 0.3599, "num_input_tokens_seen": 6168880, "step": 9445 }, { "epoch": 4.952830188679245, "grad_norm": 0.6192455291748047, "learning_rate": 2.975046874039003e-05, "loss": 0.4934, "num_input_tokens_seen": 6172208, "step": 9450 }, { "epoch": 4.95545073375262, "grad_norm": 0.3797912895679474, "learning_rate": 2.9728014840956488e-05, "loss": 0.3611, "num_input_tokens_seen": 6174928, "step": 9455 }, { "epoch": 4.9580712788259955, "grad_norm": 0.30609390139579773, "learning_rate": 2.9705556985336086e-05, "loss": 0.4707, "num_input_tokens_seen": 6178992, "step": 9460 }, { "epoch": 4.960691823899371, "grad_norm": 0.48182985186576843, "learning_rate": 2.968309519232053e-05, "loss": 0.446, "num_input_tokens_seen": 6182128, "step": 9465 }, { "epoch": 4.963312368972746, "grad_norm": 0.6618331074714661, "learning_rate": 2.966062948070485e-05, "loss": 0.4331, "num_input_tokens_seen": 6185328, "step": 9470 }, { "epoch": 4.965932914046122, "grad_norm": 0.29761141538619995, "learning_rate": 2.9638159869287303e-05, "loss": 0.476, "num_input_tokens_seen": 6188560, "step": 9475 }, { "epoch": 4.968553459119497, "grad_norm": 0.4442339837551117, "learning_rate": 2.9615686376869434e-05, "loss": 0.5576, "num_input_tokens_seen": 6193136, "step": 9480 }, { "epoch": 4.971174004192872, "grad_norm": 0.40380844473838806, "learning_rate": 2.9593209022256046e-05, "loss": 0.6796, "num_input_tokens_seen": 6196816, "step": 9485 }, { "epoch": 4.973794549266247, "grad_norm": 0.5270391702651978, "learning_rate": 2.9570727824255163e-05, "loss": 0.4507, "num_input_tokens_seen": 6200272, "step": 9490 }, { "epoch": 4.976415094339623, "grad_norm": 1.008781909942627, "learning_rate": 2.954824280167801e-05, "loss": 0.4112, "num_input_tokens_seen": 6203216, "step": 9495 }, { "epoch": 4.979035639412998, "grad_norm": 0.3381733000278473, "learning_rate": 2.9525753973339044e-05, "loss": 0.4209, "num_input_tokens_seen": 6206736, "step": 9500 }, { "epoch": 4.981656184486373, "grad_norm": 0.28747206926345825, "learning_rate": 2.9503261358055873e-05, "loss": 0.4025, "num_input_tokens_seen": 6209936, "step": 9505 }, { "epoch": 4.984276729559748, "grad_norm": 0.3128841519355774, "learning_rate": 2.9480764974649305e-05, "loss": 0.4762, "num_input_tokens_seen": 6213424, "step": 9510 }, { "epoch": 4.986897274633124, "grad_norm": 0.7031983733177185, "learning_rate": 2.9458264841943272e-05, "loss": 0.3935, "num_input_tokens_seen": 6217360, "step": 9515 }, { "epoch": 4.989517819706499, "grad_norm": 0.6143743395805359, "learning_rate": 2.9435760978764874e-05, "loss": 0.5149, "num_input_tokens_seen": 6219984, "step": 9520 }, { "epoch": 4.9921383647798745, "grad_norm": 0.2953967750072479, "learning_rate": 2.9413253403944297e-05, "loss": 0.3959, "num_input_tokens_seen": 6223664, "step": 9525 }, { "epoch": 4.99475890985325, "grad_norm": 0.3675239086151123, "learning_rate": 2.9390742136314863e-05, "loss": 0.4808, "num_input_tokens_seen": 6227440, "step": 9530 }, { "epoch": 4.997379454926625, "grad_norm": 0.476404070854187, "learning_rate": 2.9368227194712978e-05, "loss": 0.4072, "num_input_tokens_seen": 6230608, "step": 9535 }, { "epoch": 5.0, "grad_norm": 0.7735833525657654, "learning_rate": 2.9345708597978106e-05, "loss": 0.494, "num_input_tokens_seen": 6233920, "step": 9540 }, { "epoch": 5.0, "eval_loss": 0.4878688454627991, "eval_runtime": 14.5588, "eval_samples_per_second": 58.247, "eval_steps_per_second": 14.562, "num_input_tokens_seen": 6233920, "step": 9540 }, { "epoch": 5.002620545073375, "grad_norm": 0.40349891781806946, "learning_rate": 2.932318636495278e-05, "loss": 0.4334, "num_input_tokens_seen": 6237312, "step": 9545 }, { "epoch": 5.00524109014675, "grad_norm": 0.4388822019100189, "learning_rate": 2.930066051448258e-05, "loss": 0.408, "num_input_tokens_seen": 6241344, "step": 9550 }, { "epoch": 5.0078616352201255, "grad_norm": 0.5108670592308044, "learning_rate": 2.927813106541611e-05, "loss": 0.5586, "num_input_tokens_seen": 6244064, "step": 9555 }, { "epoch": 5.010482180293501, "grad_norm": 0.37603458762168884, "learning_rate": 2.9255598036604982e-05, "loss": 0.6517, "num_input_tokens_seen": 6247008, "step": 9560 }, { "epoch": 5.013102725366877, "grad_norm": 0.4240657091140747, "learning_rate": 2.92330614469038e-05, "loss": 0.566, "num_input_tokens_seen": 6250016, "step": 9565 }, { "epoch": 5.015723270440252, "grad_norm": 0.30379560589790344, "learning_rate": 2.921052131517016e-05, "loss": 0.5856, "num_input_tokens_seen": 6253504, "step": 9570 }, { "epoch": 5.018343815513627, "grad_norm": 0.4817821681499481, "learning_rate": 2.9187977660264615e-05, "loss": 0.4903, "num_input_tokens_seen": 6256256, "step": 9575 }, { "epoch": 5.020964360587002, "grad_norm": 0.3426409065723419, "learning_rate": 2.9165430501050657e-05, "loss": 0.6396, "num_input_tokens_seen": 6260160, "step": 9580 }, { "epoch": 5.023584905660377, "grad_norm": 0.3938826024532318, "learning_rate": 2.9142879856394732e-05, "loss": 0.4585, "num_input_tokens_seen": 6264032, "step": 9585 }, { "epoch": 5.026205450733753, "grad_norm": 0.30146610736846924, "learning_rate": 2.9120325745166178e-05, "loss": 0.5237, "num_input_tokens_seen": 6267552, "step": 9590 }, { "epoch": 5.028825995807128, "grad_norm": 0.313369482755661, "learning_rate": 2.909776818623725e-05, "loss": 0.4846, "num_input_tokens_seen": 6270272, "step": 9595 }, { "epoch": 5.031446540880503, "grad_norm": 0.3488582968711853, "learning_rate": 2.9075207198483084e-05, "loss": 0.2923, "num_input_tokens_seen": 6274624, "step": 9600 }, { "epoch": 5.034067085953878, "grad_norm": 0.4541565179824829, "learning_rate": 2.905264280078168e-05, "loss": 0.455, "num_input_tokens_seen": 6277728, "step": 9605 }, { "epoch": 5.036687631027253, "grad_norm": 0.9563388228416443, "learning_rate": 2.9030075012013902e-05, "loss": 0.4462, "num_input_tokens_seen": 6281024, "step": 9610 }, { "epoch": 5.039308176100629, "grad_norm": 0.5132864713668823, "learning_rate": 2.9007503851063433e-05, "loss": 0.5888, "num_input_tokens_seen": 6284576, "step": 9615 }, { "epoch": 5.0419287211740045, "grad_norm": 0.42765557765960693, "learning_rate": 2.8984929336816807e-05, "loss": 0.5506, "num_input_tokens_seen": 6288096, "step": 9620 }, { "epoch": 5.04454926624738, "grad_norm": 0.511037290096283, "learning_rate": 2.896235148816333e-05, "loss": 0.5235, "num_input_tokens_seen": 6292128, "step": 9625 }, { "epoch": 5.047169811320755, "grad_norm": 0.676875114440918, "learning_rate": 2.893977032399512e-05, "loss": 0.4868, "num_input_tokens_seen": 6298912, "step": 9630 }, { "epoch": 5.04979035639413, "grad_norm": 0.47063466906547546, "learning_rate": 2.8917185863207062e-05, "loss": 0.4728, "num_input_tokens_seen": 6301536, "step": 9635 }, { "epoch": 5.052410901467505, "grad_norm": 0.4328313171863556, "learning_rate": 2.889459812469681e-05, "loss": 0.379, "num_input_tokens_seen": 6303936, "step": 9640 }, { "epoch": 5.05503144654088, "grad_norm": 0.6250510215759277, "learning_rate": 2.8872007127364746e-05, "loss": 0.4796, "num_input_tokens_seen": 6306880, "step": 9645 }, { "epoch": 5.0576519916142555, "grad_norm": 0.41658997535705566, "learning_rate": 2.884941289011398e-05, "loss": 0.479, "num_input_tokens_seen": 6309920, "step": 9650 }, { "epoch": 5.060272536687631, "grad_norm": 0.4703925549983978, "learning_rate": 2.882681543185034e-05, "loss": 0.4127, "num_input_tokens_seen": 6312480, "step": 9655 }, { "epoch": 5.062893081761007, "grad_norm": 0.28785616159439087, "learning_rate": 2.880421477148235e-05, "loss": 0.4314, "num_input_tokens_seen": 6316256, "step": 9660 }, { "epoch": 5.065513626834382, "grad_norm": 0.4542624354362488, "learning_rate": 2.878161092792121e-05, "loss": 0.524, "num_input_tokens_seen": 6319520, "step": 9665 }, { "epoch": 5.068134171907757, "grad_norm": 0.35913607478141785, "learning_rate": 2.8759003920080786e-05, "loss": 0.5737, "num_input_tokens_seen": 6323328, "step": 9670 }, { "epoch": 5.070754716981132, "grad_norm": 0.2929772138595581, "learning_rate": 2.8736393766877578e-05, "loss": 0.3097, "num_input_tokens_seen": 6326720, "step": 9675 }, { "epoch": 5.073375262054507, "grad_norm": 0.7355338931083679, "learning_rate": 2.871378048723074e-05, "loss": 0.476, "num_input_tokens_seen": 6329280, "step": 9680 }, { "epoch": 5.075995807127883, "grad_norm": 0.2943524122238159, "learning_rate": 2.8691164100062034e-05, "loss": 0.4267, "num_input_tokens_seen": 6332384, "step": 9685 }, { "epoch": 5.078616352201258, "grad_norm": 0.5217158198356628, "learning_rate": 2.8668544624295814e-05, "loss": 0.368, "num_input_tokens_seen": 6336096, "step": 9690 }, { "epoch": 5.081236897274633, "grad_norm": 0.6632891297340393, "learning_rate": 2.864592207885902e-05, "loss": 0.5665, "num_input_tokens_seen": 6338432, "step": 9695 }, { "epoch": 5.083857442348008, "grad_norm": 0.5584380626678467, "learning_rate": 2.8623296482681166e-05, "loss": 0.4634, "num_input_tokens_seen": 6342016, "step": 9700 }, { "epoch": 5.086477987421383, "grad_norm": 0.9929739832878113, "learning_rate": 2.8600667854694328e-05, "loss": 0.5508, "num_input_tokens_seen": 6344896, "step": 9705 }, { "epoch": 5.089098532494759, "grad_norm": 0.40059345960617065, "learning_rate": 2.857803621383311e-05, "loss": 0.4899, "num_input_tokens_seen": 6348096, "step": 9710 }, { "epoch": 5.0917190775681345, "grad_norm": 0.6896577477455139, "learning_rate": 2.8555401579034607e-05, "loss": 0.5806, "num_input_tokens_seen": 6351488, "step": 9715 }, { "epoch": 5.09433962264151, "grad_norm": 0.4778538644313812, "learning_rate": 2.853276396923848e-05, "loss": 0.5439, "num_input_tokens_seen": 6354464, "step": 9720 }, { "epoch": 5.096960167714885, "grad_norm": 0.3565004765987396, "learning_rate": 2.851012340338683e-05, "loss": 0.4529, "num_input_tokens_seen": 6358912, "step": 9725 }, { "epoch": 5.09958071278826, "grad_norm": 0.9882478713989258, "learning_rate": 2.8487479900424253e-05, "loss": 0.3977, "num_input_tokens_seen": 6362112, "step": 9730 }, { "epoch": 5.102201257861635, "grad_norm": 0.4156314730644226, "learning_rate": 2.8464833479297794e-05, "loss": 0.4502, "num_input_tokens_seen": 6364896, "step": 9735 }, { "epoch": 5.10482180293501, "grad_norm": 0.4895382821559906, "learning_rate": 2.8442184158956947e-05, "loss": 0.6569, "num_input_tokens_seen": 6368160, "step": 9740 }, { "epoch": 5.1074423480083855, "grad_norm": 0.4252597987651825, "learning_rate": 2.8419531958353635e-05, "loss": 0.3805, "num_input_tokens_seen": 6371264, "step": 9745 }, { "epoch": 5.110062893081761, "grad_norm": 0.4360244572162628, "learning_rate": 2.839687689644217e-05, "loss": 0.5293, "num_input_tokens_seen": 6374144, "step": 9750 }, { "epoch": 5.112683438155136, "grad_norm": 0.42389217019081116, "learning_rate": 2.837421899217928e-05, "loss": 0.4518, "num_input_tokens_seen": 6377600, "step": 9755 }, { "epoch": 5.115303983228512, "grad_norm": 0.8273192048072815, "learning_rate": 2.8351558264524076e-05, "loss": 0.4554, "num_input_tokens_seen": 6380608, "step": 9760 }, { "epoch": 5.117924528301887, "grad_norm": 0.4289022386074066, "learning_rate": 2.8328894732437998e-05, "loss": 0.6095, "num_input_tokens_seen": 6384640, "step": 9765 }, { "epoch": 5.120545073375262, "grad_norm": 0.6777971386909485, "learning_rate": 2.830622841488488e-05, "loss": 0.4099, "num_input_tokens_seen": 6387456, "step": 9770 }, { "epoch": 5.123165618448637, "grad_norm": 0.4346393644809723, "learning_rate": 2.8283559330830834e-05, "loss": 0.453, "num_input_tokens_seen": 6390880, "step": 9775 }, { "epoch": 5.1257861635220126, "grad_norm": 0.4624761641025543, "learning_rate": 2.8260887499244333e-05, "loss": 0.3795, "num_input_tokens_seen": 6393696, "step": 9780 }, { "epoch": 5.128406708595388, "grad_norm": 0.272508829832077, "learning_rate": 2.823821293909612e-05, "loss": 0.4828, "num_input_tokens_seen": 6397472, "step": 9785 }, { "epoch": 5.131027253668763, "grad_norm": 0.31372126936912537, "learning_rate": 2.821553566935924e-05, "loss": 0.4522, "num_input_tokens_seen": 6400576, "step": 9790 }, { "epoch": 5.133647798742138, "grad_norm": 0.3795149028301239, "learning_rate": 2.8192855709008985e-05, "loss": 0.5889, "num_input_tokens_seen": 6403168, "step": 9795 }, { "epoch": 5.136268343815513, "grad_norm": 0.3168415427207947, "learning_rate": 2.8170173077022915e-05, "loss": 0.4946, "num_input_tokens_seen": 6406144, "step": 9800 }, { "epoch": 5.138888888888889, "grad_norm": 0.573560357093811, "learning_rate": 2.8147487792380832e-05, "loss": 0.5532, "num_input_tokens_seen": 6409344, "step": 9805 }, { "epoch": 5.1415094339622645, "grad_norm": 0.6552520394325256, "learning_rate": 2.8124799874064733e-05, "loss": 0.4487, "num_input_tokens_seen": 6411776, "step": 9810 }, { "epoch": 5.14412997903564, "grad_norm": 0.27062228322029114, "learning_rate": 2.810210934105883e-05, "loss": 0.3568, "num_input_tokens_seen": 6414784, "step": 9815 }, { "epoch": 5.146750524109015, "grad_norm": 1.0838414430618286, "learning_rate": 2.8079416212349528e-05, "loss": 0.4152, "num_input_tokens_seen": 6417248, "step": 9820 }, { "epoch": 5.14937106918239, "grad_norm": 0.7365031838417053, "learning_rate": 2.805672050692541e-05, "loss": 0.5856, "num_input_tokens_seen": 6420416, "step": 9825 }, { "epoch": 5.151991614255765, "grad_norm": 0.5714299082756042, "learning_rate": 2.8034022243777197e-05, "loss": 0.3941, "num_input_tokens_seen": 6424288, "step": 9830 }, { "epoch": 5.15461215932914, "grad_norm": 0.465469092130661, "learning_rate": 2.8011321441897754e-05, "loss": 0.5645, "num_input_tokens_seen": 6426784, "step": 9835 }, { "epoch": 5.1572327044025155, "grad_norm": 0.3035655915737152, "learning_rate": 2.7988618120282074e-05, "loss": 0.5108, "num_input_tokens_seen": 6429760, "step": 9840 }, { "epoch": 5.159853249475891, "grad_norm": 0.3916498124599457, "learning_rate": 2.7965912297927277e-05, "loss": 0.4317, "num_input_tokens_seen": 6432480, "step": 9845 }, { "epoch": 5.162473794549266, "grad_norm": 0.3444691300392151, "learning_rate": 2.794320399383254e-05, "loss": 0.4415, "num_input_tokens_seen": 6437312, "step": 9850 }, { "epoch": 5.165094339622642, "grad_norm": 0.42935115098953247, "learning_rate": 2.7920493226999143e-05, "loss": 0.5723, "num_input_tokens_seen": 6441664, "step": 9855 }, { "epoch": 5.167714884696017, "grad_norm": 0.23587745428085327, "learning_rate": 2.7897780016430414e-05, "loss": 0.3885, "num_input_tokens_seen": 6445120, "step": 9860 }, { "epoch": 5.170335429769392, "grad_norm": 0.4836586117744446, "learning_rate": 2.7875064381131733e-05, "loss": 0.4514, "num_input_tokens_seen": 6447968, "step": 9865 }, { "epoch": 5.172955974842767, "grad_norm": 0.5040432214736938, "learning_rate": 2.7852346340110508e-05, "loss": 0.4537, "num_input_tokens_seen": 6450592, "step": 9870 }, { "epoch": 5.1755765199161425, "grad_norm": 0.7270060777664185, "learning_rate": 2.7829625912376163e-05, "loss": 0.4491, "num_input_tokens_seen": 6454496, "step": 9875 }, { "epoch": 5.178197064989518, "grad_norm": 0.4355965256690979, "learning_rate": 2.7806903116940093e-05, "loss": 0.3861, "num_input_tokens_seen": 6457088, "step": 9880 }, { "epoch": 5.180817610062893, "grad_norm": 0.37935882806777954, "learning_rate": 2.778417797281571e-05, "loss": 0.5511, "num_input_tokens_seen": 6461088, "step": 9885 }, { "epoch": 5.183438155136268, "grad_norm": 0.36621615290641785, "learning_rate": 2.7761450499018383e-05, "loss": 0.5097, "num_input_tokens_seen": 6463872, "step": 9890 }, { "epoch": 5.186058700209643, "grad_norm": 0.40266308188438416, "learning_rate": 2.7738720714565418e-05, "loss": 0.4867, "num_input_tokens_seen": 6467424, "step": 9895 }, { "epoch": 5.188679245283019, "grad_norm": 0.5071987509727478, "learning_rate": 2.7715988638476055e-05, "loss": 0.4431, "num_input_tokens_seen": 6470688, "step": 9900 }, { "epoch": 5.191299790356394, "grad_norm": 0.4089651107788086, "learning_rate": 2.7693254289771454e-05, "loss": 0.4463, "num_input_tokens_seen": 6473440, "step": 9905 }, { "epoch": 5.19392033542977, "grad_norm": 0.7421665787696838, "learning_rate": 2.7670517687474697e-05, "loss": 0.5504, "num_input_tokens_seen": 6476352, "step": 9910 }, { "epoch": 5.196540880503145, "grad_norm": 0.2968252897262573, "learning_rate": 2.7647778850610723e-05, "loss": 0.4029, "num_input_tokens_seen": 6480064, "step": 9915 }, { "epoch": 5.19916142557652, "grad_norm": 0.4343636929988861, "learning_rate": 2.7625037798206345e-05, "loss": 0.4358, "num_input_tokens_seen": 6483840, "step": 9920 }, { "epoch": 5.201781970649895, "grad_norm": 0.5027446746826172, "learning_rate": 2.7602294549290243e-05, "loss": 0.6635, "num_input_tokens_seen": 6486368, "step": 9925 }, { "epoch": 5.20440251572327, "grad_norm": 0.2865714728832245, "learning_rate": 2.757954912289294e-05, "loss": 0.4021, "num_input_tokens_seen": 6489152, "step": 9930 }, { "epoch": 5.2070230607966455, "grad_norm": 0.4239647388458252, "learning_rate": 2.755680153804675e-05, "loss": 0.4431, "num_input_tokens_seen": 6493088, "step": 9935 }, { "epoch": 5.209643605870021, "grad_norm": 0.5021870136260986, "learning_rate": 2.7534051813785834e-05, "loss": 0.4408, "num_input_tokens_seen": 6496256, "step": 9940 }, { "epoch": 5.212264150943396, "grad_norm": 0.5236265063285828, "learning_rate": 2.75112999691461e-05, "loss": 0.4516, "num_input_tokens_seen": 6499296, "step": 9945 }, { "epoch": 5.214884696016772, "grad_norm": 0.5116919875144958, "learning_rate": 2.7488546023165262e-05, "loss": 0.4527, "num_input_tokens_seen": 6502624, "step": 9950 }, { "epoch": 5.217505241090147, "grad_norm": 0.31945326924324036, "learning_rate": 2.7465789994882796e-05, "loss": 0.6573, "num_input_tokens_seen": 6506272, "step": 9955 }, { "epoch": 5.220125786163522, "grad_norm": 0.5403057932853699, "learning_rate": 2.7443031903339896e-05, "loss": 0.4135, "num_input_tokens_seen": 6509504, "step": 9960 }, { "epoch": 5.222746331236897, "grad_norm": 0.6758636236190796, "learning_rate": 2.742027176757948e-05, "loss": 0.5419, "num_input_tokens_seen": 6512512, "step": 9965 }, { "epoch": 5.2253668763102725, "grad_norm": 0.22312001883983612, "learning_rate": 2.7397509606646204e-05, "loss": 0.4811, "num_input_tokens_seen": 6515456, "step": 9970 }, { "epoch": 5.227987421383648, "grad_norm": 0.4581824839115143, "learning_rate": 2.7374745439586414e-05, "loss": 0.4336, "num_input_tokens_seen": 6518880, "step": 9975 }, { "epoch": 5.230607966457023, "grad_norm": 0.8945074081420898, "learning_rate": 2.735197928544811e-05, "loss": 0.4308, "num_input_tokens_seen": 6521664, "step": 9980 }, { "epoch": 5.233228511530398, "grad_norm": 0.4976062476634979, "learning_rate": 2.7329211163280972e-05, "loss": 0.4661, "num_input_tokens_seen": 6524160, "step": 9985 }, { "epoch": 5.235849056603773, "grad_norm": 0.5098474025726318, "learning_rate": 2.730644109213632e-05, "loss": 0.3731, "num_input_tokens_seen": 6527904, "step": 9990 }, { "epoch": 5.238469601677149, "grad_norm": 1.009907841682434, "learning_rate": 2.7283669091067127e-05, "loss": 0.702, "num_input_tokens_seen": 6530560, "step": 9995 }, { "epoch": 5.241090146750524, "grad_norm": 0.5045578479766846, "learning_rate": 2.7260895179127944e-05, "loss": 0.505, "num_input_tokens_seen": 6534048, "step": 10000 }, { "epoch": 5.2437106918239, "grad_norm": 0.6285730004310608, "learning_rate": 2.7238119375374954e-05, "loss": 0.4342, "num_input_tokens_seen": 6536864, "step": 10005 }, { "epoch": 5.246331236897275, "grad_norm": 1.4638087749481201, "learning_rate": 2.7215341698865904e-05, "loss": 0.5526, "num_input_tokens_seen": 6540608, "step": 10010 }, { "epoch": 5.24895178197065, "grad_norm": 0.5576493144035339, "learning_rate": 2.7192562168660113e-05, "loss": 0.5217, "num_input_tokens_seen": 6543744, "step": 10015 }, { "epoch": 5.251572327044025, "grad_norm": 0.6503416895866394, "learning_rate": 2.7169780803818445e-05, "loss": 0.4342, "num_input_tokens_seen": 6546656, "step": 10020 }, { "epoch": 5.2541928721174, "grad_norm": 0.5722594857215881, "learning_rate": 2.714699762340332e-05, "loss": 0.5271, "num_input_tokens_seen": 6549152, "step": 10025 }, { "epoch": 5.256813417190775, "grad_norm": 0.5046286582946777, "learning_rate": 2.7124212646478652e-05, "loss": 0.5364, "num_input_tokens_seen": 6552096, "step": 10030 }, { "epoch": 5.259433962264151, "grad_norm": 0.2642299234867096, "learning_rate": 2.7101425892109865e-05, "loss": 0.4912, "num_input_tokens_seen": 6555232, "step": 10035 }, { "epoch": 5.262054507337526, "grad_norm": 0.4054403603076935, "learning_rate": 2.707863737936389e-05, "loss": 0.4181, "num_input_tokens_seen": 6557920, "step": 10040 }, { "epoch": 5.264675052410902, "grad_norm": 0.4038589596748352, "learning_rate": 2.7055847127309107e-05, "loss": 0.4712, "num_input_tokens_seen": 6560928, "step": 10045 }, { "epoch": 5.267295597484277, "grad_norm": 0.41191983222961426, "learning_rate": 2.703305515501534e-05, "loss": 0.4272, "num_input_tokens_seen": 6563488, "step": 10050 }, { "epoch": 5.269916142557652, "grad_norm": 0.3651892840862274, "learning_rate": 2.70102614815539e-05, "loss": 0.4489, "num_input_tokens_seen": 6567520, "step": 10055 }, { "epoch": 5.272536687631027, "grad_norm": 0.3069641888141632, "learning_rate": 2.6987466125997475e-05, "loss": 0.5042, "num_input_tokens_seen": 6570528, "step": 10060 }, { "epoch": 5.2751572327044025, "grad_norm": 0.47585466504096985, "learning_rate": 2.696466910742018e-05, "loss": 0.4037, "num_input_tokens_seen": 6573824, "step": 10065 }, { "epoch": 5.277777777777778, "grad_norm": 0.3749580681324005, "learning_rate": 2.694187044489751e-05, "loss": 0.5177, "num_input_tokens_seen": 6576896, "step": 10070 }, { "epoch": 5.280398322851153, "grad_norm": 0.9449686408042908, "learning_rate": 2.691907015750636e-05, "loss": 0.486, "num_input_tokens_seen": 6579136, "step": 10075 }, { "epoch": 5.283018867924528, "grad_norm": 0.3992260694503784, "learning_rate": 2.6896268264324964e-05, "loss": 0.4181, "num_input_tokens_seen": 6582080, "step": 10080 }, { "epoch": 5.285639412997903, "grad_norm": 0.4715198278427124, "learning_rate": 2.6873464784432894e-05, "loss": 0.4428, "num_input_tokens_seen": 6584896, "step": 10085 }, { "epoch": 5.288259958071279, "grad_norm": 0.4954780042171478, "learning_rate": 2.6850659736911073e-05, "loss": 0.4805, "num_input_tokens_seen": 6588448, "step": 10090 }, { "epoch": 5.290880503144654, "grad_norm": 0.524285614490509, "learning_rate": 2.682785314084172e-05, "loss": 0.4009, "num_input_tokens_seen": 6592704, "step": 10095 }, { "epoch": 5.29350104821803, "grad_norm": 0.4552164673805237, "learning_rate": 2.680504501530835e-05, "loss": 0.4574, "num_input_tokens_seen": 6596256, "step": 10100 }, { "epoch": 5.296121593291405, "grad_norm": 0.39894095063209534, "learning_rate": 2.6782235379395766e-05, "loss": 0.5041, "num_input_tokens_seen": 6598656, "step": 10105 }, { "epoch": 5.29874213836478, "grad_norm": 0.34757381677627563, "learning_rate": 2.675942425219002e-05, "loss": 0.487, "num_input_tokens_seen": 6601504, "step": 10110 }, { "epoch": 5.301362683438155, "grad_norm": 0.6927559971809387, "learning_rate": 2.673661165277843e-05, "loss": 0.5109, "num_input_tokens_seen": 6604640, "step": 10115 }, { "epoch": 5.30398322851153, "grad_norm": 0.3860779404640198, "learning_rate": 2.6713797600249536e-05, "loss": 0.5083, "num_input_tokens_seen": 6607520, "step": 10120 }, { "epoch": 5.306603773584905, "grad_norm": 0.3701600134372711, "learning_rate": 2.6690982113693092e-05, "loss": 0.3914, "num_input_tokens_seen": 6611360, "step": 10125 }, { "epoch": 5.309224318658281, "grad_norm": 0.3095915913581848, "learning_rate": 2.6668165212200057e-05, "loss": 0.3682, "num_input_tokens_seen": 6616672, "step": 10130 }, { "epoch": 5.311844863731656, "grad_norm": 0.49796193838119507, "learning_rate": 2.664534691486257e-05, "loss": 0.6157, "num_input_tokens_seen": 6619424, "step": 10135 }, { "epoch": 5.314465408805032, "grad_norm": 0.6280518174171448, "learning_rate": 2.6622527240773942e-05, "loss": 0.3318, "num_input_tokens_seen": 6622656, "step": 10140 }, { "epoch": 5.317085953878407, "grad_norm": 0.5188391208648682, "learning_rate": 2.6599706209028634e-05, "loss": 0.408, "num_input_tokens_seen": 6625568, "step": 10145 }, { "epoch": 5.319706498951782, "grad_norm": 1.2088319063186646, "learning_rate": 2.657688383872224e-05, "loss": 0.65, "num_input_tokens_seen": 6628896, "step": 10150 }, { "epoch": 5.322327044025157, "grad_norm": 0.5964451432228088, "learning_rate": 2.655406014895147e-05, "loss": 0.4235, "num_input_tokens_seen": 6632672, "step": 10155 }, { "epoch": 5.3249475890985325, "grad_norm": 0.4515666365623474, "learning_rate": 2.653123515881417e-05, "loss": 0.4235, "num_input_tokens_seen": 6637344, "step": 10160 }, { "epoch": 5.327568134171908, "grad_norm": 0.7993960976600647, "learning_rate": 2.650840888740923e-05, "loss": 0.5813, "num_input_tokens_seen": 6640416, "step": 10165 }, { "epoch": 5.330188679245283, "grad_norm": 0.37816399335861206, "learning_rate": 2.6485581353836624e-05, "loss": 0.3281, "num_input_tokens_seen": 6642688, "step": 10170 }, { "epoch": 5.332809224318658, "grad_norm": 0.7145774364471436, "learning_rate": 2.6462752577197407e-05, "loss": 0.4777, "num_input_tokens_seen": 6645952, "step": 10175 }, { "epoch": 5.335429769392033, "grad_norm": 0.5625265836715698, "learning_rate": 2.643992257659365e-05, "loss": 0.4857, "num_input_tokens_seen": 6648576, "step": 10180 }, { "epoch": 5.338050314465409, "grad_norm": 0.7054005861282349, "learning_rate": 2.641709137112845e-05, "loss": 0.5534, "num_input_tokens_seen": 6651840, "step": 10185 }, { "epoch": 5.340670859538784, "grad_norm": 0.6679807305335999, "learning_rate": 2.639425897990593e-05, "loss": 0.6205, "num_input_tokens_seen": 6654336, "step": 10190 }, { "epoch": 5.34329140461216, "grad_norm": 0.4309811592102051, "learning_rate": 2.6371425422031172e-05, "loss": 0.4883, "num_input_tokens_seen": 6657600, "step": 10195 }, { "epoch": 5.345911949685535, "grad_norm": 0.31303921341896057, "learning_rate": 2.6348590716610273e-05, "loss": 0.4108, "num_input_tokens_seen": 6660768, "step": 10200 }, { "epoch": 5.34853249475891, "grad_norm": 0.8147851824760437, "learning_rate": 2.6325754882750252e-05, "loss": 0.5389, "num_input_tokens_seen": 6663808, "step": 10205 }, { "epoch": 5.351153039832285, "grad_norm": 0.2548147141933441, "learning_rate": 2.630291793955911e-05, "loss": 0.4823, "num_input_tokens_seen": 6666048, "step": 10210 }, { "epoch": 5.35377358490566, "grad_norm": 0.31770870089530945, "learning_rate": 2.6280079906145756e-05, "loss": 0.4132, "num_input_tokens_seen": 6668672, "step": 10215 }, { "epoch": 5.356394129979035, "grad_norm": 0.739115297794342, "learning_rate": 2.6257240801620004e-05, "loss": 0.6278, "num_input_tokens_seen": 6671680, "step": 10220 }, { "epoch": 5.359014675052411, "grad_norm": 0.4275490343570709, "learning_rate": 2.623440064509258e-05, "loss": 0.4039, "num_input_tokens_seen": 6674592, "step": 10225 }, { "epoch": 5.361635220125786, "grad_norm": 0.5016900897026062, "learning_rate": 2.621155945567508e-05, "loss": 0.3923, "num_input_tokens_seen": 6677632, "step": 10230 }, { "epoch": 5.364255765199162, "grad_norm": 0.708026647567749, "learning_rate": 2.6188717252479968e-05, "loss": 0.3934, "num_input_tokens_seen": 6680768, "step": 10235 }, { "epoch": 5.366876310272537, "grad_norm": 0.504084050655365, "learning_rate": 2.6165874054620552e-05, "loss": 0.5016, "num_input_tokens_seen": 6683712, "step": 10240 }, { "epoch": 5.369496855345912, "grad_norm": 0.40211591124534607, "learning_rate": 2.614302988121099e-05, "loss": 0.5242, "num_input_tokens_seen": 6688512, "step": 10245 }, { "epoch": 5.372117400419287, "grad_norm": 0.3481143116950989, "learning_rate": 2.6120184751366238e-05, "loss": 0.4383, "num_input_tokens_seen": 6691520, "step": 10250 }, { "epoch": 5.3747379454926625, "grad_norm": 0.29393184185028076, "learning_rate": 2.6097338684202043e-05, "loss": 0.539, "num_input_tokens_seen": 6695360, "step": 10255 }, { "epoch": 5.377358490566038, "grad_norm": 0.2899297773838043, "learning_rate": 2.607449169883497e-05, "loss": 0.4616, "num_input_tokens_seen": 6698688, "step": 10260 }, { "epoch": 5.379979035639413, "grad_norm": 0.6162552237510681, "learning_rate": 2.605164381438232e-05, "loss": 0.5393, "num_input_tokens_seen": 6701312, "step": 10265 }, { "epoch": 5.382599580712788, "grad_norm": 0.4531908333301544, "learning_rate": 2.6028795049962167e-05, "loss": 0.4704, "num_input_tokens_seen": 6703840, "step": 10270 }, { "epoch": 5.385220125786163, "grad_norm": 0.34828463196754456, "learning_rate": 2.600594542469331e-05, "loss": 0.4239, "num_input_tokens_seen": 6706720, "step": 10275 }, { "epoch": 5.387840670859539, "grad_norm": 0.433771014213562, "learning_rate": 2.5983094957695263e-05, "loss": 0.4329, "num_input_tokens_seen": 6710240, "step": 10280 }, { "epoch": 5.390461215932914, "grad_norm": 0.47323334217071533, "learning_rate": 2.596024366808827e-05, "loss": 0.5973, "num_input_tokens_seen": 6713632, "step": 10285 }, { "epoch": 5.3930817610062896, "grad_norm": 0.38315093517303467, "learning_rate": 2.5937391574993238e-05, "loss": 0.4198, "num_input_tokens_seen": 6716032, "step": 10290 }, { "epoch": 5.395702306079665, "grad_norm": 0.42648860812187195, "learning_rate": 2.5914538697531755e-05, "loss": 0.4242, "num_input_tokens_seen": 6719136, "step": 10295 }, { "epoch": 5.39832285115304, "grad_norm": 0.6794592142105103, "learning_rate": 2.5891685054826054e-05, "loss": 0.4499, "num_input_tokens_seen": 6721824, "step": 10300 }, { "epoch": 5.400943396226415, "grad_norm": 0.3243483304977417, "learning_rate": 2.586883066599904e-05, "loss": 0.4589, "num_input_tokens_seen": 6724864, "step": 10305 }, { "epoch": 5.40356394129979, "grad_norm": 0.4748457968235016, "learning_rate": 2.5845975550174206e-05, "loss": 0.3655, "num_input_tokens_seen": 6728064, "step": 10310 }, { "epoch": 5.406184486373165, "grad_norm": 0.4382229745388031, "learning_rate": 2.5823119726475682e-05, "loss": 0.5321, "num_input_tokens_seen": 6730784, "step": 10315 }, { "epoch": 5.408805031446541, "grad_norm": 0.3368225395679474, "learning_rate": 2.5800263214028153e-05, "loss": 0.5378, "num_input_tokens_seen": 6733408, "step": 10320 }, { "epoch": 5.411425576519916, "grad_norm": 0.05419303476810455, "learning_rate": 2.5777406031956935e-05, "loss": 0.4623, "num_input_tokens_seen": 6738912, "step": 10325 }, { "epoch": 5.414046121593292, "grad_norm": 0.6179999709129333, "learning_rate": 2.5754548199387863e-05, "loss": 0.4459, "num_input_tokens_seen": 6742784, "step": 10330 }, { "epoch": 5.416666666666667, "grad_norm": 0.3286914527416229, "learning_rate": 2.5731689735447317e-05, "loss": 0.3891, "num_input_tokens_seen": 6746080, "step": 10335 }, { "epoch": 5.419287211740042, "grad_norm": 0.5157833099365234, "learning_rate": 2.5708830659262218e-05, "loss": 0.5804, "num_input_tokens_seen": 6750368, "step": 10340 }, { "epoch": 5.421907756813417, "grad_norm": 0.5308550596237183, "learning_rate": 2.5685970989960005e-05, "loss": 0.4541, "num_input_tokens_seen": 6753984, "step": 10345 }, { "epoch": 5.4245283018867925, "grad_norm": 0.3944914638996124, "learning_rate": 2.5663110746668612e-05, "loss": 0.57, "num_input_tokens_seen": 6756576, "step": 10350 }, { "epoch": 5.427148846960168, "grad_norm": 0.6252602338790894, "learning_rate": 2.564024994851642e-05, "loss": 0.444, "num_input_tokens_seen": 6759904, "step": 10355 }, { "epoch": 5.429769392033543, "grad_norm": 0.3804458677768707, "learning_rate": 2.561738861463232e-05, "loss": 0.4506, "num_input_tokens_seen": 6762880, "step": 10360 }, { "epoch": 5.432389937106918, "grad_norm": 0.6933711171150208, "learning_rate": 2.559452676414564e-05, "loss": 0.6566, "num_input_tokens_seen": 6765728, "step": 10365 }, { "epoch": 5.435010482180293, "grad_norm": 0.9323596358299255, "learning_rate": 2.5571664416186108e-05, "loss": 0.4349, "num_input_tokens_seen": 6768704, "step": 10370 }, { "epoch": 5.437631027253669, "grad_norm": 0.36293792724609375, "learning_rate": 2.5548801589883913e-05, "loss": 0.3505, "num_input_tokens_seen": 6772256, "step": 10375 }, { "epoch": 5.440251572327044, "grad_norm": 0.3663688600063324, "learning_rate": 2.5525938304369614e-05, "loss": 0.4936, "num_input_tokens_seen": 6775552, "step": 10380 }, { "epoch": 5.4428721174004195, "grad_norm": 0.5846678614616394, "learning_rate": 2.5503074578774166e-05, "loss": 0.3872, "num_input_tokens_seen": 6778176, "step": 10385 }, { "epoch": 5.445492662473795, "grad_norm": 0.3669678568840027, "learning_rate": 2.5480210432228886e-05, "loss": 0.4635, "num_input_tokens_seen": 6781888, "step": 10390 }, { "epoch": 5.44811320754717, "grad_norm": 0.3550354242324829, "learning_rate": 2.5457345883865457e-05, "loss": 0.4673, "num_input_tokens_seen": 6785056, "step": 10395 }, { "epoch": 5.450733752620545, "grad_norm": 0.6486369371414185, "learning_rate": 2.5434480952815877e-05, "loss": 0.458, "num_input_tokens_seen": 6788832, "step": 10400 }, { "epoch": 5.45335429769392, "grad_norm": 0.4273740351200104, "learning_rate": 2.5411615658212478e-05, "loss": 0.4512, "num_input_tokens_seen": 6792320, "step": 10405 }, { "epoch": 5.455974842767295, "grad_norm": 0.42629534006118774, "learning_rate": 2.5388750019187912e-05, "loss": 0.569, "num_input_tokens_seen": 6796000, "step": 10410 }, { "epoch": 5.4585953878406706, "grad_norm": 0.6626965999603271, "learning_rate": 2.5365884054875084e-05, "loss": 0.4575, "num_input_tokens_seen": 6798400, "step": 10415 }, { "epoch": 5.461215932914046, "grad_norm": 0.6252530813217163, "learning_rate": 2.5343017784407184e-05, "loss": 0.5197, "num_input_tokens_seen": 6801440, "step": 10420 }, { "epoch": 5.463836477987422, "grad_norm": 0.5086711049079895, "learning_rate": 2.532015122691767e-05, "loss": 0.4461, "num_input_tokens_seen": 6807072, "step": 10425 }, { "epoch": 5.466457023060797, "grad_norm": 0.5774171948432922, "learning_rate": 2.5297284401540243e-05, "loss": 0.4782, "num_input_tokens_seen": 6810272, "step": 10430 }, { "epoch": 5.469077568134172, "grad_norm": 0.4799441993236542, "learning_rate": 2.5274417327408805e-05, "loss": 0.4159, "num_input_tokens_seen": 6813184, "step": 10435 }, { "epoch": 5.471698113207547, "grad_norm": 0.4371941089630127, "learning_rate": 2.5251550023657478e-05, "loss": 0.4687, "num_input_tokens_seen": 6817824, "step": 10440 }, { "epoch": 5.4743186582809225, "grad_norm": 0.39258143305778503, "learning_rate": 2.5228682509420582e-05, "loss": 0.4283, "num_input_tokens_seen": 6820480, "step": 10445 }, { "epoch": 5.476939203354298, "grad_norm": 0.8576293587684631, "learning_rate": 2.5205814803832617e-05, "loss": 0.5134, "num_input_tokens_seen": 6823360, "step": 10450 }, { "epoch": 5.479559748427673, "grad_norm": 0.4778810441493988, "learning_rate": 2.518294692602821e-05, "loss": 0.5009, "num_input_tokens_seen": 6825856, "step": 10455 }, { "epoch": 5.482180293501048, "grad_norm": 0.7644977569580078, "learning_rate": 2.5160078895142186e-05, "loss": 0.5287, "num_input_tokens_seen": 6828896, "step": 10460 }, { "epoch": 5.484800838574423, "grad_norm": 0.43202686309814453, "learning_rate": 2.5137210730309447e-05, "loss": 0.5151, "num_input_tokens_seen": 6832032, "step": 10465 }, { "epoch": 5.487421383647799, "grad_norm": 0.739298939704895, "learning_rate": 2.5114342450665034e-05, "loss": 0.5328, "num_input_tokens_seen": 6834624, "step": 10470 }, { "epoch": 5.490041928721174, "grad_norm": 0.533668041229248, "learning_rate": 2.509147407534409e-05, "loss": 0.6291, "num_input_tokens_seen": 6837504, "step": 10475 }, { "epoch": 5.4926624737945495, "grad_norm": 0.5696955323219299, "learning_rate": 2.5068605623481816e-05, "loss": 0.4945, "num_input_tokens_seen": 6840224, "step": 10480 }, { "epoch": 5.495283018867925, "grad_norm": 0.270507276058197, "learning_rate": 2.5045737114213487e-05, "loss": 0.36, "num_input_tokens_seen": 6843520, "step": 10485 }, { "epoch": 5.4979035639413, "grad_norm": 0.30608803033828735, "learning_rate": 2.502286856667443e-05, "loss": 0.4493, "num_input_tokens_seen": 6846464, "step": 10490 }, { "epoch": 5.5, "eval_loss": 0.4865591526031494, "eval_runtime": 14.5715, "eval_samples_per_second": 58.196, "eval_steps_per_second": 14.549, "num_input_tokens_seen": 6849184, "step": 10494 }, { "epoch": 5.500524109014675, "grad_norm": 0.5477892160415649, "learning_rate": 2.5e-05, "loss": 0.4947, "num_input_tokens_seen": 6849696, "step": 10495 }, { "epoch": 5.50314465408805, "grad_norm": 0.442659467458725, "learning_rate": 2.497713143332557e-05, "loss": 0.6089, "num_input_tokens_seen": 6853216, "step": 10500 }, { "epoch": 5.505765199161425, "grad_norm": 0.5485686659812927, "learning_rate": 2.495426288578652e-05, "loss": 0.4399, "num_input_tokens_seen": 6855872, "step": 10505 }, { "epoch": 5.5083857442348005, "grad_norm": 0.5887805819511414, "learning_rate": 2.493139437651819e-05, "loss": 0.3965, "num_input_tokens_seen": 6858688, "step": 10510 }, { "epoch": 5.511006289308176, "grad_norm": 0.48066309094429016, "learning_rate": 2.490852592465591e-05, "loss": 0.4195, "num_input_tokens_seen": 6861632, "step": 10515 }, { "epoch": 5.513626834381551, "grad_norm": 0.3311723470687866, "learning_rate": 2.488565754933497e-05, "loss": 0.6555, "num_input_tokens_seen": 6865696, "step": 10520 }, { "epoch": 5.516247379454927, "grad_norm": 0.5074434280395508, "learning_rate": 2.486278926969056e-05, "loss": 0.4827, "num_input_tokens_seen": 6869056, "step": 10525 }, { "epoch": 5.518867924528302, "grad_norm": 0.9758118391036987, "learning_rate": 2.483992110485782e-05, "loss": 0.4434, "num_input_tokens_seen": 6871488, "step": 10530 }, { "epoch": 5.521488469601677, "grad_norm": 0.20335941016674042, "learning_rate": 2.4817053073971792e-05, "loss": 0.4928, "num_input_tokens_seen": 6875008, "step": 10535 }, { "epoch": 5.524109014675052, "grad_norm": 0.732214093208313, "learning_rate": 2.4794185196167392e-05, "loss": 0.6645, "num_input_tokens_seen": 6878336, "step": 10540 }, { "epoch": 5.526729559748428, "grad_norm": 0.30760458111763, "learning_rate": 2.477131749057942e-05, "loss": 0.4465, "num_input_tokens_seen": 6882080, "step": 10545 }, { "epoch": 5.529350104821803, "grad_norm": 0.3257649838924408, "learning_rate": 2.4748449976342524e-05, "loss": 0.3865, "num_input_tokens_seen": 6885504, "step": 10550 }, { "epoch": 5.531970649895178, "grad_norm": 0.38638290762901306, "learning_rate": 2.47255826725912e-05, "loss": 0.5107, "num_input_tokens_seen": 6888704, "step": 10555 }, { "epoch": 5.534591194968553, "grad_norm": 0.5588316321372986, "learning_rate": 2.4702715598459766e-05, "loss": 0.5471, "num_input_tokens_seen": 6891936, "step": 10560 }, { "epoch": 5.537211740041929, "grad_norm": 0.46061161160469055, "learning_rate": 2.467984877308233e-05, "loss": 0.544, "num_input_tokens_seen": 6894848, "step": 10565 }, { "epoch": 5.539832285115304, "grad_norm": 0.5334917902946472, "learning_rate": 2.4656982215592818e-05, "loss": 0.4615, "num_input_tokens_seen": 6897728, "step": 10570 }, { "epoch": 5.5424528301886795, "grad_norm": 0.7135491371154785, "learning_rate": 2.463411594512493e-05, "loss": 0.4045, "num_input_tokens_seen": 6900672, "step": 10575 }, { "epoch": 5.545073375262055, "grad_norm": 0.44172295928001404, "learning_rate": 2.4611249980812094e-05, "loss": 0.4228, "num_input_tokens_seen": 6905920, "step": 10580 }, { "epoch": 5.54769392033543, "grad_norm": 0.3760543763637543, "learning_rate": 2.4588384341787518e-05, "loss": 0.4672, "num_input_tokens_seen": 6909408, "step": 10585 }, { "epoch": 5.550314465408805, "grad_norm": 0.5203379392623901, "learning_rate": 2.456551904718413e-05, "loss": 0.4547, "num_input_tokens_seen": 6912128, "step": 10590 }, { "epoch": 5.55293501048218, "grad_norm": 0.4808685779571533, "learning_rate": 2.454265411613455e-05, "loss": 0.511, "num_input_tokens_seen": 6915136, "step": 10595 }, { "epoch": 5.555555555555555, "grad_norm": 0.44097378849983215, "learning_rate": 2.4519789567771116e-05, "loss": 0.4655, "num_input_tokens_seen": 6917856, "step": 10600 }, { "epoch": 5.5581761006289305, "grad_norm": 0.4398902654647827, "learning_rate": 2.4496925421225847e-05, "loss": 0.4374, "num_input_tokens_seen": 6920864, "step": 10605 }, { "epoch": 5.560796645702306, "grad_norm": 0.4265136420726776, "learning_rate": 2.4474061695630395e-05, "loss": 0.6594, "num_input_tokens_seen": 6924384, "step": 10610 }, { "epoch": 5.563417190775681, "grad_norm": 0.48630568385124207, "learning_rate": 2.4451198410116086e-05, "loss": 0.5023, "num_input_tokens_seen": 6926848, "step": 10615 }, { "epoch": 5.566037735849057, "grad_norm": 0.2911646366119385, "learning_rate": 2.4428335583813898e-05, "loss": 0.3861, "num_input_tokens_seen": 6929536, "step": 10620 }, { "epoch": 5.568658280922432, "grad_norm": 0.6474888920783997, "learning_rate": 2.4405473235854367e-05, "loss": 0.5326, "num_input_tokens_seen": 6932064, "step": 10625 }, { "epoch": 5.571278825995807, "grad_norm": 0.5091709494590759, "learning_rate": 2.4382611385367678e-05, "loss": 0.4419, "num_input_tokens_seen": 6935296, "step": 10630 }, { "epoch": 5.573899371069182, "grad_norm": 0.5684879422187805, "learning_rate": 2.4359750051483584e-05, "loss": 0.4138, "num_input_tokens_seen": 6937824, "step": 10635 }, { "epoch": 5.576519916142558, "grad_norm": 0.3848485052585602, "learning_rate": 2.4336889253331397e-05, "loss": 0.471, "num_input_tokens_seen": 6941600, "step": 10640 }, { "epoch": 5.579140461215933, "grad_norm": 0.3897337019443512, "learning_rate": 2.4314029010040004e-05, "loss": 0.4641, "num_input_tokens_seen": 6944608, "step": 10645 }, { "epoch": 5.581761006289308, "grad_norm": 0.6035739183425903, "learning_rate": 2.429116934073779e-05, "loss": 0.5401, "num_input_tokens_seen": 6948000, "step": 10650 }, { "epoch": 5.584381551362683, "grad_norm": 0.3371812403202057, "learning_rate": 2.426831026455269e-05, "loss": 0.3652, "num_input_tokens_seen": 6951424, "step": 10655 }, { "epoch": 5.587002096436059, "grad_norm": 0.4008481502532959, "learning_rate": 2.424545180061215e-05, "loss": 0.4274, "num_input_tokens_seen": 6956000, "step": 10660 }, { "epoch": 5.589622641509434, "grad_norm": 0.4593120217323303, "learning_rate": 2.422259396804307e-05, "loss": 0.4777, "num_input_tokens_seen": 6958944, "step": 10665 }, { "epoch": 5.5922431865828095, "grad_norm": 0.23645460605621338, "learning_rate": 2.4199736785971846e-05, "loss": 0.4142, "num_input_tokens_seen": 6962304, "step": 10670 }, { "epoch": 5.594863731656185, "grad_norm": 0.50884610414505, "learning_rate": 2.417688027352433e-05, "loss": 0.5695, "num_input_tokens_seen": 6965632, "step": 10675 }, { "epoch": 5.59748427672956, "grad_norm": 0.24417608976364136, "learning_rate": 2.41540244498258e-05, "loss": 0.546, "num_input_tokens_seen": 6969312, "step": 10680 }, { "epoch": 5.600104821802935, "grad_norm": 0.32217004895210266, "learning_rate": 2.4131169334000963e-05, "loss": 0.3894, "num_input_tokens_seen": 6972576, "step": 10685 }, { "epoch": 5.60272536687631, "grad_norm": 0.43840789794921875, "learning_rate": 2.4108314945173955e-05, "loss": 0.3503, "num_input_tokens_seen": 6975872, "step": 10690 }, { "epoch": 5.605345911949685, "grad_norm": 0.2599007785320282, "learning_rate": 2.4085461302468254e-05, "loss": 0.36, "num_input_tokens_seen": 6978784, "step": 10695 }, { "epoch": 5.6079664570230605, "grad_norm": 0.35568079352378845, "learning_rate": 2.4062608425006765e-05, "loss": 0.6636, "num_input_tokens_seen": 6981408, "step": 10700 }, { "epoch": 5.610587002096436, "grad_norm": 0.4170653223991394, "learning_rate": 2.4039756331911737e-05, "loss": 0.4228, "num_input_tokens_seen": 6984800, "step": 10705 }, { "epoch": 5.613207547169811, "grad_norm": 0.42547348141670227, "learning_rate": 2.401690504230474e-05, "loss": 0.4637, "num_input_tokens_seen": 6988192, "step": 10710 }, { "epoch": 5.615828092243187, "grad_norm": 0.39529678225517273, "learning_rate": 2.3994054575306698e-05, "loss": 0.3791, "num_input_tokens_seen": 6992416, "step": 10715 }, { "epoch": 5.618448637316562, "grad_norm": 0.6587173938751221, "learning_rate": 2.397120495003784e-05, "loss": 0.4133, "num_input_tokens_seen": 6995520, "step": 10720 }, { "epoch": 5.621069182389937, "grad_norm": 0.4624587893486023, "learning_rate": 2.394835618561768e-05, "loss": 0.4229, "num_input_tokens_seen": 6998464, "step": 10725 }, { "epoch": 5.623689727463312, "grad_norm": 0.4116639494895935, "learning_rate": 2.3925508301165043e-05, "loss": 0.4417, "num_input_tokens_seen": 7001600, "step": 10730 }, { "epoch": 5.626310272536688, "grad_norm": 0.4653794765472412, "learning_rate": 2.390266131579796e-05, "loss": 0.4824, "num_input_tokens_seen": 7004704, "step": 10735 }, { "epoch": 5.628930817610063, "grad_norm": 0.5267269611358643, "learning_rate": 2.3879815248633768e-05, "loss": 0.5645, "num_input_tokens_seen": 7007424, "step": 10740 }, { "epoch": 5.631551362683438, "grad_norm": 0.4267626702785492, "learning_rate": 2.385697011878902e-05, "loss": 0.4086, "num_input_tokens_seen": 7010688, "step": 10745 }, { "epoch": 5.634171907756813, "grad_norm": 0.31881552934646606, "learning_rate": 2.383412594537945e-05, "loss": 0.358, "num_input_tokens_seen": 7014048, "step": 10750 }, { "epoch": 5.636792452830189, "grad_norm": 0.5208503007888794, "learning_rate": 2.3811282747520038e-05, "loss": 0.4172, "num_input_tokens_seen": 7017856, "step": 10755 }, { "epoch": 5.639412997903564, "grad_norm": 0.409127801656723, "learning_rate": 2.378844054432493e-05, "loss": 0.3732, "num_input_tokens_seen": 7021248, "step": 10760 }, { "epoch": 5.6420335429769395, "grad_norm": 0.5576200485229492, "learning_rate": 2.3765599354907427e-05, "loss": 0.458, "num_input_tokens_seen": 7024320, "step": 10765 }, { "epoch": 5.644654088050315, "grad_norm": 0.9239394664764404, "learning_rate": 2.374275919838e-05, "loss": 0.5125, "num_input_tokens_seen": 7027296, "step": 10770 }, { "epoch": 5.64727463312369, "grad_norm": 0.27732670307159424, "learning_rate": 2.371992009385425e-05, "loss": 0.4667, "num_input_tokens_seen": 7030784, "step": 10775 }, { "epoch": 5.649895178197065, "grad_norm": 0.25571319460868835, "learning_rate": 2.369708206044089e-05, "loss": 0.6774, "num_input_tokens_seen": 7034112, "step": 10780 }, { "epoch": 5.65251572327044, "grad_norm": 0.42470085620880127, "learning_rate": 2.3674245117249747e-05, "loss": 0.3765, "num_input_tokens_seen": 7039072, "step": 10785 }, { "epoch": 5.655136268343815, "grad_norm": 0.5512936115264893, "learning_rate": 2.3651409283389743e-05, "loss": 0.4302, "num_input_tokens_seen": 7042304, "step": 10790 }, { "epoch": 5.6577568134171905, "grad_norm": 0.38536542654037476, "learning_rate": 2.3628574577968834e-05, "loss": 0.5266, "num_input_tokens_seen": 7046400, "step": 10795 }, { "epoch": 5.660377358490566, "grad_norm": 0.428395539522171, "learning_rate": 2.360574102009408e-05, "loss": 0.4542, "num_input_tokens_seen": 7050656, "step": 10800 }, { "epoch": 5.662997903563941, "grad_norm": 0.49510738253593445, "learning_rate": 2.3582908628871554e-05, "loss": 0.4679, "num_input_tokens_seen": 7053824, "step": 10805 }, { "epoch": 5.665618448637317, "grad_norm": 0.5686017870903015, "learning_rate": 2.3560077423406355e-05, "loss": 0.5271, "num_input_tokens_seen": 7057760, "step": 10810 }, { "epoch": 5.668238993710692, "grad_norm": 0.36749008297920227, "learning_rate": 2.3537247422802595e-05, "loss": 0.4046, "num_input_tokens_seen": 7061888, "step": 10815 }, { "epoch": 5.670859538784067, "grad_norm": 0.3755805492401123, "learning_rate": 2.351441864616338e-05, "loss": 0.3565, "num_input_tokens_seen": 7064960, "step": 10820 }, { "epoch": 5.673480083857442, "grad_norm": 0.33737462759017944, "learning_rate": 2.3491591112590776e-05, "loss": 0.4328, "num_input_tokens_seen": 7067968, "step": 10825 }, { "epoch": 5.676100628930818, "grad_norm": 0.266659677028656, "learning_rate": 2.346876484118584e-05, "loss": 0.5305, "num_input_tokens_seen": 7071424, "step": 10830 }, { "epoch": 5.678721174004193, "grad_norm": 0.591185450553894, "learning_rate": 2.3445939851048533e-05, "loss": 0.54, "num_input_tokens_seen": 7074656, "step": 10835 }, { "epoch": 5.681341719077568, "grad_norm": 0.5673957467079163, "learning_rate": 2.342311616127777e-05, "loss": 0.497, "num_input_tokens_seen": 7077952, "step": 10840 }, { "epoch": 5.683962264150943, "grad_norm": 0.46326744556427, "learning_rate": 2.3400293790971378e-05, "loss": 0.3504, "num_input_tokens_seen": 7080512, "step": 10845 }, { "epoch": 5.686582809224319, "grad_norm": 0.7103022336959839, "learning_rate": 2.3377472759226064e-05, "loss": 0.4225, "num_input_tokens_seen": 7083904, "step": 10850 }, { "epoch": 5.689203354297694, "grad_norm": 0.530667781829834, "learning_rate": 2.3354653085137433e-05, "loss": 0.4674, "num_input_tokens_seen": 7086144, "step": 10855 }, { "epoch": 5.6918238993710695, "grad_norm": 0.35143667459487915, "learning_rate": 2.333183478779995e-05, "loss": 0.46, "num_input_tokens_seen": 7088608, "step": 10860 }, { "epoch": 5.694444444444445, "grad_norm": 0.2549031376838684, "learning_rate": 2.330901788630691e-05, "loss": 0.3974, "num_input_tokens_seen": 7091296, "step": 10865 }, { "epoch": 5.69706498951782, "grad_norm": 0.5067264437675476, "learning_rate": 2.3286202399750463e-05, "loss": 0.446, "num_input_tokens_seen": 7095008, "step": 10870 }, { "epoch": 5.699685534591195, "grad_norm": 0.7068521976470947, "learning_rate": 2.3263388347221575e-05, "loss": 0.4393, "num_input_tokens_seen": 7098016, "step": 10875 }, { "epoch": 5.70230607966457, "grad_norm": 0.31318753957748413, "learning_rate": 2.3240575747809984e-05, "loss": 0.3938, "num_input_tokens_seen": 7101056, "step": 10880 }, { "epoch": 5.704926624737945, "grad_norm": 0.5784431099891663, "learning_rate": 2.3217764620604233e-05, "loss": 0.3798, "num_input_tokens_seen": 7103488, "step": 10885 }, { "epoch": 5.7075471698113205, "grad_norm": 0.3511040210723877, "learning_rate": 2.3194954984691656e-05, "loss": 0.4048, "num_input_tokens_seen": 7107072, "step": 10890 }, { "epoch": 5.710167714884696, "grad_norm": 0.6126925945281982, "learning_rate": 2.3172146859158282e-05, "loss": 0.4551, "num_input_tokens_seen": 7110336, "step": 10895 }, { "epoch": 5.712788259958071, "grad_norm": 0.4220244288444519, "learning_rate": 2.314934026308893e-05, "loss": 0.5432, "num_input_tokens_seen": 7114240, "step": 10900 }, { "epoch": 5.715408805031447, "grad_norm": 0.3187107443809509, "learning_rate": 2.3126535215567112e-05, "loss": 0.5379, "num_input_tokens_seen": 7121216, "step": 10905 }, { "epoch": 5.718029350104822, "grad_norm": 0.8567295670509338, "learning_rate": 2.3103731735675045e-05, "loss": 0.425, "num_input_tokens_seen": 7124800, "step": 10910 }, { "epoch": 5.720649895178197, "grad_norm": 0.5054190754890442, "learning_rate": 2.308092984249365e-05, "loss": 0.5214, "num_input_tokens_seen": 7128224, "step": 10915 }, { "epoch": 5.723270440251572, "grad_norm": 0.5978290438652039, "learning_rate": 2.3058129555102498e-05, "loss": 0.3888, "num_input_tokens_seen": 7131456, "step": 10920 }, { "epoch": 5.725890985324948, "grad_norm": 0.48426035046577454, "learning_rate": 2.3035330892579825e-05, "loss": 0.4378, "num_input_tokens_seen": 7134944, "step": 10925 }, { "epoch": 5.728511530398323, "grad_norm": 0.24162085354328156, "learning_rate": 2.3012533874002534e-05, "loss": 0.5938, "num_input_tokens_seen": 7139200, "step": 10930 }, { "epoch": 5.731132075471698, "grad_norm": 0.5502359867095947, "learning_rate": 2.2989738518446104e-05, "loss": 0.4241, "num_input_tokens_seen": 7141792, "step": 10935 }, { "epoch": 5.733752620545073, "grad_norm": 0.33472442626953125, "learning_rate": 2.2966944844984658e-05, "loss": 0.414, "num_input_tokens_seen": 7145152, "step": 10940 }, { "epoch": 5.736373165618449, "grad_norm": 0.4243567883968353, "learning_rate": 2.29441528726909e-05, "loss": 0.5848, "num_input_tokens_seen": 7148192, "step": 10945 }, { "epoch": 5.738993710691824, "grad_norm": 0.3813316822052002, "learning_rate": 2.292136262063611e-05, "loss": 0.3735, "num_input_tokens_seen": 7151520, "step": 10950 }, { "epoch": 5.7416142557651995, "grad_norm": 0.6975141167640686, "learning_rate": 2.289857410789013e-05, "loss": 0.3751, "num_input_tokens_seen": 7154496, "step": 10955 }, { "epoch": 5.744234800838575, "grad_norm": 0.3014630079269409, "learning_rate": 2.287578735352136e-05, "loss": 0.5085, "num_input_tokens_seen": 7157600, "step": 10960 }, { "epoch": 5.74685534591195, "grad_norm": 0.5755729079246521, "learning_rate": 2.285300237659668e-05, "loss": 0.5394, "num_input_tokens_seen": 7160224, "step": 10965 }, { "epoch": 5.749475890985325, "grad_norm": 0.51036137342453, "learning_rate": 2.283021919618155e-05, "loss": 0.373, "num_input_tokens_seen": 7162720, "step": 10970 }, { "epoch": 5.7520964360587, "grad_norm": 0.71729576587677, "learning_rate": 2.28074378313399e-05, "loss": 0.5728, "num_input_tokens_seen": 7166240, "step": 10975 }, { "epoch": 5.754716981132075, "grad_norm": 0.47562816739082336, "learning_rate": 2.2784658301134105e-05, "loss": 0.5304, "num_input_tokens_seen": 7172480, "step": 10980 }, { "epoch": 5.7573375262054505, "grad_norm": 0.3914123475551605, "learning_rate": 2.2761880624625048e-05, "loss": 0.53, "num_input_tokens_seen": 7176416, "step": 10985 }, { "epoch": 5.759958071278826, "grad_norm": 0.3546169698238373, "learning_rate": 2.2739104820872062e-05, "loss": 0.5849, "num_input_tokens_seen": 7180832, "step": 10990 }, { "epoch": 5.762578616352201, "grad_norm": 0.5603137016296387, "learning_rate": 2.271633090893288e-05, "loss": 0.4856, "num_input_tokens_seen": 7184416, "step": 10995 }, { "epoch": 5.765199161425577, "grad_norm": 0.5413626432418823, "learning_rate": 2.269355890786368e-05, "loss": 0.426, "num_input_tokens_seen": 7186816, "step": 11000 }, { "epoch": 5.767819706498952, "grad_norm": 0.4955574572086334, "learning_rate": 2.2670788836719037e-05, "loss": 0.5796, "num_input_tokens_seen": 7189824, "step": 11005 }, { "epoch": 5.770440251572327, "grad_norm": 0.5084646940231323, "learning_rate": 2.2648020714551897e-05, "loss": 0.5116, "num_input_tokens_seen": 7192640, "step": 11010 }, { "epoch": 5.773060796645702, "grad_norm": 0.40992528200149536, "learning_rate": 2.26252545604136e-05, "loss": 0.4498, "num_input_tokens_seen": 7195328, "step": 11015 }, { "epoch": 5.7756813417190775, "grad_norm": 0.936614453792572, "learning_rate": 2.2602490393353798e-05, "loss": 0.6842, "num_input_tokens_seen": 7198464, "step": 11020 }, { "epoch": 5.778301886792453, "grad_norm": 0.2730365991592407, "learning_rate": 2.2579728232420525e-05, "loss": 0.4549, "num_input_tokens_seen": 7202080, "step": 11025 }, { "epoch": 5.780922431865828, "grad_norm": 0.4342687427997589, "learning_rate": 2.255696809666012e-05, "loss": 0.4813, "num_input_tokens_seen": 7204832, "step": 11030 }, { "epoch": 5.783542976939203, "grad_norm": 0.5193789005279541, "learning_rate": 2.253421000511721e-05, "loss": 0.4014, "num_input_tokens_seen": 7208128, "step": 11035 }, { "epoch": 5.786163522012579, "grad_norm": 0.3897092044353485, "learning_rate": 2.2511453976834733e-05, "loss": 0.4357, "num_input_tokens_seen": 7211232, "step": 11040 }, { "epoch": 5.788784067085954, "grad_norm": 0.348779559135437, "learning_rate": 2.2488700030853907e-05, "loss": 0.3986, "num_input_tokens_seen": 7214720, "step": 11045 }, { "epoch": 5.7914046121593294, "grad_norm": 0.5473248362541199, "learning_rate": 2.2465948186214175e-05, "loss": 0.5739, "num_input_tokens_seen": 7219200, "step": 11050 }, { "epoch": 5.794025157232705, "grad_norm": 0.32407146692276, "learning_rate": 2.244319846195325e-05, "loss": 0.4734, "num_input_tokens_seen": 7223808, "step": 11055 }, { "epoch": 5.79664570230608, "grad_norm": 0.8559238910675049, "learning_rate": 2.2420450877107075e-05, "loss": 0.5015, "num_input_tokens_seen": 7227264, "step": 11060 }, { "epoch": 5.799266247379455, "grad_norm": 0.5468615293502808, "learning_rate": 2.2397705450709763e-05, "loss": 0.5579, "num_input_tokens_seen": 7232672, "step": 11065 }, { "epoch": 5.80188679245283, "grad_norm": 1.121859073638916, "learning_rate": 2.237496220179366e-05, "loss": 0.4418, "num_input_tokens_seen": 7235584, "step": 11070 }, { "epoch": 5.804507337526205, "grad_norm": 0.44844475388526917, "learning_rate": 2.235222114938929e-05, "loss": 0.5397, "num_input_tokens_seen": 7238592, "step": 11075 }, { "epoch": 5.8071278825995805, "grad_norm": 0.5043392181396484, "learning_rate": 2.232948231252531e-05, "loss": 0.4958, "num_input_tokens_seen": 7241952, "step": 11080 }, { "epoch": 5.809748427672956, "grad_norm": 0.4140024185180664, "learning_rate": 2.2306745710228545e-05, "loss": 0.4313, "num_input_tokens_seen": 7245056, "step": 11085 }, { "epoch": 5.812368972746331, "grad_norm": 0.4682360589504242, "learning_rate": 2.2284011361523954e-05, "loss": 0.4649, "num_input_tokens_seen": 7248896, "step": 11090 }, { "epoch": 5.814989517819707, "grad_norm": 0.35760411620140076, "learning_rate": 2.2261279285434588e-05, "loss": 0.5291, "num_input_tokens_seen": 7251680, "step": 11095 }, { "epoch": 5.817610062893082, "grad_norm": 0.403109073638916, "learning_rate": 2.2238549500981626e-05, "loss": 0.5815, "num_input_tokens_seen": 7254432, "step": 11100 }, { "epoch": 5.820230607966457, "grad_norm": 1.9383679628372192, "learning_rate": 2.2215822027184294e-05, "loss": 0.5147, "num_input_tokens_seen": 7257280, "step": 11105 }, { "epoch": 5.822851153039832, "grad_norm": 0.3200426995754242, "learning_rate": 2.2193096883059913e-05, "loss": 0.4261, "num_input_tokens_seen": 7259840, "step": 11110 }, { "epoch": 5.8254716981132075, "grad_norm": 0.587677001953125, "learning_rate": 2.2170374087623853e-05, "loss": 0.4459, "num_input_tokens_seen": 7262624, "step": 11115 }, { "epoch": 5.828092243186583, "grad_norm": 0.29947102069854736, "learning_rate": 2.2147653659889494e-05, "loss": 0.4274, "num_input_tokens_seen": 7265568, "step": 11120 }, { "epoch": 5.830712788259958, "grad_norm": 0.6447634696960449, "learning_rate": 2.2124935618868266e-05, "loss": 0.3965, "num_input_tokens_seen": 7268896, "step": 11125 }, { "epoch": 5.833333333333333, "grad_norm": 0.6073199510574341, "learning_rate": 2.210221998356959e-05, "loss": 0.3629, "num_input_tokens_seen": 7271328, "step": 11130 }, { "epoch": 5.835953878406709, "grad_norm": 0.5388931632041931, "learning_rate": 2.2079506773000862e-05, "loss": 0.5576, "num_input_tokens_seen": 7274752, "step": 11135 }, { "epoch": 5.838574423480084, "grad_norm": 0.2910420000553131, "learning_rate": 2.205679600616746e-05, "loss": 0.5623, "num_input_tokens_seen": 7279168, "step": 11140 }, { "epoch": 5.841194968553459, "grad_norm": 0.43245044350624084, "learning_rate": 2.2034087702072736e-05, "loss": 0.489, "num_input_tokens_seen": 7282944, "step": 11145 }, { "epoch": 5.843815513626835, "grad_norm": 0.5123026967048645, "learning_rate": 2.2011381879717928e-05, "loss": 0.5153, "num_input_tokens_seen": 7285824, "step": 11150 }, { "epoch": 5.84643605870021, "grad_norm": 0.4747774004936218, "learning_rate": 2.1988678558102255e-05, "loss": 0.4941, "num_input_tokens_seen": 7288384, "step": 11155 }, { "epoch": 5.849056603773585, "grad_norm": 0.4950191080570221, "learning_rate": 2.1965977756222816e-05, "loss": 0.545, "num_input_tokens_seen": 7291840, "step": 11160 }, { "epoch": 5.85167714884696, "grad_norm": 0.4602759778499603, "learning_rate": 2.1943279493074595e-05, "loss": 0.4172, "num_input_tokens_seen": 7296032, "step": 11165 }, { "epoch": 5.854297693920335, "grad_norm": 0.595546305179596, "learning_rate": 2.192058378765047e-05, "loss": 0.6118, "num_input_tokens_seen": 7298560, "step": 11170 }, { "epoch": 5.8569182389937104, "grad_norm": 0.4073837995529175, "learning_rate": 2.1897890658941175e-05, "loss": 0.3808, "num_input_tokens_seen": 7301376, "step": 11175 }, { "epoch": 5.859538784067086, "grad_norm": 0.6195172071456909, "learning_rate": 2.1875200125935273e-05, "loss": 0.497, "num_input_tokens_seen": 7304896, "step": 11180 }, { "epoch": 5.862159329140461, "grad_norm": 0.32721614837646484, "learning_rate": 2.185251220761917e-05, "loss": 0.4899, "num_input_tokens_seen": 7307968, "step": 11185 }, { "epoch": 5.864779874213837, "grad_norm": 0.518602192401886, "learning_rate": 2.182982692297709e-05, "loss": 0.4671, "num_input_tokens_seen": 7311040, "step": 11190 }, { "epoch": 5.867400419287212, "grad_norm": 0.4620415270328522, "learning_rate": 2.180714429099102e-05, "loss": 0.4066, "num_input_tokens_seen": 7314848, "step": 11195 }, { "epoch": 5.870020964360587, "grad_norm": 0.40303924679756165, "learning_rate": 2.1784464330640774e-05, "loss": 0.5161, "num_input_tokens_seen": 7317440, "step": 11200 }, { "epoch": 5.872641509433962, "grad_norm": 0.4712480306625366, "learning_rate": 2.1761787060903888e-05, "loss": 0.4565, "num_input_tokens_seen": 7320288, "step": 11205 }, { "epoch": 5.8752620545073375, "grad_norm": 0.5400663018226624, "learning_rate": 2.1739112500755673e-05, "loss": 0.5202, "num_input_tokens_seen": 7323168, "step": 11210 }, { "epoch": 5.877882599580713, "grad_norm": 0.44255825877189636, "learning_rate": 2.1716440669169175e-05, "loss": 0.4629, "num_input_tokens_seen": 7326048, "step": 11215 }, { "epoch": 5.880503144654088, "grad_norm": 0.4567382335662842, "learning_rate": 2.169377158511513e-05, "loss": 0.5111, "num_input_tokens_seen": 7328288, "step": 11220 }, { "epoch": 5.883123689727463, "grad_norm": 0.8754846453666687, "learning_rate": 2.1671105267562e-05, "loss": 0.5249, "num_input_tokens_seen": 7331328, "step": 11225 }, { "epoch": 5.885744234800838, "grad_norm": 0.4280256927013397, "learning_rate": 2.1648441735475936e-05, "loss": 0.4404, "num_input_tokens_seen": 7334720, "step": 11230 }, { "epoch": 5.888364779874214, "grad_norm": 0.3060363531112671, "learning_rate": 2.1625781007820723e-05, "loss": 0.3933, "num_input_tokens_seen": 7337312, "step": 11235 }, { "epoch": 5.890985324947589, "grad_norm": 0.4481584429740906, "learning_rate": 2.160312310355783e-05, "loss": 0.5172, "num_input_tokens_seen": 7340384, "step": 11240 }, { "epoch": 5.893605870020965, "grad_norm": 0.27657943964004517, "learning_rate": 2.1580468041646378e-05, "loss": 0.4877, "num_input_tokens_seen": 7343936, "step": 11245 }, { "epoch": 5.89622641509434, "grad_norm": 0.48312103748321533, "learning_rate": 2.155781584104306e-05, "loss": 0.4724, "num_input_tokens_seen": 7346848, "step": 11250 }, { "epoch": 5.898846960167715, "grad_norm": 0.42753463983535767, "learning_rate": 2.153516652070221e-05, "loss": 0.512, "num_input_tokens_seen": 7350112, "step": 11255 }, { "epoch": 5.90146750524109, "grad_norm": 0.2696671783924103, "learning_rate": 2.1512520099575756e-05, "loss": 0.4182, "num_input_tokens_seen": 7353952, "step": 11260 }, { "epoch": 5.904088050314465, "grad_norm": 0.7576078772544861, "learning_rate": 2.1489876596613176e-05, "loss": 0.5948, "num_input_tokens_seen": 7357440, "step": 11265 }, { "epoch": 5.90670859538784, "grad_norm": 0.40028074383735657, "learning_rate": 2.146723603076152e-05, "loss": 0.5943, "num_input_tokens_seen": 7360800, "step": 11270 }, { "epoch": 5.909329140461216, "grad_norm": 0.5864025354385376, "learning_rate": 2.14445984209654e-05, "loss": 0.5322, "num_input_tokens_seen": 7364032, "step": 11275 }, { "epoch": 5.911949685534591, "grad_norm": 0.5774968266487122, "learning_rate": 2.14219637861669e-05, "loss": 0.5485, "num_input_tokens_seen": 7366880, "step": 11280 }, { "epoch": 5.914570230607967, "grad_norm": 0.30237600207328796, "learning_rate": 2.1399332145305678e-05, "loss": 0.4199, "num_input_tokens_seen": 7369664, "step": 11285 }, { "epoch": 5.917190775681342, "grad_norm": 0.4330461919307709, "learning_rate": 2.1376703517318837e-05, "loss": 0.5787, "num_input_tokens_seen": 7372192, "step": 11290 }, { "epoch": 5.919811320754717, "grad_norm": 0.7408244609832764, "learning_rate": 2.1354077921140984e-05, "loss": 0.5019, "num_input_tokens_seen": 7374720, "step": 11295 }, { "epoch": 5.922431865828092, "grad_norm": 0.5725399851799011, "learning_rate": 2.1331455375704195e-05, "loss": 0.4763, "num_input_tokens_seen": 7378112, "step": 11300 }, { "epoch": 5.9250524109014675, "grad_norm": 0.4633631408214569, "learning_rate": 2.1308835899937972e-05, "loss": 0.4543, "num_input_tokens_seen": 7381728, "step": 11305 }, { "epoch": 5.927672955974843, "grad_norm": 0.4822458028793335, "learning_rate": 2.128621951276926e-05, "loss": 0.5032, "num_input_tokens_seen": 7385504, "step": 11310 }, { "epoch": 5.930293501048218, "grad_norm": 0.3549441993236542, "learning_rate": 2.126360623312243e-05, "loss": 0.3913, "num_input_tokens_seen": 7389600, "step": 11315 }, { "epoch": 5.932914046121593, "grad_norm": 0.5112136006355286, "learning_rate": 2.124099607991922e-05, "loss": 0.4612, "num_input_tokens_seen": 7394240, "step": 11320 }, { "epoch": 5.935534591194968, "grad_norm": 0.3203805387020111, "learning_rate": 2.121838907207879e-05, "loss": 0.4603, "num_input_tokens_seen": 7396704, "step": 11325 }, { "epoch": 5.938155136268344, "grad_norm": 0.3085130453109741, "learning_rate": 2.1195785228517658e-05, "loss": 0.4363, "num_input_tokens_seen": 7400352, "step": 11330 }, { "epoch": 5.940775681341719, "grad_norm": 0.44549888372421265, "learning_rate": 2.117318456814967e-05, "loss": 0.3784, "num_input_tokens_seen": 7402816, "step": 11335 }, { "epoch": 5.943396226415095, "grad_norm": 0.5175673365592957, "learning_rate": 2.1150587109886026e-05, "loss": 0.4571, "num_input_tokens_seen": 7405568, "step": 11340 }, { "epoch": 5.94601677148847, "grad_norm": 0.3818765878677368, "learning_rate": 2.1127992872635263e-05, "loss": 0.4826, "num_input_tokens_seen": 7408608, "step": 11345 }, { "epoch": 5.948637316561845, "grad_norm": 0.6598461866378784, "learning_rate": 2.1105401875303193e-05, "loss": 0.5232, "num_input_tokens_seen": 7412448, "step": 11350 }, { "epoch": 5.95125786163522, "grad_norm": 0.6154507994651794, "learning_rate": 2.1082814136792937e-05, "loss": 0.412, "num_input_tokens_seen": 7415104, "step": 11355 }, { "epoch": 5.953878406708595, "grad_norm": 0.5188581347465515, "learning_rate": 2.1060229676004887e-05, "loss": 0.4259, "num_input_tokens_seen": 7418944, "step": 11360 }, { "epoch": 5.95649895178197, "grad_norm": 0.25027933716773987, "learning_rate": 2.1037648511836675e-05, "loss": 0.3925, "num_input_tokens_seen": 7422816, "step": 11365 }, { "epoch": 5.959119496855346, "grad_norm": 0.49454382061958313, "learning_rate": 2.1015070663183195e-05, "loss": 0.3919, "num_input_tokens_seen": 7425824, "step": 11370 }, { "epoch": 5.961740041928721, "grad_norm": 0.32551562786102295, "learning_rate": 2.0992496148936573e-05, "loss": 0.5402, "num_input_tokens_seen": 7429088, "step": 11375 }, { "epoch": 5.964360587002097, "grad_norm": 0.3389585614204407, "learning_rate": 2.0969924987986107e-05, "loss": 0.5491, "num_input_tokens_seen": 7432736, "step": 11380 }, { "epoch": 5.966981132075472, "grad_norm": 0.5856248140335083, "learning_rate": 2.0947357199218325e-05, "loss": 0.5477, "num_input_tokens_seen": 7435840, "step": 11385 }, { "epoch": 5.969601677148847, "grad_norm": 0.6390650272369385, "learning_rate": 2.0924792801516922e-05, "loss": 0.3988, "num_input_tokens_seen": 7441056, "step": 11390 }, { "epoch": 5.972222222222222, "grad_norm": 0.2570144236087799, "learning_rate": 2.0902231813762753e-05, "loss": 0.4961, "num_input_tokens_seen": 7444640, "step": 11395 }, { "epoch": 5.9748427672955975, "grad_norm": 0.606288492679596, "learning_rate": 2.0879674254833828e-05, "loss": 0.5244, "num_input_tokens_seen": 7448096, "step": 11400 }, { "epoch": 5.977463312368973, "grad_norm": 0.4014296233654022, "learning_rate": 2.085712014360527e-05, "loss": 0.4437, "num_input_tokens_seen": 7451520, "step": 11405 }, { "epoch": 5.980083857442348, "grad_norm": 0.4903198480606079, "learning_rate": 2.0834569498949342e-05, "loss": 0.4232, "num_input_tokens_seen": 7454048, "step": 11410 }, { "epoch": 5.982704402515723, "grad_norm": 1.2617542743682861, "learning_rate": 2.0812022339735395e-05, "loss": 0.4295, "num_input_tokens_seen": 7457216, "step": 11415 }, { "epoch": 5.985324947589098, "grad_norm": 0.3422994911670685, "learning_rate": 2.0789478684829846e-05, "loss": 0.4853, "num_input_tokens_seen": 7460992, "step": 11420 }, { "epoch": 5.987945492662474, "grad_norm": 0.5559880137443542, "learning_rate": 2.0766938553096204e-05, "loss": 0.4497, "num_input_tokens_seen": 7463488, "step": 11425 }, { "epoch": 5.990566037735849, "grad_norm": 0.35355859994888306, "learning_rate": 2.0744401963395027e-05, "loss": 0.4706, "num_input_tokens_seen": 7466304, "step": 11430 }, { "epoch": 5.993186582809225, "grad_norm": 0.5184772610664368, "learning_rate": 2.0721868934583897e-05, "loss": 0.5185, "num_input_tokens_seen": 7470176, "step": 11435 }, { "epoch": 5.9958071278826, "grad_norm": 0.4878290593624115, "learning_rate": 2.0699339485517422e-05, "loss": 0.3711, "num_input_tokens_seen": 7473376, "step": 11440 }, { "epoch": 5.998427672955975, "grad_norm": 0.7106026411056519, "learning_rate": 2.0676813635047225e-05, "loss": 0.4835, "num_input_tokens_seen": 7476864, "step": 11445 }, { "epoch": 6.0, "eval_loss": 0.48562777042388916, "eval_runtime": 14.5393, "eval_samples_per_second": 58.325, "eval_steps_per_second": 14.581, "num_input_tokens_seen": 7478504, "step": 11448 }, { "epoch": 6.00104821802935, "grad_norm": 0.36942556500434875, "learning_rate": 2.0654291402021896e-05, "loss": 0.4162, "num_input_tokens_seen": 7479688, "step": 11450 }, { "epoch": 6.003668763102725, "grad_norm": 0.36787399649620056, "learning_rate": 2.063177280528702e-05, "loss": 0.4103, "num_input_tokens_seen": 7483176, "step": 11455 }, { "epoch": 6.0062893081761, "grad_norm": 0.345916211605072, "learning_rate": 2.0609257863685142e-05, "loss": 0.4876, "num_input_tokens_seen": 7486120, "step": 11460 }, { "epoch": 6.008909853249476, "grad_norm": 0.4440450370311737, "learning_rate": 2.0586746596055706e-05, "loss": 0.4795, "num_input_tokens_seen": 7488872, "step": 11465 }, { "epoch": 6.011530398322851, "grad_norm": 0.7418286800384521, "learning_rate": 2.0564239021235128e-05, "loss": 0.5003, "num_input_tokens_seen": 7491880, "step": 11470 }, { "epoch": 6.014150943396227, "grad_norm": 0.6940468549728394, "learning_rate": 2.0541735158056733e-05, "loss": 0.3971, "num_input_tokens_seen": 7494952, "step": 11475 }, { "epoch": 6.016771488469602, "grad_norm": 0.28494101762771606, "learning_rate": 2.0519235025350704e-05, "loss": 0.5189, "num_input_tokens_seen": 7498600, "step": 11480 }, { "epoch": 6.019392033542977, "grad_norm": 0.4280816912651062, "learning_rate": 2.0496738641944133e-05, "loss": 0.5015, "num_input_tokens_seen": 7502696, "step": 11485 }, { "epoch": 6.022012578616352, "grad_norm": 0.405810683965683, "learning_rate": 2.0474246026660966e-05, "loss": 0.4708, "num_input_tokens_seen": 7505800, "step": 11490 }, { "epoch": 6.0246331236897275, "grad_norm": 0.6717480421066284, "learning_rate": 2.0451757198321992e-05, "loss": 0.47, "num_input_tokens_seen": 7508712, "step": 11495 }, { "epoch": 6.027253668763103, "grad_norm": 0.4726194739341736, "learning_rate": 2.042927217574485e-05, "loss": 0.3836, "num_input_tokens_seen": 7511400, "step": 11500 }, { "epoch": 6.029874213836478, "grad_norm": 0.3483924865722656, "learning_rate": 2.040679097774396e-05, "loss": 0.4556, "num_input_tokens_seen": 7515208, "step": 11505 }, { "epoch": 6.032494758909853, "grad_norm": 0.5868837833404541, "learning_rate": 2.0384313623130565e-05, "loss": 0.3033, "num_input_tokens_seen": 7520712, "step": 11510 }, { "epoch": 6.035115303983228, "grad_norm": 0.2788659334182739, "learning_rate": 2.0361840130712706e-05, "loss": 0.4115, "num_input_tokens_seen": 7522984, "step": 11515 }, { "epoch": 6.037735849056604, "grad_norm": 0.3022781014442444, "learning_rate": 2.033937051929516e-05, "loss": 0.4599, "num_input_tokens_seen": 7526024, "step": 11520 }, { "epoch": 6.040356394129979, "grad_norm": 1.0347964763641357, "learning_rate": 2.0316904807679464e-05, "loss": 0.512, "num_input_tokens_seen": 7529288, "step": 11525 }, { "epoch": 6.0429769392033545, "grad_norm": 0.4331234097480774, "learning_rate": 2.0294443014663923e-05, "loss": 0.4469, "num_input_tokens_seen": 7531752, "step": 11530 }, { "epoch": 6.04559748427673, "grad_norm": 0.6194505095481873, "learning_rate": 2.0271985159043518e-05, "loss": 0.465, "num_input_tokens_seen": 7535112, "step": 11535 }, { "epoch": 6.048218029350105, "grad_norm": 0.595209538936615, "learning_rate": 2.0249531259609965e-05, "loss": 0.4177, "num_input_tokens_seen": 7538344, "step": 11540 }, { "epoch": 6.05083857442348, "grad_norm": 0.917502224445343, "learning_rate": 2.0227081335151675e-05, "loss": 0.4347, "num_input_tokens_seen": 7541064, "step": 11545 }, { "epoch": 6.053459119496855, "grad_norm": 1.5661139488220215, "learning_rate": 2.0204635404453688e-05, "loss": 0.423, "num_input_tokens_seen": 7544808, "step": 11550 }, { "epoch": 6.05607966457023, "grad_norm": 0.4415990710258484, "learning_rate": 2.0182193486297755e-05, "loss": 0.3538, "num_input_tokens_seen": 7548872, "step": 11555 }, { "epoch": 6.058700209643606, "grad_norm": 0.4309420883655548, "learning_rate": 2.0159755599462256e-05, "loss": 0.5421, "num_input_tokens_seen": 7551912, "step": 11560 }, { "epoch": 6.061320754716981, "grad_norm": 0.2827287018299103, "learning_rate": 2.0137321762722166e-05, "loss": 0.3851, "num_input_tokens_seen": 7554856, "step": 11565 }, { "epoch": 6.063941299790357, "grad_norm": 1.5486472845077515, "learning_rate": 2.0114891994849112e-05, "loss": 0.5622, "num_input_tokens_seen": 7558344, "step": 11570 }, { "epoch": 6.066561844863732, "grad_norm": 0.42638832330703735, "learning_rate": 2.0092466314611287e-05, "loss": 0.4854, "num_input_tokens_seen": 7561704, "step": 11575 }, { "epoch": 6.069182389937107, "grad_norm": 0.4389583468437195, "learning_rate": 2.0070044740773487e-05, "loss": 0.4698, "num_input_tokens_seen": 7563816, "step": 11580 }, { "epoch": 6.071802935010482, "grad_norm": 0.35990574955940247, "learning_rate": 2.0047627292097067e-05, "loss": 0.4002, "num_input_tokens_seen": 7566408, "step": 11585 }, { "epoch": 6.0744234800838575, "grad_norm": 0.5742946863174438, "learning_rate": 2.002521398733989e-05, "loss": 0.4409, "num_input_tokens_seen": 7569960, "step": 11590 }, { "epoch": 6.077044025157233, "grad_norm": 0.7356237769126892, "learning_rate": 2.0002804845256423e-05, "loss": 0.411, "num_input_tokens_seen": 7572872, "step": 11595 }, { "epoch": 6.079664570230608, "grad_norm": 0.5313948392868042, "learning_rate": 1.9980399884597605e-05, "loss": 0.588, "num_input_tokens_seen": 7576104, "step": 11600 }, { "epoch": 6.082285115303983, "grad_norm": 0.314179390668869, "learning_rate": 1.995799912411087e-05, "loss": 0.4097, "num_input_tokens_seen": 7578856, "step": 11605 }, { "epoch": 6.084905660377358, "grad_norm": 0.42978140711784363, "learning_rate": 1.993560258254016e-05, "loss": 0.4424, "num_input_tokens_seen": 7582408, "step": 11610 }, { "epoch": 6.087526205450734, "grad_norm": 0.31783246994018555, "learning_rate": 1.9913210278625876e-05, "loss": 0.3943, "num_input_tokens_seen": 7585224, "step": 11615 }, { "epoch": 6.090146750524109, "grad_norm": 0.8687772750854492, "learning_rate": 1.9890822231104872e-05, "loss": 0.4421, "num_input_tokens_seen": 7587880, "step": 11620 }, { "epoch": 6.0927672955974845, "grad_norm": 0.37501823902130127, "learning_rate": 1.9868438458710447e-05, "loss": 0.5187, "num_input_tokens_seen": 7591784, "step": 11625 }, { "epoch": 6.09538784067086, "grad_norm": 0.7600768804550171, "learning_rate": 1.984605898017233e-05, "loss": 0.4832, "num_input_tokens_seen": 7594920, "step": 11630 }, { "epoch": 6.098008385744235, "grad_norm": 0.6843298077583313, "learning_rate": 1.9823683814216622e-05, "loss": 0.5902, "num_input_tokens_seen": 7598504, "step": 11635 }, { "epoch": 6.10062893081761, "grad_norm": 0.3155280649662018, "learning_rate": 1.980131297956586e-05, "loss": 0.5348, "num_input_tokens_seen": 7601640, "step": 11640 }, { "epoch": 6.103249475890985, "grad_norm": 0.5599827170372009, "learning_rate": 1.977894649493894e-05, "loss": 0.4965, "num_input_tokens_seen": 7604520, "step": 11645 }, { "epoch": 6.10587002096436, "grad_norm": 0.22528141736984253, "learning_rate": 1.9756584379051092e-05, "loss": 0.3359, "num_input_tokens_seen": 7608136, "step": 11650 }, { "epoch": 6.1084905660377355, "grad_norm": 0.5280407667160034, "learning_rate": 1.9734226650613928e-05, "loss": 0.5228, "num_input_tokens_seen": 7611592, "step": 11655 }, { "epoch": 6.111111111111111, "grad_norm": 0.9658175706863403, "learning_rate": 1.9711873328335374e-05, "loss": 0.4718, "num_input_tokens_seen": 7614184, "step": 11660 }, { "epoch": 6.113731656184487, "grad_norm": 0.5371577143669128, "learning_rate": 1.9689524430919664e-05, "loss": 0.3919, "num_input_tokens_seen": 7617064, "step": 11665 }, { "epoch": 6.116352201257862, "grad_norm": 0.37381765246391296, "learning_rate": 1.9667179977067344e-05, "loss": 0.4037, "num_input_tokens_seen": 7620680, "step": 11670 }, { "epoch": 6.118972746331237, "grad_norm": 0.397123783826828, "learning_rate": 1.9644839985475216e-05, "loss": 0.4159, "num_input_tokens_seen": 7623944, "step": 11675 }, { "epoch": 6.121593291404612, "grad_norm": 0.7656184434890747, "learning_rate": 1.962250447483638e-05, "loss": 0.4217, "num_input_tokens_seen": 7627144, "step": 11680 }, { "epoch": 6.1242138364779874, "grad_norm": 0.4081661105155945, "learning_rate": 1.960017346384017e-05, "loss": 0.4362, "num_input_tokens_seen": 7630216, "step": 11685 }, { "epoch": 6.126834381551363, "grad_norm": 0.5950894355773926, "learning_rate": 1.9577846971172144e-05, "loss": 0.4651, "num_input_tokens_seen": 7633000, "step": 11690 }, { "epoch": 6.129454926624738, "grad_norm": 0.7803776860237122, "learning_rate": 1.9555525015514097e-05, "loss": 0.3864, "num_input_tokens_seen": 7636104, "step": 11695 }, { "epoch": 6.132075471698113, "grad_norm": 0.5266518592834473, "learning_rate": 1.9533207615544034e-05, "loss": 0.5995, "num_input_tokens_seen": 7639208, "step": 11700 }, { "epoch": 6.134696016771488, "grad_norm": 0.308571994304657, "learning_rate": 1.9510894789936113e-05, "loss": 0.4838, "num_input_tokens_seen": 7642184, "step": 11705 }, { "epoch": 6.137316561844864, "grad_norm": 0.9080867767333984, "learning_rate": 1.9488586557360703e-05, "loss": 0.4908, "num_input_tokens_seen": 7645480, "step": 11710 }, { "epoch": 6.139937106918239, "grad_norm": 0.2793242931365967, "learning_rate": 1.9466282936484313e-05, "loss": 0.475, "num_input_tokens_seen": 7648936, "step": 11715 }, { "epoch": 6.1425576519916145, "grad_norm": 0.5002403855323792, "learning_rate": 1.944398394596959e-05, "loss": 0.3752, "num_input_tokens_seen": 7651816, "step": 11720 }, { "epoch": 6.14517819706499, "grad_norm": 0.2968067526817322, "learning_rate": 1.942168960447531e-05, "loss": 0.4999, "num_input_tokens_seen": 7654568, "step": 11725 }, { "epoch": 6.147798742138365, "grad_norm": 0.3812718093395233, "learning_rate": 1.9399399930656377e-05, "loss": 0.2984, "num_input_tokens_seen": 7657544, "step": 11730 }, { "epoch": 6.15041928721174, "grad_norm": 0.5639442801475525, "learning_rate": 1.937711494316374e-05, "loss": 0.5282, "num_input_tokens_seen": 7660936, "step": 11735 }, { "epoch": 6.153039832285115, "grad_norm": 0.5421589016914368, "learning_rate": 1.9354834660644478e-05, "loss": 0.51, "num_input_tokens_seen": 7663336, "step": 11740 }, { "epoch": 6.15566037735849, "grad_norm": 0.37182241678237915, "learning_rate": 1.9332559101741715e-05, "loss": 0.458, "num_input_tokens_seen": 7666312, "step": 11745 }, { "epoch": 6.1582809224318655, "grad_norm": 0.5578316450119019, "learning_rate": 1.9310288285094615e-05, "loss": 0.5308, "num_input_tokens_seen": 7669224, "step": 11750 }, { "epoch": 6.160901467505241, "grad_norm": 0.3162440359592438, "learning_rate": 1.9288022229338384e-05, "loss": 0.4627, "num_input_tokens_seen": 7672264, "step": 11755 }, { "epoch": 6.163522012578617, "grad_norm": 0.5068563222885132, "learning_rate": 1.9265760953104235e-05, "loss": 0.532, "num_input_tokens_seen": 7674824, "step": 11760 }, { "epoch": 6.166142557651992, "grad_norm": 0.6503132581710815, "learning_rate": 1.924350447501939e-05, "loss": 0.3348, "num_input_tokens_seen": 7677608, "step": 11765 }, { "epoch": 6.168763102725367, "grad_norm": 0.6046767234802246, "learning_rate": 1.922125281370707e-05, "loss": 0.4876, "num_input_tokens_seen": 7680680, "step": 11770 }, { "epoch": 6.171383647798742, "grad_norm": 0.42139026522636414, "learning_rate": 1.919900598778642e-05, "loss": 0.4834, "num_input_tokens_seen": 7684072, "step": 11775 }, { "epoch": 6.174004192872117, "grad_norm": 0.27065855264663696, "learning_rate": 1.9176764015872578e-05, "loss": 0.4881, "num_input_tokens_seen": 7687624, "step": 11780 }, { "epoch": 6.176624737945493, "grad_norm": 0.31045693159103394, "learning_rate": 1.9154526916576618e-05, "loss": 0.5515, "num_input_tokens_seen": 7691656, "step": 11785 }, { "epoch": 6.179245283018868, "grad_norm": 0.3078550696372986, "learning_rate": 1.913229470850552e-05, "loss": 0.5055, "num_input_tokens_seen": 7694568, "step": 11790 }, { "epoch": 6.181865828092243, "grad_norm": 0.19471104443073273, "learning_rate": 1.9110067410262185e-05, "loss": 0.3884, "num_input_tokens_seen": 7697544, "step": 11795 }, { "epoch": 6.184486373165618, "grad_norm": 0.41598424315452576, "learning_rate": 1.90878450404454e-05, "loss": 0.7491, "num_input_tokens_seen": 7700456, "step": 11800 }, { "epoch": 6.187106918238993, "grad_norm": 0.5251849889755249, "learning_rate": 1.9065627617649828e-05, "loss": 0.4371, "num_input_tokens_seen": 7703112, "step": 11805 }, { "epoch": 6.189727463312369, "grad_norm": 0.408539742231369, "learning_rate": 1.9043415160465993e-05, "loss": 0.4202, "num_input_tokens_seen": 7705992, "step": 11810 }, { "epoch": 6.1923480083857445, "grad_norm": 0.4335121810436249, "learning_rate": 1.9021207687480278e-05, "loss": 0.4727, "num_input_tokens_seen": 7709576, "step": 11815 }, { "epoch": 6.19496855345912, "grad_norm": 0.4283578097820282, "learning_rate": 1.8999005217274857e-05, "loss": 0.4493, "num_input_tokens_seen": 7712712, "step": 11820 }, { "epoch": 6.197589098532495, "grad_norm": 0.7460825443267822, "learning_rate": 1.897680776842775e-05, "loss": 0.4391, "num_input_tokens_seen": 7715432, "step": 11825 }, { "epoch": 6.20020964360587, "grad_norm": 0.36693212389945984, "learning_rate": 1.895461535951279e-05, "loss": 0.4015, "num_input_tokens_seen": 7718216, "step": 11830 }, { "epoch": 6.202830188679245, "grad_norm": 0.4509761929512024, "learning_rate": 1.8932428009099545e-05, "loss": 0.4321, "num_input_tokens_seen": 7721096, "step": 11835 }, { "epoch": 6.20545073375262, "grad_norm": 0.6873663663864136, "learning_rate": 1.891024573575339e-05, "loss": 0.5499, "num_input_tokens_seen": 7724264, "step": 11840 }, { "epoch": 6.2080712788259955, "grad_norm": 0.5344118475914001, "learning_rate": 1.8888068558035435e-05, "loss": 0.4228, "num_input_tokens_seen": 7727176, "step": 11845 }, { "epoch": 6.210691823899371, "grad_norm": 0.5826122164726257, "learning_rate": 1.8865896494502525e-05, "loss": 0.4783, "num_input_tokens_seen": 7729544, "step": 11850 }, { "epoch": 6.213312368972747, "grad_norm": 0.3470027446746826, "learning_rate": 1.8843729563707247e-05, "loss": 0.5455, "num_input_tokens_seen": 7733288, "step": 11855 }, { "epoch": 6.215932914046122, "grad_norm": 0.44481924176216125, "learning_rate": 1.8821567784197847e-05, "loss": 0.4047, "num_input_tokens_seen": 7737096, "step": 11860 }, { "epoch": 6.218553459119497, "grad_norm": 0.7749979496002197, "learning_rate": 1.8799411174518306e-05, "loss": 0.4464, "num_input_tokens_seen": 7741512, "step": 11865 }, { "epoch": 6.221174004192872, "grad_norm": 0.4851498305797577, "learning_rate": 1.8777259753208275e-05, "loss": 0.4549, "num_input_tokens_seen": 7744648, "step": 11870 }, { "epoch": 6.223794549266247, "grad_norm": 0.9490376114845276, "learning_rate": 1.8755113538803026e-05, "loss": 0.3773, "num_input_tokens_seen": 7748392, "step": 11875 }, { "epoch": 6.226415094339623, "grad_norm": 0.5701913833618164, "learning_rate": 1.8732972549833516e-05, "loss": 0.404, "num_input_tokens_seen": 7752360, "step": 11880 }, { "epoch": 6.229035639412998, "grad_norm": 0.584348201751709, "learning_rate": 1.8710836804826314e-05, "loss": 0.329, "num_input_tokens_seen": 7755016, "step": 11885 }, { "epoch": 6.231656184486373, "grad_norm": 0.3707134425640106, "learning_rate": 1.8688706322303595e-05, "loss": 0.5132, "num_input_tokens_seen": 7758120, "step": 11890 }, { "epoch": 6.234276729559748, "grad_norm": 0.7233261466026306, "learning_rate": 1.8666581120783134e-05, "loss": 0.6224, "num_input_tokens_seen": 7761032, "step": 11895 }, { "epoch": 6.236897274633123, "grad_norm": 0.4892785847187042, "learning_rate": 1.8644461218778304e-05, "loss": 0.4197, "num_input_tokens_seen": 7763240, "step": 11900 }, { "epoch": 6.239517819706499, "grad_norm": 0.30245375633239746, "learning_rate": 1.8622346634798e-05, "loss": 0.4493, "num_input_tokens_seen": 7766664, "step": 11905 }, { "epoch": 6.2421383647798745, "grad_norm": 0.559799075126648, "learning_rate": 1.8600237387346716e-05, "loss": 0.5535, "num_input_tokens_seen": 7769448, "step": 11910 }, { "epoch": 6.24475890985325, "grad_norm": 0.9748933911323547, "learning_rate": 1.8578133494924473e-05, "loss": 0.5715, "num_input_tokens_seen": 7772296, "step": 11915 }, { "epoch": 6.247379454926625, "grad_norm": 0.6002548933029175, "learning_rate": 1.8556034976026764e-05, "loss": 0.3855, "num_input_tokens_seen": 7775912, "step": 11920 }, { "epoch": 6.25, "grad_norm": 0.3870489001274109, "learning_rate": 1.8533941849144642e-05, "loss": 0.5253, "num_input_tokens_seen": 7779784, "step": 11925 }, { "epoch": 6.252620545073375, "grad_norm": 0.36671119928359985, "learning_rate": 1.8511854132764627e-05, "loss": 0.3469, "num_input_tokens_seen": 7782504, "step": 11930 }, { "epoch": 6.25524109014675, "grad_norm": 0.6479450464248657, "learning_rate": 1.84897718453687e-05, "loss": 0.3296, "num_input_tokens_seen": 7785864, "step": 11935 }, { "epoch": 6.2578616352201255, "grad_norm": 0.579310953617096, "learning_rate": 1.846769500543434e-05, "loss": 0.4013, "num_input_tokens_seen": 7788808, "step": 11940 }, { "epoch": 6.260482180293501, "grad_norm": 0.5723283290863037, "learning_rate": 1.844562363143439e-05, "loss": 0.4259, "num_input_tokens_seen": 7792232, "step": 11945 }, { "epoch": 6.263102725366876, "grad_norm": 0.34798142313957214, "learning_rate": 1.8423557741837198e-05, "loss": 0.4317, "num_input_tokens_seen": 7794856, "step": 11950 }, { "epoch": 6.265723270440252, "grad_norm": 0.42270055413246155, "learning_rate": 1.84014973551065e-05, "loss": 0.5879, "num_input_tokens_seen": 7797800, "step": 11955 }, { "epoch": 6.268343815513627, "grad_norm": 0.2830858528614044, "learning_rate": 1.8379442489701396e-05, "loss": 0.416, "num_input_tokens_seen": 7801256, "step": 11960 }, { "epoch": 6.270964360587002, "grad_norm": 0.3429669141769409, "learning_rate": 1.8357393164076403e-05, "loss": 0.3967, "num_input_tokens_seen": 7804296, "step": 11965 }, { "epoch": 6.273584905660377, "grad_norm": 0.49492549896240234, "learning_rate": 1.8335349396681394e-05, "loss": 0.4762, "num_input_tokens_seen": 7807464, "step": 11970 }, { "epoch": 6.276205450733753, "grad_norm": 0.21290481090545654, "learning_rate": 1.8313311205961577e-05, "loss": 0.4263, "num_input_tokens_seen": 7812008, "step": 11975 }, { "epoch": 6.278825995807128, "grad_norm": 0.8782435059547424, "learning_rate": 1.82912786103575e-05, "loss": 0.4555, "num_input_tokens_seen": 7815336, "step": 11980 }, { "epoch": 6.281446540880503, "grad_norm": 0.2628708481788635, "learning_rate": 1.826925162830505e-05, "loss": 0.3968, "num_input_tokens_seen": 7818856, "step": 11985 }, { "epoch": 6.284067085953878, "grad_norm": 0.42247846722602844, "learning_rate": 1.8247230278235384e-05, "loss": 0.5601, "num_input_tokens_seen": 7822024, "step": 11990 }, { "epoch": 6.286687631027253, "grad_norm": 0.40799370408058167, "learning_rate": 1.8225214578574967e-05, "loss": 0.6172, "num_input_tokens_seen": 7825640, "step": 11995 }, { "epoch": 6.289308176100629, "grad_norm": 0.3290785551071167, "learning_rate": 1.820320454774554e-05, "loss": 0.4411, "num_input_tokens_seen": 7827944, "step": 12000 }, { "epoch": 6.2919287211740045, "grad_norm": 0.44637370109558105, "learning_rate": 1.8181200204164073e-05, "loss": 0.4549, "num_input_tokens_seen": 7830600, "step": 12005 }, { "epoch": 6.29454926624738, "grad_norm": 0.28002411127090454, "learning_rate": 1.8159201566242806e-05, "loss": 0.3522, "num_input_tokens_seen": 7834056, "step": 12010 }, { "epoch": 6.297169811320755, "grad_norm": 0.5051760077476501, "learning_rate": 1.81372086523892e-05, "loss": 0.4476, "num_input_tokens_seen": 7836904, "step": 12015 }, { "epoch": 6.29979035639413, "grad_norm": 0.7345203757286072, "learning_rate": 1.8115221481005904e-05, "loss": 0.4701, "num_input_tokens_seen": 7839880, "step": 12020 }, { "epoch": 6.302410901467505, "grad_norm": 0.36192774772644043, "learning_rate": 1.809324007049079e-05, "loss": 0.4331, "num_input_tokens_seen": 7843432, "step": 12025 }, { "epoch": 6.30503144654088, "grad_norm": 0.4943949282169342, "learning_rate": 1.8071264439236903e-05, "loss": 0.4173, "num_input_tokens_seen": 7846216, "step": 12030 }, { "epoch": 6.3076519916142555, "grad_norm": 0.6728324890136719, "learning_rate": 1.8049294605632434e-05, "loss": 0.4728, "num_input_tokens_seen": 7849544, "step": 12035 }, { "epoch": 6.310272536687631, "grad_norm": 0.5410138964653015, "learning_rate": 1.8027330588060757e-05, "loss": 0.4944, "num_input_tokens_seen": 7852552, "step": 12040 }, { "epoch": 6.312893081761006, "grad_norm": 0.5102012753486633, "learning_rate": 1.8005372404900335e-05, "loss": 0.5472, "num_input_tokens_seen": 7855336, "step": 12045 }, { "epoch": 6.315513626834382, "grad_norm": 0.35334959626197815, "learning_rate": 1.7983420074524777e-05, "loss": 0.5079, "num_input_tokens_seen": 7858184, "step": 12050 }, { "epoch": 6.318134171907757, "grad_norm": 0.680285632610321, "learning_rate": 1.7961473615302805e-05, "loss": 0.3904, "num_input_tokens_seen": 7861192, "step": 12055 }, { "epoch": 6.320754716981132, "grad_norm": 0.38005784153938293, "learning_rate": 1.79395330455982e-05, "loss": 0.4518, "num_input_tokens_seen": 7864520, "step": 12060 }, { "epoch": 6.323375262054507, "grad_norm": 0.6727699041366577, "learning_rate": 1.7917598383769836e-05, "loss": 0.5399, "num_input_tokens_seen": 7867240, "step": 12065 }, { "epoch": 6.325995807127883, "grad_norm": 0.6145023703575134, "learning_rate": 1.789566964817163e-05, "loss": 0.5176, "num_input_tokens_seen": 7870120, "step": 12070 }, { "epoch": 6.328616352201258, "grad_norm": 0.7534199953079224, "learning_rate": 1.7873746857152552e-05, "loss": 0.3655, "num_input_tokens_seen": 7873736, "step": 12075 }, { "epoch": 6.331236897274633, "grad_norm": 0.539345920085907, "learning_rate": 1.7851830029056587e-05, "loss": 0.4402, "num_input_tokens_seen": 7877000, "step": 12080 }, { "epoch": 6.333857442348008, "grad_norm": 0.5661314725875854, "learning_rate": 1.7829919182222752e-05, "loss": 0.3422, "num_input_tokens_seen": 7879784, "step": 12085 }, { "epoch": 6.336477987421383, "grad_norm": 0.5042904615402222, "learning_rate": 1.780801433498501e-05, "loss": 0.5289, "num_input_tokens_seen": 7882792, "step": 12090 }, { "epoch": 6.339098532494759, "grad_norm": 0.666902482509613, "learning_rate": 1.7786115505672364e-05, "loss": 0.5166, "num_input_tokens_seen": 7885416, "step": 12095 }, { "epoch": 6.3417190775681345, "grad_norm": 0.3417324721813202, "learning_rate": 1.7764222712608753e-05, "loss": 0.4939, "num_input_tokens_seen": 7888936, "step": 12100 }, { "epoch": 6.34433962264151, "grad_norm": 0.2754647135734558, "learning_rate": 1.7742335974113046e-05, "loss": 0.5245, "num_input_tokens_seen": 7891976, "step": 12105 }, { "epoch": 6.346960167714885, "grad_norm": 0.7674411535263062, "learning_rate": 1.7720455308499084e-05, "loss": 0.5567, "num_input_tokens_seen": 7895848, "step": 12110 }, { "epoch": 6.34958071278826, "grad_norm": 0.47958996891975403, "learning_rate": 1.769858073407561e-05, "loss": 0.6811, "num_input_tokens_seen": 7898760, "step": 12115 }, { "epoch": 6.352201257861635, "grad_norm": 0.6724675297737122, "learning_rate": 1.767671226914625e-05, "loss": 0.4485, "num_input_tokens_seen": 7902120, "step": 12120 }, { "epoch": 6.35482180293501, "grad_norm": 0.953906238079071, "learning_rate": 1.7654849932009566e-05, "loss": 0.6557, "num_input_tokens_seen": 7905224, "step": 12125 }, { "epoch": 6.3574423480083855, "grad_norm": 0.4657610356807709, "learning_rate": 1.763299374095893e-05, "loss": 0.568, "num_input_tokens_seen": 7908424, "step": 12130 }, { "epoch": 6.360062893081761, "grad_norm": 0.5046035051345825, "learning_rate": 1.761114371428262e-05, "loss": 0.4498, "num_input_tokens_seen": 7911336, "step": 12135 }, { "epoch": 6.362683438155136, "grad_norm": 0.41407331824302673, "learning_rate": 1.7589299870263753e-05, "loss": 0.3632, "num_input_tokens_seen": 7913704, "step": 12140 }, { "epoch": 6.365303983228512, "grad_norm": 0.7149323225021362, "learning_rate": 1.756746222718024e-05, "loss": 0.5223, "num_input_tokens_seen": 7916584, "step": 12145 }, { "epoch": 6.367924528301887, "grad_norm": 0.3443928062915802, "learning_rate": 1.7545630803304826e-05, "loss": 0.5632, "num_input_tokens_seen": 7919880, "step": 12150 }, { "epoch": 6.370545073375262, "grad_norm": 0.552416205406189, "learning_rate": 1.7523805616905063e-05, "loss": 0.5325, "num_input_tokens_seen": 7923752, "step": 12155 }, { "epoch": 6.373165618448637, "grad_norm": 0.419621080160141, "learning_rate": 1.7501986686243256e-05, "loss": 0.5553, "num_input_tokens_seen": 7927464, "step": 12160 }, { "epoch": 6.3757861635220126, "grad_norm": 0.5339087247848511, "learning_rate": 1.748017402957649e-05, "loss": 0.3787, "num_input_tokens_seen": 7930792, "step": 12165 }, { "epoch": 6.378406708595388, "grad_norm": 0.41132041811943054, "learning_rate": 1.7458367665156615e-05, "loss": 0.4919, "num_input_tokens_seen": 7935112, "step": 12170 }, { "epoch": 6.381027253668763, "grad_norm": 0.37763673067092896, "learning_rate": 1.743656761123018e-05, "loss": 0.4262, "num_input_tokens_seen": 7938152, "step": 12175 }, { "epoch": 6.383647798742138, "grad_norm": 0.3350672125816345, "learning_rate": 1.7414773886038487e-05, "loss": 0.4843, "num_input_tokens_seen": 7942536, "step": 12180 }, { "epoch": 6.386268343815513, "grad_norm": 0.4942372441291809, "learning_rate": 1.7392986507817532e-05, "loss": 0.4061, "num_input_tokens_seen": 7945224, "step": 12185 }, { "epoch": 6.388888888888889, "grad_norm": 0.537584125995636, "learning_rate": 1.7371205494797987e-05, "loss": 0.4576, "num_input_tokens_seen": 7948520, "step": 12190 }, { "epoch": 6.3915094339622645, "grad_norm": 0.6151387095451355, "learning_rate": 1.7349430865205215e-05, "loss": 0.5247, "num_input_tokens_seen": 7951368, "step": 12195 }, { "epoch": 6.39412997903564, "grad_norm": 0.37715259194374084, "learning_rate": 1.7327662637259234e-05, "loss": 0.6137, "num_input_tokens_seen": 7954472, "step": 12200 }, { "epoch": 6.396750524109015, "grad_norm": 0.4328692853450775, "learning_rate": 1.7305900829174697e-05, "loss": 0.5177, "num_input_tokens_seen": 7957000, "step": 12205 }, { "epoch": 6.39937106918239, "grad_norm": 0.47371891140937805, "learning_rate": 1.7284145459160893e-05, "loss": 0.4847, "num_input_tokens_seen": 7959400, "step": 12210 }, { "epoch": 6.401991614255765, "grad_norm": 1.0821987390518188, "learning_rate": 1.7262396545421728e-05, "loss": 0.4729, "num_input_tokens_seen": 7962312, "step": 12215 }, { "epoch": 6.40461215932914, "grad_norm": 0.6736742854118347, "learning_rate": 1.7240654106155688e-05, "loss": 0.5488, "num_input_tokens_seen": 7965768, "step": 12220 }, { "epoch": 6.4072327044025155, "grad_norm": 0.38170990347862244, "learning_rate": 1.721891815955587e-05, "loss": 0.403, "num_input_tokens_seen": 7972840, "step": 12225 }, { "epoch": 6.409853249475891, "grad_norm": 0.5123251676559448, "learning_rate": 1.71971887238099e-05, "loss": 0.5353, "num_input_tokens_seen": 7975304, "step": 12230 }, { "epoch": 6.412473794549266, "grad_norm": 0.4834706783294678, "learning_rate": 1.7175465817099988e-05, "loss": 0.3867, "num_input_tokens_seen": 7977928, "step": 12235 }, { "epoch": 6.415094339622642, "grad_norm": 0.4831320345401764, "learning_rate": 1.7153749457602874e-05, "loss": 0.4449, "num_input_tokens_seen": 7982120, "step": 12240 }, { "epoch": 6.417714884696017, "grad_norm": 0.47402796149253845, "learning_rate": 1.7132039663489806e-05, "loss": 0.4465, "num_input_tokens_seen": 7985032, "step": 12245 }, { "epoch": 6.420335429769392, "grad_norm": 0.5014074444770813, "learning_rate": 1.7110336452926555e-05, "loss": 0.4443, "num_input_tokens_seen": 7987272, "step": 12250 }, { "epoch": 6.422955974842767, "grad_norm": 0.4417906403541565, "learning_rate": 1.708863984407338e-05, "loss": 0.5086, "num_input_tokens_seen": 7990152, "step": 12255 }, { "epoch": 6.4255765199161425, "grad_norm": 0.43497011065483093, "learning_rate": 1.7066949855085e-05, "loss": 0.5365, "num_input_tokens_seen": 7992648, "step": 12260 }, { "epoch": 6.428197064989518, "grad_norm": 0.3306581377983093, "learning_rate": 1.704526650411062e-05, "loss": 0.4394, "num_input_tokens_seen": 7995880, "step": 12265 }, { "epoch": 6.430817610062893, "grad_norm": 1.111523985862732, "learning_rate": 1.7023589809293876e-05, "loss": 0.4349, "num_input_tokens_seen": 7998312, "step": 12270 }, { "epoch": 6.433438155136268, "grad_norm": 0.34829097986221313, "learning_rate": 1.7001919788772824e-05, "loss": 0.4378, "num_input_tokens_seen": 8001768, "step": 12275 }, { "epoch": 6.436058700209643, "grad_norm": 1.0439943075180054, "learning_rate": 1.6980256460679953e-05, "loss": 0.4227, "num_input_tokens_seen": 8004392, "step": 12280 }, { "epoch": 6.438679245283019, "grad_norm": 0.40315207839012146, "learning_rate": 1.6958599843142153e-05, "loss": 0.4389, "num_input_tokens_seen": 8006984, "step": 12285 }, { "epoch": 6.441299790356394, "grad_norm": 0.3170359432697296, "learning_rate": 1.6936949954280686e-05, "loss": 0.6193, "num_input_tokens_seen": 8010920, "step": 12290 }, { "epoch": 6.44392033542977, "grad_norm": 0.4526955783367157, "learning_rate": 1.691530681221119e-05, "loss": 0.525, "num_input_tokens_seen": 8013800, "step": 12295 }, { "epoch": 6.446540880503145, "grad_norm": 0.5991788506507874, "learning_rate": 1.6893670435043666e-05, "loss": 0.3799, "num_input_tokens_seen": 8016616, "step": 12300 }, { "epoch": 6.44916142557652, "grad_norm": 0.2583974599838257, "learning_rate": 1.6872040840882434e-05, "loss": 0.4173, "num_input_tokens_seen": 8020040, "step": 12305 }, { "epoch": 6.451781970649895, "grad_norm": 0.3729840815067291, "learning_rate": 1.6850418047826167e-05, "loss": 0.3844, "num_input_tokens_seen": 8024008, "step": 12310 }, { "epoch": 6.45440251572327, "grad_norm": 0.4708462953567505, "learning_rate": 1.6828802073967805e-05, "loss": 0.4908, "num_input_tokens_seen": 8027528, "step": 12315 }, { "epoch": 6.4570230607966455, "grad_norm": 1.164556622505188, "learning_rate": 1.6807192937394624e-05, "loss": 0.5428, "num_input_tokens_seen": 8030312, "step": 12320 }, { "epoch": 6.459643605870021, "grad_norm": 0.38830405473709106, "learning_rate": 1.6785590656188167e-05, "loss": 0.3507, "num_input_tokens_seen": 8034280, "step": 12325 }, { "epoch": 6.462264150943396, "grad_norm": 0.3689507246017456, "learning_rate": 1.6763995248424223e-05, "loss": 0.4972, "num_input_tokens_seen": 8038408, "step": 12330 }, { "epoch": 6.464884696016772, "grad_norm": 0.5046773552894592, "learning_rate": 1.6742406732172854e-05, "loss": 0.4619, "num_input_tokens_seen": 8041544, "step": 12335 }, { "epoch": 6.467505241090147, "grad_norm": 0.2765987515449524, "learning_rate": 1.6720825125498342e-05, "loss": 0.3162, "num_input_tokens_seen": 8045096, "step": 12340 }, { "epoch": 6.470125786163522, "grad_norm": 0.403628408908844, "learning_rate": 1.6699250446459182e-05, "loss": 0.4193, "num_input_tokens_seen": 8049064, "step": 12345 }, { "epoch": 6.472746331236897, "grad_norm": 0.440595805644989, "learning_rate": 1.6677682713108082e-05, "loss": 0.509, "num_input_tokens_seen": 8051816, "step": 12350 }, { "epoch": 6.4753668763102725, "grad_norm": 0.4991702139377594, "learning_rate": 1.6656121943491954e-05, "loss": 0.4687, "num_input_tokens_seen": 8055528, "step": 12355 }, { "epoch": 6.477987421383648, "grad_norm": 0.33644410967826843, "learning_rate": 1.6634568155651842e-05, "loss": 0.3904, "num_input_tokens_seen": 8058472, "step": 12360 }, { "epoch": 6.480607966457023, "grad_norm": 0.4835313558578491, "learning_rate": 1.6613021367622978e-05, "loss": 0.6901, "num_input_tokens_seen": 8061608, "step": 12365 }, { "epoch": 6.483228511530398, "grad_norm": 0.23638179898262024, "learning_rate": 1.6591481597434733e-05, "loss": 0.481, "num_input_tokens_seen": 8064648, "step": 12370 }, { "epoch": 6.485849056603773, "grad_norm": 0.5026748776435852, "learning_rate": 1.65699488631106e-05, "loss": 0.6383, "num_input_tokens_seen": 8067496, "step": 12375 }, { "epoch": 6.488469601677149, "grad_norm": 0.3223457932472229, "learning_rate": 1.6548423182668186e-05, "loss": 0.4078, "num_input_tokens_seen": 8069864, "step": 12380 }, { "epoch": 6.491090146750524, "grad_norm": 0.4914149343967438, "learning_rate": 1.6526904574119213e-05, "loss": 0.5739, "num_input_tokens_seen": 8072936, "step": 12385 }, { "epoch": 6.4937106918239, "grad_norm": 0.77780681848526, "learning_rate": 1.6505393055469444e-05, "loss": 0.541, "num_input_tokens_seen": 8075784, "step": 12390 }, { "epoch": 6.496331236897275, "grad_norm": 0.9470133185386658, "learning_rate": 1.648388864471875e-05, "loss": 0.5008, "num_input_tokens_seen": 8078280, "step": 12395 }, { "epoch": 6.49895178197065, "grad_norm": 0.37166836857795715, "learning_rate": 1.646239135986105e-05, "loss": 0.3614, "num_input_tokens_seen": 8081992, "step": 12400 }, { "epoch": 6.5, "eval_loss": 0.4850774109363556, "eval_runtime": 14.5274, "eval_samples_per_second": 58.373, "eval_steps_per_second": 14.593, "num_input_tokens_seen": 8083560, "step": 12402 }, { "epoch": 6.501572327044025, "grad_norm": 0.5188433527946472, "learning_rate": 1.6440901218884264e-05, "loss": 0.4917, "num_input_tokens_seen": 8085672, "step": 12405 }, { "epoch": 6.5041928721174, "grad_norm": 0.25600194931030273, "learning_rate": 1.641941823977038e-05, "loss": 0.3981, "num_input_tokens_seen": 8089000, "step": 12410 }, { "epoch": 6.506813417190775, "grad_norm": 0.3587893545627594, "learning_rate": 1.6397942440495363e-05, "loss": 0.4027, "num_input_tokens_seen": 8092168, "step": 12415 }, { "epoch": 6.509433962264151, "grad_norm": 0.384745329618454, "learning_rate": 1.6376473839029188e-05, "loss": 0.5003, "num_input_tokens_seen": 8094728, "step": 12420 }, { "epoch": 6.512054507337526, "grad_norm": 1.346463918685913, "learning_rate": 1.63550124533358e-05, "loss": 0.5105, "num_input_tokens_seen": 8098088, "step": 12425 }, { "epoch": 6.514675052410902, "grad_norm": 0.583342969417572, "learning_rate": 1.63335583013731e-05, "loss": 0.4736, "num_input_tokens_seen": 8100904, "step": 12430 }, { "epoch": 6.517295597484277, "grad_norm": 0.48939019441604614, "learning_rate": 1.6312111401092946e-05, "loss": 0.3863, "num_input_tokens_seen": 8104264, "step": 12435 }, { "epoch": 6.519916142557652, "grad_norm": 0.33072417974472046, "learning_rate": 1.6290671770441135e-05, "loss": 0.3668, "num_input_tokens_seen": 8107144, "step": 12440 }, { "epoch": 6.522536687631027, "grad_norm": 0.34375205636024475, "learning_rate": 1.6269239427357348e-05, "loss": 0.4702, "num_input_tokens_seen": 8111304, "step": 12445 }, { "epoch": 6.5251572327044025, "grad_norm": 0.5646814703941345, "learning_rate": 1.62478143897752e-05, "loss": 0.5012, "num_input_tokens_seen": 8114056, "step": 12450 }, { "epoch": 6.527777777777778, "grad_norm": 0.850141704082489, "learning_rate": 1.6226396675622203e-05, "loss": 0.5695, "num_input_tokens_seen": 8116808, "step": 12455 }, { "epoch": 6.530398322851153, "grad_norm": 0.5427826642990112, "learning_rate": 1.6204986302819693e-05, "loss": 0.4839, "num_input_tokens_seen": 8119496, "step": 12460 }, { "epoch": 6.533018867924528, "grad_norm": 0.672717809677124, "learning_rate": 1.6183583289282906e-05, "loss": 0.5142, "num_input_tokens_seen": 8122728, "step": 12465 }, { "epoch": 6.535639412997903, "grad_norm": 0.5309325456619263, "learning_rate": 1.616218765292091e-05, "loss": 0.5898, "num_input_tokens_seen": 8126504, "step": 12470 }, { "epoch": 6.538259958071279, "grad_norm": 0.6960228085517883, "learning_rate": 1.6140799411636586e-05, "loss": 0.6536, "num_input_tokens_seen": 8128776, "step": 12475 }, { "epoch": 6.540880503144654, "grad_norm": 0.3517395853996277, "learning_rate": 1.611941858332664e-05, "loss": 0.5549, "num_input_tokens_seen": 8131720, "step": 12480 }, { "epoch": 6.54350104821803, "grad_norm": 0.4738260507583618, "learning_rate": 1.6098045185881587e-05, "loss": 0.5536, "num_input_tokens_seen": 8135624, "step": 12485 }, { "epoch": 6.546121593291405, "grad_norm": 0.4479638338088989, "learning_rate": 1.6076679237185682e-05, "loss": 0.4239, "num_input_tokens_seen": 8138632, "step": 12490 }, { "epoch": 6.54874213836478, "grad_norm": 0.38834887742996216, "learning_rate": 1.6055320755117004e-05, "loss": 0.4785, "num_input_tokens_seen": 8142184, "step": 12495 }, { "epoch": 6.551362683438155, "grad_norm": 0.30261650681495667, "learning_rate": 1.6033969757547336e-05, "loss": 0.5362, "num_input_tokens_seen": 8146792, "step": 12500 }, { "epoch": 6.55398322851153, "grad_norm": 0.48818591237068176, "learning_rate": 1.601262626234222e-05, "loss": 0.3849, "num_input_tokens_seen": 8150504, "step": 12505 }, { "epoch": 6.556603773584905, "grad_norm": 0.3781694173812866, "learning_rate": 1.5991290287360925e-05, "loss": 0.4595, "num_input_tokens_seen": 8153000, "step": 12510 }, { "epoch": 6.559224318658281, "grad_norm": 0.6191318035125732, "learning_rate": 1.5969961850456412e-05, "loss": 0.3866, "num_input_tokens_seen": 8155720, "step": 12515 }, { "epoch": 6.561844863731656, "grad_norm": 0.46615684032440186, "learning_rate": 1.5948640969475346e-05, "loss": 0.4296, "num_input_tokens_seen": 8159400, "step": 12520 }, { "epoch": 6.564465408805032, "grad_norm": 0.4934194087982178, "learning_rate": 1.592732766225808e-05, "loss": 0.369, "num_input_tokens_seen": 8162952, "step": 12525 }, { "epoch": 6.567085953878407, "grad_norm": 0.6047238707542419, "learning_rate": 1.5906021946638585e-05, "loss": 0.4708, "num_input_tokens_seen": 8166376, "step": 12530 }, { "epoch": 6.569706498951782, "grad_norm": 0.7501690983772278, "learning_rate": 1.5884723840444532e-05, "loss": 0.5201, "num_input_tokens_seen": 8169064, "step": 12535 }, { "epoch": 6.572327044025157, "grad_norm": 0.4740757346153259, "learning_rate": 1.5863433361497214e-05, "loss": 0.5829, "num_input_tokens_seen": 8171976, "step": 12540 }, { "epoch": 6.5749475890985325, "grad_norm": 0.4453968107700348, "learning_rate": 1.5842150527611506e-05, "loss": 0.4444, "num_input_tokens_seen": 8176200, "step": 12545 }, { "epoch": 6.577568134171908, "grad_norm": 0.4782255291938782, "learning_rate": 1.5820875356595925e-05, "loss": 0.4806, "num_input_tokens_seen": 8179368, "step": 12550 }, { "epoch": 6.580188679245283, "grad_norm": 0.49484384059906006, "learning_rate": 1.579960786625256e-05, "loss": 0.3607, "num_input_tokens_seen": 8182248, "step": 12555 }, { "epoch": 6.582809224318658, "grad_norm": 0.469584584236145, "learning_rate": 1.5778348074377074e-05, "loss": 0.5851, "num_input_tokens_seen": 8184776, "step": 12560 }, { "epoch": 6.585429769392033, "grad_norm": 0.3288717567920685, "learning_rate": 1.575709599875869e-05, "loss": 0.39, "num_input_tokens_seen": 8187976, "step": 12565 }, { "epoch": 6.588050314465409, "grad_norm": 0.4847364127635956, "learning_rate": 1.5735851657180184e-05, "loss": 0.4513, "num_input_tokens_seen": 8190984, "step": 12570 }, { "epoch": 6.590670859538784, "grad_norm": 0.5735188722610474, "learning_rate": 1.571461506741783e-05, "loss": 0.4168, "num_input_tokens_seen": 8194568, "step": 12575 }, { "epoch": 6.59329140461216, "grad_norm": 0.5357340574264526, "learning_rate": 1.5693386247241453e-05, "loss": 0.6072, "num_input_tokens_seen": 8197512, "step": 12580 }, { "epoch": 6.595911949685535, "grad_norm": 0.42956897616386414, "learning_rate": 1.5672165214414362e-05, "loss": 0.4312, "num_input_tokens_seen": 8201128, "step": 12585 }, { "epoch": 6.59853249475891, "grad_norm": 0.3142218589782715, "learning_rate": 1.5650951986693334e-05, "loss": 0.3089, "num_input_tokens_seen": 8204392, "step": 12590 }, { "epoch": 6.601153039832285, "grad_norm": 0.5678719282150269, "learning_rate": 1.5629746581828642e-05, "loss": 0.4964, "num_input_tokens_seen": 8206952, "step": 12595 }, { "epoch": 6.60377358490566, "grad_norm": 0.8197391033172607, "learning_rate": 1.560854901756399e-05, "loss": 0.3958, "num_input_tokens_seen": 8209768, "step": 12600 }, { "epoch": 6.606394129979035, "grad_norm": 0.4360329210758209, "learning_rate": 1.558735931163653e-05, "loss": 0.5449, "num_input_tokens_seen": 8216680, "step": 12605 }, { "epoch": 6.609014675052411, "grad_norm": 0.42729952931404114, "learning_rate": 1.5566177481776857e-05, "loss": 0.4489, "num_input_tokens_seen": 8220552, "step": 12610 }, { "epoch": 6.611635220125786, "grad_norm": 0.37952831387519836, "learning_rate": 1.554500354570894e-05, "loss": 0.502, "num_input_tokens_seen": 8225064, "step": 12615 }, { "epoch": 6.614255765199162, "grad_norm": 0.4666159451007843, "learning_rate": 1.552383752115017e-05, "loss": 0.5339, "num_input_tokens_seen": 8228168, "step": 12620 }, { "epoch": 6.616876310272537, "grad_norm": 1.2513400316238403, "learning_rate": 1.550267942581132e-05, "loss": 0.3776, "num_input_tokens_seen": 8231944, "step": 12625 }, { "epoch": 6.619496855345912, "grad_norm": 0.545364499092102, "learning_rate": 1.548152927739649e-05, "loss": 0.4728, "num_input_tokens_seen": 8235848, "step": 12630 }, { "epoch": 6.622117400419287, "grad_norm": 0.3181051015853882, "learning_rate": 1.5460387093603178e-05, "loss": 0.5461, "num_input_tokens_seen": 8238792, "step": 12635 }, { "epoch": 6.6247379454926625, "grad_norm": 0.5606282353401184, "learning_rate": 1.5439252892122197e-05, "loss": 0.4425, "num_input_tokens_seen": 8242088, "step": 12640 }, { "epoch": 6.627358490566038, "grad_norm": 0.42522117495536804, "learning_rate": 1.5418126690637673e-05, "loss": 0.5849, "num_input_tokens_seen": 8244616, "step": 12645 }, { "epoch": 6.629979035639413, "grad_norm": 0.5455744862556458, "learning_rate": 1.5397008506827057e-05, "loss": 0.4124, "num_input_tokens_seen": 8247912, "step": 12650 }, { "epoch": 6.632599580712788, "grad_norm": 0.5125686526298523, "learning_rate": 1.537589835836108e-05, "loss": 0.4698, "num_input_tokens_seen": 8251816, "step": 12655 }, { "epoch": 6.635220125786163, "grad_norm": 0.44164687395095825, "learning_rate": 1.5354796262903736e-05, "loss": 0.5226, "num_input_tokens_seen": 8255432, "step": 12660 }, { "epoch": 6.637840670859539, "grad_norm": 0.2634292244911194, "learning_rate": 1.5333702238112306e-05, "loss": 0.3396, "num_input_tokens_seen": 8258824, "step": 12665 }, { "epoch": 6.640461215932914, "grad_norm": 0.6282626390457153, "learning_rate": 1.5312616301637313e-05, "loss": 0.5086, "num_input_tokens_seen": 8262888, "step": 12670 }, { "epoch": 6.6430817610062896, "grad_norm": 0.6063100695610046, "learning_rate": 1.5291538471122488e-05, "loss": 0.3508, "num_input_tokens_seen": 8266568, "step": 12675 }, { "epoch": 6.645702306079665, "grad_norm": 0.3989923894405365, "learning_rate": 1.527046876420481e-05, "loss": 0.527, "num_input_tokens_seen": 8269800, "step": 12680 }, { "epoch": 6.64832285115304, "grad_norm": 0.2765529453754425, "learning_rate": 1.524940719851444e-05, "loss": 0.3812, "num_input_tokens_seen": 8272936, "step": 12685 }, { "epoch": 6.650943396226415, "grad_norm": 0.4670572578907013, "learning_rate": 1.5228353791674734e-05, "loss": 0.4685, "num_input_tokens_seen": 8276104, "step": 12690 }, { "epoch": 6.65356394129979, "grad_norm": 0.239015594124794, "learning_rate": 1.520730856130223e-05, "loss": 0.3719, "num_input_tokens_seen": 8278728, "step": 12695 }, { "epoch": 6.656184486373165, "grad_norm": 0.462772399187088, "learning_rate": 1.5186271525006607e-05, "loss": 0.4322, "num_input_tokens_seen": 8282312, "step": 12700 }, { "epoch": 6.658805031446541, "grad_norm": 0.40932005643844604, "learning_rate": 1.5165242700390697e-05, "loss": 0.5069, "num_input_tokens_seen": 8285800, "step": 12705 }, { "epoch": 6.661425576519916, "grad_norm": 0.47627148032188416, "learning_rate": 1.5144222105050471e-05, "loss": 0.5742, "num_input_tokens_seen": 8288648, "step": 12710 }, { "epoch": 6.664046121593291, "grad_norm": 0.49234235286712646, "learning_rate": 1.5123209756574986e-05, "loss": 0.4303, "num_input_tokens_seen": 8291688, "step": 12715 }, { "epoch": 6.666666666666667, "grad_norm": 0.6050611734390259, "learning_rate": 1.5102205672546416e-05, "loss": 0.5517, "num_input_tokens_seen": 8295400, "step": 12720 }, { "epoch": 6.669287211740042, "grad_norm": 0.32467249035835266, "learning_rate": 1.508120987054004e-05, "loss": 0.4381, "num_input_tokens_seen": 8297704, "step": 12725 }, { "epoch": 6.671907756813417, "grad_norm": 0.7683065533638, "learning_rate": 1.5060222368124163e-05, "loss": 0.6176, "num_input_tokens_seen": 8300808, "step": 12730 }, { "epoch": 6.6745283018867925, "grad_norm": 0.49166029691696167, "learning_rate": 1.5039243182860177e-05, "loss": 0.6509, "num_input_tokens_seen": 8304104, "step": 12735 }, { "epoch": 6.677148846960168, "grad_norm": 0.5362672805786133, "learning_rate": 1.5018272332302513e-05, "loss": 0.5663, "num_input_tokens_seen": 8307784, "step": 12740 }, { "epoch": 6.679769392033543, "grad_norm": 0.42563557624816895, "learning_rate": 1.4997309833998607e-05, "loss": 0.444, "num_input_tokens_seen": 8311848, "step": 12745 }, { "epoch": 6.682389937106918, "grad_norm": 0.36141619086265564, "learning_rate": 1.4976355705488932e-05, "loss": 0.4848, "num_input_tokens_seen": 8315272, "step": 12750 }, { "epoch": 6.685010482180293, "grad_norm": 0.5802140831947327, "learning_rate": 1.4955409964306946e-05, "loss": 0.387, "num_input_tokens_seen": 8318376, "step": 12755 }, { "epoch": 6.687631027253669, "grad_norm": 0.39416956901550293, "learning_rate": 1.4934472627979067e-05, "loss": 0.4604, "num_input_tokens_seen": 8322248, "step": 12760 }, { "epoch": 6.690251572327044, "grad_norm": 0.23465344309806824, "learning_rate": 1.491354371402473e-05, "loss": 0.3151, "num_input_tokens_seen": 8326056, "step": 12765 }, { "epoch": 6.6928721174004195, "grad_norm": 0.39358609914779663, "learning_rate": 1.4892623239956289e-05, "loss": 0.5527, "num_input_tokens_seen": 8329608, "step": 12770 }, { "epoch": 6.695492662473795, "grad_norm": 0.40492820739746094, "learning_rate": 1.4871711223279022e-05, "loss": 0.4712, "num_input_tokens_seen": 8332072, "step": 12775 }, { "epoch": 6.69811320754717, "grad_norm": 0.4928841292858124, "learning_rate": 1.4850807681491169e-05, "loss": 0.4703, "num_input_tokens_seen": 8334760, "step": 12780 }, { "epoch": 6.700733752620545, "grad_norm": 0.3768109083175659, "learning_rate": 1.4829912632083845e-05, "loss": 0.3648, "num_input_tokens_seen": 8337736, "step": 12785 }, { "epoch": 6.70335429769392, "grad_norm": 0.4480217695236206, "learning_rate": 1.4809026092541078e-05, "loss": 0.3656, "num_input_tokens_seen": 8340488, "step": 12790 }, { "epoch": 6.705974842767295, "grad_norm": 0.7030220627784729, "learning_rate": 1.4788148080339787e-05, "loss": 0.4865, "num_input_tokens_seen": 8343176, "step": 12795 }, { "epoch": 6.7085953878406706, "grad_norm": 0.3909781277179718, "learning_rate": 1.4767278612949703e-05, "loss": 0.5701, "num_input_tokens_seen": 8346376, "step": 12800 }, { "epoch": 6.711215932914046, "grad_norm": 0.32655370235443115, "learning_rate": 1.474641770783347e-05, "loss": 0.4374, "num_input_tokens_seen": 8349672, "step": 12805 }, { "epoch": 6.713836477987421, "grad_norm": 0.42612358927726746, "learning_rate": 1.4725565382446549e-05, "loss": 0.4948, "num_input_tokens_seen": 8352232, "step": 12810 }, { "epoch": 6.716457023060797, "grad_norm": 0.2881709635257721, "learning_rate": 1.4704721654237185e-05, "loss": 0.3678, "num_input_tokens_seen": 8355592, "step": 12815 }, { "epoch": 6.719077568134172, "grad_norm": 0.2964605391025543, "learning_rate": 1.4683886540646468e-05, "loss": 0.3767, "num_input_tokens_seen": 8357736, "step": 12820 }, { "epoch": 6.721698113207547, "grad_norm": 0.42157304286956787, "learning_rate": 1.4663060059108282e-05, "loss": 0.3875, "num_input_tokens_seen": 8362568, "step": 12825 }, { "epoch": 6.7243186582809225, "grad_norm": 0.3800501823425293, "learning_rate": 1.464224222704926e-05, "loss": 0.5004, "num_input_tokens_seen": 8366152, "step": 12830 }, { "epoch": 6.726939203354298, "grad_norm": 1.1690469980239868, "learning_rate": 1.462143306188882e-05, "loss": 0.4625, "num_input_tokens_seen": 8368456, "step": 12835 }, { "epoch": 6.729559748427673, "grad_norm": 0.5970911383628845, "learning_rate": 1.4600632581039123e-05, "loss": 0.5012, "num_input_tokens_seen": 8371560, "step": 12840 }, { "epoch": 6.732180293501048, "grad_norm": 0.6082143783569336, "learning_rate": 1.457984080190506e-05, "loss": 0.4591, "num_input_tokens_seen": 8374792, "step": 12845 }, { "epoch": 6.734800838574423, "grad_norm": 0.566372811794281, "learning_rate": 1.4559057741884227e-05, "loss": 0.4448, "num_input_tokens_seen": 8377832, "step": 12850 }, { "epoch": 6.737421383647799, "grad_norm": 0.33914715051651, "learning_rate": 1.4538283418366965e-05, "loss": 0.5263, "num_input_tokens_seen": 8380872, "step": 12855 }, { "epoch": 6.740041928721174, "grad_norm": 0.3584570586681366, "learning_rate": 1.4517517848736267e-05, "loss": 0.4498, "num_input_tokens_seen": 8384264, "step": 12860 }, { "epoch": 6.7426624737945495, "grad_norm": 0.2919985353946686, "learning_rate": 1.449676105036781e-05, "loss": 0.49, "num_input_tokens_seen": 8389064, "step": 12865 }, { "epoch": 6.745283018867925, "grad_norm": 0.5387247800827026, "learning_rate": 1.4476013040629938e-05, "loss": 0.5422, "num_input_tokens_seen": 8391976, "step": 12870 }, { "epoch": 6.7479035639413, "grad_norm": 0.3708752989768982, "learning_rate": 1.4455273836883629e-05, "loss": 0.3688, "num_input_tokens_seen": 8395176, "step": 12875 }, { "epoch": 6.750524109014675, "grad_norm": 0.4530501067638397, "learning_rate": 1.443454345648252e-05, "loss": 0.5388, "num_input_tokens_seen": 8398216, "step": 12880 }, { "epoch": 6.75314465408805, "grad_norm": 0.31473878026008606, "learning_rate": 1.4413821916772832e-05, "loss": 0.516, "num_input_tokens_seen": 8402120, "step": 12885 }, { "epoch": 6.755765199161425, "grad_norm": 0.48799440264701843, "learning_rate": 1.4393109235093399e-05, "loss": 0.5239, "num_input_tokens_seen": 8405736, "step": 12890 }, { "epoch": 6.7583857442348005, "grad_norm": 1.3935370445251465, "learning_rate": 1.4372405428775664e-05, "loss": 0.5838, "num_input_tokens_seen": 8408808, "step": 12895 }, { "epoch": 6.761006289308176, "grad_norm": 0.4571623206138611, "learning_rate": 1.4351710515143618e-05, "loss": 0.3883, "num_input_tokens_seen": 8411400, "step": 12900 }, { "epoch": 6.763626834381551, "grad_norm": 0.4232812225818634, "learning_rate": 1.4331024511513808e-05, "loss": 0.3395, "num_input_tokens_seen": 8414632, "step": 12905 }, { "epoch": 6.766247379454927, "grad_norm": 0.7446792125701904, "learning_rate": 1.4310347435195368e-05, "loss": 0.4901, "num_input_tokens_seen": 8417256, "step": 12910 }, { "epoch": 6.768867924528302, "grad_norm": 0.4241424798965454, "learning_rate": 1.428967930348989e-05, "loss": 0.3976, "num_input_tokens_seen": 8420584, "step": 12915 }, { "epoch": 6.771488469601677, "grad_norm": 0.38851702213287354, "learning_rate": 1.4269020133691542e-05, "loss": 0.5792, "num_input_tokens_seen": 8423912, "step": 12920 }, { "epoch": 6.774109014675052, "grad_norm": 0.511327862739563, "learning_rate": 1.4248369943086998e-05, "loss": 0.358, "num_input_tokens_seen": 8427368, "step": 12925 }, { "epoch": 6.776729559748428, "grad_norm": 0.3651634156703949, "learning_rate": 1.4227728748955345e-05, "loss": 0.4639, "num_input_tokens_seen": 8431176, "step": 12930 }, { "epoch": 6.779350104821803, "grad_norm": 0.3457644581794739, "learning_rate": 1.4207096568568232e-05, "loss": 0.4421, "num_input_tokens_seen": 8433768, "step": 12935 }, { "epoch": 6.781970649895178, "grad_norm": 0.47211411595344543, "learning_rate": 1.418647341918971e-05, "loss": 0.3918, "num_input_tokens_seen": 8437384, "step": 12940 }, { "epoch": 6.784591194968553, "grad_norm": 0.43858909606933594, "learning_rate": 1.4165859318076276e-05, "loss": 0.3791, "num_input_tokens_seen": 8441192, "step": 12945 }, { "epoch": 6.787211740041929, "grad_norm": 0.4380676746368408, "learning_rate": 1.4145254282476895e-05, "loss": 0.6086, "num_input_tokens_seen": 8446504, "step": 12950 }, { "epoch": 6.789832285115304, "grad_norm": 0.4793511629104614, "learning_rate": 1.4124658329632901e-05, "loss": 0.3998, "num_input_tokens_seen": 8449384, "step": 12955 }, { "epoch": 6.7924528301886795, "grad_norm": 0.6473883390426636, "learning_rate": 1.4104071476778044e-05, "loss": 0.4276, "num_input_tokens_seen": 8452456, "step": 12960 }, { "epoch": 6.795073375262055, "grad_norm": 0.494288831949234, "learning_rate": 1.4083493741138486e-05, "loss": 0.5145, "num_input_tokens_seen": 8457000, "step": 12965 }, { "epoch": 6.79769392033543, "grad_norm": 0.6831126809120178, "learning_rate": 1.4062925139932703e-05, "loss": 0.4468, "num_input_tokens_seen": 8461576, "step": 12970 }, { "epoch": 6.800314465408805, "grad_norm": 0.7015981674194336, "learning_rate": 1.4042365690371587e-05, "loss": 0.4556, "num_input_tokens_seen": 8464712, "step": 12975 }, { "epoch": 6.80293501048218, "grad_norm": 0.4629092216491699, "learning_rate": 1.4021815409658335e-05, "loss": 0.3793, "num_input_tokens_seen": 8467880, "step": 12980 }, { "epoch": 6.805555555555555, "grad_norm": 0.4474045932292938, "learning_rate": 1.4001274314988475e-05, "loss": 0.4, "num_input_tokens_seen": 8471208, "step": 12985 }, { "epoch": 6.8081761006289305, "grad_norm": 0.3800627291202545, "learning_rate": 1.3980742423549875e-05, "loss": 0.488, "num_input_tokens_seen": 8474248, "step": 12990 }, { "epoch": 6.810796645702306, "grad_norm": 0.496418833732605, "learning_rate": 1.3960219752522679e-05, "loss": 0.3882, "num_input_tokens_seen": 8477064, "step": 12995 }, { "epoch": 6.813417190775681, "grad_norm": 0.6631186008453369, "learning_rate": 1.3939706319079305e-05, "loss": 0.5138, "num_input_tokens_seen": 8479528, "step": 13000 }, { "epoch": 6.816037735849057, "grad_norm": 0.4970797002315521, "learning_rate": 1.391920214038448e-05, "loss": 0.3577, "num_input_tokens_seen": 8482856, "step": 13005 }, { "epoch": 6.818658280922432, "grad_norm": 0.2726325988769531, "learning_rate": 1.3898707233595153e-05, "loss": 0.4742, "num_input_tokens_seen": 8486312, "step": 13010 }, { "epoch": 6.821278825995807, "grad_norm": 0.4191387891769409, "learning_rate": 1.3878221615860527e-05, "loss": 0.5163, "num_input_tokens_seen": 8489096, "step": 13015 }, { "epoch": 6.823899371069182, "grad_norm": 0.2501313090324402, "learning_rate": 1.3857745304322017e-05, "loss": 0.5142, "num_input_tokens_seen": 8493448, "step": 13020 }, { "epoch": 6.826519916142558, "grad_norm": 0.36944448947906494, "learning_rate": 1.3837278316113293e-05, "loss": 0.345, "num_input_tokens_seen": 8496968, "step": 13025 }, { "epoch": 6.829140461215933, "grad_norm": 1.87282395362854, "learning_rate": 1.3816820668360177e-05, "loss": 0.3951, "num_input_tokens_seen": 8500616, "step": 13030 }, { "epoch": 6.831761006289308, "grad_norm": 0.42366766929626465, "learning_rate": 1.3796372378180691e-05, "loss": 0.4916, "num_input_tokens_seen": 8504136, "step": 13035 }, { "epoch": 6.834381551362683, "grad_norm": 0.42432525753974915, "learning_rate": 1.3775933462685047e-05, "loss": 0.5406, "num_input_tokens_seen": 8507464, "step": 13040 }, { "epoch": 6.837002096436059, "grad_norm": 0.6262378096580505, "learning_rate": 1.375550393897559e-05, "loss": 0.5374, "num_input_tokens_seen": 8510216, "step": 13045 }, { "epoch": 6.839622641509434, "grad_norm": 0.4411218762397766, "learning_rate": 1.3735083824146793e-05, "loss": 0.522, "num_input_tokens_seen": 8513960, "step": 13050 }, { "epoch": 6.8422431865828095, "grad_norm": 0.3125257194042206, "learning_rate": 1.3714673135285316e-05, "loss": 0.4572, "num_input_tokens_seen": 8517768, "step": 13055 }, { "epoch": 6.844863731656185, "grad_norm": 0.31998467445373535, "learning_rate": 1.3694271889469844e-05, "loss": 0.5505, "num_input_tokens_seen": 8521640, "step": 13060 }, { "epoch": 6.84748427672956, "grad_norm": 0.49472135305404663, "learning_rate": 1.3673880103771241e-05, "loss": 0.428, "num_input_tokens_seen": 8524968, "step": 13065 }, { "epoch": 6.850104821802935, "grad_norm": 0.7491679787635803, "learning_rate": 1.365349779525241e-05, "loss": 0.6666, "num_input_tokens_seen": 8528200, "step": 13070 }, { "epoch": 6.85272536687631, "grad_norm": 0.42926257848739624, "learning_rate": 1.3633124980968327e-05, "loss": 0.4676, "num_input_tokens_seen": 8531144, "step": 13075 }, { "epoch": 6.855345911949685, "grad_norm": 0.47461768984794617, "learning_rate": 1.3612761677966051e-05, "loss": 0.4911, "num_input_tokens_seen": 8535240, "step": 13080 }, { "epoch": 6.8579664570230605, "grad_norm": 0.33324846625328064, "learning_rate": 1.3592407903284654e-05, "loss": 0.522, "num_input_tokens_seen": 8537352, "step": 13085 }, { "epoch": 6.860587002096436, "grad_norm": 0.4530034363269806, "learning_rate": 1.3572063673955238e-05, "loss": 0.4106, "num_input_tokens_seen": 8540776, "step": 13090 }, { "epoch": 6.863207547169811, "grad_norm": 0.5268626809120178, "learning_rate": 1.355172900700095e-05, "loss": 0.5231, "num_input_tokens_seen": 8543944, "step": 13095 }, { "epoch": 6.865828092243187, "grad_norm": 0.5538097023963928, "learning_rate": 1.3531403919436875e-05, "loss": 0.4773, "num_input_tokens_seen": 8546984, "step": 13100 }, { "epoch": 6.868448637316562, "grad_norm": 0.834053635597229, "learning_rate": 1.3511088428270142e-05, "loss": 0.8102, "num_input_tokens_seen": 8550376, "step": 13105 }, { "epoch": 6.871069182389937, "grad_norm": 0.6208744049072266, "learning_rate": 1.3490782550499823e-05, "loss": 0.5041, "num_input_tokens_seen": 8553512, "step": 13110 }, { "epoch": 6.873689727463312, "grad_norm": 0.3843320906162262, "learning_rate": 1.3470486303116936e-05, "loss": 0.4401, "num_input_tokens_seen": 8557320, "step": 13115 }, { "epoch": 6.876310272536688, "grad_norm": 0.45370224118232727, "learning_rate": 1.3450199703104471e-05, "loss": 0.4168, "num_input_tokens_seen": 8559848, "step": 13120 }, { "epoch": 6.878930817610063, "grad_norm": 0.2610209882259369, "learning_rate": 1.3429922767437319e-05, "loss": 0.3103, "num_input_tokens_seen": 8563400, "step": 13125 }, { "epoch": 6.881551362683438, "grad_norm": 0.3969069719314575, "learning_rate": 1.3409655513082291e-05, "loss": 0.4509, "num_input_tokens_seen": 8566728, "step": 13130 }, { "epoch": 6.884171907756813, "grad_norm": 0.6348926424980164, "learning_rate": 1.3389397956998111e-05, "loss": 0.4329, "num_input_tokens_seen": 8570056, "step": 13135 }, { "epoch": 6.886792452830189, "grad_norm": 0.4630921185016632, "learning_rate": 1.336915011613537e-05, "loss": 0.4386, "num_input_tokens_seen": 8573064, "step": 13140 }, { "epoch": 6.889412997903564, "grad_norm": 0.4288502633571625, "learning_rate": 1.3348912007436537e-05, "loss": 0.5728, "num_input_tokens_seen": 8577640, "step": 13145 }, { "epoch": 6.8920335429769395, "grad_norm": 0.3992661237716675, "learning_rate": 1.3328683647835933e-05, "loss": 0.5884, "num_input_tokens_seen": 8581256, "step": 13150 }, { "epoch": 6.894654088050315, "grad_norm": 0.37798523902893066, "learning_rate": 1.330846505425972e-05, "loss": 0.3963, "num_input_tokens_seen": 8584168, "step": 13155 }, { "epoch": 6.89727463312369, "grad_norm": 0.32295307517051697, "learning_rate": 1.3288256243625911e-05, "loss": 0.484, "num_input_tokens_seen": 8587432, "step": 13160 }, { "epoch": 6.899895178197065, "grad_norm": 0.3624807894229889, "learning_rate": 1.3268057232844305e-05, "loss": 0.4996, "num_input_tokens_seen": 8590728, "step": 13165 }, { "epoch": 6.90251572327044, "grad_norm": 0.43528714776039124, "learning_rate": 1.3247868038816504e-05, "loss": 0.3895, "num_input_tokens_seen": 8594120, "step": 13170 }, { "epoch": 6.905136268343815, "grad_norm": 0.3510453402996063, "learning_rate": 1.3227688678435924e-05, "loss": 0.4827, "num_input_tokens_seen": 8596904, "step": 13175 }, { "epoch": 6.9077568134171905, "grad_norm": 0.39852309226989746, "learning_rate": 1.3207519168587717e-05, "loss": 0.4127, "num_input_tokens_seen": 8599848, "step": 13180 }, { "epoch": 6.910377358490566, "grad_norm": 0.2761901021003723, "learning_rate": 1.3187359526148813e-05, "loss": 0.43, "num_input_tokens_seen": 8603272, "step": 13185 }, { "epoch": 6.912997903563941, "grad_norm": 0.3447844088077545, "learning_rate": 1.3167209767987868e-05, "loss": 0.4496, "num_input_tokens_seen": 8607240, "step": 13190 }, { "epoch": 6.915618448637317, "grad_norm": 1.2839823961257935, "learning_rate": 1.3147069910965298e-05, "loss": 0.6187, "num_input_tokens_seen": 8613160, "step": 13195 }, { "epoch": 6.918238993710692, "grad_norm": 0.404819130897522, "learning_rate": 1.3126939971933205e-05, "loss": 0.4897, "num_input_tokens_seen": 8617480, "step": 13200 }, { "epoch": 6.920859538784067, "grad_norm": 0.3168332874774933, "learning_rate": 1.3106819967735395e-05, "loss": 0.4582, "num_input_tokens_seen": 8620520, "step": 13205 }, { "epoch": 6.923480083857442, "grad_norm": 0.44692838191986084, "learning_rate": 1.3086709915207388e-05, "loss": 0.3354, "num_input_tokens_seen": 8626568, "step": 13210 }, { "epoch": 6.926100628930818, "grad_norm": 0.3357786238193512, "learning_rate": 1.3066609831176346e-05, "loss": 0.4943, "num_input_tokens_seen": 8630184, "step": 13215 }, { "epoch": 6.928721174004193, "grad_norm": 0.4902229905128479, "learning_rate": 1.3046519732461094e-05, "loss": 0.4328, "num_input_tokens_seen": 8633576, "step": 13220 }, { "epoch": 6.931341719077568, "grad_norm": 0.3834441006183624, "learning_rate": 1.302643963587213e-05, "loss": 0.4602, "num_input_tokens_seen": 8636520, "step": 13225 }, { "epoch": 6.933962264150943, "grad_norm": 0.40706491470336914, "learning_rate": 1.3006369558211534e-05, "loss": 0.3411, "num_input_tokens_seen": 8639816, "step": 13230 }, { "epoch": 6.936582809224319, "grad_norm": 0.5381856560707092, "learning_rate": 1.2986309516273043e-05, "loss": 0.5189, "num_input_tokens_seen": 8642888, "step": 13235 }, { "epoch": 6.939203354297694, "grad_norm": 0.42364218831062317, "learning_rate": 1.2966259526842006e-05, "loss": 0.4902, "num_input_tokens_seen": 8645704, "step": 13240 }, { "epoch": 6.9418238993710695, "grad_norm": 0.393659770488739, "learning_rate": 1.2946219606695297e-05, "loss": 0.4196, "num_input_tokens_seen": 8648648, "step": 13245 }, { "epoch": 6.944444444444445, "grad_norm": 0.4570927917957306, "learning_rate": 1.2926189772601438e-05, "loss": 0.4132, "num_input_tokens_seen": 8651560, "step": 13250 }, { "epoch": 6.94706498951782, "grad_norm": 0.2931286096572876, "learning_rate": 1.2906170041320468e-05, "loss": 0.5475, "num_input_tokens_seen": 8655208, "step": 13255 }, { "epoch": 6.949685534591195, "grad_norm": 0.42560455203056335, "learning_rate": 1.2886160429603972e-05, "loss": 0.327, "num_input_tokens_seen": 8657576, "step": 13260 }, { "epoch": 6.95230607966457, "grad_norm": 0.6545131802558899, "learning_rate": 1.2866160954195112e-05, "loss": 0.7088, "num_input_tokens_seen": 8660872, "step": 13265 }, { "epoch": 6.954926624737945, "grad_norm": 0.4320824146270752, "learning_rate": 1.284617163182849e-05, "loss": 0.4561, "num_input_tokens_seen": 8665192, "step": 13270 }, { "epoch": 6.9575471698113205, "grad_norm": 0.19313941895961761, "learning_rate": 1.2826192479230287e-05, "loss": 0.4372, "num_input_tokens_seen": 8670088, "step": 13275 }, { "epoch": 6.960167714884696, "grad_norm": 0.8315659165382385, "learning_rate": 1.2806223513118154e-05, "loss": 0.4827, "num_input_tokens_seen": 8673576, "step": 13280 }, { "epoch": 6.962788259958071, "grad_norm": 0.4400761127471924, "learning_rate": 1.2786264750201182e-05, "loss": 0.658, "num_input_tokens_seen": 8676424, "step": 13285 }, { "epoch": 6.965408805031447, "grad_norm": 0.39188578724861145, "learning_rate": 1.2766316207179973e-05, "loss": 0.4257, "num_input_tokens_seen": 8679912, "step": 13290 }, { "epoch": 6.968029350104822, "grad_norm": 0.37157270312309265, "learning_rate": 1.2746377900746548e-05, "loss": 0.7082, "num_input_tokens_seen": 8683880, "step": 13295 }, { "epoch": 6.970649895178197, "grad_norm": 0.36229413747787476, "learning_rate": 1.2726449847584365e-05, "loss": 0.546, "num_input_tokens_seen": 8687784, "step": 13300 }, { "epoch": 6.973270440251572, "grad_norm": 0.3684808909893036, "learning_rate": 1.2706532064368326e-05, "loss": 0.4373, "num_input_tokens_seen": 8690920, "step": 13305 }, { "epoch": 6.975890985324948, "grad_norm": 0.7946386337280273, "learning_rate": 1.268662456776471e-05, "loss": 0.3679, "num_input_tokens_seen": 8693960, "step": 13310 }, { "epoch": 6.978511530398323, "grad_norm": 0.26071831583976746, "learning_rate": 1.2666727374431198e-05, "loss": 0.5204, "num_input_tokens_seen": 8697640, "step": 13315 }, { "epoch": 6.981132075471698, "grad_norm": 0.39055824279785156, "learning_rate": 1.2646840501016863e-05, "loss": 0.6867, "num_input_tokens_seen": 8699912, "step": 13320 }, { "epoch": 6.983752620545073, "grad_norm": 0.4975760281085968, "learning_rate": 1.262696396416213e-05, "loss": 0.6114, "num_input_tokens_seen": 8703240, "step": 13325 }, { "epoch": 6.986373165618449, "grad_norm": 0.4282698333263397, "learning_rate": 1.2607097780498772e-05, "loss": 0.3775, "num_input_tokens_seen": 8706376, "step": 13330 }, { "epoch": 6.988993710691824, "grad_norm": 0.5614853501319885, "learning_rate": 1.2587241966649908e-05, "loss": 0.5826, "num_input_tokens_seen": 8709224, "step": 13335 }, { "epoch": 6.9916142557651995, "grad_norm": 0.9557355642318726, "learning_rate": 1.2567396539229965e-05, "loss": 0.5492, "num_input_tokens_seen": 8711816, "step": 13340 }, { "epoch": 6.994234800838575, "grad_norm": 0.753890335559845, "learning_rate": 1.2547561514844704e-05, "loss": 0.4695, "num_input_tokens_seen": 8715496, "step": 13345 }, { "epoch": 6.99685534591195, "grad_norm": 0.4036840796470642, "learning_rate": 1.2527736910091168e-05, "loss": 0.3926, "num_input_tokens_seen": 8718312, "step": 13350 }, { "epoch": 6.999475890985325, "grad_norm": 0.5733431577682495, "learning_rate": 1.2507922741557665e-05, "loss": 0.4523, "num_input_tokens_seen": 8722632, "step": 13355 }, { "epoch": 7.0, "eval_loss": 0.4842182397842407, "eval_runtime": 14.5348, "eval_samples_per_second": 58.343, "eval_steps_per_second": 14.586, "num_input_tokens_seen": 8722744, "step": 13356 }, { "epoch": 7.0020964360587, "grad_norm": 0.5395805239677429, "learning_rate": 1.2488119025823802e-05, "loss": 0.4657, "num_input_tokens_seen": 8725944, "step": 13360 }, { "epoch": 7.004716981132075, "grad_norm": 0.3620794117450714, "learning_rate": 1.2468325779460424e-05, "loss": 0.4733, "num_input_tokens_seen": 8729592, "step": 13365 }, { "epoch": 7.0073375262054505, "grad_norm": 0.5554630160331726, "learning_rate": 1.2448543019029607e-05, "loss": 0.4146, "num_input_tokens_seen": 8734264, "step": 13370 }, { "epoch": 7.009958071278826, "grad_norm": 0.30568405985832214, "learning_rate": 1.2428770761084655e-05, "loss": 0.4052, "num_input_tokens_seen": 8738744, "step": 13375 }, { "epoch": 7.012578616352202, "grad_norm": 0.45657098293304443, "learning_rate": 1.2409009022170109e-05, "loss": 0.398, "num_input_tokens_seen": 8741720, "step": 13380 }, { "epoch": 7.015199161425577, "grad_norm": 0.4089142680168152, "learning_rate": 1.2389257818821679e-05, "loss": 0.4484, "num_input_tokens_seen": 8745112, "step": 13385 }, { "epoch": 7.017819706498952, "grad_norm": 1.3226182460784912, "learning_rate": 1.236951716756626e-05, "loss": 0.4275, "num_input_tokens_seen": 8747960, "step": 13390 }, { "epoch": 7.020440251572327, "grad_norm": 0.654438853263855, "learning_rate": 1.2349787084921952e-05, "loss": 0.4989, "num_input_tokens_seen": 8750808, "step": 13395 }, { "epoch": 7.023060796645702, "grad_norm": 0.4057759940624237, "learning_rate": 1.233006758739797e-05, "loss": 0.4771, "num_input_tokens_seen": 8753368, "step": 13400 }, { "epoch": 7.0256813417190775, "grad_norm": 0.7370367050170898, "learning_rate": 1.2310358691494681e-05, "loss": 0.5113, "num_input_tokens_seen": 8756056, "step": 13405 }, { "epoch": 7.028301886792453, "grad_norm": 0.6705074906349182, "learning_rate": 1.229066041370362e-05, "loss": 0.4514, "num_input_tokens_seen": 8759256, "step": 13410 }, { "epoch": 7.030922431865828, "grad_norm": 0.7800843715667725, "learning_rate": 1.2270972770507364e-05, "loss": 0.4, "num_input_tokens_seen": 8763032, "step": 13415 }, { "epoch": 7.033542976939203, "grad_norm": 0.9066664576530457, "learning_rate": 1.2251295778379657e-05, "loss": 0.3455, "num_input_tokens_seen": 8766072, "step": 13420 }, { "epoch": 7.036163522012578, "grad_norm": 0.6673932075500488, "learning_rate": 1.2231629453785324e-05, "loss": 0.4704, "num_input_tokens_seen": 8769976, "step": 13425 }, { "epoch": 7.038784067085954, "grad_norm": 0.37678518891334534, "learning_rate": 1.2211973813180209e-05, "loss": 0.5278, "num_input_tokens_seen": 8773528, "step": 13430 }, { "epoch": 7.0414046121593294, "grad_norm": 0.8804860711097717, "learning_rate": 1.2192328873011283e-05, "loss": 0.4931, "num_input_tokens_seen": 8776440, "step": 13435 }, { "epoch": 7.044025157232705, "grad_norm": 0.3268956243991852, "learning_rate": 1.2172694649716524e-05, "loss": 0.346, "num_input_tokens_seen": 8780408, "step": 13440 }, { "epoch": 7.04664570230608, "grad_norm": 0.36857840418815613, "learning_rate": 1.2153071159724947e-05, "loss": 0.4099, "num_input_tokens_seen": 8782872, "step": 13445 }, { "epoch": 7.049266247379455, "grad_norm": 0.2746488153934479, "learning_rate": 1.2133458419456614e-05, "loss": 0.4454, "num_input_tokens_seen": 8786776, "step": 13450 }, { "epoch": 7.05188679245283, "grad_norm": 0.40311381220817566, "learning_rate": 1.2113856445322541e-05, "loss": 0.5108, "num_input_tokens_seen": 8791576, "step": 13455 }, { "epoch": 7.054507337526205, "grad_norm": 0.6362214684486389, "learning_rate": 1.2094265253724777e-05, "loss": 0.4301, "num_input_tokens_seen": 8794072, "step": 13460 }, { "epoch": 7.0571278825995805, "grad_norm": 0.3203834593296051, "learning_rate": 1.207468486105636e-05, "loss": 0.4806, "num_input_tokens_seen": 8798424, "step": 13465 }, { "epoch": 7.059748427672956, "grad_norm": 0.6123035550117493, "learning_rate": 1.2055115283701224e-05, "loss": 0.3708, "num_input_tokens_seen": 8801208, "step": 13470 }, { "epoch": 7.062368972746331, "grad_norm": 0.47066888213157654, "learning_rate": 1.2035556538034332e-05, "loss": 0.4908, "num_input_tokens_seen": 8804472, "step": 13475 }, { "epoch": 7.064989517819707, "grad_norm": 0.3063379228115082, "learning_rate": 1.2016008640421533e-05, "loss": 0.5195, "num_input_tokens_seen": 8808152, "step": 13480 }, { "epoch": 7.067610062893082, "grad_norm": 0.3440614640712738, "learning_rate": 1.1996471607219612e-05, "loss": 0.4188, "num_input_tokens_seen": 8811352, "step": 13485 }, { "epoch": 7.070230607966457, "grad_norm": 1.0806387662887573, "learning_rate": 1.1976945454776284e-05, "loss": 0.457, "num_input_tokens_seen": 8814328, "step": 13490 }, { "epoch": 7.072851153039832, "grad_norm": 0.2454414963722229, "learning_rate": 1.1957430199430128e-05, "loss": 0.5045, "num_input_tokens_seen": 8817976, "step": 13495 }, { "epoch": 7.0754716981132075, "grad_norm": 0.4264254868030548, "learning_rate": 1.1937925857510609e-05, "loss": 0.3792, "num_input_tokens_seen": 8820856, "step": 13500 }, { "epoch": 7.078092243186583, "grad_norm": 0.5929949283599854, "learning_rate": 1.1918432445338092e-05, "loss": 0.5623, "num_input_tokens_seen": 8824120, "step": 13505 }, { "epoch": 7.080712788259958, "grad_norm": 0.44693276286125183, "learning_rate": 1.1898949979223765e-05, "loss": 0.4928, "num_input_tokens_seen": 8827192, "step": 13510 }, { "epoch": 7.083333333333333, "grad_norm": 0.603874921798706, "learning_rate": 1.187947847546966e-05, "loss": 0.3918, "num_input_tokens_seen": 8831064, "step": 13515 }, { "epoch": 7.085953878406708, "grad_norm": 0.2983558475971222, "learning_rate": 1.1860017950368646e-05, "loss": 0.5441, "num_input_tokens_seen": 8834200, "step": 13520 }, { "epoch": 7.088574423480084, "grad_norm": 0.5397470593452454, "learning_rate": 1.1840568420204392e-05, "loss": 0.3798, "num_input_tokens_seen": 8837240, "step": 13525 }, { "epoch": 7.091194968553459, "grad_norm": 0.3084793984889984, "learning_rate": 1.1821129901251396e-05, "loss": 0.6169, "num_input_tokens_seen": 8841976, "step": 13530 }, { "epoch": 7.093815513626835, "grad_norm": 0.5730765461921692, "learning_rate": 1.1801702409774909e-05, "loss": 0.3837, "num_input_tokens_seen": 8845336, "step": 13535 }, { "epoch": 7.09643605870021, "grad_norm": 0.40109875798225403, "learning_rate": 1.1782285962030965e-05, "loss": 0.4349, "num_input_tokens_seen": 8848184, "step": 13540 }, { "epoch": 7.099056603773585, "grad_norm": 1.2493637800216675, "learning_rate": 1.1762880574266374e-05, "loss": 0.5056, "num_input_tokens_seen": 8850840, "step": 13545 }, { "epoch": 7.10167714884696, "grad_norm": 0.8892948031425476, "learning_rate": 1.1743486262718673e-05, "loss": 0.5805, "num_input_tokens_seen": 8854296, "step": 13550 }, { "epoch": 7.104297693920335, "grad_norm": 0.38761910796165466, "learning_rate": 1.1724103043616134e-05, "loss": 0.4247, "num_input_tokens_seen": 8857560, "step": 13555 }, { "epoch": 7.1069182389937104, "grad_norm": 0.24405501782894135, "learning_rate": 1.1704730933177738e-05, "loss": 0.4363, "num_input_tokens_seen": 8861496, "step": 13560 }, { "epoch": 7.109538784067086, "grad_norm": 0.48155903816223145, "learning_rate": 1.1685369947613204e-05, "loss": 0.4647, "num_input_tokens_seen": 8864344, "step": 13565 }, { "epoch": 7.112159329140461, "grad_norm": 0.24509063363075256, "learning_rate": 1.1666020103122907e-05, "loss": 0.5372, "num_input_tokens_seen": 8868152, "step": 13570 }, { "epoch": 7.114779874213837, "grad_norm": 0.5486131310462952, "learning_rate": 1.1646681415897912e-05, "loss": 0.332, "num_input_tokens_seen": 8870776, "step": 13575 }, { "epoch": 7.117400419287212, "grad_norm": 0.6681270003318787, "learning_rate": 1.1627353902119958e-05, "loss": 0.5564, "num_input_tokens_seen": 8873944, "step": 13580 }, { "epoch": 7.120020964360587, "grad_norm": 0.9472677111625671, "learning_rate": 1.1608037577961423e-05, "loss": 0.4991, "num_input_tokens_seen": 8876792, "step": 13585 }, { "epoch": 7.122641509433962, "grad_norm": 0.823262631893158, "learning_rate": 1.158873245958531e-05, "loss": 0.5652, "num_input_tokens_seen": 8879672, "step": 13590 }, { "epoch": 7.1252620545073375, "grad_norm": 0.6991091370582581, "learning_rate": 1.1569438563145297e-05, "loss": 0.3583, "num_input_tokens_seen": 8882904, "step": 13595 }, { "epoch": 7.127882599580713, "grad_norm": 0.5965636968612671, "learning_rate": 1.1550155904785587e-05, "loss": 0.376, "num_input_tokens_seen": 8886328, "step": 13600 }, { "epoch": 7.130503144654088, "grad_norm": 0.831207275390625, "learning_rate": 1.1530884500641063e-05, "loss": 0.4471, "num_input_tokens_seen": 8888888, "step": 13605 }, { "epoch": 7.133123689727463, "grad_norm": 0.42138251662254333, "learning_rate": 1.1511624366837143e-05, "loss": 0.4608, "num_input_tokens_seen": 8892536, "step": 13610 }, { "epoch": 7.135744234800838, "grad_norm": 0.29813581705093384, "learning_rate": 1.149237551948982e-05, "loss": 0.4249, "num_input_tokens_seen": 8895768, "step": 13615 }, { "epoch": 7.138364779874214, "grad_norm": 0.4604979157447815, "learning_rate": 1.147313797470567e-05, "loss": 0.446, "num_input_tokens_seen": 8899320, "step": 13620 }, { "epoch": 7.140985324947589, "grad_norm": 0.6269621849060059, "learning_rate": 1.1453911748581778e-05, "loss": 0.4508, "num_input_tokens_seen": 8901816, "step": 13625 }, { "epoch": 7.143605870020965, "grad_norm": 0.44285356998443604, "learning_rate": 1.1434696857205765e-05, "loss": 0.3743, "num_input_tokens_seen": 8904984, "step": 13630 }, { "epoch": 7.14622641509434, "grad_norm": 0.346809983253479, "learning_rate": 1.1415493316655804e-05, "loss": 0.4305, "num_input_tokens_seen": 8909272, "step": 13635 }, { "epoch": 7.148846960167715, "grad_norm": 0.32222411036491394, "learning_rate": 1.1396301143000499e-05, "loss": 0.5131, "num_input_tokens_seen": 8912088, "step": 13640 }, { "epoch": 7.15146750524109, "grad_norm": 0.4088734984397888, "learning_rate": 1.1377120352299014e-05, "loss": 0.5331, "num_input_tokens_seen": 8915160, "step": 13645 }, { "epoch": 7.154088050314465, "grad_norm": 0.2617134749889374, "learning_rate": 1.1357950960600955e-05, "loss": 0.4105, "num_input_tokens_seen": 8918200, "step": 13650 }, { "epoch": 7.15670859538784, "grad_norm": 0.3716651201248169, "learning_rate": 1.1338792983946376e-05, "loss": 0.362, "num_input_tokens_seen": 8921464, "step": 13655 }, { "epoch": 7.159329140461216, "grad_norm": 0.5372587442398071, "learning_rate": 1.1319646438365817e-05, "loss": 0.467, "num_input_tokens_seen": 8924344, "step": 13660 }, { "epoch": 7.161949685534591, "grad_norm": 0.6096907258033752, "learning_rate": 1.1300511339880227e-05, "loss": 0.4336, "num_input_tokens_seen": 8927352, "step": 13665 }, { "epoch": 7.164570230607967, "grad_norm": 0.45939329266548157, "learning_rate": 1.128138770450097e-05, "loss": 0.5172, "num_input_tokens_seen": 8930936, "step": 13670 }, { "epoch": 7.167190775681342, "grad_norm": 0.8592544794082642, "learning_rate": 1.126227554822985e-05, "loss": 0.5408, "num_input_tokens_seen": 8934296, "step": 13675 }, { "epoch": 7.169811320754717, "grad_norm": 0.2925727367401123, "learning_rate": 1.1243174887059038e-05, "loss": 0.3417, "num_input_tokens_seen": 8937016, "step": 13680 }, { "epoch": 7.172431865828092, "grad_norm": 0.4248013496398926, "learning_rate": 1.1224085736971093e-05, "loss": 0.3828, "num_input_tokens_seen": 8940792, "step": 13685 }, { "epoch": 7.1750524109014675, "grad_norm": 0.40254971385002136, "learning_rate": 1.1205008113938934e-05, "loss": 0.5186, "num_input_tokens_seen": 8944024, "step": 13690 }, { "epoch": 7.177672955974843, "grad_norm": 0.5735228061676025, "learning_rate": 1.1185942033925867e-05, "loss": 0.4604, "num_input_tokens_seen": 8946872, "step": 13695 }, { "epoch": 7.180293501048218, "grad_norm": 0.3270532488822937, "learning_rate": 1.1166887512885505e-05, "loss": 0.4398, "num_input_tokens_seen": 8950168, "step": 13700 }, { "epoch": 7.182914046121593, "grad_norm": 0.8420215845108032, "learning_rate": 1.11478445667618e-05, "loss": 0.4555, "num_input_tokens_seen": 8953112, "step": 13705 }, { "epoch": 7.185534591194968, "grad_norm": 0.655037522315979, "learning_rate": 1.1128813211489012e-05, "loss": 0.3852, "num_input_tokens_seen": 8956312, "step": 13710 }, { "epoch": 7.188155136268344, "grad_norm": 0.6925208568572998, "learning_rate": 1.1109793462991725e-05, "loss": 0.4128, "num_input_tokens_seen": 8959032, "step": 13715 }, { "epoch": 7.190775681341719, "grad_norm": 0.6950237154960632, "learning_rate": 1.109078533718479e-05, "loss": 0.5755, "num_input_tokens_seen": 8962744, "step": 13720 }, { "epoch": 7.193396226415095, "grad_norm": 0.4342074990272522, "learning_rate": 1.107178884997334e-05, "loss": 0.4815, "num_input_tokens_seen": 8966520, "step": 13725 }, { "epoch": 7.19601677148847, "grad_norm": 0.4458467364311218, "learning_rate": 1.1052804017252751e-05, "loss": 0.5364, "num_input_tokens_seen": 8969304, "step": 13730 }, { "epoch": 7.198637316561845, "grad_norm": 0.2721993029117584, "learning_rate": 1.1033830854908691e-05, "loss": 0.5209, "num_input_tokens_seen": 8973432, "step": 13735 }, { "epoch": 7.20125786163522, "grad_norm": 0.42018425464630127, "learning_rate": 1.1014869378817022e-05, "loss": 0.505, "num_input_tokens_seen": 8976248, "step": 13740 }, { "epoch": 7.203878406708595, "grad_norm": 0.4006314277648926, "learning_rate": 1.0995919604843832e-05, "loss": 0.4562, "num_input_tokens_seen": 8980792, "step": 13745 }, { "epoch": 7.20649895178197, "grad_norm": 0.442800909280777, "learning_rate": 1.0976981548845444e-05, "loss": 0.561, "num_input_tokens_seen": 8983480, "step": 13750 }, { "epoch": 7.209119496855346, "grad_norm": 0.8833749294281006, "learning_rate": 1.095805522666835e-05, "loss": 0.4953, "num_input_tokens_seen": 8986200, "step": 13755 }, { "epoch": 7.211740041928721, "grad_norm": 0.4952974021434784, "learning_rate": 1.0939140654149225e-05, "loss": 0.476, "num_input_tokens_seen": 8989816, "step": 13760 }, { "epoch": 7.214360587002097, "grad_norm": 0.36986759305000305, "learning_rate": 1.0920237847114944e-05, "loss": 0.2981, "num_input_tokens_seen": 8992600, "step": 13765 }, { "epoch": 7.216981132075472, "grad_norm": 0.4204326868057251, "learning_rate": 1.0901346821382476e-05, "loss": 0.4786, "num_input_tokens_seen": 8997432, "step": 13770 }, { "epoch": 7.219601677148847, "grad_norm": 0.7372080087661743, "learning_rate": 1.0882467592758989e-05, "loss": 0.5045, "num_input_tokens_seen": 8999864, "step": 13775 }, { "epoch": 7.222222222222222, "grad_norm": 0.3514323830604553, "learning_rate": 1.0863600177041772e-05, "loss": 0.3874, "num_input_tokens_seen": 9003160, "step": 13780 }, { "epoch": 7.2248427672955975, "grad_norm": 0.6287257671356201, "learning_rate": 1.0844744590018186e-05, "loss": 0.4551, "num_input_tokens_seen": 9006040, "step": 13785 }, { "epoch": 7.227463312368973, "grad_norm": 0.4436356723308563, "learning_rate": 1.0825900847465748e-05, "loss": 0.4399, "num_input_tokens_seen": 9009624, "step": 13790 }, { "epoch": 7.230083857442348, "grad_norm": 0.3765696585178375, "learning_rate": 1.0807068965152033e-05, "loss": 0.4549, "num_input_tokens_seen": 9013912, "step": 13795 }, { "epoch": 7.232704402515723, "grad_norm": 0.42348650097846985, "learning_rate": 1.0788248958834695e-05, "loss": 0.4483, "num_input_tokens_seen": 9017016, "step": 13800 }, { "epoch": 7.235324947589098, "grad_norm": 0.8103942275047302, "learning_rate": 1.0769440844261481e-05, "loss": 0.4119, "num_input_tokens_seen": 9019000, "step": 13805 }, { "epoch": 7.237945492662474, "grad_norm": 0.5958459377288818, "learning_rate": 1.0750644637170122e-05, "loss": 0.5409, "num_input_tokens_seen": 9022520, "step": 13810 }, { "epoch": 7.240566037735849, "grad_norm": 0.44800975918769836, "learning_rate": 1.0731860353288445e-05, "loss": 0.4822, "num_input_tokens_seen": 9025496, "step": 13815 }, { "epoch": 7.243186582809225, "grad_norm": 0.4381340742111206, "learning_rate": 1.0713088008334302e-05, "loss": 0.4545, "num_input_tokens_seen": 9029464, "step": 13820 }, { "epoch": 7.2458071278826, "grad_norm": 0.5151557326316833, "learning_rate": 1.0694327618015493e-05, "loss": 0.5289, "num_input_tokens_seen": 9032088, "step": 13825 }, { "epoch": 7.248427672955975, "grad_norm": 0.358010470867157, "learning_rate": 1.0675579198029887e-05, "loss": 0.4155, "num_input_tokens_seen": 9035768, "step": 13830 }, { "epoch": 7.25104821802935, "grad_norm": 0.4487888216972351, "learning_rate": 1.0656842764065295e-05, "loss": 0.4479, "num_input_tokens_seen": 9038840, "step": 13835 }, { "epoch": 7.253668763102725, "grad_norm": 0.4946368932723999, "learning_rate": 1.0638118331799499e-05, "loss": 0.3391, "num_input_tokens_seen": 9042328, "step": 13840 }, { "epoch": 7.2562893081761, "grad_norm": 0.8862960338592529, "learning_rate": 1.061940591690027e-05, "loss": 0.4588, "num_input_tokens_seen": 9045336, "step": 13845 }, { "epoch": 7.258909853249476, "grad_norm": 0.4834420084953308, "learning_rate": 1.0600705535025285e-05, "loss": 0.539, "num_input_tokens_seen": 9048312, "step": 13850 }, { "epoch": 7.261530398322851, "grad_norm": 0.3982839286327362, "learning_rate": 1.058201720182217e-05, "loss": 0.5835, "num_input_tokens_seen": 9052760, "step": 13855 }, { "epoch": 7.264150943396227, "grad_norm": 0.9687431454658508, "learning_rate": 1.056334093292848e-05, "loss": 0.6871, "num_input_tokens_seen": 9055864, "step": 13860 }, { "epoch": 7.266771488469602, "grad_norm": 0.7224364876747131, "learning_rate": 1.054467674397166e-05, "loss": 0.4694, "num_input_tokens_seen": 9058776, "step": 13865 }, { "epoch": 7.269392033542977, "grad_norm": 0.5382495522499084, "learning_rate": 1.0526024650569047e-05, "loss": 0.5862, "num_input_tokens_seen": 9062360, "step": 13870 }, { "epoch": 7.272012578616352, "grad_norm": 0.3016619384288788, "learning_rate": 1.0507384668327852e-05, "loss": 0.5815, "num_input_tokens_seen": 9065016, "step": 13875 }, { "epoch": 7.2746331236897275, "grad_norm": 0.4201703667640686, "learning_rate": 1.048875681284518e-05, "loss": 0.4926, "num_input_tokens_seen": 9071992, "step": 13880 }, { "epoch": 7.277253668763103, "grad_norm": 0.3598325252532959, "learning_rate": 1.0470141099707959e-05, "loss": 0.489, "num_input_tokens_seen": 9075672, "step": 13885 }, { "epoch": 7.279874213836478, "grad_norm": 0.5323907136917114, "learning_rate": 1.0451537544492968e-05, "loss": 0.4232, "num_input_tokens_seen": 9078936, "step": 13890 }, { "epoch": 7.282494758909853, "grad_norm": 0.619927704334259, "learning_rate": 1.0432946162766805e-05, "loss": 0.6413, "num_input_tokens_seen": 9082168, "step": 13895 }, { "epoch": 7.285115303983228, "grad_norm": 0.7167806029319763, "learning_rate": 1.0414366970085906e-05, "loss": 0.6091, "num_input_tokens_seen": 9084664, "step": 13900 }, { "epoch": 7.287735849056604, "grad_norm": 0.2936565577983856, "learning_rate": 1.0395799981996479e-05, "loss": 0.4407, "num_input_tokens_seen": 9087864, "step": 13905 }, { "epoch": 7.290356394129979, "grad_norm": 0.5249952077865601, "learning_rate": 1.0377245214034537e-05, "loss": 0.556, "num_input_tokens_seen": 9091416, "step": 13910 }, { "epoch": 7.2929769392033545, "grad_norm": 0.6793009042739868, "learning_rate": 1.0358702681725848e-05, "loss": 0.441, "num_input_tokens_seen": 9094296, "step": 13915 }, { "epoch": 7.29559748427673, "grad_norm": 0.515987753868103, "learning_rate": 1.0340172400585977e-05, "loss": 0.5282, "num_input_tokens_seen": 9097112, "step": 13920 }, { "epoch": 7.298218029350105, "grad_norm": 0.391217976808548, "learning_rate": 1.0321654386120205e-05, "loss": 0.4442, "num_input_tokens_seen": 9100632, "step": 13925 }, { "epoch": 7.30083857442348, "grad_norm": 0.45395782589912415, "learning_rate": 1.0303148653823557e-05, "loss": 0.4182, "num_input_tokens_seen": 9103864, "step": 13930 }, { "epoch": 7.303459119496855, "grad_norm": 0.5307629108428955, "learning_rate": 1.0284655219180797e-05, "loss": 0.4395, "num_input_tokens_seen": 9107928, "step": 13935 }, { "epoch": 7.30607966457023, "grad_norm": 0.4818200170993805, "learning_rate": 1.026617409766638e-05, "loss": 0.3812, "num_input_tokens_seen": 9111384, "step": 13940 }, { "epoch": 7.308700209643606, "grad_norm": 0.4250311255455017, "learning_rate": 1.0247705304744457e-05, "loss": 0.4442, "num_input_tokens_seen": 9114488, "step": 13945 }, { "epoch": 7.311320754716981, "grad_norm": 0.33563756942749023, "learning_rate": 1.0229248855868892e-05, "loss": 0.4918, "num_input_tokens_seen": 9117208, "step": 13950 }, { "epoch": 7.313941299790357, "grad_norm": 0.4634060859680176, "learning_rate": 1.0210804766483168e-05, "loss": 0.474, "num_input_tokens_seen": 9121176, "step": 13955 }, { "epoch": 7.316561844863732, "grad_norm": 0.4431760609149933, "learning_rate": 1.019237305202048e-05, "loss": 0.5463, "num_input_tokens_seen": 9124216, "step": 13960 }, { "epoch": 7.319182389937107, "grad_norm": 0.4284471869468689, "learning_rate": 1.0173953727903634e-05, "loss": 0.4906, "num_input_tokens_seen": 9126936, "step": 13965 }, { "epoch": 7.321802935010482, "grad_norm": 0.38460561633110046, "learning_rate": 1.0155546809545077e-05, "loss": 0.522, "num_input_tokens_seen": 9129688, "step": 13970 }, { "epoch": 7.3244234800838575, "grad_norm": 0.49615350365638733, "learning_rate": 1.013715231234689e-05, "loss": 0.4197, "num_input_tokens_seen": 9132152, "step": 13975 }, { "epoch": 7.327044025157233, "grad_norm": 0.5106218457221985, "learning_rate": 1.0118770251700741e-05, "loss": 0.5484, "num_input_tokens_seen": 9135352, "step": 13980 }, { "epoch": 7.329664570230608, "grad_norm": 0.29540160298347473, "learning_rate": 1.0100400642987886e-05, "loss": 0.6081, "num_input_tokens_seen": 9139064, "step": 13985 }, { "epoch": 7.332285115303983, "grad_norm": 0.4630134105682373, "learning_rate": 1.0082043501579205e-05, "loss": 0.4077, "num_input_tokens_seen": 9141624, "step": 13990 }, { "epoch": 7.334905660377358, "grad_norm": 0.4166030287742615, "learning_rate": 1.0063698842835082e-05, "loss": 0.388, "num_input_tokens_seen": 9144120, "step": 13995 }, { "epoch": 7.337526205450734, "grad_norm": 0.6813272833824158, "learning_rate": 1.0045366682105511e-05, "loss": 0.4158, "num_input_tokens_seen": 9147384, "step": 14000 }, { "epoch": 7.340146750524109, "grad_norm": 0.2801685929298401, "learning_rate": 1.002704703473e-05, "loss": 0.4341, "num_input_tokens_seen": 9150744, "step": 14005 }, { "epoch": 7.3427672955974845, "grad_norm": 0.3586546778678894, "learning_rate": 1.0008739916037585e-05, "loss": 0.4592, "num_input_tokens_seen": 9153560, "step": 14010 }, { "epoch": 7.34538784067086, "grad_norm": 0.4327365756034851, "learning_rate": 9.990445341346846e-06, "loss": 0.3336, "num_input_tokens_seen": 9156696, "step": 14015 }, { "epoch": 7.348008385744235, "grad_norm": 0.39712706208229065, "learning_rate": 9.972163325965833e-06, "loss": 0.3921, "num_input_tokens_seen": 9159416, "step": 14020 }, { "epoch": 7.35062893081761, "grad_norm": 0.36806532740592957, "learning_rate": 9.953893885192097e-06, "loss": 0.3561, "num_input_tokens_seen": 9162584, "step": 14025 }, { "epoch": 7.353249475890985, "grad_norm": 0.4345409572124481, "learning_rate": 9.93563703431269e-06, "loss": 0.4355, "num_input_tokens_seen": 9165848, "step": 14030 }, { "epoch": 7.35587002096436, "grad_norm": 0.5167527794837952, "learning_rate": 9.917392788604097e-06, "loss": 0.4104, "num_input_tokens_seen": 9168600, "step": 14035 }, { "epoch": 7.3584905660377355, "grad_norm": 0.34702497720718384, "learning_rate": 9.899161163332274e-06, "loss": 0.383, "num_input_tokens_seen": 9173336, "step": 14040 }, { "epoch": 7.361111111111111, "grad_norm": 0.45901721715927124, "learning_rate": 9.880942173752602e-06, "loss": 0.4746, "num_input_tokens_seen": 9176376, "step": 14045 }, { "epoch": 7.363731656184487, "grad_norm": 0.4365841746330261, "learning_rate": 9.862735835109915e-06, "loss": 0.4882, "num_input_tokens_seen": 9180088, "step": 14050 }, { "epoch": 7.366352201257862, "grad_norm": 1.2262449264526367, "learning_rate": 9.844542162638442e-06, "loss": 0.4193, "num_input_tokens_seen": 9183064, "step": 14055 }, { "epoch": 7.368972746331237, "grad_norm": 0.6127293705940247, "learning_rate": 9.826361171561802e-06, "loss": 0.4419, "num_input_tokens_seen": 9186040, "step": 14060 }, { "epoch": 7.371593291404612, "grad_norm": 0.46146294474601746, "learning_rate": 9.808192877093039e-06, "loss": 0.4439, "num_input_tokens_seen": 9189080, "step": 14065 }, { "epoch": 7.3742138364779874, "grad_norm": 0.29841381311416626, "learning_rate": 9.790037294434545e-06, "loss": 0.5834, "num_input_tokens_seen": 9191992, "step": 14070 }, { "epoch": 7.376834381551363, "grad_norm": 0.5886067748069763, "learning_rate": 9.771894438778075e-06, "loss": 0.4576, "num_input_tokens_seen": 9194520, "step": 14075 }, { "epoch": 7.379454926624738, "grad_norm": 0.7311380505561829, "learning_rate": 9.753764325304751e-06, "loss": 0.47, "num_input_tokens_seen": 9196632, "step": 14080 }, { "epoch": 7.382075471698113, "grad_norm": 0.5670555233955383, "learning_rate": 9.735646969185008e-06, "loss": 0.5281, "num_input_tokens_seen": 9199832, "step": 14085 }, { "epoch": 7.384696016771488, "grad_norm": 0.6207590103149414, "learning_rate": 9.717542385578645e-06, "loss": 0.4611, "num_input_tokens_seen": 9203384, "step": 14090 }, { "epoch": 7.387316561844864, "grad_norm": 0.9288303256034851, "learning_rate": 9.699450589634736e-06, "loss": 0.5118, "num_input_tokens_seen": 9206840, "step": 14095 }, { "epoch": 7.389937106918239, "grad_norm": 0.7077956199645996, "learning_rate": 9.681371596491665e-06, "loss": 0.6145, "num_input_tokens_seen": 9209432, "step": 14100 }, { "epoch": 7.3925576519916145, "grad_norm": 0.37193456292152405, "learning_rate": 9.663305421277125e-06, "loss": 0.4422, "num_input_tokens_seen": 9212056, "step": 14105 }, { "epoch": 7.39517819706499, "grad_norm": 0.38252854347229004, "learning_rate": 9.645252079108055e-06, "loss": 0.3708, "num_input_tokens_seen": 9215224, "step": 14110 }, { "epoch": 7.397798742138365, "grad_norm": 0.26825836300849915, "learning_rate": 9.62721158509066e-06, "loss": 0.3786, "num_input_tokens_seen": 9219672, "step": 14115 }, { "epoch": 7.40041928721174, "grad_norm": 0.6741531491279602, "learning_rate": 9.609183954320425e-06, "loss": 0.4643, "num_input_tokens_seen": 9222616, "step": 14120 }, { "epoch": 7.403039832285115, "grad_norm": 0.48960167169570923, "learning_rate": 9.59116920188202e-06, "loss": 0.6991, "num_input_tokens_seen": 9227032, "step": 14125 }, { "epoch": 7.40566037735849, "grad_norm": 1.1527243852615356, "learning_rate": 9.573167342849375e-06, "loss": 0.5212, "num_input_tokens_seen": 9229688, "step": 14130 }, { "epoch": 7.4082809224318655, "grad_norm": 0.4521636366844177, "learning_rate": 9.555178392285647e-06, "loss": 0.5048, "num_input_tokens_seen": 9233016, "step": 14135 }, { "epoch": 7.410901467505241, "grad_norm": 0.4331180155277252, "learning_rate": 9.53720236524313e-06, "loss": 0.4552, "num_input_tokens_seen": 9236504, "step": 14140 }, { "epoch": 7.413522012578617, "grad_norm": 0.4551527798175812, "learning_rate": 9.519239276763376e-06, "loss": 0.4415, "num_input_tokens_seen": 9240344, "step": 14145 }, { "epoch": 7.416142557651992, "grad_norm": 0.4968133568763733, "learning_rate": 9.501289141877056e-06, "loss": 0.4524, "num_input_tokens_seen": 9242904, "step": 14150 }, { "epoch": 7.418763102725367, "grad_norm": 0.8114441633224487, "learning_rate": 9.483351975604025e-06, "loss": 0.5756, "num_input_tokens_seen": 9246072, "step": 14155 }, { "epoch": 7.421383647798742, "grad_norm": 0.9903644323348999, "learning_rate": 9.465427792953293e-06, "loss": 0.4801, "num_input_tokens_seen": 9248696, "step": 14160 }, { "epoch": 7.424004192872117, "grad_norm": 0.40069711208343506, "learning_rate": 9.447516608922996e-06, "loss": 0.6365, "num_input_tokens_seen": 9251160, "step": 14165 }, { "epoch": 7.426624737945493, "grad_norm": 0.42869365215301514, "learning_rate": 9.429618438500381e-06, "loss": 0.5059, "num_input_tokens_seen": 9254040, "step": 14170 }, { "epoch": 7.429245283018868, "grad_norm": 0.2936114966869354, "learning_rate": 9.411733296661852e-06, "loss": 0.4872, "num_input_tokens_seen": 9257816, "step": 14175 }, { "epoch": 7.431865828092243, "grad_norm": 0.544668972492218, "learning_rate": 9.393861198372836e-06, "loss": 0.3941, "num_input_tokens_seen": 9260184, "step": 14180 }, { "epoch": 7.434486373165618, "grad_norm": 0.5712119936943054, "learning_rate": 9.376002158587915e-06, "loss": 0.5743, "num_input_tokens_seen": 9263032, "step": 14185 }, { "epoch": 7.437106918238994, "grad_norm": 0.44672784209251404, "learning_rate": 9.358156192250717e-06, "loss": 0.5431, "num_input_tokens_seen": 9267448, "step": 14190 }, { "epoch": 7.439727463312369, "grad_norm": 0.5566244721412659, "learning_rate": 9.340323314293917e-06, "loss": 0.4356, "num_input_tokens_seen": 9270200, "step": 14195 }, { "epoch": 7.4423480083857445, "grad_norm": 0.6168826818466187, "learning_rate": 9.322503539639269e-06, "loss": 0.4491, "num_input_tokens_seen": 9272856, "step": 14200 }, { "epoch": 7.44496855345912, "grad_norm": 0.2992514371871948, "learning_rate": 9.304696883197542e-06, "loss": 0.4326, "num_input_tokens_seen": 9276504, "step": 14205 }, { "epoch": 7.447589098532495, "grad_norm": 0.4613470435142517, "learning_rate": 9.286903359868518e-06, "loss": 0.5237, "num_input_tokens_seen": 9279736, "step": 14210 }, { "epoch": 7.45020964360587, "grad_norm": 0.6426288485527039, "learning_rate": 9.269122984541029e-06, "loss": 0.4127, "num_input_tokens_seen": 9282680, "step": 14215 }, { "epoch": 7.452830188679245, "grad_norm": 0.7196266651153564, "learning_rate": 9.251355772092867e-06, "loss": 0.4555, "num_input_tokens_seen": 9285912, "step": 14220 }, { "epoch": 7.45545073375262, "grad_norm": 0.5424767136573792, "learning_rate": 9.233601737390826e-06, "loss": 0.3677, "num_input_tokens_seen": 9289176, "step": 14225 }, { "epoch": 7.4580712788259955, "grad_norm": 0.6708307266235352, "learning_rate": 9.215860895290662e-06, "loss": 0.4364, "num_input_tokens_seen": 9291736, "step": 14230 }, { "epoch": 7.460691823899371, "grad_norm": 0.47235527634620667, "learning_rate": 9.198133260637121e-06, "loss": 0.4471, "num_input_tokens_seen": 9296312, "step": 14235 }, { "epoch": 7.463312368972747, "grad_norm": 0.8183172941207886, "learning_rate": 9.180418848263866e-06, "loss": 0.4072, "num_input_tokens_seen": 9298904, "step": 14240 }, { "epoch": 7.465932914046122, "grad_norm": 0.3845219910144806, "learning_rate": 9.162717672993499e-06, "loss": 0.4692, "num_input_tokens_seen": 9301432, "step": 14245 }, { "epoch": 7.468553459119497, "grad_norm": 0.41984957456588745, "learning_rate": 9.145029749637576e-06, "loss": 0.481, "num_input_tokens_seen": 9304376, "step": 14250 }, { "epoch": 7.471174004192872, "grad_norm": 0.5416908860206604, "learning_rate": 9.127355092996532e-06, "loss": 0.4525, "num_input_tokens_seen": 9307864, "step": 14255 }, { "epoch": 7.473794549266247, "grad_norm": 0.4936507046222687, "learning_rate": 9.10969371785971e-06, "loss": 0.5144, "num_input_tokens_seen": 9311032, "step": 14260 }, { "epoch": 7.476415094339623, "grad_norm": 0.2915037274360657, "learning_rate": 9.092045639005347e-06, "loss": 0.3525, "num_input_tokens_seen": 9314264, "step": 14265 }, { "epoch": 7.479035639412998, "grad_norm": 0.2699783742427826, "learning_rate": 9.07441087120054e-06, "loss": 0.3866, "num_input_tokens_seen": 9317272, "step": 14270 }, { "epoch": 7.481656184486373, "grad_norm": 0.8017538189888, "learning_rate": 9.05678942920127e-06, "loss": 0.4823, "num_input_tokens_seen": 9320504, "step": 14275 }, { "epoch": 7.484276729559748, "grad_norm": 0.4527721107006073, "learning_rate": 9.03918132775235e-06, "loss": 0.4479, "num_input_tokens_seen": 9324792, "step": 14280 }, { "epoch": 7.486897274633124, "grad_norm": 0.3008699417114258, "learning_rate": 9.021586581587425e-06, "loss": 0.3896, "num_input_tokens_seen": 9327544, "step": 14285 }, { "epoch": 7.489517819706499, "grad_norm": 0.5174546837806702, "learning_rate": 9.004005205428992e-06, "loss": 0.5191, "num_input_tokens_seen": 9330488, "step": 14290 }, { "epoch": 7.4921383647798745, "grad_norm": 0.5964468121528625, "learning_rate": 8.986437213988336e-06, "loss": 0.5306, "num_input_tokens_seen": 9335288, "step": 14295 }, { "epoch": 7.49475890985325, "grad_norm": 0.2923237979412079, "learning_rate": 8.968882621965542e-06, "loss": 0.4281, "num_input_tokens_seen": 9339384, "step": 14300 }, { "epoch": 7.497379454926625, "grad_norm": 0.3858889639377594, "learning_rate": 8.951341444049513e-06, "loss": 0.5784, "num_input_tokens_seen": 9342872, "step": 14305 }, { "epoch": 7.5, "grad_norm": 0.402608186006546, "learning_rate": 8.933813694917873e-06, "loss": 0.4158, "num_input_tokens_seen": 9345976, "step": 14310 }, { "epoch": 7.5, "eval_loss": 0.4838065505027771, "eval_runtime": 14.5419, "eval_samples_per_second": 58.314, "eval_steps_per_second": 14.579, "num_input_tokens_seen": 9345976, "step": 14310 }, { "epoch": 7.502620545073375, "grad_norm": 0.5374040007591248, "learning_rate": 8.916299389237067e-06, "loss": 0.4133, "num_input_tokens_seen": 9349208, "step": 14315 }, { "epoch": 7.50524109014675, "grad_norm": 0.5826287865638733, "learning_rate": 8.898798541662259e-06, "loss": 0.4405, "num_input_tokens_seen": 9352184, "step": 14320 }, { "epoch": 7.5078616352201255, "grad_norm": 0.5099855065345764, "learning_rate": 8.88131116683735e-06, "loss": 0.4985, "num_input_tokens_seen": 9356216, "step": 14325 }, { "epoch": 7.510482180293501, "grad_norm": 0.7816892266273499, "learning_rate": 8.863837279394993e-06, "loss": 0.4537, "num_input_tokens_seen": 9359544, "step": 14330 }, { "epoch": 7.513102725366876, "grad_norm": 0.32412970066070557, "learning_rate": 8.84637689395653e-06, "loss": 0.4277, "num_input_tokens_seen": 9362200, "step": 14335 }, { "epoch": 7.515723270440252, "grad_norm": 0.3413497805595398, "learning_rate": 8.828930025132006e-06, "loss": 0.4128, "num_input_tokens_seen": 9366520, "step": 14340 }, { "epoch": 7.518343815513627, "grad_norm": 0.3338906466960907, "learning_rate": 8.81149668752018e-06, "loss": 0.394, "num_input_tokens_seen": 9372440, "step": 14345 }, { "epoch": 7.520964360587002, "grad_norm": 0.5863485932350159, "learning_rate": 8.794076895708463e-06, "loss": 0.3607, "num_input_tokens_seen": 9375064, "step": 14350 }, { "epoch": 7.523584905660377, "grad_norm": 0.5563605427742004, "learning_rate": 8.776670664272946e-06, "loss": 0.3748, "num_input_tokens_seen": 9377720, "step": 14355 }, { "epoch": 7.526205450733753, "grad_norm": 0.36315420269966125, "learning_rate": 8.759278007778362e-06, "loss": 0.469, "num_input_tokens_seen": 9380984, "step": 14360 }, { "epoch": 7.528825995807128, "grad_norm": 0.577091634273529, "learning_rate": 8.741898940778088e-06, "loss": 0.3935, "num_input_tokens_seen": 9383992, "step": 14365 }, { "epoch": 7.531446540880503, "grad_norm": 1.0740083456039429, "learning_rate": 8.724533477814148e-06, "loss": 0.4738, "num_input_tokens_seen": 9386360, "step": 14370 }, { "epoch": 7.534067085953878, "grad_norm": 0.657381534576416, "learning_rate": 8.707181633417159e-06, "loss": 0.5259, "num_input_tokens_seen": 9389144, "step": 14375 }, { "epoch": 7.536687631027254, "grad_norm": 0.5894361734390259, "learning_rate": 8.689843422106345e-06, "loss": 0.3456, "num_input_tokens_seen": 9392024, "step": 14380 }, { "epoch": 7.539308176100629, "grad_norm": 0.7394143342971802, "learning_rate": 8.672518858389548e-06, "loss": 0.4437, "num_input_tokens_seen": 9394872, "step": 14385 }, { "epoch": 7.5419287211740045, "grad_norm": 0.43395549058914185, "learning_rate": 8.655207956763159e-06, "loss": 0.6275, "num_input_tokens_seen": 9397304, "step": 14390 }, { "epoch": 7.54454926624738, "grad_norm": 0.54768967628479, "learning_rate": 8.63791073171215e-06, "loss": 0.5127, "num_input_tokens_seen": 9400664, "step": 14395 }, { "epoch": 7.547169811320755, "grad_norm": 0.28967925906181335, "learning_rate": 8.620627197710044e-06, "loss": 0.5283, "num_input_tokens_seen": 9403864, "step": 14400 }, { "epoch": 7.54979035639413, "grad_norm": 0.8175686597824097, "learning_rate": 8.603357369218928e-06, "loss": 0.4559, "num_input_tokens_seen": 9407544, "step": 14405 }, { "epoch": 7.552410901467505, "grad_norm": 0.20980842411518097, "learning_rate": 8.586101260689397e-06, "loss": 0.3657, "num_input_tokens_seen": 9410968, "step": 14410 }, { "epoch": 7.55503144654088, "grad_norm": 0.675358772277832, "learning_rate": 8.568858886560563e-06, "loss": 0.5064, "num_input_tokens_seen": 9413400, "step": 14415 }, { "epoch": 7.5576519916142555, "grad_norm": 0.7654277682304382, "learning_rate": 8.551630261260079e-06, "loss": 0.3755, "num_input_tokens_seen": 9415832, "step": 14420 }, { "epoch": 7.560272536687631, "grad_norm": 0.5045801401138306, "learning_rate": 8.53441539920406e-06, "loss": 0.5568, "num_input_tokens_seen": 9419064, "step": 14425 }, { "epoch": 7.562893081761006, "grad_norm": 0.4361601769924164, "learning_rate": 8.517214314797108e-06, "loss": 0.5078, "num_input_tokens_seen": 9421752, "step": 14430 }, { "epoch": 7.565513626834382, "grad_norm": 0.5217888355255127, "learning_rate": 8.500027022432333e-06, "loss": 0.5107, "num_input_tokens_seen": 9424984, "step": 14435 }, { "epoch": 7.568134171907757, "grad_norm": 0.4172496795654297, "learning_rate": 8.482853536491239e-06, "loss": 0.4043, "num_input_tokens_seen": 9428216, "step": 14440 }, { "epoch": 7.570754716981132, "grad_norm": 0.5543926954269409, "learning_rate": 8.465693871343842e-06, "loss": 0.4733, "num_input_tokens_seen": 9431576, "step": 14445 }, { "epoch": 7.573375262054507, "grad_norm": 0.6933387517929077, "learning_rate": 8.448548041348552e-06, "loss": 0.596, "num_input_tokens_seen": 9434680, "step": 14450 }, { "epoch": 7.575995807127883, "grad_norm": 0.7325169444084167, "learning_rate": 8.431416060852218e-06, "loss": 0.2918, "num_input_tokens_seen": 9437432, "step": 14455 }, { "epoch": 7.578616352201258, "grad_norm": 0.3700108826160431, "learning_rate": 8.414297944190108e-06, "loss": 0.3903, "num_input_tokens_seen": 9441208, "step": 14460 }, { "epoch": 7.581236897274633, "grad_norm": 0.5937731266021729, "learning_rate": 8.397193705685873e-06, "loss": 0.4659, "num_input_tokens_seen": 9444056, "step": 14465 }, { "epoch": 7.583857442348008, "grad_norm": 1.798872709274292, "learning_rate": 8.380103359651553e-06, "loss": 0.5783, "num_input_tokens_seen": 9446968, "step": 14470 }, { "epoch": 7.586477987421384, "grad_norm": 0.30528730154037476, "learning_rate": 8.36302692038759e-06, "loss": 0.472, "num_input_tokens_seen": 9450712, "step": 14475 }, { "epoch": 7.589098532494759, "grad_norm": 0.34090423583984375, "learning_rate": 8.345964402182739e-06, "loss": 0.4249, "num_input_tokens_seen": 9453240, "step": 14480 }, { "epoch": 7.5917190775681345, "grad_norm": 0.41221943497657776, "learning_rate": 8.328915819314148e-06, "loss": 0.392, "num_input_tokens_seen": 9456728, "step": 14485 }, { "epoch": 7.59433962264151, "grad_norm": 0.47662153840065, "learning_rate": 8.31188118604731e-06, "loss": 0.4621, "num_input_tokens_seen": 9459800, "step": 14490 }, { "epoch": 7.596960167714885, "grad_norm": 0.382659375667572, "learning_rate": 8.294860516636e-06, "loss": 0.437, "num_input_tokens_seen": 9463384, "step": 14495 }, { "epoch": 7.59958071278826, "grad_norm": 0.46232959628105164, "learning_rate": 8.277853825322355e-06, "loss": 0.457, "num_input_tokens_seen": 9466520, "step": 14500 }, { "epoch": 7.602201257861635, "grad_norm": 0.41065096855163574, "learning_rate": 8.260861126336794e-06, "loss": 0.3741, "num_input_tokens_seen": 9469944, "step": 14505 }, { "epoch": 7.60482180293501, "grad_norm": 2.957273006439209, "learning_rate": 8.243882433898018e-06, "loss": 0.4958, "num_input_tokens_seen": 9472280, "step": 14510 }, { "epoch": 7.6074423480083855, "grad_norm": 0.6807156205177307, "learning_rate": 8.226917762213044e-06, "loss": 0.467, "num_input_tokens_seen": 9474808, "step": 14515 }, { "epoch": 7.610062893081761, "grad_norm": 0.30824026465415955, "learning_rate": 8.209967125477119e-06, "loss": 0.4587, "num_input_tokens_seen": 9479064, "step": 14520 }, { "epoch": 7.612683438155136, "grad_norm": 0.42198094725608826, "learning_rate": 8.193030537873761e-06, "loss": 0.4876, "num_input_tokens_seen": 9483128, "step": 14525 }, { "epoch": 7.615303983228512, "grad_norm": 0.568791389465332, "learning_rate": 8.176108013574743e-06, "loss": 0.4475, "num_input_tokens_seen": 9486136, "step": 14530 }, { "epoch": 7.617924528301887, "grad_norm": 1.1973278522491455, "learning_rate": 8.159199566740055e-06, "loss": 0.8055, "num_input_tokens_seen": 9488568, "step": 14535 }, { "epoch": 7.620545073375262, "grad_norm": 0.6075212955474854, "learning_rate": 8.142305211517914e-06, "loss": 0.4672, "num_input_tokens_seen": 9492440, "step": 14540 }, { "epoch": 7.623165618448637, "grad_norm": 0.413964182138443, "learning_rate": 8.125424962044742e-06, "loss": 0.4137, "num_input_tokens_seen": 9495192, "step": 14545 }, { "epoch": 7.6257861635220126, "grad_norm": 1.0338693857192993, "learning_rate": 8.108558832445157e-06, "loss": 0.4957, "num_input_tokens_seen": 9497688, "step": 14550 }, { "epoch": 7.628406708595388, "grad_norm": 0.34963688254356384, "learning_rate": 8.091706836831974e-06, "loss": 0.4588, "num_input_tokens_seen": 9500984, "step": 14555 }, { "epoch": 7.631027253668763, "grad_norm": 0.3382962942123413, "learning_rate": 8.074868989306173e-06, "loss": 0.441, "num_input_tokens_seen": 9504152, "step": 14560 }, { "epoch": 7.633647798742138, "grad_norm": 0.3539784550666809, "learning_rate": 8.058045303956885e-06, "loss": 0.4616, "num_input_tokens_seen": 9506840, "step": 14565 }, { "epoch": 7.636268343815514, "grad_norm": 0.2891302704811096, "learning_rate": 8.041235794861416e-06, "loss": 0.4947, "num_input_tokens_seen": 9509880, "step": 14570 }, { "epoch": 7.638888888888889, "grad_norm": 0.6276590824127197, "learning_rate": 8.024440476085188e-06, "loss": 0.4367, "num_input_tokens_seen": 9513464, "step": 14575 }, { "epoch": 7.6415094339622645, "grad_norm": 0.8350415825843811, "learning_rate": 8.007659361681758e-06, "loss": 0.5148, "num_input_tokens_seen": 9516216, "step": 14580 }, { "epoch": 7.64412997903564, "grad_norm": 0.3838295638561249, "learning_rate": 7.990892465692787e-06, "loss": 0.3735, "num_input_tokens_seen": 9519032, "step": 14585 }, { "epoch": 7.646750524109015, "grad_norm": 0.2523414194583893, "learning_rate": 7.974139802148065e-06, "loss": 0.379, "num_input_tokens_seen": 9523640, "step": 14590 }, { "epoch": 7.64937106918239, "grad_norm": 0.43090933561325073, "learning_rate": 7.957401385065444e-06, "loss": 0.4244, "num_input_tokens_seen": 9526616, "step": 14595 }, { "epoch": 7.651991614255765, "grad_norm": 0.3543574810028076, "learning_rate": 7.94067722845086e-06, "loss": 0.4845, "num_input_tokens_seen": 9529656, "step": 14600 }, { "epoch": 7.65461215932914, "grad_norm": 0.2755182683467865, "learning_rate": 7.923967346298345e-06, "loss": 0.417, "num_input_tokens_seen": 9532984, "step": 14605 }, { "epoch": 7.6572327044025155, "grad_norm": 0.8753013014793396, "learning_rate": 7.907271752589951e-06, "loss": 0.43, "num_input_tokens_seen": 9535928, "step": 14610 }, { "epoch": 7.659853249475891, "grad_norm": 0.4242202639579773, "learning_rate": 7.89059046129578e-06, "loss": 0.49, "num_input_tokens_seen": 9539256, "step": 14615 }, { "epoch": 7.662473794549266, "grad_norm": 0.3009949028491974, "learning_rate": 7.873923486374001e-06, "loss": 0.4511, "num_input_tokens_seen": 9542296, "step": 14620 }, { "epoch": 7.665094339622642, "grad_norm": 0.49952974915504456, "learning_rate": 7.857270841770745e-06, "loss": 0.371, "num_input_tokens_seen": 9545368, "step": 14625 }, { "epoch": 7.667714884696017, "grad_norm": 0.2675643563270569, "learning_rate": 7.8406325414202e-06, "loss": 0.4424, "num_input_tokens_seen": 9548472, "step": 14630 }, { "epoch": 7.670335429769392, "grad_norm": 0.24443566799163818, "learning_rate": 7.824008599244553e-06, "loss": 0.3952, "num_input_tokens_seen": 9552248, "step": 14635 }, { "epoch": 7.672955974842767, "grad_norm": 0.4662371575832367, "learning_rate": 7.807399029153925e-06, "loss": 0.7014, "num_input_tokens_seen": 9555320, "step": 14640 }, { "epoch": 7.6755765199161425, "grad_norm": 0.5501882433891296, "learning_rate": 7.790803845046474e-06, "loss": 0.4676, "num_input_tokens_seen": 9558904, "step": 14645 }, { "epoch": 7.678197064989518, "grad_norm": 0.7515208125114441, "learning_rate": 7.774223060808277e-06, "loss": 0.5005, "num_input_tokens_seen": 9561624, "step": 14650 }, { "epoch": 7.680817610062893, "grad_norm": 0.6661239862442017, "learning_rate": 7.757656690313375e-06, "loss": 0.5927, "num_input_tokens_seen": 9564568, "step": 14655 }, { "epoch": 7.683438155136268, "grad_norm": 0.4852483570575714, "learning_rate": 7.741104747423769e-06, "loss": 0.5406, "num_input_tokens_seen": 9567704, "step": 14660 }, { "epoch": 7.686058700209644, "grad_norm": 0.4234083890914917, "learning_rate": 7.724567245989342e-06, "loss": 0.4558, "num_input_tokens_seen": 9571512, "step": 14665 }, { "epoch": 7.688679245283019, "grad_norm": 0.40301233530044556, "learning_rate": 7.708044199847934e-06, "loss": 0.5509, "num_input_tokens_seen": 9575448, "step": 14670 }, { "epoch": 7.691299790356394, "grad_norm": 0.2996521592140198, "learning_rate": 7.691535622825288e-06, "loss": 0.2771, "num_input_tokens_seen": 9583704, "step": 14675 }, { "epoch": 7.69392033542977, "grad_norm": 0.4439983069896698, "learning_rate": 7.675041528735e-06, "loss": 0.4377, "num_input_tokens_seen": 9586392, "step": 14680 }, { "epoch": 7.696540880503145, "grad_norm": 0.5139893293380737, "learning_rate": 7.658561931378594e-06, "loss": 0.5964, "num_input_tokens_seen": 9589336, "step": 14685 }, { "epoch": 7.69916142557652, "grad_norm": 0.30129167437553406, "learning_rate": 7.64209684454544e-06, "loss": 0.4204, "num_input_tokens_seen": 9593016, "step": 14690 }, { "epoch": 7.701781970649895, "grad_norm": 0.683161199092865, "learning_rate": 7.625646282012763e-06, "loss": 0.4305, "num_input_tokens_seen": 9595512, "step": 14695 }, { "epoch": 7.70440251572327, "grad_norm": 0.5709900259971619, "learning_rate": 7.6092102575456546e-06, "loss": 0.4603, "num_input_tokens_seen": 9598904, "step": 14700 }, { "epoch": 7.7070230607966455, "grad_norm": 0.3581549823284149, "learning_rate": 7.592788784897023e-06, "loss": 0.2988, "num_input_tokens_seen": 9602712, "step": 14705 }, { "epoch": 7.709643605870021, "grad_norm": 0.34968021512031555, "learning_rate": 7.576381877807598e-06, "loss": 0.4676, "num_input_tokens_seen": 9605560, "step": 14710 }, { "epoch": 7.712264150943396, "grad_norm": 0.378005713224411, "learning_rate": 7.559989550005947e-06, "loss": 0.5877, "num_input_tokens_seen": 9609048, "step": 14715 }, { "epoch": 7.714884696016772, "grad_norm": 0.5810762643814087, "learning_rate": 7.543611815208415e-06, "loss": 0.4426, "num_input_tokens_seen": 9612824, "step": 14720 }, { "epoch": 7.717505241090147, "grad_norm": 0.240659698843956, "learning_rate": 7.5272486871191375e-06, "loss": 0.4055, "num_input_tokens_seen": 9615704, "step": 14725 }, { "epoch": 7.720125786163522, "grad_norm": 0.5654626488685608, "learning_rate": 7.510900179430036e-06, "loss": 0.5194, "num_input_tokens_seen": 9618840, "step": 14730 }, { "epoch": 7.722746331236897, "grad_norm": 0.36085936427116394, "learning_rate": 7.494566305820788e-06, "loss": 0.3957, "num_input_tokens_seen": 9622552, "step": 14735 }, { "epoch": 7.7253668763102725, "grad_norm": 0.27778884768486023, "learning_rate": 7.478247079958845e-06, "loss": 0.5998, "num_input_tokens_seen": 9625720, "step": 14740 }, { "epoch": 7.727987421383648, "grad_norm": 0.4338826835155487, "learning_rate": 7.461942515499384e-06, "loss": 0.3981, "num_input_tokens_seen": 9628664, "step": 14745 }, { "epoch": 7.730607966457023, "grad_norm": 0.35082191228866577, "learning_rate": 7.445652626085312e-06, "loss": 0.4152, "num_input_tokens_seen": 9632088, "step": 14750 }, { "epoch": 7.733228511530398, "grad_norm": 0.4555208384990692, "learning_rate": 7.429377425347281e-06, "loss": 0.4876, "num_input_tokens_seen": 9635160, "step": 14755 }, { "epoch": 7.735849056603773, "grad_norm": 0.37266433238983154, "learning_rate": 7.413116926903624e-06, "loss": 0.4792, "num_input_tokens_seen": 9638776, "step": 14760 }, { "epoch": 7.738469601677149, "grad_norm": 0.8735508918762207, "learning_rate": 7.396871144360387e-06, "loss": 0.4872, "num_input_tokens_seen": 9641624, "step": 14765 }, { "epoch": 7.741090146750524, "grad_norm": 0.31385740637779236, "learning_rate": 7.380640091311291e-06, "loss": 0.533, "num_input_tokens_seen": 9649176, "step": 14770 }, { "epoch": 7.7437106918239, "grad_norm": 0.475568562746048, "learning_rate": 7.3644237813377535e-06, "loss": 0.662, "num_input_tokens_seen": 9651768, "step": 14775 }, { "epoch": 7.746331236897275, "grad_norm": 0.3748024106025696, "learning_rate": 7.348222228008836e-06, "loss": 0.2984, "num_input_tokens_seen": 9654552, "step": 14780 }, { "epoch": 7.74895178197065, "grad_norm": 0.31529319286346436, "learning_rate": 7.332035444881247e-06, "loss": 0.4482, "num_input_tokens_seen": 9657112, "step": 14785 }, { "epoch": 7.751572327044025, "grad_norm": 0.31859222054481506, "learning_rate": 7.315863445499366e-06, "loss": 0.3888, "num_input_tokens_seen": 9661176, "step": 14790 }, { "epoch": 7.7541928721174, "grad_norm": 0.55277419090271, "learning_rate": 7.299706243395177e-06, "loss": 0.4443, "num_input_tokens_seen": 9663480, "step": 14795 }, { "epoch": 7.756813417190775, "grad_norm": 0.30693838000297546, "learning_rate": 7.283563852088277e-06, "loss": 0.4521, "num_input_tokens_seen": 9666808, "step": 14800 }, { "epoch": 7.759433962264151, "grad_norm": 0.6818961501121521, "learning_rate": 7.267436285085905e-06, "loss": 0.6045, "num_input_tokens_seen": 9669336, "step": 14805 }, { "epoch": 7.762054507337526, "grad_norm": 0.46107229590415955, "learning_rate": 7.251323555882844e-06, "loss": 0.5612, "num_input_tokens_seen": 9671928, "step": 14810 }, { "epoch": 7.764675052410902, "grad_norm": 0.33482086658477783, "learning_rate": 7.235225677961513e-06, "loss": 0.5297, "num_input_tokens_seen": 9676120, "step": 14815 }, { "epoch": 7.767295597484277, "grad_norm": 0.5457087755203247, "learning_rate": 7.219142664791872e-06, "loss": 0.3898, "num_input_tokens_seen": 9679032, "step": 14820 }, { "epoch": 7.769916142557652, "grad_norm": 0.46332091093063354, "learning_rate": 7.203074529831444e-06, "loss": 0.4413, "num_input_tokens_seen": 9682744, "step": 14825 }, { "epoch": 7.772536687631027, "grad_norm": 0.47116777300834656, "learning_rate": 7.187021286525328e-06, "loss": 0.652, "num_input_tokens_seen": 9686040, "step": 14830 }, { "epoch": 7.7751572327044025, "grad_norm": 0.58465975522995, "learning_rate": 7.170982948306135e-06, "loss": 0.4814, "num_input_tokens_seen": 9690168, "step": 14835 }, { "epoch": 7.777777777777778, "grad_norm": 0.6164806485176086, "learning_rate": 7.154959528594002e-06, "loss": 0.494, "num_input_tokens_seen": 9693496, "step": 14840 }, { "epoch": 7.780398322851153, "grad_norm": 0.5453695058822632, "learning_rate": 7.138951040796627e-06, "loss": 0.4501, "num_input_tokens_seen": 9696760, "step": 14845 }, { "epoch": 7.783018867924528, "grad_norm": 0.4232114851474762, "learning_rate": 7.122957498309143e-06, "loss": 0.5403, "num_input_tokens_seen": 9699672, "step": 14850 }, { "epoch": 7.785639412997903, "grad_norm": 0.3447321057319641, "learning_rate": 7.1069789145142424e-06, "loss": 0.4059, "num_input_tokens_seen": 9703384, "step": 14855 }, { "epoch": 7.788259958071279, "grad_norm": 0.24993929266929626, "learning_rate": 7.091015302782064e-06, "loss": 0.5042, "num_input_tokens_seen": 9706328, "step": 14860 }, { "epoch": 7.790880503144654, "grad_norm": 0.5339667201042175, "learning_rate": 7.075066676470224e-06, "loss": 0.5379, "num_input_tokens_seen": 9709432, "step": 14865 }, { "epoch": 7.79350104821803, "grad_norm": 0.8495398759841919, "learning_rate": 7.0591330489238185e-06, "loss": 0.5793, "num_input_tokens_seen": 9712696, "step": 14870 }, { "epoch": 7.796121593291405, "grad_norm": 0.6511796712875366, "learning_rate": 7.0432144334753734e-06, "loss": 0.5317, "num_input_tokens_seen": 9715704, "step": 14875 }, { "epoch": 7.79874213836478, "grad_norm": 0.46447527408599854, "learning_rate": 7.027310843444846e-06, "loss": 0.6868, "num_input_tokens_seen": 9720344, "step": 14880 }, { "epoch": 7.801362683438155, "grad_norm": 0.28922805190086365, "learning_rate": 7.011422292139655e-06, "loss": 0.4195, "num_input_tokens_seen": 9723544, "step": 14885 }, { "epoch": 7.80398322851153, "grad_norm": 0.534448504447937, "learning_rate": 6.995548792854606e-06, "loss": 0.7017, "num_input_tokens_seen": 9726424, "step": 14890 }, { "epoch": 7.806603773584905, "grad_norm": 0.5930765271186829, "learning_rate": 6.979690358871912e-06, "loss": 0.44, "num_input_tokens_seen": 9729144, "step": 14895 }, { "epoch": 7.809224318658281, "grad_norm": 0.2807387113571167, "learning_rate": 6.963847003461188e-06, "loss": 0.4443, "num_input_tokens_seen": 9732504, "step": 14900 }, { "epoch": 7.811844863731656, "grad_norm": 0.4215393364429474, "learning_rate": 6.948018739879439e-06, "loss": 0.4402, "num_input_tokens_seen": 9737048, "step": 14905 }, { "epoch": 7.814465408805032, "grad_norm": 0.6621297597885132, "learning_rate": 6.932205581371026e-06, "loss": 0.5006, "num_input_tokens_seen": 9739960, "step": 14910 }, { "epoch": 7.817085953878407, "grad_norm": 0.4352341294288635, "learning_rate": 6.91640754116768e-06, "loss": 0.3805, "num_input_tokens_seen": 9743288, "step": 14915 }, { "epoch": 7.819706498951782, "grad_norm": 0.5959263443946838, "learning_rate": 6.9006246324884695e-06, "loss": 0.6893, "num_input_tokens_seen": 9746072, "step": 14920 }, { "epoch": 7.822327044025157, "grad_norm": 0.42180559039115906, "learning_rate": 6.88485686853983e-06, "loss": 0.5229, "num_input_tokens_seen": 9748920, "step": 14925 }, { "epoch": 7.8249475890985325, "grad_norm": 0.3159434199333191, "learning_rate": 6.869104262515494e-06, "loss": 0.3418, "num_input_tokens_seen": 9751864, "step": 14930 }, { "epoch": 7.827568134171908, "grad_norm": 0.35011088848114014, "learning_rate": 6.8533668275965276e-06, "loss": 0.4337, "num_input_tokens_seen": 9755128, "step": 14935 }, { "epoch": 7.830188679245283, "grad_norm": 0.9083426594734192, "learning_rate": 6.837644576951283e-06, "loss": 0.3713, "num_input_tokens_seen": 9757976, "step": 14940 }, { "epoch": 7.832809224318658, "grad_norm": 0.2582823634147644, "learning_rate": 6.8219375237354445e-06, "loss": 0.3985, "num_input_tokens_seen": 9761464, "step": 14945 }, { "epoch": 7.835429769392033, "grad_norm": 0.6390737891197205, "learning_rate": 6.806245681091944e-06, "loss": 0.4679, "num_input_tokens_seen": 9768408, "step": 14950 }, { "epoch": 7.838050314465409, "grad_norm": 0.35946041345596313, "learning_rate": 6.790569062150992e-06, "loss": 0.5201, "num_input_tokens_seen": 9771832, "step": 14955 }, { "epoch": 7.840670859538784, "grad_norm": 0.38737764954566956, "learning_rate": 6.774907680030085e-06, "loss": 0.5628, "num_input_tokens_seen": 9775736, "step": 14960 }, { "epoch": 7.84329140461216, "grad_norm": 0.41169872879981995, "learning_rate": 6.75926154783394e-06, "loss": 0.519, "num_input_tokens_seen": 9779096, "step": 14965 }, { "epoch": 7.845911949685535, "grad_norm": 0.7385725378990173, "learning_rate": 6.743630678654519e-06, "loss": 0.7283, "num_input_tokens_seen": 9782264, "step": 14970 }, { "epoch": 7.84853249475891, "grad_norm": 0.3641194701194763, "learning_rate": 6.728015085571049e-06, "loss": 0.4666, "num_input_tokens_seen": 9785368, "step": 14975 }, { "epoch": 7.851153039832285, "grad_norm": 0.6891164183616638, "learning_rate": 6.712414781649906e-06, "loss": 0.3903, "num_input_tokens_seen": 9789304, "step": 14980 }, { "epoch": 7.85377358490566, "grad_norm": 0.546383261680603, "learning_rate": 6.69682977994473e-06, "loss": 0.5942, "num_input_tokens_seen": 9792184, "step": 14985 }, { "epoch": 7.856394129979035, "grad_norm": 0.44160959124565125, "learning_rate": 6.681260093496355e-06, "loss": 0.5523, "num_input_tokens_seen": 9795928, "step": 14990 }, { "epoch": 7.859014675052411, "grad_norm": 0.4577599763870239, "learning_rate": 6.665705735332753e-06, "loss": 0.4809, "num_input_tokens_seen": 9798776, "step": 14995 }, { "epoch": 7.861635220125786, "grad_norm": 0.37375712394714355, "learning_rate": 6.65016671846912e-06, "loss": 0.4009, "num_input_tokens_seen": 9802008, "step": 15000 }, { "epoch": 7.864255765199162, "grad_norm": 0.9648166298866272, "learning_rate": 6.634643055907791e-06, "loss": 0.6148, "num_input_tokens_seen": 9805912, "step": 15005 }, { "epoch": 7.866876310272537, "grad_norm": 0.43355172872543335, "learning_rate": 6.619134760638248e-06, "loss": 0.4664, "num_input_tokens_seen": 9808664, "step": 15010 }, { "epoch": 7.869496855345912, "grad_norm": 0.2733030915260315, "learning_rate": 6.6036418456371516e-06, "loss": 0.405, "num_input_tokens_seen": 9811256, "step": 15015 }, { "epoch": 7.872117400419287, "grad_norm": 0.3360690474510193, "learning_rate": 6.588164323868229e-06, "loss": 0.5217, "num_input_tokens_seen": 9816440, "step": 15020 }, { "epoch": 7.8747379454926625, "grad_norm": 0.39375749230384827, "learning_rate": 6.572702208282381e-06, "loss": 0.4458, "num_input_tokens_seen": 9818776, "step": 15025 }, { "epoch": 7.877358490566038, "grad_norm": 0.4421781599521637, "learning_rate": 6.557255511817617e-06, "loss": 0.4864, "num_input_tokens_seen": 9822520, "step": 15030 }, { "epoch": 7.879979035639413, "grad_norm": 0.30173584818840027, "learning_rate": 6.5418242473989925e-06, "loss": 0.3693, "num_input_tokens_seen": 9825496, "step": 15035 }, { "epoch": 7.882599580712788, "grad_norm": 0.36895430088043213, "learning_rate": 6.52640842793871e-06, "loss": 0.6208, "num_input_tokens_seen": 9829080, "step": 15040 }, { "epoch": 7.885220125786163, "grad_norm": 0.6733325123786926, "learning_rate": 6.5110080663360165e-06, "loss": 0.4246, "num_input_tokens_seen": 9832056, "step": 15045 }, { "epoch": 7.887840670859539, "grad_norm": 0.4611046016216278, "learning_rate": 6.495623175477223e-06, "loss": 0.487, "num_input_tokens_seen": 9834680, "step": 15050 }, { "epoch": 7.890461215932914, "grad_norm": 0.45148608088493347, "learning_rate": 6.480253768235714e-06, "loss": 0.5551, "num_input_tokens_seen": 9838008, "step": 15055 }, { "epoch": 7.8930817610062896, "grad_norm": 0.4832833409309387, "learning_rate": 6.464899857471907e-06, "loss": 0.5726, "num_input_tokens_seen": 9841528, "step": 15060 }, { "epoch": 7.895702306079665, "grad_norm": 0.6170724630355835, "learning_rate": 6.44956145603324e-06, "loss": 0.6031, "num_input_tokens_seen": 9844984, "step": 15065 }, { "epoch": 7.89832285115304, "grad_norm": 0.2789331078529358, "learning_rate": 6.4342385767542036e-06, "loss": 0.4739, "num_input_tokens_seen": 9848568, "step": 15070 }, { "epoch": 7.900943396226415, "grad_norm": 0.5015266537666321, "learning_rate": 6.418931232456279e-06, "loss": 0.302, "num_input_tokens_seen": 9851640, "step": 15075 }, { "epoch": 7.90356394129979, "grad_norm": 0.5143541693687439, "learning_rate": 6.403639435947948e-06, "loss": 0.3745, "num_input_tokens_seen": 9855192, "step": 15080 }, { "epoch": 7.906184486373165, "grad_norm": 0.4262728989124298, "learning_rate": 6.38836320002468e-06, "loss": 0.477, "num_input_tokens_seen": 9858072, "step": 15085 }, { "epoch": 7.908805031446541, "grad_norm": 0.49672389030456543, "learning_rate": 6.37310253746895e-06, "loss": 0.4282, "num_input_tokens_seen": 9861784, "step": 15090 }, { "epoch": 7.911425576519916, "grad_norm": 0.5842902660369873, "learning_rate": 6.357857461050176e-06, "loss": 0.5192, "num_input_tokens_seen": 9866104, "step": 15095 }, { "epoch": 7.914046121593291, "grad_norm": 0.8383328914642334, "learning_rate": 6.342627983524737e-06, "loss": 0.5989, "num_input_tokens_seen": 9869816, "step": 15100 }, { "epoch": 7.916666666666667, "grad_norm": 0.3733694553375244, "learning_rate": 6.32741411763596e-06, "loss": 0.3005, "num_input_tokens_seen": 9872504, "step": 15105 }, { "epoch": 7.919287211740042, "grad_norm": 0.31528106331825256, "learning_rate": 6.312215876114127e-06, "loss": 0.4666, "num_input_tokens_seen": 9875320, "step": 15110 }, { "epoch": 7.921907756813417, "grad_norm": 0.48273733258247375, "learning_rate": 6.297033271676425e-06, "loss": 0.3746, "num_input_tokens_seen": 9878136, "step": 15115 }, { "epoch": 7.9245283018867925, "grad_norm": 0.5628237724304199, "learning_rate": 6.281866317026966e-06, "loss": 0.4468, "num_input_tokens_seen": 9880792, "step": 15120 }, { "epoch": 7.927148846960168, "grad_norm": 0.3110138773918152, "learning_rate": 6.2667150248567534e-06, "loss": 0.3967, "num_input_tokens_seen": 9884312, "step": 15125 }, { "epoch": 7.929769392033543, "grad_norm": 0.4426518678665161, "learning_rate": 6.251579407843713e-06, "loss": 0.4584, "num_input_tokens_seen": 9887416, "step": 15130 }, { "epoch": 7.932389937106918, "grad_norm": 0.3968007266521454, "learning_rate": 6.236459478652629e-06, "loss": 0.3429, "num_input_tokens_seen": 9890200, "step": 15135 }, { "epoch": 7.935010482180293, "grad_norm": 0.6649740934371948, "learning_rate": 6.221355249935165e-06, "loss": 0.4026, "num_input_tokens_seen": 9893368, "step": 15140 }, { "epoch": 7.937631027253669, "grad_norm": 0.49156051874160767, "learning_rate": 6.20626673432986e-06, "loss": 0.4948, "num_input_tokens_seen": 9899224, "step": 15145 }, { "epoch": 7.940251572327044, "grad_norm": 0.5592445731163025, "learning_rate": 6.191193944462087e-06, "loss": 0.4928, "num_input_tokens_seen": 9901944, "step": 15150 }, { "epoch": 7.9428721174004195, "grad_norm": 0.25920626521110535, "learning_rate": 6.176136892944062e-06, "loss": 0.5898, "num_input_tokens_seen": 9905720, "step": 15155 }, { "epoch": 7.945492662473795, "grad_norm": 0.4721619486808777, "learning_rate": 6.161095592374863e-06, "loss": 0.6073, "num_input_tokens_seen": 9908440, "step": 15160 }, { "epoch": 7.94811320754717, "grad_norm": 0.511958658695221, "learning_rate": 6.1460700553403275e-06, "loss": 0.5227, "num_input_tokens_seen": 9911256, "step": 15165 }, { "epoch": 7.950733752620545, "grad_norm": 0.5721732378005981, "learning_rate": 6.1310602944131655e-06, "loss": 0.5022, "num_input_tokens_seen": 9913336, "step": 15170 }, { "epoch": 7.95335429769392, "grad_norm": 0.2848241329193115, "learning_rate": 6.11606632215285e-06, "loss": 0.3606, "num_input_tokens_seen": 9917048, "step": 15175 }, { "epoch": 7.955974842767295, "grad_norm": 0.9054235219955444, "learning_rate": 6.101088151105647e-06, "loss": 0.4545, "num_input_tokens_seen": 9920952, "step": 15180 }, { "epoch": 7.9585953878406706, "grad_norm": 0.43601545691490173, "learning_rate": 6.086125793804618e-06, "loss": 0.4031, "num_input_tokens_seen": 9924376, "step": 15185 }, { "epoch": 7.961215932914046, "grad_norm": 0.4211655557155609, "learning_rate": 6.071179262769572e-06, "loss": 0.585, "num_input_tokens_seen": 9927160, "step": 15190 }, { "epoch": 7.963836477987421, "grad_norm": 0.45013272762298584, "learning_rate": 6.056248570507078e-06, "loss": 0.5253, "num_input_tokens_seen": 9930840, "step": 15195 }, { "epoch": 7.966457023060797, "grad_norm": 0.4085257053375244, "learning_rate": 6.041333729510479e-06, "loss": 0.6479, "num_input_tokens_seen": 9934104, "step": 15200 }, { "epoch": 7.969077568134172, "grad_norm": 0.5465455651283264, "learning_rate": 6.026434752259802e-06, "loss": 0.4017, "num_input_tokens_seen": 9937016, "step": 15205 }, { "epoch": 7.971698113207547, "grad_norm": 1.2979403734207153, "learning_rate": 6.011551651221856e-06, "loss": 0.5091, "num_input_tokens_seen": 9939736, "step": 15210 }, { "epoch": 7.9743186582809225, "grad_norm": 0.3591768443584442, "learning_rate": 5.996684438850131e-06, "loss": 0.5092, "num_input_tokens_seen": 9942744, "step": 15215 }, { "epoch": 7.976939203354298, "grad_norm": 0.8841627240180969, "learning_rate": 5.981833127584824e-06, "loss": 0.5848, "num_input_tokens_seen": 9946776, "step": 15220 }, { "epoch": 7.979559748427673, "grad_norm": 0.5513745546340942, "learning_rate": 5.966997729852844e-06, "loss": 0.4823, "num_input_tokens_seen": 9949304, "step": 15225 }, { "epoch": 7.982180293501048, "grad_norm": 0.37839630246162415, "learning_rate": 5.952178258067775e-06, "loss": 0.4663, "num_input_tokens_seen": 9953112, "step": 15230 }, { "epoch": 7.984800838574423, "grad_norm": 0.47235107421875, "learning_rate": 5.93737472462986e-06, "loss": 0.4805, "num_input_tokens_seen": 9956056, "step": 15235 }, { "epoch": 7.987421383647799, "grad_norm": 0.6480945348739624, "learning_rate": 5.92258714192604e-06, "loss": 0.3612, "num_input_tokens_seen": 9959416, "step": 15240 }, { "epoch": 7.990041928721174, "grad_norm": 0.36807161569595337, "learning_rate": 5.907815522329877e-06, "loss": 0.3954, "num_input_tokens_seen": 9962680, "step": 15245 }, { "epoch": 7.9926624737945495, "grad_norm": 0.27978938817977905, "learning_rate": 5.893059878201587e-06, "loss": 0.5878, "num_input_tokens_seen": 9966616, "step": 15250 }, { "epoch": 7.995283018867925, "grad_norm": 0.27115610241889954, "learning_rate": 5.878320221888015e-06, "loss": 0.4434, "num_input_tokens_seen": 9971864, "step": 15255 }, { "epoch": 7.9979035639413, "grad_norm": 0.3721071779727936, "learning_rate": 5.8635965657226455e-06, "loss": 0.7811, "num_input_tokens_seen": 9975864, "step": 15260 }, { "epoch": 8.0, "eval_loss": 0.48359647393226624, "eval_runtime": 14.5401, "eval_samples_per_second": 58.322, "eval_steps_per_second": 14.58, "num_input_tokens_seen": 9977520, "step": 15264 }, { "epoch": 8.000524109014675, "grad_norm": 0.631061851978302, "learning_rate": 5.848888922025553e-06, "loss": 0.537, "num_input_tokens_seen": 9978032, "step": 15265 }, { "epoch": 8.00314465408805, "grad_norm": 0.4728575646877289, "learning_rate": 5.834197303103414e-06, "loss": 0.6397, "num_input_tokens_seen": 9981616, "step": 15270 }, { "epoch": 8.005765199161425, "grad_norm": 0.7256849408149719, "learning_rate": 5.819521721249524e-06, "loss": 0.4, "num_input_tokens_seen": 9984624, "step": 15275 }, { "epoch": 8.0083857442348, "grad_norm": 0.46684330701828003, "learning_rate": 5.804862188743726e-06, "loss": 0.4366, "num_input_tokens_seen": 9987568, "step": 15280 }, { "epoch": 8.011006289308176, "grad_norm": 0.45291146636009216, "learning_rate": 5.79021871785245e-06, "loss": 0.4217, "num_input_tokens_seen": 9991280, "step": 15285 }, { "epoch": 8.01362683438155, "grad_norm": 0.33280616998672485, "learning_rate": 5.775591320828683e-06, "loss": 0.4388, "num_input_tokens_seen": 9994288, "step": 15290 }, { "epoch": 8.016247379454926, "grad_norm": 0.48246482014656067, "learning_rate": 5.7609800099119565e-06, "loss": 0.4188, "num_input_tokens_seen": 9997200, "step": 15295 }, { "epoch": 8.018867924528301, "grad_norm": 0.5035497546195984, "learning_rate": 5.746384797328361e-06, "loss": 0.5843, "num_input_tokens_seen": 10000176, "step": 15300 }, { "epoch": 8.021488469601676, "grad_norm": 0.3958096206188202, "learning_rate": 5.731805695290498e-06, "loss": 0.4956, "num_input_tokens_seen": 10003504, "step": 15305 }, { "epoch": 8.024109014675052, "grad_norm": 0.5762079954147339, "learning_rate": 5.7172427159974865e-06, "loss": 0.464, "num_input_tokens_seen": 10008688, "step": 15310 }, { "epoch": 8.026729559748428, "grad_norm": 0.494773268699646, "learning_rate": 5.702695871634975e-06, "loss": 0.6112, "num_input_tokens_seen": 10011536, "step": 15315 }, { "epoch": 8.029350104821804, "grad_norm": 0.6448982954025269, "learning_rate": 5.688165174375093e-06, "loss": 0.4504, "num_input_tokens_seen": 10014096, "step": 15320 }, { "epoch": 8.031970649895179, "grad_norm": 0.2845061123371124, "learning_rate": 5.673650636376457e-06, "loss": 0.4418, "num_input_tokens_seen": 10017360, "step": 15325 }, { "epoch": 8.034591194968554, "grad_norm": 0.3585575520992279, "learning_rate": 5.659152269784188e-06, "loss": 0.5826, "num_input_tokens_seen": 10020560, "step": 15330 }, { "epoch": 8.03721174004193, "grad_norm": 0.5059967637062073, "learning_rate": 5.644670086729834e-06, "loss": 0.5273, "num_input_tokens_seen": 10024176, "step": 15335 }, { "epoch": 8.039832285115304, "grad_norm": 0.3795239329338074, "learning_rate": 5.630204099331432e-06, "loss": 0.3371, "num_input_tokens_seen": 10028336, "step": 15340 }, { "epoch": 8.04245283018868, "grad_norm": 0.5107150077819824, "learning_rate": 5.615754319693481e-06, "loss": 0.4491, "num_input_tokens_seen": 10031376, "step": 15345 }, { "epoch": 8.045073375262055, "grad_norm": 0.653976559638977, "learning_rate": 5.601320759906861e-06, "loss": 0.4639, "num_input_tokens_seen": 10033904, "step": 15350 }, { "epoch": 8.04769392033543, "grad_norm": 0.38631299138069153, "learning_rate": 5.586903432048943e-06, "loss": 0.4726, "num_input_tokens_seen": 10036720, "step": 15355 }, { "epoch": 8.050314465408805, "grad_norm": 0.4916597008705139, "learning_rate": 5.572502348183475e-06, "loss": 0.5133, "num_input_tokens_seen": 10040016, "step": 15360 }, { "epoch": 8.05293501048218, "grad_norm": 0.33925169706344604, "learning_rate": 5.558117520360623e-06, "loss": 0.5277, "num_input_tokens_seen": 10043088, "step": 15365 }, { "epoch": 8.055555555555555, "grad_norm": 0.4793020188808441, "learning_rate": 5.543748960616971e-06, "loss": 0.5564, "num_input_tokens_seen": 10046160, "step": 15370 }, { "epoch": 8.05817610062893, "grad_norm": 0.3904685378074646, "learning_rate": 5.529396680975457e-06, "loss": 0.461, "num_input_tokens_seen": 10048592, "step": 15375 }, { "epoch": 8.060796645702306, "grad_norm": 0.20646081864833832, "learning_rate": 5.515060693445418e-06, "loss": 0.4961, "num_input_tokens_seen": 10051536, "step": 15380 }, { "epoch": 8.06341719077568, "grad_norm": 0.36380109190940857, "learning_rate": 5.500741010022564e-06, "loss": 0.5343, "num_input_tokens_seen": 10055632, "step": 15385 }, { "epoch": 8.066037735849056, "grad_norm": 0.3857966959476471, "learning_rate": 5.48643764268893e-06, "loss": 0.5242, "num_input_tokens_seen": 10060528, "step": 15390 }, { "epoch": 8.068658280922431, "grad_norm": 0.5789867639541626, "learning_rate": 5.472150603412937e-06, "loss": 0.349, "num_input_tokens_seen": 10063664, "step": 15395 }, { "epoch": 8.071278825995806, "grad_norm": 1.0239042043685913, "learning_rate": 5.457879904149327e-06, "loss": 0.4876, "num_input_tokens_seen": 10066288, "step": 15400 }, { "epoch": 8.073899371069182, "grad_norm": 0.9581869840621948, "learning_rate": 5.4436255568391545e-06, "loss": 0.6264, "num_input_tokens_seen": 10070064, "step": 15405 }, { "epoch": 8.076519916142558, "grad_norm": 0.47646191716194153, "learning_rate": 5.429387573409825e-06, "loss": 0.3973, "num_input_tokens_seen": 10074128, "step": 15410 }, { "epoch": 8.079140461215934, "grad_norm": 0.306665301322937, "learning_rate": 5.415165965775024e-06, "loss": 0.2972, "num_input_tokens_seen": 10077936, "step": 15415 }, { "epoch": 8.081761006289309, "grad_norm": 0.47861993312835693, "learning_rate": 5.400960745834735e-06, "loss": 0.4921, "num_input_tokens_seen": 10080976, "step": 15420 }, { "epoch": 8.084381551362684, "grad_norm": 0.5097410082817078, "learning_rate": 5.386771925475256e-06, "loss": 0.4733, "num_input_tokens_seen": 10083984, "step": 15425 }, { "epoch": 8.08700209643606, "grad_norm": 0.5523068308830261, "learning_rate": 5.3725995165691294e-06, "loss": 0.6036, "num_input_tokens_seen": 10086800, "step": 15430 }, { "epoch": 8.089622641509434, "grad_norm": 0.3225211501121521, "learning_rate": 5.358443530975188e-06, "loss": 0.4116, "num_input_tokens_seen": 10090544, "step": 15435 }, { "epoch": 8.09224318658281, "grad_norm": 1.2162152528762817, "learning_rate": 5.344303980538498e-06, "loss": 0.4244, "num_input_tokens_seen": 10093360, "step": 15440 }, { "epoch": 8.094863731656185, "grad_norm": 0.43856120109558105, "learning_rate": 5.33018087709041e-06, "loss": 0.5991, "num_input_tokens_seen": 10097520, "step": 15445 }, { "epoch": 8.09748427672956, "grad_norm": 0.36259686946868896, "learning_rate": 5.316074232448484e-06, "loss": 0.4114, "num_input_tokens_seen": 10100240, "step": 15450 }, { "epoch": 8.100104821802935, "grad_norm": 0.42124801874160767, "learning_rate": 5.301984058416506e-06, "loss": 0.4406, "num_input_tokens_seen": 10103280, "step": 15455 }, { "epoch": 8.10272536687631, "grad_norm": 0.2813081741333008, "learning_rate": 5.2879103667845045e-06, "loss": 0.5007, "num_input_tokens_seen": 10106288, "step": 15460 }, { "epoch": 8.105345911949685, "grad_norm": 0.41517242789268494, "learning_rate": 5.2738531693286965e-06, "loss": 0.4075, "num_input_tokens_seen": 10110064, "step": 15465 }, { "epoch": 8.10796645702306, "grad_norm": 0.33914217352867126, "learning_rate": 5.2598124778115044e-06, "loss": 0.372, "num_input_tokens_seen": 10113360, "step": 15470 }, { "epoch": 8.110587002096436, "grad_norm": 0.466408371925354, "learning_rate": 5.245788303981533e-06, "loss": 0.525, "num_input_tokens_seen": 10119760, "step": 15475 }, { "epoch": 8.11320754716981, "grad_norm": 0.3416937291622162, "learning_rate": 5.231780659573565e-06, "loss": 0.4413, "num_input_tokens_seen": 10126864, "step": 15480 }, { "epoch": 8.115828092243186, "grad_norm": 0.40619412064552307, "learning_rate": 5.2177895563085725e-06, "loss": 0.393, "num_input_tokens_seen": 10129776, "step": 15485 }, { "epoch": 8.118448637316561, "grad_norm": 0.5111480355262756, "learning_rate": 5.203815005893664e-06, "loss": 0.5352, "num_input_tokens_seen": 10132816, "step": 15490 }, { "epoch": 8.121069182389936, "grad_norm": 0.47234880924224854, "learning_rate": 5.189857020022099e-06, "loss": 0.3361, "num_input_tokens_seen": 10136112, "step": 15495 }, { "epoch": 8.123689727463312, "grad_norm": 0.2631925940513611, "learning_rate": 5.1759156103732946e-06, "loss": 0.4537, "num_input_tokens_seen": 10138896, "step": 15500 }, { "epoch": 8.126310272536688, "grad_norm": 0.35207444429397583, "learning_rate": 5.161990788612781e-06, "loss": 0.4549, "num_input_tokens_seen": 10141456, "step": 15505 }, { "epoch": 8.128930817610064, "grad_norm": 0.42459017038345337, "learning_rate": 5.148082566392204e-06, "loss": 0.5368, "num_input_tokens_seen": 10144944, "step": 15510 }, { "epoch": 8.131551362683439, "grad_norm": 0.3380706012248993, "learning_rate": 5.13419095534935e-06, "loss": 0.3934, "num_input_tokens_seen": 10148016, "step": 15515 }, { "epoch": 8.134171907756814, "grad_norm": 0.36788424849510193, "learning_rate": 5.120315967108055e-06, "loss": 0.5169, "num_input_tokens_seen": 10152048, "step": 15520 }, { "epoch": 8.13679245283019, "grad_norm": 0.34868577122688293, "learning_rate": 5.106457613278298e-06, "loss": 0.5213, "num_input_tokens_seen": 10155600, "step": 15525 }, { "epoch": 8.139412997903564, "grad_norm": 0.6777585744857788, "learning_rate": 5.092615905456111e-06, "loss": 0.4906, "num_input_tokens_seen": 10158160, "step": 15530 }, { "epoch": 8.14203354297694, "grad_norm": 0.3159315586090088, "learning_rate": 5.078790855223595e-06, "loss": 0.4858, "num_input_tokens_seen": 10161296, "step": 15535 }, { "epoch": 8.144654088050315, "grad_norm": 0.27187445759773254, "learning_rate": 5.0649824741489325e-06, "loss": 0.4818, "num_input_tokens_seen": 10164112, "step": 15540 }, { "epoch": 8.14727463312369, "grad_norm": 0.5313558578491211, "learning_rate": 5.051190773786341e-06, "loss": 0.4595, "num_input_tokens_seen": 10167248, "step": 15545 }, { "epoch": 8.149895178197065, "grad_norm": 0.3628036379814148, "learning_rate": 5.0374157656760786e-06, "loss": 0.3666, "num_input_tokens_seen": 10171088, "step": 15550 }, { "epoch": 8.15251572327044, "grad_norm": 0.45375022292137146, "learning_rate": 5.023657461344456e-06, "loss": 0.5001, "num_input_tokens_seen": 10174192, "step": 15555 }, { "epoch": 8.155136268343815, "grad_norm": 0.4631226062774658, "learning_rate": 5.009915872303786e-06, "loss": 0.5038, "num_input_tokens_seen": 10177264, "step": 15560 }, { "epoch": 8.15775681341719, "grad_norm": 0.7532998323440552, "learning_rate": 4.996191010052403e-06, "loss": 0.5801, "num_input_tokens_seen": 10179824, "step": 15565 }, { "epoch": 8.160377358490566, "grad_norm": 0.34938931465148926, "learning_rate": 4.982482886074647e-06, "loss": 0.4313, "num_input_tokens_seen": 10183024, "step": 15570 }, { "epoch": 8.16299790356394, "grad_norm": 0.3112257421016693, "learning_rate": 4.968791511840842e-06, "loss": 0.4387, "num_input_tokens_seen": 10186832, "step": 15575 }, { "epoch": 8.165618448637316, "grad_norm": 0.4169905185699463, "learning_rate": 4.955116898807316e-06, "loss": 0.5147, "num_input_tokens_seen": 10189360, "step": 15580 }, { "epoch": 8.168238993710691, "grad_norm": 0.3746313154697418, "learning_rate": 4.941459058416356e-06, "loss": 0.4907, "num_input_tokens_seen": 10193360, "step": 15585 }, { "epoch": 8.170859538784066, "grad_norm": 0.6040776371955872, "learning_rate": 4.927818002096213e-06, "loss": 0.4736, "num_input_tokens_seen": 10197136, "step": 15590 }, { "epoch": 8.173480083857442, "grad_norm": 0.4337129592895508, "learning_rate": 4.9141937412611084e-06, "loss": 0.4637, "num_input_tokens_seen": 10200592, "step": 15595 }, { "epoch": 8.176100628930818, "grad_norm": 0.5485533475875854, "learning_rate": 4.900586287311202e-06, "loss": 0.4579, "num_input_tokens_seen": 10203088, "step": 15600 }, { "epoch": 8.178721174004194, "grad_norm": 0.5244458913803101, "learning_rate": 4.886995651632584e-06, "loss": 0.6052, "num_input_tokens_seen": 10205840, "step": 15605 }, { "epoch": 8.181341719077569, "grad_norm": 0.3796929121017456, "learning_rate": 4.873421845597273e-06, "loss": 0.3628, "num_input_tokens_seen": 10209040, "step": 15610 }, { "epoch": 8.183962264150944, "grad_norm": 0.4422202706336975, "learning_rate": 4.859864880563222e-06, "loss": 0.4434, "num_input_tokens_seen": 10212048, "step": 15615 }, { "epoch": 8.18658280922432, "grad_norm": 0.4481213390827179, "learning_rate": 4.846324767874277e-06, "loss": 0.507, "num_input_tokens_seen": 10214896, "step": 15620 }, { "epoch": 8.189203354297694, "grad_norm": 0.4842471480369568, "learning_rate": 4.832801518860175e-06, "loss": 0.4826, "num_input_tokens_seen": 10217872, "step": 15625 }, { "epoch": 8.19182389937107, "grad_norm": 0.5377528667449951, "learning_rate": 4.819295144836566e-06, "loss": 0.4632, "num_input_tokens_seen": 10221488, "step": 15630 }, { "epoch": 8.194444444444445, "grad_norm": 0.5082554221153259, "learning_rate": 4.805805657104965e-06, "loss": 0.3812, "num_input_tokens_seen": 10224048, "step": 15635 }, { "epoch": 8.19706498951782, "grad_norm": 0.29964056611061096, "learning_rate": 4.792333066952748e-06, "loss": 0.4166, "num_input_tokens_seen": 10228368, "step": 15640 }, { "epoch": 8.199685534591195, "grad_norm": 0.388448566198349, "learning_rate": 4.778877385653186e-06, "loss": 0.4496, "num_input_tokens_seen": 10231728, "step": 15645 }, { "epoch": 8.20230607966457, "grad_norm": 0.4220178723335266, "learning_rate": 4.7654386244653485e-06, "loss": 0.4326, "num_input_tokens_seen": 10234960, "step": 15650 }, { "epoch": 8.204926624737945, "grad_norm": 0.2942679226398468, "learning_rate": 4.752016794634201e-06, "loss": 0.4341, "num_input_tokens_seen": 10238448, "step": 15655 }, { "epoch": 8.20754716981132, "grad_norm": 0.5844259262084961, "learning_rate": 4.738611907390508e-06, "loss": 0.38, "num_input_tokens_seen": 10240912, "step": 15660 }, { "epoch": 8.210167714884696, "grad_norm": 0.32544323801994324, "learning_rate": 4.725223973950863e-06, "loss": 0.4743, "num_input_tokens_seen": 10244816, "step": 15665 }, { "epoch": 8.21278825995807, "grad_norm": 0.47348499298095703, "learning_rate": 4.711853005517686e-06, "loss": 0.498, "num_input_tokens_seen": 10247728, "step": 15670 }, { "epoch": 8.215408805031446, "grad_norm": 0.3322368860244751, "learning_rate": 4.698499013279189e-06, "loss": 0.4098, "num_input_tokens_seen": 10251248, "step": 15675 }, { "epoch": 8.218029350104821, "grad_norm": 0.4292924404144287, "learning_rate": 4.685162008409374e-06, "loss": 0.4804, "num_input_tokens_seen": 10254160, "step": 15680 }, { "epoch": 8.220649895178196, "grad_norm": 0.42213165760040283, "learning_rate": 4.671842002068061e-06, "loss": 0.4155, "num_input_tokens_seen": 10256944, "step": 15685 }, { "epoch": 8.223270440251572, "grad_norm": 0.48293426632881165, "learning_rate": 4.658539005400794e-06, "loss": 0.5203, "num_input_tokens_seen": 10259888, "step": 15690 }, { "epoch": 8.225890985324948, "grad_norm": 0.4226606488227844, "learning_rate": 4.645253029538926e-06, "loss": 0.5831, "num_input_tokens_seen": 10262640, "step": 15695 }, { "epoch": 8.228511530398324, "grad_norm": 0.5125459432601929, "learning_rate": 4.631984085599569e-06, "loss": 0.5065, "num_input_tokens_seen": 10265424, "step": 15700 }, { "epoch": 8.231132075471699, "grad_norm": 0.37926891446113586, "learning_rate": 4.618732184685542e-06, "loss": 0.5055, "num_input_tokens_seen": 10267888, "step": 15705 }, { "epoch": 8.233752620545074, "grad_norm": 0.2029845416545868, "learning_rate": 4.60549733788545e-06, "loss": 0.3296, "num_input_tokens_seen": 10270992, "step": 15710 }, { "epoch": 8.23637316561845, "grad_norm": 0.721586287021637, "learning_rate": 4.592279556273604e-06, "loss": 0.5176, "num_input_tokens_seen": 10274128, "step": 15715 }, { "epoch": 8.238993710691824, "grad_norm": 0.6006555557250977, "learning_rate": 4.579078850910032e-06, "loss": 0.416, "num_input_tokens_seen": 10278128, "step": 15720 }, { "epoch": 8.2416142557652, "grad_norm": 0.4524094760417938, "learning_rate": 4.565895232840489e-06, "loss": 0.4471, "num_input_tokens_seen": 10280720, "step": 15725 }, { "epoch": 8.244234800838575, "grad_norm": 0.32357484102249146, "learning_rate": 4.552728713096427e-06, "loss": 0.437, "num_input_tokens_seen": 10283280, "step": 15730 }, { "epoch": 8.24685534591195, "grad_norm": 0.3159482777118683, "learning_rate": 4.539579302694977e-06, "loss": 0.498, "num_input_tokens_seen": 10287600, "step": 15735 }, { "epoch": 8.249475890985325, "grad_norm": 0.5807706713676453, "learning_rate": 4.5264470126389765e-06, "loss": 0.4289, "num_input_tokens_seen": 10289808, "step": 15740 }, { "epoch": 8.2520964360587, "grad_norm": 0.8025239706039429, "learning_rate": 4.5133318539169215e-06, "loss": 0.5051, "num_input_tokens_seen": 10292656, "step": 15745 }, { "epoch": 8.254716981132075, "grad_norm": 0.3882347047328949, "learning_rate": 4.500233837502979e-06, "loss": 0.4137, "num_input_tokens_seen": 10295664, "step": 15750 }, { "epoch": 8.25733752620545, "grad_norm": 0.6380775570869446, "learning_rate": 4.4871529743569675e-06, "loss": 0.5012, "num_input_tokens_seen": 10299248, "step": 15755 }, { "epoch": 8.259958071278826, "grad_norm": 0.524185299873352, "learning_rate": 4.474089275424351e-06, "loss": 0.5004, "num_input_tokens_seen": 10302832, "step": 15760 }, { "epoch": 8.2625786163522, "grad_norm": 0.3399099111557007, "learning_rate": 4.461042751636252e-06, "loss": 0.4869, "num_input_tokens_seen": 10307888, "step": 15765 }, { "epoch": 8.265199161425576, "grad_norm": 0.5508050918579102, "learning_rate": 4.448013413909394e-06, "loss": 0.4439, "num_input_tokens_seen": 10311248, "step": 15770 }, { "epoch": 8.267819706498951, "grad_norm": 0.5228670835494995, "learning_rate": 4.435001273146127e-06, "loss": 0.4932, "num_input_tokens_seen": 10314320, "step": 15775 }, { "epoch": 8.270440251572326, "grad_norm": 0.4743099510669708, "learning_rate": 4.422006340234433e-06, "loss": 0.5071, "num_input_tokens_seen": 10317936, "step": 15780 }, { "epoch": 8.273060796645701, "grad_norm": 0.41569122672080994, "learning_rate": 4.4090286260478674e-06, "loss": 0.5432, "num_input_tokens_seen": 10321136, "step": 15785 }, { "epoch": 8.275681341719078, "grad_norm": 0.43828579783439636, "learning_rate": 4.3960681414455864e-06, "loss": 0.5347, "num_input_tokens_seen": 10323760, "step": 15790 }, { "epoch": 8.278301886792454, "grad_norm": 1.1385226249694824, "learning_rate": 4.383124897272331e-06, "loss": 0.4968, "num_input_tokens_seen": 10327088, "step": 15795 }, { "epoch": 8.280922431865829, "grad_norm": 0.4941794276237488, "learning_rate": 4.3701989043584274e-06, "loss": 0.5906, "num_input_tokens_seen": 10330448, "step": 15800 }, { "epoch": 8.283542976939204, "grad_norm": 0.7437271475791931, "learning_rate": 4.357290173519746e-06, "loss": 0.3964, "num_input_tokens_seen": 10334672, "step": 15805 }, { "epoch": 8.286163522012579, "grad_norm": 0.7292137742042542, "learning_rate": 4.344398715557724e-06, "loss": 0.5109, "num_input_tokens_seen": 10337904, "step": 15810 }, { "epoch": 8.288784067085954, "grad_norm": 0.45738816261291504, "learning_rate": 4.3315245412593496e-06, "loss": 0.4266, "num_input_tokens_seen": 10341488, "step": 15815 }, { "epoch": 8.29140461215933, "grad_norm": 0.5362314581871033, "learning_rate": 4.318667661397141e-06, "loss": 0.4656, "num_input_tokens_seen": 10344944, "step": 15820 }, { "epoch": 8.294025157232705, "grad_norm": 0.6584106087684631, "learning_rate": 4.305828086729144e-06, "loss": 0.5385, "num_input_tokens_seen": 10347248, "step": 15825 }, { "epoch": 8.29664570230608, "grad_norm": 0.45129522681236267, "learning_rate": 4.293005827998942e-06, "loss": 0.5865, "num_input_tokens_seen": 10349712, "step": 15830 }, { "epoch": 8.299266247379455, "grad_norm": 0.44512373208999634, "learning_rate": 4.280200895935593e-06, "loss": 0.4603, "num_input_tokens_seen": 10353936, "step": 15835 }, { "epoch": 8.30188679245283, "grad_norm": 1.2482789754867554, "learning_rate": 4.267413301253701e-06, "loss": 0.4331, "num_input_tokens_seen": 10356432, "step": 15840 }, { "epoch": 8.304507337526205, "grad_norm": 0.5147969126701355, "learning_rate": 4.254643054653329e-06, "loss": 0.714, "num_input_tokens_seen": 10360624, "step": 15845 }, { "epoch": 8.30712788259958, "grad_norm": 0.5217978954315186, "learning_rate": 4.241890166820034e-06, "loss": 0.4858, "num_input_tokens_seen": 10364432, "step": 15850 }, { "epoch": 8.309748427672956, "grad_norm": 0.44895535707473755, "learning_rate": 4.22915464842486e-06, "loss": 0.4391, "num_input_tokens_seen": 10367184, "step": 15855 }, { "epoch": 8.31236897274633, "grad_norm": 0.4589516818523407, "learning_rate": 4.216436510124303e-06, "loss": 0.4447, "num_input_tokens_seen": 10370192, "step": 15860 }, { "epoch": 8.314989517819706, "grad_norm": 0.39810001850128174, "learning_rate": 4.203735762560312e-06, "loss": 0.5477, "num_input_tokens_seen": 10373744, "step": 15865 }, { "epoch": 8.317610062893081, "grad_norm": 0.43714404106140137, "learning_rate": 4.191052416360314e-06, "loss": 0.5078, "num_input_tokens_seen": 10376400, "step": 15870 }, { "epoch": 8.320230607966456, "grad_norm": 0.38942477107048035, "learning_rate": 4.178386482137126e-06, "loss": 0.3673, "num_input_tokens_seen": 10379600, "step": 15875 }, { "epoch": 8.322851153039831, "grad_norm": 0.512409508228302, "learning_rate": 4.165737970489036e-06, "loss": 0.4506, "num_input_tokens_seen": 10383088, "step": 15880 }, { "epoch": 8.325471698113208, "grad_norm": 0.528891921043396, "learning_rate": 4.153106891999753e-06, "loss": 0.4801, "num_input_tokens_seen": 10386288, "step": 15885 }, { "epoch": 8.328092243186584, "grad_norm": 0.36936140060424805, "learning_rate": 4.140493257238362e-06, "loss": 0.6263, "num_input_tokens_seen": 10389744, "step": 15890 }, { "epoch": 8.330712788259959, "grad_norm": 0.4002956449985504, "learning_rate": 4.127897076759399e-06, "loss": 0.3946, "num_input_tokens_seen": 10392880, "step": 15895 }, { "epoch": 8.333333333333334, "grad_norm": 0.6131595373153687, "learning_rate": 4.11531836110276e-06, "loss": 0.577, "num_input_tokens_seen": 10397168, "step": 15900 }, { "epoch": 8.335953878406709, "grad_norm": 0.6491900086402893, "learning_rate": 4.1027571207937345e-06, "loss": 0.4862, "num_input_tokens_seen": 10400784, "step": 15905 }, { "epoch": 8.338574423480084, "grad_norm": 0.45168358087539673, "learning_rate": 4.09021336634301e-06, "loss": 0.2957, "num_input_tokens_seen": 10403568, "step": 15910 }, { "epoch": 8.34119496855346, "grad_norm": 0.5656275153160095, "learning_rate": 4.077687108246622e-06, "loss": 0.4498, "num_input_tokens_seen": 10406800, "step": 15915 }, { "epoch": 8.343815513626835, "grad_norm": 0.2913353443145752, "learning_rate": 4.06517835698596e-06, "loss": 0.3907, "num_input_tokens_seen": 10409744, "step": 15920 }, { "epoch": 8.34643605870021, "grad_norm": 0.6304960250854492, "learning_rate": 4.0526871230277905e-06, "loss": 0.4632, "num_input_tokens_seen": 10412880, "step": 15925 }, { "epoch": 8.349056603773585, "grad_norm": 0.6257203817367554, "learning_rate": 4.040213416824204e-06, "loss": 0.4856, "num_input_tokens_seen": 10416432, "step": 15930 }, { "epoch": 8.35167714884696, "grad_norm": 0.7766960859298706, "learning_rate": 4.027757248812622e-06, "loss": 0.4509, "num_input_tokens_seen": 10419824, "step": 15935 }, { "epoch": 8.354297693920335, "grad_norm": 0.35478535294532776, "learning_rate": 4.015318629415804e-06, "loss": 0.4572, "num_input_tokens_seen": 10422800, "step": 15940 }, { "epoch": 8.35691823899371, "grad_norm": 0.5101391077041626, "learning_rate": 4.002897569041808e-06, "loss": 0.3951, "num_input_tokens_seen": 10425808, "step": 15945 }, { "epoch": 8.359538784067086, "grad_norm": 0.36682042479515076, "learning_rate": 3.990494078084022e-06, "loss": 0.4582, "num_input_tokens_seen": 10428944, "step": 15950 }, { "epoch": 8.36215932914046, "grad_norm": 0.40746811032295227, "learning_rate": 3.9781081669211156e-06, "loss": 0.5072, "num_input_tokens_seen": 10433648, "step": 15955 }, { "epoch": 8.364779874213836, "grad_norm": 0.2835358679294586, "learning_rate": 3.965739845917049e-06, "loss": 0.4357, "num_input_tokens_seen": 10436784, "step": 15960 }, { "epoch": 8.367400419287211, "grad_norm": 0.5724095106124878, "learning_rate": 3.953389125421078e-06, "loss": 0.4494, "num_input_tokens_seen": 10439280, "step": 15965 }, { "epoch": 8.370020964360586, "grad_norm": 0.4494653642177582, "learning_rate": 3.941056015767713e-06, "loss": 0.3793, "num_input_tokens_seen": 10441744, "step": 15970 }, { "epoch": 8.372641509433961, "grad_norm": 0.4178015887737274, "learning_rate": 3.928740527276745e-06, "loss": 0.439, "num_input_tokens_seen": 10445424, "step": 15975 }, { "epoch": 8.375262054507338, "grad_norm": 0.5169162154197693, "learning_rate": 3.916442670253198e-06, "loss": 0.6618, "num_input_tokens_seen": 10448688, "step": 15980 }, { "epoch": 8.377882599580714, "grad_norm": 0.40338394045829773, "learning_rate": 3.904162454987373e-06, "loss": 0.4905, "num_input_tokens_seen": 10453040, "step": 15985 }, { "epoch": 8.380503144654089, "grad_norm": 0.43346425890922546, "learning_rate": 3.891899891754788e-06, "loss": 0.5212, "num_input_tokens_seen": 10457072, "step": 15990 }, { "epoch": 8.383123689727464, "grad_norm": 1.5257314443588257, "learning_rate": 3.8796549908161864e-06, "loss": 0.3399, "num_input_tokens_seen": 10459280, "step": 15995 }, { "epoch": 8.385744234800839, "grad_norm": 0.7979829907417297, "learning_rate": 3.867427762417555e-06, "loss": 0.4265, "num_input_tokens_seen": 10462192, "step": 16000 }, { "epoch": 8.388364779874214, "grad_norm": 0.3965546786785126, "learning_rate": 3.855218216790077e-06, "loss": 0.4678, "num_input_tokens_seen": 10466672, "step": 16005 }, { "epoch": 8.39098532494759, "grad_norm": 0.3433134853839874, "learning_rate": 3.843026364150132e-06, "loss": 0.4602, "num_input_tokens_seen": 10469776, "step": 16010 }, { "epoch": 8.393605870020965, "grad_norm": 0.8136436939239502, "learning_rate": 3.830852214699326e-06, "loss": 0.3636, "num_input_tokens_seen": 10472848, "step": 16015 }, { "epoch": 8.39622641509434, "grad_norm": 1.4275588989257812, "learning_rate": 3.818695778624409e-06, "loss": 0.5498, "num_input_tokens_seen": 10476368, "step": 16020 }, { "epoch": 8.398846960167715, "grad_norm": 0.4621179401874542, "learning_rate": 3.8065570660973436e-06, "loss": 0.4826, "num_input_tokens_seen": 10479472, "step": 16025 }, { "epoch": 8.40146750524109, "grad_norm": 0.5334873795509338, "learning_rate": 3.7944360872752495e-06, "loss": 0.5338, "num_input_tokens_seen": 10482000, "step": 16030 }, { "epoch": 8.404088050314465, "grad_norm": 0.58015376329422, "learning_rate": 3.782332852300402e-06, "loss": 0.478, "num_input_tokens_seen": 10484592, "step": 16035 }, { "epoch": 8.40670859538784, "grad_norm": 1.03990638256073, "learning_rate": 3.770247371300242e-06, "loss": 0.4439, "num_input_tokens_seen": 10487216, "step": 16040 }, { "epoch": 8.409329140461216, "grad_norm": 0.43247753381729126, "learning_rate": 3.7581796543873477e-06, "loss": 0.4733, "num_input_tokens_seen": 10490512, "step": 16045 }, { "epoch": 8.41194968553459, "grad_norm": 0.44164034724235535, "learning_rate": 3.746129711659424e-06, "loss": 0.3674, "num_input_tokens_seen": 10493552, "step": 16050 }, { "epoch": 8.414570230607966, "grad_norm": 0.47463032603263855, "learning_rate": 3.7340975531993313e-06, "loss": 0.5383, "num_input_tokens_seen": 10497264, "step": 16055 }, { "epoch": 8.417190775681341, "grad_norm": 0.4562815725803375, "learning_rate": 3.7220831890750067e-06, "loss": 0.3612, "num_input_tokens_seen": 10499984, "step": 16060 }, { "epoch": 8.419811320754716, "grad_norm": 0.49311596155166626, "learning_rate": 3.7100866293395403e-06, "loss": 0.4311, "num_input_tokens_seen": 10503248, "step": 16065 }, { "epoch": 8.422431865828091, "grad_norm": 0.3563843369483948, "learning_rate": 3.698107884031099e-06, "loss": 0.478, "num_input_tokens_seen": 10507088, "step": 16070 }, { "epoch": 8.425052410901468, "grad_norm": 0.3501626253128052, "learning_rate": 3.68614696317294e-06, "loss": 0.4248, "num_input_tokens_seen": 10510416, "step": 16075 }, { "epoch": 8.427672955974844, "grad_norm": 0.485771507024765, "learning_rate": 3.6742038767734326e-06, "loss": 0.5303, "num_input_tokens_seen": 10514384, "step": 16080 }, { "epoch": 8.430293501048219, "grad_norm": 0.39539334177970886, "learning_rate": 3.6622786348259967e-06, "loss": 0.5554, "num_input_tokens_seen": 10517872, "step": 16085 }, { "epoch": 8.432914046121594, "grad_norm": 0.46624818444252014, "learning_rate": 3.6503712473091257e-06, "loss": 0.4814, "num_input_tokens_seen": 10521200, "step": 16090 }, { "epoch": 8.435534591194969, "grad_norm": 0.3304520547389984, "learning_rate": 3.6384817241863877e-06, "loss": 0.4524, "num_input_tokens_seen": 10523856, "step": 16095 }, { "epoch": 8.438155136268344, "grad_norm": 0.3257623016834259, "learning_rate": 3.626610075406389e-06, "loss": 0.506, "num_input_tokens_seen": 10527728, "step": 16100 }, { "epoch": 8.44077568134172, "grad_norm": 0.45869380235671997, "learning_rate": 3.614756310902781e-06, "loss": 0.4588, "num_input_tokens_seen": 10531536, "step": 16105 }, { "epoch": 8.443396226415095, "grad_norm": 0.3798746168613434, "learning_rate": 3.6029204405942485e-06, "loss": 0.4021, "num_input_tokens_seen": 10534352, "step": 16110 }, { "epoch": 8.44601677148847, "grad_norm": 0.36667561531066895, "learning_rate": 3.5911024743845166e-06, "loss": 0.4671, "num_input_tokens_seen": 10537552, "step": 16115 }, { "epoch": 8.448637316561845, "grad_norm": 0.636420726776123, "learning_rate": 3.5793024221623147e-06, "loss": 0.4722, "num_input_tokens_seen": 10539952, "step": 16120 }, { "epoch": 8.45125786163522, "grad_norm": 0.4612787961959839, "learning_rate": 3.567520293801388e-06, "loss": 0.4415, "num_input_tokens_seen": 10543408, "step": 16125 }, { "epoch": 8.453878406708595, "grad_norm": 0.3006177246570587, "learning_rate": 3.555756099160476e-06, "loss": 0.368, "num_input_tokens_seen": 10546352, "step": 16130 }, { "epoch": 8.45649895178197, "grad_norm": 0.314557820558548, "learning_rate": 3.544009848083335e-06, "loss": 0.4094, "num_input_tokens_seen": 10550352, "step": 16135 }, { "epoch": 8.459119496855346, "grad_norm": 0.3596094846725464, "learning_rate": 3.5322815503986804e-06, "loss": 0.5883, "num_input_tokens_seen": 10554864, "step": 16140 }, { "epoch": 8.46174004192872, "grad_norm": 0.3699497878551483, "learning_rate": 3.520571215920218e-06, "loss": 0.384, "num_input_tokens_seen": 10558960, "step": 16145 }, { "epoch": 8.464360587002096, "grad_norm": 0.4075722396373749, "learning_rate": 3.5088788544466177e-06, "loss": 0.4657, "num_input_tokens_seen": 10563600, "step": 16150 }, { "epoch": 8.466981132075471, "grad_norm": 0.5946688652038574, "learning_rate": 3.4972044757615203e-06, "loss": 0.4611, "num_input_tokens_seen": 10566480, "step": 16155 }, { "epoch": 8.469601677148846, "grad_norm": 0.38825178146362305, "learning_rate": 3.4855480896335084e-06, "loss": 0.368, "num_input_tokens_seen": 10569616, "step": 16160 }, { "epoch": 8.472222222222221, "grad_norm": 0.4497855007648468, "learning_rate": 3.4739097058161114e-06, "loss": 0.5344, "num_input_tokens_seen": 10572336, "step": 16165 }, { "epoch": 8.474842767295598, "grad_norm": 0.5463660955429077, "learning_rate": 3.462289334047805e-06, "loss": 0.5595, "num_input_tokens_seen": 10574576, "step": 16170 }, { "epoch": 8.477463312368974, "grad_norm": 0.3246290385723114, "learning_rate": 3.450686984051979e-06, "loss": 0.4099, "num_input_tokens_seen": 10578640, "step": 16175 }, { "epoch": 8.480083857442349, "grad_norm": 0.3870337903499603, "learning_rate": 3.4391026655369474e-06, "loss": 0.3654, "num_input_tokens_seen": 10581392, "step": 16180 }, { "epoch": 8.482704402515724, "grad_norm": 0.5734310746192932, "learning_rate": 3.427536388195954e-06, "loss": 0.6651, "num_input_tokens_seen": 10584144, "step": 16185 }, { "epoch": 8.485324947589099, "grad_norm": 1.0066356658935547, "learning_rate": 3.415988161707109e-06, "loss": 0.4994, "num_input_tokens_seen": 10587600, "step": 16190 }, { "epoch": 8.487945492662474, "grad_norm": 0.7955865263938904, "learning_rate": 3.404457995733451e-06, "loss": 0.4784, "num_input_tokens_seen": 10590832, "step": 16195 }, { "epoch": 8.49056603773585, "grad_norm": 0.5293580293655396, "learning_rate": 3.3929458999229113e-06, "loss": 0.5127, "num_input_tokens_seen": 10593712, "step": 16200 }, { "epoch": 8.493186582809225, "grad_norm": 0.3387070894241333, "learning_rate": 3.381451883908257e-06, "loss": 0.4314, "num_input_tokens_seen": 10596336, "step": 16205 }, { "epoch": 8.4958071278826, "grad_norm": 0.3554207980632782, "learning_rate": 3.369975957307178e-06, "loss": 0.4472, "num_input_tokens_seen": 10599536, "step": 16210 }, { "epoch": 8.498427672955975, "grad_norm": 0.4795713424682617, "learning_rate": 3.358518129722199e-06, "loss": 0.2548, "num_input_tokens_seen": 10602576, "step": 16215 }, { "epoch": 8.5, "eval_loss": 0.48336100578308105, "eval_runtime": 14.5433, "eval_samples_per_second": 58.309, "eval_steps_per_second": 14.577, "num_input_tokens_seen": 10604656, "step": 16218 }, { "epoch": 8.50104821802935, "grad_norm": 0.6179157495498657, "learning_rate": 3.3470784107406976e-06, "loss": 0.3983, "num_input_tokens_seen": 10605616, "step": 16220 }, { "epoch": 8.503668763102725, "grad_norm": 0.662920355796814, "learning_rate": 3.3356568099349283e-06, "loss": 0.6283, "num_input_tokens_seen": 10609040, "step": 16225 }, { "epoch": 8.5062893081761, "grad_norm": 0.3483963906764984, "learning_rate": 3.3242533368619435e-06, "loss": 0.5626, "num_input_tokens_seen": 10612304, "step": 16230 }, { "epoch": 8.508909853249476, "grad_norm": 0.3303278982639313, "learning_rate": 3.312868001063654e-06, "loss": 0.3899, "num_input_tokens_seen": 10614928, "step": 16235 }, { "epoch": 8.51153039832285, "grad_norm": 0.3826712369918823, "learning_rate": 3.3015008120668072e-06, "loss": 0.5852, "num_input_tokens_seen": 10618320, "step": 16240 }, { "epoch": 8.514150943396226, "grad_norm": 0.6180025935173035, "learning_rate": 3.290151779382922e-06, "loss": 0.4746, "num_input_tokens_seen": 10621296, "step": 16245 }, { "epoch": 8.516771488469601, "grad_norm": 0.4517839848995209, "learning_rate": 3.2788209125083654e-06, "loss": 0.505, "num_input_tokens_seen": 10623984, "step": 16250 }, { "epoch": 8.519392033542976, "grad_norm": 0.48083022236824036, "learning_rate": 3.267508220924287e-06, "loss": 0.4538, "num_input_tokens_seen": 10628560, "step": 16255 }, { "epoch": 8.522012578616351, "grad_norm": 0.635352611541748, "learning_rate": 3.256213714096623e-06, "loss": 0.4187, "num_input_tokens_seen": 10631856, "step": 16260 }, { "epoch": 8.524633123689728, "grad_norm": 0.3266298770904541, "learning_rate": 3.2449374014761114e-06, "loss": 0.432, "num_input_tokens_seen": 10635696, "step": 16265 }, { "epoch": 8.527253668763104, "grad_norm": 0.5728681087493896, "learning_rate": 3.2336792924982514e-06, "loss": 0.4774, "num_input_tokens_seen": 10638384, "step": 16270 }, { "epoch": 8.529874213836479, "grad_norm": 0.5691657066345215, "learning_rate": 3.222439396583307e-06, "loss": 0.4925, "num_input_tokens_seen": 10641456, "step": 16275 }, { "epoch": 8.532494758909854, "grad_norm": 1.224467158317566, "learning_rate": 3.2112177231363226e-06, "loss": 0.3939, "num_input_tokens_seen": 10644336, "step": 16280 }, { "epoch": 8.535115303983229, "grad_norm": 0.4153590798377991, "learning_rate": 3.2000142815470756e-06, "loss": 0.4039, "num_input_tokens_seen": 10647440, "step": 16285 }, { "epoch": 8.537735849056604, "grad_norm": 0.4409239888191223, "learning_rate": 3.188829081190095e-06, "loss": 0.4358, "num_input_tokens_seen": 10649872, "step": 16290 }, { "epoch": 8.54035639412998, "grad_norm": 0.3927362859249115, "learning_rate": 3.1776621314246384e-06, "loss": 0.5181, "num_input_tokens_seen": 10652976, "step": 16295 }, { "epoch": 8.542976939203355, "grad_norm": 0.497989296913147, "learning_rate": 3.1665134415947125e-06, "loss": 0.431, "num_input_tokens_seen": 10656016, "step": 16300 }, { "epoch": 8.54559748427673, "grad_norm": 0.48799586296081543, "learning_rate": 3.1553830210290236e-06, "loss": 0.5114, "num_input_tokens_seen": 10659536, "step": 16305 }, { "epoch": 8.548218029350105, "grad_norm": 0.4925394356250763, "learning_rate": 3.1442708790410002e-06, "loss": 0.3691, "num_input_tokens_seen": 10662416, "step": 16310 }, { "epoch": 8.55083857442348, "grad_norm": 0.27833449840545654, "learning_rate": 3.133177024928771e-06, "loss": 0.3963, "num_input_tokens_seen": 10666320, "step": 16315 }, { "epoch": 8.553459119496855, "grad_norm": 0.617967426776886, "learning_rate": 3.1221014679751777e-06, "loss": 0.6304, "num_input_tokens_seen": 10669136, "step": 16320 }, { "epoch": 8.55607966457023, "grad_norm": 0.4546443223953247, "learning_rate": 3.111044217447731e-06, "loss": 0.4408, "num_input_tokens_seen": 10672240, "step": 16325 }, { "epoch": 8.558700209643606, "grad_norm": 0.6498191952705383, "learning_rate": 3.1000052825986366e-06, "loss": 0.4778, "num_input_tokens_seen": 10674992, "step": 16330 }, { "epoch": 8.56132075471698, "grad_norm": 0.500320553779602, "learning_rate": 3.0889846726647657e-06, "loss": 0.508, "num_input_tokens_seen": 10677616, "step": 16335 }, { "epoch": 8.563941299790356, "grad_norm": 0.44885095953941345, "learning_rate": 3.077982396867668e-06, "loss": 0.4715, "num_input_tokens_seen": 10681136, "step": 16340 }, { "epoch": 8.566561844863731, "grad_norm": 0.7299615144729614, "learning_rate": 3.066998464413545e-06, "loss": 0.6128, "num_input_tokens_seen": 10684368, "step": 16345 }, { "epoch": 8.569182389937106, "grad_norm": 0.4553450047969818, "learning_rate": 3.056032884493243e-06, "loss": 0.5237, "num_input_tokens_seen": 10687632, "step": 16350 }, { "epoch": 8.571802935010481, "grad_norm": 0.2452678680419922, "learning_rate": 3.045085666282266e-06, "loss": 0.4965, "num_input_tokens_seen": 10692336, "step": 16355 }, { "epoch": 8.574423480083858, "grad_norm": 0.3161924183368683, "learning_rate": 3.034156818940745e-06, "loss": 0.5053, "num_input_tokens_seen": 10696048, "step": 16360 }, { "epoch": 8.577044025157234, "grad_norm": 1.336634635925293, "learning_rate": 3.0232463516134317e-06, "loss": 0.4342, "num_input_tokens_seen": 10700304, "step": 16365 }, { "epoch": 8.579664570230609, "grad_norm": 0.39462926983833313, "learning_rate": 3.0123542734297267e-06, "loss": 0.4676, "num_input_tokens_seen": 10703120, "step": 16370 }, { "epoch": 8.582285115303984, "grad_norm": 0.2287328988313675, "learning_rate": 3.0014805935035973e-06, "loss": 0.47, "num_input_tokens_seen": 10707536, "step": 16375 }, { "epoch": 8.584905660377359, "grad_norm": 0.4437112510204315, "learning_rate": 2.99062532093366e-06, "loss": 0.5378, "num_input_tokens_seen": 10710384, "step": 16380 }, { "epoch": 8.587526205450734, "grad_norm": 0.5824297666549683, "learning_rate": 2.979788464803107e-06, "loss": 0.4562, "num_input_tokens_seen": 10713840, "step": 16385 }, { "epoch": 8.59014675052411, "grad_norm": 0.46229466795921326, "learning_rate": 2.968970034179719e-06, "loss": 0.4114, "num_input_tokens_seen": 10716720, "step": 16390 }, { "epoch": 8.592767295597485, "grad_norm": 0.446362167596817, "learning_rate": 2.9581700381158735e-06, "loss": 0.512, "num_input_tokens_seen": 10720720, "step": 16395 }, { "epoch": 8.59538784067086, "grad_norm": 0.6071131825447083, "learning_rate": 2.9473884856485113e-06, "loss": 0.4469, "num_input_tokens_seen": 10723952, "step": 16400 }, { "epoch": 8.598008385744235, "grad_norm": 0.6324207186698914, "learning_rate": 2.936625385799133e-06, "loss": 0.5638, "num_input_tokens_seen": 10726096, "step": 16405 }, { "epoch": 8.60062893081761, "grad_norm": 0.28480860590934753, "learning_rate": 2.925880747573831e-06, "loss": 0.3895, "num_input_tokens_seen": 10729456, "step": 16410 }, { "epoch": 8.603249475890985, "grad_norm": 0.39854133129119873, "learning_rate": 2.9151545799632003e-06, "loss": 0.3606, "num_input_tokens_seen": 10731824, "step": 16415 }, { "epoch": 8.60587002096436, "grad_norm": 0.3440108001232147, "learning_rate": 2.9044468919424305e-06, "loss": 0.3585, "num_input_tokens_seen": 10734864, "step": 16420 }, { "epoch": 8.608490566037736, "grad_norm": 0.4662792384624481, "learning_rate": 2.8937576924712133e-06, "loss": 0.457, "num_input_tokens_seen": 10737744, "step": 16425 }, { "epoch": 8.61111111111111, "grad_norm": 0.5455425381660461, "learning_rate": 2.883086990493783e-06, "loss": 0.3259, "num_input_tokens_seen": 10740656, "step": 16430 }, { "epoch": 8.613731656184486, "grad_norm": 0.7829487323760986, "learning_rate": 2.872434794938905e-06, "loss": 0.7093, "num_input_tokens_seen": 10744688, "step": 16435 }, { "epoch": 8.616352201257861, "grad_norm": 0.40071651339530945, "learning_rate": 2.861801114719842e-06, "loss": 0.5177, "num_input_tokens_seen": 10747184, "step": 16440 }, { "epoch": 8.618972746331236, "grad_norm": 0.633023738861084, "learning_rate": 2.8511859587343704e-06, "loss": 0.6002, "num_input_tokens_seen": 10750320, "step": 16445 }, { "epoch": 8.621593291404611, "grad_norm": 0.44381555914878845, "learning_rate": 2.840589335864774e-06, "loss": 0.3905, "num_input_tokens_seen": 10753328, "step": 16450 }, { "epoch": 8.624213836477988, "grad_norm": 0.4908807575702667, "learning_rate": 2.830011254977821e-06, "loss": 0.4868, "num_input_tokens_seen": 10756208, "step": 16455 }, { "epoch": 8.626834381551364, "grad_norm": 0.9169021248817444, "learning_rate": 2.819451724924768e-06, "loss": 0.4328, "num_input_tokens_seen": 10759888, "step": 16460 }, { "epoch": 8.629454926624739, "grad_norm": 0.6648685932159424, "learning_rate": 2.8089107545413355e-06, "loss": 0.4203, "num_input_tokens_seen": 10763152, "step": 16465 }, { "epoch": 8.632075471698114, "grad_norm": 0.5005737543106079, "learning_rate": 2.7983883526477433e-06, "loss": 0.4715, "num_input_tokens_seen": 10767024, "step": 16470 }, { "epoch": 8.634696016771489, "grad_norm": 0.399533748626709, "learning_rate": 2.7878845280486453e-06, "loss": 0.3605, "num_input_tokens_seen": 10770096, "step": 16475 }, { "epoch": 8.637316561844864, "grad_norm": 0.48731529712677, "learning_rate": 2.777399289533164e-06, "loss": 0.534, "num_input_tokens_seen": 10774576, "step": 16480 }, { "epoch": 8.63993710691824, "grad_norm": 0.40575000643730164, "learning_rate": 2.766932645874873e-06, "loss": 0.3577, "num_input_tokens_seen": 10777296, "step": 16485 }, { "epoch": 8.642557651991615, "grad_norm": 0.29208508133888245, "learning_rate": 2.756484605831777e-06, "loss": 0.4095, "num_input_tokens_seen": 10779696, "step": 16490 }, { "epoch": 8.64517819706499, "grad_norm": 0.5386680960655212, "learning_rate": 2.74605517814632e-06, "loss": 0.3984, "num_input_tokens_seen": 10783184, "step": 16495 }, { "epoch": 8.647798742138365, "grad_norm": 0.36261096596717834, "learning_rate": 2.7356443715453705e-06, "loss": 0.6156, "num_input_tokens_seen": 10786512, "step": 16500 }, { "epoch": 8.65041928721174, "grad_norm": 0.47305357456207275, "learning_rate": 2.725252194740213e-06, "loss": 0.424, "num_input_tokens_seen": 10789872, "step": 16505 }, { "epoch": 8.653039832285115, "grad_norm": 0.35252922773361206, "learning_rate": 2.714878656426553e-06, "loss": 0.4045, "num_input_tokens_seen": 10792944, "step": 16510 }, { "epoch": 8.65566037735849, "grad_norm": 0.6245428323745728, "learning_rate": 2.704523765284489e-06, "loss": 0.4804, "num_input_tokens_seen": 10796624, "step": 16515 }, { "epoch": 8.658280922431866, "grad_norm": 0.40169355273246765, "learning_rate": 2.6941875299785174e-06, "loss": 0.4536, "num_input_tokens_seen": 10800144, "step": 16520 }, { "epoch": 8.66090146750524, "grad_norm": 0.6133055090904236, "learning_rate": 2.683869959157534e-06, "loss": 0.4143, "num_input_tokens_seen": 10803152, "step": 16525 }, { "epoch": 8.663522012578616, "grad_norm": 0.822066605091095, "learning_rate": 2.673571061454813e-06, "loss": 0.51, "num_input_tokens_seen": 10805552, "step": 16530 }, { "epoch": 8.666142557651991, "grad_norm": 0.5298933386802673, "learning_rate": 2.6632908454879898e-06, "loss": 0.4571, "num_input_tokens_seen": 10808880, "step": 16535 }, { "epoch": 8.668763102725366, "grad_norm": 0.49235790967941284, "learning_rate": 2.653029319859096e-06, "loss": 0.4384, "num_input_tokens_seen": 10812240, "step": 16540 }, { "epoch": 8.671383647798741, "grad_norm": 1.0546953678131104, "learning_rate": 2.642786493154492e-06, "loss": 0.4276, "num_input_tokens_seen": 10815600, "step": 16545 }, { "epoch": 8.674004192872118, "grad_norm": 0.4789697229862213, "learning_rate": 2.6325623739449108e-06, "loss": 0.448, "num_input_tokens_seen": 10819120, "step": 16550 }, { "epoch": 8.676624737945493, "grad_norm": 0.6490334272384644, "learning_rate": 2.6223569707854444e-06, "loss": 0.3961, "num_input_tokens_seen": 10821744, "step": 16555 }, { "epoch": 8.679245283018869, "grad_norm": 0.8177077770233154, "learning_rate": 2.612170292215482e-06, "loss": 0.4291, "num_input_tokens_seen": 10824336, "step": 16560 }, { "epoch": 8.681865828092244, "grad_norm": 0.6414976119995117, "learning_rate": 2.6020023467587917e-06, "loss": 0.5761, "num_input_tokens_seen": 10826672, "step": 16565 }, { "epoch": 8.684486373165619, "grad_norm": 0.35372912883758545, "learning_rate": 2.5918531429234368e-06, "loss": 0.3945, "num_input_tokens_seen": 10830192, "step": 16570 }, { "epoch": 8.687106918238994, "grad_norm": 1.2656852006912231, "learning_rate": 2.5817226892018016e-06, "loss": 0.64, "num_input_tokens_seen": 10834096, "step": 16575 }, { "epoch": 8.68972746331237, "grad_norm": 0.6408225893974304, "learning_rate": 2.571610994070603e-06, "loss": 0.4613, "num_input_tokens_seen": 10837360, "step": 16580 }, { "epoch": 8.692348008385745, "grad_norm": 0.4096130430698395, "learning_rate": 2.561518065990834e-06, "loss": 0.5536, "num_input_tokens_seen": 10841168, "step": 16585 }, { "epoch": 8.69496855345912, "grad_norm": 0.5702529549598694, "learning_rate": 2.5514439134077945e-06, "loss": 0.3425, "num_input_tokens_seen": 10844784, "step": 16590 }, { "epoch": 8.697589098532495, "grad_norm": 0.5509459972381592, "learning_rate": 2.541388544751089e-06, "loss": 0.4656, "num_input_tokens_seen": 10847376, "step": 16595 }, { "epoch": 8.70020964360587, "grad_norm": 0.5264990925788879, "learning_rate": 2.53135196843457e-06, "loss": 0.3985, "num_input_tokens_seen": 10850000, "step": 16600 }, { "epoch": 8.702830188679245, "grad_norm": 0.4875281751155853, "learning_rate": 2.521334192856403e-06, "loss": 0.374, "num_input_tokens_seen": 10852528, "step": 16605 }, { "epoch": 8.70545073375262, "grad_norm": 0.9728409647941589, "learning_rate": 2.5113352263990005e-06, "loss": 0.436, "num_input_tokens_seen": 10855312, "step": 16610 }, { "epoch": 8.708071278825996, "grad_norm": 0.2915073335170746, "learning_rate": 2.5013550774290322e-06, "loss": 0.6068, "num_input_tokens_seen": 10859376, "step": 16615 }, { "epoch": 8.71069182389937, "grad_norm": 0.48998135328292847, "learning_rate": 2.491393754297444e-06, "loss": 0.4995, "num_input_tokens_seen": 10862224, "step": 16620 }, { "epoch": 8.713312368972746, "grad_norm": 0.6878345012664795, "learning_rate": 2.48145126533941e-06, "loss": 0.6035, "num_input_tokens_seen": 10865104, "step": 16625 }, { "epoch": 8.715932914046121, "grad_norm": 0.465912401676178, "learning_rate": 2.4715276188743476e-06, "loss": 0.3984, "num_input_tokens_seen": 10867792, "step": 16630 }, { "epoch": 8.718553459119496, "grad_norm": 0.642446756362915, "learning_rate": 2.461622823205917e-06, "loss": 0.4929, "num_input_tokens_seen": 10870768, "step": 16635 }, { "epoch": 8.721174004192871, "grad_norm": 0.35837942361831665, "learning_rate": 2.451736886621997e-06, "loss": 0.3308, "num_input_tokens_seen": 10875056, "step": 16640 }, { "epoch": 8.723794549266248, "grad_norm": 0.6865161657333374, "learning_rate": 2.4418698173946872e-06, "loss": 0.5206, "num_input_tokens_seen": 10878288, "step": 16645 }, { "epoch": 8.726415094339622, "grad_norm": 0.5573830008506775, "learning_rate": 2.432021623780295e-06, "loss": 0.4969, "num_input_tokens_seen": 10880720, "step": 16650 }, { "epoch": 8.729035639412999, "grad_norm": 0.414510577917099, "learning_rate": 2.4221923140193477e-06, "loss": 0.2893, "num_input_tokens_seen": 10884528, "step": 16655 }, { "epoch": 8.731656184486374, "grad_norm": 0.48730534315109253, "learning_rate": 2.41238189633656e-06, "loss": 0.5694, "num_input_tokens_seen": 10886960, "step": 16660 }, { "epoch": 8.734276729559749, "grad_norm": 0.32753926515579224, "learning_rate": 2.402590378940836e-06, "loss": 0.4746, "num_input_tokens_seen": 10889904, "step": 16665 }, { "epoch": 8.736897274633124, "grad_norm": 0.48342272639274597, "learning_rate": 2.3928177700252798e-06, "loss": 0.564, "num_input_tokens_seen": 10893072, "step": 16670 }, { "epoch": 8.7395178197065, "grad_norm": 0.37275317311286926, "learning_rate": 2.3830640777671583e-06, "loss": 0.5009, "num_input_tokens_seen": 10897392, "step": 16675 }, { "epoch": 8.742138364779874, "grad_norm": 0.3408099114894867, "learning_rate": 2.3733293103279153e-06, "loss": 0.5572, "num_input_tokens_seen": 10901008, "step": 16680 }, { "epoch": 8.74475890985325, "grad_norm": 0.4123198986053467, "learning_rate": 2.3636134758531604e-06, "loss": 0.5293, "num_input_tokens_seen": 10904848, "step": 16685 }, { "epoch": 8.747379454926625, "grad_norm": 0.6865183711051941, "learning_rate": 2.3539165824726565e-06, "loss": 0.4706, "num_input_tokens_seen": 10908528, "step": 16690 }, { "epoch": 8.75, "grad_norm": 0.44309374690055847, "learning_rate": 2.344238638300328e-06, "loss": 0.5686, "num_input_tokens_seen": 10912080, "step": 16695 }, { "epoch": 8.752620545073375, "grad_norm": 0.41541099548339844, "learning_rate": 2.334579651434235e-06, "loss": 0.4755, "num_input_tokens_seen": 10916528, "step": 16700 }, { "epoch": 8.75524109014675, "grad_norm": 0.5414721369743347, "learning_rate": 2.3249396299565683e-06, "loss": 0.4396, "num_input_tokens_seen": 10920688, "step": 16705 }, { "epoch": 8.757861635220126, "grad_norm": 0.5959079265594482, "learning_rate": 2.3153185819336705e-06, "loss": 0.4445, "num_input_tokens_seen": 10923600, "step": 16710 }, { "epoch": 8.7604821802935, "grad_norm": 0.3151915371417999, "learning_rate": 2.3057165154159873e-06, "loss": 0.388, "num_input_tokens_seen": 10926544, "step": 16715 }, { "epoch": 8.763102725366876, "grad_norm": 0.3661130964756012, "learning_rate": 2.296133438438086e-06, "loss": 0.3898, "num_input_tokens_seen": 10929392, "step": 16720 }, { "epoch": 8.765723270440251, "grad_norm": 0.7427371144294739, "learning_rate": 2.2865693590186616e-06, "loss": 0.4272, "num_input_tokens_seen": 10933456, "step": 16725 }, { "epoch": 8.768343815513626, "grad_norm": 0.6435765624046326, "learning_rate": 2.2770242851604813e-06, "loss": 0.6048, "num_input_tokens_seen": 10936336, "step": 16730 }, { "epoch": 8.770964360587001, "grad_norm": 0.4382232129573822, "learning_rate": 2.2674982248504395e-06, "loss": 0.4917, "num_input_tokens_seen": 10939696, "step": 16735 }, { "epoch": 8.773584905660378, "grad_norm": 0.5668628811836243, "learning_rate": 2.257991186059502e-06, "loss": 0.3743, "num_input_tokens_seen": 10942192, "step": 16740 }, { "epoch": 8.776205450733752, "grad_norm": 0.5907712578773499, "learning_rate": 2.248503176742725e-06, "loss": 0.2947, "num_input_tokens_seen": 10944624, "step": 16745 }, { "epoch": 8.778825995807129, "grad_norm": 0.25671422481536865, "learning_rate": 2.2390342048392467e-06, "loss": 0.3981, "num_input_tokens_seen": 10947952, "step": 16750 }, { "epoch": 8.781446540880504, "grad_norm": 0.6957025527954102, "learning_rate": 2.229584278272265e-06, "loss": 0.4294, "num_input_tokens_seen": 10951440, "step": 16755 }, { "epoch": 8.784067085953879, "grad_norm": 0.38383498787879944, "learning_rate": 2.2201534049490436e-06, "loss": 0.3924, "num_input_tokens_seen": 10954224, "step": 16760 }, { "epoch": 8.786687631027254, "grad_norm": 0.9660680890083313, "learning_rate": 2.2107415927609176e-06, "loss": 0.4796, "num_input_tokens_seen": 10957392, "step": 16765 }, { "epoch": 8.78930817610063, "grad_norm": 0.3152945339679718, "learning_rate": 2.2013488495832542e-06, "loss": 0.5098, "num_input_tokens_seen": 10961136, "step": 16770 }, { "epoch": 8.791928721174004, "grad_norm": 0.3713281750679016, "learning_rate": 2.1919751832754714e-06, "loss": 0.4934, "num_input_tokens_seen": 10964272, "step": 16775 }, { "epoch": 8.79454926624738, "grad_norm": 0.4002632200717926, "learning_rate": 2.182620601681029e-06, "loss": 0.3662, "num_input_tokens_seen": 10967344, "step": 16780 }, { "epoch": 8.797169811320755, "grad_norm": 0.5062642693519592, "learning_rate": 2.1732851126274047e-06, "loss": 0.4092, "num_input_tokens_seen": 10970800, "step": 16785 }, { "epoch": 8.79979035639413, "grad_norm": 0.27572715282440186, "learning_rate": 2.1639687239261214e-06, "loss": 0.4761, "num_input_tokens_seen": 10974544, "step": 16790 }, { "epoch": 8.802410901467505, "grad_norm": 0.9857847690582275, "learning_rate": 2.1546714433726993e-06, "loss": 0.5178, "num_input_tokens_seen": 10976784, "step": 16795 }, { "epoch": 8.80503144654088, "grad_norm": 0.39286667108535767, "learning_rate": 2.1453932787466767e-06, "loss": 0.5319, "num_input_tokens_seen": 10980400, "step": 16800 }, { "epoch": 8.807651991614255, "grad_norm": 0.41499581933021545, "learning_rate": 2.1361342378116072e-06, "loss": 0.3435, "num_input_tokens_seen": 10986160, "step": 16805 }, { "epoch": 8.81027253668763, "grad_norm": 0.4899844229221344, "learning_rate": 2.1268943283150294e-06, "loss": 0.4931, "num_input_tokens_seen": 10989584, "step": 16810 }, { "epoch": 8.812893081761006, "grad_norm": 0.7614622712135315, "learning_rate": 2.1176735579884753e-06, "loss": 0.5497, "num_input_tokens_seen": 10992464, "step": 16815 }, { "epoch": 8.815513626834381, "grad_norm": 0.5825258493423462, "learning_rate": 2.1084719345474597e-06, "loss": 0.5601, "num_input_tokens_seen": 10995472, "step": 16820 }, { "epoch": 8.818134171907756, "grad_norm": 0.898101806640625, "learning_rate": 2.0992894656914895e-06, "loss": 0.4406, "num_input_tokens_seen": 10999088, "step": 16825 }, { "epoch": 8.820754716981131, "grad_norm": 0.39298370480537415, "learning_rate": 2.0901261591040333e-06, "loss": 0.4738, "num_input_tokens_seen": 11002000, "step": 16830 }, { "epoch": 8.823375262054507, "grad_norm": 0.79267418384552, "learning_rate": 2.0809820224525213e-06, "loss": 0.6142, "num_input_tokens_seen": 11005360, "step": 16835 }, { "epoch": 8.825995807127882, "grad_norm": 0.3830620050430298, "learning_rate": 2.0718570633883576e-06, "loss": 0.3884, "num_input_tokens_seen": 11008272, "step": 16840 }, { "epoch": 8.828616352201259, "grad_norm": 0.42912736535072327, "learning_rate": 2.0627512895468883e-06, "loss": 0.4421, "num_input_tokens_seen": 11011760, "step": 16845 }, { "epoch": 8.831236897274634, "grad_norm": 0.4878423810005188, "learning_rate": 2.0536647085474037e-06, "loss": 0.5038, "num_input_tokens_seen": 11014736, "step": 16850 }, { "epoch": 8.833857442348009, "grad_norm": 0.3495885729789734, "learning_rate": 2.044597327993153e-06, "loss": 0.4036, "num_input_tokens_seen": 11018128, "step": 16855 }, { "epoch": 8.836477987421384, "grad_norm": 0.4690801501274109, "learning_rate": 2.035549155471289e-06, "loss": 0.404, "num_input_tokens_seen": 11022032, "step": 16860 }, { "epoch": 8.83909853249476, "grad_norm": 0.5541612505912781, "learning_rate": 2.0265201985529226e-06, "loss": 0.4399, "num_input_tokens_seen": 11024944, "step": 16865 }, { "epoch": 8.841719077568134, "grad_norm": 0.4255814552307129, "learning_rate": 2.0175104647930655e-06, "loss": 0.4264, "num_input_tokens_seen": 11028272, "step": 16870 }, { "epoch": 8.84433962264151, "grad_norm": 0.9948586225509644, "learning_rate": 2.008519961730651e-06, "loss": 0.4812, "num_input_tokens_seen": 11031088, "step": 16875 }, { "epoch": 8.846960167714885, "grad_norm": 0.5381307005882263, "learning_rate": 1.9995486968885284e-06, "loss": 0.5055, "num_input_tokens_seen": 11033456, "step": 16880 }, { "epoch": 8.84958071278826, "grad_norm": 0.3099273145198822, "learning_rate": 1.990596677773435e-06, "loss": 0.4892, "num_input_tokens_seen": 11036976, "step": 16885 }, { "epoch": 8.852201257861635, "grad_norm": 0.41553357243537903, "learning_rate": 1.981663911876014e-06, "loss": 0.4678, "num_input_tokens_seen": 11039472, "step": 16890 }, { "epoch": 8.85482180293501, "grad_norm": 0.4762500822544098, "learning_rate": 1.972750406670801e-06, "loss": 0.4382, "num_input_tokens_seen": 11042064, "step": 16895 }, { "epoch": 8.857442348008385, "grad_norm": 0.36709773540496826, "learning_rate": 1.9638561696161962e-06, "loss": 0.5058, "num_input_tokens_seen": 11045232, "step": 16900 }, { "epoch": 8.86006289308176, "grad_norm": 0.3871864676475525, "learning_rate": 1.954981208154502e-06, "loss": 0.4216, "num_input_tokens_seen": 11048784, "step": 16905 }, { "epoch": 8.862683438155136, "grad_norm": 0.3762940466403961, "learning_rate": 1.9461255297118868e-06, "loss": 0.5036, "num_input_tokens_seen": 11052528, "step": 16910 }, { "epoch": 8.865303983228511, "grad_norm": 0.8022775053977966, "learning_rate": 1.937289141698359e-06, "loss": 0.7135, "num_input_tokens_seen": 11055824, "step": 16915 }, { "epoch": 8.867924528301886, "grad_norm": 0.4098091125488281, "learning_rate": 1.928472051507821e-06, "loss": 0.3224, "num_input_tokens_seen": 11059504, "step": 16920 }, { "epoch": 8.870545073375261, "grad_norm": 0.32114192843437195, "learning_rate": 1.919674266518004e-06, "loss": 0.4476, "num_input_tokens_seen": 11062544, "step": 16925 }, { "epoch": 8.873165618448636, "grad_norm": 0.31851375102996826, "learning_rate": 1.910895794090492e-06, "loss": 0.4293, "num_input_tokens_seen": 11065584, "step": 16930 }, { "epoch": 8.875786163522012, "grad_norm": 0.5170251131057739, "learning_rate": 1.902136641570712e-06, "loss": 0.383, "num_input_tokens_seen": 11068336, "step": 16935 }, { "epoch": 8.878406708595389, "grad_norm": 0.6555979251861572, "learning_rate": 1.8933968162879235e-06, "loss": 0.4949, "num_input_tokens_seen": 11071184, "step": 16940 }, { "epoch": 8.881027253668764, "grad_norm": 0.7941973805427551, "learning_rate": 1.8846763255552097e-06, "loss": 0.2915, "num_input_tokens_seen": 11074320, "step": 16945 }, { "epoch": 8.883647798742139, "grad_norm": 0.3793259561061859, "learning_rate": 1.8759751766694811e-06, "loss": 0.4246, "num_input_tokens_seen": 11077168, "step": 16950 }, { "epoch": 8.886268343815514, "grad_norm": 0.7237054705619812, "learning_rate": 1.8672933769114636e-06, "loss": 0.4532, "num_input_tokens_seen": 11080080, "step": 16955 }, { "epoch": 8.88888888888889, "grad_norm": 0.3927192687988281, "learning_rate": 1.8586309335456908e-06, "loss": 0.3616, "num_input_tokens_seen": 11083408, "step": 16960 }, { "epoch": 8.891509433962264, "grad_norm": 0.4745715260505676, "learning_rate": 1.8499878538204951e-06, "loss": 0.5668, "num_input_tokens_seen": 11087184, "step": 16965 }, { "epoch": 8.89412997903564, "grad_norm": 0.3665062189102173, "learning_rate": 1.8413641449680081e-06, "loss": 0.3946, "num_input_tokens_seen": 11090864, "step": 16970 }, { "epoch": 8.896750524109015, "grad_norm": 0.41607722640037537, "learning_rate": 1.8327598142041658e-06, "loss": 0.4087, "num_input_tokens_seen": 11094288, "step": 16975 }, { "epoch": 8.89937106918239, "grad_norm": 0.3397178649902344, "learning_rate": 1.824174868728673e-06, "loss": 0.3015, "num_input_tokens_seen": 11096656, "step": 16980 }, { "epoch": 8.901991614255765, "grad_norm": 0.270341157913208, "learning_rate": 1.815609315725017e-06, "loss": 0.4374, "num_input_tokens_seen": 11100144, "step": 16985 }, { "epoch": 8.90461215932914, "grad_norm": 0.3183417022228241, "learning_rate": 1.80706316236047e-06, "loss": 0.404, "num_input_tokens_seen": 11103440, "step": 16990 }, { "epoch": 8.907232704402515, "grad_norm": 0.4016265869140625, "learning_rate": 1.7985364157860562e-06, "loss": 0.4197, "num_input_tokens_seen": 11107056, "step": 16995 }, { "epoch": 8.90985324947589, "grad_norm": 0.678164005279541, "learning_rate": 1.7900290831365713e-06, "loss": 0.4313, "num_input_tokens_seen": 11110640, "step": 17000 }, { "epoch": 8.912473794549266, "grad_norm": 0.09425361454486847, "learning_rate": 1.781541171530554e-06, "loss": 0.3271, "num_input_tokens_seen": 11117008, "step": 17005 }, { "epoch": 8.915094339622641, "grad_norm": 0.45522162318229675, "learning_rate": 1.7730726880703125e-06, "loss": 0.4094, "num_input_tokens_seen": 11120176, "step": 17010 }, { "epoch": 8.917714884696016, "grad_norm": 0.5526595115661621, "learning_rate": 1.7646236398418835e-06, "loss": 0.4822, "num_input_tokens_seen": 11123504, "step": 17015 }, { "epoch": 8.920335429769391, "grad_norm": 0.592329204082489, "learning_rate": 1.7561940339150373e-06, "loss": 0.5016, "num_input_tokens_seen": 11126000, "step": 17020 }, { "epoch": 8.922955974842766, "grad_norm": 0.32253193855285645, "learning_rate": 1.7477838773432926e-06, "loss": 0.4286, "num_input_tokens_seen": 11129584, "step": 17025 }, { "epoch": 8.925576519916142, "grad_norm": 0.4488500952720642, "learning_rate": 1.7393931771638839e-06, "loss": 0.4846, "num_input_tokens_seen": 11132240, "step": 17030 }, { "epoch": 8.928197064989519, "grad_norm": 0.4854106605052948, "learning_rate": 1.7310219403977563e-06, "loss": 0.4928, "num_input_tokens_seen": 11135120, "step": 17035 }, { "epoch": 8.930817610062894, "grad_norm": 1.025329351425171, "learning_rate": 1.7226701740495926e-06, "loss": 0.3383, "num_input_tokens_seen": 11137968, "step": 17040 }, { "epoch": 8.933438155136269, "grad_norm": 0.35247641801834106, "learning_rate": 1.714337885107753e-06, "loss": 0.3672, "num_input_tokens_seen": 11140368, "step": 17045 }, { "epoch": 8.936058700209644, "grad_norm": 0.43305808305740356, "learning_rate": 1.7060250805443296e-06, "loss": 0.5391, "num_input_tokens_seen": 11145168, "step": 17050 }, { "epoch": 8.93867924528302, "grad_norm": 0.4468317925930023, "learning_rate": 1.6977317673150916e-06, "loss": 0.4928, "num_input_tokens_seen": 11149008, "step": 17055 }, { "epoch": 8.941299790356394, "grad_norm": 0.47776922583580017, "learning_rate": 1.6894579523595022e-06, "loss": 0.4368, "num_input_tokens_seen": 11156336, "step": 17060 }, { "epoch": 8.94392033542977, "grad_norm": 0.5498602390289307, "learning_rate": 1.6812036426007176e-06, "loss": 0.5557, "num_input_tokens_seen": 11159152, "step": 17065 }, { "epoch": 8.946540880503145, "grad_norm": 0.4210612177848816, "learning_rate": 1.6729688449455689e-06, "loss": 0.4033, "num_input_tokens_seen": 11162096, "step": 17070 }, { "epoch": 8.94916142557652, "grad_norm": 0.5128505229949951, "learning_rate": 1.6647535662845466e-06, "loss": 0.4524, "num_input_tokens_seen": 11165200, "step": 17075 }, { "epoch": 8.951781970649895, "grad_norm": 0.7387170791625977, "learning_rate": 1.656557813491838e-06, "loss": 0.5809, "num_input_tokens_seen": 11168368, "step": 17080 }, { "epoch": 8.95440251572327, "grad_norm": 0.3253476917743683, "learning_rate": 1.6483815934252578e-06, "loss": 0.5309, "num_input_tokens_seen": 11172048, "step": 17085 }, { "epoch": 8.957023060796645, "grad_norm": 0.31197109818458557, "learning_rate": 1.6402249129263025e-06, "loss": 0.4569, "num_input_tokens_seen": 11175600, "step": 17090 }, { "epoch": 8.95964360587002, "grad_norm": 0.8297432065010071, "learning_rate": 1.6320877788201127e-06, "loss": 0.4263, "num_input_tokens_seen": 11178576, "step": 17095 }, { "epoch": 8.962264150943396, "grad_norm": 0.4379422664642334, "learning_rate": 1.6239701979154614e-06, "loss": 0.5236, "num_input_tokens_seen": 11181168, "step": 17100 }, { "epoch": 8.964884696016771, "grad_norm": 0.6849067211151123, "learning_rate": 1.6158721770047762e-06, "loss": 0.4054, "num_input_tokens_seen": 11184400, "step": 17105 }, { "epoch": 8.967505241090146, "grad_norm": 0.4781620502471924, "learning_rate": 1.6077937228641093e-06, "loss": 0.5029, "num_input_tokens_seen": 11187536, "step": 17110 }, { "epoch": 8.970125786163521, "grad_norm": 0.47738978266716003, "learning_rate": 1.5997348422531395e-06, "loss": 0.4282, "num_input_tokens_seen": 11191568, "step": 17115 }, { "epoch": 8.972746331236896, "grad_norm": 1.316786527633667, "learning_rate": 1.5916955419151725e-06, "loss": 0.3893, "num_input_tokens_seen": 11195024, "step": 17120 }, { "epoch": 8.975366876310272, "grad_norm": 0.6332635283470154, "learning_rate": 1.5836758285771303e-06, "loss": 0.5058, "num_input_tokens_seen": 11198480, "step": 17125 }, { "epoch": 8.977987421383649, "grad_norm": 0.23555073142051697, "learning_rate": 1.5756757089495366e-06, "loss": 0.4892, "num_input_tokens_seen": 11201712, "step": 17130 }, { "epoch": 8.980607966457024, "grad_norm": 0.31470388174057007, "learning_rate": 1.5676951897265313e-06, "loss": 0.4025, "num_input_tokens_seen": 11204720, "step": 17135 }, { "epoch": 8.983228511530399, "grad_norm": 0.5923284292221069, "learning_rate": 1.5597342775858476e-06, "loss": 0.4427, "num_input_tokens_seen": 11207440, "step": 17140 }, { "epoch": 8.985849056603774, "grad_norm": 0.4143155813217163, "learning_rate": 1.5517929791888125e-06, "loss": 0.443, "num_input_tokens_seen": 11210384, "step": 17145 }, { "epoch": 8.98846960167715, "grad_norm": 0.607303261756897, "learning_rate": 1.5438713011803385e-06, "loss": 0.4532, "num_input_tokens_seen": 11213392, "step": 17150 }, { "epoch": 8.991090146750524, "grad_norm": 0.5958565473556519, "learning_rate": 1.535969250188926e-06, "loss": 0.5348, "num_input_tokens_seen": 11216016, "step": 17155 }, { "epoch": 8.9937106918239, "grad_norm": 0.4344741404056549, "learning_rate": 1.5280868328266528e-06, "loss": 0.5285, "num_input_tokens_seen": 11218160, "step": 17160 }, { "epoch": 8.996331236897275, "grad_norm": 0.424324095249176, "learning_rate": 1.520224055689165e-06, "loss": 0.7013, "num_input_tokens_seen": 11222000, "step": 17165 }, { "epoch": 8.99895178197065, "grad_norm": 0.6195769309997559, "learning_rate": 1.5123809253556692e-06, "loss": 0.4824, "num_input_tokens_seen": 11224816, "step": 17170 }, { "epoch": 9.0, "eval_loss": 0.4832555055618286, "eval_runtime": 14.5527, "eval_samples_per_second": 58.271, "eval_steps_per_second": 14.568, "num_input_tokens_seen": 11225416, "step": 17172 }, { "epoch": 9.001572327044025, "grad_norm": 0.6285069584846497, "learning_rate": 1.5045574483889463e-06, "loss": 0.5808, "num_input_tokens_seen": 11227112, "step": 17175 }, { "epoch": 9.0041928721174, "grad_norm": 0.38770556449890137, "learning_rate": 1.4967536313353237e-06, "loss": 0.4166, "num_input_tokens_seen": 11229544, "step": 17180 }, { "epoch": 9.006813417190775, "grad_norm": 0.3824554681777954, "learning_rate": 1.4889694807246779e-06, "loss": 0.4907, "num_input_tokens_seen": 11233640, "step": 17185 }, { "epoch": 9.00943396226415, "grad_norm": 0.4671635925769806, "learning_rate": 1.481205003070424e-06, "loss": 0.3819, "num_input_tokens_seen": 11236552, "step": 17190 }, { "epoch": 9.012054507337526, "grad_norm": 0.6609065532684326, "learning_rate": 1.4734602048695312e-06, "loss": 0.4205, "num_input_tokens_seen": 11240008, "step": 17195 }, { "epoch": 9.014675052410901, "grad_norm": 0.5683530569076538, "learning_rate": 1.465735092602491e-06, "loss": 0.4657, "num_input_tokens_seen": 11243176, "step": 17200 }, { "epoch": 9.017295597484276, "grad_norm": 0.4941043257713318, "learning_rate": 1.4580296727333187e-06, "loss": 0.4427, "num_input_tokens_seen": 11246568, "step": 17205 }, { "epoch": 9.019916142557651, "grad_norm": 0.23750634491443634, "learning_rate": 1.450343951709568e-06, "loss": 0.4288, "num_input_tokens_seen": 11251080, "step": 17210 }, { "epoch": 9.022536687631026, "grad_norm": 0.6560170650482178, "learning_rate": 1.4426779359622916e-06, "loss": 0.6544, "num_input_tokens_seen": 11253256, "step": 17215 }, { "epoch": 9.025157232704403, "grad_norm": 0.3833125829696655, "learning_rate": 1.4350316319060585e-06, "loss": 0.4495, "num_input_tokens_seen": 11256936, "step": 17220 }, { "epoch": 9.027777777777779, "grad_norm": 0.43824946880340576, "learning_rate": 1.4274050459389594e-06, "loss": 0.4762, "num_input_tokens_seen": 11259784, "step": 17225 }, { "epoch": 9.030398322851154, "grad_norm": 0.36012983322143555, "learning_rate": 1.4197981844425583e-06, "loss": 0.454, "num_input_tokens_seen": 11262728, "step": 17230 }, { "epoch": 9.033018867924529, "grad_norm": 0.4189225733280182, "learning_rate": 1.4122110537819365e-06, "loss": 0.4302, "num_input_tokens_seen": 11265640, "step": 17235 }, { "epoch": 9.035639412997904, "grad_norm": 0.3080269992351532, "learning_rate": 1.4046436603056601e-06, "loss": 0.4016, "num_input_tokens_seen": 11270344, "step": 17240 }, { "epoch": 9.03825995807128, "grad_norm": 1.831540584564209, "learning_rate": 1.397096010345772e-06, "loss": 0.5935, "num_input_tokens_seen": 11272584, "step": 17245 }, { "epoch": 9.040880503144654, "grad_norm": 0.44061705470085144, "learning_rate": 1.3895681102178094e-06, "loss": 0.4979, "num_input_tokens_seen": 11276872, "step": 17250 }, { "epoch": 9.04350104821803, "grad_norm": 0.3778270483016968, "learning_rate": 1.3820599662207695e-06, "loss": 0.463, "num_input_tokens_seen": 11279688, "step": 17255 }, { "epoch": 9.046121593291405, "grad_norm": 0.4624400734901428, "learning_rate": 1.3745715846371244e-06, "loss": 0.414, "num_input_tokens_seen": 11282888, "step": 17260 }, { "epoch": 9.04874213836478, "grad_norm": 0.41053780913352966, "learning_rate": 1.3671029717328142e-06, "loss": 0.5502, "num_input_tokens_seen": 11285928, "step": 17265 }, { "epoch": 9.051362683438155, "grad_norm": 0.5813620686531067, "learning_rate": 1.3596541337572265e-06, "loss": 0.4306, "num_input_tokens_seen": 11289288, "step": 17270 }, { "epoch": 9.05398322851153, "grad_norm": 0.3157801032066345, "learning_rate": 1.3522250769432115e-06, "loss": 0.5358, "num_input_tokens_seen": 11295976, "step": 17275 }, { "epoch": 9.056603773584905, "grad_norm": 0.3537648916244507, "learning_rate": 1.3448158075070687e-06, "loss": 0.479, "num_input_tokens_seen": 11299816, "step": 17280 }, { "epoch": 9.05922431865828, "grad_norm": 0.23745350539684296, "learning_rate": 1.337426331648528e-06, "loss": 0.5443, "num_input_tokens_seen": 11303176, "step": 17285 }, { "epoch": 9.061844863731656, "grad_norm": 0.514176070690155, "learning_rate": 1.3300566555507709e-06, "loss": 0.3489, "num_input_tokens_seen": 11306376, "step": 17290 }, { "epoch": 9.064465408805031, "grad_norm": 0.3848818242549896, "learning_rate": 1.3227067853804065e-06, "loss": 0.3576, "num_input_tokens_seen": 11310696, "step": 17295 }, { "epoch": 9.067085953878406, "grad_norm": 0.8794047236442566, "learning_rate": 1.315376727287465e-06, "loss": 0.3989, "num_input_tokens_seen": 11314888, "step": 17300 }, { "epoch": 9.069706498951781, "grad_norm": 0.5876845717430115, "learning_rate": 1.3080664874054127e-06, "loss": 0.5058, "num_input_tokens_seen": 11317576, "step": 17305 }, { "epoch": 9.072327044025156, "grad_norm": 0.4156530499458313, "learning_rate": 1.3007760718511176e-06, "loss": 0.4754, "num_input_tokens_seen": 11320840, "step": 17310 }, { "epoch": 9.074947589098532, "grad_norm": 0.46910130977630615, "learning_rate": 1.2935054867248692e-06, "loss": 0.3057, "num_input_tokens_seen": 11324040, "step": 17315 }, { "epoch": 9.077568134171909, "grad_norm": 0.5553165078163147, "learning_rate": 1.2862547381103567e-06, "loss": 0.5395, "num_input_tokens_seen": 11327784, "step": 17320 }, { "epoch": 9.080188679245284, "grad_norm": 0.28888991475105286, "learning_rate": 1.2790238320746827e-06, "loss": 0.4126, "num_input_tokens_seen": 11331304, "step": 17325 }, { "epoch": 9.082809224318659, "grad_norm": 0.511267364025116, "learning_rate": 1.271812774668335e-06, "loss": 0.5598, "num_input_tokens_seen": 11334312, "step": 17330 }, { "epoch": 9.085429769392034, "grad_norm": 0.26371708512306213, "learning_rate": 1.2646215719251952e-06, "loss": 0.4835, "num_input_tokens_seen": 11339144, "step": 17335 }, { "epoch": 9.08805031446541, "grad_norm": 0.5078057050704956, "learning_rate": 1.2574502298625334e-06, "loss": 0.3238, "num_input_tokens_seen": 11342664, "step": 17340 }, { "epoch": 9.090670859538784, "grad_norm": 0.5353273749351501, "learning_rate": 1.250298754481008e-06, "loss": 0.4018, "num_input_tokens_seen": 11345768, "step": 17345 }, { "epoch": 9.09329140461216, "grad_norm": 0.43859565258026123, "learning_rate": 1.2431671517646403e-06, "loss": 0.6014, "num_input_tokens_seen": 11349448, "step": 17350 }, { "epoch": 9.095911949685535, "grad_norm": 0.49927476048469543, "learning_rate": 1.2360554276808295e-06, "loss": 0.3938, "num_input_tokens_seen": 11352744, "step": 17355 }, { "epoch": 9.09853249475891, "grad_norm": 1.557645559310913, "learning_rate": 1.228963588180343e-06, "loss": 0.5148, "num_input_tokens_seen": 11354920, "step": 17360 }, { "epoch": 9.101153039832285, "grad_norm": 0.6014249324798584, "learning_rate": 1.2218916391973118e-06, "loss": 0.3922, "num_input_tokens_seen": 11359240, "step": 17365 }, { "epoch": 9.10377358490566, "grad_norm": 0.391513466835022, "learning_rate": 1.2148395866492135e-06, "loss": 0.5175, "num_input_tokens_seen": 11362056, "step": 17370 }, { "epoch": 9.106394129979035, "grad_norm": 0.5175548791885376, "learning_rate": 1.2078074364368862e-06, "loss": 0.3677, "num_input_tokens_seen": 11364648, "step": 17375 }, { "epoch": 9.10901467505241, "grad_norm": 0.3884676992893219, "learning_rate": 1.2007951944445122e-06, "loss": 0.3994, "num_input_tokens_seen": 11368168, "step": 17380 }, { "epoch": 9.111635220125786, "grad_norm": 0.44598308205604553, "learning_rate": 1.1938028665396173e-06, "loss": 0.492, "num_input_tokens_seen": 11370856, "step": 17385 }, { "epoch": 9.114255765199161, "grad_norm": 0.5494983196258545, "learning_rate": 1.1868304585730571e-06, "loss": 0.4617, "num_input_tokens_seen": 11373768, "step": 17390 }, { "epoch": 9.116876310272536, "grad_norm": 0.573569655418396, "learning_rate": 1.1798779763790346e-06, "loss": 0.5066, "num_input_tokens_seen": 11377192, "step": 17395 }, { "epoch": 9.119496855345911, "grad_norm": 0.4261104166507721, "learning_rate": 1.1729454257750544e-06, "loss": 0.4311, "num_input_tokens_seen": 11379912, "step": 17400 }, { "epoch": 9.122117400419286, "grad_norm": 0.38305437564849854, "learning_rate": 1.1660328125619652e-06, "loss": 0.471, "num_input_tokens_seen": 11383496, "step": 17405 }, { "epoch": 9.124737945492662, "grad_norm": 0.6000325679779053, "learning_rate": 1.1591401425239318e-06, "loss": 0.6034, "num_input_tokens_seen": 11386504, "step": 17410 }, { "epoch": 9.127358490566039, "grad_norm": 2.36444091796875, "learning_rate": 1.1522674214284158e-06, "loss": 0.3992, "num_input_tokens_seen": 11388776, "step": 17415 }, { "epoch": 9.129979035639414, "grad_norm": 0.4141996204853058, "learning_rate": 1.145414655026203e-06, "loss": 0.5982, "num_input_tokens_seen": 11391976, "step": 17420 }, { "epoch": 9.132599580712789, "grad_norm": 0.32804977893829346, "learning_rate": 1.1385818490513733e-06, "loss": 0.4322, "num_input_tokens_seen": 11394856, "step": 17425 }, { "epoch": 9.135220125786164, "grad_norm": 0.7410093545913696, "learning_rate": 1.1317690092213007e-06, "loss": 0.5114, "num_input_tokens_seen": 11397896, "step": 17430 }, { "epoch": 9.13784067085954, "grad_norm": 0.7315228581428528, "learning_rate": 1.124976141236675e-06, "loss": 0.5422, "num_input_tokens_seen": 11401096, "step": 17435 }, { "epoch": 9.140461215932914, "grad_norm": 0.7726212739944458, "learning_rate": 1.1182032507814354e-06, "loss": 0.4306, "num_input_tokens_seen": 11404616, "step": 17440 }, { "epoch": 9.14308176100629, "grad_norm": 0.3844158947467804, "learning_rate": 1.1114503435228434e-06, "loss": 0.4474, "num_input_tokens_seen": 11407848, "step": 17445 }, { "epoch": 9.145702306079665, "grad_norm": 0.4292648434638977, "learning_rate": 1.1047174251114234e-06, "loss": 0.4349, "num_input_tokens_seen": 11410760, "step": 17450 }, { "epoch": 9.14832285115304, "grad_norm": 0.5723268985748291, "learning_rate": 1.0980045011809604e-06, "loss": 0.5256, "num_input_tokens_seen": 11413864, "step": 17455 }, { "epoch": 9.150943396226415, "grad_norm": 0.3862370252609253, "learning_rate": 1.0913115773485388e-06, "loss": 0.3969, "num_input_tokens_seen": 11416808, "step": 17460 }, { "epoch": 9.15356394129979, "grad_norm": 0.39746931195259094, "learning_rate": 1.084638659214482e-06, "loss": 0.5341, "num_input_tokens_seen": 11419944, "step": 17465 }, { "epoch": 9.156184486373165, "grad_norm": 0.4817606806755066, "learning_rate": 1.0779857523623815e-06, "loss": 0.4089, "num_input_tokens_seen": 11422728, "step": 17470 }, { "epoch": 9.15880503144654, "grad_norm": 0.30652105808258057, "learning_rate": 1.071352862359093e-06, "loss": 0.4881, "num_input_tokens_seen": 11430984, "step": 17475 }, { "epoch": 9.161425576519916, "grad_norm": 0.5566508769989014, "learning_rate": 1.0647399947547127e-06, "loss": 0.4517, "num_input_tokens_seen": 11433672, "step": 17480 }, { "epoch": 9.164046121593291, "grad_norm": 0.5421069860458374, "learning_rate": 1.0581471550825812e-06, "loss": 0.4899, "num_input_tokens_seen": 11436168, "step": 17485 }, { "epoch": 9.166666666666666, "grad_norm": 0.24918822944164276, "learning_rate": 1.0515743488592939e-06, "loss": 0.3353, "num_input_tokens_seen": 11439528, "step": 17490 }, { "epoch": 9.169287211740041, "grad_norm": 0.6466579437255859, "learning_rate": 1.0450215815846736e-06, "loss": 0.5097, "num_input_tokens_seen": 11442312, "step": 17495 }, { "epoch": 9.171907756813416, "grad_norm": 0.6454312205314636, "learning_rate": 1.0384888587417736e-06, "loss": 0.4091, "num_input_tokens_seen": 11445416, "step": 17500 }, { "epoch": 9.174528301886792, "grad_norm": 0.6776018738746643, "learning_rate": 1.0319761857968735e-06, "loss": 0.6124, "num_input_tokens_seen": 11448712, "step": 17505 }, { "epoch": 9.177148846960169, "grad_norm": 0.4160771667957306, "learning_rate": 1.0254835681994895e-06, "loss": 0.5025, "num_input_tokens_seen": 11452104, "step": 17510 }, { "epoch": 9.179769392033544, "grad_norm": 0.3990565240383148, "learning_rate": 1.0190110113823426e-06, "loss": 0.3261, "num_input_tokens_seen": 11456136, "step": 17515 }, { "epoch": 9.182389937106919, "grad_norm": 0.46347880363464355, "learning_rate": 1.0125585207613752e-06, "loss": 0.3857, "num_input_tokens_seen": 11459272, "step": 17520 }, { "epoch": 9.185010482180294, "grad_norm": 0.4008978307247162, "learning_rate": 1.0061261017357327e-06, "loss": 0.4313, "num_input_tokens_seen": 11463208, "step": 17525 }, { "epoch": 9.18763102725367, "grad_norm": 0.4132170081138611, "learning_rate": 9.997137596877732e-07, "loss": 0.4298, "num_input_tokens_seen": 11466472, "step": 17530 }, { "epoch": 9.190251572327044, "grad_norm": 0.7785208821296692, "learning_rate": 9.93321499983052e-07, "loss": 0.5529, "num_input_tokens_seen": 11469192, "step": 17535 }, { "epoch": 9.19287211740042, "grad_norm": 0.5124107599258423, "learning_rate": 9.869493279703158e-07, "loss": 0.4244, "num_input_tokens_seen": 11472232, "step": 17540 }, { "epoch": 9.195492662473795, "grad_norm": 0.37968209385871887, "learning_rate": 9.805972489815102e-07, "loss": 0.4985, "num_input_tokens_seen": 11476040, "step": 17545 }, { "epoch": 9.19811320754717, "grad_norm": 0.4574350118637085, "learning_rate": 9.742652683317643e-07, "loss": 0.4472, "num_input_tokens_seen": 11478728, "step": 17550 }, { "epoch": 9.200733752620545, "grad_norm": 0.4974486231803894, "learning_rate": 9.679533913193927e-07, "loss": 0.5367, "num_input_tokens_seen": 11481448, "step": 17555 }, { "epoch": 9.20335429769392, "grad_norm": 0.4199790358543396, "learning_rate": 9.61661623225879e-07, "loss": 0.4466, "num_input_tokens_seen": 11484424, "step": 17560 }, { "epoch": 9.205974842767295, "grad_norm": 0.5387084484100342, "learning_rate": 9.553899693158951e-07, "loss": 0.4351, "num_input_tokens_seen": 11487560, "step": 17565 }, { "epoch": 9.20859538784067, "grad_norm": 0.9735351800918579, "learning_rate": 9.491384348372684e-07, "loss": 0.4487, "num_input_tokens_seen": 11490632, "step": 17570 }, { "epoch": 9.211215932914046, "grad_norm": 0.848965048789978, "learning_rate": 9.429070250210004e-07, "loss": 0.4257, "num_input_tokens_seen": 11493032, "step": 17575 }, { "epoch": 9.213836477987421, "grad_norm": 0.41676321625709534, "learning_rate": 9.366957450812535e-07, "loss": 0.3892, "num_input_tokens_seen": 11496200, "step": 17580 }, { "epoch": 9.216457023060796, "grad_norm": 0.5910327434539795, "learning_rate": 9.305046002153345e-07, "loss": 0.4897, "num_input_tokens_seen": 11498696, "step": 17585 }, { "epoch": 9.219077568134171, "grad_norm": 0.4669612646102905, "learning_rate": 9.243335956037186e-07, "loss": 0.3942, "num_input_tokens_seen": 11502312, "step": 17590 }, { "epoch": 9.221698113207546, "grad_norm": 0.30186009407043457, "learning_rate": 9.181827364100171e-07, "loss": 0.4518, "num_input_tokens_seen": 11505160, "step": 17595 }, { "epoch": 9.224318658280922, "grad_norm": 0.39272791147232056, "learning_rate": 9.120520277809852e-07, "loss": 0.3612, "num_input_tokens_seen": 11508456, "step": 17600 }, { "epoch": 9.226939203354299, "grad_norm": 0.48595130443573, "learning_rate": 9.059414748465278e-07, "loss": 0.5289, "num_input_tokens_seen": 11511816, "step": 17605 }, { "epoch": 9.229559748427674, "grad_norm": 0.5356800556182861, "learning_rate": 8.998510827196715e-07, "loss": 0.5194, "num_input_tokens_seen": 11514088, "step": 17610 }, { "epoch": 9.232180293501049, "grad_norm": 0.4747573733329773, "learning_rate": 8.937808564965733e-07, "loss": 0.3954, "num_input_tokens_seen": 11517096, "step": 17615 }, { "epoch": 9.234800838574424, "grad_norm": 0.456758975982666, "learning_rate": 8.877308012565339e-07, "loss": 0.4144, "num_input_tokens_seen": 11520168, "step": 17620 }, { "epoch": 9.2374213836478, "grad_norm": 0.41571372747421265, "learning_rate": 8.817009220619482e-07, "loss": 0.5717, "num_input_tokens_seen": 11523944, "step": 17625 }, { "epoch": 9.240041928721174, "grad_norm": 0.2130880504846573, "learning_rate": 8.756912239583554e-07, "loss": 0.6037, "num_input_tokens_seen": 11527720, "step": 17630 }, { "epoch": 9.24266247379455, "grad_norm": 0.36303600668907166, "learning_rate": 8.697017119743911e-07, "loss": 0.5214, "num_input_tokens_seen": 11531304, "step": 17635 }, { "epoch": 9.245283018867925, "grad_norm": 0.4287474751472473, "learning_rate": 8.637323911218048e-07, "loss": 0.4152, "num_input_tokens_seen": 11535176, "step": 17640 }, { "epoch": 9.2479035639413, "grad_norm": 0.5110912322998047, "learning_rate": 8.577832663954538e-07, "loss": 0.5125, "num_input_tokens_seen": 11539016, "step": 17645 }, { "epoch": 9.250524109014675, "grad_norm": 0.3455313444137573, "learning_rate": 8.51854342773295e-07, "loss": 0.3884, "num_input_tokens_seen": 11543752, "step": 17650 }, { "epoch": 9.25314465408805, "grad_norm": 0.6538006067276001, "learning_rate": 8.459456252163739e-07, "loss": 0.463, "num_input_tokens_seen": 11546664, "step": 17655 }, { "epoch": 9.255765199161425, "grad_norm": 0.8193972110748291, "learning_rate": 8.400571186688466e-07, "loss": 0.4901, "num_input_tokens_seen": 11549032, "step": 17660 }, { "epoch": 9.2583857442348, "grad_norm": 0.44378021359443665, "learning_rate": 8.341888280579386e-07, "loss": 0.5689, "num_input_tokens_seen": 11552328, "step": 17665 }, { "epoch": 9.261006289308176, "grad_norm": 0.4219503104686737, "learning_rate": 8.283407582939689e-07, "loss": 0.5728, "num_input_tokens_seen": 11555464, "step": 17670 }, { "epoch": 9.26362683438155, "grad_norm": 0.5640112161636353, "learning_rate": 8.22512914270332e-07, "loss": 0.4743, "num_input_tokens_seen": 11558088, "step": 17675 }, { "epoch": 9.266247379454926, "grad_norm": 0.48364049196243286, "learning_rate": 8.167053008635101e-07, "loss": 0.5094, "num_input_tokens_seen": 11560872, "step": 17680 }, { "epoch": 9.268867924528301, "grad_norm": 0.5148943662643433, "learning_rate": 8.109179229330438e-07, "loss": 0.4193, "num_input_tokens_seen": 11564264, "step": 17685 }, { "epoch": 9.271488469601676, "grad_norm": 0.35004058480262756, "learning_rate": 8.051507853215401e-07, "loss": 0.4387, "num_input_tokens_seen": 11567656, "step": 17690 }, { "epoch": 9.274109014675052, "grad_norm": 0.435283362865448, "learning_rate": 7.994038928546887e-07, "loss": 0.5021, "num_input_tokens_seen": 11571176, "step": 17695 }, { "epoch": 9.276729559748428, "grad_norm": 0.4575514495372772, "learning_rate": 7.93677250341221e-07, "loss": 0.5328, "num_input_tokens_seen": 11574216, "step": 17700 }, { "epoch": 9.279350104821804, "grad_norm": 0.5128079652786255, "learning_rate": 7.879708625729287e-07, "loss": 0.3629, "num_input_tokens_seen": 11577608, "step": 17705 }, { "epoch": 9.281970649895179, "grad_norm": 0.4518446624279022, "learning_rate": 7.822847343246564e-07, "loss": 0.4978, "num_input_tokens_seen": 11581000, "step": 17710 }, { "epoch": 9.284591194968554, "grad_norm": 0.4807211458683014, "learning_rate": 7.766188703542954e-07, "loss": 0.486, "num_input_tokens_seen": 11584840, "step": 17715 }, { "epoch": 9.28721174004193, "grad_norm": 0.5001543760299683, "learning_rate": 7.709732754027866e-07, "loss": 0.4799, "num_input_tokens_seen": 11587912, "step": 17720 }, { "epoch": 9.289832285115304, "grad_norm": 0.6309995651245117, "learning_rate": 7.653479541941038e-07, "loss": 0.48, "num_input_tokens_seen": 11591752, "step": 17725 }, { "epoch": 9.29245283018868, "grad_norm": 0.597827672958374, "learning_rate": 7.597429114352572e-07, "loss": 0.5372, "num_input_tokens_seen": 11594248, "step": 17730 }, { "epoch": 9.295073375262055, "grad_norm": 0.4872429668903351, "learning_rate": 7.541581518162922e-07, "loss": 0.4592, "num_input_tokens_seen": 11597448, "step": 17735 }, { "epoch": 9.29769392033543, "grad_norm": 0.5077419877052307, "learning_rate": 7.485936800102788e-07, "loss": 0.6485, "num_input_tokens_seen": 11600360, "step": 17740 }, { "epoch": 9.300314465408805, "grad_norm": 0.7048050761222839, "learning_rate": 7.430495006733152e-07, "loss": 0.5219, "num_input_tokens_seen": 11603528, "step": 17745 }, { "epoch": 9.30293501048218, "grad_norm": 0.5293779969215393, "learning_rate": 7.375256184445178e-07, "loss": 0.4974, "num_input_tokens_seen": 11606600, "step": 17750 }, { "epoch": 9.305555555555555, "grad_norm": 0.48521319031715393, "learning_rate": 7.320220379460146e-07, "loss": 0.4254, "num_input_tokens_seen": 11609064, "step": 17755 }, { "epoch": 9.30817610062893, "grad_norm": 0.36312493681907654, "learning_rate": 7.265387637829524e-07, "loss": 0.422, "num_input_tokens_seen": 11612328, "step": 17760 }, { "epoch": 9.310796645702306, "grad_norm": 0.4327743649482727, "learning_rate": 7.210758005434887e-07, "loss": 0.4784, "num_input_tokens_seen": 11615912, "step": 17765 }, { "epoch": 9.31341719077568, "grad_norm": 0.49526381492614746, "learning_rate": 7.156331527987753e-07, "loss": 0.5046, "num_input_tokens_seen": 11618888, "step": 17770 }, { "epoch": 9.316037735849056, "grad_norm": 0.43697282671928406, "learning_rate": 7.102108251029777e-07, "loss": 0.6967, "num_input_tokens_seen": 11621544, "step": 17775 }, { "epoch": 9.318658280922431, "grad_norm": 0.49034687876701355, "learning_rate": 7.04808821993247e-07, "loss": 0.4081, "num_input_tokens_seen": 11624040, "step": 17780 }, { "epoch": 9.321278825995806, "grad_norm": 0.5168036818504333, "learning_rate": 6.994271479897314e-07, "loss": 0.4784, "num_input_tokens_seen": 11626728, "step": 17785 }, { "epoch": 9.323899371069182, "grad_norm": 0.6167699098587036, "learning_rate": 6.940658075955759e-07, "loss": 0.3515, "num_input_tokens_seen": 11629832, "step": 17790 }, { "epoch": 9.326519916142558, "grad_norm": 0.46116355061531067, "learning_rate": 6.887248052969003e-07, "loss": 0.3638, "num_input_tokens_seen": 11633320, "step": 17795 }, { "epoch": 9.329140461215934, "grad_norm": 0.42955678701400757, "learning_rate": 6.834041455628104e-07, "loss": 0.428, "num_input_tokens_seen": 11636104, "step": 17800 }, { "epoch": 9.331761006289309, "grad_norm": 0.35969969630241394, "learning_rate": 6.781038328454003e-07, "loss": 0.42, "num_input_tokens_seen": 11639752, "step": 17805 }, { "epoch": 9.334381551362684, "grad_norm": 0.8552454113960266, "learning_rate": 6.728238715797169e-07, "loss": 0.4306, "num_input_tokens_seen": 11642664, "step": 17810 }, { "epoch": 9.33700209643606, "grad_norm": 0.43764927983283997, "learning_rate": 6.675642661838011e-07, "loss": 0.488, "num_input_tokens_seen": 11646024, "step": 17815 }, { "epoch": 9.339622641509434, "grad_norm": 0.36881527304649353, "learning_rate": 6.623250210586463e-07, "loss": 0.4495, "num_input_tokens_seen": 11649192, "step": 17820 }, { "epoch": 9.34224318658281, "grad_norm": 0.31225377321243286, "learning_rate": 6.571061405882095e-07, "loss": 0.5645, "num_input_tokens_seen": 11653192, "step": 17825 }, { "epoch": 9.344863731656185, "grad_norm": 0.9036805629730225, "learning_rate": 6.519076291394172e-07, "loss": 0.4509, "num_input_tokens_seen": 11657224, "step": 17830 }, { "epoch": 9.34748427672956, "grad_norm": 0.46297627687454224, "learning_rate": 6.467294910621452e-07, "loss": 0.5907, "num_input_tokens_seen": 11660328, "step": 17835 }, { "epoch": 9.350104821802935, "grad_norm": 0.38492268323898315, "learning_rate": 6.415717306892193e-07, "loss": 0.5148, "num_input_tokens_seen": 11663400, "step": 17840 }, { "epoch": 9.35272536687631, "grad_norm": 0.4809213876724243, "learning_rate": 6.364343523364263e-07, "loss": 0.5119, "num_input_tokens_seen": 11665992, "step": 17845 }, { "epoch": 9.355345911949685, "grad_norm": 0.7329555153846741, "learning_rate": 6.313173603024802e-07, "loss": 0.4184, "num_input_tokens_seen": 11669000, "step": 17850 }, { "epoch": 9.35796645702306, "grad_norm": 0.5637892484664917, "learning_rate": 6.262207588690533e-07, "loss": 0.4505, "num_input_tokens_seen": 11671976, "step": 17855 }, { "epoch": 9.360587002096436, "grad_norm": 0.35676857829093933, "learning_rate": 6.211445523007398e-07, "loss": 0.4062, "num_input_tokens_seen": 11674984, "step": 17860 }, { "epoch": 9.36320754716981, "grad_norm": 0.6119475960731506, "learning_rate": 6.160887448450892e-07, "loss": 0.4374, "num_input_tokens_seen": 11677864, "step": 17865 }, { "epoch": 9.365828092243186, "grad_norm": 0.21970123052597046, "learning_rate": 6.11053340732562e-07, "loss": 0.4113, "num_input_tokens_seen": 11681128, "step": 17870 }, { "epoch": 9.368448637316561, "grad_norm": 0.582686722278595, "learning_rate": 6.060383441765544e-07, "loss": 0.4603, "num_input_tokens_seen": 11683880, "step": 17875 }, { "epoch": 9.371069182389936, "grad_norm": 0.6397392749786377, "learning_rate": 6.01043759373393e-07, "loss": 0.4684, "num_input_tokens_seen": 11687496, "step": 17880 }, { "epoch": 9.373689727463312, "grad_norm": 0.6261096596717834, "learning_rate": 5.960695905023128e-07, "loss": 0.4769, "num_input_tokens_seen": 11690696, "step": 17885 }, { "epoch": 9.376310272536688, "grad_norm": 0.35562098026275635, "learning_rate": 5.91115841725473e-07, "loss": 0.4168, "num_input_tokens_seen": 11693544, "step": 17890 }, { "epoch": 9.378930817610064, "grad_norm": 0.5759718418121338, "learning_rate": 5.861825171879415e-07, "loss": 0.5319, "num_input_tokens_seen": 11696840, "step": 17895 }, { "epoch": 9.381551362683439, "grad_norm": 0.3629257082939148, "learning_rate": 5.812696210177021e-07, "loss": 0.4727, "num_input_tokens_seen": 11700264, "step": 17900 }, { "epoch": 9.384171907756814, "grad_norm": 0.6497679948806763, "learning_rate": 5.763771573256415e-07, "loss": 0.3345, "num_input_tokens_seen": 11703496, "step": 17905 }, { "epoch": 9.38679245283019, "grad_norm": 0.6889035105705261, "learning_rate": 5.715051302055491e-07, "loss": 0.5759, "num_input_tokens_seen": 11706664, "step": 17910 }, { "epoch": 9.389412997903564, "grad_norm": 0.6880638599395752, "learning_rate": 5.666535437341108e-07, "loss": 0.4942, "num_input_tokens_seen": 11709480, "step": 17915 }, { "epoch": 9.39203354297694, "grad_norm": 0.3140813410282135, "learning_rate": 5.618224019709212e-07, "loss": 0.3829, "num_input_tokens_seen": 11712712, "step": 17920 }, { "epoch": 9.394654088050315, "grad_norm": 0.6988933086395264, "learning_rate": 5.570117089584548e-07, "loss": 0.5655, "num_input_tokens_seen": 11715464, "step": 17925 }, { "epoch": 9.39727463312369, "grad_norm": 0.5125806331634521, "learning_rate": 5.522214687220751e-07, "loss": 0.5152, "num_input_tokens_seen": 11718120, "step": 17930 }, { "epoch": 9.399895178197065, "grad_norm": 0.4808393716812134, "learning_rate": 5.474516852700451e-07, "loss": 0.5051, "num_input_tokens_seen": 11721512, "step": 17935 }, { "epoch": 9.40251572327044, "grad_norm": 0.5217018127441406, "learning_rate": 5.427023625934946e-07, "loss": 0.3255, "num_input_tokens_seen": 11724008, "step": 17940 }, { "epoch": 9.405136268343815, "grad_norm": 0.42602846026420593, "learning_rate": 5.379735046664419e-07, "loss": 0.4667, "num_input_tokens_seen": 11726696, "step": 17945 }, { "epoch": 9.40775681341719, "grad_norm": 0.2553784251213074, "learning_rate": 5.33265115445783e-07, "loss": 0.3792, "num_input_tokens_seen": 11730408, "step": 17950 }, { "epoch": 9.410377358490566, "grad_norm": 0.5406027436256409, "learning_rate": 5.285771988712746e-07, "loss": 0.548, "num_input_tokens_seen": 11733320, "step": 17955 }, { "epoch": 9.41299790356394, "grad_norm": 0.6861776113510132, "learning_rate": 5.239097588655595e-07, "loss": 0.4524, "num_input_tokens_seen": 11738472, "step": 17960 }, { "epoch": 9.415618448637316, "grad_norm": 0.45190155506134033, "learning_rate": 5.192627993341359e-07, "loss": 0.5167, "num_input_tokens_seen": 11741032, "step": 17965 }, { "epoch": 9.418238993710691, "grad_norm": 0.8622046113014221, "learning_rate": 5.146363241653657e-07, "loss": 0.4649, "num_input_tokens_seen": 11744328, "step": 17970 }, { "epoch": 9.420859538784066, "grad_norm": 0.5523704290390015, "learning_rate": 5.100303372304716e-07, "loss": 0.3676, "num_input_tokens_seen": 11747976, "step": 17975 }, { "epoch": 9.423480083857442, "grad_norm": 0.35104861855506897, "learning_rate": 5.054448423835373e-07, "loss": 0.5706, "num_input_tokens_seen": 11751144, "step": 17980 }, { "epoch": 9.426100628930818, "grad_norm": 0.38253793120384216, "learning_rate": 5.008798434614908e-07, "loss": 0.4913, "num_input_tokens_seen": 11754312, "step": 17985 }, { "epoch": 9.428721174004194, "grad_norm": 0.3378612697124481, "learning_rate": 4.963353442841156e-07, "loss": 0.3541, "num_input_tokens_seen": 11757896, "step": 17990 }, { "epoch": 9.431341719077569, "grad_norm": 0.31138303875923157, "learning_rate": 4.918113486540393e-07, "loss": 0.4127, "num_input_tokens_seen": 11760840, "step": 17995 }, { "epoch": 9.433962264150944, "grad_norm": 0.4710812270641327, "learning_rate": 4.873078603567421e-07, "loss": 0.4089, "num_input_tokens_seen": 11764136, "step": 18000 }, { "epoch": 9.43658280922432, "grad_norm": 1.496252179145813, "learning_rate": 4.828248831605292e-07, "loss": 0.5739, "num_input_tokens_seen": 11766696, "step": 18005 }, { "epoch": 9.439203354297694, "grad_norm": 0.7020965218544006, "learning_rate": 4.783624208165554e-07, "loss": 0.5185, "num_input_tokens_seen": 11769160, "step": 18010 }, { "epoch": 9.44182389937107, "grad_norm": 0.27475598454475403, "learning_rate": 4.739204770588035e-07, "loss": 0.504, "num_input_tokens_seen": 11772360, "step": 18015 }, { "epoch": 9.444444444444445, "grad_norm": 0.7048736810684204, "learning_rate": 4.694990556040918e-07, "loss": 0.5262, "num_input_tokens_seen": 11774984, "step": 18020 }, { "epoch": 9.44706498951782, "grad_norm": 0.2868896722793579, "learning_rate": 4.65098160152061e-07, "loss": 0.453, "num_input_tokens_seen": 11777928, "step": 18025 }, { "epoch": 9.449685534591195, "grad_norm": 0.44301727414131165, "learning_rate": 4.6071779438517924e-07, "loss": 0.4764, "num_input_tokens_seen": 11781096, "step": 18030 }, { "epoch": 9.45230607966457, "grad_norm": 0.27457869052886963, "learning_rate": 4.563579619687369e-07, "loss": 0.4143, "num_input_tokens_seen": 11784104, "step": 18035 }, { "epoch": 9.454926624737945, "grad_norm": 0.2556225061416626, "learning_rate": 4.5201866655084636e-07, "loss": 0.4955, "num_input_tokens_seen": 11787304, "step": 18040 }, { "epoch": 9.45754716981132, "grad_norm": 0.5415632724761963, "learning_rate": 4.4769991176242533e-07, "loss": 0.3568, "num_input_tokens_seen": 11790024, "step": 18045 }, { "epoch": 9.460167714884696, "grad_norm": 0.4220922887325287, "learning_rate": 4.4340170121721645e-07, "loss": 0.407, "num_input_tokens_seen": 11792840, "step": 18050 }, { "epoch": 9.46278825995807, "grad_norm": 0.6053581237792969, "learning_rate": 4.3912403851176234e-07, "loss": 0.4407, "num_input_tokens_seen": 11795720, "step": 18055 }, { "epoch": 9.465408805031446, "grad_norm": 0.6232970356941223, "learning_rate": 4.348669272254163e-07, "loss": 0.4481, "num_input_tokens_seen": 11798472, "step": 18060 }, { "epoch": 9.468029350104821, "grad_norm": 0.5517882108688354, "learning_rate": 4.306303709203374e-07, "loss": 0.4467, "num_input_tokens_seen": 11801800, "step": 18065 }, { "epoch": 9.470649895178196, "grad_norm": 0.5795467495918274, "learning_rate": 4.264143731414788e-07, "loss": 0.5289, "num_input_tokens_seen": 11805160, "step": 18070 }, { "epoch": 9.473270440251572, "grad_norm": 0.577065110206604, "learning_rate": 4.2221893741659636e-07, "loss": 0.4332, "num_input_tokens_seen": 11811816, "step": 18075 }, { "epoch": 9.475890985324948, "grad_norm": 0.3004209101200104, "learning_rate": 4.180440672562402e-07, "loss": 0.5093, "num_input_tokens_seen": 11816040, "step": 18080 }, { "epoch": 9.478511530398324, "grad_norm": 0.35319504141807556, "learning_rate": 4.1388976615374665e-07, "loss": 0.5067, "num_input_tokens_seen": 11818920, "step": 18085 }, { "epoch": 9.481132075471699, "grad_norm": 0.6736288666725159, "learning_rate": 4.097560375852516e-07, "loss": 0.5329, "num_input_tokens_seen": 11821736, "step": 18090 }, { "epoch": 9.483752620545074, "grad_norm": 0.9388456344604492, "learning_rate": 4.056428850096661e-07, "loss": 0.4947, "num_input_tokens_seen": 11825256, "step": 18095 }, { "epoch": 9.48637316561845, "grad_norm": 0.26588737964630127, "learning_rate": 4.01550311868687e-07, "loss": 0.5677, "num_input_tokens_seen": 11829416, "step": 18100 }, { "epoch": 9.488993710691824, "grad_norm": 0.3546873927116394, "learning_rate": 3.974783215867972e-07, "loss": 0.5868, "num_input_tokens_seen": 11832200, "step": 18105 }, { "epoch": 9.4916142557652, "grad_norm": 0.6437236666679382, "learning_rate": 3.9342691757124626e-07, "loss": 0.4875, "num_input_tokens_seen": 11835592, "step": 18110 }, { "epoch": 9.494234800838575, "grad_norm": 0.49940311908721924, "learning_rate": 3.8939610321206966e-07, "loss": 0.4281, "num_input_tokens_seen": 11839400, "step": 18115 }, { "epoch": 9.49685534591195, "grad_norm": 0.5421358346939087, "learning_rate": 3.853858818820694e-07, "loss": 0.571, "num_input_tokens_seen": 11842216, "step": 18120 }, { "epoch": 9.499475890985325, "grad_norm": 0.5761134028434753, "learning_rate": 3.8139625693680847e-07, "loss": 0.3894, "num_input_tokens_seen": 11844936, "step": 18125 }, { "epoch": 9.5, "eval_loss": 0.4834924042224884, "eval_runtime": 14.5659, "eval_samples_per_second": 58.218, "eval_steps_per_second": 14.555, "num_input_tokens_seen": 11845704, "step": 18126 }, { "epoch": 9.5020964360587, "grad_norm": 0.690080463886261, "learning_rate": 3.774272317146277e-07, "loss": 0.47, "num_input_tokens_seen": 11847912, "step": 18130 }, { "epoch": 9.504716981132075, "grad_norm": 0.23202413320541382, "learning_rate": 3.7347880953662597e-07, "loss": 0.364, "num_input_tokens_seen": 11853192, "step": 18135 }, { "epoch": 9.50733752620545, "grad_norm": 0.6510362029075623, "learning_rate": 3.6955099370666045e-07, "loss": 0.5445, "num_input_tokens_seen": 11856808, "step": 18140 }, { "epoch": 9.509958071278826, "grad_norm": 0.7690724730491638, "learning_rate": 3.656437875113522e-07, "loss": 0.7121, "num_input_tokens_seen": 11860360, "step": 18145 }, { "epoch": 9.5125786163522, "grad_norm": 0.39666515588760376, "learning_rate": 3.617571942200693e-07, "loss": 0.3561, "num_input_tokens_seen": 11863048, "step": 18150 }, { "epoch": 9.515199161425576, "grad_norm": 0.3493335545063019, "learning_rate": 3.5789121708493523e-07, "loss": 0.442, "num_input_tokens_seen": 11867016, "step": 18155 }, { "epoch": 9.517819706498951, "grad_norm": 0.8789134621620178, "learning_rate": 3.5404585934082635e-07, "loss": 0.4434, "num_input_tokens_seen": 11870728, "step": 18160 }, { "epoch": 9.520440251572326, "grad_norm": 0.30527299642562866, "learning_rate": 3.502211242053577e-07, "loss": 0.4806, "num_input_tokens_seen": 11873928, "step": 18165 }, { "epoch": 9.523060796645701, "grad_norm": 0.5202436447143555, "learning_rate": 3.4641701487889697e-07, "loss": 0.4182, "num_input_tokens_seen": 11876968, "step": 18170 }, { "epoch": 9.525681341719078, "grad_norm": 0.3448165953159332, "learning_rate": 3.4263353454454806e-07, "loss": 0.4497, "num_input_tokens_seen": 11880648, "step": 18175 }, { "epoch": 9.528301886792454, "grad_norm": 0.5849862694740295, "learning_rate": 3.3887068636815346e-07, "loss": 0.4647, "num_input_tokens_seen": 11883560, "step": 18180 }, { "epoch": 9.530922431865829, "grad_norm": 0.44016486406326294, "learning_rate": 3.351284734982918e-07, "loss": 0.5099, "num_input_tokens_seen": 11888296, "step": 18185 }, { "epoch": 9.533542976939204, "grad_norm": 0.38689446449279785, "learning_rate": 3.3140689906628054e-07, "loss": 0.4306, "num_input_tokens_seen": 11892872, "step": 18190 }, { "epoch": 9.536163522012579, "grad_norm": 0.7995065450668335, "learning_rate": 3.2770596618615645e-07, "loss": 0.4139, "num_input_tokens_seen": 11895944, "step": 18195 }, { "epoch": 9.538784067085954, "grad_norm": 0.8914022445678711, "learning_rate": 3.240256779546952e-07, "loss": 0.4909, "num_input_tokens_seen": 11898696, "step": 18200 }, { "epoch": 9.54140461215933, "grad_norm": 0.2563081979751587, "learning_rate": 3.2036603745139447e-07, "loss": 0.3618, "num_input_tokens_seen": 11904744, "step": 18205 }, { "epoch": 9.544025157232705, "grad_norm": 0.6384598612785339, "learning_rate": 3.167270477384743e-07, "loss": 0.4295, "num_input_tokens_seen": 11907656, "step": 18210 }, { "epoch": 9.54664570230608, "grad_norm": 0.44730842113494873, "learning_rate": 3.1310871186086834e-07, "loss": 0.538, "num_input_tokens_seen": 11910504, "step": 18215 }, { "epoch": 9.549266247379455, "grad_norm": 1.172143816947937, "learning_rate": 3.095110328462464e-07, "loss": 0.5356, "num_input_tokens_seen": 11913192, "step": 18220 }, { "epoch": 9.55188679245283, "grad_norm": 0.5281341671943665, "learning_rate": 3.0593401370497264e-07, "loss": 0.6039, "num_input_tokens_seen": 11916680, "step": 18225 }, { "epoch": 9.554507337526205, "grad_norm": 1.12156343460083, "learning_rate": 3.0237765743013626e-07, "loss": 0.4052, "num_input_tokens_seen": 11919656, "step": 18230 }, { "epoch": 9.55712788259958, "grad_norm": 0.3830340504646301, "learning_rate": 2.9884196699753453e-07, "loss": 0.4324, "num_input_tokens_seen": 11922632, "step": 18235 }, { "epoch": 9.559748427672956, "grad_norm": 0.4393095076084137, "learning_rate": 2.953269453656704e-07, "loss": 0.4843, "num_input_tokens_seen": 11926280, "step": 18240 }, { "epoch": 9.56236897274633, "grad_norm": 0.3436582684516907, "learning_rate": 2.9183259547575504e-07, "loss": 0.3137, "num_input_tokens_seen": 11929768, "step": 18245 }, { "epoch": 9.564989517819706, "grad_norm": 0.320719450712204, "learning_rate": 2.883589202517023e-07, "loss": 0.4426, "num_input_tokens_seen": 11933480, "step": 18250 }, { "epoch": 9.567610062893081, "grad_norm": 0.4414977729320526, "learning_rate": 2.849059226001177e-07, "loss": 0.3635, "num_input_tokens_seen": 11936200, "step": 18255 }, { "epoch": 9.570230607966456, "grad_norm": 0.6303771138191223, "learning_rate": 2.8147360541032065e-07, "loss": 0.5611, "num_input_tokens_seen": 11938472, "step": 18260 }, { "epoch": 9.572851153039831, "grad_norm": 0.29412195086479187, "learning_rate": 2.780619715543109e-07, "loss": 0.5361, "num_input_tokens_seen": 11942280, "step": 18265 }, { "epoch": 9.575471698113208, "grad_norm": 0.35891595482826233, "learning_rate": 2.746710238867911e-07, "loss": 0.4159, "num_input_tokens_seen": 11945800, "step": 18270 }, { "epoch": 9.578092243186584, "grad_norm": 0.3457779288291931, "learning_rate": 2.713007652451499e-07, "loss": 0.4314, "num_input_tokens_seen": 11948200, "step": 18275 }, { "epoch": 9.580712788259959, "grad_norm": 0.5204859375953674, "learning_rate": 2.6795119844946757e-07, "loss": 0.499, "num_input_tokens_seen": 11951656, "step": 18280 }, { "epoch": 9.583333333333334, "grad_norm": 0.3149822950363159, "learning_rate": 2.646223263025077e-07, "loss": 0.4606, "num_input_tokens_seen": 11955208, "step": 18285 }, { "epoch": 9.585953878406709, "grad_norm": 0.490909218788147, "learning_rate": 2.6131415158971993e-07, "loss": 0.3742, "num_input_tokens_seen": 11957768, "step": 18290 }, { "epoch": 9.588574423480084, "grad_norm": 0.5505219101905823, "learning_rate": 2.5802667707922887e-07, "loss": 0.4479, "num_input_tokens_seen": 11960552, "step": 18295 }, { "epoch": 9.59119496855346, "grad_norm": 0.6011710166931152, "learning_rate": 2.54759905521848e-07, "loss": 0.5256, "num_input_tokens_seen": 11964552, "step": 18300 }, { "epoch": 9.593815513626835, "grad_norm": 1.0806363821029663, "learning_rate": 2.51513839651063e-07, "loss": 0.4309, "num_input_tokens_seen": 11966824, "step": 18305 }, { "epoch": 9.59643605870021, "grad_norm": 0.4730078876018524, "learning_rate": 2.4828848218302615e-07, "loss": 0.4487, "num_input_tokens_seen": 11969832, "step": 18310 }, { "epoch": 9.599056603773585, "grad_norm": 0.3813066780567169, "learning_rate": 2.450838358165786e-07, "loss": 0.405, "num_input_tokens_seen": 11972424, "step": 18315 }, { "epoch": 9.60167714884696, "grad_norm": 0.5417377352714539, "learning_rate": 2.41899903233217e-07, "loss": 0.4735, "num_input_tokens_seen": 11976552, "step": 18320 }, { "epoch": 9.604297693920335, "grad_norm": 0.4749578535556793, "learning_rate": 2.387366870971103e-07, "loss": 0.4398, "num_input_tokens_seen": 11979720, "step": 18325 }, { "epoch": 9.60691823899371, "grad_norm": 0.404983252286911, "learning_rate": 2.3559419005509675e-07, "loss": 0.5279, "num_input_tokens_seen": 11982536, "step": 18330 }, { "epoch": 9.609538784067086, "grad_norm": 0.2891688644886017, "learning_rate": 2.3247241473667026e-07, "loss": 0.3406, "num_input_tokens_seen": 11985384, "step": 18335 }, { "epoch": 9.61215932914046, "grad_norm": 0.4663790464401245, "learning_rate": 2.2937136375399126e-07, "loss": 0.4203, "num_input_tokens_seen": 11988712, "step": 18340 }, { "epoch": 9.614779874213836, "grad_norm": 0.2503162622451782, "learning_rate": 2.2629103970188137e-07, "loss": 0.4329, "num_input_tokens_seen": 11993352, "step": 18345 }, { "epoch": 9.617400419287211, "grad_norm": 0.33969846367836, "learning_rate": 2.2323144515780935e-07, "loss": 0.471, "num_input_tokens_seen": 11995848, "step": 18350 }, { "epoch": 9.620020964360586, "grad_norm": 0.4073438048362732, "learning_rate": 2.201925826819079e-07, "loss": 0.5246, "num_input_tokens_seen": 11999336, "step": 18355 }, { "epoch": 9.622641509433961, "grad_norm": 0.4991224706172943, "learning_rate": 2.1717445481695408e-07, "loss": 0.4717, "num_input_tokens_seen": 12003080, "step": 18360 }, { "epoch": 9.625262054507338, "grad_norm": 0.26595538854599, "learning_rate": 2.1417706408838333e-07, "loss": 0.2833, "num_input_tokens_seen": 12007240, "step": 18365 }, { "epoch": 9.627882599580714, "grad_norm": 0.3922433853149414, "learning_rate": 2.112004130042755e-07, "loss": 0.5024, "num_input_tokens_seen": 12009928, "step": 18370 }, { "epoch": 9.630503144654089, "grad_norm": 0.35390031337738037, "learning_rate": 2.082445040553549e-07, "loss": 0.4274, "num_input_tokens_seen": 12013384, "step": 18375 }, { "epoch": 9.633123689727464, "grad_norm": 0.6572347283363342, "learning_rate": 2.053093397149902e-07, "loss": 0.4971, "num_input_tokens_seen": 12016520, "step": 18380 }, { "epoch": 9.635744234800839, "grad_norm": 0.6359766721725464, "learning_rate": 2.0239492243919467e-07, "loss": 0.4228, "num_input_tokens_seen": 12020040, "step": 18385 }, { "epoch": 9.638364779874214, "grad_norm": 0.4401771128177643, "learning_rate": 1.9950125466662028e-07, "loss": 0.4486, "num_input_tokens_seen": 12023464, "step": 18390 }, { "epoch": 9.64098532494759, "grad_norm": 0.5725033283233643, "learning_rate": 1.9662833881855248e-07, "loss": 0.3351, "num_input_tokens_seen": 12026664, "step": 18395 }, { "epoch": 9.643605870020965, "grad_norm": 0.3286609947681427, "learning_rate": 1.9377617729891828e-07, "loss": 0.3942, "num_input_tokens_seen": 12030440, "step": 18400 }, { "epoch": 9.64622641509434, "grad_norm": 0.46557703614234924, "learning_rate": 1.9094477249427534e-07, "loss": 0.5038, "num_input_tokens_seen": 12032872, "step": 18405 }, { "epoch": 9.648846960167715, "grad_norm": 0.3898506760597229, "learning_rate": 1.8813412677381737e-07, "loss": 0.3857, "num_input_tokens_seen": 12037864, "step": 18410 }, { "epoch": 9.65146750524109, "grad_norm": 0.43622323870658875, "learning_rate": 1.8534424248935756e-07, "loss": 0.312, "num_input_tokens_seen": 12040904, "step": 18415 }, { "epoch": 9.654088050314465, "grad_norm": 0.44135627150535583, "learning_rate": 1.8257512197535076e-07, "loss": 0.4536, "num_input_tokens_seen": 12044296, "step": 18420 }, { "epoch": 9.65670859538784, "grad_norm": 0.40219613909721375, "learning_rate": 1.7982676754886574e-07, "loss": 0.3555, "num_input_tokens_seen": 12047208, "step": 18425 }, { "epoch": 9.659329140461216, "grad_norm": 0.4098197817802429, "learning_rate": 1.7709918150959904e-07, "loss": 0.5172, "num_input_tokens_seen": 12050696, "step": 18430 }, { "epoch": 9.66194968553459, "grad_norm": 0.43824711441993713, "learning_rate": 1.7439236613987775e-07, "loss": 0.5077, "num_input_tokens_seen": 12054536, "step": 18435 }, { "epoch": 9.664570230607966, "grad_norm": 0.5852528214454651, "learning_rate": 1.717063237046318e-07, "loss": 0.463, "num_input_tokens_seen": 12056776, "step": 18440 }, { "epoch": 9.667190775681341, "grad_norm": 0.49174049496650696, "learning_rate": 1.6904105645142444e-07, "loss": 0.516, "num_input_tokens_seen": 12059720, "step": 18445 }, { "epoch": 9.669811320754716, "grad_norm": 0.5252620577812195, "learning_rate": 1.6639656661043e-07, "loss": 0.4543, "num_input_tokens_seen": 12063304, "step": 18450 }, { "epoch": 9.672431865828091, "grad_norm": 0.3917083442211151, "learning_rate": 1.6377285639443407e-07, "loss": 0.5142, "num_input_tokens_seen": 12067592, "step": 18455 }, { "epoch": 9.675052410901468, "grad_norm": 0.3949885368347168, "learning_rate": 1.61169927998836e-07, "loss": 0.4691, "num_input_tokens_seen": 12070856, "step": 18460 }, { "epoch": 9.677672955974844, "grad_norm": 0.36282747983932495, "learning_rate": 1.5858778360165195e-07, "loss": 0.474, "num_input_tokens_seen": 12074280, "step": 18465 }, { "epoch": 9.680293501048219, "grad_norm": 0.4930509626865387, "learning_rate": 1.5602642536350075e-07, "loss": 0.4082, "num_input_tokens_seen": 12077288, "step": 18470 }, { "epoch": 9.682914046121594, "grad_norm": 0.42963024973869324, "learning_rate": 1.5348585542760974e-07, "loss": 0.3857, "num_input_tokens_seen": 12082056, "step": 18475 }, { "epoch": 9.685534591194969, "grad_norm": 0.3600664436817169, "learning_rate": 1.5096607591980894e-07, "loss": 0.5345, "num_input_tokens_seen": 12085128, "step": 18480 }, { "epoch": 9.688155136268344, "grad_norm": 0.37744033336639404, "learning_rate": 1.4846708894853955e-07, "loss": 0.5619, "num_input_tokens_seen": 12089032, "step": 18485 }, { "epoch": 9.69077568134172, "grad_norm": 0.4206576347351074, "learning_rate": 1.459888966048373e-07, "loss": 0.5405, "num_input_tokens_seen": 12091496, "step": 18490 }, { "epoch": 9.693396226415095, "grad_norm": 0.4490574598312378, "learning_rate": 1.4353150096234058e-07, "loss": 0.5699, "num_input_tokens_seen": 12095208, "step": 18495 }, { "epoch": 9.69601677148847, "grad_norm": 0.499492883682251, "learning_rate": 1.410949040772852e-07, "loss": 0.519, "num_input_tokens_seen": 12099176, "step": 18500 }, { "epoch": 9.698637316561845, "grad_norm": 0.5055239796638489, "learning_rate": 1.3867910798850692e-07, "loss": 0.5599, "num_input_tokens_seen": 12102408, "step": 18505 }, { "epoch": 9.70125786163522, "grad_norm": 0.40232983231544495, "learning_rate": 1.3628411471742764e-07, "loss": 0.552, "num_input_tokens_seen": 12105704, "step": 18510 }, { "epoch": 9.703878406708595, "grad_norm": 0.2769545614719391, "learning_rate": 1.3390992626807485e-07, "loss": 0.5464, "num_input_tokens_seen": 12108520, "step": 18515 }, { "epoch": 9.70649895178197, "grad_norm": 0.4106120467185974, "learning_rate": 1.315565446270567e-07, "loss": 0.4415, "num_input_tokens_seen": 12111912, "step": 18520 }, { "epoch": 9.709119496855346, "grad_norm": 0.5849049687385559, "learning_rate": 1.292239717635785e-07, "loss": 0.435, "num_input_tokens_seen": 12116040, "step": 18525 }, { "epoch": 9.71174004192872, "grad_norm": 0.3719842731952667, "learning_rate": 1.269122096294262e-07, "loss": 0.4188, "num_input_tokens_seen": 12118632, "step": 18530 }, { "epoch": 9.714360587002096, "grad_norm": 0.6240454912185669, "learning_rate": 1.24621260158983e-07, "loss": 0.3341, "num_input_tokens_seen": 12121192, "step": 18535 }, { "epoch": 9.716981132075471, "grad_norm": 0.5060901045799255, "learning_rate": 1.2235112526920723e-07, "loss": 0.4317, "num_input_tokens_seen": 12123976, "step": 18540 }, { "epoch": 9.719601677148846, "grad_norm": 0.3751234710216522, "learning_rate": 1.2010180685964324e-07, "loss": 0.5838, "num_input_tokens_seen": 12127816, "step": 18545 }, { "epoch": 9.722222222222221, "grad_norm": 0.3203640580177307, "learning_rate": 1.1787330681241881e-07, "loss": 0.5292, "num_input_tokens_seen": 12131848, "step": 18550 }, { "epoch": 9.724842767295598, "grad_norm": 0.4183841049671173, "learning_rate": 1.156656269922396e-07, "loss": 0.4411, "num_input_tokens_seen": 12135432, "step": 18555 }, { "epoch": 9.727463312368974, "grad_norm": 0.49174582958221436, "learning_rate": 1.1347876924639455e-07, "loss": 0.5024, "num_input_tokens_seen": 12138376, "step": 18560 }, { "epoch": 9.730083857442349, "grad_norm": 0.3850119113922119, "learning_rate": 1.1131273540474496e-07, "loss": 0.566, "num_input_tokens_seen": 12141480, "step": 18565 }, { "epoch": 9.732704402515724, "grad_norm": 0.5228574275970459, "learning_rate": 1.091675272797299e-07, "loss": 0.417, "num_input_tokens_seen": 12144168, "step": 18570 }, { "epoch": 9.735324947589099, "grad_norm": 0.38529208302497864, "learning_rate": 1.0704314666635795e-07, "loss": 0.5753, "num_input_tokens_seen": 12148168, "step": 18575 }, { "epoch": 9.737945492662474, "grad_norm": 0.3930829167366028, "learning_rate": 1.0493959534221832e-07, "loss": 0.3291, "num_input_tokens_seen": 12150696, "step": 18580 }, { "epoch": 9.74056603773585, "grad_norm": 0.6231387853622437, "learning_rate": 1.0285687506746133e-07, "loss": 0.3677, "num_input_tokens_seen": 12153928, "step": 18585 }, { "epoch": 9.743186582809225, "grad_norm": 0.4293641746044159, "learning_rate": 1.0079498758481798e-07, "loss": 0.4279, "num_input_tokens_seen": 12157384, "step": 18590 }, { "epoch": 9.7458071278826, "grad_norm": 0.34272700548171997, "learning_rate": 9.87539346195776e-08, "loss": 0.4519, "num_input_tokens_seen": 12160200, "step": 18595 }, { "epoch": 9.748427672955975, "grad_norm": 0.5272493362426758, "learning_rate": 9.673371787960183e-08, "loss": 0.6373, "num_input_tokens_seen": 12162504, "step": 18600 }, { "epoch": 9.75104821802935, "grad_norm": 0.5055599212646484, "learning_rate": 9.473433905531626e-08, "loss": 0.4657, "num_input_tokens_seen": 12165288, "step": 18605 }, { "epoch": 9.753668763102725, "grad_norm": 0.8412028551101685, "learning_rate": 9.275579981970483e-08, "loss": 0.4409, "num_input_tokens_seen": 12167912, "step": 18610 }, { "epoch": 9.7562893081761, "grad_norm": 1.7234314680099487, "learning_rate": 9.07981018283266e-08, "loss": 0.5368, "num_input_tokens_seen": 12171624, "step": 18615 }, { "epoch": 9.758909853249476, "grad_norm": 0.78414386510849, "learning_rate": 8.886124671928786e-08, "loss": 0.4864, "num_input_tokens_seen": 12174632, "step": 18620 }, { "epoch": 9.76153039832285, "grad_norm": 0.7347813844680786, "learning_rate": 8.694523611326444e-08, "loss": 0.5812, "num_input_tokens_seen": 12177896, "step": 18625 }, { "epoch": 9.764150943396226, "grad_norm": 0.47013288736343384, "learning_rate": 8.505007161348222e-08, "loss": 0.4935, "num_input_tokens_seen": 12181256, "step": 18630 }, { "epoch": 9.766771488469601, "grad_norm": 0.4189368784427643, "learning_rate": 8.31757548057338e-08, "loss": 0.5143, "num_input_tokens_seen": 12183848, "step": 18635 }, { "epoch": 9.769392033542976, "grad_norm": 0.3187235891819, "learning_rate": 8.132228725835634e-08, "loss": 0.4401, "num_input_tokens_seen": 12187208, "step": 18640 }, { "epoch": 9.772012578616351, "grad_norm": 0.29744425415992737, "learning_rate": 7.948967052225087e-08, "loss": 0.4805, "num_input_tokens_seen": 12190568, "step": 18645 }, { "epoch": 9.774633123689728, "grad_norm": 0.47845470905303955, "learning_rate": 7.767790613086301e-08, "loss": 0.3346, "num_input_tokens_seen": 12193224, "step": 18650 }, { "epoch": 9.777253668763104, "grad_norm": 0.5712874531745911, "learning_rate": 7.588699560019952e-08, "loss": 0.4305, "num_input_tokens_seen": 12195816, "step": 18655 }, { "epoch": 9.779874213836479, "grad_norm": 0.7028350830078125, "learning_rate": 7.411694042881168e-08, "loss": 0.4623, "num_input_tokens_seen": 12198856, "step": 18660 }, { "epoch": 9.782494758909854, "grad_norm": 0.569786548614502, "learning_rate": 7.23677420977953e-08, "loss": 0.546, "num_input_tokens_seen": 12201992, "step": 18665 }, { "epoch": 9.785115303983229, "grad_norm": 0.49996325373649597, "learning_rate": 7.063940207080733e-08, "loss": 0.4817, "num_input_tokens_seen": 12205608, "step": 18670 }, { "epoch": 9.787735849056604, "grad_norm": 0.3198641240596771, "learning_rate": 6.893192179403817e-08, "loss": 0.5414, "num_input_tokens_seen": 12209352, "step": 18675 }, { "epoch": 9.79035639412998, "grad_norm": 0.6854060292243958, "learning_rate": 6.724530269623108e-08, "loss": 0.4559, "num_input_tokens_seen": 12213768, "step": 18680 }, { "epoch": 9.792976939203355, "grad_norm": 0.6120165586471558, "learning_rate": 6.557954618867102e-08, "loss": 0.4483, "num_input_tokens_seen": 12216776, "step": 18685 }, { "epoch": 9.79559748427673, "grad_norm": 0.3482840657234192, "learning_rate": 6.393465366519024e-08, "loss": 0.3922, "num_input_tokens_seen": 12221000, "step": 18690 }, { "epoch": 9.798218029350105, "grad_norm": 0.7613972425460815, "learning_rate": 6.231062650215724e-08, "loss": 0.4646, "num_input_tokens_seen": 12224200, "step": 18695 }, { "epoch": 9.80083857442348, "grad_norm": 0.4207402467727661, "learning_rate": 6.070746605848221e-08, "loss": 0.4701, "num_input_tokens_seen": 12226920, "step": 18700 }, { "epoch": 9.803459119496855, "grad_norm": 0.3289869427680969, "learning_rate": 5.912517367561987e-08, "loss": 0.5069, "num_input_tokens_seen": 12229960, "step": 18705 }, { "epoch": 9.80607966457023, "grad_norm": 0.5989525318145752, "learning_rate": 5.756375067755837e-08, "loss": 0.4213, "num_input_tokens_seen": 12232040, "step": 18710 }, { "epoch": 9.808700209643606, "grad_norm": 0.4609276056289673, "learning_rate": 5.602319837082481e-08, "loss": 0.397, "num_input_tokens_seen": 12234472, "step": 18715 }, { "epoch": 9.81132075471698, "grad_norm": 0.9318786859512329, "learning_rate": 5.450351804448528e-08, "loss": 0.351, "num_input_tokens_seen": 12237448, "step": 18720 }, { "epoch": 9.813941299790356, "grad_norm": 0.33280861377716064, "learning_rate": 5.3004710970133705e-08, "loss": 0.4553, "num_input_tokens_seen": 12240264, "step": 18725 }, { "epoch": 9.816561844863731, "grad_norm": 0.40297412872314453, "learning_rate": 5.1526778401911334e-08, "loss": 0.3996, "num_input_tokens_seen": 12243176, "step": 18730 }, { "epoch": 9.819182389937106, "grad_norm": 0.5627027750015259, "learning_rate": 5.0069721576476156e-08, "loss": 0.4716, "num_input_tokens_seen": 12245864, "step": 18735 }, { "epoch": 9.821802935010481, "grad_norm": 0.41653871536254883, "learning_rate": 4.863354171303347e-08, "loss": 0.4376, "num_input_tokens_seen": 12248712, "step": 18740 }, { "epoch": 9.824423480083858, "grad_norm": 1.061939001083374, "learning_rate": 4.72182400133081e-08, "loss": 0.5239, "num_input_tokens_seen": 12251144, "step": 18745 }, { "epoch": 9.827044025157234, "grad_norm": 0.5348390340805054, "learning_rate": 4.582381766156385e-08, "loss": 0.4595, "num_input_tokens_seen": 12255336, "step": 18750 }, { "epoch": 9.829664570230609, "grad_norm": 0.8413878083229065, "learning_rate": 4.445027582458683e-08, "loss": 0.5102, "num_input_tokens_seen": 12257672, "step": 18755 }, { "epoch": 9.832285115303984, "grad_norm": 0.41842082142829895, "learning_rate": 4.309761565169379e-08, "loss": 0.4621, "num_input_tokens_seen": 12261032, "step": 18760 }, { "epoch": 9.834905660377359, "grad_norm": 0.4691280126571655, "learning_rate": 4.1765838274732125e-08, "loss": 0.4415, "num_input_tokens_seen": 12264488, "step": 18765 }, { "epoch": 9.837526205450734, "grad_norm": 0.7038111686706543, "learning_rate": 4.045494480807155e-08, "loss": 0.4718, "num_input_tokens_seen": 12267432, "step": 18770 }, { "epoch": 9.84014675052411, "grad_norm": 0.6131623387336731, "learning_rate": 3.916493634860407e-08, "loss": 0.3475, "num_input_tokens_seen": 12270888, "step": 18775 }, { "epoch": 9.842767295597485, "grad_norm": 0.30119651556015015, "learning_rate": 3.789581397575515e-08, "loss": 0.4467, "num_input_tokens_seen": 12273896, "step": 18780 }, { "epoch": 9.84538784067086, "grad_norm": 0.44897714257240295, "learning_rate": 3.664757875146418e-08, "loss": 0.5189, "num_input_tokens_seen": 12276328, "step": 18785 }, { "epoch": 9.848008385744235, "grad_norm": 0.4707897901535034, "learning_rate": 3.5420231720198485e-08, "loss": 0.408, "num_input_tokens_seen": 12279016, "step": 18790 }, { "epoch": 9.85062893081761, "grad_norm": 0.6458615660667419, "learning_rate": 3.421377390894764e-08, "loss": 0.3635, "num_input_tokens_seen": 12281512, "step": 18795 }, { "epoch": 9.853249475890985, "grad_norm": 0.5552914142608643, "learning_rate": 3.3028206327218035e-08, "loss": 0.5342, "num_input_tokens_seen": 12285160, "step": 18800 }, { "epoch": 9.85587002096436, "grad_norm": 0.4067921042442322, "learning_rate": 3.1863529967041117e-08, "loss": 0.4698, "num_input_tokens_seen": 12288616, "step": 18805 }, { "epoch": 9.858490566037736, "grad_norm": 0.40634506940841675, "learning_rate": 3.071974580296233e-08, "loss": 0.5412, "num_input_tokens_seen": 12292680, "step": 18810 }, { "epoch": 9.86111111111111, "grad_norm": 0.33611026406288147, "learning_rate": 2.9596854792052207e-08, "loss": 0.412, "num_input_tokens_seen": 12297160, "step": 18815 }, { "epoch": 9.863731656184486, "grad_norm": 0.36750030517578125, "learning_rate": 2.8494857873889724e-08, "loss": 0.6077, "num_input_tokens_seen": 12300520, "step": 18820 }, { "epoch": 9.866352201257861, "grad_norm": 0.4893649220466614, "learning_rate": 2.741375597057616e-08, "loss": 0.4706, "num_input_tokens_seen": 12303496, "step": 18825 }, { "epoch": 9.868972746331236, "grad_norm": 0.45258384943008423, "learning_rate": 2.6353549986729566e-08, "loss": 0.5319, "num_input_tokens_seen": 12306568, "step": 18830 }, { "epoch": 9.871593291404611, "grad_norm": 0.43796899914741516, "learning_rate": 2.531424080948197e-08, "loss": 0.5532, "num_input_tokens_seen": 12309160, "step": 18835 }, { "epoch": 9.874213836477988, "grad_norm": 0.3935408592224121, "learning_rate": 2.4295829308482176e-08, "loss": 0.4161, "num_input_tokens_seen": 12312776, "step": 18840 }, { "epoch": 9.876834381551364, "grad_norm": 0.6801460981369019, "learning_rate": 2.329831633588464e-08, "loss": 0.3986, "num_input_tokens_seen": 12316104, "step": 18845 }, { "epoch": 9.879454926624739, "grad_norm": 0.7599796652793884, "learning_rate": 2.232170272636891e-08, "loss": 0.5599, "num_input_tokens_seen": 12318568, "step": 18850 }, { "epoch": 9.882075471698114, "grad_norm": 0.6378341317176819, "learning_rate": 2.136598929711464e-08, "loss": 0.5172, "num_input_tokens_seen": 12321736, "step": 18855 }, { "epoch": 9.884696016771489, "grad_norm": 0.4267111122608185, "learning_rate": 2.0431176847823807e-08, "loss": 0.492, "num_input_tokens_seen": 12325032, "step": 18860 }, { "epoch": 9.887316561844864, "grad_norm": 0.23660951852798462, "learning_rate": 1.9517266160704038e-08, "loss": 0.5057, "num_input_tokens_seen": 12329160, "step": 18865 }, { "epoch": 9.88993710691824, "grad_norm": 0.3079928755760193, "learning_rate": 1.8624258000471405e-08, "loss": 0.6807, "num_input_tokens_seen": 12332392, "step": 18870 }, { "epoch": 9.892557651991615, "grad_norm": 0.2520180940628052, "learning_rate": 1.7752153114358737e-08, "loss": 0.4312, "num_input_tokens_seen": 12336456, "step": 18875 }, { "epoch": 9.89517819706499, "grad_norm": 0.40697428584098816, "learning_rate": 1.6900952232098977e-08, "loss": 0.4153, "num_input_tokens_seen": 12339080, "step": 18880 }, { "epoch": 9.897798742138365, "grad_norm": 0.5415671467781067, "learning_rate": 1.6070656065939048e-08, "loss": 0.3972, "num_input_tokens_seen": 12341768, "step": 18885 }, { "epoch": 9.90041928721174, "grad_norm": 0.5272725820541382, "learning_rate": 1.526126531063432e-08, "loss": 0.4125, "num_input_tokens_seen": 12344936, "step": 18890 }, { "epoch": 9.903039832285115, "grad_norm": 0.4392356872558594, "learning_rate": 1.4472780643445817e-08, "loss": 0.4603, "num_input_tokens_seen": 12349416, "step": 18895 }, { "epoch": 9.90566037735849, "grad_norm": 0.9610886573791504, "learning_rate": 1.3705202724142996e-08, "loss": 0.5117, "num_input_tokens_seen": 12352360, "step": 18900 }, { "epoch": 9.908280922431866, "grad_norm": 0.4717167019844055, "learning_rate": 1.2958532194995432e-08, "loss": 0.5271, "num_input_tokens_seen": 12355688, "step": 18905 }, { "epoch": 9.91090146750524, "grad_norm": 0.31005769968032837, "learning_rate": 1.2232769680789457e-08, "loss": 0.4518, "num_input_tokens_seen": 12359560, "step": 18910 }, { "epoch": 9.913522012578616, "grad_norm": 0.4659147560596466, "learning_rate": 1.152791578880319e-08, "loss": 0.5226, "num_input_tokens_seen": 12363656, "step": 18915 }, { "epoch": 9.916142557651991, "grad_norm": 0.30648958683013916, "learning_rate": 1.0843971108828732e-08, "loss": 0.3914, "num_input_tokens_seen": 12367688, "step": 18920 }, { "epoch": 9.918763102725366, "grad_norm": 0.7900435924530029, "learning_rate": 1.018093621316385e-08, "loss": 0.469, "num_input_tokens_seen": 12371400, "step": 18925 }, { "epoch": 9.921383647798741, "grad_norm": 0.4129381477832794, "learning_rate": 9.53881165659809e-09, "loss": 0.4517, "num_input_tokens_seen": 12374152, "step": 18930 }, { "epoch": 9.924004192872118, "grad_norm": 0.6278218030929565, "learning_rate": 8.91759797644054e-09, "loss": 0.3564, "num_input_tokens_seen": 12378472, "step": 18935 }, { "epoch": 9.926624737945493, "grad_norm": 0.6270018219947815, "learning_rate": 8.317295692486516e-09, "loss": 0.5057, "num_input_tokens_seen": 12381480, "step": 18940 }, { "epoch": 9.929245283018869, "grad_norm": 0.2727804183959961, "learning_rate": 7.737905307045323e-09, "loss": 0.4257, "num_input_tokens_seen": 12384648, "step": 18945 }, { "epoch": 9.931865828092244, "grad_norm": 0.5214640498161316, "learning_rate": 7.179427304926378e-09, "loss": 0.3688, "num_input_tokens_seen": 12387432, "step": 18950 }, { "epoch": 9.934486373165619, "grad_norm": 0.4084150195121765, "learning_rate": 6.641862153433653e-09, "loss": 0.4423, "num_input_tokens_seen": 12390984, "step": 18955 }, { "epoch": 9.937106918238994, "grad_norm": 0.4298418164253235, "learning_rate": 6.125210302382333e-09, "loss": 0.3978, "num_input_tokens_seen": 12394760, "step": 18960 }, { "epoch": 9.93972746331237, "grad_norm": 0.4928020238876343, "learning_rate": 5.629472184079387e-09, "loss": 0.5307, "num_input_tokens_seen": 12397768, "step": 18965 }, { "epoch": 9.942348008385745, "grad_norm": 0.4760669469833374, "learning_rate": 5.154648213334668e-09, "loss": 0.3971, "num_input_tokens_seen": 12400968, "step": 18970 }, { "epoch": 9.94496855345912, "grad_norm": 0.35223206877708435, "learning_rate": 4.700738787466463e-09, "loss": 0.5256, "num_input_tokens_seen": 12406664, "step": 18975 }, { "epoch": 9.947589098532495, "grad_norm": 0.5487617254257202, "learning_rate": 4.26774428627652e-09, "loss": 0.446, "num_input_tokens_seen": 12409448, "step": 18980 }, { "epoch": 9.95020964360587, "grad_norm": 0.3044726252555847, "learning_rate": 3.855665072080572e-09, "loss": 0.3501, "num_input_tokens_seen": 12412744, "step": 18985 }, { "epoch": 9.952830188679245, "grad_norm": 0.3954128623008728, "learning_rate": 3.464501489683358e-09, "loss": 0.6239, "num_input_tokens_seen": 12415656, "step": 18990 }, { "epoch": 9.95545073375262, "grad_norm": 0.43222248554229736, "learning_rate": 3.094253866398056e-09, "loss": 0.5632, "num_input_tokens_seen": 12418472, "step": 18995 }, { "epoch": 9.958071278825996, "grad_norm": 0.43112245202064514, "learning_rate": 2.7449225120268484e-09, "loss": 0.4081, "num_input_tokens_seen": 12421768, "step": 19000 }, { "epoch": 9.96069182389937, "grad_norm": 0.4216673672199249, "learning_rate": 2.416507718877581e-09, "loss": 0.445, "num_input_tokens_seen": 12425800, "step": 19005 }, { "epoch": 9.963312368972746, "grad_norm": 0.7173749208450317, "learning_rate": 2.109009761747105e-09, "loss": 0.5609, "num_input_tokens_seen": 12429288, "step": 19010 }, { "epoch": 9.965932914046121, "grad_norm": 0.44205307960510254, "learning_rate": 1.8224288979434844e-09, "loss": 0.3956, "num_input_tokens_seen": 12433160, "step": 19015 }, { "epoch": 9.968553459119496, "grad_norm": 1.3142393827438354, "learning_rate": 1.5567653672554638e-09, "loss": 0.4545, "num_input_tokens_seen": 12435944, "step": 19020 }, { "epoch": 9.971174004192871, "grad_norm": 0.3813258409500122, "learning_rate": 1.3120193919857748e-09, "loss": 0.4001, "num_input_tokens_seen": 12438216, "step": 19025 }, { "epoch": 9.973794549266248, "grad_norm": 0.6833551526069641, "learning_rate": 1.0881911769261565e-09, "loss": 0.4123, "num_input_tokens_seen": 12440904, "step": 19030 }, { "epoch": 9.976415094339622, "grad_norm": 0.754343569278717, "learning_rate": 8.852809093601311e-10, "loss": 0.5261, "num_input_tokens_seen": 12443752, "step": 19035 }, { "epoch": 9.979035639412999, "grad_norm": 0.9295276403427124, "learning_rate": 7.03288759076881e-10, "loss": 0.6735, "num_input_tokens_seen": 12446152, "step": 19040 }, { "epoch": 9.981656184486374, "grad_norm": 0.44524165987968445, "learning_rate": 5.422148783629233e-10, "loss": 0.461, "num_input_tokens_seen": 12449160, "step": 19045 }, { "epoch": 9.984276729559749, "grad_norm": 0.39607521891593933, "learning_rate": 4.0205940199100623e-10, "loss": 0.4993, "num_input_tokens_seen": 12453064, "step": 19050 }, { "epoch": 9.986897274633124, "grad_norm": 0.8619703054428101, "learning_rate": 2.828224472395391e-10, "loss": 0.458, "num_input_tokens_seen": 12455944, "step": 19055 }, { "epoch": 9.9895178197065, "grad_norm": 0.5295709371566772, "learning_rate": 1.8450411388426515e-10, "loss": 0.5059, "num_input_tokens_seen": 12459528, "step": 19060 }, { "epoch": 9.992138364779874, "grad_norm": 0.31087425351142883, "learning_rate": 1.0710448418715935e-10, "loss": 0.4861, "num_input_tokens_seen": 12463048, "step": 19065 }, { "epoch": 9.99475890985325, "grad_norm": 0.4976613223552704, "learning_rate": 5.062362291585743e-11, "loss": 0.4444, "num_input_tokens_seen": 12467240, "step": 19070 }, { "epoch": 9.997379454926625, "grad_norm": 0.4553039073944092, "learning_rate": 1.5061577329777976e-11, "loss": 0.6473, "num_input_tokens_seen": 12470216, "step": 19075 }, { "epoch": 10.0, "grad_norm": 1.2901636362075806, "learning_rate": 4.183771884491705e-13, "loss": 0.6659, "num_input_tokens_seen": 12472912, "step": 19080 }, { "epoch": 10.0, "eval_loss": 0.48326343297958374, "eval_runtime": 14.5223, "eval_samples_per_second": 58.393, "eval_steps_per_second": 14.598, "num_input_tokens_seen": 12472912, "step": 19080 }, { "epoch": 10.0, "num_input_tokens_seen": 12472912, "step": 19080, "total_flos": 5.6166951473145446e+17, "train_loss": 0.5117214915262578, "train_runtime": 4512.91, "train_samples_per_second": 16.905, "train_steps_per_second": 4.228 } ], "logging_steps": 5, "max_steps": 19080, "num_input_tokens_seen": 12472912, "num_train_epochs": 10, "save_steps": 954, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.6166951473145446e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }