{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8144612112848125, "eval_steps": 500, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009049569014275695, "grad_norm": 159.94659423828125, "learning_rate": 6.024096385542169e-07, "loss": 3.3777, "step": 10 }, { "epoch": 0.001809913802855139, "grad_norm": 92.94371795654297, "learning_rate": 1.2048192771084338e-06, "loss": 2.8739, "step": 20 }, { "epoch": 0.0027148707042827084, "grad_norm": 40.02893829345703, "learning_rate": 1.8072289156626508e-06, "loss": 2.0823, "step": 30 }, { "epoch": 0.003619827605710278, "grad_norm": 11.130951881408691, "learning_rate": 2.4096385542168676e-06, "loss": 1.3735, "step": 40 }, { "epoch": 0.004524784507137848, "grad_norm": 6.264873027801514, "learning_rate": 3.012048192771085e-06, "loss": 0.77, "step": 50 }, { "epoch": 0.005429741408565417, "grad_norm": 5.753260135650635, "learning_rate": 3.6144578313253016e-06, "loss": 0.5821, "step": 60 }, { "epoch": 0.006334698309992987, "grad_norm": 5.4176836013793945, "learning_rate": 4.216867469879519e-06, "loss": 0.5349, "step": 70 }, { "epoch": 0.007239655211420556, "grad_norm": 5.466179370880127, "learning_rate": 4.819277108433735e-06, "loss": 0.4904, "step": 80 }, { "epoch": 0.008144612112848126, "grad_norm": 5.408243179321289, "learning_rate": 5.421686746987952e-06, "loss": 0.4391, "step": 90 }, { "epoch": 0.009049569014275696, "grad_norm": 4.930578708648682, "learning_rate": 6.02409638554217e-06, "loss": 0.382, "step": 100 }, { "epoch": 0.009954525915703265, "grad_norm": 4.409604072570801, "learning_rate": 6.626506024096386e-06, "loss": 0.3354, "step": 110 }, { "epoch": 0.010859482817130834, "grad_norm": 3.2613117694854736, "learning_rate": 7.228915662650603e-06, "loss": 0.2748, "step": 120 }, { "epoch": 0.011764439718558404, "grad_norm": 3.974195718765259, "learning_rate": 7.83132530120482e-06, "loss": 0.2458, "step": 130 }, { "epoch": 0.012669396619985973, "grad_norm": 2.277785539627075, "learning_rate": 8.433734939759038e-06, "loss": 0.2215, "step": 140 }, { "epoch": 0.013574353521413543, "grad_norm": 1.618218183517456, "learning_rate": 9.036144578313254e-06, "loss": 0.2, "step": 150 }, { "epoch": 0.014479310422841112, "grad_norm": 1.4689830541610718, "learning_rate": 9.63855421686747e-06, "loss": 0.1821, "step": 160 }, { "epoch": 0.015384267324268681, "grad_norm": 1.7365655899047852, "learning_rate": 1.0240963855421688e-05, "loss": 0.1699, "step": 170 }, { "epoch": 0.01628922422569625, "grad_norm": 0.9865982532501221, "learning_rate": 1.0843373493975904e-05, "loss": 0.1667, "step": 180 }, { "epoch": 0.01719418112712382, "grad_norm": 0.9514006972312927, "learning_rate": 1.1445783132530122e-05, "loss": 0.1651, "step": 190 }, { "epoch": 0.01809913802855139, "grad_norm": 1.1965084075927734, "learning_rate": 1.204819277108434e-05, "loss": 0.1564, "step": 200 }, { "epoch": 0.01900409492997896, "grad_norm": 1.2576557397842407, "learning_rate": 1.2650602409638555e-05, "loss": 0.1589, "step": 210 }, { "epoch": 0.01990905183140653, "grad_norm": 0.7250511646270752, "learning_rate": 1.3253012048192772e-05, "loss": 0.1567, "step": 220 }, { "epoch": 0.020814008732834097, "grad_norm": 0.881618320941925, "learning_rate": 1.3855421686746989e-05, "loss": 0.1554, "step": 230 }, { "epoch": 0.021718965634261667, "grad_norm": 0.7235729694366455, "learning_rate": 1.4457831325301207e-05, "loss": 0.1594, "step": 240 }, { "epoch": 0.022623922535689237, "grad_norm": 0.6479834318161011, "learning_rate": 1.5060240963855424e-05, "loss": 0.1554, "step": 250 }, { "epoch": 0.023528879437116807, "grad_norm": 0.8083927035331726, "learning_rate": 1.566265060240964e-05, "loss": 0.1526, "step": 260 }, { "epoch": 0.024433836338544377, "grad_norm": 1.1908012628555298, "learning_rate": 1.6265060240963857e-05, "loss": 0.1525, "step": 270 }, { "epoch": 0.025338793239971947, "grad_norm": 0.944805920124054, "learning_rate": 1.6867469879518076e-05, "loss": 0.1583, "step": 280 }, { "epoch": 0.026243750141399517, "grad_norm": 0.6904934048652649, "learning_rate": 1.746987951807229e-05, "loss": 0.1531, "step": 290 }, { "epoch": 0.027148707042827087, "grad_norm": 1.380239486694336, "learning_rate": 1.807228915662651e-05, "loss": 0.1483, "step": 300 }, { "epoch": 0.028053663944254653, "grad_norm": 0.4651995897293091, "learning_rate": 1.8674698795180725e-05, "loss": 0.1498, "step": 310 }, { "epoch": 0.028958620845682223, "grad_norm": 0.6768488883972168, "learning_rate": 1.927710843373494e-05, "loss": 0.152, "step": 320 }, { "epoch": 0.029863577747109793, "grad_norm": 0.6283469796180725, "learning_rate": 1.987951807228916e-05, "loss": 0.1464, "step": 330 }, { "epoch": 0.030768534648537363, "grad_norm": 0.7869206070899963, "learning_rate": 1.999997250700714e-05, "loss": 0.1454, "step": 340 }, { "epoch": 0.03167349154996493, "grad_norm": 1.3408620357513428, "learning_rate": 1.9999860816982734e-05, "loss": 0.154, "step": 350 }, { "epoch": 0.0325784484513925, "grad_norm": 1.0676746368408203, "learning_rate": 1.9999663212573584e-05, "loss": 0.1482, "step": 360 }, { "epoch": 0.03348340535282007, "grad_norm": 0.7311077117919922, "learning_rate": 1.9999379695477417e-05, "loss": 0.1486, "step": 370 }, { "epoch": 0.03438836225424764, "grad_norm": 0.8585270643234253, "learning_rate": 1.999901026813009e-05, "loss": 0.1463, "step": 380 }, { "epoch": 0.03529331915567521, "grad_norm": 0.4578467607498169, "learning_rate": 1.9998554933705552e-05, "loss": 0.1407, "step": 390 }, { "epoch": 0.03619827605710278, "grad_norm": 0.6237673163414001, "learning_rate": 1.9998013696115847e-05, "loss": 0.1463, "step": 400 }, { "epoch": 0.03710323295853035, "grad_norm": 0.49388837814331055, "learning_rate": 1.999738656001104e-05, "loss": 0.1498, "step": 410 }, { "epoch": 0.03800818985995792, "grad_norm": 0.9949780106544495, "learning_rate": 1.999667353077921e-05, "loss": 0.1457, "step": 420 }, { "epoch": 0.03891314676138549, "grad_norm": 0.792772114276886, "learning_rate": 1.9995874614546386e-05, "loss": 0.1567, "step": 430 }, { "epoch": 0.03981810366281306, "grad_norm": 0.45337289571762085, "learning_rate": 1.9994989818176507e-05, "loss": 0.1444, "step": 440 }, { "epoch": 0.040723060564240625, "grad_norm": 0.5384504795074463, "learning_rate": 1.9994019149271357e-05, "loss": 0.1464, "step": 450 }, { "epoch": 0.041628017465668195, "grad_norm": 0.4586544334888458, "learning_rate": 1.9992962616170485e-05, "loss": 0.1366, "step": 460 }, { "epoch": 0.042532974367095765, "grad_norm": 1.124829888343811, "learning_rate": 1.999182022795116e-05, "loss": 0.1441, "step": 470 }, { "epoch": 0.043437931268523335, "grad_norm": 7.702340602874756, "learning_rate": 1.9990591994428278e-05, "loss": 0.1434, "step": 480 }, { "epoch": 0.044342888169950904, "grad_norm": 0.600829541683197, "learning_rate": 1.9989277926154273e-05, "loss": 0.1554, "step": 490 }, { "epoch": 0.045247845071378474, "grad_norm": 0.525310218334198, "learning_rate": 1.9987878034419047e-05, "loss": 0.1524, "step": 500 }, { "epoch": 0.046152801972806044, "grad_norm": 1.6653215885162354, "learning_rate": 1.998639233124985e-05, "loss": 0.1504, "step": 510 }, { "epoch": 0.047057758874233614, "grad_norm": 0.5178505778312683, "learning_rate": 1.998482082941118e-05, "loss": 0.1462, "step": 520 }, { "epoch": 0.047962715775661184, "grad_norm": 0.524719774723053, "learning_rate": 1.9983163542404694e-05, "loss": 0.1482, "step": 530 }, { "epoch": 0.048867672677088754, "grad_norm": 0.4859486520290375, "learning_rate": 1.9981420484469062e-05, "loss": 0.1504, "step": 540 }, { "epoch": 0.049772629578516324, "grad_norm": 0.6776233315467834, "learning_rate": 1.997959167057988e-05, "loss": 0.1494, "step": 550 }, { "epoch": 0.050677586479943894, "grad_norm": 0.6861289143562317, "learning_rate": 1.9977677116449494e-05, "loss": 0.1492, "step": 560 }, { "epoch": 0.051582543381371464, "grad_norm": 0.4854241609573364, "learning_rate": 1.9975676838526914e-05, "loss": 0.1437, "step": 570 }, { "epoch": 0.052487500282799034, "grad_norm": 0.37351515889167786, "learning_rate": 1.9973590853997646e-05, "loss": 0.146, "step": 580 }, { "epoch": 0.053392457184226604, "grad_norm": 0.5376414060592651, "learning_rate": 1.997141918078354e-05, "loss": 0.1445, "step": 590 }, { "epoch": 0.05429741408565417, "grad_norm": 1.200066328048706, "learning_rate": 1.996916183754266e-05, "loss": 0.1407, "step": 600 }, { "epoch": 0.05520237098708174, "grad_norm": 0.5804212689399719, "learning_rate": 1.9966818843669097e-05, "loss": 0.1482, "step": 610 }, { "epoch": 0.056107327888509306, "grad_norm": 0.5233150124549866, "learning_rate": 1.9964390219292823e-05, "loss": 0.1423, "step": 620 }, { "epoch": 0.057012284789936876, "grad_norm": 0.7850412130355835, "learning_rate": 1.9961875985279503e-05, "loss": 0.1436, "step": 630 }, { "epoch": 0.057917241691364446, "grad_norm": 0.46773582696914673, "learning_rate": 1.9959276163230325e-05, "loss": 0.136, "step": 640 }, { "epoch": 0.058822198592792016, "grad_norm": 0.47068318724632263, "learning_rate": 1.9956590775481808e-05, "loss": 0.1477, "step": 650 }, { "epoch": 0.059727155494219586, "grad_norm": 0.7239755988121033, "learning_rate": 1.9953819845105616e-05, "loss": 0.1414, "step": 660 }, { "epoch": 0.060632112395647156, "grad_norm": 0.4646810293197632, "learning_rate": 1.9950963395908368e-05, "loss": 0.1433, "step": 670 }, { "epoch": 0.061537069297074726, "grad_norm": 0.5105672478675842, "learning_rate": 1.99480214524314e-05, "loss": 0.1381, "step": 680 }, { "epoch": 0.062442026198502296, "grad_norm": 0.4857189655303955, "learning_rate": 1.99449940399506e-05, "loss": 0.1355, "step": 690 }, { "epoch": 0.06334698309992987, "grad_norm": 0.7053922414779663, "learning_rate": 1.9941881184476154e-05, "loss": 0.1402, "step": 700 }, { "epoch": 0.06425194000135744, "grad_norm": 0.6095537543296814, "learning_rate": 1.9938682912752343e-05, "loss": 0.1424, "step": 710 }, { "epoch": 0.065156896902785, "grad_norm": 0.8942661285400391, "learning_rate": 1.99353992522573e-05, "loss": 0.1337, "step": 720 }, { "epoch": 0.06606185380421258, "grad_norm": 0.8702712059020996, "learning_rate": 1.9932030231202786e-05, "loss": 0.146, "step": 730 }, { "epoch": 0.06696681070564015, "grad_norm": 0.6549976468086243, "learning_rate": 1.9928575878533946e-05, "loss": 0.1389, "step": 740 }, { "epoch": 0.06787176760706772, "grad_norm": 0.56646329164505, "learning_rate": 1.9925036223929045e-05, "loss": 0.1399, "step": 750 }, { "epoch": 0.06877672450849528, "grad_norm": 0.5389584898948669, "learning_rate": 1.9921411297799233e-05, "loss": 0.1398, "step": 760 }, { "epoch": 0.06968168140992285, "grad_norm": 0.7392621636390686, "learning_rate": 1.9917701131288274e-05, "loss": 0.1436, "step": 770 }, { "epoch": 0.07058663831135042, "grad_norm": 0.5267114639282227, "learning_rate": 1.991390575627228e-05, "loss": 0.1398, "step": 780 }, { "epoch": 0.071491595212778, "grad_norm": 0.543932318687439, "learning_rate": 1.9910025205359434e-05, "loss": 0.1469, "step": 790 }, { "epoch": 0.07239655211420556, "grad_norm": 0.5490307211875916, "learning_rate": 1.990605951188972e-05, "loss": 0.1342, "step": 800 }, { "epoch": 0.07330150901563313, "grad_norm": 0.4569859206676483, "learning_rate": 1.990200870993461e-05, "loss": 0.1432, "step": 810 }, { "epoch": 0.0742064659170607, "grad_norm": 0.5565773844718933, "learning_rate": 1.9897872834296816e-05, "loss": 0.1465, "step": 820 }, { "epoch": 0.07511142281848827, "grad_norm": 0.5516616106033325, "learning_rate": 1.989365192050995e-05, "loss": 0.1434, "step": 830 }, { "epoch": 0.07601637971991584, "grad_norm": 0.8122782707214355, "learning_rate": 1.988934600483824e-05, "loss": 0.1424, "step": 840 }, { "epoch": 0.07692133662134341, "grad_norm": 0.4941742420196533, "learning_rate": 1.9884955124276214e-05, "loss": 0.1437, "step": 850 }, { "epoch": 0.07782629352277098, "grad_norm": 0.68223637342453, "learning_rate": 1.9880479316548365e-05, "loss": 0.1366, "step": 860 }, { "epoch": 0.07873125042419855, "grad_norm": 0.7050871253013611, "learning_rate": 1.9875918620108867e-05, "loss": 0.1358, "step": 870 }, { "epoch": 0.07963620732562612, "grad_norm": 0.44282448291778564, "learning_rate": 1.9871273074141197e-05, "loss": 0.1384, "step": 880 }, { "epoch": 0.08054116422705368, "grad_norm": 0.6209523677825928, "learning_rate": 1.9866542718557844e-05, "loss": 0.1389, "step": 890 }, { "epoch": 0.08144612112848125, "grad_norm": 0.49669986963272095, "learning_rate": 1.9861727593999927e-05, "loss": 0.1298, "step": 900 }, { "epoch": 0.08235107802990882, "grad_norm": 0.4034930467605591, "learning_rate": 1.985682774183687e-05, "loss": 0.139, "step": 910 }, { "epoch": 0.08325603493133639, "grad_norm": 0.42605486512184143, "learning_rate": 1.985184320416603e-05, "loss": 0.1359, "step": 920 }, { "epoch": 0.08416099183276396, "grad_norm": 0.5757314562797546, "learning_rate": 1.9846774023812366e-05, "loss": 0.1412, "step": 930 }, { "epoch": 0.08506594873419153, "grad_norm": 0.6896102428436279, "learning_rate": 1.984162024432802e-05, "loss": 0.1322, "step": 940 }, { "epoch": 0.0859709056356191, "grad_norm": 0.4264814257621765, "learning_rate": 1.9836381909992e-05, "loss": 0.1413, "step": 950 }, { "epoch": 0.08687586253704667, "grad_norm": 0.4684106111526489, "learning_rate": 1.9831059065809756e-05, "loss": 0.1373, "step": 960 }, { "epoch": 0.08778081943847424, "grad_norm": 0.5796085000038147, "learning_rate": 1.9825651757512808e-05, "loss": 0.1357, "step": 970 }, { "epoch": 0.08868577633990181, "grad_norm": 0.5410571098327637, "learning_rate": 1.9820160031558365e-05, "loss": 0.1364, "step": 980 }, { "epoch": 0.08959073324132938, "grad_norm": 0.45756787061691284, "learning_rate": 1.9814583935128902e-05, "loss": 0.1425, "step": 990 }, { "epoch": 0.09049569014275695, "grad_norm": 0.9066041111946106, "learning_rate": 1.9808923516131787e-05, "loss": 0.1367, "step": 1000 }, { "epoch": 0.09140064704418452, "grad_norm": 0.49664556980133057, "learning_rate": 1.9803178823198826e-05, "loss": 0.1382, "step": 1010 }, { "epoch": 0.09230560394561209, "grad_norm": 0.3994297683238983, "learning_rate": 1.979734990568589e-05, "loss": 0.1357, "step": 1020 }, { "epoch": 0.09321056084703966, "grad_norm": 0.4870229959487915, "learning_rate": 1.979143681367246e-05, "loss": 0.1387, "step": 1030 }, { "epoch": 0.09411551774846723, "grad_norm": 0.5828505754470825, "learning_rate": 1.9785439597961207e-05, "loss": 0.1388, "step": 1040 }, { "epoch": 0.0950204746498948, "grad_norm": 0.4454402029514313, "learning_rate": 1.977935831007756e-05, "loss": 0.1394, "step": 1050 }, { "epoch": 0.09592543155132237, "grad_norm": 0.6127322912216187, "learning_rate": 1.977319300226926e-05, "loss": 0.1394, "step": 1060 }, { "epoch": 0.09683038845274994, "grad_norm": 0.42766839265823364, "learning_rate": 1.97669437275059e-05, "loss": 0.1379, "step": 1070 }, { "epoch": 0.09773534535417751, "grad_norm": 0.556703507900238, "learning_rate": 1.9760610539478492e-05, "loss": 0.1336, "step": 1080 }, { "epoch": 0.09864030225560508, "grad_norm": 0.4200476408004761, "learning_rate": 1.9754193492598985e-05, "loss": 0.1398, "step": 1090 }, { "epoch": 0.09954525915703265, "grad_norm": 0.5154082179069519, "learning_rate": 1.9747692641999815e-05, "loss": 0.1391, "step": 1100 }, { "epoch": 0.10045021605846022, "grad_norm": 0.4285774230957031, "learning_rate": 1.9741108043533416e-05, "loss": 0.1405, "step": 1110 }, { "epoch": 0.10135517295988779, "grad_norm": 0.37950581312179565, "learning_rate": 1.9734439753771742e-05, "loss": 0.1399, "step": 1120 }, { "epoch": 0.10226012986131536, "grad_norm": 0.4357180893421173, "learning_rate": 1.9727687830005795e-05, "loss": 0.1354, "step": 1130 }, { "epoch": 0.10316508676274293, "grad_norm": 0.341791570186615, "learning_rate": 1.9720852330245127e-05, "loss": 0.1368, "step": 1140 }, { "epoch": 0.1040700436641705, "grad_norm": 0.37262991070747375, "learning_rate": 1.971393331321732e-05, "loss": 0.1406, "step": 1150 }, { "epoch": 0.10497500056559807, "grad_norm": 0.5309743881225586, "learning_rate": 1.9706930838367517e-05, "loss": 0.1386, "step": 1160 }, { "epoch": 0.10587995746702564, "grad_norm": 0.4703729748725891, "learning_rate": 1.9699844965857884e-05, "loss": 0.1457, "step": 1170 }, { "epoch": 0.10678491436845321, "grad_norm": 0.43350547552108765, "learning_rate": 1.969267575656711e-05, "loss": 0.1429, "step": 1180 }, { "epoch": 0.10768987126988078, "grad_norm": 0.5514594316482544, "learning_rate": 1.968542327208987e-05, "loss": 0.14, "step": 1190 }, { "epoch": 0.10859482817130835, "grad_norm": 0.7761058211326599, "learning_rate": 1.9678087574736305e-05, "loss": 0.1361, "step": 1200 }, { "epoch": 0.10949978507273592, "grad_norm": 0.4929139316082001, "learning_rate": 1.9670668727531486e-05, "loss": 0.1382, "step": 1210 }, { "epoch": 0.11040474197416349, "grad_norm": 0.5243300199508667, "learning_rate": 1.9663166794214868e-05, "loss": 0.1443, "step": 1220 }, { "epoch": 0.11130969887559106, "grad_norm": 0.3967302739620209, "learning_rate": 1.965558183923975e-05, "loss": 0.1359, "step": 1230 }, { "epoch": 0.11221465577701861, "grad_norm": 0.6645998954772949, "learning_rate": 1.9647913927772708e-05, "loss": 0.1422, "step": 1240 }, { "epoch": 0.11311961267844618, "grad_norm": 0.28883126378059387, "learning_rate": 1.9640163125693053e-05, "loss": 0.1397, "step": 1250 }, { "epoch": 0.11402456957987375, "grad_norm": 0.48156359791755676, "learning_rate": 1.9632329499592248e-05, "loss": 0.141, "step": 1260 }, { "epoch": 0.11492952648130132, "grad_norm": 0.9770751595497131, "learning_rate": 1.962441311677335e-05, "loss": 0.1343, "step": 1270 }, { "epoch": 0.11583448338272889, "grad_norm": 0.675937831401825, "learning_rate": 1.9616414045250417e-05, "loss": 0.143, "step": 1280 }, { "epoch": 0.11673944028415646, "grad_norm": 0.5480718016624451, "learning_rate": 1.960833235374794e-05, "loss": 0.1391, "step": 1290 }, { "epoch": 0.11764439718558403, "grad_norm": 0.42649492621421814, "learning_rate": 1.960016811170024e-05, "loss": 0.1331, "step": 1300 }, { "epoch": 0.1185493540870116, "grad_norm": 0.3937451243400574, "learning_rate": 1.9591921389250872e-05, "loss": 0.1406, "step": 1310 }, { "epoch": 0.11945431098843917, "grad_norm": 0.5013184547424316, "learning_rate": 1.958359225725204e-05, "loss": 0.1363, "step": 1320 }, { "epoch": 0.12035926788986674, "grad_norm": 0.475367933511734, "learning_rate": 1.9575180787263955e-05, "loss": 0.1368, "step": 1330 }, { "epoch": 0.12126422479129431, "grad_norm": 0.4039798676967621, "learning_rate": 1.956668705155426e-05, "loss": 0.1398, "step": 1340 }, { "epoch": 0.12216918169272188, "grad_norm": 0.5868827104568481, "learning_rate": 1.955811112309737e-05, "loss": 0.1373, "step": 1350 }, { "epoch": 0.12307413859414945, "grad_norm": 0.4204873740673065, "learning_rate": 1.9549453075573873e-05, "loss": 0.1385, "step": 1360 }, { "epoch": 0.12397909549557702, "grad_norm": 0.5889093279838562, "learning_rate": 1.954071298336989e-05, "loss": 0.1326, "step": 1370 }, { "epoch": 0.12488405239700459, "grad_norm": 0.4504663050174713, "learning_rate": 1.9531890921576425e-05, "loss": 0.1371, "step": 1380 }, { "epoch": 0.12578900929843218, "grad_norm": 0.40761178731918335, "learning_rate": 1.9522986965988748e-05, "loss": 0.1336, "step": 1390 }, { "epoch": 0.12669396619985973, "grad_norm": 0.46256113052368164, "learning_rate": 1.9514001193105693e-05, "loss": 0.1351, "step": 1400 }, { "epoch": 0.12759892310128731, "grad_norm": 0.39085566997528076, "learning_rate": 1.9504933680129063e-05, "loss": 0.1347, "step": 1410 }, { "epoch": 0.12850388000271487, "grad_norm": 0.6131863594055176, "learning_rate": 1.9495784504962913e-05, "loss": 0.1356, "step": 1420 }, { "epoch": 0.12940883690414243, "grad_norm": 0.4904513955116272, "learning_rate": 1.9486553746212915e-05, "loss": 0.1365, "step": 1430 }, { "epoch": 0.13031379380557, "grad_norm": 0.2950369417667389, "learning_rate": 1.9477241483185675e-05, "loss": 0.1374, "step": 1440 }, { "epoch": 0.13121875070699757, "grad_norm": 0.4332665503025055, "learning_rate": 1.946784779588803e-05, "loss": 0.1361, "step": 1450 }, { "epoch": 0.13212370760842515, "grad_norm": 0.3877945840358734, "learning_rate": 1.9458372765026402e-05, "loss": 0.1332, "step": 1460 }, { "epoch": 0.1330286645098527, "grad_norm": 0.35281670093536377, "learning_rate": 1.9448816472006057e-05, "loss": 0.1406, "step": 1470 }, { "epoch": 0.1339336214112803, "grad_norm": 0.5009306073188782, "learning_rate": 1.943917899893045e-05, "loss": 0.1341, "step": 1480 }, { "epoch": 0.13483857831270785, "grad_norm": 0.7299574613571167, "learning_rate": 1.9429460428600485e-05, "loss": 0.1337, "step": 1490 }, { "epoch": 0.13574353521413543, "grad_norm": 0.4611435830593109, "learning_rate": 1.9419660844513828e-05, "loss": 0.1438, "step": 1500 }, { "epoch": 0.136648492115563, "grad_norm": 0.4176791310310364, "learning_rate": 1.940978033086417e-05, "loss": 0.1399, "step": 1510 }, { "epoch": 0.13755344901699057, "grad_norm": 0.3517489731311798, "learning_rate": 1.9399818972540526e-05, "loss": 0.1333, "step": 1520 }, { "epoch": 0.13845840591841813, "grad_norm": 0.4304490387439728, "learning_rate": 1.9389776855126472e-05, "loss": 0.1416, "step": 1530 }, { "epoch": 0.1393633628198457, "grad_norm": 0.57242751121521, "learning_rate": 1.937965406489945e-05, "loss": 0.1375, "step": 1540 }, { "epoch": 0.14026831972127327, "grad_norm": 0.4773651361465454, "learning_rate": 1.936945068883e-05, "loss": 0.1281, "step": 1550 }, { "epoch": 0.14117327662270085, "grad_norm": 0.4251551926136017, "learning_rate": 1.9359166814581017e-05, "loss": 0.1368, "step": 1560 }, { "epoch": 0.1420782335241284, "grad_norm": 0.7797797918319702, "learning_rate": 1.9348802530507003e-05, "loss": 0.1363, "step": 1570 }, { "epoch": 0.142983190425556, "grad_norm": 1.2365995645523071, "learning_rate": 1.9338357925653312e-05, "loss": 0.1344, "step": 1580 }, { "epoch": 0.14388814732698355, "grad_norm": 0.6574030518531799, "learning_rate": 1.932783308975537e-05, "loss": 0.1419, "step": 1590 }, { "epoch": 0.14479310422841113, "grad_norm": 0.6940555572509766, "learning_rate": 1.9317228113237916e-05, "loss": 0.1427, "step": 1600 }, { "epoch": 0.14569806112983869, "grad_norm": 0.4072614014148712, "learning_rate": 1.9306543087214215e-05, "loss": 0.1306, "step": 1610 }, { "epoch": 0.14660301803126627, "grad_norm": 0.37100809812545776, "learning_rate": 1.9295778103485297e-05, "loss": 0.1368, "step": 1620 }, { "epoch": 0.14750797493269382, "grad_norm": 0.3309537470340729, "learning_rate": 1.9284933254539143e-05, "loss": 0.1319, "step": 1630 }, { "epoch": 0.1484129318341214, "grad_norm": 0.5338348746299744, "learning_rate": 1.9274008633549905e-05, "loss": 0.1321, "step": 1640 }, { "epoch": 0.14931788873554896, "grad_norm": 0.81573086977005, "learning_rate": 1.9263004334377087e-05, "loss": 0.1332, "step": 1650 }, { "epoch": 0.15022284563697655, "grad_norm": 0.5445601344108582, "learning_rate": 1.9251920451564773e-05, "loss": 0.1335, "step": 1660 }, { "epoch": 0.1511278025384041, "grad_norm": 0.5735446810722351, "learning_rate": 1.9240757080340787e-05, "loss": 0.1432, "step": 1670 }, { "epoch": 0.1520327594398317, "grad_norm": 0.35098642110824585, "learning_rate": 1.9229514316615875e-05, "loss": 0.1397, "step": 1680 }, { "epoch": 0.15293771634125924, "grad_norm": 0.5715315341949463, "learning_rate": 1.9218192256982898e-05, "loss": 0.1356, "step": 1690 }, { "epoch": 0.15384267324268683, "grad_norm": 0.4406200051307678, "learning_rate": 1.920679099871599e-05, "loss": 0.1348, "step": 1700 }, { "epoch": 0.15474763014411438, "grad_norm": 0.3725470304489136, "learning_rate": 1.919531063976972e-05, "loss": 0.1374, "step": 1710 }, { "epoch": 0.15565258704554197, "grad_norm": 0.2997819781303406, "learning_rate": 1.918375127877826e-05, "loss": 0.1332, "step": 1720 }, { "epoch": 0.15655754394696952, "grad_norm": 0.40308165550231934, "learning_rate": 1.917211301505453e-05, "loss": 0.1331, "step": 1730 }, { "epoch": 0.1574625008483971, "grad_norm": 0.437809020280838, "learning_rate": 1.916039594858935e-05, "loss": 0.1398, "step": 1740 }, { "epoch": 0.15836745774982466, "grad_norm": 0.8596120476722717, "learning_rate": 1.914860018005058e-05, "loss": 0.1358, "step": 1750 }, { "epoch": 0.15927241465125225, "grad_norm": 0.6408957242965698, "learning_rate": 1.913672581078224e-05, "loss": 0.1334, "step": 1760 }, { "epoch": 0.1601773715526798, "grad_norm": 0.49583616852760315, "learning_rate": 1.912477294280367e-05, "loss": 0.1404, "step": 1770 }, { "epoch": 0.16108232845410736, "grad_norm": 0.5075445175170898, "learning_rate": 1.911274167880863e-05, "loss": 0.1372, "step": 1780 }, { "epoch": 0.16198728535553494, "grad_norm": 0.4242574870586395, "learning_rate": 1.9100632122164423e-05, "loss": 0.1377, "step": 1790 }, { "epoch": 0.1628922422569625, "grad_norm": 0.5238597989082336, "learning_rate": 1.9088444376911002e-05, "loss": 0.1427, "step": 1800 }, { "epoch": 0.16379719915839008, "grad_norm": 0.43989208340644836, "learning_rate": 1.9076178547760095e-05, "loss": 0.1317, "step": 1810 }, { "epoch": 0.16470215605981764, "grad_norm": 0.426861047744751, "learning_rate": 1.9063834740094284e-05, "loss": 0.1375, "step": 1820 }, { "epoch": 0.16560711296124522, "grad_norm": 0.602488100528717, "learning_rate": 1.90514130599661e-05, "loss": 0.1319, "step": 1830 }, { "epoch": 0.16651206986267278, "grad_norm": 0.3982050120830536, "learning_rate": 1.9038913614097142e-05, "loss": 0.1371, "step": 1840 }, { "epoch": 0.16741702676410036, "grad_norm": 0.33180946111679077, "learning_rate": 1.902633650987712e-05, "loss": 0.1328, "step": 1850 }, { "epoch": 0.16832198366552792, "grad_norm": 0.5042048096656799, "learning_rate": 1.9013681855362952e-05, "loss": 0.1342, "step": 1860 }, { "epoch": 0.1692269405669555, "grad_norm": 0.35587260127067566, "learning_rate": 1.9000949759277844e-05, "loss": 0.1436, "step": 1870 }, { "epoch": 0.17013189746838306, "grad_norm": 0.4799160361289978, "learning_rate": 1.898814033101033e-05, "loss": 0.1407, "step": 1880 }, { "epoch": 0.17103685436981064, "grad_norm": 0.4712836742401123, "learning_rate": 1.897525368061336e-05, "loss": 0.1361, "step": 1890 }, { "epoch": 0.1719418112712382, "grad_norm": 0.45217564702033997, "learning_rate": 1.896228991880334e-05, "loss": 0.1399, "step": 1900 }, { "epoch": 0.17284676817266578, "grad_norm": 0.4772576689720154, "learning_rate": 1.8949249156959185e-05, "loss": 0.1426, "step": 1910 }, { "epoch": 0.17375172507409334, "grad_norm": 0.3817451298236847, "learning_rate": 1.893613150712135e-05, "loss": 0.1332, "step": 1920 }, { "epoch": 0.17465668197552092, "grad_norm": 0.35645753145217896, "learning_rate": 1.892293708199089e-05, "loss": 0.1274, "step": 1930 }, { "epoch": 0.17556163887694848, "grad_norm": 0.5715720057487488, "learning_rate": 1.8909665994928478e-05, "loss": 0.1354, "step": 1940 }, { "epoch": 0.17646659577837606, "grad_norm": 0.4831380248069763, "learning_rate": 1.889631835995342e-05, "loss": 0.1353, "step": 1950 }, { "epoch": 0.17737155267980362, "grad_norm": 0.5451020002365112, "learning_rate": 1.8882894291742703e-05, "loss": 0.1407, "step": 1960 }, { "epoch": 0.1782765095812312, "grad_norm": 0.5351752638816833, "learning_rate": 1.886939390562999e-05, "loss": 0.1353, "step": 1970 }, { "epoch": 0.17918146648265876, "grad_norm": 0.546170175075531, "learning_rate": 1.8855817317604622e-05, "loss": 0.1382, "step": 1980 }, { "epoch": 0.18008642338408634, "grad_norm": 0.38834547996520996, "learning_rate": 1.8842164644310657e-05, "loss": 0.1289, "step": 1990 }, { "epoch": 0.1809913802855139, "grad_norm": 0.3078169524669647, "learning_rate": 1.882843600304582e-05, "loss": 0.1291, "step": 2000 }, { "epoch": 0.18189633718694148, "grad_norm": 0.36554184556007385, "learning_rate": 1.8814631511760535e-05, "loss": 0.1445, "step": 2010 }, { "epoch": 0.18280129408836904, "grad_norm": 0.5817809700965881, "learning_rate": 1.8800751289056885e-05, "loss": 0.1396, "step": 2020 }, { "epoch": 0.18370625098979662, "grad_norm": 0.873615562915802, "learning_rate": 1.8786795454187615e-05, "loss": 0.1367, "step": 2030 }, { "epoch": 0.18461120789122418, "grad_norm": 0.7153456807136536, "learning_rate": 1.8772764127055087e-05, "loss": 0.1344, "step": 2040 }, { "epoch": 0.18551616479265176, "grad_norm": 0.28874504566192627, "learning_rate": 1.8758657428210266e-05, "loss": 0.1375, "step": 2050 }, { "epoch": 0.18642112169407932, "grad_norm": 0.41073450446128845, "learning_rate": 1.8744475478851667e-05, "loss": 0.1392, "step": 2060 }, { "epoch": 0.1873260785955069, "grad_norm": 0.4002520740032196, "learning_rate": 1.8730218400824337e-05, "loss": 0.1334, "step": 2070 }, { "epoch": 0.18823103549693446, "grad_norm": 0.49055391550064087, "learning_rate": 1.871588631661879e-05, "loss": 0.1398, "step": 2080 }, { "epoch": 0.18913599239836204, "grad_norm": 0.3669786751270294, "learning_rate": 1.8701479349369957e-05, "loss": 0.1309, "step": 2090 }, { "epoch": 0.1900409492997896, "grad_norm": 0.4466594159603119, "learning_rate": 1.8686997622856134e-05, "loss": 0.1361, "step": 2100 }, { "epoch": 0.19094590620121718, "grad_norm": 0.37340307235717773, "learning_rate": 1.8672441261497915e-05, "loss": 0.1314, "step": 2110 }, { "epoch": 0.19185086310264474, "grad_norm": 0.35569268465042114, "learning_rate": 1.8657810390357126e-05, "loss": 0.1385, "step": 2120 }, { "epoch": 0.1927558200040723, "grad_norm": 0.8156322240829468, "learning_rate": 1.8643105135135743e-05, "loss": 0.1358, "step": 2130 }, { "epoch": 0.19366077690549988, "grad_norm": 0.4480380117893219, "learning_rate": 1.8628325622174818e-05, "loss": 0.1367, "step": 2140 }, { "epoch": 0.19456573380692743, "grad_norm": 0.5346203446388245, "learning_rate": 1.86134719784534e-05, "loss": 0.1367, "step": 2150 }, { "epoch": 0.19547069070835502, "grad_norm": 0.3813647925853729, "learning_rate": 1.8598544331587427e-05, "loss": 0.1386, "step": 2160 }, { "epoch": 0.19637564760978257, "grad_norm": 0.5341768860816956, "learning_rate": 1.858354280982865e-05, "loss": 0.1298, "step": 2170 }, { "epoch": 0.19728060451121016, "grad_norm": 0.3788771331310272, "learning_rate": 1.8568467542063505e-05, "loss": 0.1416, "step": 2180 }, { "epoch": 0.1981855614126377, "grad_norm": 0.28519588708877563, "learning_rate": 1.8553318657812035e-05, "loss": 0.1336, "step": 2190 }, { "epoch": 0.1990905183140653, "grad_norm": 0.38430145382881165, "learning_rate": 1.853809628722676e-05, "loss": 0.1346, "step": 2200 }, { "epoch": 0.19999547521549285, "grad_norm": 0.3612518906593323, "learning_rate": 1.8522800561091556e-05, "loss": 0.1344, "step": 2210 }, { "epoch": 0.20090043211692044, "grad_norm": 0.36791539192199707, "learning_rate": 1.8507431610820547e-05, "loss": 0.1345, "step": 2220 }, { "epoch": 0.201805389018348, "grad_norm": 0.8380405902862549, "learning_rate": 1.8491989568456962e-05, "loss": 0.1343, "step": 2230 }, { "epoch": 0.20271034591977558, "grad_norm": 0.4786330461502075, "learning_rate": 1.8476474566671995e-05, "loss": 0.1409, "step": 2240 }, { "epoch": 0.20361530282120313, "grad_norm": 0.5574005842208862, "learning_rate": 1.8460886738763698e-05, "loss": 0.1324, "step": 2250 }, { "epoch": 0.20452025972263072, "grad_norm": 0.3608987033367157, "learning_rate": 1.8445226218655787e-05, "loss": 0.1429, "step": 2260 }, { "epoch": 0.20542521662405827, "grad_norm": 0.3293229639530182, "learning_rate": 1.842949314089654e-05, "loss": 0.1332, "step": 2270 }, { "epoch": 0.20633017352548585, "grad_norm": 0.594818651676178, "learning_rate": 1.8413687640657602e-05, "loss": 0.1354, "step": 2280 }, { "epoch": 0.2072351304269134, "grad_norm": 0.3503284156322479, "learning_rate": 1.8397809853732846e-05, "loss": 0.1373, "step": 2290 }, { "epoch": 0.208140087328341, "grad_norm": 0.6148970127105713, "learning_rate": 1.8381859916537204e-05, "loss": 0.1435, "step": 2300 }, { "epoch": 0.20904504422976855, "grad_norm": 0.39734190702438354, "learning_rate": 1.8365837966105486e-05, "loss": 0.1325, "step": 2310 }, { "epoch": 0.20995000113119613, "grad_norm": 0.3820249140262604, "learning_rate": 1.8349744140091205e-05, "loss": 0.1307, "step": 2320 }, { "epoch": 0.2108549580326237, "grad_norm": 0.2815605700016022, "learning_rate": 1.83335785767654e-05, "loss": 0.1395, "step": 2330 }, { "epoch": 0.21175991493405127, "grad_norm": 0.26476067304611206, "learning_rate": 1.831734141501546e-05, "loss": 0.1332, "step": 2340 }, { "epoch": 0.21266487183547883, "grad_norm": 0.4209696352481842, "learning_rate": 1.830103279434389e-05, "loss": 0.1373, "step": 2350 }, { "epoch": 0.21356982873690641, "grad_norm": 0.29990458488464355, "learning_rate": 1.828465285486716e-05, "loss": 0.1345, "step": 2360 }, { "epoch": 0.21447478563833397, "grad_norm": 0.39812973141670227, "learning_rate": 1.826820173731446e-05, "loss": 0.1361, "step": 2370 }, { "epoch": 0.21537974253976155, "grad_norm": 0.35514476895332336, "learning_rate": 1.825167958302653e-05, "loss": 0.1353, "step": 2380 }, { "epoch": 0.2162846994411891, "grad_norm": 0.3101460039615631, "learning_rate": 1.8235086533954418e-05, "loss": 0.1369, "step": 2390 }, { "epoch": 0.2171896563426167, "grad_norm": 0.42703959345817566, "learning_rate": 1.8218422732658263e-05, "loss": 0.1348, "step": 2400 }, { "epoch": 0.21809461324404425, "grad_norm": 0.44410404562950134, "learning_rate": 1.820168832230609e-05, "loss": 0.1292, "step": 2410 }, { "epoch": 0.21899957014547183, "grad_norm": 0.4274084270000458, "learning_rate": 1.8184883446672545e-05, "loss": 0.1325, "step": 2420 }, { "epoch": 0.2199045270468994, "grad_norm": 0.3276132643222809, "learning_rate": 1.81680082501377e-05, "loss": 0.1304, "step": 2430 }, { "epoch": 0.22080948394832697, "grad_norm": 0.28649523854255676, "learning_rate": 1.8151062877685785e-05, "loss": 0.1379, "step": 2440 }, { "epoch": 0.22171444084975453, "grad_norm": 0.3112781345844269, "learning_rate": 1.813404747490395e-05, "loss": 0.1324, "step": 2450 }, { "epoch": 0.2226193977511821, "grad_norm": 0.37147268652915955, "learning_rate": 1.811696218798102e-05, "loss": 0.1362, "step": 2460 }, { "epoch": 0.22352435465260967, "grad_norm": 0.30297672748565674, "learning_rate": 1.8099807163706225e-05, "loss": 0.1382, "step": 2470 }, { "epoch": 0.22442931155403723, "grad_norm": 0.3513183295726776, "learning_rate": 1.808258254946795e-05, "loss": 0.1313, "step": 2480 }, { "epoch": 0.2253342684554648, "grad_norm": 0.2990841865539551, "learning_rate": 1.806528849325248e-05, "loss": 0.1361, "step": 2490 }, { "epoch": 0.22623922535689237, "grad_norm": 0.44124889373779297, "learning_rate": 1.8047925143642685e-05, "loss": 0.1348, "step": 2500 }, { "epoch": 0.22714418225831995, "grad_norm": 0.5837457776069641, "learning_rate": 1.8030492649816807e-05, "loss": 0.1384, "step": 2510 }, { "epoch": 0.2280491391597475, "grad_norm": 0.4553276598453522, "learning_rate": 1.801299116154712e-05, "loss": 0.1345, "step": 2520 }, { "epoch": 0.2289540960611751, "grad_norm": 0.5152564644813538, "learning_rate": 1.7995420829198677e-05, "loss": 0.1319, "step": 2530 }, { "epoch": 0.22985905296260264, "grad_norm": 0.4430082440376282, "learning_rate": 1.7977781803728012e-05, "loss": 0.1352, "step": 2540 }, { "epoch": 0.23076400986403023, "grad_norm": 0.47474417090415955, "learning_rate": 1.7960074236681832e-05, "loss": 0.1387, "step": 2550 }, { "epoch": 0.23166896676545778, "grad_norm": 0.3780491054058075, "learning_rate": 1.7942298280195735e-05, "loss": 0.1369, "step": 2560 }, { "epoch": 0.23257392366688537, "grad_norm": 0.3033972680568695, "learning_rate": 1.7924454086992874e-05, "loss": 0.1297, "step": 2570 }, { "epoch": 0.23347888056831292, "grad_norm": 0.3683450520038605, "learning_rate": 1.7906541810382676e-05, "loss": 0.1318, "step": 2580 }, { "epoch": 0.2343838374697405, "grad_norm": 0.3137243092060089, "learning_rate": 1.78885616042595e-05, "loss": 0.1299, "step": 2590 }, { "epoch": 0.23528879437116806, "grad_norm": 0.6204285621643066, "learning_rate": 1.787051362310134e-05, "loss": 0.1328, "step": 2600 }, { "epoch": 0.23619375127259565, "grad_norm": 0.3297698497772217, "learning_rate": 1.785239802196847e-05, "loss": 0.1266, "step": 2610 }, { "epoch": 0.2370987081740232, "grad_norm": 0.27107271552085876, "learning_rate": 1.7834214956502124e-05, "loss": 0.1386, "step": 2620 }, { "epoch": 0.2380036650754508, "grad_norm": 0.3303036093711853, "learning_rate": 1.781596458292317e-05, "loss": 0.1314, "step": 2630 }, { "epoch": 0.23890862197687834, "grad_norm": 0.2860264480113983, "learning_rate": 1.7797647058030748e-05, "loss": 0.1341, "step": 2640 }, { "epoch": 0.23981357887830593, "grad_norm": 0.333578497171402, "learning_rate": 1.7779262539200937e-05, "loss": 0.134, "step": 2650 }, { "epoch": 0.24071853577973348, "grad_norm": 0.3807545602321625, "learning_rate": 1.7760811184385406e-05, "loss": 0.1327, "step": 2660 }, { "epoch": 0.24162349268116107, "grad_norm": 0.3607015013694763, "learning_rate": 1.7742293152110033e-05, "loss": 0.1356, "step": 2670 }, { "epoch": 0.24252844958258862, "grad_norm": 0.4713596701622009, "learning_rate": 1.7723708601473566e-05, "loss": 0.1371, "step": 2680 }, { "epoch": 0.2434334064840162, "grad_norm": 0.5265551805496216, "learning_rate": 1.7705057692146258e-05, "loss": 0.1293, "step": 2690 }, { "epoch": 0.24433836338544376, "grad_norm": 0.35034945607185364, "learning_rate": 1.768634058436847e-05, "loss": 0.1354, "step": 2700 }, { "epoch": 0.24524332028687135, "grad_norm": 0.3199910819530487, "learning_rate": 1.7667557438949328e-05, "loss": 0.1411, "step": 2710 }, { "epoch": 0.2461482771882989, "grad_norm": 0.379367858171463, "learning_rate": 1.7648708417265314e-05, "loss": 0.1345, "step": 2720 }, { "epoch": 0.2470532340897265, "grad_norm": 0.5331190824508667, "learning_rate": 1.7629793681258892e-05, "loss": 0.133, "step": 2730 }, { "epoch": 0.24795819099115404, "grad_norm": 0.29094645380973816, "learning_rate": 1.761081339343711e-05, "loss": 0.1288, "step": 2740 }, { "epoch": 0.24886314789258163, "grad_norm": 0.47043576836586, "learning_rate": 1.759176771687022e-05, "loss": 0.1368, "step": 2750 }, { "epoch": 0.24976810479400918, "grad_norm": 0.8411908149719238, "learning_rate": 1.7572656815190253e-05, "loss": 0.1326, "step": 2760 }, { "epoch": 0.25067306169543674, "grad_norm": 0.39577701687812805, "learning_rate": 1.7553480852589635e-05, "loss": 0.1336, "step": 2770 }, { "epoch": 0.25157801859686435, "grad_norm": 0.5741309523582458, "learning_rate": 1.7534239993819758e-05, "loss": 0.1367, "step": 2780 }, { "epoch": 0.2524829754982919, "grad_norm": 0.3736680746078491, "learning_rate": 1.7514934404189574e-05, "loss": 0.1259, "step": 2790 }, { "epoch": 0.25338793239971946, "grad_norm": 0.4589287042617798, "learning_rate": 1.7495564249564184e-05, "loss": 0.1319, "step": 2800 }, { "epoch": 0.254292889301147, "grad_norm": 0.4368409514427185, "learning_rate": 1.7476129696363394e-05, "loss": 0.1282, "step": 2810 }, { "epoch": 0.25519784620257463, "grad_norm": 0.3239140212535858, "learning_rate": 1.7456630911560294e-05, "loss": 0.1309, "step": 2820 }, { "epoch": 0.2561028031040022, "grad_norm": 0.3988368809223175, "learning_rate": 1.7437068062679827e-05, "loss": 0.1338, "step": 2830 }, { "epoch": 0.25700776000542974, "grad_norm": 0.45862647891044617, "learning_rate": 1.7417441317797342e-05, "loss": 0.1344, "step": 2840 }, { "epoch": 0.2579127169068573, "grad_norm": 0.3606816232204437, "learning_rate": 1.7397750845537163e-05, "loss": 0.1369, "step": 2850 }, { "epoch": 0.25881767380828485, "grad_norm": 0.44180789589881897, "learning_rate": 1.7377996815071122e-05, "loss": 0.1299, "step": 2860 }, { "epoch": 0.25972263070971247, "grad_norm": 0.30617383122444153, "learning_rate": 1.7358179396117118e-05, "loss": 0.1334, "step": 2870 }, { "epoch": 0.26062758761114, "grad_norm": 0.3464939594268799, "learning_rate": 1.7338298758937656e-05, "loss": 0.1317, "step": 2880 }, { "epoch": 0.2615325445125676, "grad_norm": 0.3210926949977875, "learning_rate": 1.7318355074338387e-05, "loss": 0.1334, "step": 2890 }, { "epoch": 0.26243750141399513, "grad_norm": 0.309063196182251, "learning_rate": 1.7298348513666632e-05, "loss": 0.1432, "step": 2900 }, { "epoch": 0.26334245831542274, "grad_norm": 0.3307989835739136, "learning_rate": 1.727827924880992e-05, "loss": 0.1376, "step": 2910 }, { "epoch": 0.2642474152168503, "grad_norm": 0.2714211046695709, "learning_rate": 1.725814745219451e-05, "loss": 0.1338, "step": 2920 }, { "epoch": 0.26515237211827786, "grad_norm": 0.4215063750743866, "learning_rate": 1.723795329678389e-05, "loss": 0.1346, "step": 2930 }, { "epoch": 0.2660573290197054, "grad_norm": 0.36363697052001953, "learning_rate": 1.721769695607733e-05, "loss": 0.1323, "step": 2940 }, { "epoch": 0.266962285921133, "grad_norm": 0.3916791081428528, "learning_rate": 1.7197378604108352e-05, "loss": 0.1299, "step": 2950 }, { "epoch": 0.2678672428225606, "grad_norm": 0.3785913288593292, "learning_rate": 1.7176998415443256e-05, "loss": 0.1328, "step": 2960 }, { "epoch": 0.26877219972398814, "grad_norm": 0.329843670129776, "learning_rate": 1.7156556565179618e-05, "loss": 0.1316, "step": 2970 }, { "epoch": 0.2696771566254157, "grad_norm": 0.30012860894203186, "learning_rate": 1.713605322894478e-05, "loss": 0.1344, "step": 2980 }, { "epoch": 0.2705821135268433, "grad_norm": 0.33648693561553955, "learning_rate": 1.7115488582894345e-05, "loss": 0.1238, "step": 2990 }, { "epoch": 0.27148707042827086, "grad_norm": 0.4068211317062378, "learning_rate": 1.7094862803710665e-05, "loss": 0.1414, "step": 3000 }, { "epoch": 0.2723920273296984, "grad_norm": 0.34052276611328125, "learning_rate": 1.7074176068601318e-05, "loss": 0.1314, "step": 3010 }, { "epoch": 0.273296984231126, "grad_norm": 0.4234228730201721, "learning_rate": 1.705342855529759e-05, "loss": 0.1344, "step": 3020 }, { "epoch": 0.2742019411325536, "grad_norm": 0.3007555603981018, "learning_rate": 1.7032620442052948e-05, "loss": 0.1403, "step": 3030 }, { "epoch": 0.27510689803398114, "grad_norm": 0.3519713878631592, "learning_rate": 1.70117519076415e-05, "loss": 0.1326, "step": 3040 }, { "epoch": 0.2760118549354087, "grad_norm": 0.6432228088378906, "learning_rate": 1.699082313135648e-05, "loss": 0.1339, "step": 3050 }, { "epoch": 0.27691681183683625, "grad_norm": 0.39598795771598816, "learning_rate": 1.6969834293008674e-05, "loss": 0.133, "step": 3060 }, { "epoch": 0.27782176873826386, "grad_norm": 0.3361396789550781, "learning_rate": 1.6948785572924912e-05, "loss": 0.1258, "step": 3070 }, { "epoch": 0.2787267256396914, "grad_norm": 0.3960329592227936, "learning_rate": 1.692767715194649e-05, "loss": 0.1315, "step": 3080 }, { "epoch": 0.279631682541119, "grad_norm": 0.3047797381877899, "learning_rate": 1.6906509211427633e-05, "loss": 0.1321, "step": 3090 }, { "epoch": 0.28053663944254653, "grad_norm": 0.3368310332298279, "learning_rate": 1.6885281933233936e-05, "loss": 0.1421, "step": 3100 }, { "epoch": 0.28144159634397414, "grad_norm": 0.5269041061401367, "learning_rate": 1.6863995499740785e-05, "loss": 0.1342, "step": 3110 }, { "epoch": 0.2823465532454017, "grad_norm": 0.3757074773311615, "learning_rate": 1.6842650093831817e-05, "loss": 0.1347, "step": 3120 }, { "epoch": 0.28325151014682926, "grad_norm": 0.4319840669631958, "learning_rate": 1.6821245898897317e-05, "loss": 0.1368, "step": 3130 }, { "epoch": 0.2841564670482568, "grad_norm": 0.2823663651943207, "learning_rate": 1.6799783098832677e-05, "loss": 0.1318, "step": 3140 }, { "epoch": 0.2850614239496844, "grad_norm": 0.3386325538158417, "learning_rate": 1.6778261878036784e-05, "loss": 0.1335, "step": 3150 }, { "epoch": 0.285966380851112, "grad_norm": 0.44977718591690063, "learning_rate": 1.6756682421410454e-05, "loss": 0.1342, "step": 3160 }, { "epoch": 0.28687133775253953, "grad_norm": 0.43069374561309814, "learning_rate": 1.6735044914354853e-05, "loss": 0.1316, "step": 3170 }, { "epoch": 0.2877762946539671, "grad_norm": 0.47004443407058716, "learning_rate": 1.6713349542769865e-05, "loss": 0.1353, "step": 3180 }, { "epoch": 0.28868125155539465, "grad_norm": 0.5096125602722168, "learning_rate": 1.6691596493052543e-05, "loss": 0.1346, "step": 3190 }, { "epoch": 0.28958620845682226, "grad_norm": 0.4015505611896515, "learning_rate": 1.6669785952095468e-05, "loss": 0.1334, "step": 3200 }, { "epoch": 0.2904911653582498, "grad_norm": 0.37036389112472534, "learning_rate": 1.6647918107285182e-05, "loss": 0.127, "step": 3210 }, { "epoch": 0.29139612225967737, "grad_norm": 0.39770999550819397, "learning_rate": 1.6625993146500536e-05, "loss": 0.1355, "step": 3220 }, { "epoch": 0.2923010791611049, "grad_norm": 0.43057796359062195, "learning_rate": 1.6604011258111097e-05, "loss": 0.1358, "step": 3230 }, { "epoch": 0.29320603606253254, "grad_norm": 0.3283129930496216, "learning_rate": 1.658197263097555e-05, "loss": 0.1317, "step": 3240 }, { "epoch": 0.2941109929639601, "grad_norm": 0.28569671511650085, "learning_rate": 1.6559877454440025e-05, "loss": 0.1351, "step": 3250 }, { "epoch": 0.29501594986538765, "grad_norm": 0.29221346974372864, "learning_rate": 1.6537725918336524e-05, "loss": 0.135, "step": 3260 }, { "epoch": 0.2959209067668152, "grad_norm": 3.769948959350586, "learning_rate": 1.6515518212981248e-05, "loss": 0.1562, "step": 3270 }, { "epoch": 0.2968258636682428, "grad_norm": 0.6297938823699951, "learning_rate": 1.6493254529172996e-05, "loss": 0.1332, "step": 3280 }, { "epoch": 0.2977308205696704, "grad_norm": 0.4224908649921417, "learning_rate": 1.647093505819149e-05, "loss": 0.1362, "step": 3290 }, { "epoch": 0.29863577747109793, "grad_norm": 0.60915607213974, "learning_rate": 1.6448559991795762e-05, "loss": 0.1331, "step": 3300 }, { "epoch": 0.2995407343725255, "grad_norm": 0.4966517686843872, "learning_rate": 1.64261295222225e-05, "loss": 0.1264, "step": 3310 }, { "epoch": 0.3004456912739531, "grad_norm": 0.8263148069381714, "learning_rate": 1.6403643842184383e-05, "loss": 0.1284, "step": 3320 }, { "epoch": 0.30135064817538065, "grad_norm": 0.2521939277648926, "learning_rate": 1.6381103144868434e-05, "loss": 0.1321, "step": 3330 }, { "epoch": 0.3022556050768082, "grad_norm": 0.3264107406139374, "learning_rate": 1.6358507623934368e-05, "loss": 0.1338, "step": 3340 }, { "epoch": 0.30316056197823577, "grad_norm": 0.3555287718772888, "learning_rate": 1.6335857473512908e-05, "loss": 0.1318, "step": 3350 }, { "epoch": 0.3040655188796634, "grad_norm": 0.3218032717704773, "learning_rate": 1.6313152888204143e-05, "loss": 0.1244, "step": 3360 }, { "epoch": 0.30497047578109093, "grad_norm": 0.37903180718421936, "learning_rate": 1.629039406307583e-05, "loss": 0.1315, "step": 3370 }, { "epoch": 0.3058754326825185, "grad_norm": 0.32837244868278503, "learning_rate": 1.626758119366174e-05, "loss": 0.1299, "step": 3380 }, { "epoch": 0.30678038958394604, "grad_norm": 0.5855687856674194, "learning_rate": 1.6244714475959958e-05, "loss": 0.1372, "step": 3390 }, { "epoch": 0.30768534648537366, "grad_norm": 0.30140820145606995, "learning_rate": 1.622179410643123e-05, "loss": 0.1303, "step": 3400 }, { "epoch": 0.3085903033868012, "grad_norm": 0.32345208525657654, "learning_rate": 1.619882028199723e-05, "loss": 0.1318, "step": 3410 }, { "epoch": 0.30949526028822877, "grad_norm": 0.4127529561519623, "learning_rate": 1.617579320003891e-05, "loss": 0.1325, "step": 3420 }, { "epoch": 0.3104002171896563, "grad_norm": 0.2518954873085022, "learning_rate": 1.6152713058394778e-05, "loss": 0.1323, "step": 3430 }, { "epoch": 0.31130517409108394, "grad_norm": 0.385233610868454, "learning_rate": 1.612958005535921e-05, "loss": 0.1321, "step": 3440 }, { "epoch": 0.3122101309925115, "grad_norm": 4.388869285583496, "learning_rate": 1.6106394389680752e-05, "loss": 0.1392, "step": 3450 }, { "epoch": 0.31311508789393905, "grad_norm": 0.3289415240287781, "learning_rate": 1.6083156260560387e-05, "loss": 0.1319, "step": 3460 }, { "epoch": 0.3140200447953666, "grad_norm": 0.2971116900444031, "learning_rate": 1.605986586764986e-05, "loss": 0.1314, "step": 3470 }, { "epoch": 0.3149250016967942, "grad_norm": 0.2933705449104309, "learning_rate": 1.603652341104993e-05, "loss": 0.1331, "step": 3480 }, { "epoch": 0.31582995859822177, "grad_norm": 0.2969614863395691, "learning_rate": 1.6013129091308658e-05, "loss": 0.1356, "step": 3490 }, { "epoch": 0.3167349154996493, "grad_norm": 0.48023778200149536, "learning_rate": 1.5989683109419717e-05, "loss": 0.1296, "step": 3500 }, { "epoch": 0.3176398724010769, "grad_norm": 0.3945216238498688, "learning_rate": 1.5966185666820608e-05, "loss": 0.1393, "step": 3510 }, { "epoch": 0.3185448293025045, "grad_norm": 0.34867751598358154, "learning_rate": 1.5942636965390983e-05, "loss": 0.1316, "step": 3520 }, { "epoch": 0.31944978620393205, "grad_norm": 0.4952505826950073, "learning_rate": 1.5919037207450873e-05, "loss": 0.1292, "step": 3530 }, { "epoch": 0.3203547431053596, "grad_norm": 0.35089966654777527, "learning_rate": 1.589538659575897e-05, "loss": 0.1339, "step": 3540 }, { "epoch": 0.32125970000678716, "grad_norm": 0.5478940606117249, "learning_rate": 1.5871685333510873e-05, "loss": 0.1331, "step": 3550 }, { "epoch": 0.3221646569082147, "grad_norm": 0.41318562626838684, "learning_rate": 1.584793362433736e-05, "loss": 0.1328, "step": 3560 }, { "epoch": 0.32306961380964233, "grad_norm": 0.3964232802391052, "learning_rate": 1.5824131672302608e-05, "loss": 0.1445, "step": 3570 }, { "epoch": 0.3239745707110699, "grad_norm": 0.3068403899669647, "learning_rate": 1.5800279681902483e-05, "loss": 0.1275, "step": 3580 }, { "epoch": 0.32487952761249744, "grad_norm": 0.2769792079925537, "learning_rate": 1.5776377858062737e-05, "loss": 0.1286, "step": 3590 }, { "epoch": 0.325784484513925, "grad_norm": 0.5100188255310059, "learning_rate": 1.5752426406137275e-05, "loss": 0.1344, "step": 3600 }, { "epoch": 0.3266894414153526, "grad_norm": 0.3469770550727844, "learning_rate": 1.5728425531906396e-05, "loss": 0.1323, "step": 3610 }, { "epoch": 0.32759439831678017, "grad_norm": 0.5681571960449219, "learning_rate": 1.5704375441574996e-05, "loss": 0.1335, "step": 3620 }, { "epoch": 0.3284993552182077, "grad_norm": 0.3062500059604645, "learning_rate": 1.568027634177083e-05, "loss": 0.1367, "step": 3630 }, { "epoch": 0.3294043121196353, "grad_norm": 0.3792308270931244, "learning_rate": 1.5656128439542704e-05, "loss": 0.1301, "step": 3640 }, { "epoch": 0.3303092690210629, "grad_norm": 0.3073217272758484, "learning_rate": 1.5631931942358723e-05, "loss": 0.1304, "step": 3650 }, { "epoch": 0.33121422592249045, "grad_norm": 0.3077130615711212, "learning_rate": 1.560768705810451e-05, "loss": 0.1293, "step": 3660 }, { "epoch": 0.332119182823918, "grad_norm": 0.34068816900253296, "learning_rate": 1.558339399508138e-05, "loss": 0.138, "step": 3670 }, { "epoch": 0.33302413972534556, "grad_norm": 0.366473525762558, "learning_rate": 1.55590529620046e-05, "loss": 0.1243, "step": 3680 }, { "epoch": 0.33392909662677317, "grad_norm": 0.40335702896118164, "learning_rate": 1.553466416800157e-05, "loss": 0.1305, "step": 3690 }, { "epoch": 0.3348340535282007, "grad_norm": 0.3505435287952423, "learning_rate": 1.551022782261003e-05, "loss": 0.1332, "step": 3700 }, { "epoch": 0.3357390104296283, "grad_norm": 0.5044335126876831, "learning_rate": 1.5485744135776258e-05, "loss": 0.1326, "step": 3710 }, { "epoch": 0.33664396733105584, "grad_norm": 0.3738217055797577, "learning_rate": 1.546121331785327e-05, "loss": 0.1251, "step": 3720 }, { "epoch": 0.33754892423248345, "grad_norm": 0.3503580093383789, "learning_rate": 1.5436635579599014e-05, "loss": 0.1349, "step": 3730 }, { "epoch": 0.338453881133911, "grad_norm": 0.3351600766181946, "learning_rate": 1.541201113217456e-05, "loss": 0.1275, "step": 3740 }, { "epoch": 0.33935883803533856, "grad_norm": 0.2801141142845154, "learning_rate": 1.538734018714227e-05, "loss": 0.1278, "step": 3750 }, { "epoch": 0.3402637949367661, "grad_norm": 0.26787105202674866, "learning_rate": 1.5362622956463998e-05, "loss": 0.1413, "step": 3760 }, { "epoch": 0.34116875183819373, "grad_norm": 0.394846111536026, "learning_rate": 1.5337859652499277e-05, "loss": 0.1354, "step": 3770 }, { "epoch": 0.3420737087396213, "grad_norm": 0.3125811815261841, "learning_rate": 1.531305048800346e-05, "loss": 0.1324, "step": 3780 }, { "epoch": 0.34297866564104884, "grad_norm": 0.3050634264945984, "learning_rate": 1.5288195676125937e-05, "loss": 0.1345, "step": 3790 }, { "epoch": 0.3438836225424764, "grad_norm": 0.257493257522583, "learning_rate": 1.5263295430408255e-05, "loss": 0.1311, "step": 3800 }, { "epoch": 0.344788579443904, "grad_norm": 0.3533366620540619, "learning_rate": 1.5238349964782325e-05, "loss": 0.1282, "step": 3810 }, { "epoch": 0.34569353634533156, "grad_norm": 0.3291451036930084, "learning_rate": 1.5213359493568562e-05, "loss": 0.1218, "step": 3820 }, { "epoch": 0.3465984932467591, "grad_norm": 0.3743618130683899, "learning_rate": 1.5188324231474054e-05, "loss": 0.1295, "step": 3830 }, { "epoch": 0.3475034501481867, "grad_norm": 0.5372154712677002, "learning_rate": 1.51632443935907e-05, "loss": 0.1331, "step": 3840 }, { "epoch": 0.3484084070496143, "grad_norm": 0.3277980387210846, "learning_rate": 1.5138120195393396e-05, "loss": 0.1387, "step": 3850 }, { "epoch": 0.34931336395104184, "grad_norm": 0.40068790316581726, "learning_rate": 1.5112951852738138e-05, "loss": 0.1301, "step": 3860 }, { "epoch": 0.3502183208524694, "grad_norm": 0.32944217324256897, "learning_rate": 1.5087739581860213e-05, "loss": 0.1312, "step": 3870 }, { "epoch": 0.35112327775389696, "grad_norm": 0.3983187675476074, "learning_rate": 1.50624835993723e-05, "loss": 0.1325, "step": 3880 }, { "epoch": 0.35202823465532457, "grad_norm": 0.3687704801559448, "learning_rate": 1.5037184122262645e-05, "loss": 0.128, "step": 3890 }, { "epoch": 0.3529331915567521, "grad_norm": 0.4385727345943451, "learning_rate": 1.501184136789317e-05, "loss": 0.1329, "step": 3900 }, { "epoch": 0.3538381484581797, "grad_norm": 0.29301881790161133, "learning_rate": 1.4986455553997625e-05, "loss": 0.1301, "step": 3910 }, { "epoch": 0.35474310535960724, "grad_norm": 0.4024035632610321, "learning_rate": 1.4961026898679703e-05, "loss": 0.1325, "step": 3920 }, { "epoch": 0.3556480622610348, "grad_norm": 0.32220858335494995, "learning_rate": 1.4935555620411168e-05, "loss": 0.1361, "step": 3930 }, { "epoch": 0.3565530191624624, "grad_norm": 0.2925558388233185, "learning_rate": 1.4910041938029993e-05, "loss": 0.1299, "step": 3940 }, { "epoch": 0.35745797606388996, "grad_norm": 0.3419649600982666, "learning_rate": 1.4884486070738457e-05, "loss": 0.1371, "step": 3950 }, { "epoch": 0.3583629329653175, "grad_norm": 0.37976405024528503, "learning_rate": 1.4858888238101278e-05, "loss": 0.1381, "step": 3960 }, { "epoch": 0.35926788986674507, "grad_norm": 0.3191063106060028, "learning_rate": 1.483324866004372e-05, "loss": 0.1299, "step": 3970 }, { "epoch": 0.3601728467681727, "grad_norm": 0.35269689559936523, "learning_rate": 1.4807567556849707e-05, "loss": 0.134, "step": 3980 }, { "epoch": 0.36107780366960024, "grad_norm": 0.44557368755340576, "learning_rate": 1.478184514915993e-05, "loss": 0.1333, "step": 3990 }, { "epoch": 0.3619827605710278, "grad_norm": 0.26233455538749695, "learning_rate": 1.4756081657969947e-05, "loss": 0.126, "step": 4000 }, { "epoch": 0.36288771747245535, "grad_norm": 0.27105700969696045, "learning_rate": 1.4730277304628287e-05, "loss": 0.1338, "step": 4010 }, { "epoch": 0.36379267437388296, "grad_norm": 0.45541515946388245, "learning_rate": 1.4704432310834551e-05, "loss": 0.1327, "step": 4020 }, { "epoch": 0.3646976312753105, "grad_norm": 0.31574854254722595, "learning_rate": 1.4678546898637502e-05, "loss": 0.1329, "step": 4030 }, { "epoch": 0.3656025881767381, "grad_norm": 0.3075743019580841, "learning_rate": 1.4652621290433166e-05, "loss": 0.1297, "step": 4040 }, { "epoch": 0.36650754507816563, "grad_norm": 0.32941344380378723, "learning_rate": 1.4626655708962904e-05, "loss": 0.125, "step": 4050 }, { "epoch": 0.36741250197959324, "grad_norm": 0.3551495671272278, "learning_rate": 1.4600650377311523e-05, "loss": 0.134, "step": 4060 }, { "epoch": 0.3683174588810208, "grad_norm": 0.3904673457145691, "learning_rate": 1.4574605518905336e-05, "loss": 0.131, "step": 4070 }, { "epoch": 0.36922241578244835, "grad_norm": 0.3711760342121124, "learning_rate": 1.4548521357510256e-05, "loss": 0.1302, "step": 4080 }, { "epoch": 0.3701273726838759, "grad_norm": 0.23497672379016876, "learning_rate": 1.4522398117229874e-05, "loss": 0.1352, "step": 4090 }, { "epoch": 0.3710323295853035, "grad_norm": 0.39998266100883484, "learning_rate": 1.4496236022503523e-05, "loss": 0.1326, "step": 4100 }, { "epoch": 0.3719372864867311, "grad_norm": 0.38762685656547546, "learning_rate": 1.4470035298104355e-05, "loss": 0.1269, "step": 4110 }, { "epoch": 0.37284224338815863, "grad_norm": 0.29660049080848694, "learning_rate": 1.444379616913742e-05, "loss": 0.1314, "step": 4120 }, { "epoch": 0.3737472002895862, "grad_norm": 0.3709215223789215, "learning_rate": 1.4417518861037713e-05, "loss": 0.1315, "step": 4130 }, { "epoch": 0.3746521571910138, "grad_norm": 0.5628421902656555, "learning_rate": 1.4391203599568257e-05, "loss": 0.1345, "step": 4140 }, { "epoch": 0.37555711409244136, "grad_norm": 0.32759326696395874, "learning_rate": 1.4364850610818147e-05, "loss": 0.1369, "step": 4150 }, { "epoch": 0.3764620709938689, "grad_norm": 0.33175286650657654, "learning_rate": 1.4338460121200612e-05, "loss": 0.1303, "step": 4160 }, { "epoch": 0.37736702789529647, "grad_norm": 0.5355362296104431, "learning_rate": 1.4312032357451084e-05, "loss": 0.1283, "step": 4170 }, { "epoch": 0.3782719847967241, "grad_norm": 0.25995931029319763, "learning_rate": 1.428556754662522e-05, "loss": 0.1306, "step": 4180 }, { "epoch": 0.37917694169815164, "grad_norm": 0.34952160716056824, "learning_rate": 1.4259065916096983e-05, "loss": 0.1315, "step": 4190 }, { "epoch": 0.3800818985995792, "grad_norm": 0.34290722012519836, "learning_rate": 1.4232527693556673e-05, "loss": 0.1349, "step": 4200 }, { "epoch": 0.38098685550100675, "grad_norm": 0.2907416522502899, "learning_rate": 1.4205953107008964e-05, "loss": 0.134, "step": 4210 }, { "epoch": 0.38189181240243436, "grad_norm": 0.305785208940506, "learning_rate": 1.4179342384770964e-05, "loss": 0.1322, "step": 4220 }, { "epoch": 0.3827967693038619, "grad_norm": 0.3259272873401642, "learning_rate": 1.4152695755470235e-05, "loss": 0.124, "step": 4230 }, { "epoch": 0.3837017262052895, "grad_norm": 0.4469420611858368, "learning_rate": 1.4126013448042838e-05, "loss": 0.1247, "step": 4240 }, { "epoch": 0.38460668310671703, "grad_norm": 0.3158349096775055, "learning_rate": 1.4099295691731374e-05, "loss": 0.1213, "step": 4250 }, { "epoch": 0.3855116400081446, "grad_norm": 0.30905407667160034, "learning_rate": 1.4072542716082986e-05, "loss": 0.1334, "step": 4260 }, { "epoch": 0.3864165969095722, "grad_norm": 0.29381975531578064, "learning_rate": 1.4045754750947428e-05, "loss": 0.1328, "step": 4270 }, { "epoch": 0.38732155381099975, "grad_norm": 0.39547502994537354, "learning_rate": 1.401893202647505e-05, "loss": 0.1269, "step": 4280 }, { "epoch": 0.3882265107124273, "grad_norm": 0.3500146269798279, "learning_rate": 1.3992074773114852e-05, "loss": 0.1322, "step": 4290 }, { "epoch": 0.38913146761385486, "grad_norm": 0.2726495563983917, "learning_rate": 1.3965183221612484e-05, "loss": 0.1337, "step": 4300 }, { "epoch": 0.3900364245152825, "grad_norm": 0.3656439185142517, "learning_rate": 1.393825760300827e-05, "loss": 0.1324, "step": 4310 }, { "epoch": 0.39094138141671003, "grad_norm": 0.34188976883888245, "learning_rate": 1.3911298148635224e-05, "loss": 0.1358, "step": 4320 }, { "epoch": 0.3918463383181376, "grad_norm": 0.5021139979362488, "learning_rate": 1.3884305090117069e-05, "loss": 0.1365, "step": 4330 }, { "epoch": 0.39275129521956514, "grad_norm": 0.32963699102401733, "learning_rate": 1.3857278659366232e-05, "loss": 0.1337, "step": 4340 }, { "epoch": 0.39365625212099276, "grad_norm": 0.31096163392066956, "learning_rate": 1.3830219088581856e-05, "loss": 0.1305, "step": 4350 }, { "epoch": 0.3945612090224203, "grad_norm": 0.3522721230983734, "learning_rate": 1.380312661024782e-05, "loss": 0.1327, "step": 4360 }, { "epoch": 0.39546616592384787, "grad_norm": 0.3698263168334961, "learning_rate": 1.3776001457130725e-05, "loss": 0.1363, "step": 4370 }, { "epoch": 0.3963711228252754, "grad_norm": 0.6951903700828552, "learning_rate": 1.37488438622779e-05, "loss": 0.1281, "step": 4380 }, { "epoch": 0.39727607972670304, "grad_norm": 0.3918182849884033, "learning_rate": 1.3721654059015393e-05, "loss": 0.1338, "step": 4390 }, { "epoch": 0.3981810366281306, "grad_norm": 0.40132880210876465, "learning_rate": 1.3694432280945978e-05, "loss": 0.1258, "step": 4400 }, { "epoch": 0.39908599352955815, "grad_norm": 0.36218178272247314, "learning_rate": 1.3667178761947144e-05, "loss": 0.1324, "step": 4410 }, { "epoch": 0.3999909504309857, "grad_norm": 0.411044180393219, "learning_rate": 1.3639893736169083e-05, "loss": 0.1332, "step": 4420 }, { "epoch": 0.4008959073324133, "grad_norm": 0.3805508017539978, "learning_rate": 1.3612577438032673e-05, "loss": 0.1317, "step": 4430 }, { "epoch": 0.40180086423384087, "grad_norm": 0.33358198404312134, "learning_rate": 1.3585230102227478e-05, "loss": 0.1254, "step": 4440 }, { "epoch": 0.4027058211352684, "grad_norm": 0.2710610628128052, "learning_rate": 1.355785196370972e-05, "loss": 0.1319, "step": 4450 }, { "epoch": 0.403610778036696, "grad_norm": 0.24267303943634033, "learning_rate": 1.3530443257700272e-05, "loss": 0.134, "step": 4460 }, { "epoch": 0.4045157349381236, "grad_norm": 0.32278409600257874, "learning_rate": 1.3503004219682611e-05, "loss": 0.1377, "step": 4470 }, { "epoch": 0.40542069183955115, "grad_norm": 0.2883910834789276, "learning_rate": 1.3475535085400836e-05, "loss": 0.13, "step": 4480 }, { "epoch": 0.4063256487409787, "grad_norm": 0.5005854368209839, "learning_rate": 1.3448036090857601e-05, "loss": 0.1299, "step": 4490 }, { "epoch": 0.40723060564240626, "grad_norm": 0.4682227075099945, "learning_rate": 1.3420507472312121e-05, "loss": 0.1365, "step": 4500 }, { "epoch": 0.4081355625438339, "grad_norm": 0.3037811815738678, "learning_rate": 1.3392949466278116e-05, "loss": 0.1334, "step": 4510 }, { "epoch": 0.40904051944526143, "grad_norm": 0.2893620729446411, "learning_rate": 1.3365362309521794e-05, "loss": 0.1313, "step": 4520 }, { "epoch": 0.409945476346689, "grad_norm": 0.2590632736682892, "learning_rate": 1.3337746239059817e-05, "loss": 0.1284, "step": 4530 }, { "epoch": 0.41085043324811654, "grad_norm": 0.44863688945770264, "learning_rate": 1.3310101492157256e-05, "loss": 0.128, "step": 4540 }, { "epoch": 0.41175539014954415, "grad_norm": 0.3310014009475708, "learning_rate": 1.328242830632556e-05, "loss": 0.1352, "step": 4550 }, { "epoch": 0.4126603470509717, "grad_norm": 0.44096454977989197, "learning_rate": 1.3254726919320509e-05, "loss": 0.1302, "step": 4560 }, { "epoch": 0.41356530395239927, "grad_norm": 0.30306538939476013, "learning_rate": 1.322699756914018e-05, "loss": 0.1364, "step": 4570 }, { "epoch": 0.4144702608538268, "grad_norm": 0.2730850875377655, "learning_rate": 1.3199240494022891e-05, "loss": 0.1393, "step": 4580 }, { "epoch": 0.41537521775525443, "grad_norm": 0.21079055964946747, "learning_rate": 1.3171455932445172e-05, "loss": 0.1294, "step": 4590 }, { "epoch": 0.416280174656682, "grad_norm": 0.5057855248451233, "learning_rate": 1.3143644123119692e-05, "loss": 0.1338, "step": 4600 }, { "epoch": 0.41718513155810955, "grad_norm": 0.3610301613807678, "learning_rate": 1.3115805304993221e-05, "loss": 0.1298, "step": 4610 }, { "epoch": 0.4180900884595371, "grad_norm": 0.3835664391517639, "learning_rate": 1.3087939717244591e-05, "loss": 0.1303, "step": 4620 }, { "epoch": 0.41899504536096466, "grad_norm": 0.3250782787799835, "learning_rate": 1.306004759928261e-05, "loss": 0.1296, "step": 4630 }, { "epoch": 0.41990000226239227, "grad_norm": 0.3284454941749573, "learning_rate": 1.3032129190744032e-05, "loss": 0.1285, "step": 4640 }, { "epoch": 0.4208049591638198, "grad_norm": 0.30871763825416565, "learning_rate": 1.3004184731491478e-05, "loss": 0.1331, "step": 4650 }, { "epoch": 0.4217099160652474, "grad_norm": 0.2988496720790863, "learning_rate": 1.29762144616114e-05, "loss": 0.125, "step": 4660 }, { "epoch": 0.42261487296667494, "grad_norm": 0.30450698733329773, "learning_rate": 1.2948218621411996e-05, "loss": 0.134, "step": 4670 }, { "epoch": 0.42351982986810255, "grad_norm": 0.33566537499427795, "learning_rate": 1.2920197451421145e-05, "loss": 0.1439, "step": 4680 }, { "epoch": 0.4244247867695301, "grad_norm": 0.4010887145996094, "learning_rate": 1.2892151192384362e-05, "loss": 0.1348, "step": 4690 }, { "epoch": 0.42532974367095766, "grad_norm": 0.42090174555778503, "learning_rate": 1.2864080085262702e-05, "loss": 0.1315, "step": 4700 }, { "epoch": 0.4262347005723852, "grad_norm": 0.3819758892059326, "learning_rate": 1.2835984371230722e-05, "loss": 0.1264, "step": 4710 }, { "epoch": 0.42713965747381283, "grad_norm": 0.6393516659736633, "learning_rate": 1.2807864291674374e-05, "loss": 0.1312, "step": 4720 }, { "epoch": 0.4280446143752404, "grad_norm": 0.3352563977241516, "learning_rate": 1.2779720088188954e-05, "loss": 0.1337, "step": 4730 }, { "epoch": 0.42894957127666794, "grad_norm": 0.304123193025589, "learning_rate": 1.2751552002577024e-05, "loss": 0.1303, "step": 4740 }, { "epoch": 0.4298545281780955, "grad_norm": 0.391735702753067, "learning_rate": 1.2723360276846322e-05, "loss": 0.1349, "step": 4750 }, { "epoch": 0.4307594850795231, "grad_norm": 0.2713654935359955, "learning_rate": 1.26951451532077e-05, "loss": 0.1326, "step": 4760 }, { "epoch": 0.43166444198095066, "grad_norm": 0.26157835125923157, "learning_rate": 1.2666906874073024e-05, "loss": 0.1296, "step": 4770 }, { "epoch": 0.4325693988823782, "grad_norm": 0.36790212988853455, "learning_rate": 1.2638645682053119e-05, "loss": 0.1293, "step": 4780 }, { "epoch": 0.4334743557838058, "grad_norm": 0.4308098554611206, "learning_rate": 1.2610361819955647e-05, "loss": 0.1294, "step": 4790 }, { "epoch": 0.4343793126852334, "grad_norm": 0.32590603828430176, "learning_rate": 1.2582055530783059e-05, "loss": 0.1292, "step": 4800 }, { "epoch": 0.43528426958666094, "grad_norm": 0.41218942403793335, "learning_rate": 1.2553727057730481e-05, "loss": 0.1292, "step": 4810 }, { "epoch": 0.4361892264880885, "grad_norm": 0.26972493529319763, "learning_rate": 1.2525376644183625e-05, "loss": 0.1288, "step": 4820 }, { "epoch": 0.43709418338951606, "grad_norm": 0.305836021900177, "learning_rate": 1.2497004533716726e-05, "loss": 0.133, "step": 4830 }, { "epoch": 0.43799914029094367, "grad_norm": 0.49705770611763, "learning_rate": 1.246861097009041e-05, "loss": 0.1312, "step": 4840 }, { "epoch": 0.4389040971923712, "grad_norm": 0.8208497166633606, "learning_rate": 1.2440196197249634e-05, "loss": 0.139, "step": 4850 }, { "epoch": 0.4398090540937988, "grad_norm": 0.43357348442077637, "learning_rate": 1.2411760459321562e-05, "loss": 0.1347, "step": 4860 }, { "epoch": 0.44071401099522634, "grad_norm": 0.29426875710487366, "learning_rate": 1.238330400061349e-05, "loss": 0.1277, "step": 4870 }, { "epoch": 0.44161896789665395, "grad_norm": 0.31841394305229187, "learning_rate": 1.235482706561074e-05, "loss": 0.1275, "step": 4880 }, { "epoch": 0.4425239247980815, "grad_norm": 0.3246687650680542, "learning_rate": 1.2326329898974543e-05, "loss": 0.1322, "step": 4890 }, { "epoch": 0.44342888169950906, "grad_norm": 0.3303876519203186, "learning_rate": 1.2297812745539968e-05, "loss": 0.1249, "step": 4900 }, { "epoch": 0.4443338386009366, "grad_norm": 0.28597357869148254, "learning_rate": 1.2269275850313788e-05, "loss": 0.1251, "step": 4910 }, { "epoch": 0.4452387955023642, "grad_norm": 0.4606969356536865, "learning_rate": 1.2240719458472402e-05, "loss": 0.1287, "step": 4920 }, { "epoch": 0.4461437524037918, "grad_norm": 0.48796603083610535, "learning_rate": 1.2212143815359702e-05, "loss": 0.1283, "step": 4930 }, { "epoch": 0.44704870930521934, "grad_norm": 0.43160954117774963, "learning_rate": 1.2183549166484988e-05, "loss": 0.1262, "step": 4940 }, { "epoch": 0.4479536662066469, "grad_norm": 0.43935272097587585, "learning_rate": 1.2154935757520847e-05, "loss": 0.131, "step": 4950 }, { "epoch": 0.44885862310807445, "grad_norm": 0.2970845699310303, "learning_rate": 1.212630383430104e-05, "loss": 0.1292, "step": 4960 }, { "epoch": 0.44976358000950206, "grad_norm": 0.3177463710308075, "learning_rate": 1.2097653642818404e-05, "loss": 0.1269, "step": 4970 }, { "epoch": 0.4506685369109296, "grad_norm": 0.3390902578830719, "learning_rate": 1.2068985429222712e-05, "loss": 0.1258, "step": 4980 }, { "epoch": 0.4515734938123572, "grad_norm": 0.32120341062545776, "learning_rate": 1.204029943981859e-05, "loss": 0.1346, "step": 4990 }, { "epoch": 0.45247845071378473, "grad_norm": 0.3396281898021698, "learning_rate": 1.2011595921063388e-05, "loss": 0.1313, "step": 5000 }, { "epoch": 0.45338340761521234, "grad_norm": 0.45511719584465027, "learning_rate": 1.1982875119565045e-05, "loss": 0.1264, "step": 5010 }, { "epoch": 0.4542883645166399, "grad_norm": 0.44824886322021484, "learning_rate": 1.1954137282079999e-05, "loss": 0.1283, "step": 5020 }, { "epoch": 0.45519332141806745, "grad_norm": 0.37619948387145996, "learning_rate": 1.1925382655511044e-05, "loss": 0.1266, "step": 5030 }, { "epoch": 0.456098278319495, "grad_norm": 0.35430288314819336, "learning_rate": 1.1896611486905232e-05, "loss": 0.1324, "step": 5040 }, { "epoch": 0.4570032352209226, "grad_norm": 0.28441110253334045, "learning_rate": 1.1867824023451719e-05, "loss": 0.1291, "step": 5050 }, { "epoch": 0.4579081921223502, "grad_norm": 0.41242095828056335, "learning_rate": 1.1839020512479676e-05, "loss": 0.1299, "step": 5060 }, { "epoch": 0.45881314902377773, "grad_norm": 0.2977141737937927, "learning_rate": 1.1810201201456134e-05, "loss": 0.1344, "step": 5070 }, { "epoch": 0.4597181059252053, "grad_norm": 0.342632919549942, "learning_rate": 1.1781366337983882e-05, "loss": 0.1431, "step": 5080 }, { "epoch": 0.4606230628266329, "grad_norm": 0.304565966129303, "learning_rate": 1.175251616979932e-05, "loss": 0.1281, "step": 5090 }, { "epoch": 0.46152801972806046, "grad_norm": 0.27322742342948914, "learning_rate": 1.1723650944770343e-05, "loss": 0.1252, "step": 5100 }, { "epoch": 0.462432976629488, "grad_norm": 0.2408372461795807, "learning_rate": 1.1694770910894213e-05, "loss": 0.1325, "step": 5110 }, { "epoch": 0.46333793353091557, "grad_norm": 0.34792786836624146, "learning_rate": 1.1665876316295408e-05, "loss": 0.1248, "step": 5120 }, { "epoch": 0.4642428904323432, "grad_norm": 0.2646757662296295, "learning_rate": 1.1636967409223521e-05, "loss": 0.1219, "step": 5130 }, { "epoch": 0.46514784733377074, "grad_norm": 0.30345967411994934, "learning_rate": 1.1608044438051107e-05, "loss": 0.132, "step": 5140 }, { "epoch": 0.4660528042351983, "grad_norm": 0.3464104235172272, "learning_rate": 1.1579107651271544e-05, "loss": 0.1248, "step": 5150 }, { "epoch": 0.46695776113662585, "grad_norm": 0.3681158423423767, "learning_rate": 1.1550157297496927e-05, "loss": 0.1352, "step": 5160 }, { "epoch": 0.46786271803805346, "grad_norm": 0.3495553731918335, "learning_rate": 1.152119362545589e-05, "loss": 0.1405, "step": 5170 }, { "epoch": 0.468767674939481, "grad_norm": 0.41019749641418457, "learning_rate": 1.1492216883991512e-05, "loss": 0.1313, "step": 5180 }, { "epoch": 0.46967263184090857, "grad_norm": 0.30911651253700256, "learning_rate": 1.1463227322059143e-05, "loss": 0.1239, "step": 5190 }, { "epoch": 0.47057758874233613, "grad_norm": 0.32175543904304504, "learning_rate": 1.1434225188724289e-05, "loss": 0.1345, "step": 5200 }, { "epoch": 0.47148254564376374, "grad_norm": 0.3402431011199951, "learning_rate": 1.1405210733160463e-05, "loss": 0.1305, "step": 5210 }, { "epoch": 0.4723875025451913, "grad_norm": 0.5482098460197449, "learning_rate": 1.1376184204647047e-05, "loss": 0.129, "step": 5220 }, { "epoch": 0.47329245944661885, "grad_norm": 0.4262870252132416, "learning_rate": 1.134714585256714e-05, "loss": 0.1312, "step": 5230 }, { "epoch": 0.4741974163480464, "grad_norm": 0.28574520349502563, "learning_rate": 1.1318095926405434e-05, "loss": 0.1259, "step": 5240 }, { "epoch": 0.475102373249474, "grad_norm": 0.49999454617500305, "learning_rate": 1.1289034675746056e-05, "loss": 0.1265, "step": 5250 }, { "epoch": 0.4760073301509016, "grad_norm": 0.2796456515789032, "learning_rate": 1.1259962350270428e-05, "loss": 0.1305, "step": 5260 }, { "epoch": 0.47691228705232913, "grad_norm": 0.3368781805038452, "learning_rate": 1.1230879199755118e-05, "loss": 0.1334, "step": 5270 }, { "epoch": 0.4778172439537567, "grad_norm": 0.3882569968700409, "learning_rate": 1.1201785474069706e-05, "loss": 0.1379, "step": 5280 }, { "epoch": 0.4787222008551843, "grad_norm": 0.3234885036945343, "learning_rate": 1.1172681423174625e-05, "loss": 0.1308, "step": 5290 }, { "epoch": 0.47962715775661185, "grad_norm": 0.30723053216934204, "learning_rate": 1.114356729711902e-05, "loss": 0.1291, "step": 5300 }, { "epoch": 0.4805321146580394, "grad_norm": 0.34784960746765137, "learning_rate": 1.1114443346038591e-05, "loss": 0.1264, "step": 5310 }, { "epoch": 0.48143707155946697, "grad_norm": 0.36275964975357056, "learning_rate": 1.1085309820153456e-05, "loss": 0.1247, "step": 5320 }, { "epoch": 0.4823420284608945, "grad_norm": 0.33047980070114136, "learning_rate": 1.1056166969765991e-05, "loss": 0.1277, "step": 5330 }, { "epoch": 0.48324698536232213, "grad_norm": 0.3921981155872345, "learning_rate": 1.1027015045258694e-05, "loss": 0.1345, "step": 5340 }, { "epoch": 0.4841519422637497, "grad_norm": 0.3603346347808838, "learning_rate": 1.0997854297092011e-05, "loss": 0.1232, "step": 5350 }, { "epoch": 0.48505689916517725, "grad_norm": 0.5657104253768921, "learning_rate": 1.0968684975802206e-05, "loss": 0.1316, "step": 5360 }, { "epoch": 0.4859618560666048, "grad_norm": 0.28306034207344055, "learning_rate": 1.0939507331999195e-05, "loss": 0.1312, "step": 5370 }, { "epoch": 0.4868668129680324, "grad_norm": 0.33064553141593933, "learning_rate": 1.0910321616364397e-05, "loss": 0.1347, "step": 5380 }, { "epoch": 0.48777176986945997, "grad_norm": 0.39418330788612366, "learning_rate": 1.0881128079648586e-05, "loss": 0.1263, "step": 5390 }, { "epoch": 0.4886767267708875, "grad_norm": 0.3185255527496338, "learning_rate": 1.0851926972669722e-05, "loss": 0.1399, "step": 5400 }, { "epoch": 0.4895816836723151, "grad_norm": 0.4073029160499573, "learning_rate": 1.0822718546310816e-05, "loss": 0.1303, "step": 5410 }, { "epoch": 0.4904866405737427, "grad_norm": 0.49370768666267395, "learning_rate": 1.0793503051517758e-05, "loss": 0.1249, "step": 5420 }, { "epoch": 0.49139159747517025, "grad_norm": 0.2705213725566864, "learning_rate": 1.0764280739297163e-05, "loss": 0.1341, "step": 5430 }, { "epoch": 0.4922965543765978, "grad_norm": 0.2689199447631836, "learning_rate": 1.0735051860714231e-05, "loss": 0.1306, "step": 5440 }, { "epoch": 0.49320151127802536, "grad_norm": 0.27751344442367554, "learning_rate": 1.0705816666890561e-05, "loss": 0.1285, "step": 5450 }, { "epoch": 0.494106468179453, "grad_norm": 0.3316192030906677, "learning_rate": 1.0676575409002024e-05, "loss": 0.1362, "step": 5460 }, { "epoch": 0.49501142508088053, "grad_norm": 0.3556595742702484, "learning_rate": 1.064732833827658e-05, "loss": 0.1271, "step": 5470 }, { "epoch": 0.4959163819823081, "grad_norm": 0.2501612603664398, "learning_rate": 1.0618075705992138e-05, "loss": 0.1307, "step": 5480 }, { "epoch": 0.49682133888373564, "grad_norm": 0.32025519013404846, "learning_rate": 1.0588817763474388e-05, "loss": 0.1271, "step": 5490 }, { "epoch": 0.49772629578516325, "grad_norm": 0.3633834421634674, "learning_rate": 1.0559554762094637e-05, "loss": 0.1221, "step": 5500 }, { "epoch": 0.4986312526865908, "grad_norm": 0.3367408514022827, "learning_rate": 1.0530286953267665e-05, "loss": 0.1288, "step": 5510 }, { "epoch": 0.49953620958801837, "grad_norm": 0.36954525113105774, "learning_rate": 1.050101458844955e-05, "loss": 0.1247, "step": 5520 }, { "epoch": 0.5004411664894459, "grad_norm": 0.46406790614128113, "learning_rate": 1.047173791913551e-05, "loss": 0.1312, "step": 5530 }, { "epoch": 0.5013461233908735, "grad_norm": 0.32501330971717834, "learning_rate": 1.044245719685775e-05, "loss": 0.1323, "step": 5540 }, { "epoch": 0.502251080292301, "grad_norm": 0.31969472765922546, "learning_rate": 1.0413172673183298e-05, "loss": 0.1343, "step": 5550 }, { "epoch": 0.5031560371937287, "grad_norm": 0.2659285068511963, "learning_rate": 1.0383884599711838e-05, "loss": 0.1244, "step": 5560 }, { "epoch": 0.5040609940951563, "grad_norm": 0.4111124873161316, "learning_rate": 1.035459322807355e-05, "loss": 0.1289, "step": 5570 }, { "epoch": 0.5049659509965838, "grad_norm": 0.36066919565200806, "learning_rate": 1.0325298809926962e-05, "loss": 0.1297, "step": 5580 }, { "epoch": 0.5058709078980114, "grad_norm": 0.30882957577705383, "learning_rate": 1.029600159695676e-05, "loss": 0.1256, "step": 5590 }, { "epoch": 0.5067758647994389, "grad_norm": 0.30170318484306335, "learning_rate": 1.0266701840871657e-05, "loss": 0.1266, "step": 5600 }, { "epoch": 0.5076808217008665, "grad_norm": 0.39970946311950684, "learning_rate": 1.0237399793402203e-05, "loss": 0.1334, "step": 5610 }, { "epoch": 0.508585778602294, "grad_norm": 0.24716977775096893, "learning_rate": 1.0208095706298643e-05, "loss": 0.1277, "step": 5620 }, { "epoch": 0.5094907355037216, "grad_norm": 0.3430007994174957, "learning_rate": 1.017878983132874e-05, "loss": 0.1315, "step": 5630 }, { "epoch": 0.5103956924051493, "grad_norm": 0.37827351689338684, "learning_rate": 1.0149482420275623e-05, "loss": 0.1288, "step": 5640 }, { "epoch": 0.5113006493065768, "grad_norm": 0.2566516101360321, "learning_rate": 1.0120173724935614e-05, "loss": 0.1331, "step": 5650 }, { "epoch": 0.5122056062080044, "grad_norm": 0.327240914106369, "learning_rate": 1.0090863997116066e-05, "loss": 0.1267, "step": 5660 }, { "epoch": 0.5131105631094319, "grad_norm": 0.49574097990989685, "learning_rate": 1.0061553488633217e-05, "loss": 0.129, "step": 5670 }, { "epoch": 0.5140155200108595, "grad_norm": 0.33093881607055664, "learning_rate": 1.0032242451309996e-05, "loss": 0.1289, "step": 5680 }, { "epoch": 0.514920476912287, "grad_norm": 0.30682793259620667, "learning_rate": 1.0002931136973881e-05, "loss": 0.1289, "step": 5690 }, { "epoch": 0.5158254338137146, "grad_norm": 0.32405000925064087, "learning_rate": 9.973619797454734e-06, "loss": 0.1298, "step": 5700 }, { "epoch": 0.5167303907151422, "grad_norm": 0.34693455696105957, "learning_rate": 9.944308684582627e-06, "loss": 0.129, "step": 5710 }, { "epoch": 0.5176353476165697, "grad_norm": 0.44524097442626953, "learning_rate": 9.914998050185693e-06, "loss": 0.1252, "step": 5720 }, { "epoch": 0.5185403045179974, "grad_norm": 0.346175879240036, "learning_rate": 9.885688146087945e-06, "loss": 0.1324, "step": 5730 }, { "epoch": 0.5194452614194249, "grad_norm": 0.32724258303642273, "learning_rate": 9.856379224107124e-06, "loss": 0.1222, "step": 5740 }, { "epoch": 0.5203502183208525, "grad_norm": 0.3304066061973572, "learning_rate": 9.827071536052536e-06, "loss": 0.1297, "step": 5750 }, { "epoch": 0.52125517522228, "grad_norm": 0.3203341066837311, "learning_rate": 9.797765333722888e-06, "loss": 0.1315, "step": 5760 }, { "epoch": 0.5221601321237076, "grad_norm": 0.33740121126174927, "learning_rate": 9.768460868904112e-06, "loss": 0.1281, "step": 5770 }, { "epoch": 0.5230650890251352, "grad_norm": 0.3358958065509796, "learning_rate": 9.739158393367229e-06, "loss": 0.1298, "step": 5780 }, { "epoch": 0.5239700459265627, "grad_norm": 0.3097865581512451, "learning_rate": 9.709858158866147e-06, "loss": 0.1219, "step": 5790 }, { "epoch": 0.5248750028279903, "grad_norm": 0.3808821141719818, "learning_rate": 9.680560417135538e-06, "loss": 0.1244, "step": 5800 }, { "epoch": 0.5257799597294179, "grad_norm": 0.3200414478778839, "learning_rate": 9.651265419888651e-06, "loss": 0.1312, "step": 5810 }, { "epoch": 0.5266849166308455, "grad_norm": 0.38478338718414307, "learning_rate": 9.621973418815154e-06, "loss": 0.1324, "step": 5820 }, { "epoch": 0.527589873532273, "grad_norm": 0.39911043643951416, "learning_rate": 9.592684665578978e-06, "loss": 0.1246, "step": 5830 }, { "epoch": 0.5284948304337006, "grad_norm": 0.328489750623703, "learning_rate": 9.563399411816141e-06, "loss": 0.1276, "step": 5840 }, { "epoch": 0.5293997873351282, "grad_norm": 0.4059896171092987, "learning_rate": 9.534117909132606e-06, "loss": 0.136, "step": 5850 }, { "epoch": 0.5303047442365557, "grad_norm": 0.3130931854248047, "learning_rate": 9.5048404091021e-06, "loss": 0.1286, "step": 5860 }, { "epoch": 0.5312097011379833, "grad_norm": 0.3940030634403229, "learning_rate": 9.475567163263968e-06, "loss": 0.1315, "step": 5870 }, { "epoch": 0.5321146580394108, "grad_norm": 0.40726789832115173, "learning_rate": 9.446298423120995e-06, "loss": 0.1289, "step": 5880 }, { "epoch": 0.5330196149408385, "grad_norm": 0.5030715465545654, "learning_rate": 9.417034440137264e-06, "loss": 0.1305, "step": 5890 }, { "epoch": 0.533924571842266, "grad_norm": 0.3696930706501007, "learning_rate": 9.387775465735987e-06, "loss": 0.1337, "step": 5900 }, { "epoch": 0.5348295287436936, "grad_norm": 0.2996613681316376, "learning_rate": 9.358521751297336e-06, "loss": 0.1296, "step": 5910 }, { "epoch": 0.5357344856451212, "grad_norm": 0.30213144421577454, "learning_rate": 9.329273548156305e-06, "loss": 0.1284, "step": 5920 }, { "epoch": 0.5366394425465487, "grad_norm": 0.35034891963005066, "learning_rate": 9.300031107600519e-06, "loss": 0.1258, "step": 5930 }, { "epoch": 0.5375443994479763, "grad_norm": 0.4542511999607086, "learning_rate": 9.270794680868108e-06, "loss": 0.1384, "step": 5940 }, { "epoch": 0.5384493563494038, "grad_norm": 0.4177015423774719, "learning_rate": 9.241564519145529e-06, "loss": 0.1308, "step": 5950 }, { "epoch": 0.5393543132508314, "grad_norm": 0.3148910701274872, "learning_rate": 9.212340873565417e-06, "loss": 0.1256, "step": 5960 }, { "epoch": 0.540259270152259, "grad_norm": 0.3220025300979614, "learning_rate": 9.183123995204419e-06, "loss": 0.1356, "step": 5970 }, { "epoch": 0.5411642270536866, "grad_norm": 0.37302547693252563, "learning_rate": 9.153914135081037e-06, "loss": 0.1319, "step": 5980 }, { "epoch": 0.5420691839551142, "grad_norm": 0.2666574716567993, "learning_rate": 9.12471154415348e-06, "loss": 0.1271, "step": 5990 }, { "epoch": 0.5429741408565417, "grad_norm": 0.333808034658432, "learning_rate": 9.095516473317506e-06, "loss": 0.1249, "step": 6000 }, { "epoch": 0.5438790977579693, "grad_norm": 0.29716721177101135, "learning_rate": 9.066329173404267e-06, "loss": 0.1331, "step": 6010 }, { "epoch": 0.5447840546593968, "grad_norm": 0.3095798194408417, "learning_rate": 9.037149895178132e-06, "loss": 0.1219, "step": 6020 }, { "epoch": 0.5456890115608244, "grad_norm": 0.3963325619697571, "learning_rate": 9.007978889334573e-06, "loss": 0.1281, "step": 6030 }, { "epoch": 0.546593968462252, "grad_norm": 0.2934766113758087, "learning_rate": 8.978816406497977e-06, "loss": 0.1254, "step": 6040 }, { "epoch": 0.5474989253636795, "grad_norm": 0.3293272852897644, "learning_rate": 8.949662697219507e-06, "loss": 0.132, "step": 6050 }, { "epoch": 0.5484038822651072, "grad_norm": 0.47501856088638306, "learning_rate": 8.920518011974955e-06, "loss": 0.134, "step": 6060 }, { "epoch": 0.5493088391665347, "grad_norm": 0.38773998618125916, "learning_rate": 8.891382601162571e-06, "loss": 0.1347, "step": 6070 }, { "epoch": 0.5502137960679623, "grad_norm": 0.30083420872688293, "learning_rate": 8.862256715100926e-06, "loss": 0.1232, "step": 6080 }, { "epoch": 0.5511187529693898, "grad_norm": 0.32377147674560547, "learning_rate": 8.833140604026763e-06, "loss": 0.1244, "step": 6090 }, { "epoch": 0.5520237098708174, "grad_norm": 1.7971711158752441, "learning_rate": 8.804034518092846e-06, "loss": 0.1269, "step": 6100 }, { "epoch": 0.552928666772245, "grad_norm": 0.28756287693977356, "learning_rate": 8.7749387073658e-06, "loss": 0.1285, "step": 6110 }, { "epoch": 0.5538336236736725, "grad_norm": 0.3303268551826477, "learning_rate": 8.745853421823965e-06, "loss": 0.131, "step": 6120 }, { "epoch": 0.5547385805751001, "grad_norm": 0.30345699191093445, "learning_rate": 8.716778911355266e-06, "loss": 0.1344, "step": 6130 }, { "epoch": 0.5556435374765277, "grad_norm": 0.33741581439971924, "learning_rate": 8.687715425755047e-06, "loss": 0.125, "step": 6140 }, { "epoch": 0.5565484943779553, "grad_norm": 0.36916887760162354, "learning_rate": 8.65866321472393e-06, "loss": 0.1391, "step": 6150 }, { "epoch": 0.5574534512793828, "grad_norm": 0.2726483941078186, "learning_rate": 8.62962252786568e-06, "loss": 0.1274, "step": 6160 }, { "epoch": 0.5583584081808104, "grad_norm": 0.6223147511482239, "learning_rate": 8.600593614685035e-06, "loss": 0.1222, "step": 6170 }, { "epoch": 0.559263365082238, "grad_norm": 0.285051554441452, "learning_rate": 8.571576724585589e-06, "loss": 0.1316, "step": 6180 }, { "epoch": 0.5601683219836655, "grad_norm": 0.2996697723865509, "learning_rate": 8.542572106867643e-06, "loss": 0.1221, "step": 6190 }, { "epoch": 0.5610732788850931, "grad_norm": 0.4449019730091095, "learning_rate": 8.513580010726052e-06, "loss": 0.136, "step": 6200 }, { "epoch": 0.5619782357865206, "grad_norm": 0.3610653579235077, "learning_rate": 8.484600685248089e-06, "loss": 0.1349, "step": 6210 }, { "epoch": 0.5628831926879483, "grad_norm": 0.23445254564285278, "learning_rate": 8.455634379411314e-06, "loss": 0.1304, "step": 6220 }, { "epoch": 0.5637881495893758, "grad_norm": 0.4748559594154358, "learning_rate": 8.426681342081428e-06, "loss": 0.1281, "step": 6230 }, { "epoch": 0.5646931064908034, "grad_norm": 0.28520044684410095, "learning_rate": 8.397741822010128e-06, "loss": 0.1318, "step": 6240 }, { "epoch": 0.565598063392231, "grad_norm": 0.31549420952796936, "learning_rate": 8.368816067832986e-06, "loss": 0.1259, "step": 6250 }, { "epoch": 0.5665030202936585, "grad_norm": 0.3754958510398865, "learning_rate": 8.339904328067289e-06, "loss": 0.1317, "step": 6260 }, { "epoch": 0.5674079771950861, "grad_norm": 0.3160054087638855, "learning_rate": 8.311006851109939e-06, "loss": 0.1311, "step": 6270 }, { "epoch": 0.5683129340965136, "grad_norm": 0.3136991560459137, "learning_rate": 8.282123885235276e-06, "loss": 0.1349, "step": 6280 }, { "epoch": 0.5692178909979412, "grad_norm": 0.38889384269714355, "learning_rate": 8.253255678592985e-06, "loss": 0.1349, "step": 6290 }, { "epoch": 0.5701228478993688, "grad_norm": 0.3792050778865814, "learning_rate": 8.224402479205941e-06, "loss": 0.1237, "step": 6300 }, { "epoch": 0.5710278048007964, "grad_norm": 0.35518279671669006, "learning_rate": 8.195564534968074e-06, "loss": 0.1232, "step": 6310 }, { "epoch": 0.571932761702224, "grad_norm": 0.2767059803009033, "learning_rate": 8.166742093642263e-06, "loss": 0.1265, "step": 6320 }, { "epoch": 0.5728377186036515, "grad_norm": 0.37054571509361267, "learning_rate": 8.137935402858182e-06, "loss": 0.1288, "step": 6330 }, { "epoch": 0.5737426755050791, "grad_norm": 0.3450683653354645, "learning_rate": 8.10914471011019e-06, "loss": 0.1339, "step": 6340 }, { "epoch": 0.5746476324065066, "grad_norm": 0.29977867007255554, "learning_rate": 8.080370262755191e-06, "loss": 0.126, "step": 6350 }, { "epoch": 0.5755525893079342, "grad_norm": 0.34314143657684326, "learning_rate": 8.051612308010526e-06, "loss": 0.1283, "step": 6360 }, { "epoch": 0.5764575462093617, "grad_norm": 0.37054121494293213, "learning_rate": 8.022871092951827e-06, "loss": 0.1292, "step": 6370 }, { "epoch": 0.5773625031107893, "grad_norm": 0.37676891684532166, "learning_rate": 7.994146864510912e-06, "loss": 0.1285, "step": 6380 }, { "epoch": 0.578267460012217, "grad_norm": 0.26649826765060425, "learning_rate": 7.965439869473664e-06, "loss": 0.1261, "step": 6390 }, { "epoch": 0.5791724169136445, "grad_norm": 0.38938188552856445, "learning_rate": 7.936750354477891e-06, "loss": 0.1272, "step": 6400 }, { "epoch": 0.5800773738150721, "grad_norm": 0.32541313767433167, "learning_rate": 7.908078566011227e-06, "loss": 0.1233, "step": 6410 }, { "epoch": 0.5809823307164996, "grad_norm": 0.36433953046798706, "learning_rate": 7.879424750409007e-06, "loss": 0.1314, "step": 6420 }, { "epoch": 0.5818872876179272, "grad_norm": 0.3940136432647705, "learning_rate": 7.850789153852157e-06, "loss": 0.1373, "step": 6430 }, { "epoch": 0.5827922445193547, "grad_norm": 0.3312411904335022, "learning_rate": 7.822172022365059e-06, "loss": 0.1258, "step": 6440 }, { "epoch": 0.5836972014207823, "grad_norm": 0.5461381077766418, "learning_rate": 7.793573601813467e-06, "loss": 0.1275, "step": 6450 }, { "epoch": 0.5846021583222099, "grad_norm": 0.41021519899368286, "learning_rate": 7.764994137902366e-06, "loss": 0.1305, "step": 6460 }, { "epoch": 0.5855071152236375, "grad_norm": 0.5024427175521851, "learning_rate": 7.736433876173879e-06, "loss": 0.1264, "step": 6470 }, { "epoch": 0.5864120721250651, "grad_norm": 0.3114100992679596, "learning_rate": 7.70789306200516e-06, "loss": 0.1328, "step": 6480 }, { "epoch": 0.5873170290264926, "grad_norm": 0.3421667814254761, "learning_rate": 7.679371940606265e-06, "loss": 0.1336, "step": 6490 }, { "epoch": 0.5882219859279202, "grad_norm": 0.4376727044582367, "learning_rate": 7.650870757018061e-06, "loss": 0.1277, "step": 6500 }, { "epoch": 0.5891269428293477, "grad_norm": 0.26968899369239807, "learning_rate": 7.622389756110126e-06, "loss": 0.1281, "step": 6510 }, { "epoch": 0.5900318997307753, "grad_norm": 0.3418639004230499, "learning_rate": 7.593929182578634e-06, "loss": 0.1321, "step": 6520 }, { "epoch": 0.5909368566322029, "grad_norm": 0.3123999536037445, "learning_rate": 7.565489280944256e-06, "loss": 0.1257, "step": 6530 }, { "epoch": 0.5918418135336304, "grad_norm": 0.39082077145576477, "learning_rate": 7.537070295550051e-06, "loss": 0.1303, "step": 6540 }, { "epoch": 0.5927467704350581, "grad_norm": 0.32185035943984985, "learning_rate": 7.508672470559385e-06, "loss": 0.1278, "step": 6550 }, { "epoch": 0.5936517273364856, "grad_norm": 0.6370688080787659, "learning_rate": 7.480296049953823e-06, "loss": 0.132, "step": 6560 }, { "epoch": 0.5945566842379132, "grad_norm": 0.44037026166915894, "learning_rate": 7.451941277531025e-06, "loss": 0.1264, "step": 6570 }, { "epoch": 0.5954616411393407, "grad_norm": 0.301471471786499, "learning_rate": 7.423608396902673e-06, "loss": 0.1261, "step": 6580 }, { "epoch": 0.5963665980407683, "grad_norm": 0.3698810338973999, "learning_rate": 7.395297651492346e-06, "loss": 0.1262, "step": 6590 }, { "epoch": 0.5972715549421959, "grad_norm": 0.27682480216026306, "learning_rate": 7.36700928453346e-06, "loss": 0.1301, "step": 6600 }, { "epoch": 0.5981765118436234, "grad_norm": 0.3122202455997467, "learning_rate": 7.338743539067163e-06, "loss": 0.1325, "step": 6610 }, { "epoch": 0.599081468745051, "grad_norm": 0.3500480651855469, "learning_rate": 7.310500657940253e-06, "loss": 0.1332, "step": 6620 }, { "epoch": 0.5999864256464786, "grad_norm": 0.3293434977531433, "learning_rate": 7.282280883803073e-06, "loss": 0.1337, "step": 6630 }, { "epoch": 0.6008913825479062, "grad_norm": 0.2992168366909027, "learning_rate": 7.254084459107453e-06, "loss": 0.1336, "step": 6640 }, { "epoch": 0.6017963394493338, "grad_norm": 0.3215314745903015, "learning_rate": 7.225911626104621e-06, "loss": 0.1283, "step": 6650 }, { "epoch": 0.6027012963507613, "grad_norm": 0.3701172173023224, "learning_rate": 7.1977626268430965e-06, "loss": 0.1219, "step": 6660 }, { "epoch": 0.6036062532521889, "grad_norm": 0.3572734594345093, "learning_rate": 7.1696377031666495e-06, "loss": 0.1204, "step": 6670 }, { "epoch": 0.6045112101536164, "grad_norm": 0.37793678045272827, "learning_rate": 7.1415370967121896e-06, "loss": 0.1253, "step": 6680 }, { "epoch": 0.605416167055044, "grad_norm": 0.2588195204734802, "learning_rate": 7.113461048907711e-06, "loss": 0.1247, "step": 6690 }, { "epoch": 0.6063211239564715, "grad_norm": 0.3132305145263672, "learning_rate": 7.085409800970203e-06, "loss": 0.1307, "step": 6700 }, { "epoch": 0.6072260808578992, "grad_norm": 0.36036190390586853, "learning_rate": 7.0573835939035974e-06, "loss": 0.1322, "step": 6710 }, { "epoch": 0.6081310377593268, "grad_norm": 0.39142096042633057, "learning_rate": 7.029382668496679e-06, "loss": 0.1218, "step": 6720 }, { "epoch": 0.6090359946607543, "grad_norm": 0.33356091380119324, "learning_rate": 7.001407265321019e-06, "loss": 0.1268, "step": 6730 }, { "epoch": 0.6099409515621819, "grad_norm": 0.3158833086490631, "learning_rate": 6.973457624728922e-06, "loss": 0.1248, "step": 6740 }, { "epoch": 0.6108459084636094, "grad_norm": 0.35817354917526245, "learning_rate": 6.945533986851345e-06, "loss": 0.1304, "step": 6750 }, { "epoch": 0.611750865365037, "grad_norm": 0.28623586893081665, "learning_rate": 6.917636591595849e-06, "loss": 0.1243, "step": 6760 }, { "epoch": 0.6126558222664645, "grad_norm": 0.39012181758880615, "learning_rate": 6.8897656786445166e-06, "loss": 0.1213, "step": 6770 }, { "epoch": 0.6135607791678921, "grad_norm": 0.44978418946266174, "learning_rate": 6.861921487451922e-06, "loss": 0.1234, "step": 6780 }, { "epoch": 0.6144657360693196, "grad_norm": 0.34891191124916077, "learning_rate": 6.834104257243043e-06, "loss": 0.1275, "step": 6790 }, { "epoch": 0.6153706929707473, "grad_norm": 0.29933297634124756, "learning_rate": 6.806314227011235e-06, "loss": 0.1307, "step": 6800 }, { "epoch": 0.6162756498721749, "grad_norm": 0.3456212282180786, "learning_rate": 6.778551635516157e-06, "loss": 0.1273, "step": 6810 }, { "epoch": 0.6171806067736024, "grad_norm": 0.32694172859191895, "learning_rate": 6.750816721281719e-06, "loss": 0.1278, "step": 6820 }, { "epoch": 0.61808556367503, "grad_norm": 0.3350003659725189, "learning_rate": 6.7231097225940475e-06, "loss": 0.1318, "step": 6830 }, { "epoch": 0.6189905205764575, "grad_norm": 0.3554823100566864, "learning_rate": 6.695430877499434e-06, "loss": 0.1282, "step": 6840 }, { "epoch": 0.6198954774778851, "grad_norm": 0.4120415449142456, "learning_rate": 6.6677804238022806e-06, "loss": 0.1311, "step": 6850 }, { "epoch": 0.6208004343793126, "grad_norm": 0.3582000732421875, "learning_rate": 6.640158599063069e-06, "loss": 0.1223, "step": 6860 }, { "epoch": 0.6217053912807402, "grad_norm": 0.3458595275878906, "learning_rate": 6.612565640596307e-06, "loss": 0.1294, "step": 6870 }, { "epoch": 0.6226103481821679, "grad_norm": 0.2830416262149811, "learning_rate": 6.585001785468497e-06, "loss": 0.1273, "step": 6880 }, { "epoch": 0.6235153050835954, "grad_norm": 0.32797959446907043, "learning_rate": 6.5574672704961025e-06, "loss": 0.1284, "step": 6890 }, { "epoch": 0.624420261985023, "grad_norm": 0.3323483467102051, "learning_rate": 6.529962332243509e-06, "loss": 0.1258, "step": 6900 }, { "epoch": 0.6253252188864505, "grad_norm": 0.2794325649738312, "learning_rate": 6.5024872070209936e-06, "loss": 0.1323, "step": 6910 }, { "epoch": 0.6262301757878781, "grad_norm": 0.2866572439670563, "learning_rate": 6.4750421308826795e-06, "loss": 0.1269, "step": 6920 }, { "epoch": 0.6271351326893057, "grad_norm": 0.3717053532600403, "learning_rate": 6.447627339624538e-06, "loss": 0.1257, "step": 6930 }, { "epoch": 0.6280400895907332, "grad_norm": 0.3445277512073517, "learning_rate": 6.4202430687823416e-06, "loss": 0.133, "step": 6940 }, { "epoch": 0.6289450464921608, "grad_norm": 0.4389108419418335, "learning_rate": 6.39288955362964e-06, "loss": 0.1271, "step": 6950 }, { "epoch": 0.6298500033935884, "grad_norm": 0.38752782344818115, "learning_rate": 6.365567029175747e-06, "loss": 0.1306, "step": 6960 }, { "epoch": 0.630754960295016, "grad_norm": 0.4342532157897949, "learning_rate": 6.338275730163715e-06, "loss": 0.1286, "step": 6970 }, { "epoch": 0.6316599171964435, "grad_norm": 0.3530200719833374, "learning_rate": 6.311015891068328e-06, "loss": 0.1239, "step": 6980 }, { "epoch": 0.6325648740978711, "grad_norm": 0.33301499485969543, "learning_rate": 6.283787746094077e-06, "loss": 0.1311, "step": 6990 }, { "epoch": 0.6334698309992987, "grad_norm": 0.3174525201320648, "learning_rate": 6.256591529173148e-06, "loss": 0.1318, "step": 7000 }, { "epoch": 0.6343747879007262, "grad_norm": 0.27925947308540344, "learning_rate": 6.229427473963416e-06, "loss": 0.1291, "step": 7010 }, { "epoch": 0.6352797448021538, "grad_norm": 0.36644554138183594, "learning_rate": 6.20229581384644e-06, "loss": 0.119, "step": 7020 }, { "epoch": 0.6361847017035813, "grad_norm": 0.4176923930644989, "learning_rate": 6.1751967819254545e-06, "loss": 0.126, "step": 7030 }, { "epoch": 0.637089658605009, "grad_norm": 0.30893582105636597, "learning_rate": 6.148130611023361e-06, "loss": 0.1283, "step": 7040 }, { "epoch": 0.6379946155064365, "grad_norm": 0.37425440549850464, "learning_rate": 6.121097533680745e-06, "loss": 0.1265, "step": 7050 }, { "epoch": 0.6388995724078641, "grad_norm": 0.2834739089012146, "learning_rate": 6.094097782153853e-06, "loss": 0.1311, "step": 7060 }, { "epoch": 0.6398045293092917, "grad_norm": 0.3139822781085968, "learning_rate": 6.0671315884126225e-06, "loss": 0.1231, "step": 7070 }, { "epoch": 0.6407094862107192, "grad_norm": 0.2844420075416565, "learning_rate": 6.040199184138668e-06, "loss": 0.129, "step": 7080 }, { "epoch": 0.6416144431121468, "grad_norm": 0.35503455996513367, "learning_rate": 6.013300800723312e-06, "loss": 0.1311, "step": 7090 }, { "epoch": 0.6425194000135743, "grad_norm": 0.35351240634918213, "learning_rate": 5.986436669265568e-06, "loss": 0.1331, "step": 7100 }, { "epoch": 0.6434243569150019, "grad_norm": 0.3031436800956726, "learning_rate": 5.959607020570184e-06, "loss": 0.1305, "step": 7110 }, { "epoch": 0.6443293138164294, "grad_norm": 0.23189416527748108, "learning_rate": 5.932812085145647e-06, "loss": 0.1235, "step": 7120 }, { "epoch": 0.6452342707178571, "grad_norm": 0.2822563946247101, "learning_rate": 5.906052093202199e-06, "loss": 0.1269, "step": 7130 }, { "epoch": 0.6461392276192847, "grad_norm": 0.28259310126304626, "learning_rate": 5.879327274649868e-06, "loss": 0.1273, "step": 7140 }, { "epoch": 0.6470441845207122, "grad_norm": 0.33720916509628296, "learning_rate": 5.852637859096475e-06, "loss": 0.1345, "step": 7150 }, { "epoch": 0.6479491414221398, "grad_norm": 0.3332005441188812, "learning_rate": 5.825984075845691e-06, "loss": 0.1248, "step": 7160 }, { "epoch": 0.6488540983235673, "grad_norm": 0.3517606258392334, "learning_rate": 5.799366153895037e-06, "loss": 0.1288, "step": 7170 }, { "epoch": 0.6497590552249949, "grad_norm": 0.265109121799469, "learning_rate": 5.772784321933939e-06, "loss": 0.1329, "step": 7180 }, { "epoch": 0.6506640121264224, "grad_norm": 0.3004017472267151, "learning_rate": 5.746238808341751e-06, "loss": 0.1252, "step": 7190 }, { "epoch": 0.65156896902785, "grad_norm": 0.34260621666908264, "learning_rate": 5.719729841185786e-06, "loss": 0.1267, "step": 7200 }, { "epoch": 0.6524739259292777, "grad_norm": 0.24408110976219177, "learning_rate": 5.693257648219379e-06, "loss": 0.1296, "step": 7210 }, { "epoch": 0.6533788828307052, "grad_norm": 0.2951405346393585, "learning_rate": 5.666822456879918e-06, "loss": 0.1231, "step": 7220 }, { "epoch": 0.6542838397321328, "grad_norm": 0.32168251276016235, "learning_rate": 5.640424494286878e-06, "loss": 0.1298, "step": 7230 }, { "epoch": 0.6551887966335603, "grad_norm": 0.2844177186489105, "learning_rate": 5.614063987239885e-06, "loss": 0.1289, "step": 7240 }, { "epoch": 0.6560937535349879, "grad_norm": 0.5491533875465393, "learning_rate": 5.587741162216768e-06, "loss": 0.1313, "step": 7250 }, { "epoch": 0.6569987104364154, "grad_norm": 0.298951119184494, "learning_rate": 5.561456245371608e-06, "loss": 0.125, "step": 7260 }, { "epoch": 0.657903667337843, "grad_norm": 0.3288751542568207, "learning_rate": 5.535209462532792e-06, "loss": 0.1296, "step": 7270 }, { "epoch": 0.6588086242392706, "grad_norm": 0.43045946955680847, "learning_rate": 5.509001039201085e-06, "loss": 0.1263, "step": 7280 }, { "epoch": 0.6597135811406982, "grad_norm": 0.2680876851081848, "learning_rate": 5.482831200547667e-06, "loss": 0.1324, "step": 7290 }, { "epoch": 0.6606185380421258, "grad_norm": 0.3521096110343933, "learning_rate": 5.456700171412231e-06, "loss": 0.1204, "step": 7300 }, { "epoch": 0.6615234949435533, "grad_norm": 0.2706452012062073, "learning_rate": 5.430608176301036e-06, "loss": 0.1269, "step": 7310 }, { "epoch": 0.6624284518449809, "grad_norm": 0.3557042181491852, "learning_rate": 5.4045554393849635e-06, "loss": 0.132, "step": 7320 }, { "epoch": 0.6633334087464084, "grad_norm": 0.3670320510864258, "learning_rate": 5.378542184497623e-06, "loss": 0.1257, "step": 7330 }, { "epoch": 0.664238365647836, "grad_norm": 0.31355276703834534, "learning_rate": 5.3525686351333976e-06, "loss": 0.1275, "step": 7340 }, { "epoch": 0.6651433225492636, "grad_norm": 0.30157995223999023, "learning_rate": 5.326635014445547e-06, "loss": 0.1291, "step": 7350 }, { "epoch": 0.6660482794506911, "grad_norm": 0.2899110019207001, "learning_rate": 5.300741545244279e-06, "loss": 0.1311, "step": 7360 }, { "epoch": 0.6669532363521188, "grad_norm": 0.34780648350715637, "learning_rate": 5.274888449994843e-06, "loss": 0.1294, "step": 7370 }, { "epoch": 0.6678581932535463, "grad_norm": 0.2876949608325958, "learning_rate": 5.2490759508155975e-06, "loss": 0.1303, "step": 7380 }, { "epoch": 0.6687631501549739, "grad_norm": 0.3087066113948822, "learning_rate": 5.223304269476137e-06, "loss": 0.1255, "step": 7390 }, { "epoch": 0.6696681070564015, "grad_norm": 0.38678351044654846, "learning_rate": 5.19757362739535e-06, "loss": 0.1288, "step": 7400 }, { "epoch": 0.670573063957829, "grad_norm": 0.2943480610847473, "learning_rate": 5.171884245639545e-06, "loss": 0.1284, "step": 7410 }, { "epoch": 0.6714780208592566, "grad_norm": 0.28372156620025635, "learning_rate": 5.146236344920542e-06, "loss": 0.1292, "step": 7420 }, { "epoch": 0.6723829777606841, "grad_norm": 0.2806905508041382, "learning_rate": 5.12063014559376e-06, "loss": 0.1272, "step": 7430 }, { "epoch": 0.6732879346621117, "grad_norm": 0.3286825120449066, "learning_rate": 5.095065867656351e-06, "loss": 0.1205, "step": 7440 }, { "epoch": 0.6741928915635392, "grad_norm": 0.33701106905937195, "learning_rate": 5.0695437307452945e-06, "loss": 0.1312, "step": 7450 }, { "epoch": 0.6750978484649669, "grad_norm": 0.3478999137878418, "learning_rate": 5.044063954135508e-06, "loss": 0.1284, "step": 7460 }, { "epoch": 0.6760028053663945, "grad_norm": 0.28950104117393494, "learning_rate": 5.018626756737979e-06, "loss": 0.1267, "step": 7470 }, { "epoch": 0.676907762267822, "grad_norm": 0.3087421655654907, "learning_rate": 4.9932323570978605e-06, "loss": 0.1254, "step": 7480 }, { "epoch": 0.6778127191692496, "grad_norm": 0.34977859258651733, "learning_rate": 4.967880973392607e-06, "loss": 0.1293, "step": 7490 }, { "epoch": 0.6787176760706771, "grad_norm": 0.3207535147666931, "learning_rate": 4.942572823430107e-06, "loss": 0.1268, "step": 7500 }, { "epoch": 0.6796226329721047, "grad_norm": 0.34587785601615906, "learning_rate": 4.917308124646802e-06, "loss": 0.1272, "step": 7510 }, { "epoch": 0.6805275898735322, "grad_norm": 0.338029146194458, "learning_rate": 4.892087094105818e-06, "loss": 0.1208, "step": 7520 }, { "epoch": 0.6814325467749598, "grad_norm": 0.3155430555343628, "learning_rate": 4.866909948495101e-06, "loss": 0.1234, "step": 7530 }, { "epoch": 0.6823375036763875, "grad_norm": 0.27973029017448425, "learning_rate": 4.841776904125559e-06, "loss": 0.1301, "step": 7540 }, { "epoch": 0.683242460577815, "grad_norm": 0.3526993989944458, "learning_rate": 4.816688176929207e-06, "loss": 0.1258, "step": 7550 }, { "epoch": 0.6841474174792426, "grad_norm": 0.3077426850795746, "learning_rate": 4.791643982457293e-06, "loss": 0.1235, "step": 7560 }, { "epoch": 0.6850523743806701, "grad_norm": 0.31144073605537415, "learning_rate": 4.766644535878476e-06, "loss": 0.1226, "step": 7570 }, { "epoch": 0.6859573312820977, "grad_norm": 0.32703807950019836, "learning_rate": 4.741690051976946e-06, "loss": 0.1265, "step": 7580 }, { "epoch": 0.6868622881835252, "grad_norm": 0.4660681486129761, "learning_rate": 4.716780745150602e-06, "loss": 0.1323, "step": 7590 }, { "epoch": 0.6877672450849528, "grad_norm": 0.27197226881980896, "learning_rate": 4.6919168294092e-06, "loss": 0.1319, "step": 7600 }, { "epoch": 0.6886722019863804, "grad_norm": 0.3342832624912262, "learning_rate": 4.6670985183725205e-06, "loss": 0.134, "step": 7610 }, { "epoch": 0.689577158887808, "grad_norm": 0.33844679594039917, "learning_rate": 4.642326025268514e-06, "loss": 0.1282, "step": 7620 }, { "epoch": 0.6904821157892356, "grad_norm": 0.39204105734825134, "learning_rate": 4.6175995629314994e-06, "loss": 0.1236, "step": 7630 }, { "epoch": 0.6913870726906631, "grad_norm": 0.38780298829078674, "learning_rate": 4.592919343800315e-06, "loss": 0.1316, "step": 7640 }, { "epoch": 0.6922920295920907, "grad_norm": 0.3531728982925415, "learning_rate": 4.568285579916491e-06, "loss": 0.1339, "step": 7650 }, { "epoch": 0.6931969864935182, "grad_norm": 0.264414519071579, "learning_rate": 4.543698482922445e-06, "loss": 0.1309, "step": 7660 }, { "epoch": 0.6941019433949458, "grad_norm": 0.3809826076030731, "learning_rate": 4.519158264059642e-06, "loss": 0.1302, "step": 7670 }, { "epoch": 0.6950069002963734, "grad_norm": 0.3677527904510498, "learning_rate": 4.4946651341668006e-06, "loss": 0.128, "step": 7680 }, { "epoch": 0.6959118571978009, "grad_norm": 0.3577388823032379, "learning_rate": 4.470219303678069e-06, "loss": 0.1242, "step": 7690 }, { "epoch": 0.6968168140992286, "grad_norm": 0.3218074142932892, "learning_rate": 4.44582098262122e-06, "loss": 0.128, "step": 7700 }, { "epoch": 0.6977217710006561, "grad_norm": 0.3113616406917572, "learning_rate": 4.421470380615841e-06, "loss": 0.1246, "step": 7710 }, { "epoch": 0.6986267279020837, "grad_norm": 0.4153074622154236, "learning_rate": 4.397167706871546e-06, "loss": 0.1236, "step": 7720 }, { "epoch": 0.6995316848035112, "grad_norm": 0.2737235426902771, "learning_rate": 4.37291317018617e-06, "loss": 0.1334, "step": 7730 }, { "epoch": 0.7004366417049388, "grad_norm": 0.3413766026496887, "learning_rate": 4.348706978943965e-06, "loss": 0.1248, "step": 7740 }, { "epoch": 0.7013415986063664, "grad_norm": 0.44644635915756226, "learning_rate": 4.324549341113839e-06, "loss": 0.131, "step": 7750 }, { "epoch": 0.7022465555077939, "grad_norm": 0.3571605384349823, "learning_rate": 4.300440464247528e-06, "loss": 0.1219, "step": 7760 }, { "epoch": 0.7031515124092215, "grad_norm": 0.218129500746727, "learning_rate": 4.276380555477855e-06, "loss": 0.1224, "step": 7770 }, { "epoch": 0.7040564693106491, "grad_norm": 0.2756895124912262, "learning_rate": 4.25236982151692e-06, "loss": 0.1165, "step": 7780 }, { "epoch": 0.7049614262120767, "grad_norm": 0.3429490029811859, "learning_rate": 4.22840846865434e-06, "loss": 0.133, "step": 7790 }, { "epoch": 0.7058663831135042, "grad_norm": 0.3609547019004822, "learning_rate": 4.204496702755471e-06, "loss": 0.1229, "step": 7800 }, { "epoch": 0.7067713400149318, "grad_norm": 0.31833615899086, "learning_rate": 4.180634729259635e-06, "loss": 0.131, "step": 7810 }, { "epoch": 0.7076762969163594, "grad_norm": 0.3234885632991791, "learning_rate": 4.15682275317836e-06, "loss": 0.1242, "step": 7820 }, { "epoch": 0.7085812538177869, "grad_norm": 0.331586092710495, "learning_rate": 4.133060979093623e-06, "loss": 0.1238, "step": 7830 }, { "epoch": 0.7094862107192145, "grad_norm": 0.29033374786376953, "learning_rate": 4.109349611156088e-06, "loss": 0.1231, "step": 7840 }, { "epoch": 0.710391167620642, "grad_norm": 0.4018980860710144, "learning_rate": 4.085688853083346e-06, "loss": 0.1276, "step": 7850 }, { "epoch": 0.7112961245220696, "grad_norm": 0.3441830575466156, "learning_rate": 4.062078908158174e-06, "loss": 0.1334, "step": 7860 }, { "epoch": 0.7122010814234973, "grad_norm": 0.3075098693370819, "learning_rate": 4.038519979226785e-06, "loss": 0.1223, "step": 7870 }, { "epoch": 0.7131060383249248, "grad_norm": 0.4702622890472412, "learning_rate": 4.015012268697085e-06, "loss": 0.1274, "step": 7880 }, { "epoch": 0.7140109952263524, "grad_norm": 0.40426555275917053, "learning_rate": 3.991555978536937e-06, "loss": 0.1286, "step": 7890 }, { "epoch": 0.7149159521277799, "grad_norm": 0.3282513916492462, "learning_rate": 3.968151310272417e-06, "loss": 0.1286, "step": 7900 }, { "epoch": 0.7158209090292075, "grad_norm": 0.31669655442237854, "learning_rate": 3.944798464986086e-06, "loss": 0.1228, "step": 7910 }, { "epoch": 0.716725865930635, "grad_norm": 0.3846443295478821, "learning_rate": 3.9214976433152755e-06, "loss": 0.1289, "step": 7920 }, { "epoch": 0.7176308228320626, "grad_norm": 0.3068075478076935, "learning_rate": 3.8982490454503455e-06, "loss": 0.1258, "step": 7930 }, { "epoch": 0.7185357797334901, "grad_norm": 0.259236216545105, "learning_rate": 3.875052871132979e-06, "loss": 0.126, "step": 7940 }, { "epoch": 0.7194407366349178, "grad_norm": 0.32442575693130493, "learning_rate": 3.851909319654448e-06, "loss": 0.1282, "step": 7950 }, { "epoch": 0.7203456935363454, "grad_norm": 0.3598691523075104, "learning_rate": 3.82881858985392e-06, "loss": 0.127, "step": 7960 }, { "epoch": 0.7212506504377729, "grad_norm": 0.23929624259471893, "learning_rate": 3.8057808801167463e-06, "loss": 0.1243, "step": 7970 }, { "epoch": 0.7221556073392005, "grad_norm": 0.31415387988090515, "learning_rate": 3.782796388372739e-06, "loss": 0.1309, "step": 7980 }, { "epoch": 0.723060564240628, "grad_norm": 0.3203081786632538, "learning_rate": 3.7598653120945015e-06, "loss": 0.1268, "step": 7990 }, { "epoch": 0.7239655211420556, "grad_norm": 0.38148126006126404, "learning_rate": 3.736987848295699e-06, "loss": 0.1196, "step": 8000 }, { "epoch": 0.7248704780434831, "grad_norm": 0.2480911761522293, "learning_rate": 3.7141641935293926e-06, "loss": 0.1296, "step": 8010 }, { "epoch": 0.7257754349449107, "grad_norm": 0.2723288834095001, "learning_rate": 3.6913945438863397e-06, "loss": 0.127, "step": 8020 }, { "epoch": 0.7266803918463384, "grad_norm": 0.3012789785861969, "learning_rate": 3.6686790949933082e-06, "loss": 0.1254, "step": 8030 }, { "epoch": 0.7275853487477659, "grad_norm": 0.2832350432872772, "learning_rate": 3.64601804201139e-06, "loss": 0.1158, "step": 8040 }, { "epoch": 0.7284903056491935, "grad_norm": 0.30464476346969604, "learning_rate": 3.6234115796343405e-06, "loss": 0.1223, "step": 8050 }, { "epoch": 0.729395262550621, "grad_norm": 0.3467896282672882, "learning_rate": 3.6008599020868985e-06, "loss": 0.1326, "step": 8060 }, { "epoch": 0.7303002194520486, "grad_norm": 0.35166192054748535, "learning_rate": 3.5783632031231018e-06, "loss": 0.1257, "step": 8070 }, { "epoch": 0.7312051763534761, "grad_norm": 0.3181626796722412, "learning_rate": 3.555921676024653e-06, "loss": 0.1269, "step": 8080 }, { "epoch": 0.7321101332549037, "grad_norm": 0.30875396728515625, "learning_rate": 3.53353551359923e-06, "loss": 0.1297, "step": 8090 }, { "epoch": 0.7330150901563313, "grad_norm": 0.3796384632587433, "learning_rate": 3.511204908178848e-06, "loss": 0.1243, "step": 8100 }, { "epoch": 0.7339200470577589, "grad_norm": 0.3839415907859802, "learning_rate": 3.488930051618201e-06, "loss": 0.1265, "step": 8110 }, { "epoch": 0.7348250039591865, "grad_norm": 0.34740591049194336, "learning_rate": 3.4667111352930163e-06, "loss": 0.1339, "step": 8120 }, { "epoch": 0.735729960860614, "grad_norm": 0.2768769860267639, "learning_rate": 3.4445483500983944e-06, "loss": 0.1238, "step": 8130 }, { "epoch": 0.7366349177620416, "grad_norm": 0.3522382378578186, "learning_rate": 3.4224418864471976e-06, "loss": 0.1242, "step": 8140 }, { "epoch": 0.7375398746634692, "grad_norm": 0.31364932656288147, "learning_rate": 3.400391934268391e-06, "loss": 0.1261, "step": 8150 }, { "epoch": 0.7384448315648967, "grad_norm": 0.3115822374820709, "learning_rate": 3.378398683005416e-06, "loss": 0.1248, "step": 8160 }, { "epoch": 0.7393497884663243, "grad_norm": 0.370149165391922, "learning_rate": 3.356462321614573e-06, "loss": 0.1294, "step": 8170 }, { "epoch": 0.7402547453677518, "grad_norm": 0.28242307901382446, "learning_rate": 3.334583038563376e-06, "loss": 0.1298, "step": 8180 }, { "epoch": 0.7411597022691794, "grad_norm": 0.323049396276474, "learning_rate": 3.3127610218289617e-06, "loss": 0.1228, "step": 8190 }, { "epoch": 0.742064659170607, "grad_norm": 0.393040269613266, "learning_rate": 3.2909964588964514e-06, "loss": 0.1276, "step": 8200 }, { "epoch": 0.7429696160720346, "grad_norm": 0.36402514576911926, "learning_rate": 3.269289536757352e-06, "loss": 0.1296, "step": 8210 }, { "epoch": 0.7438745729734622, "grad_norm": 0.3288993239402771, "learning_rate": 3.2476404419079487e-06, "loss": 0.1245, "step": 8220 }, { "epoch": 0.7447795298748897, "grad_norm": 0.40095171332359314, "learning_rate": 3.226049360347694e-06, "loss": 0.1275, "step": 8230 }, { "epoch": 0.7456844867763173, "grad_norm": 0.36854180693626404, "learning_rate": 3.2045164775776137e-06, "loss": 0.1254, "step": 8240 }, { "epoch": 0.7465894436777448, "grad_norm": 0.35555073618888855, "learning_rate": 3.1830419785987243e-06, "loss": 0.1237, "step": 8250 }, { "epoch": 0.7474944005791724, "grad_norm": 0.4191891849040985, "learning_rate": 3.161626047910431e-06, "loss": 0.13, "step": 8260 }, { "epoch": 0.7483993574805999, "grad_norm": 0.3702433705329895, "learning_rate": 3.140268869508949e-06, "loss": 0.1317, "step": 8270 }, { "epoch": 0.7493043143820276, "grad_norm": 0.27752381563186646, "learning_rate": 3.1189706268857077e-06, "loss": 0.1226, "step": 8280 }, { "epoch": 0.7502092712834552, "grad_norm": 0.3350948095321655, "learning_rate": 3.0977315030258002e-06, "loss": 0.1309, "step": 8290 }, { "epoch": 0.7511142281848827, "grad_norm": 0.4031756520271301, "learning_rate": 3.0765516804063932e-06, "loss": 0.127, "step": 8300 }, { "epoch": 0.7520191850863103, "grad_norm": 0.3240339159965515, "learning_rate": 3.055431340995163e-06, "loss": 0.123, "step": 8310 }, { "epoch": 0.7529241419877378, "grad_norm": 0.3321908414363861, "learning_rate": 3.0343706662487306e-06, "loss": 0.1258, "step": 8320 }, { "epoch": 0.7538290988891654, "grad_norm": 0.3860868513584137, "learning_rate": 3.013369837111101e-06, "loss": 0.1297, "step": 8330 }, { "epoch": 0.7547340557905929, "grad_norm": 0.3689778745174408, "learning_rate": 2.992429034012121e-06, "loss": 0.1253, "step": 8340 }, { "epoch": 0.7556390126920205, "grad_norm": 0.27259641885757446, "learning_rate": 2.9715484368659152e-06, "loss": 0.1258, "step": 8350 }, { "epoch": 0.7565439695934482, "grad_norm": 0.39256593585014343, "learning_rate": 2.9507282250693514e-06, "loss": 0.119, "step": 8360 }, { "epoch": 0.7574489264948757, "grad_norm": 0.33603495359420776, "learning_rate": 2.9299685775004793e-06, "loss": 0.1337, "step": 8370 }, { "epoch": 0.7583538833963033, "grad_norm": 0.4902133345603943, "learning_rate": 2.9092696725170212e-06, "loss": 0.1352, "step": 8380 }, { "epoch": 0.7592588402977308, "grad_norm": 0.27200329303741455, "learning_rate": 2.8886316879548205e-06, "loss": 0.1231, "step": 8390 }, { "epoch": 0.7601637971991584, "grad_norm": 0.27076977491378784, "learning_rate": 2.868054801126321e-06, "loss": 0.1209, "step": 8400 }, { "epoch": 0.7610687541005859, "grad_norm": 0.32341551780700684, "learning_rate": 2.8475391888190395e-06, "loss": 0.1346, "step": 8410 }, { "epoch": 0.7619737110020135, "grad_norm": 0.29108741879463196, "learning_rate": 2.8270850272940466e-06, "loss": 0.1251, "step": 8420 }, { "epoch": 0.762878667903441, "grad_norm": 0.321847528219223, "learning_rate": 2.806692492284461e-06, "loss": 0.1248, "step": 8430 }, { "epoch": 0.7637836248048687, "grad_norm": 0.3972260355949402, "learning_rate": 2.786361758993932e-06, "loss": 0.1266, "step": 8440 }, { "epoch": 0.7646885817062963, "grad_norm": 0.3229351341724396, "learning_rate": 2.766093002095137e-06, "loss": 0.1253, "step": 8450 }, { "epoch": 0.7655935386077238, "grad_norm": 0.32253509759902954, "learning_rate": 2.745886395728271e-06, "loss": 0.124, "step": 8460 }, { "epoch": 0.7664984955091514, "grad_norm": 0.34673410654067993, "learning_rate": 2.725742113499571e-06, "loss": 0.124, "step": 8470 }, { "epoch": 0.767403452410579, "grad_norm": 0.3298405110836029, "learning_rate": 2.705660328479809e-06, "loss": 0.1259, "step": 8480 }, { "epoch": 0.7683084093120065, "grad_norm": 0.454266756772995, "learning_rate": 2.6856412132027997e-06, "loss": 0.1211, "step": 8490 }, { "epoch": 0.7692133662134341, "grad_norm": 0.39579710364341736, "learning_rate": 2.6656849396639415e-06, "loss": 0.1302, "step": 8500 }, { "epoch": 0.7701183231148616, "grad_norm": 0.2675047814846039, "learning_rate": 2.6457916793187124e-06, "loss": 0.123, "step": 8510 }, { "epoch": 0.7710232800162892, "grad_norm": 0.44622719287872314, "learning_rate": 2.6259616030812128e-06, "loss": 0.1238, "step": 8520 }, { "epoch": 0.7719282369177168, "grad_norm": 0.40703779458999634, "learning_rate": 2.6061948813226968e-06, "loss": 0.1222, "step": 8530 }, { "epoch": 0.7728331938191444, "grad_norm": 0.32759425044059753, "learning_rate": 2.5864916838701016e-06, "loss": 0.1257, "step": 8540 }, { "epoch": 0.773738150720572, "grad_norm": 0.2865431308746338, "learning_rate": 2.5668521800045944e-06, "loss": 0.1291, "step": 8550 }, { "epoch": 0.7746431076219995, "grad_norm": 0.30442455410957336, "learning_rate": 2.5472765384601074e-06, "loss": 0.1214, "step": 8560 }, { "epoch": 0.7755480645234271, "grad_norm": 0.31685060262680054, "learning_rate": 2.5277649274219064e-06, "loss": 0.131, "step": 8570 }, { "epoch": 0.7764530214248546, "grad_norm": 0.274147093296051, "learning_rate": 2.508317514525125e-06, "loss": 0.1195, "step": 8580 }, { "epoch": 0.7773579783262822, "grad_norm": 0.28254234790802, "learning_rate": 2.4889344668533453e-06, "loss": 0.1313, "step": 8590 }, { "epoch": 0.7782629352277097, "grad_norm": 0.3491531014442444, "learning_rate": 2.469615950937142e-06, "loss": 0.1279, "step": 8600 }, { "epoch": 0.7791678921291374, "grad_norm": 0.3661726713180542, "learning_rate": 2.4503621327526694e-06, "loss": 0.1252, "step": 8610 }, { "epoch": 0.780072849030565, "grad_norm": 0.24949342012405396, "learning_rate": 2.431173177720223e-06, "loss": 0.1209, "step": 8620 }, { "epoch": 0.7809778059319925, "grad_norm": 0.24942710995674133, "learning_rate": 2.4120492507028236e-06, "loss": 0.1294, "step": 8630 }, { "epoch": 0.7818827628334201, "grad_norm": 0.3253498077392578, "learning_rate": 2.392990516004804e-06, "loss": 0.1313, "step": 8640 }, { "epoch": 0.7827877197348476, "grad_norm": 0.31229835748672485, "learning_rate": 2.3739971373703852e-06, "loss": 0.1244, "step": 8650 }, { "epoch": 0.7836926766362752, "grad_norm": 0.3631618320941925, "learning_rate": 2.355069277982286e-06, "loss": 0.1266, "step": 8660 }, { "epoch": 0.7845976335377027, "grad_norm": 0.2935367822647095, "learning_rate": 2.3362071004603036e-06, "loss": 0.1222, "step": 8670 }, { "epoch": 0.7855025904391303, "grad_norm": 0.34212765097618103, "learning_rate": 2.3174107668599366e-06, "loss": 0.126, "step": 8680 }, { "epoch": 0.786407547340558, "grad_norm": 0.28019851446151733, "learning_rate": 2.298680438670976e-06, "loss": 0.119, "step": 8690 }, { "epoch": 0.7873125042419855, "grad_norm": 0.35425955057144165, "learning_rate": 2.2800162768161204e-06, "loss": 0.1237, "step": 8700 }, { "epoch": 0.7882174611434131, "grad_norm": 0.3190707564353943, "learning_rate": 2.2614184416496022e-06, "loss": 0.1206, "step": 8710 }, { "epoch": 0.7891224180448406, "grad_norm": 0.40515249967575073, "learning_rate": 2.2428870929558012e-06, "loss": 0.1251, "step": 8720 }, { "epoch": 0.7900273749462682, "grad_norm": 0.3554609417915344, "learning_rate": 2.224422389947879e-06, "loss": 0.1268, "step": 8730 }, { "epoch": 0.7909323318476957, "grad_norm": 0.37230101227760315, "learning_rate": 2.2060244912663996e-06, "loss": 0.134, "step": 8740 }, { "epoch": 0.7918372887491233, "grad_norm": 0.2492271512746811, "learning_rate": 2.1876935549779766e-06, "loss": 0.1247, "step": 8750 }, { "epoch": 0.7927422456505508, "grad_norm": 0.3300861418247223, "learning_rate": 2.169429738573915e-06, "loss": 0.1243, "step": 8760 }, { "epoch": 0.7936472025519785, "grad_norm": 0.4448375105857849, "learning_rate": 2.151233198968854e-06, "loss": 0.121, "step": 8770 }, { "epoch": 0.7945521594534061, "grad_norm": 0.2607729434967041, "learning_rate": 2.1331040924994216e-06, "loss": 0.1194, "step": 8780 }, { "epoch": 0.7954571163548336, "grad_norm": 0.3317795991897583, "learning_rate": 2.1150425749228853e-06, "loss": 0.122, "step": 8790 }, { "epoch": 0.7963620732562612, "grad_norm": 0.3975413739681244, "learning_rate": 2.097048801415823e-06, "loss": 0.1261, "step": 8800 }, { "epoch": 0.7972670301576887, "grad_norm": 0.3129339814186096, "learning_rate": 2.079122926572784e-06, "loss": 0.1264, "step": 8810 }, { "epoch": 0.7981719870591163, "grad_norm": 0.32011422514915466, "learning_rate": 2.0612651044049683e-06, "loss": 0.1287, "step": 8820 }, { "epoch": 0.7990769439605439, "grad_norm": 0.34052857756614685, "learning_rate": 2.043475488338885e-06, "loss": 0.1217, "step": 8830 }, { "epoch": 0.7999819008619714, "grad_norm": 0.2858836054801941, "learning_rate": 2.0257542312150534e-06, "loss": 0.1242, "step": 8840 }, { "epoch": 0.800886857763399, "grad_norm": 0.26198819279670715, "learning_rate": 2.0081014852866843e-06, "loss": 0.1288, "step": 8850 }, { "epoch": 0.8017918146648266, "grad_norm": 0.3347429633140564, "learning_rate": 1.9905174022183702e-06, "loss": 0.1251, "step": 8860 }, { "epoch": 0.8026967715662542, "grad_norm": 0.3790920078754425, "learning_rate": 1.9730021330847838e-06, "loss": 0.1263, "step": 8870 }, { "epoch": 0.8036017284676817, "grad_norm": 0.27628254890441895, "learning_rate": 1.955555828369371e-06, "loss": 0.1272, "step": 8880 }, { "epoch": 0.8045066853691093, "grad_norm": 0.2933729887008667, "learning_rate": 1.938178637963074e-06, "loss": 0.1332, "step": 8890 }, { "epoch": 0.8054116422705369, "grad_norm": 0.2660931348800659, "learning_rate": 1.9208707111630376e-06, "loss": 0.1259, "step": 8900 }, { "epoch": 0.8063165991719644, "grad_norm": 0.3294607698917389, "learning_rate": 1.903632196671311e-06, "loss": 0.1293, "step": 8910 }, { "epoch": 0.807221556073392, "grad_norm": 0.3121941089630127, "learning_rate": 1.8864632425936015e-06, "loss": 0.1289, "step": 8920 }, { "epoch": 0.8081265129748195, "grad_norm": 0.2634756863117218, "learning_rate": 1.8693639964379661e-06, "loss": 0.1291, "step": 8930 }, { "epoch": 0.8090314698762472, "grad_norm": 0.3612058162689209, "learning_rate": 1.852334605113576e-06, "loss": 0.1284, "step": 8940 }, { "epoch": 0.8099364267776747, "grad_norm": 0.27511492371559143, "learning_rate": 1.8353752149294335e-06, "loss": 0.1255, "step": 8950 }, { "epoch": 0.8108413836791023, "grad_norm": 0.5154643058776855, "learning_rate": 1.8184859715931247e-06, "loss": 0.1293, "step": 8960 }, { "epoch": 0.8117463405805299, "grad_norm": 0.2842332124710083, "learning_rate": 1.8016670202095677e-06, "loss": 0.1233, "step": 8970 }, { "epoch": 0.8126512974819574, "grad_norm": 0.26935648918151855, "learning_rate": 1.7849185052797525e-06, "loss": 0.127, "step": 8980 }, { "epoch": 0.813556254383385, "grad_norm": 0.4419654607772827, "learning_rate": 1.7682405706995243e-06, "loss": 0.1255, "step": 8990 }, { "epoch": 0.8144612112848125, "grad_norm": 0.27756479382514954, "learning_rate": 1.7516333597583214e-06, "loss": 0.1195, "step": 9000 } ], "logging_steps": 10, "max_steps": 11050, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.722768489962275e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }