iSHIFT / trainer_state.json
SarthakM320's picture
Upload folder using huggingface_hub
84157e2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8144612112848125,
"eval_steps": 500,
"global_step": 9000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009049569014275695,
"grad_norm": 159.94659423828125,
"learning_rate": 6.024096385542169e-07,
"loss": 3.3777,
"step": 10
},
{
"epoch": 0.001809913802855139,
"grad_norm": 92.94371795654297,
"learning_rate": 1.2048192771084338e-06,
"loss": 2.8739,
"step": 20
},
{
"epoch": 0.0027148707042827084,
"grad_norm": 40.02893829345703,
"learning_rate": 1.8072289156626508e-06,
"loss": 2.0823,
"step": 30
},
{
"epoch": 0.003619827605710278,
"grad_norm": 11.130951881408691,
"learning_rate": 2.4096385542168676e-06,
"loss": 1.3735,
"step": 40
},
{
"epoch": 0.004524784507137848,
"grad_norm": 6.264873027801514,
"learning_rate": 3.012048192771085e-06,
"loss": 0.77,
"step": 50
},
{
"epoch": 0.005429741408565417,
"grad_norm": 5.753260135650635,
"learning_rate": 3.6144578313253016e-06,
"loss": 0.5821,
"step": 60
},
{
"epoch": 0.006334698309992987,
"grad_norm": 5.4176836013793945,
"learning_rate": 4.216867469879519e-06,
"loss": 0.5349,
"step": 70
},
{
"epoch": 0.007239655211420556,
"grad_norm": 5.466179370880127,
"learning_rate": 4.819277108433735e-06,
"loss": 0.4904,
"step": 80
},
{
"epoch": 0.008144612112848126,
"grad_norm": 5.408243179321289,
"learning_rate": 5.421686746987952e-06,
"loss": 0.4391,
"step": 90
},
{
"epoch": 0.009049569014275696,
"grad_norm": 4.930578708648682,
"learning_rate": 6.02409638554217e-06,
"loss": 0.382,
"step": 100
},
{
"epoch": 0.009954525915703265,
"grad_norm": 4.409604072570801,
"learning_rate": 6.626506024096386e-06,
"loss": 0.3354,
"step": 110
},
{
"epoch": 0.010859482817130834,
"grad_norm": 3.2613117694854736,
"learning_rate": 7.228915662650603e-06,
"loss": 0.2748,
"step": 120
},
{
"epoch": 0.011764439718558404,
"grad_norm": 3.974195718765259,
"learning_rate": 7.83132530120482e-06,
"loss": 0.2458,
"step": 130
},
{
"epoch": 0.012669396619985973,
"grad_norm": 2.277785539627075,
"learning_rate": 8.433734939759038e-06,
"loss": 0.2215,
"step": 140
},
{
"epoch": 0.013574353521413543,
"grad_norm": 1.618218183517456,
"learning_rate": 9.036144578313254e-06,
"loss": 0.2,
"step": 150
},
{
"epoch": 0.014479310422841112,
"grad_norm": 1.4689830541610718,
"learning_rate": 9.63855421686747e-06,
"loss": 0.1821,
"step": 160
},
{
"epoch": 0.015384267324268681,
"grad_norm": 1.7365655899047852,
"learning_rate": 1.0240963855421688e-05,
"loss": 0.1699,
"step": 170
},
{
"epoch": 0.01628922422569625,
"grad_norm": 0.9865982532501221,
"learning_rate": 1.0843373493975904e-05,
"loss": 0.1667,
"step": 180
},
{
"epoch": 0.01719418112712382,
"grad_norm": 0.9514006972312927,
"learning_rate": 1.1445783132530122e-05,
"loss": 0.1651,
"step": 190
},
{
"epoch": 0.01809913802855139,
"grad_norm": 1.1965084075927734,
"learning_rate": 1.204819277108434e-05,
"loss": 0.1564,
"step": 200
},
{
"epoch": 0.01900409492997896,
"grad_norm": 1.2576557397842407,
"learning_rate": 1.2650602409638555e-05,
"loss": 0.1589,
"step": 210
},
{
"epoch": 0.01990905183140653,
"grad_norm": 0.7250511646270752,
"learning_rate": 1.3253012048192772e-05,
"loss": 0.1567,
"step": 220
},
{
"epoch": 0.020814008732834097,
"grad_norm": 0.881618320941925,
"learning_rate": 1.3855421686746989e-05,
"loss": 0.1554,
"step": 230
},
{
"epoch": 0.021718965634261667,
"grad_norm": 0.7235729694366455,
"learning_rate": 1.4457831325301207e-05,
"loss": 0.1594,
"step": 240
},
{
"epoch": 0.022623922535689237,
"grad_norm": 0.6479834318161011,
"learning_rate": 1.5060240963855424e-05,
"loss": 0.1554,
"step": 250
},
{
"epoch": 0.023528879437116807,
"grad_norm": 0.8083927035331726,
"learning_rate": 1.566265060240964e-05,
"loss": 0.1526,
"step": 260
},
{
"epoch": 0.024433836338544377,
"grad_norm": 1.1908012628555298,
"learning_rate": 1.6265060240963857e-05,
"loss": 0.1525,
"step": 270
},
{
"epoch": 0.025338793239971947,
"grad_norm": 0.944805920124054,
"learning_rate": 1.6867469879518076e-05,
"loss": 0.1583,
"step": 280
},
{
"epoch": 0.026243750141399517,
"grad_norm": 0.6904934048652649,
"learning_rate": 1.746987951807229e-05,
"loss": 0.1531,
"step": 290
},
{
"epoch": 0.027148707042827087,
"grad_norm": 1.380239486694336,
"learning_rate": 1.807228915662651e-05,
"loss": 0.1483,
"step": 300
},
{
"epoch": 0.028053663944254653,
"grad_norm": 0.4651995897293091,
"learning_rate": 1.8674698795180725e-05,
"loss": 0.1498,
"step": 310
},
{
"epoch": 0.028958620845682223,
"grad_norm": 0.6768488883972168,
"learning_rate": 1.927710843373494e-05,
"loss": 0.152,
"step": 320
},
{
"epoch": 0.029863577747109793,
"grad_norm": 0.6283469796180725,
"learning_rate": 1.987951807228916e-05,
"loss": 0.1464,
"step": 330
},
{
"epoch": 0.030768534648537363,
"grad_norm": 0.7869206070899963,
"learning_rate": 1.999997250700714e-05,
"loss": 0.1454,
"step": 340
},
{
"epoch": 0.03167349154996493,
"grad_norm": 1.3408620357513428,
"learning_rate": 1.9999860816982734e-05,
"loss": 0.154,
"step": 350
},
{
"epoch": 0.0325784484513925,
"grad_norm": 1.0676746368408203,
"learning_rate": 1.9999663212573584e-05,
"loss": 0.1482,
"step": 360
},
{
"epoch": 0.03348340535282007,
"grad_norm": 0.7311077117919922,
"learning_rate": 1.9999379695477417e-05,
"loss": 0.1486,
"step": 370
},
{
"epoch": 0.03438836225424764,
"grad_norm": 0.8585270643234253,
"learning_rate": 1.999901026813009e-05,
"loss": 0.1463,
"step": 380
},
{
"epoch": 0.03529331915567521,
"grad_norm": 0.4578467607498169,
"learning_rate": 1.9998554933705552e-05,
"loss": 0.1407,
"step": 390
},
{
"epoch": 0.03619827605710278,
"grad_norm": 0.6237673163414001,
"learning_rate": 1.9998013696115847e-05,
"loss": 0.1463,
"step": 400
},
{
"epoch": 0.03710323295853035,
"grad_norm": 0.49388837814331055,
"learning_rate": 1.999738656001104e-05,
"loss": 0.1498,
"step": 410
},
{
"epoch": 0.03800818985995792,
"grad_norm": 0.9949780106544495,
"learning_rate": 1.999667353077921e-05,
"loss": 0.1457,
"step": 420
},
{
"epoch": 0.03891314676138549,
"grad_norm": 0.792772114276886,
"learning_rate": 1.9995874614546386e-05,
"loss": 0.1567,
"step": 430
},
{
"epoch": 0.03981810366281306,
"grad_norm": 0.45337289571762085,
"learning_rate": 1.9994989818176507e-05,
"loss": 0.1444,
"step": 440
},
{
"epoch": 0.040723060564240625,
"grad_norm": 0.5384504795074463,
"learning_rate": 1.9994019149271357e-05,
"loss": 0.1464,
"step": 450
},
{
"epoch": 0.041628017465668195,
"grad_norm": 0.4586544334888458,
"learning_rate": 1.9992962616170485e-05,
"loss": 0.1366,
"step": 460
},
{
"epoch": 0.042532974367095765,
"grad_norm": 1.124829888343811,
"learning_rate": 1.999182022795116e-05,
"loss": 0.1441,
"step": 470
},
{
"epoch": 0.043437931268523335,
"grad_norm": 7.702340602874756,
"learning_rate": 1.9990591994428278e-05,
"loss": 0.1434,
"step": 480
},
{
"epoch": 0.044342888169950904,
"grad_norm": 0.600829541683197,
"learning_rate": 1.9989277926154273e-05,
"loss": 0.1554,
"step": 490
},
{
"epoch": 0.045247845071378474,
"grad_norm": 0.525310218334198,
"learning_rate": 1.9987878034419047e-05,
"loss": 0.1524,
"step": 500
},
{
"epoch": 0.046152801972806044,
"grad_norm": 1.6653215885162354,
"learning_rate": 1.998639233124985e-05,
"loss": 0.1504,
"step": 510
},
{
"epoch": 0.047057758874233614,
"grad_norm": 0.5178505778312683,
"learning_rate": 1.998482082941118e-05,
"loss": 0.1462,
"step": 520
},
{
"epoch": 0.047962715775661184,
"grad_norm": 0.524719774723053,
"learning_rate": 1.9983163542404694e-05,
"loss": 0.1482,
"step": 530
},
{
"epoch": 0.048867672677088754,
"grad_norm": 0.4859486520290375,
"learning_rate": 1.9981420484469062e-05,
"loss": 0.1504,
"step": 540
},
{
"epoch": 0.049772629578516324,
"grad_norm": 0.6776233315467834,
"learning_rate": 1.997959167057988e-05,
"loss": 0.1494,
"step": 550
},
{
"epoch": 0.050677586479943894,
"grad_norm": 0.6861289143562317,
"learning_rate": 1.9977677116449494e-05,
"loss": 0.1492,
"step": 560
},
{
"epoch": 0.051582543381371464,
"grad_norm": 0.4854241609573364,
"learning_rate": 1.9975676838526914e-05,
"loss": 0.1437,
"step": 570
},
{
"epoch": 0.052487500282799034,
"grad_norm": 0.37351515889167786,
"learning_rate": 1.9973590853997646e-05,
"loss": 0.146,
"step": 580
},
{
"epoch": 0.053392457184226604,
"grad_norm": 0.5376414060592651,
"learning_rate": 1.997141918078354e-05,
"loss": 0.1445,
"step": 590
},
{
"epoch": 0.05429741408565417,
"grad_norm": 1.200066328048706,
"learning_rate": 1.996916183754266e-05,
"loss": 0.1407,
"step": 600
},
{
"epoch": 0.05520237098708174,
"grad_norm": 0.5804212689399719,
"learning_rate": 1.9966818843669097e-05,
"loss": 0.1482,
"step": 610
},
{
"epoch": 0.056107327888509306,
"grad_norm": 0.5233150124549866,
"learning_rate": 1.9964390219292823e-05,
"loss": 0.1423,
"step": 620
},
{
"epoch": 0.057012284789936876,
"grad_norm": 0.7850412130355835,
"learning_rate": 1.9961875985279503e-05,
"loss": 0.1436,
"step": 630
},
{
"epoch": 0.057917241691364446,
"grad_norm": 0.46773582696914673,
"learning_rate": 1.9959276163230325e-05,
"loss": 0.136,
"step": 640
},
{
"epoch": 0.058822198592792016,
"grad_norm": 0.47068318724632263,
"learning_rate": 1.9956590775481808e-05,
"loss": 0.1477,
"step": 650
},
{
"epoch": 0.059727155494219586,
"grad_norm": 0.7239755988121033,
"learning_rate": 1.9953819845105616e-05,
"loss": 0.1414,
"step": 660
},
{
"epoch": 0.060632112395647156,
"grad_norm": 0.4646810293197632,
"learning_rate": 1.9950963395908368e-05,
"loss": 0.1433,
"step": 670
},
{
"epoch": 0.061537069297074726,
"grad_norm": 0.5105672478675842,
"learning_rate": 1.99480214524314e-05,
"loss": 0.1381,
"step": 680
},
{
"epoch": 0.062442026198502296,
"grad_norm": 0.4857189655303955,
"learning_rate": 1.99449940399506e-05,
"loss": 0.1355,
"step": 690
},
{
"epoch": 0.06334698309992987,
"grad_norm": 0.7053922414779663,
"learning_rate": 1.9941881184476154e-05,
"loss": 0.1402,
"step": 700
},
{
"epoch": 0.06425194000135744,
"grad_norm": 0.6095537543296814,
"learning_rate": 1.9938682912752343e-05,
"loss": 0.1424,
"step": 710
},
{
"epoch": 0.065156896902785,
"grad_norm": 0.8942661285400391,
"learning_rate": 1.99353992522573e-05,
"loss": 0.1337,
"step": 720
},
{
"epoch": 0.06606185380421258,
"grad_norm": 0.8702712059020996,
"learning_rate": 1.9932030231202786e-05,
"loss": 0.146,
"step": 730
},
{
"epoch": 0.06696681070564015,
"grad_norm": 0.6549976468086243,
"learning_rate": 1.9928575878533946e-05,
"loss": 0.1389,
"step": 740
},
{
"epoch": 0.06787176760706772,
"grad_norm": 0.56646329164505,
"learning_rate": 1.9925036223929045e-05,
"loss": 0.1399,
"step": 750
},
{
"epoch": 0.06877672450849528,
"grad_norm": 0.5389584898948669,
"learning_rate": 1.9921411297799233e-05,
"loss": 0.1398,
"step": 760
},
{
"epoch": 0.06968168140992285,
"grad_norm": 0.7392621636390686,
"learning_rate": 1.9917701131288274e-05,
"loss": 0.1436,
"step": 770
},
{
"epoch": 0.07058663831135042,
"grad_norm": 0.5267114639282227,
"learning_rate": 1.991390575627228e-05,
"loss": 0.1398,
"step": 780
},
{
"epoch": 0.071491595212778,
"grad_norm": 0.543932318687439,
"learning_rate": 1.9910025205359434e-05,
"loss": 0.1469,
"step": 790
},
{
"epoch": 0.07239655211420556,
"grad_norm": 0.5490307211875916,
"learning_rate": 1.990605951188972e-05,
"loss": 0.1342,
"step": 800
},
{
"epoch": 0.07330150901563313,
"grad_norm": 0.4569859206676483,
"learning_rate": 1.990200870993461e-05,
"loss": 0.1432,
"step": 810
},
{
"epoch": 0.0742064659170607,
"grad_norm": 0.5565773844718933,
"learning_rate": 1.9897872834296816e-05,
"loss": 0.1465,
"step": 820
},
{
"epoch": 0.07511142281848827,
"grad_norm": 0.5516616106033325,
"learning_rate": 1.989365192050995e-05,
"loss": 0.1434,
"step": 830
},
{
"epoch": 0.07601637971991584,
"grad_norm": 0.8122782707214355,
"learning_rate": 1.988934600483824e-05,
"loss": 0.1424,
"step": 840
},
{
"epoch": 0.07692133662134341,
"grad_norm": 0.4941742420196533,
"learning_rate": 1.9884955124276214e-05,
"loss": 0.1437,
"step": 850
},
{
"epoch": 0.07782629352277098,
"grad_norm": 0.68223637342453,
"learning_rate": 1.9880479316548365e-05,
"loss": 0.1366,
"step": 860
},
{
"epoch": 0.07873125042419855,
"grad_norm": 0.7050871253013611,
"learning_rate": 1.9875918620108867e-05,
"loss": 0.1358,
"step": 870
},
{
"epoch": 0.07963620732562612,
"grad_norm": 0.44282448291778564,
"learning_rate": 1.9871273074141197e-05,
"loss": 0.1384,
"step": 880
},
{
"epoch": 0.08054116422705368,
"grad_norm": 0.6209523677825928,
"learning_rate": 1.9866542718557844e-05,
"loss": 0.1389,
"step": 890
},
{
"epoch": 0.08144612112848125,
"grad_norm": 0.49669986963272095,
"learning_rate": 1.9861727593999927e-05,
"loss": 0.1298,
"step": 900
},
{
"epoch": 0.08235107802990882,
"grad_norm": 0.4034930467605591,
"learning_rate": 1.985682774183687e-05,
"loss": 0.139,
"step": 910
},
{
"epoch": 0.08325603493133639,
"grad_norm": 0.42605486512184143,
"learning_rate": 1.985184320416603e-05,
"loss": 0.1359,
"step": 920
},
{
"epoch": 0.08416099183276396,
"grad_norm": 0.5757314562797546,
"learning_rate": 1.9846774023812366e-05,
"loss": 0.1412,
"step": 930
},
{
"epoch": 0.08506594873419153,
"grad_norm": 0.6896102428436279,
"learning_rate": 1.984162024432802e-05,
"loss": 0.1322,
"step": 940
},
{
"epoch": 0.0859709056356191,
"grad_norm": 0.4264814257621765,
"learning_rate": 1.9836381909992e-05,
"loss": 0.1413,
"step": 950
},
{
"epoch": 0.08687586253704667,
"grad_norm": 0.4684106111526489,
"learning_rate": 1.9831059065809756e-05,
"loss": 0.1373,
"step": 960
},
{
"epoch": 0.08778081943847424,
"grad_norm": 0.5796085000038147,
"learning_rate": 1.9825651757512808e-05,
"loss": 0.1357,
"step": 970
},
{
"epoch": 0.08868577633990181,
"grad_norm": 0.5410571098327637,
"learning_rate": 1.9820160031558365e-05,
"loss": 0.1364,
"step": 980
},
{
"epoch": 0.08959073324132938,
"grad_norm": 0.45756787061691284,
"learning_rate": 1.9814583935128902e-05,
"loss": 0.1425,
"step": 990
},
{
"epoch": 0.09049569014275695,
"grad_norm": 0.9066041111946106,
"learning_rate": 1.9808923516131787e-05,
"loss": 0.1367,
"step": 1000
},
{
"epoch": 0.09140064704418452,
"grad_norm": 0.49664556980133057,
"learning_rate": 1.9803178823198826e-05,
"loss": 0.1382,
"step": 1010
},
{
"epoch": 0.09230560394561209,
"grad_norm": 0.3994297683238983,
"learning_rate": 1.979734990568589e-05,
"loss": 0.1357,
"step": 1020
},
{
"epoch": 0.09321056084703966,
"grad_norm": 0.4870229959487915,
"learning_rate": 1.979143681367246e-05,
"loss": 0.1387,
"step": 1030
},
{
"epoch": 0.09411551774846723,
"grad_norm": 0.5828505754470825,
"learning_rate": 1.9785439597961207e-05,
"loss": 0.1388,
"step": 1040
},
{
"epoch": 0.0950204746498948,
"grad_norm": 0.4454402029514313,
"learning_rate": 1.977935831007756e-05,
"loss": 0.1394,
"step": 1050
},
{
"epoch": 0.09592543155132237,
"grad_norm": 0.6127322912216187,
"learning_rate": 1.977319300226926e-05,
"loss": 0.1394,
"step": 1060
},
{
"epoch": 0.09683038845274994,
"grad_norm": 0.42766839265823364,
"learning_rate": 1.97669437275059e-05,
"loss": 0.1379,
"step": 1070
},
{
"epoch": 0.09773534535417751,
"grad_norm": 0.556703507900238,
"learning_rate": 1.9760610539478492e-05,
"loss": 0.1336,
"step": 1080
},
{
"epoch": 0.09864030225560508,
"grad_norm": 0.4200476408004761,
"learning_rate": 1.9754193492598985e-05,
"loss": 0.1398,
"step": 1090
},
{
"epoch": 0.09954525915703265,
"grad_norm": 0.5154082179069519,
"learning_rate": 1.9747692641999815e-05,
"loss": 0.1391,
"step": 1100
},
{
"epoch": 0.10045021605846022,
"grad_norm": 0.4285774230957031,
"learning_rate": 1.9741108043533416e-05,
"loss": 0.1405,
"step": 1110
},
{
"epoch": 0.10135517295988779,
"grad_norm": 0.37950581312179565,
"learning_rate": 1.9734439753771742e-05,
"loss": 0.1399,
"step": 1120
},
{
"epoch": 0.10226012986131536,
"grad_norm": 0.4357180893421173,
"learning_rate": 1.9727687830005795e-05,
"loss": 0.1354,
"step": 1130
},
{
"epoch": 0.10316508676274293,
"grad_norm": 0.341791570186615,
"learning_rate": 1.9720852330245127e-05,
"loss": 0.1368,
"step": 1140
},
{
"epoch": 0.1040700436641705,
"grad_norm": 0.37262991070747375,
"learning_rate": 1.971393331321732e-05,
"loss": 0.1406,
"step": 1150
},
{
"epoch": 0.10497500056559807,
"grad_norm": 0.5309743881225586,
"learning_rate": 1.9706930838367517e-05,
"loss": 0.1386,
"step": 1160
},
{
"epoch": 0.10587995746702564,
"grad_norm": 0.4703729748725891,
"learning_rate": 1.9699844965857884e-05,
"loss": 0.1457,
"step": 1170
},
{
"epoch": 0.10678491436845321,
"grad_norm": 0.43350547552108765,
"learning_rate": 1.969267575656711e-05,
"loss": 0.1429,
"step": 1180
},
{
"epoch": 0.10768987126988078,
"grad_norm": 0.5514594316482544,
"learning_rate": 1.968542327208987e-05,
"loss": 0.14,
"step": 1190
},
{
"epoch": 0.10859482817130835,
"grad_norm": 0.7761058211326599,
"learning_rate": 1.9678087574736305e-05,
"loss": 0.1361,
"step": 1200
},
{
"epoch": 0.10949978507273592,
"grad_norm": 0.4929139316082001,
"learning_rate": 1.9670668727531486e-05,
"loss": 0.1382,
"step": 1210
},
{
"epoch": 0.11040474197416349,
"grad_norm": 0.5243300199508667,
"learning_rate": 1.9663166794214868e-05,
"loss": 0.1443,
"step": 1220
},
{
"epoch": 0.11130969887559106,
"grad_norm": 0.3967302739620209,
"learning_rate": 1.965558183923975e-05,
"loss": 0.1359,
"step": 1230
},
{
"epoch": 0.11221465577701861,
"grad_norm": 0.6645998954772949,
"learning_rate": 1.9647913927772708e-05,
"loss": 0.1422,
"step": 1240
},
{
"epoch": 0.11311961267844618,
"grad_norm": 0.28883126378059387,
"learning_rate": 1.9640163125693053e-05,
"loss": 0.1397,
"step": 1250
},
{
"epoch": 0.11402456957987375,
"grad_norm": 0.48156359791755676,
"learning_rate": 1.9632329499592248e-05,
"loss": 0.141,
"step": 1260
},
{
"epoch": 0.11492952648130132,
"grad_norm": 0.9770751595497131,
"learning_rate": 1.962441311677335e-05,
"loss": 0.1343,
"step": 1270
},
{
"epoch": 0.11583448338272889,
"grad_norm": 0.675937831401825,
"learning_rate": 1.9616414045250417e-05,
"loss": 0.143,
"step": 1280
},
{
"epoch": 0.11673944028415646,
"grad_norm": 0.5480718016624451,
"learning_rate": 1.960833235374794e-05,
"loss": 0.1391,
"step": 1290
},
{
"epoch": 0.11764439718558403,
"grad_norm": 0.42649492621421814,
"learning_rate": 1.960016811170024e-05,
"loss": 0.1331,
"step": 1300
},
{
"epoch": 0.1185493540870116,
"grad_norm": 0.3937451243400574,
"learning_rate": 1.9591921389250872e-05,
"loss": 0.1406,
"step": 1310
},
{
"epoch": 0.11945431098843917,
"grad_norm": 0.5013184547424316,
"learning_rate": 1.958359225725204e-05,
"loss": 0.1363,
"step": 1320
},
{
"epoch": 0.12035926788986674,
"grad_norm": 0.475367933511734,
"learning_rate": 1.9575180787263955e-05,
"loss": 0.1368,
"step": 1330
},
{
"epoch": 0.12126422479129431,
"grad_norm": 0.4039798676967621,
"learning_rate": 1.956668705155426e-05,
"loss": 0.1398,
"step": 1340
},
{
"epoch": 0.12216918169272188,
"grad_norm": 0.5868827104568481,
"learning_rate": 1.955811112309737e-05,
"loss": 0.1373,
"step": 1350
},
{
"epoch": 0.12307413859414945,
"grad_norm": 0.4204873740673065,
"learning_rate": 1.9549453075573873e-05,
"loss": 0.1385,
"step": 1360
},
{
"epoch": 0.12397909549557702,
"grad_norm": 0.5889093279838562,
"learning_rate": 1.954071298336989e-05,
"loss": 0.1326,
"step": 1370
},
{
"epoch": 0.12488405239700459,
"grad_norm": 0.4504663050174713,
"learning_rate": 1.9531890921576425e-05,
"loss": 0.1371,
"step": 1380
},
{
"epoch": 0.12578900929843218,
"grad_norm": 0.40761178731918335,
"learning_rate": 1.9522986965988748e-05,
"loss": 0.1336,
"step": 1390
},
{
"epoch": 0.12669396619985973,
"grad_norm": 0.46256113052368164,
"learning_rate": 1.9514001193105693e-05,
"loss": 0.1351,
"step": 1400
},
{
"epoch": 0.12759892310128731,
"grad_norm": 0.39085566997528076,
"learning_rate": 1.9504933680129063e-05,
"loss": 0.1347,
"step": 1410
},
{
"epoch": 0.12850388000271487,
"grad_norm": 0.6131863594055176,
"learning_rate": 1.9495784504962913e-05,
"loss": 0.1356,
"step": 1420
},
{
"epoch": 0.12940883690414243,
"grad_norm": 0.4904513955116272,
"learning_rate": 1.9486553746212915e-05,
"loss": 0.1365,
"step": 1430
},
{
"epoch": 0.13031379380557,
"grad_norm": 0.2950369417667389,
"learning_rate": 1.9477241483185675e-05,
"loss": 0.1374,
"step": 1440
},
{
"epoch": 0.13121875070699757,
"grad_norm": 0.4332665503025055,
"learning_rate": 1.946784779588803e-05,
"loss": 0.1361,
"step": 1450
},
{
"epoch": 0.13212370760842515,
"grad_norm": 0.3877945840358734,
"learning_rate": 1.9458372765026402e-05,
"loss": 0.1332,
"step": 1460
},
{
"epoch": 0.1330286645098527,
"grad_norm": 0.35281670093536377,
"learning_rate": 1.9448816472006057e-05,
"loss": 0.1406,
"step": 1470
},
{
"epoch": 0.1339336214112803,
"grad_norm": 0.5009306073188782,
"learning_rate": 1.943917899893045e-05,
"loss": 0.1341,
"step": 1480
},
{
"epoch": 0.13483857831270785,
"grad_norm": 0.7299574613571167,
"learning_rate": 1.9429460428600485e-05,
"loss": 0.1337,
"step": 1490
},
{
"epoch": 0.13574353521413543,
"grad_norm": 0.4611435830593109,
"learning_rate": 1.9419660844513828e-05,
"loss": 0.1438,
"step": 1500
},
{
"epoch": 0.136648492115563,
"grad_norm": 0.4176791310310364,
"learning_rate": 1.940978033086417e-05,
"loss": 0.1399,
"step": 1510
},
{
"epoch": 0.13755344901699057,
"grad_norm": 0.3517489731311798,
"learning_rate": 1.9399818972540526e-05,
"loss": 0.1333,
"step": 1520
},
{
"epoch": 0.13845840591841813,
"grad_norm": 0.4304490387439728,
"learning_rate": 1.9389776855126472e-05,
"loss": 0.1416,
"step": 1530
},
{
"epoch": 0.1393633628198457,
"grad_norm": 0.57242751121521,
"learning_rate": 1.937965406489945e-05,
"loss": 0.1375,
"step": 1540
},
{
"epoch": 0.14026831972127327,
"grad_norm": 0.4773651361465454,
"learning_rate": 1.936945068883e-05,
"loss": 0.1281,
"step": 1550
},
{
"epoch": 0.14117327662270085,
"grad_norm": 0.4251551926136017,
"learning_rate": 1.9359166814581017e-05,
"loss": 0.1368,
"step": 1560
},
{
"epoch": 0.1420782335241284,
"grad_norm": 0.7797797918319702,
"learning_rate": 1.9348802530507003e-05,
"loss": 0.1363,
"step": 1570
},
{
"epoch": 0.142983190425556,
"grad_norm": 1.2365995645523071,
"learning_rate": 1.9338357925653312e-05,
"loss": 0.1344,
"step": 1580
},
{
"epoch": 0.14388814732698355,
"grad_norm": 0.6574030518531799,
"learning_rate": 1.932783308975537e-05,
"loss": 0.1419,
"step": 1590
},
{
"epoch": 0.14479310422841113,
"grad_norm": 0.6940555572509766,
"learning_rate": 1.9317228113237916e-05,
"loss": 0.1427,
"step": 1600
},
{
"epoch": 0.14569806112983869,
"grad_norm": 0.4072614014148712,
"learning_rate": 1.9306543087214215e-05,
"loss": 0.1306,
"step": 1610
},
{
"epoch": 0.14660301803126627,
"grad_norm": 0.37100809812545776,
"learning_rate": 1.9295778103485297e-05,
"loss": 0.1368,
"step": 1620
},
{
"epoch": 0.14750797493269382,
"grad_norm": 0.3309537470340729,
"learning_rate": 1.9284933254539143e-05,
"loss": 0.1319,
"step": 1630
},
{
"epoch": 0.1484129318341214,
"grad_norm": 0.5338348746299744,
"learning_rate": 1.9274008633549905e-05,
"loss": 0.1321,
"step": 1640
},
{
"epoch": 0.14931788873554896,
"grad_norm": 0.81573086977005,
"learning_rate": 1.9263004334377087e-05,
"loss": 0.1332,
"step": 1650
},
{
"epoch": 0.15022284563697655,
"grad_norm": 0.5445601344108582,
"learning_rate": 1.9251920451564773e-05,
"loss": 0.1335,
"step": 1660
},
{
"epoch": 0.1511278025384041,
"grad_norm": 0.5735446810722351,
"learning_rate": 1.9240757080340787e-05,
"loss": 0.1432,
"step": 1670
},
{
"epoch": 0.1520327594398317,
"grad_norm": 0.35098642110824585,
"learning_rate": 1.9229514316615875e-05,
"loss": 0.1397,
"step": 1680
},
{
"epoch": 0.15293771634125924,
"grad_norm": 0.5715315341949463,
"learning_rate": 1.9218192256982898e-05,
"loss": 0.1356,
"step": 1690
},
{
"epoch": 0.15384267324268683,
"grad_norm": 0.4406200051307678,
"learning_rate": 1.920679099871599e-05,
"loss": 0.1348,
"step": 1700
},
{
"epoch": 0.15474763014411438,
"grad_norm": 0.3725470304489136,
"learning_rate": 1.919531063976972e-05,
"loss": 0.1374,
"step": 1710
},
{
"epoch": 0.15565258704554197,
"grad_norm": 0.2997819781303406,
"learning_rate": 1.918375127877826e-05,
"loss": 0.1332,
"step": 1720
},
{
"epoch": 0.15655754394696952,
"grad_norm": 0.40308165550231934,
"learning_rate": 1.917211301505453e-05,
"loss": 0.1331,
"step": 1730
},
{
"epoch": 0.1574625008483971,
"grad_norm": 0.437809020280838,
"learning_rate": 1.916039594858935e-05,
"loss": 0.1398,
"step": 1740
},
{
"epoch": 0.15836745774982466,
"grad_norm": 0.8596120476722717,
"learning_rate": 1.914860018005058e-05,
"loss": 0.1358,
"step": 1750
},
{
"epoch": 0.15927241465125225,
"grad_norm": 0.6408957242965698,
"learning_rate": 1.913672581078224e-05,
"loss": 0.1334,
"step": 1760
},
{
"epoch": 0.1601773715526798,
"grad_norm": 0.49583616852760315,
"learning_rate": 1.912477294280367e-05,
"loss": 0.1404,
"step": 1770
},
{
"epoch": 0.16108232845410736,
"grad_norm": 0.5075445175170898,
"learning_rate": 1.911274167880863e-05,
"loss": 0.1372,
"step": 1780
},
{
"epoch": 0.16198728535553494,
"grad_norm": 0.4242574870586395,
"learning_rate": 1.9100632122164423e-05,
"loss": 0.1377,
"step": 1790
},
{
"epoch": 0.1628922422569625,
"grad_norm": 0.5238597989082336,
"learning_rate": 1.9088444376911002e-05,
"loss": 0.1427,
"step": 1800
},
{
"epoch": 0.16379719915839008,
"grad_norm": 0.43989208340644836,
"learning_rate": 1.9076178547760095e-05,
"loss": 0.1317,
"step": 1810
},
{
"epoch": 0.16470215605981764,
"grad_norm": 0.426861047744751,
"learning_rate": 1.9063834740094284e-05,
"loss": 0.1375,
"step": 1820
},
{
"epoch": 0.16560711296124522,
"grad_norm": 0.602488100528717,
"learning_rate": 1.90514130599661e-05,
"loss": 0.1319,
"step": 1830
},
{
"epoch": 0.16651206986267278,
"grad_norm": 0.3982050120830536,
"learning_rate": 1.9038913614097142e-05,
"loss": 0.1371,
"step": 1840
},
{
"epoch": 0.16741702676410036,
"grad_norm": 0.33180946111679077,
"learning_rate": 1.902633650987712e-05,
"loss": 0.1328,
"step": 1850
},
{
"epoch": 0.16832198366552792,
"grad_norm": 0.5042048096656799,
"learning_rate": 1.9013681855362952e-05,
"loss": 0.1342,
"step": 1860
},
{
"epoch": 0.1692269405669555,
"grad_norm": 0.35587260127067566,
"learning_rate": 1.9000949759277844e-05,
"loss": 0.1436,
"step": 1870
},
{
"epoch": 0.17013189746838306,
"grad_norm": 0.4799160361289978,
"learning_rate": 1.898814033101033e-05,
"loss": 0.1407,
"step": 1880
},
{
"epoch": 0.17103685436981064,
"grad_norm": 0.4712836742401123,
"learning_rate": 1.897525368061336e-05,
"loss": 0.1361,
"step": 1890
},
{
"epoch": 0.1719418112712382,
"grad_norm": 0.45217564702033997,
"learning_rate": 1.896228991880334e-05,
"loss": 0.1399,
"step": 1900
},
{
"epoch": 0.17284676817266578,
"grad_norm": 0.4772576689720154,
"learning_rate": 1.8949249156959185e-05,
"loss": 0.1426,
"step": 1910
},
{
"epoch": 0.17375172507409334,
"grad_norm": 0.3817451298236847,
"learning_rate": 1.893613150712135e-05,
"loss": 0.1332,
"step": 1920
},
{
"epoch": 0.17465668197552092,
"grad_norm": 0.35645753145217896,
"learning_rate": 1.892293708199089e-05,
"loss": 0.1274,
"step": 1930
},
{
"epoch": 0.17556163887694848,
"grad_norm": 0.5715720057487488,
"learning_rate": 1.8909665994928478e-05,
"loss": 0.1354,
"step": 1940
},
{
"epoch": 0.17646659577837606,
"grad_norm": 0.4831380248069763,
"learning_rate": 1.889631835995342e-05,
"loss": 0.1353,
"step": 1950
},
{
"epoch": 0.17737155267980362,
"grad_norm": 0.5451020002365112,
"learning_rate": 1.8882894291742703e-05,
"loss": 0.1407,
"step": 1960
},
{
"epoch": 0.1782765095812312,
"grad_norm": 0.5351752638816833,
"learning_rate": 1.886939390562999e-05,
"loss": 0.1353,
"step": 1970
},
{
"epoch": 0.17918146648265876,
"grad_norm": 0.546170175075531,
"learning_rate": 1.8855817317604622e-05,
"loss": 0.1382,
"step": 1980
},
{
"epoch": 0.18008642338408634,
"grad_norm": 0.38834547996520996,
"learning_rate": 1.8842164644310657e-05,
"loss": 0.1289,
"step": 1990
},
{
"epoch": 0.1809913802855139,
"grad_norm": 0.3078169524669647,
"learning_rate": 1.882843600304582e-05,
"loss": 0.1291,
"step": 2000
},
{
"epoch": 0.18189633718694148,
"grad_norm": 0.36554184556007385,
"learning_rate": 1.8814631511760535e-05,
"loss": 0.1445,
"step": 2010
},
{
"epoch": 0.18280129408836904,
"grad_norm": 0.5817809700965881,
"learning_rate": 1.8800751289056885e-05,
"loss": 0.1396,
"step": 2020
},
{
"epoch": 0.18370625098979662,
"grad_norm": 0.873615562915802,
"learning_rate": 1.8786795454187615e-05,
"loss": 0.1367,
"step": 2030
},
{
"epoch": 0.18461120789122418,
"grad_norm": 0.7153456807136536,
"learning_rate": 1.8772764127055087e-05,
"loss": 0.1344,
"step": 2040
},
{
"epoch": 0.18551616479265176,
"grad_norm": 0.28874504566192627,
"learning_rate": 1.8758657428210266e-05,
"loss": 0.1375,
"step": 2050
},
{
"epoch": 0.18642112169407932,
"grad_norm": 0.41073450446128845,
"learning_rate": 1.8744475478851667e-05,
"loss": 0.1392,
"step": 2060
},
{
"epoch": 0.1873260785955069,
"grad_norm": 0.4002520740032196,
"learning_rate": 1.8730218400824337e-05,
"loss": 0.1334,
"step": 2070
},
{
"epoch": 0.18823103549693446,
"grad_norm": 0.49055391550064087,
"learning_rate": 1.871588631661879e-05,
"loss": 0.1398,
"step": 2080
},
{
"epoch": 0.18913599239836204,
"grad_norm": 0.3669786751270294,
"learning_rate": 1.8701479349369957e-05,
"loss": 0.1309,
"step": 2090
},
{
"epoch": 0.1900409492997896,
"grad_norm": 0.4466594159603119,
"learning_rate": 1.8686997622856134e-05,
"loss": 0.1361,
"step": 2100
},
{
"epoch": 0.19094590620121718,
"grad_norm": 0.37340307235717773,
"learning_rate": 1.8672441261497915e-05,
"loss": 0.1314,
"step": 2110
},
{
"epoch": 0.19185086310264474,
"grad_norm": 0.35569268465042114,
"learning_rate": 1.8657810390357126e-05,
"loss": 0.1385,
"step": 2120
},
{
"epoch": 0.1927558200040723,
"grad_norm": 0.8156322240829468,
"learning_rate": 1.8643105135135743e-05,
"loss": 0.1358,
"step": 2130
},
{
"epoch": 0.19366077690549988,
"grad_norm": 0.4480380117893219,
"learning_rate": 1.8628325622174818e-05,
"loss": 0.1367,
"step": 2140
},
{
"epoch": 0.19456573380692743,
"grad_norm": 0.5346203446388245,
"learning_rate": 1.86134719784534e-05,
"loss": 0.1367,
"step": 2150
},
{
"epoch": 0.19547069070835502,
"grad_norm": 0.3813647925853729,
"learning_rate": 1.8598544331587427e-05,
"loss": 0.1386,
"step": 2160
},
{
"epoch": 0.19637564760978257,
"grad_norm": 0.5341768860816956,
"learning_rate": 1.858354280982865e-05,
"loss": 0.1298,
"step": 2170
},
{
"epoch": 0.19728060451121016,
"grad_norm": 0.3788771331310272,
"learning_rate": 1.8568467542063505e-05,
"loss": 0.1416,
"step": 2180
},
{
"epoch": 0.1981855614126377,
"grad_norm": 0.28519588708877563,
"learning_rate": 1.8553318657812035e-05,
"loss": 0.1336,
"step": 2190
},
{
"epoch": 0.1990905183140653,
"grad_norm": 0.38430145382881165,
"learning_rate": 1.853809628722676e-05,
"loss": 0.1346,
"step": 2200
},
{
"epoch": 0.19999547521549285,
"grad_norm": 0.3612518906593323,
"learning_rate": 1.8522800561091556e-05,
"loss": 0.1344,
"step": 2210
},
{
"epoch": 0.20090043211692044,
"grad_norm": 0.36791539192199707,
"learning_rate": 1.8507431610820547e-05,
"loss": 0.1345,
"step": 2220
},
{
"epoch": 0.201805389018348,
"grad_norm": 0.8380405902862549,
"learning_rate": 1.8491989568456962e-05,
"loss": 0.1343,
"step": 2230
},
{
"epoch": 0.20271034591977558,
"grad_norm": 0.4786330461502075,
"learning_rate": 1.8476474566671995e-05,
"loss": 0.1409,
"step": 2240
},
{
"epoch": 0.20361530282120313,
"grad_norm": 0.5574005842208862,
"learning_rate": 1.8460886738763698e-05,
"loss": 0.1324,
"step": 2250
},
{
"epoch": 0.20452025972263072,
"grad_norm": 0.3608987033367157,
"learning_rate": 1.8445226218655787e-05,
"loss": 0.1429,
"step": 2260
},
{
"epoch": 0.20542521662405827,
"grad_norm": 0.3293229639530182,
"learning_rate": 1.842949314089654e-05,
"loss": 0.1332,
"step": 2270
},
{
"epoch": 0.20633017352548585,
"grad_norm": 0.594818651676178,
"learning_rate": 1.8413687640657602e-05,
"loss": 0.1354,
"step": 2280
},
{
"epoch": 0.2072351304269134,
"grad_norm": 0.3503284156322479,
"learning_rate": 1.8397809853732846e-05,
"loss": 0.1373,
"step": 2290
},
{
"epoch": 0.208140087328341,
"grad_norm": 0.6148970127105713,
"learning_rate": 1.8381859916537204e-05,
"loss": 0.1435,
"step": 2300
},
{
"epoch": 0.20904504422976855,
"grad_norm": 0.39734190702438354,
"learning_rate": 1.8365837966105486e-05,
"loss": 0.1325,
"step": 2310
},
{
"epoch": 0.20995000113119613,
"grad_norm": 0.3820249140262604,
"learning_rate": 1.8349744140091205e-05,
"loss": 0.1307,
"step": 2320
},
{
"epoch": 0.2108549580326237,
"grad_norm": 0.2815605700016022,
"learning_rate": 1.83335785767654e-05,
"loss": 0.1395,
"step": 2330
},
{
"epoch": 0.21175991493405127,
"grad_norm": 0.26476067304611206,
"learning_rate": 1.831734141501546e-05,
"loss": 0.1332,
"step": 2340
},
{
"epoch": 0.21266487183547883,
"grad_norm": 0.4209696352481842,
"learning_rate": 1.830103279434389e-05,
"loss": 0.1373,
"step": 2350
},
{
"epoch": 0.21356982873690641,
"grad_norm": 0.29990458488464355,
"learning_rate": 1.828465285486716e-05,
"loss": 0.1345,
"step": 2360
},
{
"epoch": 0.21447478563833397,
"grad_norm": 0.39812973141670227,
"learning_rate": 1.826820173731446e-05,
"loss": 0.1361,
"step": 2370
},
{
"epoch": 0.21537974253976155,
"grad_norm": 0.35514476895332336,
"learning_rate": 1.825167958302653e-05,
"loss": 0.1353,
"step": 2380
},
{
"epoch": 0.2162846994411891,
"grad_norm": 0.3101460039615631,
"learning_rate": 1.8235086533954418e-05,
"loss": 0.1369,
"step": 2390
},
{
"epoch": 0.2171896563426167,
"grad_norm": 0.42703959345817566,
"learning_rate": 1.8218422732658263e-05,
"loss": 0.1348,
"step": 2400
},
{
"epoch": 0.21809461324404425,
"grad_norm": 0.44410404562950134,
"learning_rate": 1.820168832230609e-05,
"loss": 0.1292,
"step": 2410
},
{
"epoch": 0.21899957014547183,
"grad_norm": 0.4274084270000458,
"learning_rate": 1.8184883446672545e-05,
"loss": 0.1325,
"step": 2420
},
{
"epoch": 0.2199045270468994,
"grad_norm": 0.3276132643222809,
"learning_rate": 1.81680082501377e-05,
"loss": 0.1304,
"step": 2430
},
{
"epoch": 0.22080948394832697,
"grad_norm": 0.28649523854255676,
"learning_rate": 1.8151062877685785e-05,
"loss": 0.1379,
"step": 2440
},
{
"epoch": 0.22171444084975453,
"grad_norm": 0.3112781345844269,
"learning_rate": 1.813404747490395e-05,
"loss": 0.1324,
"step": 2450
},
{
"epoch": 0.2226193977511821,
"grad_norm": 0.37147268652915955,
"learning_rate": 1.811696218798102e-05,
"loss": 0.1362,
"step": 2460
},
{
"epoch": 0.22352435465260967,
"grad_norm": 0.30297672748565674,
"learning_rate": 1.8099807163706225e-05,
"loss": 0.1382,
"step": 2470
},
{
"epoch": 0.22442931155403723,
"grad_norm": 0.3513183295726776,
"learning_rate": 1.808258254946795e-05,
"loss": 0.1313,
"step": 2480
},
{
"epoch": 0.2253342684554648,
"grad_norm": 0.2990841865539551,
"learning_rate": 1.806528849325248e-05,
"loss": 0.1361,
"step": 2490
},
{
"epoch": 0.22623922535689237,
"grad_norm": 0.44124889373779297,
"learning_rate": 1.8047925143642685e-05,
"loss": 0.1348,
"step": 2500
},
{
"epoch": 0.22714418225831995,
"grad_norm": 0.5837457776069641,
"learning_rate": 1.8030492649816807e-05,
"loss": 0.1384,
"step": 2510
},
{
"epoch": 0.2280491391597475,
"grad_norm": 0.4553276598453522,
"learning_rate": 1.801299116154712e-05,
"loss": 0.1345,
"step": 2520
},
{
"epoch": 0.2289540960611751,
"grad_norm": 0.5152564644813538,
"learning_rate": 1.7995420829198677e-05,
"loss": 0.1319,
"step": 2530
},
{
"epoch": 0.22985905296260264,
"grad_norm": 0.4430082440376282,
"learning_rate": 1.7977781803728012e-05,
"loss": 0.1352,
"step": 2540
},
{
"epoch": 0.23076400986403023,
"grad_norm": 0.47474417090415955,
"learning_rate": 1.7960074236681832e-05,
"loss": 0.1387,
"step": 2550
},
{
"epoch": 0.23166896676545778,
"grad_norm": 0.3780491054058075,
"learning_rate": 1.7942298280195735e-05,
"loss": 0.1369,
"step": 2560
},
{
"epoch": 0.23257392366688537,
"grad_norm": 0.3033972680568695,
"learning_rate": 1.7924454086992874e-05,
"loss": 0.1297,
"step": 2570
},
{
"epoch": 0.23347888056831292,
"grad_norm": 0.3683450520038605,
"learning_rate": 1.7906541810382676e-05,
"loss": 0.1318,
"step": 2580
},
{
"epoch": 0.2343838374697405,
"grad_norm": 0.3137243092060089,
"learning_rate": 1.78885616042595e-05,
"loss": 0.1299,
"step": 2590
},
{
"epoch": 0.23528879437116806,
"grad_norm": 0.6204285621643066,
"learning_rate": 1.787051362310134e-05,
"loss": 0.1328,
"step": 2600
},
{
"epoch": 0.23619375127259565,
"grad_norm": 0.3297698497772217,
"learning_rate": 1.785239802196847e-05,
"loss": 0.1266,
"step": 2610
},
{
"epoch": 0.2370987081740232,
"grad_norm": 0.27107271552085876,
"learning_rate": 1.7834214956502124e-05,
"loss": 0.1386,
"step": 2620
},
{
"epoch": 0.2380036650754508,
"grad_norm": 0.3303036093711853,
"learning_rate": 1.781596458292317e-05,
"loss": 0.1314,
"step": 2630
},
{
"epoch": 0.23890862197687834,
"grad_norm": 0.2860264480113983,
"learning_rate": 1.7797647058030748e-05,
"loss": 0.1341,
"step": 2640
},
{
"epoch": 0.23981357887830593,
"grad_norm": 0.333578497171402,
"learning_rate": 1.7779262539200937e-05,
"loss": 0.134,
"step": 2650
},
{
"epoch": 0.24071853577973348,
"grad_norm": 0.3807545602321625,
"learning_rate": 1.7760811184385406e-05,
"loss": 0.1327,
"step": 2660
},
{
"epoch": 0.24162349268116107,
"grad_norm": 0.3607015013694763,
"learning_rate": 1.7742293152110033e-05,
"loss": 0.1356,
"step": 2670
},
{
"epoch": 0.24252844958258862,
"grad_norm": 0.4713596701622009,
"learning_rate": 1.7723708601473566e-05,
"loss": 0.1371,
"step": 2680
},
{
"epoch": 0.2434334064840162,
"grad_norm": 0.5265551805496216,
"learning_rate": 1.7705057692146258e-05,
"loss": 0.1293,
"step": 2690
},
{
"epoch": 0.24433836338544376,
"grad_norm": 0.35034945607185364,
"learning_rate": 1.768634058436847e-05,
"loss": 0.1354,
"step": 2700
},
{
"epoch": 0.24524332028687135,
"grad_norm": 0.3199910819530487,
"learning_rate": 1.7667557438949328e-05,
"loss": 0.1411,
"step": 2710
},
{
"epoch": 0.2461482771882989,
"grad_norm": 0.379367858171463,
"learning_rate": 1.7648708417265314e-05,
"loss": 0.1345,
"step": 2720
},
{
"epoch": 0.2470532340897265,
"grad_norm": 0.5331190824508667,
"learning_rate": 1.7629793681258892e-05,
"loss": 0.133,
"step": 2730
},
{
"epoch": 0.24795819099115404,
"grad_norm": 0.29094645380973816,
"learning_rate": 1.761081339343711e-05,
"loss": 0.1288,
"step": 2740
},
{
"epoch": 0.24886314789258163,
"grad_norm": 0.47043576836586,
"learning_rate": 1.759176771687022e-05,
"loss": 0.1368,
"step": 2750
},
{
"epoch": 0.24976810479400918,
"grad_norm": 0.8411908149719238,
"learning_rate": 1.7572656815190253e-05,
"loss": 0.1326,
"step": 2760
},
{
"epoch": 0.25067306169543674,
"grad_norm": 0.39577701687812805,
"learning_rate": 1.7553480852589635e-05,
"loss": 0.1336,
"step": 2770
},
{
"epoch": 0.25157801859686435,
"grad_norm": 0.5741309523582458,
"learning_rate": 1.7534239993819758e-05,
"loss": 0.1367,
"step": 2780
},
{
"epoch": 0.2524829754982919,
"grad_norm": 0.3736680746078491,
"learning_rate": 1.7514934404189574e-05,
"loss": 0.1259,
"step": 2790
},
{
"epoch": 0.25338793239971946,
"grad_norm": 0.4589287042617798,
"learning_rate": 1.7495564249564184e-05,
"loss": 0.1319,
"step": 2800
},
{
"epoch": 0.254292889301147,
"grad_norm": 0.4368409514427185,
"learning_rate": 1.7476129696363394e-05,
"loss": 0.1282,
"step": 2810
},
{
"epoch": 0.25519784620257463,
"grad_norm": 0.3239140212535858,
"learning_rate": 1.7456630911560294e-05,
"loss": 0.1309,
"step": 2820
},
{
"epoch": 0.2561028031040022,
"grad_norm": 0.3988368809223175,
"learning_rate": 1.7437068062679827e-05,
"loss": 0.1338,
"step": 2830
},
{
"epoch": 0.25700776000542974,
"grad_norm": 0.45862647891044617,
"learning_rate": 1.7417441317797342e-05,
"loss": 0.1344,
"step": 2840
},
{
"epoch": 0.2579127169068573,
"grad_norm": 0.3606816232204437,
"learning_rate": 1.7397750845537163e-05,
"loss": 0.1369,
"step": 2850
},
{
"epoch": 0.25881767380828485,
"grad_norm": 0.44180789589881897,
"learning_rate": 1.7377996815071122e-05,
"loss": 0.1299,
"step": 2860
},
{
"epoch": 0.25972263070971247,
"grad_norm": 0.30617383122444153,
"learning_rate": 1.7358179396117118e-05,
"loss": 0.1334,
"step": 2870
},
{
"epoch": 0.26062758761114,
"grad_norm": 0.3464939594268799,
"learning_rate": 1.7338298758937656e-05,
"loss": 0.1317,
"step": 2880
},
{
"epoch": 0.2615325445125676,
"grad_norm": 0.3210926949977875,
"learning_rate": 1.7318355074338387e-05,
"loss": 0.1334,
"step": 2890
},
{
"epoch": 0.26243750141399513,
"grad_norm": 0.309063196182251,
"learning_rate": 1.7298348513666632e-05,
"loss": 0.1432,
"step": 2900
},
{
"epoch": 0.26334245831542274,
"grad_norm": 0.3307989835739136,
"learning_rate": 1.727827924880992e-05,
"loss": 0.1376,
"step": 2910
},
{
"epoch": 0.2642474152168503,
"grad_norm": 0.2714211046695709,
"learning_rate": 1.725814745219451e-05,
"loss": 0.1338,
"step": 2920
},
{
"epoch": 0.26515237211827786,
"grad_norm": 0.4215063750743866,
"learning_rate": 1.723795329678389e-05,
"loss": 0.1346,
"step": 2930
},
{
"epoch": 0.2660573290197054,
"grad_norm": 0.36363697052001953,
"learning_rate": 1.721769695607733e-05,
"loss": 0.1323,
"step": 2940
},
{
"epoch": 0.266962285921133,
"grad_norm": 0.3916791081428528,
"learning_rate": 1.7197378604108352e-05,
"loss": 0.1299,
"step": 2950
},
{
"epoch": 0.2678672428225606,
"grad_norm": 0.3785913288593292,
"learning_rate": 1.7176998415443256e-05,
"loss": 0.1328,
"step": 2960
},
{
"epoch": 0.26877219972398814,
"grad_norm": 0.329843670129776,
"learning_rate": 1.7156556565179618e-05,
"loss": 0.1316,
"step": 2970
},
{
"epoch": 0.2696771566254157,
"grad_norm": 0.30012860894203186,
"learning_rate": 1.713605322894478e-05,
"loss": 0.1344,
"step": 2980
},
{
"epoch": 0.2705821135268433,
"grad_norm": 0.33648693561553955,
"learning_rate": 1.7115488582894345e-05,
"loss": 0.1238,
"step": 2990
},
{
"epoch": 0.27148707042827086,
"grad_norm": 0.4068211317062378,
"learning_rate": 1.7094862803710665e-05,
"loss": 0.1414,
"step": 3000
},
{
"epoch": 0.2723920273296984,
"grad_norm": 0.34052276611328125,
"learning_rate": 1.7074176068601318e-05,
"loss": 0.1314,
"step": 3010
},
{
"epoch": 0.273296984231126,
"grad_norm": 0.4234228730201721,
"learning_rate": 1.705342855529759e-05,
"loss": 0.1344,
"step": 3020
},
{
"epoch": 0.2742019411325536,
"grad_norm": 0.3007555603981018,
"learning_rate": 1.7032620442052948e-05,
"loss": 0.1403,
"step": 3030
},
{
"epoch": 0.27510689803398114,
"grad_norm": 0.3519713878631592,
"learning_rate": 1.70117519076415e-05,
"loss": 0.1326,
"step": 3040
},
{
"epoch": 0.2760118549354087,
"grad_norm": 0.6432228088378906,
"learning_rate": 1.699082313135648e-05,
"loss": 0.1339,
"step": 3050
},
{
"epoch": 0.27691681183683625,
"grad_norm": 0.39598795771598816,
"learning_rate": 1.6969834293008674e-05,
"loss": 0.133,
"step": 3060
},
{
"epoch": 0.27782176873826386,
"grad_norm": 0.3361396789550781,
"learning_rate": 1.6948785572924912e-05,
"loss": 0.1258,
"step": 3070
},
{
"epoch": 0.2787267256396914,
"grad_norm": 0.3960329592227936,
"learning_rate": 1.692767715194649e-05,
"loss": 0.1315,
"step": 3080
},
{
"epoch": 0.279631682541119,
"grad_norm": 0.3047797381877899,
"learning_rate": 1.6906509211427633e-05,
"loss": 0.1321,
"step": 3090
},
{
"epoch": 0.28053663944254653,
"grad_norm": 0.3368310332298279,
"learning_rate": 1.6885281933233936e-05,
"loss": 0.1421,
"step": 3100
},
{
"epoch": 0.28144159634397414,
"grad_norm": 0.5269041061401367,
"learning_rate": 1.6863995499740785e-05,
"loss": 0.1342,
"step": 3110
},
{
"epoch": 0.2823465532454017,
"grad_norm": 0.3757074773311615,
"learning_rate": 1.6842650093831817e-05,
"loss": 0.1347,
"step": 3120
},
{
"epoch": 0.28325151014682926,
"grad_norm": 0.4319840669631958,
"learning_rate": 1.6821245898897317e-05,
"loss": 0.1368,
"step": 3130
},
{
"epoch": 0.2841564670482568,
"grad_norm": 0.2823663651943207,
"learning_rate": 1.6799783098832677e-05,
"loss": 0.1318,
"step": 3140
},
{
"epoch": 0.2850614239496844,
"grad_norm": 0.3386325538158417,
"learning_rate": 1.6778261878036784e-05,
"loss": 0.1335,
"step": 3150
},
{
"epoch": 0.285966380851112,
"grad_norm": 0.44977718591690063,
"learning_rate": 1.6756682421410454e-05,
"loss": 0.1342,
"step": 3160
},
{
"epoch": 0.28687133775253953,
"grad_norm": 0.43069374561309814,
"learning_rate": 1.6735044914354853e-05,
"loss": 0.1316,
"step": 3170
},
{
"epoch": 0.2877762946539671,
"grad_norm": 0.47004443407058716,
"learning_rate": 1.6713349542769865e-05,
"loss": 0.1353,
"step": 3180
},
{
"epoch": 0.28868125155539465,
"grad_norm": 0.5096125602722168,
"learning_rate": 1.6691596493052543e-05,
"loss": 0.1346,
"step": 3190
},
{
"epoch": 0.28958620845682226,
"grad_norm": 0.4015505611896515,
"learning_rate": 1.6669785952095468e-05,
"loss": 0.1334,
"step": 3200
},
{
"epoch": 0.2904911653582498,
"grad_norm": 0.37036389112472534,
"learning_rate": 1.6647918107285182e-05,
"loss": 0.127,
"step": 3210
},
{
"epoch": 0.29139612225967737,
"grad_norm": 0.39770999550819397,
"learning_rate": 1.6625993146500536e-05,
"loss": 0.1355,
"step": 3220
},
{
"epoch": 0.2923010791611049,
"grad_norm": 0.43057796359062195,
"learning_rate": 1.6604011258111097e-05,
"loss": 0.1358,
"step": 3230
},
{
"epoch": 0.29320603606253254,
"grad_norm": 0.3283129930496216,
"learning_rate": 1.658197263097555e-05,
"loss": 0.1317,
"step": 3240
},
{
"epoch": 0.2941109929639601,
"grad_norm": 0.28569671511650085,
"learning_rate": 1.6559877454440025e-05,
"loss": 0.1351,
"step": 3250
},
{
"epoch": 0.29501594986538765,
"grad_norm": 0.29221346974372864,
"learning_rate": 1.6537725918336524e-05,
"loss": 0.135,
"step": 3260
},
{
"epoch": 0.2959209067668152,
"grad_norm": 3.769948959350586,
"learning_rate": 1.6515518212981248e-05,
"loss": 0.1562,
"step": 3270
},
{
"epoch": 0.2968258636682428,
"grad_norm": 0.6297938823699951,
"learning_rate": 1.6493254529172996e-05,
"loss": 0.1332,
"step": 3280
},
{
"epoch": 0.2977308205696704,
"grad_norm": 0.4224908649921417,
"learning_rate": 1.647093505819149e-05,
"loss": 0.1362,
"step": 3290
},
{
"epoch": 0.29863577747109793,
"grad_norm": 0.60915607213974,
"learning_rate": 1.6448559991795762e-05,
"loss": 0.1331,
"step": 3300
},
{
"epoch": 0.2995407343725255,
"grad_norm": 0.4966517686843872,
"learning_rate": 1.64261295222225e-05,
"loss": 0.1264,
"step": 3310
},
{
"epoch": 0.3004456912739531,
"grad_norm": 0.8263148069381714,
"learning_rate": 1.6403643842184383e-05,
"loss": 0.1284,
"step": 3320
},
{
"epoch": 0.30135064817538065,
"grad_norm": 0.2521939277648926,
"learning_rate": 1.6381103144868434e-05,
"loss": 0.1321,
"step": 3330
},
{
"epoch": 0.3022556050768082,
"grad_norm": 0.3264107406139374,
"learning_rate": 1.6358507623934368e-05,
"loss": 0.1338,
"step": 3340
},
{
"epoch": 0.30316056197823577,
"grad_norm": 0.3555287718772888,
"learning_rate": 1.6335857473512908e-05,
"loss": 0.1318,
"step": 3350
},
{
"epoch": 0.3040655188796634,
"grad_norm": 0.3218032717704773,
"learning_rate": 1.6313152888204143e-05,
"loss": 0.1244,
"step": 3360
},
{
"epoch": 0.30497047578109093,
"grad_norm": 0.37903180718421936,
"learning_rate": 1.629039406307583e-05,
"loss": 0.1315,
"step": 3370
},
{
"epoch": 0.3058754326825185,
"grad_norm": 0.32837244868278503,
"learning_rate": 1.626758119366174e-05,
"loss": 0.1299,
"step": 3380
},
{
"epoch": 0.30678038958394604,
"grad_norm": 0.5855687856674194,
"learning_rate": 1.6244714475959958e-05,
"loss": 0.1372,
"step": 3390
},
{
"epoch": 0.30768534648537366,
"grad_norm": 0.30140820145606995,
"learning_rate": 1.622179410643123e-05,
"loss": 0.1303,
"step": 3400
},
{
"epoch": 0.3085903033868012,
"grad_norm": 0.32345208525657654,
"learning_rate": 1.619882028199723e-05,
"loss": 0.1318,
"step": 3410
},
{
"epoch": 0.30949526028822877,
"grad_norm": 0.4127529561519623,
"learning_rate": 1.617579320003891e-05,
"loss": 0.1325,
"step": 3420
},
{
"epoch": 0.3104002171896563,
"grad_norm": 0.2518954873085022,
"learning_rate": 1.6152713058394778e-05,
"loss": 0.1323,
"step": 3430
},
{
"epoch": 0.31130517409108394,
"grad_norm": 0.385233610868454,
"learning_rate": 1.612958005535921e-05,
"loss": 0.1321,
"step": 3440
},
{
"epoch": 0.3122101309925115,
"grad_norm": 4.388869285583496,
"learning_rate": 1.6106394389680752e-05,
"loss": 0.1392,
"step": 3450
},
{
"epoch": 0.31311508789393905,
"grad_norm": 0.3289415240287781,
"learning_rate": 1.6083156260560387e-05,
"loss": 0.1319,
"step": 3460
},
{
"epoch": 0.3140200447953666,
"grad_norm": 0.2971116900444031,
"learning_rate": 1.605986586764986e-05,
"loss": 0.1314,
"step": 3470
},
{
"epoch": 0.3149250016967942,
"grad_norm": 0.2933705449104309,
"learning_rate": 1.603652341104993e-05,
"loss": 0.1331,
"step": 3480
},
{
"epoch": 0.31582995859822177,
"grad_norm": 0.2969614863395691,
"learning_rate": 1.6013129091308658e-05,
"loss": 0.1356,
"step": 3490
},
{
"epoch": 0.3167349154996493,
"grad_norm": 0.48023778200149536,
"learning_rate": 1.5989683109419717e-05,
"loss": 0.1296,
"step": 3500
},
{
"epoch": 0.3176398724010769,
"grad_norm": 0.3945216238498688,
"learning_rate": 1.5966185666820608e-05,
"loss": 0.1393,
"step": 3510
},
{
"epoch": 0.3185448293025045,
"grad_norm": 0.34867751598358154,
"learning_rate": 1.5942636965390983e-05,
"loss": 0.1316,
"step": 3520
},
{
"epoch": 0.31944978620393205,
"grad_norm": 0.4952505826950073,
"learning_rate": 1.5919037207450873e-05,
"loss": 0.1292,
"step": 3530
},
{
"epoch": 0.3203547431053596,
"grad_norm": 0.35089966654777527,
"learning_rate": 1.589538659575897e-05,
"loss": 0.1339,
"step": 3540
},
{
"epoch": 0.32125970000678716,
"grad_norm": 0.5478940606117249,
"learning_rate": 1.5871685333510873e-05,
"loss": 0.1331,
"step": 3550
},
{
"epoch": 0.3221646569082147,
"grad_norm": 0.41318562626838684,
"learning_rate": 1.584793362433736e-05,
"loss": 0.1328,
"step": 3560
},
{
"epoch": 0.32306961380964233,
"grad_norm": 0.3964232802391052,
"learning_rate": 1.5824131672302608e-05,
"loss": 0.1445,
"step": 3570
},
{
"epoch": 0.3239745707110699,
"grad_norm": 0.3068403899669647,
"learning_rate": 1.5800279681902483e-05,
"loss": 0.1275,
"step": 3580
},
{
"epoch": 0.32487952761249744,
"grad_norm": 0.2769792079925537,
"learning_rate": 1.5776377858062737e-05,
"loss": 0.1286,
"step": 3590
},
{
"epoch": 0.325784484513925,
"grad_norm": 0.5100188255310059,
"learning_rate": 1.5752426406137275e-05,
"loss": 0.1344,
"step": 3600
},
{
"epoch": 0.3266894414153526,
"grad_norm": 0.3469770550727844,
"learning_rate": 1.5728425531906396e-05,
"loss": 0.1323,
"step": 3610
},
{
"epoch": 0.32759439831678017,
"grad_norm": 0.5681571960449219,
"learning_rate": 1.5704375441574996e-05,
"loss": 0.1335,
"step": 3620
},
{
"epoch": 0.3284993552182077,
"grad_norm": 0.3062500059604645,
"learning_rate": 1.568027634177083e-05,
"loss": 0.1367,
"step": 3630
},
{
"epoch": 0.3294043121196353,
"grad_norm": 0.3792308270931244,
"learning_rate": 1.5656128439542704e-05,
"loss": 0.1301,
"step": 3640
},
{
"epoch": 0.3303092690210629,
"grad_norm": 0.3073217272758484,
"learning_rate": 1.5631931942358723e-05,
"loss": 0.1304,
"step": 3650
},
{
"epoch": 0.33121422592249045,
"grad_norm": 0.3077130615711212,
"learning_rate": 1.560768705810451e-05,
"loss": 0.1293,
"step": 3660
},
{
"epoch": 0.332119182823918,
"grad_norm": 0.34068816900253296,
"learning_rate": 1.558339399508138e-05,
"loss": 0.138,
"step": 3670
},
{
"epoch": 0.33302413972534556,
"grad_norm": 0.366473525762558,
"learning_rate": 1.55590529620046e-05,
"loss": 0.1243,
"step": 3680
},
{
"epoch": 0.33392909662677317,
"grad_norm": 0.40335702896118164,
"learning_rate": 1.553466416800157e-05,
"loss": 0.1305,
"step": 3690
},
{
"epoch": 0.3348340535282007,
"grad_norm": 0.3505435287952423,
"learning_rate": 1.551022782261003e-05,
"loss": 0.1332,
"step": 3700
},
{
"epoch": 0.3357390104296283,
"grad_norm": 0.5044335126876831,
"learning_rate": 1.5485744135776258e-05,
"loss": 0.1326,
"step": 3710
},
{
"epoch": 0.33664396733105584,
"grad_norm": 0.3738217055797577,
"learning_rate": 1.546121331785327e-05,
"loss": 0.1251,
"step": 3720
},
{
"epoch": 0.33754892423248345,
"grad_norm": 0.3503580093383789,
"learning_rate": 1.5436635579599014e-05,
"loss": 0.1349,
"step": 3730
},
{
"epoch": 0.338453881133911,
"grad_norm": 0.3351600766181946,
"learning_rate": 1.541201113217456e-05,
"loss": 0.1275,
"step": 3740
},
{
"epoch": 0.33935883803533856,
"grad_norm": 0.2801141142845154,
"learning_rate": 1.538734018714227e-05,
"loss": 0.1278,
"step": 3750
},
{
"epoch": 0.3402637949367661,
"grad_norm": 0.26787105202674866,
"learning_rate": 1.5362622956463998e-05,
"loss": 0.1413,
"step": 3760
},
{
"epoch": 0.34116875183819373,
"grad_norm": 0.394846111536026,
"learning_rate": 1.5337859652499277e-05,
"loss": 0.1354,
"step": 3770
},
{
"epoch": 0.3420737087396213,
"grad_norm": 0.3125811815261841,
"learning_rate": 1.531305048800346e-05,
"loss": 0.1324,
"step": 3780
},
{
"epoch": 0.34297866564104884,
"grad_norm": 0.3050634264945984,
"learning_rate": 1.5288195676125937e-05,
"loss": 0.1345,
"step": 3790
},
{
"epoch": 0.3438836225424764,
"grad_norm": 0.257493257522583,
"learning_rate": 1.5263295430408255e-05,
"loss": 0.1311,
"step": 3800
},
{
"epoch": 0.344788579443904,
"grad_norm": 0.3533366620540619,
"learning_rate": 1.5238349964782325e-05,
"loss": 0.1282,
"step": 3810
},
{
"epoch": 0.34569353634533156,
"grad_norm": 0.3291451036930084,
"learning_rate": 1.5213359493568562e-05,
"loss": 0.1218,
"step": 3820
},
{
"epoch": 0.3465984932467591,
"grad_norm": 0.3743618130683899,
"learning_rate": 1.5188324231474054e-05,
"loss": 0.1295,
"step": 3830
},
{
"epoch": 0.3475034501481867,
"grad_norm": 0.5372154712677002,
"learning_rate": 1.51632443935907e-05,
"loss": 0.1331,
"step": 3840
},
{
"epoch": 0.3484084070496143,
"grad_norm": 0.3277980387210846,
"learning_rate": 1.5138120195393396e-05,
"loss": 0.1387,
"step": 3850
},
{
"epoch": 0.34931336395104184,
"grad_norm": 0.40068790316581726,
"learning_rate": 1.5112951852738138e-05,
"loss": 0.1301,
"step": 3860
},
{
"epoch": 0.3502183208524694,
"grad_norm": 0.32944217324256897,
"learning_rate": 1.5087739581860213e-05,
"loss": 0.1312,
"step": 3870
},
{
"epoch": 0.35112327775389696,
"grad_norm": 0.3983187675476074,
"learning_rate": 1.50624835993723e-05,
"loss": 0.1325,
"step": 3880
},
{
"epoch": 0.35202823465532457,
"grad_norm": 0.3687704801559448,
"learning_rate": 1.5037184122262645e-05,
"loss": 0.128,
"step": 3890
},
{
"epoch": 0.3529331915567521,
"grad_norm": 0.4385727345943451,
"learning_rate": 1.501184136789317e-05,
"loss": 0.1329,
"step": 3900
},
{
"epoch": 0.3538381484581797,
"grad_norm": 0.29301881790161133,
"learning_rate": 1.4986455553997625e-05,
"loss": 0.1301,
"step": 3910
},
{
"epoch": 0.35474310535960724,
"grad_norm": 0.4024035632610321,
"learning_rate": 1.4961026898679703e-05,
"loss": 0.1325,
"step": 3920
},
{
"epoch": 0.3556480622610348,
"grad_norm": 0.32220858335494995,
"learning_rate": 1.4935555620411168e-05,
"loss": 0.1361,
"step": 3930
},
{
"epoch": 0.3565530191624624,
"grad_norm": 0.2925558388233185,
"learning_rate": 1.4910041938029993e-05,
"loss": 0.1299,
"step": 3940
},
{
"epoch": 0.35745797606388996,
"grad_norm": 0.3419649600982666,
"learning_rate": 1.4884486070738457e-05,
"loss": 0.1371,
"step": 3950
},
{
"epoch": 0.3583629329653175,
"grad_norm": 0.37976405024528503,
"learning_rate": 1.4858888238101278e-05,
"loss": 0.1381,
"step": 3960
},
{
"epoch": 0.35926788986674507,
"grad_norm": 0.3191063106060028,
"learning_rate": 1.483324866004372e-05,
"loss": 0.1299,
"step": 3970
},
{
"epoch": 0.3601728467681727,
"grad_norm": 0.35269689559936523,
"learning_rate": 1.4807567556849707e-05,
"loss": 0.134,
"step": 3980
},
{
"epoch": 0.36107780366960024,
"grad_norm": 0.44557368755340576,
"learning_rate": 1.478184514915993e-05,
"loss": 0.1333,
"step": 3990
},
{
"epoch": 0.3619827605710278,
"grad_norm": 0.26233455538749695,
"learning_rate": 1.4756081657969947e-05,
"loss": 0.126,
"step": 4000
},
{
"epoch": 0.36288771747245535,
"grad_norm": 0.27105700969696045,
"learning_rate": 1.4730277304628287e-05,
"loss": 0.1338,
"step": 4010
},
{
"epoch": 0.36379267437388296,
"grad_norm": 0.45541515946388245,
"learning_rate": 1.4704432310834551e-05,
"loss": 0.1327,
"step": 4020
},
{
"epoch": 0.3646976312753105,
"grad_norm": 0.31574854254722595,
"learning_rate": 1.4678546898637502e-05,
"loss": 0.1329,
"step": 4030
},
{
"epoch": 0.3656025881767381,
"grad_norm": 0.3075743019580841,
"learning_rate": 1.4652621290433166e-05,
"loss": 0.1297,
"step": 4040
},
{
"epoch": 0.36650754507816563,
"grad_norm": 0.32941344380378723,
"learning_rate": 1.4626655708962904e-05,
"loss": 0.125,
"step": 4050
},
{
"epoch": 0.36741250197959324,
"grad_norm": 0.3551495671272278,
"learning_rate": 1.4600650377311523e-05,
"loss": 0.134,
"step": 4060
},
{
"epoch": 0.3683174588810208,
"grad_norm": 0.3904673457145691,
"learning_rate": 1.4574605518905336e-05,
"loss": 0.131,
"step": 4070
},
{
"epoch": 0.36922241578244835,
"grad_norm": 0.3711760342121124,
"learning_rate": 1.4548521357510256e-05,
"loss": 0.1302,
"step": 4080
},
{
"epoch": 0.3701273726838759,
"grad_norm": 0.23497672379016876,
"learning_rate": 1.4522398117229874e-05,
"loss": 0.1352,
"step": 4090
},
{
"epoch": 0.3710323295853035,
"grad_norm": 0.39998266100883484,
"learning_rate": 1.4496236022503523e-05,
"loss": 0.1326,
"step": 4100
},
{
"epoch": 0.3719372864867311,
"grad_norm": 0.38762685656547546,
"learning_rate": 1.4470035298104355e-05,
"loss": 0.1269,
"step": 4110
},
{
"epoch": 0.37284224338815863,
"grad_norm": 0.29660049080848694,
"learning_rate": 1.444379616913742e-05,
"loss": 0.1314,
"step": 4120
},
{
"epoch": 0.3737472002895862,
"grad_norm": 0.3709215223789215,
"learning_rate": 1.4417518861037713e-05,
"loss": 0.1315,
"step": 4130
},
{
"epoch": 0.3746521571910138,
"grad_norm": 0.5628421902656555,
"learning_rate": 1.4391203599568257e-05,
"loss": 0.1345,
"step": 4140
},
{
"epoch": 0.37555711409244136,
"grad_norm": 0.32759326696395874,
"learning_rate": 1.4364850610818147e-05,
"loss": 0.1369,
"step": 4150
},
{
"epoch": 0.3764620709938689,
"grad_norm": 0.33175286650657654,
"learning_rate": 1.4338460121200612e-05,
"loss": 0.1303,
"step": 4160
},
{
"epoch": 0.37736702789529647,
"grad_norm": 0.5355362296104431,
"learning_rate": 1.4312032357451084e-05,
"loss": 0.1283,
"step": 4170
},
{
"epoch": 0.3782719847967241,
"grad_norm": 0.25995931029319763,
"learning_rate": 1.428556754662522e-05,
"loss": 0.1306,
"step": 4180
},
{
"epoch": 0.37917694169815164,
"grad_norm": 0.34952160716056824,
"learning_rate": 1.4259065916096983e-05,
"loss": 0.1315,
"step": 4190
},
{
"epoch": 0.3800818985995792,
"grad_norm": 0.34290722012519836,
"learning_rate": 1.4232527693556673e-05,
"loss": 0.1349,
"step": 4200
},
{
"epoch": 0.38098685550100675,
"grad_norm": 0.2907416522502899,
"learning_rate": 1.4205953107008964e-05,
"loss": 0.134,
"step": 4210
},
{
"epoch": 0.38189181240243436,
"grad_norm": 0.305785208940506,
"learning_rate": 1.4179342384770964e-05,
"loss": 0.1322,
"step": 4220
},
{
"epoch": 0.3827967693038619,
"grad_norm": 0.3259272873401642,
"learning_rate": 1.4152695755470235e-05,
"loss": 0.124,
"step": 4230
},
{
"epoch": 0.3837017262052895,
"grad_norm": 0.4469420611858368,
"learning_rate": 1.4126013448042838e-05,
"loss": 0.1247,
"step": 4240
},
{
"epoch": 0.38460668310671703,
"grad_norm": 0.3158349096775055,
"learning_rate": 1.4099295691731374e-05,
"loss": 0.1213,
"step": 4250
},
{
"epoch": 0.3855116400081446,
"grad_norm": 0.30905407667160034,
"learning_rate": 1.4072542716082986e-05,
"loss": 0.1334,
"step": 4260
},
{
"epoch": 0.3864165969095722,
"grad_norm": 0.29381975531578064,
"learning_rate": 1.4045754750947428e-05,
"loss": 0.1328,
"step": 4270
},
{
"epoch": 0.38732155381099975,
"grad_norm": 0.39547502994537354,
"learning_rate": 1.401893202647505e-05,
"loss": 0.1269,
"step": 4280
},
{
"epoch": 0.3882265107124273,
"grad_norm": 0.3500146269798279,
"learning_rate": 1.3992074773114852e-05,
"loss": 0.1322,
"step": 4290
},
{
"epoch": 0.38913146761385486,
"grad_norm": 0.2726495563983917,
"learning_rate": 1.3965183221612484e-05,
"loss": 0.1337,
"step": 4300
},
{
"epoch": 0.3900364245152825,
"grad_norm": 0.3656439185142517,
"learning_rate": 1.393825760300827e-05,
"loss": 0.1324,
"step": 4310
},
{
"epoch": 0.39094138141671003,
"grad_norm": 0.34188976883888245,
"learning_rate": 1.3911298148635224e-05,
"loss": 0.1358,
"step": 4320
},
{
"epoch": 0.3918463383181376,
"grad_norm": 0.5021139979362488,
"learning_rate": 1.3884305090117069e-05,
"loss": 0.1365,
"step": 4330
},
{
"epoch": 0.39275129521956514,
"grad_norm": 0.32963699102401733,
"learning_rate": 1.3857278659366232e-05,
"loss": 0.1337,
"step": 4340
},
{
"epoch": 0.39365625212099276,
"grad_norm": 0.31096163392066956,
"learning_rate": 1.3830219088581856e-05,
"loss": 0.1305,
"step": 4350
},
{
"epoch": 0.3945612090224203,
"grad_norm": 0.3522721230983734,
"learning_rate": 1.380312661024782e-05,
"loss": 0.1327,
"step": 4360
},
{
"epoch": 0.39546616592384787,
"grad_norm": 0.3698263168334961,
"learning_rate": 1.3776001457130725e-05,
"loss": 0.1363,
"step": 4370
},
{
"epoch": 0.3963711228252754,
"grad_norm": 0.6951903700828552,
"learning_rate": 1.37488438622779e-05,
"loss": 0.1281,
"step": 4380
},
{
"epoch": 0.39727607972670304,
"grad_norm": 0.3918182849884033,
"learning_rate": 1.3721654059015393e-05,
"loss": 0.1338,
"step": 4390
},
{
"epoch": 0.3981810366281306,
"grad_norm": 0.40132880210876465,
"learning_rate": 1.3694432280945978e-05,
"loss": 0.1258,
"step": 4400
},
{
"epoch": 0.39908599352955815,
"grad_norm": 0.36218178272247314,
"learning_rate": 1.3667178761947144e-05,
"loss": 0.1324,
"step": 4410
},
{
"epoch": 0.3999909504309857,
"grad_norm": 0.411044180393219,
"learning_rate": 1.3639893736169083e-05,
"loss": 0.1332,
"step": 4420
},
{
"epoch": 0.4008959073324133,
"grad_norm": 0.3805508017539978,
"learning_rate": 1.3612577438032673e-05,
"loss": 0.1317,
"step": 4430
},
{
"epoch": 0.40180086423384087,
"grad_norm": 0.33358198404312134,
"learning_rate": 1.3585230102227478e-05,
"loss": 0.1254,
"step": 4440
},
{
"epoch": 0.4027058211352684,
"grad_norm": 0.2710610628128052,
"learning_rate": 1.355785196370972e-05,
"loss": 0.1319,
"step": 4450
},
{
"epoch": 0.403610778036696,
"grad_norm": 0.24267303943634033,
"learning_rate": 1.3530443257700272e-05,
"loss": 0.134,
"step": 4460
},
{
"epoch": 0.4045157349381236,
"grad_norm": 0.32278409600257874,
"learning_rate": 1.3503004219682611e-05,
"loss": 0.1377,
"step": 4470
},
{
"epoch": 0.40542069183955115,
"grad_norm": 0.2883910834789276,
"learning_rate": 1.3475535085400836e-05,
"loss": 0.13,
"step": 4480
},
{
"epoch": 0.4063256487409787,
"grad_norm": 0.5005854368209839,
"learning_rate": 1.3448036090857601e-05,
"loss": 0.1299,
"step": 4490
},
{
"epoch": 0.40723060564240626,
"grad_norm": 0.4682227075099945,
"learning_rate": 1.3420507472312121e-05,
"loss": 0.1365,
"step": 4500
},
{
"epoch": 0.4081355625438339,
"grad_norm": 0.3037811815738678,
"learning_rate": 1.3392949466278116e-05,
"loss": 0.1334,
"step": 4510
},
{
"epoch": 0.40904051944526143,
"grad_norm": 0.2893620729446411,
"learning_rate": 1.3365362309521794e-05,
"loss": 0.1313,
"step": 4520
},
{
"epoch": 0.409945476346689,
"grad_norm": 0.2590632736682892,
"learning_rate": 1.3337746239059817e-05,
"loss": 0.1284,
"step": 4530
},
{
"epoch": 0.41085043324811654,
"grad_norm": 0.44863688945770264,
"learning_rate": 1.3310101492157256e-05,
"loss": 0.128,
"step": 4540
},
{
"epoch": 0.41175539014954415,
"grad_norm": 0.3310014009475708,
"learning_rate": 1.328242830632556e-05,
"loss": 0.1352,
"step": 4550
},
{
"epoch": 0.4126603470509717,
"grad_norm": 0.44096454977989197,
"learning_rate": 1.3254726919320509e-05,
"loss": 0.1302,
"step": 4560
},
{
"epoch": 0.41356530395239927,
"grad_norm": 0.30306538939476013,
"learning_rate": 1.322699756914018e-05,
"loss": 0.1364,
"step": 4570
},
{
"epoch": 0.4144702608538268,
"grad_norm": 0.2730850875377655,
"learning_rate": 1.3199240494022891e-05,
"loss": 0.1393,
"step": 4580
},
{
"epoch": 0.41537521775525443,
"grad_norm": 0.21079055964946747,
"learning_rate": 1.3171455932445172e-05,
"loss": 0.1294,
"step": 4590
},
{
"epoch": 0.416280174656682,
"grad_norm": 0.5057855248451233,
"learning_rate": 1.3143644123119692e-05,
"loss": 0.1338,
"step": 4600
},
{
"epoch": 0.41718513155810955,
"grad_norm": 0.3610301613807678,
"learning_rate": 1.3115805304993221e-05,
"loss": 0.1298,
"step": 4610
},
{
"epoch": 0.4180900884595371,
"grad_norm": 0.3835664391517639,
"learning_rate": 1.3087939717244591e-05,
"loss": 0.1303,
"step": 4620
},
{
"epoch": 0.41899504536096466,
"grad_norm": 0.3250782787799835,
"learning_rate": 1.306004759928261e-05,
"loss": 0.1296,
"step": 4630
},
{
"epoch": 0.41990000226239227,
"grad_norm": 0.3284454941749573,
"learning_rate": 1.3032129190744032e-05,
"loss": 0.1285,
"step": 4640
},
{
"epoch": 0.4208049591638198,
"grad_norm": 0.30871763825416565,
"learning_rate": 1.3004184731491478e-05,
"loss": 0.1331,
"step": 4650
},
{
"epoch": 0.4217099160652474,
"grad_norm": 0.2988496720790863,
"learning_rate": 1.29762144616114e-05,
"loss": 0.125,
"step": 4660
},
{
"epoch": 0.42261487296667494,
"grad_norm": 0.30450698733329773,
"learning_rate": 1.2948218621411996e-05,
"loss": 0.134,
"step": 4670
},
{
"epoch": 0.42351982986810255,
"grad_norm": 0.33566537499427795,
"learning_rate": 1.2920197451421145e-05,
"loss": 0.1439,
"step": 4680
},
{
"epoch": 0.4244247867695301,
"grad_norm": 0.4010887145996094,
"learning_rate": 1.2892151192384362e-05,
"loss": 0.1348,
"step": 4690
},
{
"epoch": 0.42532974367095766,
"grad_norm": 0.42090174555778503,
"learning_rate": 1.2864080085262702e-05,
"loss": 0.1315,
"step": 4700
},
{
"epoch": 0.4262347005723852,
"grad_norm": 0.3819758892059326,
"learning_rate": 1.2835984371230722e-05,
"loss": 0.1264,
"step": 4710
},
{
"epoch": 0.42713965747381283,
"grad_norm": 0.6393516659736633,
"learning_rate": 1.2807864291674374e-05,
"loss": 0.1312,
"step": 4720
},
{
"epoch": 0.4280446143752404,
"grad_norm": 0.3352563977241516,
"learning_rate": 1.2779720088188954e-05,
"loss": 0.1337,
"step": 4730
},
{
"epoch": 0.42894957127666794,
"grad_norm": 0.304123193025589,
"learning_rate": 1.2751552002577024e-05,
"loss": 0.1303,
"step": 4740
},
{
"epoch": 0.4298545281780955,
"grad_norm": 0.391735702753067,
"learning_rate": 1.2723360276846322e-05,
"loss": 0.1349,
"step": 4750
},
{
"epoch": 0.4307594850795231,
"grad_norm": 0.2713654935359955,
"learning_rate": 1.26951451532077e-05,
"loss": 0.1326,
"step": 4760
},
{
"epoch": 0.43166444198095066,
"grad_norm": 0.26157835125923157,
"learning_rate": 1.2666906874073024e-05,
"loss": 0.1296,
"step": 4770
},
{
"epoch": 0.4325693988823782,
"grad_norm": 0.36790212988853455,
"learning_rate": 1.2638645682053119e-05,
"loss": 0.1293,
"step": 4780
},
{
"epoch": 0.4334743557838058,
"grad_norm": 0.4308098554611206,
"learning_rate": 1.2610361819955647e-05,
"loss": 0.1294,
"step": 4790
},
{
"epoch": 0.4343793126852334,
"grad_norm": 0.32590603828430176,
"learning_rate": 1.2582055530783059e-05,
"loss": 0.1292,
"step": 4800
},
{
"epoch": 0.43528426958666094,
"grad_norm": 0.41218942403793335,
"learning_rate": 1.2553727057730481e-05,
"loss": 0.1292,
"step": 4810
},
{
"epoch": 0.4361892264880885,
"grad_norm": 0.26972493529319763,
"learning_rate": 1.2525376644183625e-05,
"loss": 0.1288,
"step": 4820
},
{
"epoch": 0.43709418338951606,
"grad_norm": 0.305836021900177,
"learning_rate": 1.2497004533716726e-05,
"loss": 0.133,
"step": 4830
},
{
"epoch": 0.43799914029094367,
"grad_norm": 0.49705770611763,
"learning_rate": 1.246861097009041e-05,
"loss": 0.1312,
"step": 4840
},
{
"epoch": 0.4389040971923712,
"grad_norm": 0.8208497166633606,
"learning_rate": 1.2440196197249634e-05,
"loss": 0.139,
"step": 4850
},
{
"epoch": 0.4398090540937988,
"grad_norm": 0.43357348442077637,
"learning_rate": 1.2411760459321562e-05,
"loss": 0.1347,
"step": 4860
},
{
"epoch": 0.44071401099522634,
"grad_norm": 0.29426875710487366,
"learning_rate": 1.238330400061349e-05,
"loss": 0.1277,
"step": 4870
},
{
"epoch": 0.44161896789665395,
"grad_norm": 0.31841394305229187,
"learning_rate": 1.235482706561074e-05,
"loss": 0.1275,
"step": 4880
},
{
"epoch": 0.4425239247980815,
"grad_norm": 0.3246687650680542,
"learning_rate": 1.2326329898974543e-05,
"loss": 0.1322,
"step": 4890
},
{
"epoch": 0.44342888169950906,
"grad_norm": 0.3303876519203186,
"learning_rate": 1.2297812745539968e-05,
"loss": 0.1249,
"step": 4900
},
{
"epoch": 0.4443338386009366,
"grad_norm": 0.28597357869148254,
"learning_rate": 1.2269275850313788e-05,
"loss": 0.1251,
"step": 4910
},
{
"epoch": 0.4452387955023642,
"grad_norm": 0.4606969356536865,
"learning_rate": 1.2240719458472402e-05,
"loss": 0.1287,
"step": 4920
},
{
"epoch": 0.4461437524037918,
"grad_norm": 0.48796603083610535,
"learning_rate": 1.2212143815359702e-05,
"loss": 0.1283,
"step": 4930
},
{
"epoch": 0.44704870930521934,
"grad_norm": 0.43160954117774963,
"learning_rate": 1.2183549166484988e-05,
"loss": 0.1262,
"step": 4940
},
{
"epoch": 0.4479536662066469,
"grad_norm": 0.43935272097587585,
"learning_rate": 1.2154935757520847e-05,
"loss": 0.131,
"step": 4950
},
{
"epoch": 0.44885862310807445,
"grad_norm": 0.2970845699310303,
"learning_rate": 1.212630383430104e-05,
"loss": 0.1292,
"step": 4960
},
{
"epoch": 0.44976358000950206,
"grad_norm": 0.3177463710308075,
"learning_rate": 1.2097653642818404e-05,
"loss": 0.1269,
"step": 4970
},
{
"epoch": 0.4506685369109296,
"grad_norm": 0.3390902578830719,
"learning_rate": 1.2068985429222712e-05,
"loss": 0.1258,
"step": 4980
},
{
"epoch": 0.4515734938123572,
"grad_norm": 0.32120341062545776,
"learning_rate": 1.204029943981859e-05,
"loss": 0.1346,
"step": 4990
},
{
"epoch": 0.45247845071378473,
"grad_norm": 0.3396281898021698,
"learning_rate": 1.2011595921063388e-05,
"loss": 0.1313,
"step": 5000
},
{
"epoch": 0.45338340761521234,
"grad_norm": 0.45511719584465027,
"learning_rate": 1.1982875119565045e-05,
"loss": 0.1264,
"step": 5010
},
{
"epoch": 0.4542883645166399,
"grad_norm": 0.44824886322021484,
"learning_rate": 1.1954137282079999e-05,
"loss": 0.1283,
"step": 5020
},
{
"epoch": 0.45519332141806745,
"grad_norm": 0.37619948387145996,
"learning_rate": 1.1925382655511044e-05,
"loss": 0.1266,
"step": 5030
},
{
"epoch": 0.456098278319495,
"grad_norm": 0.35430288314819336,
"learning_rate": 1.1896611486905232e-05,
"loss": 0.1324,
"step": 5040
},
{
"epoch": 0.4570032352209226,
"grad_norm": 0.28441110253334045,
"learning_rate": 1.1867824023451719e-05,
"loss": 0.1291,
"step": 5050
},
{
"epoch": 0.4579081921223502,
"grad_norm": 0.41242095828056335,
"learning_rate": 1.1839020512479676e-05,
"loss": 0.1299,
"step": 5060
},
{
"epoch": 0.45881314902377773,
"grad_norm": 0.2977141737937927,
"learning_rate": 1.1810201201456134e-05,
"loss": 0.1344,
"step": 5070
},
{
"epoch": 0.4597181059252053,
"grad_norm": 0.342632919549942,
"learning_rate": 1.1781366337983882e-05,
"loss": 0.1431,
"step": 5080
},
{
"epoch": 0.4606230628266329,
"grad_norm": 0.304565966129303,
"learning_rate": 1.175251616979932e-05,
"loss": 0.1281,
"step": 5090
},
{
"epoch": 0.46152801972806046,
"grad_norm": 0.27322742342948914,
"learning_rate": 1.1723650944770343e-05,
"loss": 0.1252,
"step": 5100
},
{
"epoch": 0.462432976629488,
"grad_norm": 0.2408372461795807,
"learning_rate": 1.1694770910894213e-05,
"loss": 0.1325,
"step": 5110
},
{
"epoch": 0.46333793353091557,
"grad_norm": 0.34792786836624146,
"learning_rate": 1.1665876316295408e-05,
"loss": 0.1248,
"step": 5120
},
{
"epoch": 0.4642428904323432,
"grad_norm": 0.2646757662296295,
"learning_rate": 1.1636967409223521e-05,
"loss": 0.1219,
"step": 5130
},
{
"epoch": 0.46514784733377074,
"grad_norm": 0.30345967411994934,
"learning_rate": 1.1608044438051107e-05,
"loss": 0.132,
"step": 5140
},
{
"epoch": 0.4660528042351983,
"grad_norm": 0.3464104235172272,
"learning_rate": 1.1579107651271544e-05,
"loss": 0.1248,
"step": 5150
},
{
"epoch": 0.46695776113662585,
"grad_norm": 0.3681158423423767,
"learning_rate": 1.1550157297496927e-05,
"loss": 0.1352,
"step": 5160
},
{
"epoch": 0.46786271803805346,
"grad_norm": 0.3495553731918335,
"learning_rate": 1.152119362545589e-05,
"loss": 0.1405,
"step": 5170
},
{
"epoch": 0.468767674939481,
"grad_norm": 0.41019749641418457,
"learning_rate": 1.1492216883991512e-05,
"loss": 0.1313,
"step": 5180
},
{
"epoch": 0.46967263184090857,
"grad_norm": 0.30911651253700256,
"learning_rate": 1.1463227322059143e-05,
"loss": 0.1239,
"step": 5190
},
{
"epoch": 0.47057758874233613,
"grad_norm": 0.32175543904304504,
"learning_rate": 1.1434225188724289e-05,
"loss": 0.1345,
"step": 5200
},
{
"epoch": 0.47148254564376374,
"grad_norm": 0.3402431011199951,
"learning_rate": 1.1405210733160463e-05,
"loss": 0.1305,
"step": 5210
},
{
"epoch": 0.4723875025451913,
"grad_norm": 0.5482098460197449,
"learning_rate": 1.1376184204647047e-05,
"loss": 0.129,
"step": 5220
},
{
"epoch": 0.47329245944661885,
"grad_norm": 0.4262870252132416,
"learning_rate": 1.134714585256714e-05,
"loss": 0.1312,
"step": 5230
},
{
"epoch": 0.4741974163480464,
"grad_norm": 0.28574520349502563,
"learning_rate": 1.1318095926405434e-05,
"loss": 0.1259,
"step": 5240
},
{
"epoch": 0.475102373249474,
"grad_norm": 0.49999454617500305,
"learning_rate": 1.1289034675746056e-05,
"loss": 0.1265,
"step": 5250
},
{
"epoch": 0.4760073301509016,
"grad_norm": 0.2796456515789032,
"learning_rate": 1.1259962350270428e-05,
"loss": 0.1305,
"step": 5260
},
{
"epoch": 0.47691228705232913,
"grad_norm": 0.3368781805038452,
"learning_rate": 1.1230879199755118e-05,
"loss": 0.1334,
"step": 5270
},
{
"epoch": 0.4778172439537567,
"grad_norm": 0.3882569968700409,
"learning_rate": 1.1201785474069706e-05,
"loss": 0.1379,
"step": 5280
},
{
"epoch": 0.4787222008551843,
"grad_norm": 0.3234885036945343,
"learning_rate": 1.1172681423174625e-05,
"loss": 0.1308,
"step": 5290
},
{
"epoch": 0.47962715775661185,
"grad_norm": 0.30723053216934204,
"learning_rate": 1.114356729711902e-05,
"loss": 0.1291,
"step": 5300
},
{
"epoch": 0.4805321146580394,
"grad_norm": 0.34784960746765137,
"learning_rate": 1.1114443346038591e-05,
"loss": 0.1264,
"step": 5310
},
{
"epoch": 0.48143707155946697,
"grad_norm": 0.36275964975357056,
"learning_rate": 1.1085309820153456e-05,
"loss": 0.1247,
"step": 5320
},
{
"epoch": 0.4823420284608945,
"grad_norm": 0.33047980070114136,
"learning_rate": 1.1056166969765991e-05,
"loss": 0.1277,
"step": 5330
},
{
"epoch": 0.48324698536232213,
"grad_norm": 0.3921981155872345,
"learning_rate": 1.1027015045258694e-05,
"loss": 0.1345,
"step": 5340
},
{
"epoch": 0.4841519422637497,
"grad_norm": 0.3603346347808838,
"learning_rate": 1.0997854297092011e-05,
"loss": 0.1232,
"step": 5350
},
{
"epoch": 0.48505689916517725,
"grad_norm": 0.5657104253768921,
"learning_rate": 1.0968684975802206e-05,
"loss": 0.1316,
"step": 5360
},
{
"epoch": 0.4859618560666048,
"grad_norm": 0.28306034207344055,
"learning_rate": 1.0939507331999195e-05,
"loss": 0.1312,
"step": 5370
},
{
"epoch": 0.4868668129680324,
"grad_norm": 0.33064553141593933,
"learning_rate": 1.0910321616364397e-05,
"loss": 0.1347,
"step": 5380
},
{
"epoch": 0.48777176986945997,
"grad_norm": 0.39418330788612366,
"learning_rate": 1.0881128079648586e-05,
"loss": 0.1263,
"step": 5390
},
{
"epoch": 0.4886767267708875,
"grad_norm": 0.3185255527496338,
"learning_rate": 1.0851926972669722e-05,
"loss": 0.1399,
"step": 5400
},
{
"epoch": 0.4895816836723151,
"grad_norm": 0.4073029160499573,
"learning_rate": 1.0822718546310816e-05,
"loss": 0.1303,
"step": 5410
},
{
"epoch": 0.4904866405737427,
"grad_norm": 0.49370768666267395,
"learning_rate": 1.0793503051517758e-05,
"loss": 0.1249,
"step": 5420
},
{
"epoch": 0.49139159747517025,
"grad_norm": 0.2705213725566864,
"learning_rate": 1.0764280739297163e-05,
"loss": 0.1341,
"step": 5430
},
{
"epoch": 0.4922965543765978,
"grad_norm": 0.2689199447631836,
"learning_rate": 1.0735051860714231e-05,
"loss": 0.1306,
"step": 5440
},
{
"epoch": 0.49320151127802536,
"grad_norm": 0.27751344442367554,
"learning_rate": 1.0705816666890561e-05,
"loss": 0.1285,
"step": 5450
},
{
"epoch": 0.494106468179453,
"grad_norm": 0.3316192030906677,
"learning_rate": 1.0676575409002024e-05,
"loss": 0.1362,
"step": 5460
},
{
"epoch": 0.49501142508088053,
"grad_norm": 0.3556595742702484,
"learning_rate": 1.064732833827658e-05,
"loss": 0.1271,
"step": 5470
},
{
"epoch": 0.4959163819823081,
"grad_norm": 0.2501612603664398,
"learning_rate": 1.0618075705992138e-05,
"loss": 0.1307,
"step": 5480
},
{
"epoch": 0.49682133888373564,
"grad_norm": 0.32025519013404846,
"learning_rate": 1.0588817763474388e-05,
"loss": 0.1271,
"step": 5490
},
{
"epoch": 0.49772629578516325,
"grad_norm": 0.3633834421634674,
"learning_rate": 1.0559554762094637e-05,
"loss": 0.1221,
"step": 5500
},
{
"epoch": 0.4986312526865908,
"grad_norm": 0.3367408514022827,
"learning_rate": 1.0530286953267665e-05,
"loss": 0.1288,
"step": 5510
},
{
"epoch": 0.49953620958801837,
"grad_norm": 0.36954525113105774,
"learning_rate": 1.050101458844955e-05,
"loss": 0.1247,
"step": 5520
},
{
"epoch": 0.5004411664894459,
"grad_norm": 0.46406790614128113,
"learning_rate": 1.047173791913551e-05,
"loss": 0.1312,
"step": 5530
},
{
"epoch": 0.5013461233908735,
"grad_norm": 0.32501330971717834,
"learning_rate": 1.044245719685775e-05,
"loss": 0.1323,
"step": 5540
},
{
"epoch": 0.502251080292301,
"grad_norm": 0.31969472765922546,
"learning_rate": 1.0413172673183298e-05,
"loss": 0.1343,
"step": 5550
},
{
"epoch": 0.5031560371937287,
"grad_norm": 0.2659285068511963,
"learning_rate": 1.0383884599711838e-05,
"loss": 0.1244,
"step": 5560
},
{
"epoch": 0.5040609940951563,
"grad_norm": 0.4111124873161316,
"learning_rate": 1.035459322807355e-05,
"loss": 0.1289,
"step": 5570
},
{
"epoch": 0.5049659509965838,
"grad_norm": 0.36066919565200806,
"learning_rate": 1.0325298809926962e-05,
"loss": 0.1297,
"step": 5580
},
{
"epoch": 0.5058709078980114,
"grad_norm": 0.30882957577705383,
"learning_rate": 1.029600159695676e-05,
"loss": 0.1256,
"step": 5590
},
{
"epoch": 0.5067758647994389,
"grad_norm": 0.30170318484306335,
"learning_rate": 1.0266701840871657e-05,
"loss": 0.1266,
"step": 5600
},
{
"epoch": 0.5076808217008665,
"grad_norm": 0.39970946311950684,
"learning_rate": 1.0237399793402203e-05,
"loss": 0.1334,
"step": 5610
},
{
"epoch": 0.508585778602294,
"grad_norm": 0.24716977775096893,
"learning_rate": 1.0208095706298643e-05,
"loss": 0.1277,
"step": 5620
},
{
"epoch": 0.5094907355037216,
"grad_norm": 0.3430007994174957,
"learning_rate": 1.017878983132874e-05,
"loss": 0.1315,
"step": 5630
},
{
"epoch": 0.5103956924051493,
"grad_norm": 0.37827351689338684,
"learning_rate": 1.0149482420275623e-05,
"loss": 0.1288,
"step": 5640
},
{
"epoch": 0.5113006493065768,
"grad_norm": 0.2566516101360321,
"learning_rate": 1.0120173724935614e-05,
"loss": 0.1331,
"step": 5650
},
{
"epoch": 0.5122056062080044,
"grad_norm": 0.327240914106369,
"learning_rate": 1.0090863997116066e-05,
"loss": 0.1267,
"step": 5660
},
{
"epoch": 0.5131105631094319,
"grad_norm": 0.49574097990989685,
"learning_rate": 1.0061553488633217e-05,
"loss": 0.129,
"step": 5670
},
{
"epoch": 0.5140155200108595,
"grad_norm": 0.33093881607055664,
"learning_rate": 1.0032242451309996e-05,
"loss": 0.1289,
"step": 5680
},
{
"epoch": 0.514920476912287,
"grad_norm": 0.30682793259620667,
"learning_rate": 1.0002931136973881e-05,
"loss": 0.1289,
"step": 5690
},
{
"epoch": 0.5158254338137146,
"grad_norm": 0.32405000925064087,
"learning_rate": 9.973619797454734e-06,
"loss": 0.1298,
"step": 5700
},
{
"epoch": 0.5167303907151422,
"grad_norm": 0.34693455696105957,
"learning_rate": 9.944308684582627e-06,
"loss": 0.129,
"step": 5710
},
{
"epoch": 0.5176353476165697,
"grad_norm": 0.44524097442626953,
"learning_rate": 9.914998050185693e-06,
"loss": 0.1252,
"step": 5720
},
{
"epoch": 0.5185403045179974,
"grad_norm": 0.346175879240036,
"learning_rate": 9.885688146087945e-06,
"loss": 0.1324,
"step": 5730
},
{
"epoch": 0.5194452614194249,
"grad_norm": 0.32724258303642273,
"learning_rate": 9.856379224107124e-06,
"loss": 0.1222,
"step": 5740
},
{
"epoch": 0.5203502183208525,
"grad_norm": 0.3304066061973572,
"learning_rate": 9.827071536052536e-06,
"loss": 0.1297,
"step": 5750
},
{
"epoch": 0.52125517522228,
"grad_norm": 0.3203341066837311,
"learning_rate": 9.797765333722888e-06,
"loss": 0.1315,
"step": 5760
},
{
"epoch": 0.5221601321237076,
"grad_norm": 0.33740121126174927,
"learning_rate": 9.768460868904112e-06,
"loss": 0.1281,
"step": 5770
},
{
"epoch": 0.5230650890251352,
"grad_norm": 0.3358958065509796,
"learning_rate": 9.739158393367229e-06,
"loss": 0.1298,
"step": 5780
},
{
"epoch": 0.5239700459265627,
"grad_norm": 0.3097865581512451,
"learning_rate": 9.709858158866147e-06,
"loss": 0.1219,
"step": 5790
},
{
"epoch": 0.5248750028279903,
"grad_norm": 0.3808821141719818,
"learning_rate": 9.680560417135538e-06,
"loss": 0.1244,
"step": 5800
},
{
"epoch": 0.5257799597294179,
"grad_norm": 0.3200414478778839,
"learning_rate": 9.651265419888651e-06,
"loss": 0.1312,
"step": 5810
},
{
"epoch": 0.5266849166308455,
"grad_norm": 0.38478338718414307,
"learning_rate": 9.621973418815154e-06,
"loss": 0.1324,
"step": 5820
},
{
"epoch": 0.527589873532273,
"grad_norm": 0.39911043643951416,
"learning_rate": 9.592684665578978e-06,
"loss": 0.1246,
"step": 5830
},
{
"epoch": 0.5284948304337006,
"grad_norm": 0.328489750623703,
"learning_rate": 9.563399411816141e-06,
"loss": 0.1276,
"step": 5840
},
{
"epoch": 0.5293997873351282,
"grad_norm": 0.4059896171092987,
"learning_rate": 9.534117909132606e-06,
"loss": 0.136,
"step": 5850
},
{
"epoch": 0.5303047442365557,
"grad_norm": 0.3130931854248047,
"learning_rate": 9.5048404091021e-06,
"loss": 0.1286,
"step": 5860
},
{
"epoch": 0.5312097011379833,
"grad_norm": 0.3940030634403229,
"learning_rate": 9.475567163263968e-06,
"loss": 0.1315,
"step": 5870
},
{
"epoch": 0.5321146580394108,
"grad_norm": 0.40726789832115173,
"learning_rate": 9.446298423120995e-06,
"loss": 0.1289,
"step": 5880
},
{
"epoch": 0.5330196149408385,
"grad_norm": 0.5030715465545654,
"learning_rate": 9.417034440137264e-06,
"loss": 0.1305,
"step": 5890
},
{
"epoch": 0.533924571842266,
"grad_norm": 0.3696930706501007,
"learning_rate": 9.387775465735987e-06,
"loss": 0.1337,
"step": 5900
},
{
"epoch": 0.5348295287436936,
"grad_norm": 0.2996613681316376,
"learning_rate": 9.358521751297336e-06,
"loss": 0.1296,
"step": 5910
},
{
"epoch": 0.5357344856451212,
"grad_norm": 0.30213144421577454,
"learning_rate": 9.329273548156305e-06,
"loss": 0.1284,
"step": 5920
},
{
"epoch": 0.5366394425465487,
"grad_norm": 0.35034891963005066,
"learning_rate": 9.300031107600519e-06,
"loss": 0.1258,
"step": 5930
},
{
"epoch": 0.5375443994479763,
"grad_norm": 0.4542511999607086,
"learning_rate": 9.270794680868108e-06,
"loss": 0.1384,
"step": 5940
},
{
"epoch": 0.5384493563494038,
"grad_norm": 0.4177015423774719,
"learning_rate": 9.241564519145529e-06,
"loss": 0.1308,
"step": 5950
},
{
"epoch": 0.5393543132508314,
"grad_norm": 0.3148910701274872,
"learning_rate": 9.212340873565417e-06,
"loss": 0.1256,
"step": 5960
},
{
"epoch": 0.540259270152259,
"grad_norm": 0.3220025300979614,
"learning_rate": 9.183123995204419e-06,
"loss": 0.1356,
"step": 5970
},
{
"epoch": 0.5411642270536866,
"grad_norm": 0.37302547693252563,
"learning_rate": 9.153914135081037e-06,
"loss": 0.1319,
"step": 5980
},
{
"epoch": 0.5420691839551142,
"grad_norm": 0.2666574716567993,
"learning_rate": 9.12471154415348e-06,
"loss": 0.1271,
"step": 5990
},
{
"epoch": 0.5429741408565417,
"grad_norm": 0.333808034658432,
"learning_rate": 9.095516473317506e-06,
"loss": 0.1249,
"step": 6000
},
{
"epoch": 0.5438790977579693,
"grad_norm": 0.29716721177101135,
"learning_rate": 9.066329173404267e-06,
"loss": 0.1331,
"step": 6010
},
{
"epoch": 0.5447840546593968,
"grad_norm": 0.3095798194408417,
"learning_rate": 9.037149895178132e-06,
"loss": 0.1219,
"step": 6020
},
{
"epoch": 0.5456890115608244,
"grad_norm": 0.3963325619697571,
"learning_rate": 9.007978889334573e-06,
"loss": 0.1281,
"step": 6030
},
{
"epoch": 0.546593968462252,
"grad_norm": 0.2934766113758087,
"learning_rate": 8.978816406497977e-06,
"loss": 0.1254,
"step": 6040
},
{
"epoch": 0.5474989253636795,
"grad_norm": 0.3293272852897644,
"learning_rate": 8.949662697219507e-06,
"loss": 0.132,
"step": 6050
},
{
"epoch": 0.5484038822651072,
"grad_norm": 0.47501856088638306,
"learning_rate": 8.920518011974955e-06,
"loss": 0.134,
"step": 6060
},
{
"epoch": 0.5493088391665347,
"grad_norm": 0.38773998618125916,
"learning_rate": 8.891382601162571e-06,
"loss": 0.1347,
"step": 6070
},
{
"epoch": 0.5502137960679623,
"grad_norm": 0.30083420872688293,
"learning_rate": 8.862256715100926e-06,
"loss": 0.1232,
"step": 6080
},
{
"epoch": 0.5511187529693898,
"grad_norm": 0.32377147674560547,
"learning_rate": 8.833140604026763e-06,
"loss": 0.1244,
"step": 6090
},
{
"epoch": 0.5520237098708174,
"grad_norm": 1.7971711158752441,
"learning_rate": 8.804034518092846e-06,
"loss": 0.1269,
"step": 6100
},
{
"epoch": 0.552928666772245,
"grad_norm": 0.28756287693977356,
"learning_rate": 8.7749387073658e-06,
"loss": 0.1285,
"step": 6110
},
{
"epoch": 0.5538336236736725,
"grad_norm": 0.3303268551826477,
"learning_rate": 8.745853421823965e-06,
"loss": 0.131,
"step": 6120
},
{
"epoch": 0.5547385805751001,
"grad_norm": 0.30345699191093445,
"learning_rate": 8.716778911355266e-06,
"loss": 0.1344,
"step": 6130
},
{
"epoch": 0.5556435374765277,
"grad_norm": 0.33741581439971924,
"learning_rate": 8.687715425755047e-06,
"loss": 0.125,
"step": 6140
},
{
"epoch": 0.5565484943779553,
"grad_norm": 0.36916887760162354,
"learning_rate": 8.65866321472393e-06,
"loss": 0.1391,
"step": 6150
},
{
"epoch": 0.5574534512793828,
"grad_norm": 0.2726483941078186,
"learning_rate": 8.62962252786568e-06,
"loss": 0.1274,
"step": 6160
},
{
"epoch": 0.5583584081808104,
"grad_norm": 0.6223147511482239,
"learning_rate": 8.600593614685035e-06,
"loss": 0.1222,
"step": 6170
},
{
"epoch": 0.559263365082238,
"grad_norm": 0.285051554441452,
"learning_rate": 8.571576724585589e-06,
"loss": 0.1316,
"step": 6180
},
{
"epoch": 0.5601683219836655,
"grad_norm": 0.2996697723865509,
"learning_rate": 8.542572106867643e-06,
"loss": 0.1221,
"step": 6190
},
{
"epoch": 0.5610732788850931,
"grad_norm": 0.4449019730091095,
"learning_rate": 8.513580010726052e-06,
"loss": 0.136,
"step": 6200
},
{
"epoch": 0.5619782357865206,
"grad_norm": 0.3610653579235077,
"learning_rate": 8.484600685248089e-06,
"loss": 0.1349,
"step": 6210
},
{
"epoch": 0.5628831926879483,
"grad_norm": 0.23445254564285278,
"learning_rate": 8.455634379411314e-06,
"loss": 0.1304,
"step": 6220
},
{
"epoch": 0.5637881495893758,
"grad_norm": 0.4748559594154358,
"learning_rate": 8.426681342081428e-06,
"loss": 0.1281,
"step": 6230
},
{
"epoch": 0.5646931064908034,
"grad_norm": 0.28520044684410095,
"learning_rate": 8.397741822010128e-06,
"loss": 0.1318,
"step": 6240
},
{
"epoch": 0.565598063392231,
"grad_norm": 0.31549420952796936,
"learning_rate": 8.368816067832986e-06,
"loss": 0.1259,
"step": 6250
},
{
"epoch": 0.5665030202936585,
"grad_norm": 0.3754958510398865,
"learning_rate": 8.339904328067289e-06,
"loss": 0.1317,
"step": 6260
},
{
"epoch": 0.5674079771950861,
"grad_norm": 0.3160054087638855,
"learning_rate": 8.311006851109939e-06,
"loss": 0.1311,
"step": 6270
},
{
"epoch": 0.5683129340965136,
"grad_norm": 0.3136991560459137,
"learning_rate": 8.282123885235276e-06,
"loss": 0.1349,
"step": 6280
},
{
"epoch": 0.5692178909979412,
"grad_norm": 0.38889384269714355,
"learning_rate": 8.253255678592985e-06,
"loss": 0.1349,
"step": 6290
},
{
"epoch": 0.5701228478993688,
"grad_norm": 0.3792050778865814,
"learning_rate": 8.224402479205941e-06,
"loss": 0.1237,
"step": 6300
},
{
"epoch": 0.5710278048007964,
"grad_norm": 0.35518279671669006,
"learning_rate": 8.195564534968074e-06,
"loss": 0.1232,
"step": 6310
},
{
"epoch": 0.571932761702224,
"grad_norm": 0.2767059803009033,
"learning_rate": 8.166742093642263e-06,
"loss": 0.1265,
"step": 6320
},
{
"epoch": 0.5728377186036515,
"grad_norm": 0.37054571509361267,
"learning_rate": 8.137935402858182e-06,
"loss": 0.1288,
"step": 6330
},
{
"epoch": 0.5737426755050791,
"grad_norm": 0.3450683653354645,
"learning_rate": 8.10914471011019e-06,
"loss": 0.1339,
"step": 6340
},
{
"epoch": 0.5746476324065066,
"grad_norm": 0.29977867007255554,
"learning_rate": 8.080370262755191e-06,
"loss": 0.126,
"step": 6350
},
{
"epoch": 0.5755525893079342,
"grad_norm": 0.34314143657684326,
"learning_rate": 8.051612308010526e-06,
"loss": 0.1283,
"step": 6360
},
{
"epoch": 0.5764575462093617,
"grad_norm": 0.37054121494293213,
"learning_rate": 8.022871092951827e-06,
"loss": 0.1292,
"step": 6370
},
{
"epoch": 0.5773625031107893,
"grad_norm": 0.37676891684532166,
"learning_rate": 7.994146864510912e-06,
"loss": 0.1285,
"step": 6380
},
{
"epoch": 0.578267460012217,
"grad_norm": 0.26649826765060425,
"learning_rate": 7.965439869473664e-06,
"loss": 0.1261,
"step": 6390
},
{
"epoch": 0.5791724169136445,
"grad_norm": 0.38938188552856445,
"learning_rate": 7.936750354477891e-06,
"loss": 0.1272,
"step": 6400
},
{
"epoch": 0.5800773738150721,
"grad_norm": 0.32541313767433167,
"learning_rate": 7.908078566011227e-06,
"loss": 0.1233,
"step": 6410
},
{
"epoch": 0.5809823307164996,
"grad_norm": 0.36433953046798706,
"learning_rate": 7.879424750409007e-06,
"loss": 0.1314,
"step": 6420
},
{
"epoch": 0.5818872876179272,
"grad_norm": 0.3940136432647705,
"learning_rate": 7.850789153852157e-06,
"loss": 0.1373,
"step": 6430
},
{
"epoch": 0.5827922445193547,
"grad_norm": 0.3312411904335022,
"learning_rate": 7.822172022365059e-06,
"loss": 0.1258,
"step": 6440
},
{
"epoch": 0.5836972014207823,
"grad_norm": 0.5461381077766418,
"learning_rate": 7.793573601813467e-06,
"loss": 0.1275,
"step": 6450
},
{
"epoch": 0.5846021583222099,
"grad_norm": 0.41021519899368286,
"learning_rate": 7.764994137902366e-06,
"loss": 0.1305,
"step": 6460
},
{
"epoch": 0.5855071152236375,
"grad_norm": 0.5024427175521851,
"learning_rate": 7.736433876173879e-06,
"loss": 0.1264,
"step": 6470
},
{
"epoch": 0.5864120721250651,
"grad_norm": 0.3114100992679596,
"learning_rate": 7.70789306200516e-06,
"loss": 0.1328,
"step": 6480
},
{
"epoch": 0.5873170290264926,
"grad_norm": 0.3421667814254761,
"learning_rate": 7.679371940606265e-06,
"loss": 0.1336,
"step": 6490
},
{
"epoch": 0.5882219859279202,
"grad_norm": 0.4376727044582367,
"learning_rate": 7.650870757018061e-06,
"loss": 0.1277,
"step": 6500
},
{
"epoch": 0.5891269428293477,
"grad_norm": 0.26968899369239807,
"learning_rate": 7.622389756110126e-06,
"loss": 0.1281,
"step": 6510
},
{
"epoch": 0.5900318997307753,
"grad_norm": 0.3418639004230499,
"learning_rate": 7.593929182578634e-06,
"loss": 0.1321,
"step": 6520
},
{
"epoch": 0.5909368566322029,
"grad_norm": 0.3123999536037445,
"learning_rate": 7.565489280944256e-06,
"loss": 0.1257,
"step": 6530
},
{
"epoch": 0.5918418135336304,
"grad_norm": 0.39082077145576477,
"learning_rate": 7.537070295550051e-06,
"loss": 0.1303,
"step": 6540
},
{
"epoch": 0.5927467704350581,
"grad_norm": 0.32185035943984985,
"learning_rate": 7.508672470559385e-06,
"loss": 0.1278,
"step": 6550
},
{
"epoch": 0.5936517273364856,
"grad_norm": 0.6370688080787659,
"learning_rate": 7.480296049953823e-06,
"loss": 0.132,
"step": 6560
},
{
"epoch": 0.5945566842379132,
"grad_norm": 0.44037026166915894,
"learning_rate": 7.451941277531025e-06,
"loss": 0.1264,
"step": 6570
},
{
"epoch": 0.5954616411393407,
"grad_norm": 0.301471471786499,
"learning_rate": 7.423608396902673e-06,
"loss": 0.1261,
"step": 6580
},
{
"epoch": 0.5963665980407683,
"grad_norm": 0.3698810338973999,
"learning_rate": 7.395297651492346e-06,
"loss": 0.1262,
"step": 6590
},
{
"epoch": 0.5972715549421959,
"grad_norm": 0.27682480216026306,
"learning_rate": 7.36700928453346e-06,
"loss": 0.1301,
"step": 6600
},
{
"epoch": 0.5981765118436234,
"grad_norm": 0.3122202455997467,
"learning_rate": 7.338743539067163e-06,
"loss": 0.1325,
"step": 6610
},
{
"epoch": 0.599081468745051,
"grad_norm": 0.3500480651855469,
"learning_rate": 7.310500657940253e-06,
"loss": 0.1332,
"step": 6620
},
{
"epoch": 0.5999864256464786,
"grad_norm": 0.3293434977531433,
"learning_rate": 7.282280883803073e-06,
"loss": 0.1337,
"step": 6630
},
{
"epoch": 0.6008913825479062,
"grad_norm": 0.2992168366909027,
"learning_rate": 7.254084459107453e-06,
"loss": 0.1336,
"step": 6640
},
{
"epoch": 0.6017963394493338,
"grad_norm": 0.3215314745903015,
"learning_rate": 7.225911626104621e-06,
"loss": 0.1283,
"step": 6650
},
{
"epoch": 0.6027012963507613,
"grad_norm": 0.3701172173023224,
"learning_rate": 7.1977626268430965e-06,
"loss": 0.1219,
"step": 6660
},
{
"epoch": 0.6036062532521889,
"grad_norm": 0.3572734594345093,
"learning_rate": 7.1696377031666495e-06,
"loss": 0.1204,
"step": 6670
},
{
"epoch": 0.6045112101536164,
"grad_norm": 0.37793678045272827,
"learning_rate": 7.1415370967121896e-06,
"loss": 0.1253,
"step": 6680
},
{
"epoch": 0.605416167055044,
"grad_norm": 0.2588195204734802,
"learning_rate": 7.113461048907711e-06,
"loss": 0.1247,
"step": 6690
},
{
"epoch": 0.6063211239564715,
"grad_norm": 0.3132305145263672,
"learning_rate": 7.085409800970203e-06,
"loss": 0.1307,
"step": 6700
},
{
"epoch": 0.6072260808578992,
"grad_norm": 0.36036190390586853,
"learning_rate": 7.0573835939035974e-06,
"loss": 0.1322,
"step": 6710
},
{
"epoch": 0.6081310377593268,
"grad_norm": 0.39142096042633057,
"learning_rate": 7.029382668496679e-06,
"loss": 0.1218,
"step": 6720
},
{
"epoch": 0.6090359946607543,
"grad_norm": 0.33356091380119324,
"learning_rate": 7.001407265321019e-06,
"loss": 0.1268,
"step": 6730
},
{
"epoch": 0.6099409515621819,
"grad_norm": 0.3158833086490631,
"learning_rate": 6.973457624728922e-06,
"loss": 0.1248,
"step": 6740
},
{
"epoch": 0.6108459084636094,
"grad_norm": 0.35817354917526245,
"learning_rate": 6.945533986851345e-06,
"loss": 0.1304,
"step": 6750
},
{
"epoch": 0.611750865365037,
"grad_norm": 0.28623586893081665,
"learning_rate": 6.917636591595849e-06,
"loss": 0.1243,
"step": 6760
},
{
"epoch": 0.6126558222664645,
"grad_norm": 0.39012181758880615,
"learning_rate": 6.8897656786445166e-06,
"loss": 0.1213,
"step": 6770
},
{
"epoch": 0.6135607791678921,
"grad_norm": 0.44978418946266174,
"learning_rate": 6.861921487451922e-06,
"loss": 0.1234,
"step": 6780
},
{
"epoch": 0.6144657360693196,
"grad_norm": 0.34891191124916077,
"learning_rate": 6.834104257243043e-06,
"loss": 0.1275,
"step": 6790
},
{
"epoch": 0.6153706929707473,
"grad_norm": 0.29933297634124756,
"learning_rate": 6.806314227011235e-06,
"loss": 0.1307,
"step": 6800
},
{
"epoch": 0.6162756498721749,
"grad_norm": 0.3456212282180786,
"learning_rate": 6.778551635516157e-06,
"loss": 0.1273,
"step": 6810
},
{
"epoch": 0.6171806067736024,
"grad_norm": 0.32694172859191895,
"learning_rate": 6.750816721281719e-06,
"loss": 0.1278,
"step": 6820
},
{
"epoch": 0.61808556367503,
"grad_norm": 0.3350003659725189,
"learning_rate": 6.7231097225940475e-06,
"loss": 0.1318,
"step": 6830
},
{
"epoch": 0.6189905205764575,
"grad_norm": 0.3554823100566864,
"learning_rate": 6.695430877499434e-06,
"loss": 0.1282,
"step": 6840
},
{
"epoch": 0.6198954774778851,
"grad_norm": 0.4120415449142456,
"learning_rate": 6.6677804238022806e-06,
"loss": 0.1311,
"step": 6850
},
{
"epoch": 0.6208004343793126,
"grad_norm": 0.3582000732421875,
"learning_rate": 6.640158599063069e-06,
"loss": 0.1223,
"step": 6860
},
{
"epoch": 0.6217053912807402,
"grad_norm": 0.3458595275878906,
"learning_rate": 6.612565640596307e-06,
"loss": 0.1294,
"step": 6870
},
{
"epoch": 0.6226103481821679,
"grad_norm": 0.2830416262149811,
"learning_rate": 6.585001785468497e-06,
"loss": 0.1273,
"step": 6880
},
{
"epoch": 0.6235153050835954,
"grad_norm": 0.32797959446907043,
"learning_rate": 6.5574672704961025e-06,
"loss": 0.1284,
"step": 6890
},
{
"epoch": 0.624420261985023,
"grad_norm": 0.3323483467102051,
"learning_rate": 6.529962332243509e-06,
"loss": 0.1258,
"step": 6900
},
{
"epoch": 0.6253252188864505,
"grad_norm": 0.2794325649738312,
"learning_rate": 6.5024872070209936e-06,
"loss": 0.1323,
"step": 6910
},
{
"epoch": 0.6262301757878781,
"grad_norm": 0.2866572439670563,
"learning_rate": 6.4750421308826795e-06,
"loss": 0.1269,
"step": 6920
},
{
"epoch": 0.6271351326893057,
"grad_norm": 0.3717053532600403,
"learning_rate": 6.447627339624538e-06,
"loss": 0.1257,
"step": 6930
},
{
"epoch": 0.6280400895907332,
"grad_norm": 0.3445277512073517,
"learning_rate": 6.4202430687823416e-06,
"loss": 0.133,
"step": 6940
},
{
"epoch": 0.6289450464921608,
"grad_norm": 0.4389108419418335,
"learning_rate": 6.39288955362964e-06,
"loss": 0.1271,
"step": 6950
},
{
"epoch": 0.6298500033935884,
"grad_norm": 0.38752782344818115,
"learning_rate": 6.365567029175747e-06,
"loss": 0.1306,
"step": 6960
},
{
"epoch": 0.630754960295016,
"grad_norm": 0.4342532157897949,
"learning_rate": 6.338275730163715e-06,
"loss": 0.1286,
"step": 6970
},
{
"epoch": 0.6316599171964435,
"grad_norm": 0.3530200719833374,
"learning_rate": 6.311015891068328e-06,
"loss": 0.1239,
"step": 6980
},
{
"epoch": 0.6325648740978711,
"grad_norm": 0.33301499485969543,
"learning_rate": 6.283787746094077e-06,
"loss": 0.1311,
"step": 6990
},
{
"epoch": 0.6334698309992987,
"grad_norm": 0.3174525201320648,
"learning_rate": 6.256591529173148e-06,
"loss": 0.1318,
"step": 7000
},
{
"epoch": 0.6343747879007262,
"grad_norm": 0.27925947308540344,
"learning_rate": 6.229427473963416e-06,
"loss": 0.1291,
"step": 7010
},
{
"epoch": 0.6352797448021538,
"grad_norm": 0.36644554138183594,
"learning_rate": 6.20229581384644e-06,
"loss": 0.119,
"step": 7020
},
{
"epoch": 0.6361847017035813,
"grad_norm": 0.4176923930644989,
"learning_rate": 6.1751967819254545e-06,
"loss": 0.126,
"step": 7030
},
{
"epoch": 0.637089658605009,
"grad_norm": 0.30893582105636597,
"learning_rate": 6.148130611023361e-06,
"loss": 0.1283,
"step": 7040
},
{
"epoch": 0.6379946155064365,
"grad_norm": 0.37425440549850464,
"learning_rate": 6.121097533680745e-06,
"loss": 0.1265,
"step": 7050
},
{
"epoch": 0.6388995724078641,
"grad_norm": 0.2834739089012146,
"learning_rate": 6.094097782153853e-06,
"loss": 0.1311,
"step": 7060
},
{
"epoch": 0.6398045293092917,
"grad_norm": 0.3139822781085968,
"learning_rate": 6.0671315884126225e-06,
"loss": 0.1231,
"step": 7070
},
{
"epoch": 0.6407094862107192,
"grad_norm": 0.2844420075416565,
"learning_rate": 6.040199184138668e-06,
"loss": 0.129,
"step": 7080
},
{
"epoch": 0.6416144431121468,
"grad_norm": 0.35503455996513367,
"learning_rate": 6.013300800723312e-06,
"loss": 0.1311,
"step": 7090
},
{
"epoch": 0.6425194000135743,
"grad_norm": 0.35351240634918213,
"learning_rate": 5.986436669265568e-06,
"loss": 0.1331,
"step": 7100
},
{
"epoch": 0.6434243569150019,
"grad_norm": 0.3031436800956726,
"learning_rate": 5.959607020570184e-06,
"loss": 0.1305,
"step": 7110
},
{
"epoch": 0.6443293138164294,
"grad_norm": 0.23189416527748108,
"learning_rate": 5.932812085145647e-06,
"loss": 0.1235,
"step": 7120
},
{
"epoch": 0.6452342707178571,
"grad_norm": 0.2822563946247101,
"learning_rate": 5.906052093202199e-06,
"loss": 0.1269,
"step": 7130
},
{
"epoch": 0.6461392276192847,
"grad_norm": 0.28259310126304626,
"learning_rate": 5.879327274649868e-06,
"loss": 0.1273,
"step": 7140
},
{
"epoch": 0.6470441845207122,
"grad_norm": 0.33720916509628296,
"learning_rate": 5.852637859096475e-06,
"loss": 0.1345,
"step": 7150
},
{
"epoch": 0.6479491414221398,
"grad_norm": 0.3332005441188812,
"learning_rate": 5.825984075845691e-06,
"loss": 0.1248,
"step": 7160
},
{
"epoch": 0.6488540983235673,
"grad_norm": 0.3517606258392334,
"learning_rate": 5.799366153895037e-06,
"loss": 0.1288,
"step": 7170
},
{
"epoch": 0.6497590552249949,
"grad_norm": 0.265109121799469,
"learning_rate": 5.772784321933939e-06,
"loss": 0.1329,
"step": 7180
},
{
"epoch": 0.6506640121264224,
"grad_norm": 0.3004017472267151,
"learning_rate": 5.746238808341751e-06,
"loss": 0.1252,
"step": 7190
},
{
"epoch": 0.65156896902785,
"grad_norm": 0.34260621666908264,
"learning_rate": 5.719729841185786e-06,
"loss": 0.1267,
"step": 7200
},
{
"epoch": 0.6524739259292777,
"grad_norm": 0.24408110976219177,
"learning_rate": 5.693257648219379e-06,
"loss": 0.1296,
"step": 7210
},
{
"epoch": 0.6533788828307052,
"grad_norm": 0.2951405346393585,
"learning_rate": 5.666822456879918e-06,
"loss": 0.1231,
"step": 7220
},
{
"epoch": 0.6542838397321328,
"grad_norm": 0.32168251276016235,
"learning_rate": 5.640424494286878e-06,
"loss": 0.1298,
"step": 7230
},
{
"epoch": 0.6551887966335603,
"grad_norm": 0.2844177186489105,
"learning_rate": 5.614063987239885e-06,
"loss": 0.1289,
"step": 7240
},
{
"epoch": 0.6560937535349879,
"grad_norm": 0.5491533875465393,
"learning_rate": 5.587741162216768e-06,
"loss": 0.1313,
"step": 7250
},
{
"epoch": 0.6569987104364154,
"grad_norm": 0.298951119184494,
"learning_rate": 5.561456245371608e-06,
"loss": 0.125,
"step": 7260
},
{
"epoch": 0.657903667337843,
"grad_norm": 0.3288751542568207,
"learning_rate": 5.535209462532792e-06,
"loss": 0.1296,
"step": 7270
},
{
"epoch": 0.6588086242392706,
"grad_norm": 0.43045946955680847,
"learning_rate": 5.509001039201085e-06,
"loss": 0.1263,
"step": 7280
},
{
"epoch": 0.6597135811406982,
"grad_norm": 0.2680876851081848,
"learning_rate": 5.482831200547667e-06,
"loss": 0.1324,
"step": 7290
},
{
"epoch": 0.6606185380421258,
"grad_norm": 0.3521096110343933,
"learning_rate": 5.456700171412231e-06,
"loss": 0.1204,
"step": 7300
},
{
"epoch": 0.6615234949435533,
"grad_norm": 0.2706452012062073,
"learning_rate": 5.430608176301036e-06,
"loss": 0.1269,
"step": 7310
},
{
"epoch": 0.6624284518449809,
"grad_norm": 0.3557042181491852,
"learning_rate": 5.4045554393849635e-06,
"loss": 0.132,
"step": 7320
},
{
"epoch": 0.6633334087464084,
"grad_norm": 0.3670320510864258,
"learning_rate": 5.378542184497623e-06,
"loss": 0.1257,
"step": 7330
},
{
"epoch": 0.664238365647836,
"grad_norm": 0.31355276703834534,
"learning_rate": 5.3525686351333976e-06,
"loss": 0.1275,
"step": 7340
},
{
"epoch": 0.6651433225492636,
"grad_norm": 0.30157995223999023,
"learning_rate": 5.326635014445547e-06,
"loss": 0.1291,
"step": 7350
},
{
"epoch": 0.6660482794506911,
"grad_norm": 0.2899110019207001,
"learning_rate": 5.300741545244279e-06,
"loss": 0.1311,
"step": 7360
},
{
"epoch": 0.6669532363521188,
"grad_norm": 0.34780648350715637,
"learning_rate": 5.274888449994843e-06,
"loss": 0.1294,
"step": 7370
},
{
"epoch": 0.6678581932535463,
"grad_norm": 0.2876949608325958,
"learning_rate": 5.2490759508155975e-06,
"loss": 0.1303,
"step": 7380
},
{
"epoch": 0.6687631501549739,
"grad_norm": 0.3087066113948822,
"learning_rate": 5.223304269476137e-06,
"loss": 0.1255,
"step": 7390
},
{
"epoch": 0.6696681070564015,
"grad_norm": 0.38678351044654846,
"learning_rate": 5.19757362739535e-06,
"loss": 0.1288,
"step": 7400
},
{
"epoch": 0.670573063957829,
"grad_norm": 0.2943480610847473,
"learning_rate": 5.171884245639545e-06,
"loss": 0.1284,
"step": 7410
},
{
"epoch": 0.6714780208592566,
"grad_norm": 0.28372156620025635,
"learning_rate": 5.146236344920542e-06,
"loss": 0.1292,
"step": 7420
},
{
"epoch": 0.6723829777606841,
"grad_norm": 0.2806905508041382,
"learning_rate": 5.12063014559376e-06,
"loss": 0.1272,
"step": 7430
},
{
"epoch": 0.6732879346621117,
"grad_norm": 0.3286825120449066,
"learning_rate": 5.095065867656351e-06,
"loss": 0.1205,
"step": 7440
},
{
"epoch": 0.6741928915635392,
"grad_norm": 0.33701106905937195,
"learning_rate": 5.0695437307452945e-06,
"loss": 0.1312,
"step": 7450
},
{
"epoch": 0.6750978484649669,
"grad_norm": 0.3478999137878418,
"learning_rate": 5.044063954135508e-06,
"loss": 0.1284,
"step": 7460
},
{
"epoch": 0.6760028053663945,
"grad_norm": 0.28950104117393494,
"learning_rate": 5.018626756737979e-06,
"loss": 0.1267,
"step": 7470
},
{
"epoch": 0.676907762267822,
"grad_norm": 0.3087421655654907,
"learning_rate": 4.9932323570978605e-06,
"loss": 0.1254,
"step": 7480
},
{
"epoch": 0.6778127191692496,
"grad_norm": 0.34977859258651733,
"learning_rate": 4.967880973392607e-06,
"loss": 0.1293,
"step": 7490
},
{
"epoch": 0.6787176760706771,
"grad_norm": 0.3207535147666931,
"learning_rate": 4.942572823430107e-06,
"loss": 0.1268,
"step": 7500
},
{
"epoch": 0.6796226329721047,
"grad_norm": 0.34587785601615906,
"learning_rate": 4.917308124646802e-06,
"loss": 0.1272,
"step": 7510
},
{
"epoch": 0.6805275898735322,
"grad_norm": 0.338029146194458,
"learning_rate": 4.892087094105818e-06,
"loss": 0.1208,
"step": 7520
},
{
"epoch": 0.6814325467749598,
"grad_norm": 0.3155430555343628,
"learning_rate": 4.866909948495101e-06,
"loss": 0.1234,
"step": 7530
},
{
"epoch": 0.6823375036763875,
"grad_norm": 0.27973029017448425,
"learning_rate": 4.841776904125559e-06,
"loss": 0.1301,
"step": 7540
},
{
"epoch": 0.683242460577815,
"grad_norm": 0.3526993989944458,
"learning_rate": 4.816688176929207e-06,
"loss": 0.1258,
"step": 7550
},
{
"epoch": 0.6841474174792426,
"grad_norm": 0.3077426850795746,
"learning_rate": 4.791643982457293e-06,
"loss": 0.1235,
"step": 7560
},
{
"epoch": 0.6850523743806701,
"grad_norm": 0.31144073605537415,
"learning_rate": 4.766644535878476e-06,
"loss": 0.1226,
"step": 7570
},
{
"epoch": 0.6859573312820977,
"grad_norm": 0.32703807950019836,
"learning_rate": 4.741690051976946e-06,
"loss": 0.1265,
"step": 7580
},
{
"epoch": 0.6868622881835252,
"grad_norm": 0.4660681486129761,
"learning_rate": 4.716780745150602e-06,
"loss": 0.1323,
"step": 7590
},
{
"epoch": 0.6877672450849528,
"grad_norm": 0.27197226881980896,
"learning_rate": 4.6919168294092e-06,
"loss": 0.1319,
"step": 7600
},
{
"epoch": 0.6886722019863804,
"grad_norm": 0.3342832624912262,
"learning_rate": 4.6670985183725205e-06,
"loss": 0.134,
"step": 7610
},
{
"epoch": 0.689577158887808,
"grad_norm": 0.33844679594039917,
"learning_rate": 4.642326025268514e-06,
"loss": 0.1282,
"step": 7620
},
{
"epoch": 0.6904821157892356,
"grad_norm": 0.39204105734825134,
"learning_rate": 4.6175995629314994e-06,
"loss": 0.1236,
"step": 7630
},
{
"epoch": 0.6913870726906631,
"grad_norm": 0.38780298829078674,
"learning_rate": 4.592919343800315e-06,
"loss": 0.1316,
"step": 7640
},
{
"epoch": 0.6922920295920907,
"grad_norm": 0.3531728982925415,
"learning_rate": 4.568285579916491e-06,
"loss": 0.1339,
"step": 7650
},
{
"epoch": 0.6931969864935182,
"grad_norm": 0.264414519071579,
"learning_rate": 4.543698482922445e-06,
"loss": 0.1309,
"step": 7660
},
{
"epoch": 0.6941019433949458,
"grad_norm": 0.3809826076030731,
"learning_rate": 4.519158264059642e-06,
"loss": 0.1302,
"step": 7670
},
{
"epoch": 0.6950069002963734,
"grad_norm": 0.3677527904510498,
"learning_rate": 4.4946651341668006e-06,
"loss": 0.128,
"step": 7680
},
{
"epoch": 0.6959118571978009,
"grad_norm": 0.3577388823032379,
"learning_rate": 4.470219303678069e-06,
"loss": 0.1242,
"step": 7690
},
{
"epoch": 0.6968168140992286,
"grad_norm": 0.3218074142932892,
"learning_rate": 4.44582098262122e-06,
"loss": 0.128,
"step": 7700
},
{
"epoch": 0.6977217710006561,
"grad_norm": 0.3113616406917572,
"learning_rate": 4.421470380615841e-06,
"loss": 0.1246,
"step": 7710
},
{
"epoch": 0.6986267279020837,
"grad_norm": 0.4153074622154236,
"learning_rate": 4.397167706871546e-06,
"loss": 0.1236,
"step": 7720
},
{
"epoch": 0.6995316848035112,
"grad_norm": 0.2737235426902771,
"learning_rate": 4.37291317018617e-06,
"loss": 0.1334,
"step": 7730
},
{
"epoch": 0.7004366417049388,
"grad_norm": 0.3413766026496887,
"learning_rate": 4.348706978943965e-06,
"loss": 0.1248,
"step": 7740
},
{
"epoch": 0.7013415986063664,
"grad_norm": 0.44644635915756226,
"learning_rate": 4.324549341113839e-06,
"loss": 0.131,
"step": 7750
},
{
"epoch": 0.7022465555077939,
"grad_norm": 0.3571605384349823,
"learning_rate": 4.300440464247528e-06,
"loss": 0.1219,
"step": 7760
},
{
"epoch": 0.7031515124092215,
"grad_norm": 0.218129500746727,
"learning_rate": 4.276380555477855e-06,
"loss": 0.1224,
"step": 7770
},
{
"epoch": 0.7040564693106491,
"grad_norm": 0.2756895124912262,
"learning_rate": 4.25236982151692e-06,
"loss": 0.1165,
"step": 7780
},
{
"epoch": 0.7049614262120767,
"grad_norm": 0.3429490029811859,
"learning_rate": 4.22840846865434e-06,
"loss": 0.133,
"step": 7790
},
{
"epoch": 0.7058663831135042,
"grad_norm": 0.3609547019004822,
"learning_rate": 4.204496702755471e-06,
"loss": 0.1229,
"step": 7800
},
{
"epoch": 0.7067713400149318,
"grad_norm": 0.31833615899086,
"learning_rate": 4.180634729259635e-06,
"loss": 0.131,
"step": 7810
},
{
"epoch": 0.7076762969163594,
"grad_norm": 0.3234885632991791,
"learning_rate": 4.15682275317836e-06,
"loss": 0.1242,
"step": 7820
},
{
"epoch": 0.7085812538177869,
"grad_norm": 0.331586092710495,
"learning_rate": 4.133060979093623e-06,
"loss": 0.1238,
"step": 7830
},
{
"epoch": 0.7094862107192145,
"grad_norm": 0.29033374786376953,
"learning_rate": 4.109349611156088e-06,
"loss": 0.1231,
"step": 7840
},
{
"epoch": 0.710391167620642,
"grad_norm": 0.4018980860710144,
"learning_rate": 4.085688853083346e-06,
"loss": 0.1276,
"step": 7850
},
{
"epoch": 0.7112961245220696,
"grad_norm": 0.3441830575466156,
"learning_rate": 4.062078908158174e-06,
"loss": 0.1334,
"step": 7860
},
{
"epoch": 0.7122010814234973,
"grad_norm": 0.3075098693370819,
"learning_rate": 4.038519979226785e-06,
"loss": 0.1223,
"step": 7870
},
{
"epoch": 0.7131060383249248,
"grad_norm": 0.4702622890472412,
"learning_rate": 4.015012268697085e-06,
"loss": 0.1274,
"step": 7880
},
{
"epoch": 0.7140109952263524,
"grad_norm": 0.40426555275917053,
"learning_rate": 3.991555978536937e-06,
"loss": 0.1286,
"step": 7890
},
{
"epoch": 0.7149159521277799,
"grad_norm": 0.3282513916492462,
"learning_rate": 3.968151310272417e-06,
"loss": 0.1286,
"step": 7900
},
{
"epoch": 0.7158209090292075,
"grad_norm": 0.31669655442237854,
"learning_rate": 3.944798464986086e-06,
"loss": 0.1228,
"step": 7910
},
{
"epoch": 0.716725865930635,
"grad_norm": 0.3846443295478821,
"learning_rate": 3.9214976433152755e-06,
"loss": 0.1289,
"step": 7920
},
{
"epoch": 0.7176308228320626,
"grad_norm": 0.3068075478076935,
"learning_rate": 3.8982490454503455e-06,
"loss": 0.1258,
"step": 7930
},
{
"epoch": 0.7185357797334901,
"grad_norm": 0.259236216545105,
"learning_rate": 3.875052871132979e-06,
"loss": 0.126,
"step": 7940
},
{
"epoch": 0.7194407366349178,
"grad_norm": 0.32442575693130493,
"learning_rate": 3.851909319654448e-06,
"loss": 0.1282,
"step": 7950
},
{
"epoch": 0.7203456935363454,
"grad_norm": 0.3598691523075104,
"learning_rate": 3.82881858985392e-06,
"loss": 0.127,
"step": 7960
},
{
"epoch": 0.7212506504377729,
"grad_norm": 0.23929624259471893,
"learning_rate": 3.8057808801167463e-06,
"loss": 0.1243,
"step": 7970
},
{
"epoch": 0.7221556073392005,
"grad_norm": 0.31415387988090515,
"learning_rate": 3.782796388372739e-06,
"loss": 0.1309,
"step": 7980
},
{
"epoch": 0.723060564240628,
"grad_norm": 0.3203081786632538,
"learning_rate": 3.7598653120945015e-06,
"loss": 0.1268,
"step": 7990
},
{
"epoch": 0.7239655211420556,
"grad_norm": 0.38148126006126404,
"learning_rate": 3.736987848295699e-06,
"loss": 0.1196,
"step": 8000
},
{
"epoch": 0.7248704780434831,
"grad_norm": 0.2480911761522293,
"learning_rate": 3.7141641935293926e-06,
"loss": 0.1296,
"step": 8010
},
{
"epoch": 0.7257754349449107,
"grad_norm": 0.2723288834095001,
"learning_rate": 3.6913945438863397e-06,
"loss": 0.127,
"step": 8020
},
{
"epoch": 0.7266803918463384,
"grad_norm": 0.3012789785861969,
"learning_rate": 3.6686790949933082e-06,
"loss": 0.1254,
"step": 8030
},
{
"epoch": 0.7275853487477659,
"grad_norm": 0.2832350432872772,
"learning_rate": 3.64601804201139e-06,
"loss": 0.1158,
"step": 8040
},
{
"epoch": 0.7284903056491935,
"grad_norm": 0.30464476346969604,
"learning_rate": 3.6234115796343405e-06,
"loss": 0.1223,
"step": 8050
},
{
"epoch": 0.729395262550621,
"grad_norm": 0.3467896282672882,
"learning_rate": 3.6008599020868985e-06,
"loss": 0.1326,
"step": 8060
},
{
"epoch": 0.7303002194520486,
"grad_norm": 0.35166192054748535,
"learning_rate": 3.5783632031231018e-06,
"loss": 0.1257,
"step": 8070
},
{
"epoch": 0.7312051763534761,
"grad_norm": 0.3181626796722412,
"learning_rate": 3.555921676024653e-06,
"loss": 0.1269,
"step": 8080
},
{
"epoch": 0.7321101332549037,
"grad_norm": 0.30875396728515625,
"learning_rate": 3.53353551359923e-06,
"loss": 0.1297,
"step": 8090
},
{
"epoch": 0.7330150901563313,
"grad_norm": 0.3796384632587433,
"learning_rate": 3.511204908178848e-06,
"loss": 0.1243,
"step": 8100
},
{
"epoch": 0.7339200470577589,
"grad_norm": 0.3839415907859802,
"learning_rate": 3.488930051618201e-06,
"loss": 0.1265,
"step": 8110
},
{
"epoch": 0.7348250039591865,
"grad_norm": 0.34740591049194336,
"learning_rate": 3.4667111352930163e-06,
"loss": 0.1339,
"step": 8120
},
{
"epoch": 0.735729960860614,
"grad_norm": 0.2768769860267639,
"learning_rate": 3.4445483500983944e-06,
"loss": 0.1238,
"step": 8130
},
{
"epoch": 0.7366349177620416,
"grad_norm": 0.3522382378578186,
"learning_rate": 3.4224418864471976e-06,
"loss": 0.1242,
"step": 8140
},
{
"epoch": 0.7375398746634692,
"grad_norm": 0.31364932656288147,
"learning_rate": 3.400391934268391e-06,
"loss": 0.1261,
"step": 8150
},
{
"epoch": 0.7384448315648967,
"grad_norm": 0.3115822374820709,
"learning_rate": 3.378398683005416e-06,
"loss": 0.1248,
"step": 8160
},
{
"epoch": 0.7393497884663243,
"grad_norm": 0.370149165391922,
"learning_rate": 3.356462321614573e-06,
"loss": 0.1294,
"step": 8170
},
{
"epoch": 0.7402547453677518,
"grad_norm": 0.28242307901382446,
"learning_rate": 3.334583038563376e-06,
"loss": 0.1298,
"step": 8180
},
{
"epoch": 0.7411597022691794,
"grad_norm": 0.323049396276474,
"learning_rate": 3.3127610218289617e-06,
"loss": 0.1228,
"step": 8190
},
{
"epoch": 0.742064659170607,
"grad_norm": 0.393040269613266,
"learning_rate": 3.2909964588964514e-06,
"loss": 0.1276,
"step": 8200
},
{
"epoch": 0.7429696160720346,
"grad_norm": 0.36402514576911926,
"learning_rate": 3.269289536757352e-06,
"loss": 0.1296,
"step": 8210
},
{
"epoch": 0.7438745729734622,
"grad_norm": 0.3288993239402771,
"learning_rate": 3.2476404419079487e-06,
"loss": 0.1245,
"step": 8220
},
{
"epoch": 0.7447795298748897,
"grad_norm": 0.40095171332359314,
"learning_rate": 3.226049360347694e-06,
"loss": 0.1275,
"step": 8230
},
{
"epoch": 0.7456844867763173,
"grad_norm": 0.36854180693626404,
"learning_rate": 3.2045164775776137e-06,
"loss": 0.1254,
"step": 8240
},
{
"epoch": 0.7465894436777448,
"grad_norm": 0.35555073618888855,
"learning_rate": 3.1830419785987243e-06,
"loss": 0.1237,
"step": 8250
},
{
"epoch": 0.7474944005791724,
"grad_norm": 0.4191891849040985,
"learning_rate": 3.161626047910431e-06,
"loss": 0.13,
"step": 8260
},
{
"epoch": 0.7483993574805999,
"grad_norm": 0.3702433705329895,
"learning_rate": 3.140268869508949e-06,
"loss": 0.1317,
"step": 8270
},
{
"epoch": 0.7493043143820276,
"grad_norm": 0.27752381563186646,
"learning_rate": 3.1189706268857077e-06,
"loss": 0.1226,
"step": 8280
},
{
"epoch": 0.7502092712834552,
"grad_norm": 0.3350948095321655,
"learning_rate": 3.0977315030258002e-06,
"loss": 0.1309,
"step": 8290
},
{
"epoch": 0.7511142281848827,
"grad_norm": 0.4031756520271301,
"learning_rate": 3.0765516804063932e-06,
"loss": 0.127,
"step": 8300
},
{
"epoch": 0.7520191850863103,
"grad_norm": 0.3240339159965515,
"learning_rate": 3.055431340995163e-06,
"loss": 0.123,
"step": 8310
},
{
"epoch": 0.7529241419877378,
"grad_norm": 0.3321908414363861,
"learning_rate": 3.0343706662487306e-06,
"loss": 0.1258,
"step": 8320
},
{
"epoch": 0.7538290988891654,
"grad_norm": 0.3860868513584137,
"learning_rate": 3.013369837111101e-06,
"loss": 0.1297,
"step": 8330
},
{
"epoch": 0.7547340557905929,
"grad_norm": 0.3689778745174408,
"learning_rate": 2.992429034012121e-06,
"loss": 0.1253,
"step": 8340
},
{
"epoch": 0.7556390126920205,
"grad_norm": 0.27259641885757446,
"learning_rate": 2.9715484368659152e-06,
"loss": 0.1258,
"step": 8350
},
{
"epoch": 0.7565439695934482,
"grad_norm": 0.39256593585014343,
"learning_rate": 2.9507282250693514e-06,
"loss": 0.119,
"step": 8360
},
{
"epoch": 0.7574489264948757,
"grad_norm": 0.33603495359420776,
"learning_rate": 2.9299685775004793e-06,
"loss": 0.1337,
"step": 8370
},
{
"epoch": 0.7583538833963033,
"grad_norm": 0.4902133345603943,
"learning_rate": 2.9092696725170212e-06,
"loss": 0.1352,
"step": 8380
},
{
"epoch": 0.7592588402977308,
"grad_norm": 0.27200329303741455,
"learning_rate": 2.8886316879548205e-06,
"loss": 0.1231,
"step": 8390
},
{
"epoch": 0.7601637971991584,
"grad_norm": 0.27076977491378784,
"learning_rate": 2.868054801126321e-06,
"loss": 0.1209,
"step": 8400
},
{
"epoch": 0.7610687541005859,
"grad_norm": 0.32341551780700684,
"learning_rate": 2.8475391888190395e-06,
"loss": 0.1346,
"step": 8410
},
{
"epoch": 0.7619737110020135,
"grad_norm": 0.29108741879463196,
"learning_rate": 2.8270850272940466e-06,
"loss": 0.1251,
"step": 8420
},
{
"epoch": 0.762878667903441,
"grad_norm": 0.321847528219223,
"learning_rate": 2.806692492284461e-06,
"loss": 0.1248,
"step": 8430
},
{
"epoch": 0.7637836248048687,
"grad_norm": 0.3972260355949402,
"learning_rate": 2.786361758993932e-06,
"loss": 0.1266,
"step": 8440
},
{
"epoch": 0.7646885817062963,
"grad_norm": 0.3229351341724396,
"learning_rate": 2.766093002095137e-06,
"loss": 0.1253,
"step": 8450
},
{
"epoch": 0.7655935386077238,
"grad_norm": 0.32253509759902954,
"learning_rate": 2.745886395728271e-06,
"loss": 0.124,
"step": 8460
},
{
"epoch": 0.7664984955091514,
"grad_norm": 0.34673410654067993,
"learning_rate": 2.725742113499571e-06,
"loss": 0.124,
"step": 8470
},
{
"epoch": 0.767403452410579,
"grad_norm": 0.3298405110836029,
"learning_rate": 2.705660328479809e-06,
"loss": 0.1259,
"step": 8480
},
{
"epoch": 0.7683084093120065,
"grad_norm": 0.454266756772995,
"learning_rate": 2.6856412132027997e-06,
"loss": 0.1211,
"step": 8490
},
{
"epoch": 0.7692133662134341,
"grad_norm": 0.39579710364341736,
"learning_rate": 2.6656849396639415e-06,
"loss": 0.1302,
"step": 8500
},
{
"epoch": 0.7701183231148616,
"grad_norm": 0.2675047814846039,
"learning_rate": 2.6457916793187124e-06,
"loss": 0.123,
"step": 8510
},
{
"epoch": 0.7710232800162892,
"grad_norm": 0.44622719287872314,
"learning_rate": 2.6259616030812128e-06,
"loss": 0.1238,
"step": 8520
},
{
"epoch": 0.7719282369177168,
"grad_norm": 0.40703779458999634,
"learning_rate": 2.6061948813226968e-06,
"loss": 0.1222,
"step": 8530
},
{
"epoch": 0.7728331938191444,
"grad_norm": 0.32759425044059753,
"learning_rate": 2.5864916838701016e-06,
"loss": 0.1257,
"step": 8540
},
{
"epoch": 0.773738150720572,
"grad_norm": 0.2865431308746338,
"learning_rate": 2.5668521800045944e-06,
"loss": 0.1291,
"step": 8550
},
{
"epoch": 0.7746431076219995,
"grad_norm": 0.30442455410957336,
"learning_rate": 2.5472765384601074e-06,
"loss": 0.1214,
"step": 8560
},
{
"epoch": 0.7755480645234271,
"grad_norm": 0.31685060262680054,
"learning_rate": 2.5277649274219064e-06,
"loss": 0.131,
"step": 8570
},
{
"epoch": 0.7764530214248546,
"grad_norm": 0.274147093296051,
"learning_rate": 2.508317514525125e-06,
"loss": 0.1195,
"step": 8580
},
{
"epoch": 0.7773579783262822,
"grad_norm": 0.28254234790802,
"learning_rate": 2.4889344668533453e-06,
"loss": 0.1313,
"step": 8590
},
{
"epoch": 0.7782629352277097,
"grad_norm": 0.3491531014442444,
"learning_rate": 2.469615950937142e-06,
"loss": 0.1279,
"step": 8600
},
{
"epoch": 0.7791678921291374,
"grad_norm": 0.3661726713180542,
"learning_rate": 2.4503621327526694e-06,
"loss": 0.1252,
"step": 8610
},
{
"epoch": 0.780072849030565,
"grad_norm": 0.24949342012405396,
"learning_rate": 2.431173177720223e-06,
"loss": 0.1209,
"step": 8620
},
{
"epoch": 0.7809778059319925,
"grad_norm": 0.24942710995674133,
"learning_rate": 2.4120492507028236e-06,
"loss": 0.1294,
"step": 8630
},
{
"epoch": 0.7818827628334201,
"grad_norm": 0.3253498077392578,
"learning_rate": 2.392990516004804e-06,
"loss": 0.1313,
"step": 8640
},
{
"epoch": 0.7827877197348476,
"grad_norm": 0.31229835748672485,
"learning_rate": 2.3739971373703852e-06,
"loss": 0.1244,
"step": 8650
},
{
"epoch": 0.7836926766362752,
"grad_norm": 0.3631618320941925,
"learning_rate": 2.355069277982286e-06,
"loss": 0.1266,
"step": 8660
},
{
"epoch": 0.7845976335377027,
"grad_norm": 0.2935367822647095,
"learning_rate": 2.3362071004603036e-06,
"loss": 0.1222,
"step": 8670
},
{
"epoch": 0.7855025904391303,
"grad_norm": 0.34212765097618103,
"learning_rate": 2.3174107668599366e-06,
"loss": 0.126,
"step": 8680
},
{
"epoch": 0.786407547340558,
"grad_norm": 0.28019851446151733,
"learning_rate": 2.298680438670976e-06,
"loss": 0.119,
"step": 8690
},
{
"epoch": 0.7873125042419855,
"grad_norm": 0.35425955057144165,
"learning_rate": 2.2800162768161204e-06,
"loss": 0.1237,
"step": 8700
},
{
"epoch": 0.7882174611434131,
"grad_norm": 0.3190707564353943,
"learning_rate": 2.2614184416496022e-06,
"loss": 0.1206,
"step": 8710
},
{
"epoch": 0.7891224180448406,
"grad_norm": 0.40515249967575073,
"learning_rate": 2.2428870929558012e-06,
"loss": 0.1251,
"step": 8720
},
{
"epoch": 0.7900273749462682,
"grad_norm": 0.3554609417915344,
"learning_rate": 2.224422389947879e-06,
"loss": 0.1268,
"step": 8730
},
{
"epoch": 0.7909323318476957,
"grad_norm": 0.37230101227760315,
"learning_rate": 2.2060244912663996e-06,
"loss": 0.134,
"step": 8740
},
{
"epoch": 0.7918372887491233,
"grad_norm": 0.2492271512746811,
"learning_rate": 2.1876935549779766e-06,
"loss": 0.1247,
"step": 8750
},
{
"epoch": 0.7927422456505508,
"grad_norm": 0.3300861418247223,
"learning_rate": 2.169429738573915e-06,
"loss": 0.1243,
"step": 8760
},
{
"epoch": 0.7936472025519785,
"grad_norm": 0.4448375105857849,
"learning_rate": 2.151233198968854e-06,
"loss": 0.121,
"step": 8770
},
{
"epoch": 0.7945521594534061,
"grad_norm": 0.2607729434967041,
"learning_rate": 2.1331040924994216e-06,
"loss": 0.1194,
"step": 8780
},
{
"epoch": 0.7954571163548336,
"grad_norm": 0.3317795991897583,
"learning_rate": 2.1150425749228853e-06,
"loss": 0.122,
"step": 8790
},
{
"epoch": 0.7963620732562612,
"grad_norm": 0.3975413739681244,
"learning_rate": 2.097048801415823e-06,
"loss": 0.1261,
"step": 8800
},
{
"epoch": 0.7972670301576887,
"grad_norm": 0.3129339814186096,
"learning_rate": 2.079122926572784e-06,
"loss": 0.1264,
"step": 8810
},
{
"epoch": 0.7981719870591163,
"grad_norm": 0.32011422514915466,
"learning_rate": 2.0612651044049683e-06,
"loss": 0.1287,
"step": 8820
},
{
"epoch": 0.7990769439605439,
"grad_norm": 0.34052857756614685,
"learning_rate": 2.043475488338885e-06,
"loss": 0.1217,
"step": 8830
},
{
"epoch": 0.7999819008619714,
"grad_norm": 0.2858836054801941,
"learning_rate": 2.0257542312150534e-06,
"loss": 0.1242,
"step": 8840
},
{
"epoch": 0.800886857763399,
"grad_norm": 0.26198819279670715,
"learning_rate": 2.0081014852866843e-06,
"loss": 0.1288,
"step": 8850
},
{
"epoch": 0.8017918146648266,
"grad_norm": 0.3347429633140564,
"learning_rate": 1.9905174022183702e-06,
"loss": 0.1251,
"step": 8860
},
{
"epoch": 0.8026967715662542,
"grad_norm": 0.3790920078754425,
"learning_rate": 1.9730021330847838e-06,
"loss": 0.1263,
"step": 8870
},
{
"epoch": 0.8036017284676817,
"grad_norm": 0.27628254890441895,
"learning_rate": 1.955555828369371e-06,
"loss": 0.1272,
"step": 8880
},
{
"epoch": 0.8045066853691093,
"grad_norm": 0.2933729887008667,
"learning_rate": 1.938178637963074e-06,
"loss": 0.1332,
"step": 8890
},
{
"epoch": 0.8054116422705369,
"grad_norm": 0.2660931348800659,
"learning_rate": 1.9208707111630376e-06,
"loss": 0.1259,
"step": 8900
},
{
"epoch": 0.8063165991719644,
"grad_norm": 0.3294607698917389,
"learning_rate": 1.903632196671311e-06,
"loss": 0.1293,
"step": 8910
},
{
"epoch": 0.807221556073392,
"grad_norm": 0.3121941089630127,
"learning_rate": 1.8864632425936015e-06,
"loss": 0.1289,
"step": 8920
},
{
"epoch": 0.8081265129748195,
"grad_norm": 0.2634756863117218,
"learning_rate": 1.8693639964379661e-06,
"loss": 0.1291,
"step": 8930
},
{
"epoch": 0.8090314698762472,
"grad_norm": 0.3612058162689209,
"learning_rate": 1.852334605113576e-06,
"loss": 0.1284,
"step": 8940
},
{
"epoch": 0.8099364267776747,
"grad_norm": 0.27511492371559143,
"learning_rate": 1.8353752149294335e-06,
"loss": 0.1255,
"step": 8950
},
{
"epoch": 0.8108413836791023,
"grad_norm": 0.5154643058776855,
"learning_rate": 1.8184859715931247e-06,
"loss": 0.1293,
"step": 8960
},
{
"epoch": 0.8117463405805299,
"grad_norm": 0.2842332124710083,
"learning_rate": 1.8016670202095677e-06,
"loss": 0.1233,
"step": 8970
},
{
"epoch": 0.8126512974819574,
"grad_norm": 0.26935648918151855,
"learning_rate": 1.7849185052797525e-06,
"loss": 0.127,
"step": 8980
},
{
"epoch": 0.813556254383385,
"grad_norm": 0.4419654607772827,
"learning_rate": 1.7682405706995243e-06,
"loss": 0.1255,
"step": 8990
},
{
"epoch": 0.8144612112848125,
"grad_norm": 0.27756479382514954,
"learning_rate": 1.7516333597583214e-06,
"loss": 0.1195,
"step": 9000
}
],
"logging_steps": 10,
"max_steps": 11050,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.722768489962275e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}