toy_model__5 / trainer_state.json
VladShash's picture
Upload 13 files
7ad608d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 32425,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0038550501156515036,
"grad_norm": 16.608898162841797,
"learning_rate": 9.999986482322772e-06,
"loss": 1.4836,
"step": 25
},
{
"epoch": 0.007710100231303007,
"grad_norm": 12.18391227722168,
"learning_rate": 9.999943652957133e-06,
"loss": 1.2813,
"step": 50
},
{
"epoch": 0.01156515034695451,
"grad_norm": 11.903943061828613,
"learning_rate": 9.999871488686168e-06,
"loss": 1.2168,
"step": 75
},
{
"epoch": 0.015420200462606014,
"grad_norm": 18.851295471191406,
"learning_rate": 9.999769989933268e-06,
"loss": 1.2391,
"step": 100
},
{
"epoch": 0.01927525057825752,
"grad_norm": 14.748234748840332,
"learning_rate": 9.999639157293928e-06,
"loss": 0.9374,
"step": 125
},
{
"epoch": 0.02313030069390902,
"grad_norm": 12.391430854797363,
"learning_rate": 9.999478991535755e-06,
"loss": 1.1254,
"step": 150
},
{
"epoch": 0.026985350809560524,
"grad_norm": 13.535922050476074,
"learning_rate": 9.99928949359844e-06,
"loss": 1.0,
"step": 175
},
{
"epoch": 0.03084040092521203,
"grad_norm": 13.087531089782715,
"learning_rate": 9.999070664593785e-06,
"loss": 0.901,
"step": 200
},
{
"epoch": 0.03469545104086353,
"grad_norm": 16.20705223083496,
"learning_rate": 9.998822505805667e-06,
"loss": 1.0212,
"step": 225
},
{
"epoch": 0.03855050115651504,
"grad_norm": 15.247299194335938,
"learning_rate": 9.998545018690045e-06,
"loss": 0.8759,
"step": 250
},
{
"epoch": 0.04240555127216654,
"grad_norm": 11.694854736328125,
"learning_rate": 9.998238204874946e-06,
"loss": 0.9611,
"step": 275
},
{
"epoch": 0.04626060138781804,
"grad_norm": 11.789994239807129,
"learning_rate": 9.997902066160468e-06,
"loss": 0.9461,
"step": 300
},
{
"epoch": 0.05011565150346955,
"grad_norm": 8.035028457641602,
"learning_rate": 9.997536604518746e-06,
"loss": 0.9306,
"step": 325
},
{
"epoch": 0.05397070161912105,
"grad_norm": 13.199110984802246,
"learning_rate": 9.997141822093968e-06,
"loss": 0.9511,
"step": 350
},
{
"epoch": 0.05782575173477255,
"grad_norm": 9.918705940246582,
"learning_rate": 9.996717721202334e-06,
"loss": 0.9201,
"step": 375
},
{
"epoch": 0.06168080185042406,
"grad_norm": 13.475760459899902,
"learning_rate": 9.996264304332066e-06,
"loss": 0.9155,
"step": 400
},
{
"epoch": 0.06553585196607556,
"grad_norm": 13.787261009216309,
"learning_rate": 9.995781574143384e-06,
"loss": 0.9128,
"step": 425
},
{
"epoch": 0.06939090208172706,
"grad_norm": 14.03122615814209,
"learning_rate": 9.995269533468486e-06,
"loss": 0.9624,
"step": 450
},
{
"epoch": 0.07324595219737856,
"grad_norm": 13.960323333740234,
"learning_rate": 9.994728185311542e-06,
"loss": 0.7851,
"step": 475
},
{
"epoch": 0.07710100231303008,
"grad_norm": 13.849388122558594,
"learning_rate": 9.994157532848665e-06,
"loss": 0.8017,
"step": 500
},
{
"epoch": 0.08095605242868158,
"grad_norm": 8.507377624511719,
"learning_rate": 9.993557579427901e-06,
"loss": 0.9295,
"step": 525
},
{
"epoch": 0.08481110254433308,
"grad_norm": 11.375807762145996,
"learning_rate": 9.992928328569205e-06,
"loss": 0.909,
"step": 550
},
{
"epoch": 0.08866615265998458,
"grad_norm": 10.749899864196777,
"learning_rate": 9.992269783964422e-06,
"loss": 0.8058,
"step": 575
},
{
"epoch": 0.09252120277563608,
"grad_norm": 13.238548278808594,
"learning_rate": 9.99158194947726e-06,
"loss": 0.8941,
"step": 600
},
{
"epoch": 0.09637625289128758,
"grad_norm": 9.177589416503906,
"learning_rate": 9.990864829143284e-06,
"loss": 0.8742,
"step": 625
},
{
"epoch": 0.1002313030069391,
"grad_norm": 11.465133666992188,
"learning_rate": 9.990118427169864e-06,
"loss": 0.8651,
"step": 650
},
{
"epoch": 0.1040863531225906,
"grad_norm": 13.029391288757324,
"learning_rate": 9.989342747936184e-06,
"loss": 0.9098,
"step": 675
},
{
"epoch": 0.1079414032382421,
"grad_norm": 16.456314086914062,
"learning_rate": 9.988537795993185e-06,
"loss": 0.9324,
"step": 700
},
{
"epoch": 0.1117964533538936,
"grad_norm": 9.585066795349121,
"learning_rate": 9.98770357606356e-06,
"loss": 0.8852,
"step": 725
},
{
"epoch": 0.1156515034695451,
"grad_norm": 11.982830047607422,
"learning_rate": 9.986840093041716e-06,
"loss": 0.9691,
"step": 750
},
{
"epoch": 0.1195065535851966,
"grad_norm": 12.948980331420898,
"learning_rate": 9.98594735199375e-06,
"loss": 0.9242,
"step": 775
},
{
"epoch": 0.12336160370084812,
"grad_norm": 13.323537826538086,
"learning_rate": 9.985025358157416e-06,
"loss": 0.8365,
"step": 800
},
{
"epoch": 0.1272166538164996,
"grad_norm": 11.167157173156738,
"learning_rate": 9.984074116942092e-06,
"loss": 0.895,
"step": 825
},
{
"epoch": 0.13107170393215112,
"grad_norm": 8.00083065032959,
"learning_rate": 9.98309363392876e-06,
"loss": 0.826,
"step": 850
},
{
"epoch": 0.13492675404780263,
"grad_norm": 11.226200103759766,
"learning_rate": 9.982083914869955e-06,
"loss": 0.9629,
"step": 875
},
{
"epoch": 0.13878180416345412,
"grad_norm": 8.985050201416016,
"learning_rate": 9.98104496568975e-06,
"loss": 0.8571,
"step": 900
},
{
"epoch": 0.14263685427910563,
"grad_norm": 12.297369003295898,
"learning_rate": 9.979976792483709e-06,
"loss": 0.9435,
"step": 925
},
{
"epoch": 0.14649190439475712,
"grad_norm": 9.352435111999512,
"learning_rate": 9.978879401518853e-06,
"loss": 0.856,
"step": 950
},
{
"epoch": 0.15034695451040864,
"grad_norm": 8.96108341217041,
"learning_rate": 9.977752799233626e-06,
"loss": 0.8447,
"step": 975
},
{
"epoch": 0.15420200462606015,
"grad_norm": 8.161125183105469,
"learning_rate": 9.976596992237857e-06,
"loss": 0.7626,
"step": 1000
},
{
"epoch": 0.15805705474171164,
"grad_norm": 7.878628253936768,
"learning_rate": 9.975411987312723e-06,
"loss": 0.9266,
"step": 1025
},
{
"epoch": 0.16191210485736315,
"grad_norm": 7.8993120193481445,
"learning_rate": 9.9741977914107e-06,
"loss": 0.9941,
"step": 1050
},
{
"epoch": 0.16576715497301464,
"grad_norm": 10.78333854675293,
"learning_rate": 9.972954411655536e-06,
"loss": 0.9106,
"step": 1075
},
{
"epoch": 0.16962220508866616,
"grad_norm": 10.389150619506836,
"learning_rate": 9.971681855342196e-06,
"loss": 0.9304,
"step": 1100
},
{
"epoch": 0.17347725520431764,
"grad_norm": 11.26492691040039,
"learning_rate": 9.970380129936828e-06,
"loss": 0.8549,
"step": 1125
},
{
"epoch": 0.17733230531996916,
"grad_norm": 16.504281997680664,
"learning_rate": 9.969049243076719e-06,
"loss": 0.8707,
"step": 1150
},
{
"epoch": 0.18118735543562067,
"grad_norm": 9.83029556274414,
"learning_rate": 9.967689202570243e-06,
"loss": 0.7977,
"step": 1175
},
{
"epoch": 0.18504240555127216,
"grad_norm": 20.88360023498535,
"learning_rate": 9.966300016396821e-06,
"loss": 0.9331,
"step": 1200
},
{
"epoch": 0.18889745566692367,
"grad_norm": 10.239086151123047,
"learning_rate": 9.964881692706876e-06,
"loss": 0.7189,
"step": 1225
},
{
"epoch": 0.19275250578257516,
"grad_norm": 10.720580101013184,
"learning_rate": 9.96343423982178e-06,
"loss": 0.8439,
"step": 1250
},
{
"epoch": 0.19660755589822668,
"grad_norm": 12.455538749694824,
"learning_rate": 9.961957666233807e-06,
"loss": 0.8606,
"step": 1275
},
{
"epoch": 0.2004626060138782,
"grad_norm": 8.332338333129883,
"learning_rate": 9.96045198060608e-06,
"loss": 0.8143,
"step": 1300
},
{
"epoch": 0.20431765612952968,
"grad_norm": 11.725828170776367,
"learning_rate": 9.958917191772532e-06,
"loss": 0.8134,
"step": 1325
},
{
"epoch": 0.2081727062451812,
"grad_norm": 7.7737274169921875,
"learning_rate": 9.957353308737841e-06,
"loss": 0.8411,
"step": 1350
},
{
"epoch": 0.21202775636083268,
"grad_norm": 10.63807487487793,
"learning_rate": 9.955760340677383e-06,
"loss": 0.9866,
"step": 1375
},
{
"epoch": 0.2158828064764842,
"grad_norm": 6.718491554260254,
"learning_rate": 9.954138296937175e-06,
"loss": 0.8243,
"step": 1400
},
{
"epoch": 0.2197378565921357,
"grad_norm": 12.062403678894043,
"learning_rate": 9.952487187033824e-06,
"loss": 0.9217,
"step": 1425
},
{
"epoch": 0.2235929067077872,
"grad_norm": 9.561263084411621,
"learning_rate": 9.950807020654472e-06,
"loss": 0.8923,
"step": 1450
},
{
"epoch": 0.2274479568234387,
"grad_norm": 10.816781997680664,
"learning_rate": 9.949097807656731e-06,
"loss": 1.0246,
"step": 1475
},
{
"epoch": 0.2313030069390902,
"grad_norm": 9.85682487487793,
"learning_rate": 9.947359558068638e-06,
"loss": 0.9884,
"step": 1500
},
{
"epoch": 0.23515805705474171,
"grad_norm": 8.669824600219727,
"learning_rate": 9.945592282088583e-06,
"loss": 0.8497,
"step": 1525
},
{
"epoch": 0.2390131071703932,
"grad_norm": 11.811067581176758,
"learning_rate": 9.94379599008526e-06,
"loss": 0.9259,
"step": 1550
},
{
"epoch": 0.24286815728604472,
"grad_norm": 10.513306617736816,
"learning_rate": 9.941970692597596e-06,
"loss": 0.8402,
"step": 1575
},
{
"epoch": 0.24672320740169623,
"grad_norm": 12.475310325622559,
"learning_rate": 9.940116400334698e-06,
"loss": 0.8891,
"step": 1600
},
{
"epoch": 0.25057825751734775,
"grad_norm": 9.706908226013184,
"learning_rate": 9.938233124175787e-06,
"loss": 0.8758,
"step": 1625
},
{
"epoch": 0.2544333076329992,
"grad_norm": 8.308640480041504,
"learning_rate": 9.936320875170133e-06,
"loss": 0.8675,
"step": 1650
},
{
"epoch": 0.2582883577486507,
"grad_norm": 8.413049697875977,
"learning_rate": 9.934379664536994e-06,
"loss": 0.7676,
"step": 1675
},
{
"epoch": 0.26214340786430224,
"grad_norm": 13.29423713684082,
"learning_rate": 9.932409503665536e-06,
"loss": 1.0323,
"step": 1700
},
{
"epoch": 0.26599845797995375,
"grad_norm": 9.084247589111328,
"learning_rate": 9.930410404114796e-06,
"loss": 0.8498,
"step": 1725
},
{
"epoch": 0.26985350809560527,
"grad_norm": 7.67595911026001,
"learning_rate": 9.928382377613578e-06,
"loss": 0.8839,
"step": 1750
},
{
"epoch": 0.2737085582112567,
"grad_norm": 7.744143486022949,
"learning_rate": 9.926325436060413e-06,
"loss": 0.8742,
"step": 1775
},
{
"epoch": 0.27756360832690824,
"grad_norm": 9.411246299743652,
"learning_rate": 9.924239591523472e-06,
"loss": 0.9324,
"step": 1800
},
{
"epoch": 0.28141865844255975,
"grad_norm": 5.600951671600342,
"learning_rate": 9.922124856240507e-06,
"loss": 0.7876,
"step": 1825
},
{
"epoch": 0.28527370855821127,
"grad_norm": 9.795894622802734,
"learning_rate": 9.919981242618764e-06,
"loss": 0.9812,
"step": 1850
},
{
"epoch": 0.2891287586738628,
"grad_norm": 12.65941333770752,
"learning_rate": 9.91780876323493e-06,
"loss": 0.841,
"step": 1875
},
{
"epoch": 0.29298380878951424,
"grad_norm": 9.846212387084961,
"learning_rate": 9.915607430835041e-06,
"loss": 0.8043,
"step": 1900
},
{
"epoch": 0.29683885890516576,
"grad_norm": 8.951935768127441,
"learning_rate": 9.91337725833442e-06,
"loss": 0.7336,
"step": 1925
},
{
"epoch": 0.3006939090208173,
"grad_norm": 10.565713882446289,
"learning_rate": 9.911118258817593e-06,
"loss": 0.9214,
"step": 1950
},
{
"epoch": 0.3045489591364688,
"grad_norm": 11.37509822845459,
"learning_rate": 9.908830445538218e-06,
"loss": 0.8742,
"step": 1975
},
{
"epoch": 0.3084040092521203,
"grad_norm": 8.522985458374023,
"learning_rate": 9.906513831919004e-06,
"loss": 0.7956,
"step": 2000
},
{
"epoch": 0.31225905936777176,
"grad_norm": 11.503297805786133,
"learning_rate": 9.904168431551631e-06,
"loss": 0.9218,
"step": 2025
},
{
"epoch": 0.3161141094834233,
"grad_norm": 9.792448997497559,
"learning_rate": 9.901794258196672e-06,
"loss": 0.7922,
"step": 2050
},
{
"epoch": 0.3199691595990748,
"grad_norm": 8.851276397705078,
"learning_rate": 9.899391325783516e-06,
"loss": 0.8263,
"step": 2075
},
{
"epoch": 0.3238242097147263,
"grad_norm": 9.307406425476074,
"learning_rate": 9.896959648410282e-06,
"loss": 0.8465,
"step": 2100
},
{
"epoch": 0.3276792598303778,
"grad_norm": 9.803622245788574,
"learning_rate": 9.894499240343736e-06,
"loss": 0.9159,
"step": 2125
},
{
"epoch": 0.3315343099460293,
"grad_norm": 9.91102123260498,
"learning_rate": 9.892010116019206e-06,
"loss": 0.8109,
"step": 2150
},
{
"epoch": 0.3353893600616808,
"grad_norm": 5.6517252922058105,
"learning_rate": 9.8894922900405e-06,
"loss": 0.8854,
"step": 2175
},
{
"epoch": 0.3392444101773323,
"grad_norm": 8.251344680786133,
"learning_rate": 9.886945777179829e-06,
"loss": 0.7388,
"step": 2200
},
{
"epoch": 0.3430994602929838,
"grad_norm": 5.0112714767456055,
"learning_rate": 9.884370592377697e-06,
"loss": 0.7958,
"step": 2225
},
{
"epoch": 0.3469545104086353,
"grad_norm": 6.233525276184082,
"learning_rate": 9.881766750742838e-06,
"loss": 0.7946,
"step": 2250
},
{
"epoch": 0.3508095605242868,
"grad_norm": 9.736295700073242,
"learning_rate": 9.879134267552114e-06,
"loss": 0.779,
"step": 2275
},
{
"epoch": 0.3546646106399383,
"grad_norm": 7.2100090980529785,
"learning_rate": 9.876473158250426e-06,
"loss": 0.8867,
"step": 2300
},
{
"epoch": 0.35851966075558983,
"grad_norm": 11.768645286560059,
"learning_rate": 9.873783438450629e-06,
"loss": 0.8176,
"step": 2325
},
{
"epoch": 0.36237471087124135,
"grad_norm": 10.983874320983887,
"learning_rate": 9.871065123933436e-06,
"loss": 0.7658,
"step": 2350
},
{
"epoch": 0.3662297609868928,
"grad_norm": 9.236329078674316,
"learning_rate": 9.868318230647328e-06,
"loss": 0.8207,
"step": 2375
},
{
"epoch": 0.3700848111025443,
"grad_norm": 10.319311141967773,
"learning_rate": 9.865542774708455e-06,
"loss": 0.7839,
"step": 2400
},
{
"epoch": 0.37393986121819583,
"grad_norm": 8.785970687866211,
"learning_rate": 9.86273877240055e-06,
"loss": 0.8256,
"step": 2425
},
{
"epoch": 0.37779491133384735,
"grad_norm": 9.625986099243164,
"learning_rate": 9.859906240174825e-06,
"loss": 0.9255,
"step": 2450
},
{
"epoch": 0.38164996144949886,
"grad_norm": 5.00286340713501,
"learning_rate": 9.857045194649881e-06,
"loss": 0.8422,
"step": 2475
},
{
"epoch": 0.3855050115651503,
"grad_norm": 8.894876480102539,
"learning_rate": 9.854155652611608e-06,
"loss": 0.938,
"step": 2500
},
{
"epoch": 0.38936006168080184,
"grad_norm": 6.8418073654174805,
"learning_rate": 9.851237631013085e-06,
"loss": 0.8938,
"step": 2525
},
{
"epoch": 0.39321511179645335,
"grad_norm": 10.542811393737793,
"learning_rate": 9.848291146974483e-06,
"loss": 0.853,
"step": 2550
},
{
"epoch": 0.39707016191210487,
"grad_norm": 9.824645042419434,
"learning_rate": 9.84531621778296e-06,
"loss": 0.8478,
"step": 2575
},
{
"epoch": 0.4009252120277564,
"grad_norm": 9.339557647705078,
"learning_rate": 9.842312860892568e-06,
"loss": 0.8311,
"step": 2600
},
{
"epoch": 0.40478026214340784,
"grad_norm": 7.598040580749512,
"learning_rate": 9.839281093924145e-06,
"loss": 0.8647,
"step": 2625
},
{
"epoch": 0.40863531225905936,
"grad_norm": 9.201178550720215,
"learning_rate": 9.836220934665208e-06,
"loss": 0.8474,
"step": 2650
},
{
"epoch": 0.4124903623747109,
"grad_norm": 8.161620140075684,
"learning_rate": 9.833132401069857e-06,
"loss": 0.8378,
"step": 2675
},
{
"epoch": 0.4163454124903624,
"grad_norm": 7.657758712768555,
"learning_rate": 9.830015511258665e-06,
"loss": 0.8887,
"step": 2700
},
{
"epoch": 0.4202004626060139,
"grad_norm": 10.415013313293457,
"learning_rate": 9.82687028351857e-06,
"loss": 0.9402,
"step": 2725
},
{
"epoch": 0.42405551272166536,
"grad_norm": 8.337896347045898,
"learning_rate": 9.823696736302774e-06,
"loss": 0.8444,
"step": 2750
},
{
"epoch": 0.4279105628373169,
"grad_norm": 11.229880332946777,
"learning_rate": 9.820494888230628e-06,
"loss": 0.7538,
"step": 2775
},
{
"epoch": 0.4317656129529684,
"grad_norm": 7.037449359893799,
"learning_rate": 9.817264758087522e-06,
"loss": 0.8761,
"step": 2800
},
{
"epoch": 0.4356206630686199,
"grad_norm": 8.318718910217285,
"learning_rate": 9.814006364824786e-06,
"loss": 0.8115,
"step": 2825
},
{
"epoch": 0.4394757131842714,
"grad_norm": 9.426321029663086,
"learning_rate": 9.810719727559564e-06,
"loss": 0.8124,
"step": 2850
},
{
"epoch": 0.4433307632999229,
"grad_norm": 10.300312042236328,
"learning_rate": 9.807404865574715e-06,
"loss": 0.8951,
"step": 2875
},
{
"epoch": 0.4471858134155744,
"grad_norm": 11.817306518554688,
"learning_rate": 9.80406179831869e-06,
"loss": 0.7064,
"step": 2900
},
{
"epoch": 0.4510408635312259,
"grad_norm": 11.302453994750977,
"learning_rate": 9.80069054540542e-06,
"loss": 0.8422,
"step": 2925
},
{
"epoch": 0.4548959136468774,
"grad_norm": 5.307698726654053,
"learning_rate": 9.797291126614206e-06,
"loss": 0.7709,
"step": 2950
},
{
"epoch": 0.45875096376252894,
"grad_norm": 5.95034122467041,
"learning_rate": 9.793863561889599e-06,
"loss": 0.7484,
"step": 2975
},
{
"epoch": 0.4626060138781804,
"grad_norm": 10.917853355407715,
"learning_rate": 9.790407871341283e-06,
"loss": 0.829,
"step": 3000
},
{
"epoch": 0.4664610639938319,
"grad_norm": 9.553736686706543,
"learning_rate": 9.786924075243958e-06,
"loss": 0.8611,
"step": 3025
},
{
"epoch": 0.47031611410948343,
"grad_norm": 9.187798500061035,
"learning_rate": 9.783412194037218e-06,
"loss": 0.8711,
"step": 3050
},
{
"epoch": 0.47417116422513494,
"grad_norm": 13.05002498626709,
"learning_rate": 9.779872248325438e-06,
"loss": 0.7503,
"step": 3075
},
{
"epoch": 0.4780262143407864,
"grad_norm": 9.865554809570312,
"learning_rate": 9.776304258877645e-06,
"loss": 0.9458,
"step": 3100
},
{
"epoch": 0.4818812644564379,
"grad_norm": 5.100190162658691,
"learning_rate": 9.772708246627402e-06,
"loss": 0.7377,
"step": 3125
},
{
"epoch": 0.48573631457208943,
"grad_norm": 8.943501472473145,
"learning_rate": 9.769084232672684e-06,
"loss": 0.8123,
"step": 3150
},
{
"epoch": 0.48959136468774095,
"grad_norm": 12.481297492980957,
"learning_rate": 9.765432238275749e-06,
"loss": 0.917,
"step": 3175
},
{
"epoch": 0.49344641480339246,
"grad_norm": 8.235169410705566,
"learning_rate": 9.761752284863024e-06,
"loss": 0.7308,
"step": 3200
},
{
"epoch": 0.4973014649190439,
"grad_norm": 6.6950907707214355,
"learning_rate": 9.758044394024964e-06,
"loss": 0.822,
"step": 3225
},
{
"epoch": 0.5011565150346955,
"grad_norm": 7.119236946105957,
"learning_rate": 9.754308587515945e-06,
"loss": 0.6874,
"step": 3250
},
{
"epoch": 0.505011565150347,
"grad_norm": 8.58201789855957,
"learning_rate": 9.750544887254117e-06,
"loss": 0.8218,
"step": 3275
},
{
"epoch": 0.5088666152659984,
"grad_norm": 5.805675506591797,
"learning_rate": 9.746753315321289e-06,
"loss": 0.7239,
"step": 3300
},
{
"epoch": 0.51272166538165,
"grad_norm": 12.45322322845459,
"learning_rate": 9.742933893962789e-06,
"loss": 0.8146,
"step": 3325
},
{
"epoch": 0.5165767154973014,
"grad_norm": 6.444241046905518,
"learning_rate": 9.739086645587346e-06,
"loss": 0.7171,
"step": 3350
},
{
"epoch": 0.520431765612953,
"grad_norm": 5.90339469909668,
"learning_rate": 9.735211592766946e-06,
"loss": 0.7101,
"step": 3375
},
{
"epoch": 0.5242868157286045,
"grad_norm": 8.637225151062012,
"learning_rate": 9.731308758236706e-06,
"loss": 0.8791,
"step": 3400
},
{
"epoch": 0.5281418658442559,
"grad_norm": 8.763033866882324,
"learning_rate": 9.727378164894742e-06,
"loss": 0.8751,
"step": 3425
},
{
"epoch": 0.5319969159599075,
"grad_norm": 11.44009017944336,
"learning_rate": 9.723419835802032e-06,
"loss": 0.725,
"step": 3450
},
{
"epoch": 0.535851966075559,
"grad_norm": 10.40001106262207,
"learning_rate": 9.719433794182276e-06,
"loss": 0.7511,
"step": 3475
},
{
"epoch": 0.5397070161912105,
"grad_norm": 12.056906700134277,
"learning_rate": 9.715420063421768e-06,
"loss": 0.9661,
"step": 3500
},
{
"epoch": 0.543562066306862,
"grad_norm": 9.088433265686035,
"learning_rate": 9.71137866706926e-06,
"loss": 0.8129,
"step": 3525
},
{
"epoch": 0.5474171164225135,
"grad_norm": 7.02070426940918,
"learning_rate": 9.707309628835812e-06,
"loss": 0.8255,
"step": 3550
},
{
"epoch": 0.551272166538165,
"grad_norm": 7.946370601654053,
"learning_rate": 9.703212972594663e-06,
"loss": 0.7607,
"step": 3575
},
{
"epoch": 0.5551272166538165,
"grad_norm": 10.598237991333008,
"learning_rate": 9.69908872238109e-06,
"loss": 0.8394,
"step": 3600
},
{
"epoch": 0.558982266769468,
"grad_norm": 7.806949615478516,
"learning_rate": 9.694936902392264e-06,
"loss": 0.847,
"step": 3625
},
{
"epoch": 0.5628373168851195,
"grad_norm": 7.041951656341553,
"learning_rate": 9.690757536987106e-06,
"loss": 0.8041,
"step": 3650
},
{
"epoch": 0.566692367000771,
"grad_norm": 8.392083168029785,
"learning_rate": 9.686550650686153e-06,
"loss": 0.9944,
"step": 3675
},
{
"epoch": 0.5705474171164225,
"grad_norm": 12.13780403137207,
"learning_rate": 9.682316268171403e-06,
"loss": 0.8805,
"step": 3700
},
{
"epoch": 0.574402467232074,
"grad_norm": 11.311466217041016,
"learning_rate": 9.67805441428618e-06,
"loss": 0.784,
"step": 3725
},
{
"epoch": 0.5782575173477256,
"grad_norm": 9.810102462768555,
"learning_rate": 9.673765114034982e-06,
"loss": 0.8473,
"step": 3750
},
{
"epoch": 0.582112567463377,
"grad_norm": 10.17574691772461,
"learning_rate": 9.669448392583334e-06,
"loss": 0.8334,
"step": 3775
},
{
"epoch": 0.5859676175790285,
"grad_norm": 11.017036437988281,
"learning_rate": 9.665104275257645e-06,
"loss": 0.9855,
"step": 3800
},
{
"epoch": 0.5898226676946801,
"grad_norm": 7.330849647521973,
"learning_rate": 9.660732787545057e-06,
"loss": 1.0332,
"step": 3825
},
{
"epoch": 0.5936777178103315,
"grad_norm": 10.396186828613281,
"learning_rate": 9.656333955093295e-06,
"loss": 0.6633,
"step": 3850
},
{
"epoch": 0.5975327679259831,
"grad_norm": 9.76086711883545,
"learning_rate": 9.651907803710516e-06,
"loss": 0.7647,
"step": 3875
},
{
"epoch": 0.6013878180416345,
"grad_norm": 5.8678483963012695,
"learning_rate": 9.647454359365159e-06,
"loss": 0.8668,
"step": 3900
},
{
"epoch": 0.605242868157286,
"grad_norm": 10.050674438476562,
"learning_rate": 9.642973648185792e-06,
"loss": 0.8136,
"step": 3925
},
{
"epoch": 0.6090979182729376,
"grad_norm": 7.7433648109436035,
"learning_rate": 9.638465696460957e-06,
"loss": 0.9112,
"step": 3950
},
{
"epoch": 0.612952968388589,
"grad_norm": 8.573099136352539,
"learning_rate": 9.633930530639024e-06,
"loss": 0.8363,
"step": 3975
},
{
"epoch": 0.6168080185042406,
"grad_norm": 9.313333511352539,
"learning_rate": 9.629368177328022e-06,
"loss": 0.8344,
"step": 4000
},
{
"epoch": 0.6206630686198921,
"grad_norm": 9.854177474975586,
"learning_rate": 9.624778663295493e-06,
"loss": 0.9214,
"step": 4025
},
{
"epoch": 0.6245181187355435,
"grad_norm": 11.585829734802246,
"learning_rate": 9.620162015468334e-06,
"loss": 0.9229,
"step": 4050
},
{
"epoch": 0.6283731688511951,
"grad_norm": 8.363166809082031,
"learning_rate": 9.615518260932632e-06,
"loss": 0.9013,
"step": 4075
},
{
"epoch": 0.6322282189668466,
"grad_norm": 6.946799278259277,
"learning_rate": 9.610847426933518e-06,
"loss": 0.7621,
"step": 4100
},
{
"epoch": 0.6360832690824981,
"grad_norm": 13.243395805358887,
"learning_rate": 9.606149540874994e-06,
"loss": 0.9343,
"step": 4125
},
{
"epoch": 0.6399383191981496,
"grad_norm": 6.437871932983398,
"learning_rate": 9.601424630319778e-06,
"loss": 0.7933,
"step": 4150
},
{
"epoch": 0.643793369313801,
"grad_norm": 12.792265892028809,
"learning_rate": 9.596672722989145e-06,
"loss": 0.782,
"step": 4175
},
{
"epoch": 0.6476484194294526,
"grad_norm": 8.843002319335938,
"learning_rate": 9.591893846762759e-06,
"loss": 0.782,
"step": 4200
},
{
"epoch": 0.6515034695451041,
"grad_norm": 9.300607681274414,
"learning_rate": 9.587088029678512e-06,
"loss": 0.8961,
"step": 4225
},
{
"epoch": 0.6553585196607556,
"grad_norm": 8.529927253723145,
"learning_rate": 9.582255299932359e-06,
"loss": 0.8352,
"step": 4250
},
{
"epoch": 0.6592135697764071,
"grad_norm": 8.342142105102539,
"learning_rate": 9.577395685878155e-06,
"loss": 0.724,
"step": 4275
},
{
"epoch": 0.6630686198920586,
"grad_norm": 8.829712867736816,
"learning_rate": 9.572509216027484e-06,
"loss": 0.8034,
"step": 4300
},
{
"epoch": 0.6669236700077101,
"grad_norm": 10.340896606445312,
"learning_rate": 9.567595919049495e-06,
"loss": 0.8574,
"step": 4325
},
{
"epoch": 0.6707787201233616,
"grad_norm": 9.885419845581055,
"learning_rate": 9.562655823770733e-06,
"loss": 0.794,
"step": 4350
},
{
"epoch": 0.674633770239013,
"grad_norm": 11.733614921569824,
"learning_rate": 9.557688959174972e-06,
"loss": 0.7989,
"step": 4375
},
{
"epoch": 0.6784888203546646,
"grad_norm": 12.726638793945312,
"learning_rate": 9.55269535440304e-06,
"loss": 0.866,
"step": 4400
},
{
"epoch": 0.6823438704703161,
"grad_norm": 10.073393821716309,
"learning_rate": 9.547675038752648e-06,
"loss": 0.8619,
"step": 4425
},
{
"epoch": 0.6861989205859677,
"grad_norm": 8.336028099060059,
"learning_rate": 9.54262804167823e-06,
"loss": 0.8145,
"step": 4450
},
{
"epoch": 0.6900539707016191,
"grad_norm": 4.4542036056518555,
"learning_rate": 9.537554392790754e-06,
"loss": 0.7211,
"step": 4475
},
{
"epoch": 0.6939090208172706,
"grad_norm": 7.8720526695251465,
"learning_rate": 9.532454121857556e-06,
"loss": 0.7576,
"step": 4500
},
{
"epoch": 0.6977640709329221,
"grad_norm": 10.366761207580566,
"learning_rate": 9.527327258802169e-06,
"loss": 0.6862,
"step": 4525
},
{
"epoch": 0.7016191210485736,
"grad_norm": 12.468461990356445,
"learning_rate": 9.52217383370414e-06,
"loss": 0.8133,
"step": 4550
},
{
"epoch": 0.7054741711642252,
"grad_norm": 7.985262393951416,
"learning_rate": 9.516993876798855e-06,
"loss": 0.8456,
"step": 4575
},
{
"epoch": 0.7093292212798766,
"grad_norm": 8.8193941116333,
"learning_rate": 9.511787418477367e-06,
"loss": 0.8173,
"step": 4600
},
{
"epoch": 0.7131842713955281,
"grad_norm": 8.104015350341797,
"learning_rate": 9.506554489286213e-06,
"loss": 0.8371,
"step": 4625
},
{
"epoch": 0.7170393215111797,
"grad_norm": 8.242918968200684,
"learning_rate": 9.501295119927234e-06,
"loss": 0.8245,
"step": 4650
},
{
"epoch": 0.7208943716268311,
"grad_norm": 8.603202819824219,
"learning_rate": 9.4960093412574e-06,
"loss": 0.6737,
"step": 4675
},
{
"epoch": 0.7247494217424827,
"grad_norm": 5.0336079597473145,
"learning_rate": 9.490697184288623e-06,
"loss": 0.6953,
"step": 4700
},
{
"epoch": 0.7286044718581342,
"grad_norm": 8.692102432250977,
"learning_rate": 9.48535868018758e-06,
"loss": 0.7217,
"step": 4725
},
{
"epoch": 0.7324595219737856,
"grad_norm": 6.617276668548584,
"learning_rate": 9.479993860275523e-06,
"loss": 0.7224,
"step": 4750
},
{
"epoch": 0.7363145720894372,
"grad_norm": 9.631497383117676,
"learning_rate": 9.474602756028106e-06,
"loss": 0.9555,
"step": 4775
},
{
"epoch": 0.7401696222050886,
"grad_norm": 10.854545593261719,
"learning_rate": 9.469185399075192e-06,
"loss": 0.7077,
"step": 4800
},
{
"epoch": 0.7440246723207402,
"grad_norm": 7.565122604370117,
"learning_rate": 9.46374182120067e-06,
"loss": 0.7945,
"step": 4825
},
{
"epoch": 0.7478797224363917,
"grad_norm": 6.456957817077637,
"learning_rate": 9.458272054342267e-06,
"loss": 0.7428,
"step": 4850
},
{
"epoch": 0.7517347725520431,
"grad_norm": 11.974895477294922,
"learning_rate": 9.452776130591364e-06,
"loss": 0.701,
"step": 4875
},
{
"epoch": 0.7555898226676947,
"grad_norm": 6.930185317993164,
"learning_rate": 9.447254082192805e-06,
"loss": 0.7299,
"step": 4900
},
{
"epoch": 0.7594448727833462,
"grad_norm": 9.224297523498535,
"learning_rate": 9.441705941544707e-06,
"loss": 0.8717,
"step": 4925
},
{
"epoch": 0.7632999228989977,
"grad_norm": 8.671370506286621,
"learning_rate": 9.436131741198279e-06,
"loss": 0.9658,
"step": 4950
},
{
"epoch": 0.7671549730146492,
"grad_norm": 9.06234073638916,
"learning_rate": 9.430531513857608e-06,
"loss": 0.8743,
"step": 4975
},
{
"epoch": 0.7710100231303006,
"grad_norm": 7.48011589050293,
"learning_rate": 9.424905292379497e-06,
"loss": 0.7173,
"step": 5000
},
{
"epoch": 0.7748650732459522,
"grad_norm": 5.954658031463623,
"learning_rate": 9.419253109773257e-06,
"loss": 0.8436,
"step": 5025
},
{
"epoch": 0.7787201233616037,
"grad_norm": 7.4661478996276855,
"learning_rate": 9.413574999200502e-06,
"loss": 0.9183,
"step": 5050
},
{
"epoch": 0.7825751734772552,
"grad_norm": 12.923650741577148,
"learning_rate": 9.40787099397498e-06,
"loss": 0.8006,
"step": 5075
},
{
"epoch": 0.7864302235929067,
"grad_norm": 7.419551849365234,
"learning_rate": 9.402141127562357e-06,
"loss": 0.8914,
"step": 5100
},
{
"epoch": 0.7902852737085582,
"grad_norm": 5.117984294891357,
"learning_rate": 9.396385433580029e-06,
"loss": 0.8755,
"step": 5125
},
{
"epoch": 0.7941403238242097,
"grad_norm": 8.648459434509277,
"learning_rate": 9.390603945796926e-06,
"loss": 0.6825,
"step": 5150
},
{
"epoch": 0.7979953739398612,
"grad_norm": 9.014701843261719,
"learning_rate": 9.384796698133308e-06,
"loss": 0.7647,
"step": 5175
},
{
"epoch": 0.8018504240555128,
"grad_norm": 7.861103057861328,
"learning_rate": 9.378963724660573e-06,
"loss": 0.7884,
"step": 5200
},
{
"epoch": 0.8057054741711642,
"grad_norm": 9.6216459274292,
"learning_rate": 9.373105059601049e-06,
"loss": 0.8367,
"step": 5225
},
{
"epoch": 0.8095605242868157,
"grad_norm": 10.21359920501709,
"learning_rate": 9.367220737327802e-06,
"loss": 0.8844,
"step": 5250
},
{
"epoch": 0.8134155744024673,
"grad_norm": 9.506515502929688,
"learning_rate": 9.361310792364426e-06,
"loss": 0.7336,
"step": 5275
},
{
"epoch": 0.8172706245181187,
"grad_norm": 10.80900764465332,
"learning_rate": 9.355375259384852e-06,
"loss": 0.7519,
"step": 5300
},
{
"epoch": 0.8211256746337703,
"grad_norm": 8.112074851989746,
"learning_rate": 9.349414173213127e-06,
"loss": 0.8356,
"step": 5325
},
{
"epoch": 0.8249807247494217,
"grad_norm": 9.96387004852295,
"learning_rate": 9.343427568823229e-06,
"loss": 0.8538,
"step": 5350
},
{
"epoch": 0.8288357748650732,
"grad_norm": 7.8798370361328125,
"learning_rate": 9.337415481338845e-06,
"loss": 0.9264,
"step": 5375
},
{
"epoch": 0.8326908249807248,
"grad_norm": 11.040996551513672,
"learning_rate": 9.331377946033176e-06,
"loss": 0.891,
"step": 5400
},
{
"epoch": 0.8365458750963762,
"grad_norm": 8.020524978637695,
"learning_rate": 9.325314998328729e-06,
"loss": 0.7564,
"step": 5425
},
{
"epoch": 0.8404009252120278,
"grad_norm": 7.921288013458252,
"learning_rate": 9.319226673797103e-06,
"loss": 0.8595,
"step": 5450
},
{
"epoch": 0.8442559753276793,
"grad_norm": 9.271672248840332,
"learning_rate": 9.313113008158785e-06,
"loss": 0.8447,
"step": 5475
},
{
"epoch": 0.8481110254433307,
"grad_norm": 10.206059455871582,
"learning_rate": 9.306974037282941e-06,
"loss": 0.7761,
"step": 5500
},
{
"epoch": 0.8519660755589823,
"grad_norm": 7.826180458068848,
"learning_rate": 9.3008097971872e-06,
"loss": 0.8939,
"step": 5525
},
{
"epoch": 0.8558211256746338,
"grad_norm": 11.341131210327148,
"learning_rate": 9.294620324037452e-06,
"loss": 0.8221,
"step": 5550
},
{
"epoch": 0.8596761757902853,
"grad_norm": 8.876914024353027,
"learning_rate": 9.288405654147627e-06,
"loss": 0.8388,
"step": 5575
},
{
"epoch": 0.8635312259059368,
"grad_norm": 7.0240888595581055,
"learning_rate": 9.282165823979489e-06,
"loss": 0.7299,
"step": 5600
},
{
"epoch": 0.8673862760215882,
"grad_norm": 9.497668266296387,
"learning_rate": 9.275900870142412e-06,
"loss": 0.7772,
"step": 5625
},
{
"epoch": 0.8712413261372398,
"grad_norm": 9.582475662231445,
"learning_rate": 9.269610829393177e-06,
"loss": 0.7818,
"step": 5650
},
{
"epoch": 0.8750963762528913,
"grad_norm": 9.7567138671875,
"learning_rate": 9.263295738635752e-06,
"loss": 0.8613,
"step": 5675
},
{
"epoch": 0.8789514263685428,
"grad_norm": 9.330400466918945,
"learning_rate": 9.25695563492107e-06,
"loss": 0.8696,
"step": 5700
},
{
"epoch": 0.8828064764841943,
"grad_norm": 9.952261924743652,
"learning_rate": 9.250590555446819e-06,
"loss": 0.7997,
"step": 5725
},
{
"epoch": 0.8866615265998458,
"grad_norm": 9.868045806884766,
"learning_rate": 9.244200537557222e-06,
"loss": 0.7162,
"step": 5750
},
{
"epoch": 0.8905165767154973,
"grad_norm": 7.933254241943359,
"learning_rate": 9.237785618742815e-06,
"loss": 0.7762,
"step": 5775
},
{
"epoch": 0.8943716268311488,
"grad_norm": 12.200456619262695,
"learning_rate": 9.231345836640228e-06,
"loss": 0.7758,
"step": 5800
},
{
"epoch": 0.8982266769468004,
"grad_norm": 9.943805694580078,
"learning_rate": 9.224881229031968e-06,
"loss": 0.9267,
"step": 5825
},
{
"epoch": 0.9020817270624518,
"grad_norm": 4.955184459686279,
"learning_rate": 9.218391833846193e-06,
"loss": 0.8925,
"step": 5850
},
{
"epoch": 0.9059367771781033,
"grad_norm": 11.516079902648926,
"learning_rate": 9.211877689156488e-06,
"loss": 0.8999,
"step": 5875
},
{
"epoch": 0.9097918272937549,
"grad_norm": 10.042754173278809,
"learning_rate": 9.20533883318165e-06,
"loss": 0.6636,
"step": 5900
},
{
"epoch": 0.9136468774094063,
"grad_norm": 8.897378921508789,
"learning_rate": 9.198775304285457e-06,
"loss": 0.8256,
"step": 5925
},
{
"epoch": 0.9175019275250579,
"grad_norm": 6.932818412780762,
"learning_rate": 9.192187140976436e-06,
"loss": 0.7192,
"step": 5950
},
{
"epoch": 0.9213569776407093,
"grad_norm": 8.470138549804688,
"learning_rate": 9.18557438190766e-06,
"loss": 0.9041,
"step": 5975
},
{
"epoch": 0.9252120277563608,
"grad_norm": 5.760867118835449,
"learning_rate": 9.178937065876494e-06,
"loss": 0.76,
"step": 6000
},
{
"epoch": 0.9290670778720124,
"grad_norm": 7.360080718994141,
"learning_rate": 9.172275231824387e-06,
"loss": 0.8082,
"step": 6025
},
{
"epoch": 0.9329221279876638,
"grad_norm": 10.077314376831055,
"learning_rate": 9.16558891883663e-06,
"loss": 0.9417,
"step": 6050
},
{
"epoch": 0.9367771781033153,
"grad_norm": 11.441020965576172,
"learning_rate": 9.15887816614214e-06,
"loss": 0.8991,
"step": 6075
},
{
"epoch": 0.9406322282189669,
"grad_norm": 8.397335052490234,
"learning_rate": 9.152143013113218e-06,
"loss": 0.7826,
"step": 6100
},
{
"epoch": 0.9444872783346183,
"grad_norm": 8.355264663696289,
"learning_rate": 9.145383499265323e-06,
"loss": 0.744,
"step": 6125
},
{
"epoch": 0.9483423284502699,
"grad_norm": 8.08262825012207,
"learning_rate": 9.138599664256847e-06,
"loss": 0.7599,
"step": 6150
},
{
"epoch": 0.9521973785659213,
"grad_norm": 9.135760307312012,
"learning_rate": 9.131791547888864e-06,
"loss": 0.934,
"step": 6175
},
{
"epoch": 0.9560524286815728,
"grad_norm": 10.364896774291992,
"learning_rate": 9.12495919010492e-06,
"loss": 0.8957,
"step": 6200
},
{
"epoch": 0.9599074787972244,
"grad_norm": 9.866580963134766,
"learning_rate": 9.118102630990776e-06,
"loss": 0.823,
"step": 6225
},
{
"epoch": 0.9637625289128758,
"grad_norm": 8.708639144897461,
"learning_rate": 9.111221910774188e-06,
"loss": 0.8413,
"step": 6250
},
{
"epoch": 0.9676175790285274,
"grad_norm": 9.847127914428711,
"learning_rate": 9.104317069824668e-06,
"loss": 0.8716,
"step": 6275
},
{
"epoch": 0.9714726291441789,
"grad_norm": 10.873041152954102,
"learning_rate": 9.097388148653243e-06,
"loss": 0.8618,
"step": 6300
},
{
"epoch": 0.9753276792598303,
"grad_norm": 8.315295219421387,
"learning_rate": 9.09043518791222e-06,
"loss": 0.9405,
"step": 6325
},
{
"epoch": 0.9791827293754819,
"grad_norm": 8.513904571533203,
"learning_rate": 9.08345822839495e-06,
"loss": 0.7109,
"step": 6350
},
{
"epoch": 0.9830377794911334,
"grad_norm": 9.227296829223633,
"learning_rate": 9.07645731103558e-06,
"loss": 0.8274,
"step": 6375
},
{
"epoch": 0.9868928296067849,
"grad_norm": 6.035506725311279,
"learning_rate": 9.069432476908825e-06,
"loss": 0.7888,
"step": 6400
},
{
"epoch": 0.9907478797224364,
"grad_norm": 10.379996299743652,
"learning_rate": 9.06238376722972e-06,
"loss": 0.8945,
"step": 6425
},
{
"epoch": 0.9946029298380878,
"grad_norm": 11.054986953735352,
"learning_rate": 9.055311223353372e-06,
"loss": 0.7322,
"step": 6450
},
{
"epoch": 0.9984579799537394,
"grad_norm": 8.378334045410156,
"learning_rate": 9.048214886774733e-06,
"loss": 0.7671,
"step": 6475
},
{
"epoch": 1.002313030069391,
"grad_norm": 6.018606662750244,
"learning_rate": 9.041094799128342e-06,
"loss": 0.7057,
"step": 6500
},
{
"epoch": 1.0061680801850423,
"grad_norm": 10.119317054748535,
"learning_rate": 9.033951002188096e-06,
"loss": 0.5273,
"step": 6525
},
{
"epoch": 1.010023130300694,
"grad_norm": 7.7219038009643555,
"learning_rate": 9.026783537866978e-06,
"loss": 0.4959,
"step": 6550
},
{
"epoch": 1.0138781804163455,
"grad_norm": 9.894926071166992,
"learning_rate": 9.01959244821685e-06,
"loss": 0.4924,
"step": 6575
},
{
"epoch": 1.0177332305319968,
"grad_norm": 11.07059383392334,
"learning_rate": 9.012377775428167e-06,
"loss": 0.4465,
"step": 6600
},
{
"epoch": 1.0215882806476484,
"grad_norm": 9.254422187805176,
"learning_rate": 9.005139561829759e-06,
"loss": 0.5173,
"step": 6625
},
{
"epoch": 1.0254433307633,
"grad_norm": 7.215500831604004,
"learning_rate": 8.997877849888564e-06,
"loss": 0.496,
"step": 6650
},
{
"epoch": 1.0292983808789515,
"grad_norm": 6.909051895141602,
"learning_rate": 8.99059268220939e-06,
"loss": 0.5117,
"step": 6675
},
{
"epoch": 1.0331534309946029,
"grad_norm": 11.559415817260742,
"learning_rate": 8.98328410153466e-06,
"loss": 0.5436,
"step": 6700
},
{
"epoch": 1.0370084811102545,
"grad_norm": 7.60066032409668,
"learning_rate": 8.975952150744159e-06,
"loss": 0.5219,
"step": 6725
},
{
"epoch": 1.040863531225906,
"grad_norm": 7.906856060028076,
"learning_rate": 8.96859687285479e-06,
"loss": 0.6516,
"step": 6750
},
{
"epoch": 1.0447185813415574,
"grad_norm": 3.3445956707000732,
"learning_rate": 8.961218311020316e-06,
"loss": 0.5476,
"step": 6775
},
{
"epoch": 1.048573631457209,
"grad_norm": 12.707012176513672,
"learning_rate": 8.953816508531106e-06,
"loss": 0.5194,
"step": 6800
},
{
"epoch": 1.0524286815728605,
"grad_norm": 9.242384910583496,
"learning_rate": 8.946391508813886e-06,
"loss": 0.5039,
"step": 6825
},
{
"epoch": 1.0562837316885119,
"grad_norm": 4.946049690246582,
"learning_rate": 8.93894335543148e-06,
"loss": 0.4557,
"step": 6850
},
{
"epoch": 1.0601387818041634,
"grad_norm": 10.867353439331055,
"learning_rate": 8.931472092082552e-06,
"loss": 0.4875,
"step": 6875
},
{
"epoch": 1.063993831919815,
"grad_norm": 8.44529914855957,
"learning_rate": 8.92397776260136e-06,
"loss": 0.499,
"step": 6900
},
{
"epoch": 1.0678488820354666,
"grad_norm": 5.33937931060791,
"learning_rate": 8.916460410957488e-06,
"loss": 0.5031,
"step": 6925
},
{
"epoch": 1.071703932151118,
"grad_norm": 8.168314933776855,
"learning_rate": 8.908920081255593e-06,
"loss": 0.6211,
"step": 6950
},
{
"epoch": 1.0755589822667695,
"grad_norm": 5.861813545227051,
"learning_rate": 8.901356817735142e-06,
"loss": 0.505,
"step": 6975
},
{
"epoch": 1.079414032382421,
"grad_norm": 9.891986846923828,
"learning_rate": 8.893770664770162e-06,
"loss": 0.6063,
"step": 7000
},
{
"epoch": 1.0832690824980724,
"grad_norm": 7.747451305389404,
"learning_rate": 8.886161666868971e-06,
"loss": 0.6167,
"step": 7025
},
{
"epoch": 1.087124132613724,
"grad_norm": 9.20585823059082,
"learning_rate": 8.878529868673915e-06,
"loss": 0.5689,
"step": 7050
},
{
"epoch": 1.0909791827293756,
"grad_norm": 11.797416687011719,
"learning_rate": 8.870875314961119e-06,
"loss": 0.4784,
"step": 7075
},
{
"epoch": 1.094834232845027,
"grad_norm": 8.297890663146973,
"learning_rate": 8.863198050640208e-06,
"loss": 0.4881,
"step": 7100
},
{
"epoch": 1.0986892829606785,
"grad_norm": 11.986342430114746,
"learning_rate": 8.855498120754053e-06,
"loss": 0.5925,
"step": 7125
},
{
"epoch": 1.10254433307633,
"grad_norm": 9.048810958862305,
"learning_rate": 8.84777557047851e-06,
"loss": 0.5588,
"step": 7150
},
{
"epoch": 1.1063993831919814,
"grad_norm": 9.838631629943848,
"learning_rate": 8.840030445122142e-06,
"loss": 0.5175,
"step": 7175
},
{
"epoch": 1.110254433307633,
"grad_norm": 8.671347618103027,
"learning_rate": 8.832262790125965e-06,
"loss": 0.6014,
"step": 7200
},
{
"epoch": 1.1141094834232845,
"grad_norm": 10.086674690246582,
"learning_rate": 8.82447265106318e-06,
"loss": 0.5071,
"step": 7225
},
{
"epoch": 1.117964533538936,
"grad_norm": 11.334117889404297,
"learning_rate": 8.816660073638898e-06,
"loss": 0.5349,
"step": 7250
},
{
"epoch": 1.1218195836545874,
"grad_norm": 9.23147201538086,
"learning_rate": 8.80882510368988e-06,
"loss": 0.5202,
"step": 7275
},
{
"epoch": 1.125674633770239,
"grad_norm": 6.442698955535889,
"learning_rate": 8.800967787184266e-06,
"loss": 0.4422,
"step": 7300
},
{
"epoch": 1.1295296838858906,
"grad_norm": 8.339442253112793,
"learning_rate": 8.7930881702213e-06,
"loss": 0.5211,
"step": 7325
},
{
"epoch": 1.133384734001542,
"grad_norm": 13.899686813354492,
"learning_rate": 8.785186299031069e-06,
"loss": 0.5819,
"step": 7350
},
{
"epoch": 1.1372397841171935,
"grad_norm": 7.355388641357422,
"learning_rate": 8.777262219974222e-06,
"loss": 0.607,
"step": 7375
},
{
"epoch": 1.141094834232845,
"grad_norm": 5.7306623458862305,
"learning_rate": 8.769315979541706e-06,
"loss": 0.521,
"step": 7400
},
{
"epoch": 1.1449498843484966,
"grad_norm": 5.767916679382324,
"learning_rate": 8.761347624354488e-06,
"loss": 0.5581,
"step": 7425
},
{
"epoch": 1.148804934464148,
"grad_norm": 6.155060291290283,
"learning_rate": 8.753357201163283e-06,
"loss": 0.4737,
"step": 7450
},
{
"epoch": 1.1526599845797996,
"grad_norm": 11.28679084777832,
"learning_rate": 8.745344756848285e-06,
"loss": 0.5749,
"step": 7475
},
{
"epoch": 1.1565150346954511,
"grad_norm": 9.059491157531738,
"learning_rate": 8.73731033841888e-06,
"loss": 0.5029,
"step": 7500
},
{
"epoch": 1.1603700848111025,
"grad_norm": 9.721566200256348,
"learning_rate": 8.729253993013376e-06,
"loss": 0.482,
"step": 7525
},
{
"epoch": 1.164225134926754,
"grad_norm": 6.940269470214844,
"learning_rate": 8.721175767898737e-06,
"loss": 0.5422,
"step": 7550
},
{
"epoch": 1.1680801850424056,
"grad_norm": 7.176370143890381,
"learning_rate": 8.71307571047029e-06,
"loss": 0.5388,
"step": 7575
},
{
"epoch": 1.171935235158057,
"grad_norm": 13.643762588500977,
"learning_rate": 8.704953868251453e-06,
"loss": 0.5321,
"step": 7600
},
{
"epoch": 1.1757902852737085,
"grad_norm": 3.0095925331115723,
"learning_rate": 8.696810288893458e-06,
"loss": 0.4465,
"step": 7625
},
{
"epoch": 1.1796453353893601,
"grad_norm": 5.344406604766846,
"learning_rate": 8.688645020175071e-06,
"loss": 0.5168,
"step": 7650
},
{
"epoch": 1.1835003855050115,
"grad_norm": 7.636973857879639,
"learning_rate": 8.680458110002305e-06,
"loss": 0.409,
"step": 7675
},
{
"epoch": 1.187355435620663,
"grad_norm": 8.79349136352539,
"learning_rate": 8.67224960640815e-06,
"loss": 0.5249,
"step": 7700
},
{
"epoch": 1.1912104857363146,
"grad_norm": 11.105514526367188,
"learning_rate": 8.664019557552286e-06,
"loss": 0.4775,
"step": 7725
},
{
"epoch": 1.195065535851966,
"grad_norm": 7.652461051940918,
"learning_rate": 8.655768011720795e-06,
"loss": 0.5499,
"step": 7750
},
{
"epoch": 1.1989205859676175,
"grad_norm": 9.048026084899902,
"learning_rate": 8.647495017325889e-06,
"loss": 0.6171,
"step": 7775
},
{
"epoch": 1.202775636083269,
"grad_norm": 6.920886039733887,
"learning_rate": 8.639200622905612e-06,
"loss": 0.5427,
"step": 7800
},
{
"epoch": 1.2066306861989207,
"grad_norm": 5.6886396408081055,
"learning_rate": 8.630884877123573e-06,
"loss": 0.5517,
"step": 7825
},
{
"epoch": 1.210485736314572,
"grad_norm": 12.065085411071777,
"learning_rate": 8.62254782876864e-06,
"loss": 0.486,
"step": 7850
},
{
"epoch": 1.2143407864302236,
"grad_norm": 5.747442722320557,
"learning_rate": 8.61418952675467e-06,
"loss": 0.52,
"step": 7875
},
{
"epoch": 1.2181958365458752,
"grad_norm": 7.072346210479736,
"learning_rate": 8.605810020120218e-06,
"loss": 0.6147,
"step": 7900
},
{
"epoch": 1.2220508866615265,
"grad_norm": 10.231158256530762,
"learning_rate": 8.597409358028241e-06,
"loss": 0.4889,
"step": 7925
},
{
"epoch": 1.225905936777178,
"grad_norm": 11.127062797546387,
"learning_rate": 8.588987589765822e-06,
"loss": 0.54,
"step": 7950
},
{
"epoch": 1.2297609868928296,
"grad_norm": 10.189970016479492,
"learning_rate": 8.580544764743875e-06,
"loss": 0.532,
"step": 7975
},
{
"epoch": 1.2336160370084812,
"grad_norm": 8.174032211303711,
"learning_rate": 8.572080932496848e-06,
"loss": 0.6165,
"step": 8000
},
{
"epoch": 1.2374710871241326,
"grad_norm": 9.724369049072266,
"learning_rate": 8.563596142682447e-06,
"loss": 0.63,
"step": 8025
},
{
"epoch": 1.2413261372397841,
"grad_norm": 11.391420364379883,
"learning_rate": 8.555090445081334e-06,
"loss": 0.4851,
"step": 8050
},
{
"epoch": 1.2451811873554357,
"grad_norm": 13.044631958007812,
"learning_rate": 8.546563889596837e-06,
"loss": 0.6101,
"step": 8075
},
{
"epoch": 1.249036237471087,
"grad_norm": 9.989757537841797,
"learning_rate": 8.538016526254662e-06,
"loss": 0.5313,
"step": 8100
},
{
"epoch": 1.2528912875867386,
"grad_norm": 9.661147117614746,
"learning_rate": 8.52944840520259e-06,
"loss": 0.4898,
"step": 8125
},
{
"epoch": 1.2567463377023902,
"grad_norm": 11.322773933410645,
"learning_rate": 8.520859576710191e-06,
"loss": 0.5108,
"step": 8150
},
{
"epoch": 1.2606013878180415,
"grad_norm": 8.221813201904297,
"learning_rate": 8.512250091168533e-06,
"loss": 0.485,
"step": 8175
},
{
"epoch": 1.2644564379336931,
"grad_norm": 11.434172630310059,
"learning_rate": 8.503619999089866e-06,
"loss": 0.5734,
"step": 8200
},
{
"epoch": 1.2683114880493447,
"grad_norm": 9.098228454589844,
"learning_rate": 8.494969351107353e-06,
"loss": 0.496,
"step": 8225
},
{
"epoch": 1.272166538164996,
"grad_norm": 9.831649780273438,
"learning_rate": 8.48629819797475e-06,
"loss": 0.4865,
"step": 8250
},
{
"epoch": 1.2760215882806476,
"grad_norm": 8.599019050598145,
"learning_rate": 8.477606590566124e-06,
"loss": 0.4883,
"step": 8275
},
{
"epoch": 1.2798766383962992,
"grad_norm": 11.627860069274902,
"learning_rate": 8.468894579875547e-06,
"loss": 0.5715,
"step": 8300
},
{
"epoch": 1.2837316885119505,
"grad_norm": 3.541821241378784,
"learning_rate": 8.460162217016794e-06,
"loss": 0.4551,
"step": 8325
},
{
"epoch": 1.287586738627602,
"grad_norm": 10.509103775024414,
"learning_rate": 8.451409553223049e-06,
"loss": 0.499,
"step": 8350
},
{
"epoch": 1.2914417887432537,
"grad_norm": 7.862640380859375,
"learning_rate": 8.44263663984661e-06,
"loss": 0.5769,
"step": 8375
},
{
"epoch": 1.2952968388589052,
"grad_norm": 10.078428268432617,
"learning_rate": 8.433843528358564e-06,
"loss": 0.513,
"step": 8400
},
{
"epoch": 1.2991518889745568,
"grad_norm": 8.753310203552246,
"learning_rate": 8.425030270348517e-06,
"loss": 0.6029,
"step": 8425
},
{
"epoch": 1.3030069390902081,
"grad_norm": 10.529582023620605,
"learning_rate": 8.41619691752427e-06,
"loss": 0.5291,
"step": 8450
},
{
"epoch": 1.3068619892058597,
"grad_norm": 8.42566967010498,
"learning_rate": 8.40734352171152e-06,
"loss": 0.4822,
"step": 8475
},
{
"epoch": 1.3107170393215113,
"grad_norm": 10.086427688598633,
"learning_rate": 8.398470134853558e-06,
"loss": 0.4959,
"step": 8500
},
{
"epoch": 1.3145720894371626,
"grad_norm": 5.5653910636901855,
"learning_rate": 8.389576809010962e-06,
"loss": 0.4872,
"step": 8525
},
{
"epoch": 1.3184271395528142,
"grad_norm": 11.507160186767578,
"learning_rate": 8.380663596361293e-06,
"loss": 0.4785,
"step": 8550
},
{
"epoch": 1.3222821896684658,
"grad_norm": 6.287635326385498,
"learning_rate": 8.371730549198796e-06,
"loss": 0.476,
"step": 8575
},
{
"epoch": 1.3261372397841171,
"grad_norm": 12.566143989562988,
"learning_rate": 8.362777719934074e-06,
"loss": 0.4681,
"step": 8600
},
{
"epoch": 1.3299922898997687,
"grad_norm": 11.357763290405273,
"learning_rate": 8.353805161093801e-06,
"loss": 0.4886,
"step": 8625
},
{
"epoch": 1.3338473400154203,
"grad_norm": 11.33309555053711,
"learning_rate": 8.344812925320406e-06,
"loss": 0.5226,
"step": 8650
},
{
"epoch": 1.3377023901310716,
"grad_norm": 8.458454132080078,
"learning_rate": 8.335801065371757e-06,
"loss": 0.5103,
"step": 8675
},
{
"epoch": 1.3415574402467232,
"grad_norm": 6.284041404724121,
"learning_rate": 8.326769634120862e-06,
"loss": 0.4103,
"step": 8700
},
{
"epoch": 1.3454124903623748,
"grad_norm": 9.507338523864746,
"learning_rate": 8.317718684555554e-06,
"loss": 0.5397,
"step": 8725
},
{
"epoch": 1.349267540478026,
"grad_norm": 10.399945259094238,
"learning_rate": 8.308648269778181e-06,
"loss": 0.5419,
"step": 8750
},
{
"epoch": 1.3531225905936777,
"grad_norm": 6.871647834777832,
"learning_rate": 8.299558443005298e-06,
"loss": 0.5185,
"step": 8775
},
{
"epoch": 1.3569776407093292,
"grad_norm": 4.136816501617432,
"learning_rate": 8.290449257567344e-06,
"loss": 0.5646,
"step": 8800
},
{
"epoch": 1.3608326908249806,
"grad_norm": 8.791040420532227,
"learning_rate": 8.281320766908341e-06,
"loss": 0.5408,
"step": 8825
},
{
"epoch": 1.3646877409406322,
"grad_norm": 9.468535423278809,
"learning_rate": 8.272173024585573e-06,
"loss": 0.5096,
"step": 8850
},
{
"epoch": 1.3685427910562837,
"grad_norm": 11.554373741149902,
"learning_rate": 8.263006084269277e-06,
"loss": 0.5726,
"step": 8875
},
{
"epoch": 1.3723978411719353,
"grad_norm": 13.874961853027344,
"learning_rate": 8.253819999742324e-06,
"loss": 0.5361,
"step": 8900
},
{
"epoch": 1.3762528912875869,
"grad_norm": 7.872951507568359,
"learning_rate": 8.244614824899906e-06,
"loss": 0.5004,
"step": 8925
},
{
"epoch": 1.3801079414032382,
"grad_norm": 9.024027824401855,
"learning_rate": 8.23539061374922e-06,
"loss": 0.5592,
"step": 8950
},
{
"epoch": 1.3839629915188898,
"grad_norm": 8.904004096984863,
"learning_rate": 8.226147420409143e-06,
"loss": 0.5054,
"step": 8975
},
{
"epoch": 1.3878180416345414,
"grad_norm": 9.416498184204102,
"learning_rate": 8.21688529910993e-06,
"loss": 0.4857,
"step": 9000
},
{
"epoch": 1.3916730917501927,
"grad_norm": 10.115750312805176,
"learning_rate": 8.207604304192884e-06,
"loss": 0.5484,
"step": 9025
},
{
"epoch": 1.3955281418658443,
"grad_norm": 11.466856956481934,
"learning_rate": 8.198304490110038e-06,
"loss": 0.5908,
"step": 9050
},
{
"epoch": 1.3993831919814959,
"grad_norm": 7.2764363288879395,
"learning_rate": 8.188985911423841e-06,
"loss": 0.6034,
"step": 9075
},
{
"epoch": 1.4032382420971472,
"grad_norm": 4.788441181182861,
"learning_rate": 8.179648622806834e-06,
"loss": 0.4535,
"step": 9100
},
{
"epoch": 1.4070932922127988,
"grad_norm": 11.202897071838379,
"learning_rate": 8.17029267904133e-06,
"loss": 0.5634,
"step": 9125
},
{
"epoch": 1.4109483423284503,
"grad_norm": 4.8849196434021,
"learning_rate": 8.160918135019094e-06,
"loss": 0.5516,
"step": 9150
},
{
"epoch": 1.4148033924441017,
"grad_norm": 13.66285514831543,
"learning_rate": 8.151525045741014e-06,
"loss": 0.4364,
"step": 9175
},
{
"epoch": 1.4186584425597533,
"grad_norm": 8.950008392333984,
"learning_rate": 8.142113466316793e-06,
"loss": 0.5872,
"step": 9200
},
{
"epoch": 1.4225134926754048,
"grad_norm": 10.990318298339844,
"learning_rate": 8.132683451964605e-06,
"loss": 0.5218,
"step": 9225
},
{
"epoch": 1.4263685427910562,
"grad_norm": 7.806232452392578,
"learning_rate": 8.123235058010796e-06,
"loss": 0.4994,
"step": 9250
},
{
"epoch": 1.4302235929067078,
"grad_norm": 7.3956217765808105,
"learning_rate": 8.113768339889535e-06,
"loss": 0.6016,
"step": 9275
},
{
"epoch": 1.4340786430223593,
"grad_norm": 6.363986968994141,
"learning_rate": 8.104283353142506e-06,
"loss": 0.6047,
"step": 9300
},
{
"epoch": 1.4379336931380107,
"grad_norm": 10.691247940063477,
"learning_rate": 8.094780153418573e-06,
"loss": 0.5637,
"step": 9325
},
{
"epoch": 1.4417887432536622,
"grad_norm": 8.655353546142578,
"learning_rate": 8.085258796473458e-06,
"loss": 0.4978,
"step": 9350
},
{
"epoch": 1.4456437933693138,
"grad_norm": 10.034195899963379,
"learning_rate": 8.075719338169408e-06,
"loss": 0.4541,
"step": 9375
},
{
"epoch": 1.4494988434849654,
"grad_norm": 10.09195327758789,
"learning_rate": 8.06616183447488e-06,
"loss": 0.6211,
"step": 9400
},
{
"epoch": 1.4533538936006167,
"grad_norm": 12.030340194702148,
"learning_rate": 8.056586341464194e-06,
"loss": 0.5863,
"step": 9425
},
{
"epoch": 1.4572089437162683,
"grad_norm": 11.241909980773926,
"learning_rate": 8.046992915317224e-06,
"loss": 0.5014,
"step": 9450
},
{
"epoch": 1.4610639938319199,
"grad_norm": 8.376391410827637,
"learning_rate": 8.037381612319052e-06,
"loss": 0.5378,
"step": 9475
},
{
"epoch": 1.4649190439475714,
"grad_norm": 6.140998840332031,
"learning_rate": 8.027752488859644e-06,
"loss": 0.5234,
"step": 9500
},
{
"epoch": 1.4687740940632228,
"grad_norm": 8.74001693725586,
"learning_rate": 8.018105601433526e-06,
"loss": 0.5639,
"step": 9525
},
{
"epoch": 1.4726291441788744,
"grad_norm": 11.376422882080078,
"learning_rate": 8.008441006639437e-06,
"loss": 0.5396,
"step": 9550
},
{
"epoch": 1.476484194294526,
"grad_norm": 8.72598648071289,
"learning_rate": 7.998758761180016e-06,
"loss": 0.4902,
"step": 9575
},
{
"epoch": 1.4803392444101773,
"grad_norm": 7.389594554901123,
"learning_rate": 7.989058921861448e-06,
"loss": 0.4666,
"step": 9600
},
{
"epoch": 1.4841942945258288,
"grad_norm": 14.138694763183594,
"learning_rate": 7.979341545593153e-06,
"loss": 0.5326,
"step": 9625
},
{
"epoch": 1.4880493446414804,
"grad_norm": 11.284343719482422,
"learning_rate": 7.969606689387433e-06,
"loss": 0.5076,
"step": 9650
},
{
"epoch": 1.4919043947571318,
"grad_norm": 12.364653587341309,
"learning_rate": 7.95985441035915e-06,
"loss": 0.5759,
"step": 9675
},
{
"epoch": 1.4957594448727833,
"grad_norm": 7.59297513961792,
"learning_rate": 7.950084765725385e-06,
"loss": 0.5442,
"step": 9700
},
{
"epoch": 1.499614494988435,
"grad_norm": 7.339725494384766,
"learning_rate": 7.940297812805104e-06,
"loss": 0.5211,
"step": 9725
},
{
"epoch": 1.5034695451040863,
"grad_norm": 9.601153373718262,
"learning_rate": 7.930493609018822e-06,
"loss": 0.4818,
"step": 9750
},
{
"epoch": 1.5073245952197378,
"grad_norm": 8.515989303588867,
"learning_rate": 7.920672211888263e-06,
"loss": 0.6484,
"step": 9775
},
{
"epoch": 1.5111796453353894,
"grad_norm": 8.856532096862793,
"learning_rate": 7.910833679036032e-06,
"loss": 0.529,
"step": 9800
},
{
"epoch": 1.5150346954510407,
"grad_norm": 11.153056144714355,
"learning_rate": 7.90097806818526e-06,
"loss": 0.5492,
"step": 9825
},
{
"epoch": 1.5188897455666923,
"grad_norm": 13.278436660766602,
"learning_rate": 7.891105437159284e-06,
"loss": 0.5408,
"step": 9850
},
{
"epoch": 1.5227447956823439,
"grad_norm": 11.355679512023926,
"learning_rate": 7.881215843881296e-06,
"loss": 0.5888,
"step": 9875
},
{
"epoch": 1.5265998457979952,
"grad_norm": 11.082463264465332,
"learning_rate": 7.871309346374005e-06,
"loss": 0.5146,
"step": 9900
},
{
"epoch": 1.530454895913647,
"grad_norm": 10.298775672912598,
"learning_rate": 7.861386002759302e-06,
"loss": 0.5371,
"step": 9925
},
{
"epoch": 1.5343099460292984,
"grad_norm": 5.375964641571045,
"learning_rate": 7.851445871257909e-06,
"loss": 0.5871,
"step": 9950
},
{
"epoch": 1.5381649961449497,
"grad_norm": 10.021186828613281,
"learning_rate": 7.841489010189047e-06,
"loss": 0.5536,
"step": 9975
},
{
"epoch": 1.5420200462606015,
"grad_norm": 8.89342975616455,
"learning_rate": 7.831515477970093e-06,
"loss": 0.4262,
"step": 10000
},
{
"epoch": 1.5458750963762529,
"grad_norm": 8.71821117401123,
"learning_rate": 7.821525333116226e-06,
"loss": 0.5898,
"step": 10025
},
{
"epoch": 1.5497301464919044,
"grad_norm": 8.132588386535645,
"learning_rate": 7.811518634240103e-06,
"loss": 0.6425,
"step": 10050
},
{
"epoch": 1.553585196607556,
"grad_norm": 9.722726821899414,
"learning_rate": 7.801495440051495e-06,
"loss": 0.5682,
"step": 10075
},
{
"epoch": 1.5574402467232074,
"grad_norm": 9.070693969726562,
"learning_rate": 7.791455809356954e-06,
"loss": 0.507,
"step": 10100
},
{
"epoch": 1.561295296838859,
"grad_norm": 17.253707885742188,
"learning_rate": 7.781399801059469e-06,
"loss": 0.4379,
"step": 10125
},
{
"epoch": 1.5651503469545105,
"grad_norm": 9.190994262695312,
"learning_rate": 7.771327474158114e-06,
"loss": 0.5496,
"step": 10150
},
{
"epoch": 1.5690053970701618,
"grad_norm": 7.940937519073486,
"learning_rate": 7.761238887747707e-06,
"loss": 0.5811,
"step": 10175
},
{
"epoch": 1.5728604471858134,
"grad_norm": 3.5593698024749756,
"learning_rate": 7.751134101018463e-06,
"loss": 0.4845,
"step": 10200
},
{
"epoch": 1.576715497301465,
"grad_norm": 8.284226417541504,
"learning_rate": 7.741013173255637e-06,
"loss": 0.5191,
"step": 10225
},
{
"epoch": 1.5805705474171163,
"grad_norm": 11.53990650177002,
"learning_rate": 7.730876163839195e-06,
"loss": 0.5306,
"step": 10250
},
{
"epoch": 1.584425597532768,
"grad_norm": 10.26392936706543,
"learning_rate": 7.720723132243446e-06,
"loss": 0.4636,
"step": 10275
},
{
"epoch": 1.5882806476484195,
"grad_norm": 7.877215385437012,
"learning_rate": 7.710554138036707e-06,
"loss": 0.517,
"step": 10300
},
{
"epoch": 1.5921356977640708,
"grad_norm": 9.344793319702148,
"learning_rate": 7.700369240880944e-06,
"loss": 0.4611,
"step": 10325
},
{
"epoch": 1.5959907478797224,
"grad_norm": 11.912636756896973,
"learning_rate": 7.690168500531437e-06,
"loss": 0.5678,
"step": 10350
},
{
"epoch": 1.599845797995374,
"grad_norm": 9.530919075012207,
"learning_rate": 7.679951976836401e-06,
"loss": 0.5541,
"step": 10375
},
{
"epoch": 1.6037008481110253,
"grad_norm": 12.328695297241211,
"learning_rate": 7.669719729736669e-06,
"loss": 0.6063,
"step": 10400
},
{
"epoch": 1.607555898226677,
"grad_norm": 6.692688465118408,
"learning_rate": 7.659471819265316e-06,
"loss": 0.5489,
"step": 10425
},
{
"epoch": 1.6114109483423285,
"grad_norm": 11.644367218017578,
"learning_rate": 7.649208305547317e-06,
"loss": 0.4056,
"step": 10450
},
{
"epoch": 1.6152659984579798,
"grad_norm": 11.885711669921875,
"learning_rate": 7.638929248799187e-06,
"loss": 0.6173,
"step": 10475
},
{
"epoch": 1.6191210485736316,
"grad_norm": 12.212340354919434,
"learning_rate": 7.628634709328644e-06,
"loss": 0.5973,
"step": 10500
},
{
"epoch": 1.622976098689283,
"grad_norm": 7.975712776184082,
"learning_rate": 7.618324747534229e-06,
"loss": 0.542,
"step": 10525
},
{
"epoch": 1.6268311488049345,
"grad_norm": 9.488859176635742,
"learning_rate": 7.607999423904982e-06,
"loss": 0.5567,
"step": 10550
},
{
"epoch": 1.630686198920586,
"grad_norm": 11.337456703186035,
"learning_rate": 7.597658799020058e-06,
"loss": 0.6015,
"step": 10575
},
{
"epoch": 1.6345412490362374,
"grad_norm": 12.311405181884766,
"learning_rate": 7.587302933548395e-06,
"loss": 0.5477,
"step": 10600
},
{
"epoch": 1.638396299151889,
"grad_norm": 6.970990180969238,
"learning_rate": 7.57693188824834e-06,
"loss": 0.5682,
"step": 10625
},
{
"epoch": 1.6422513492675406,
"grad_norm": 7.132321357727051,
"learning_rate": 7.566545723967309e-06,
"loss": 0.5554,
"step": 10650
},
{
"epoch": 1.646106399383192,
"grad_norm": 8.085411071777344,
"learning_rate": 7.556144501641418e-06,
"loss": 0.5431,
"step": 10675
},
{
"epoch": 1.6499614494988435,
"grad_norm": 10.503854751586914,
"learning_rate": 7.545728282295127e-06,
"loss": 0.5855,
"step": 10700
},
{
"epoch": 1.653816499614495,
"grad_norm": 4.731517791748047,
"learning_rate": 7.535297127040886e-06,
"loss": 0.5929,
"step": 10725
},
{
"epoch": 1.6576715497301464,
"grad_norm": 8.450960159301758,
"learning_rate": 7.524851097078778e-06,
"loss": 0.5574,
"step": 10750
},
{
"epoch": 1.661526599845798,
"grad_norm": 12.14348316192627,
"learning_rate": 7.514390253696151e-06,
"loss": 0.564,
"step": 10775
},
{
"epoch": 1.6653816499614496,
"grad_norm": 8.517688751220703,
"learning_rate": 7.503914658267268e-06,
"loss": 0.5542,
"step": 10800
},
{
"epoch": 1.669236700077101,
"grad_norm": 5.898883819580078,
"learning_rate": 7.493424372252942e-06,
"loss": 0.5628,
"step": 10825
},
{
"epoch": 1.6730917501927525,
"grad_norm": 9.810076713562012,
"learning_rate": 7.482919457200173e-06,
"loss": 0.4793,
"step": 10850
},
{
"epoch": 1.676946800308404,
"grad_norm": 8.19438648223877,
"learning_rate": 7.4723999747417975e-06,
"loss": 0.4564,
"step": 10875
},
{
"epoch": 1.6808018504240554,
"grad_norm": 8.194275856018066,
"learning_rate": 7.461865986596114e-06,
"loss": 0.499,
"step": 10900
},
{
"epoch": 1.6846569005397072,
"grad_norm": 8.513113975524902,
"learning_rate": 7.451317554566527e-06,
"loss": 0.5126,
"step": 10925
},
{
"epoch": 1.6885119506553585,
"grad_norm": 9.31811809539795,
"learning_rate": 7.440754740541183e-06,
"loss": 0.5661,
"step": 10950
},
{
"epoch": 1.6923670007710099,
"grad_norm": 9.18864917755127,
"learning_rate": 7.430177606492616e-06,
"loss": 0.5016,
"step": 10975
},
{
"epoch": 1.6962220508866617,
"grad_norm": 10.510931015014648,
"learning_rate": 7.419586214477366e-06,
"loss": 0.5314,
"step": 11000
},
{
"epoch": 1.700077101002313,
"grad_norm": 8.722918510437012,
"learning_rate": 7.408980626635631e-06,
"loss": 0.5602,
"step": 11025
},
{
"epoch": 1.7039321511179646,
"grad_norm": 12.12399673461914,
"learning_rate": 7.398360905190894e-06,
"loss": 0.5081,
"step": 11050
},
{
"epoch": 1.7077872012336162,
"grad_norm": 13.294553756713867,
"learning_rate": 7.387727112449565e-06,
"loss": 0.5335,
"step": 11075
},
{
"epoch": 1.7116422513492675,
"grad_norm": 8.201335906982422,
"learning_rate": 7.377079310800604e-06,
"loss": 0.5275,
"step": 11100
},
{
"epoch": 1.715497301464919,
"grad_norm": 7.972436904907227,
"learning_rate": 7.366417562715169e-06,
"loss": 0.4989,
"step": 11125
},
{
"epoch": 1.7193523515805706,
"grad_norm": 5.6840949058532715,
"learning_rate": 7.355741930746238e-06,
"loss": 0.5936,
"step": 11150
},
{
"epoch": 1.723207401696222,
"grad_norm": 7.218390464782715,
"learning_rate": 7.345052477528245e-06,
"loss": 0.4187,
"step": 11175
},
{
"epoch": 1.7270624518118736,
"grad_norm": 9.847237586975098,
"learning_rate": 7.334349265776719e-06,
"loss": 0.5737,
"step": 11200
},
{
"epoch": 1.7309175019275251,
"grad_norm": 8.323447227478027,
"learning_rate": 7.3236323582879085e-06,
"loss": 0.5634,
"step": 11225
},
{
"epoch": 1.7347725520431765,
"grad_norm": 8.513628005981445,
"learning_rate": 7.3129018179384134e-06,
"loss": 0.5843,
"step": 11250
},
{
"epoch": 1.738627602158828,
"grad_norm": 9.567562103271484,
"learning_rate": 7.302157707684821e-06,
"loss": 0.4727,
"step": 11275
},
{
"epoch": 1.7424826522744796,
"grad_norm": 7.2932915687561035,
"learning_rate": 7.2914000905633365e-06,
"loss": 0.4881,
"step": 11300
},
{
"epoch": 1.746337702390131,
"grad_norm": 9.525776863098145,
"learning_rate": 7.280629029689402e-06,
"loss": 0.5105,
"step": 11325
},
{
"epoch": 1.7501927525057825,
"grad_norm": 8.903553009033203,
"learning_rate": 7.269844588257343e-06,
"loss": 0.5055,
"step": 11350
},
{
"epoch": 1.7540478026214341,
"grad_norm": 11.193290710449219,
"learning_rate": 7.259046829539984e-06,
"loss": 0.5071,
"step": 11375
},
{
"epoch": 1.7579028527370855,
"grad_norm": 7.516224384307861,
"learning_rate": 7.248235816888288e-06,
"loss": 0.5162,
"step": 11400
},
{
"epoch": 1.761757902852737,
"grad_norm": 15.788073539733887,
"learning_rate": 7.237411613730973e-06,
"loss": 0.6102,
"step": 11425
},
{
"epoch": 1.7656129529683886,
"grad_norm": 8.609055519104004,
"learning_rate": 7.226574283574152e-06,
"loss": 0.5987,
"step": 11450
},
{
"epoch": 1.76946800308404,
"grad_norm": 9.303633689880371,
"learning_rate": 7.2157238900009515e-06,
"loss": 0.6273,
"step": 11475
},
{
"epoch": 1.7733230531996917,
"grad_norm": 11.042442321777344,
"learning_rate": 7.204860496671142e-06,
"loss": 0.5039,
"step": 11500
},
{
"epoch": 1.777178103315343,
"grad_norm": 10.328720092773438,
"learning_rate": 7.193984167320765e-06,
"loss": 0.5863,
"step": 11525
},
{
"epoch": 1.7810331534309944,
"grad_norm": 7.442523002624512,
"learning_rate": 7.18309496576176e-06,
"loss": 0.5166,
"step": 11550
},
{
"epoch": 1.7848882035466462,
"grad_norm": 6.905193328857422,
"learning_rate": 7.172192955881583e-06,
"loss": 0.5504,
"step": 11575
},
{
"epoch": 1.7887432536622976,
"grad_norm": 9.047221183776855,
"learning_rate": 7.1612782016428425e-06,
"loss": 0.5145,
"step": 11600
},
{
"epoch": 1.7925983037779492,
"grad_norm": 8.46165943145752,
"learning_rate": 7.150350767082916e-06,
"loss": 0.4998,
"step": 11625
},
{
"epoch": 1.7964533538936007,
"grad_norm": 5.901994705200195,
"learning_rate": 7.139410716313579e-06,
"loss": 0.5307,
"step": 11650
},
{
"epoch": 1.800308404009252,
"grad_norm": 9.717686653137207,
"learning_rate": 7.128458113520624e-06,
"loss": 0.5481,
"step": 11675
},
{
"epoch": 1.8041634541249036,
"grad_norm": 6.8741326332092285,
"learning_rate": 7.117493022963488e-06,
"loss": 0.4816,
"step": 11700
},
{
"epoch": 1.8080185042405552,
"grad_norm": 11.298688888549805,
"learning_rate": 7.1065155089748735e-06,
"loss": 0.5202,
"step": 11725
},
{
"epoch": 1.8118735543562066,
"grad_norm": 12.379132270812988,
"learning_rate": 7.095525635960379e-06,
"loss": 0.5423,
"step": 11750
},
{
"epoch": 1.8157286044718581,
"grad_norm": 8.774430274963379,
"learning_rate": 7.084523468398101e-06,
"loss": 0.5722,
"step": 11775
},
{
"epoch": 1.8195836545875097,
"grad_norm": 12.321609497070312,
"learning_rate": 7.07350907083828e-06,
"loss": 0.5695,
"step": 11800
},
{
"epoch": 1.823438704703161,
"grad_norm": 10.942859649658203,
"learning_rate": 7.062482507902904e-06,
"loss": 0.5731,
"step": 11825
},
{
"epoch": 1.8272937548188126,
"grad_norm": 9.582926750183105,
"learning_rate": 7.051443844285339e-06,
"loss": 0.6066,
"step": 11850
},
{
"epoch": 1.8311488049344642,
"grad_norm": 10.249176025390625,
"learning_rate": 7.040393144749946e-06,
"loss": 0.502,
"step": 11875
},
{
"epoch": 1.8350038550501155,
"grad_norm": 8.083261489868164,
"learning_rate": 7.029330474131698e-06,
"loss": 0.4515,
"step": 11900
},
{
"epoch": 1.838858905165767,
"grad_norm": 6.912391185760498,
"learning_rate": 7.0182558973358085e-06,
"loss": 0.5324,
"step": 11925
},
{
"epoch": 1.8427139552814187,
"grad_norm": 7.553276538848877,
"learning_rate": 7.0071694793373406e-06,
"loss": 0.5428,
"step": 11950
},
{
"epoch": 1.84656900539707,
"grad_norm": 13.767989158630371,
"learning_rate": 6.996071285180832e-06,
"loss": 0.5122,
"step": 11975
},
{
"epoch": 1.8504240555127218,
"grad_norm": 8.248040199279785,
"learning_rate": 6.984961379979911e-06,
"loss": 0.5544,
"step": 12000
},
{
"epoch": 1.8542791056283732,
"grad_norm": 11.559853553771973,
"learning_rate": 6.973839828916917e-06,
"loss": 0.5244,
"step": 12025
},
{
"epoch": 1.8581341557440245,
"grad_norm": 10.446723937988281,
"learning_rate": 6.962706697242512e-06,
"loss": 0.4629,
"step": 12050
},
{
"epoch": 1.8619892058596763,
"grad_norm": 7.66959810256958,
"learning_rate": 6.951562050275309e-06,
"loss": 0.5103,
"step": 12075
},
{
"epoch": 1.8658442559753277,
"grad_norm": 6.154583930969238,
"learning_rate": 6.9404059534014745e-06,
"loss": 0.4887,
"step": 12100
},
{
"epoch": 1.8696993060909792,
"grad_norm": 14.175530433654785,
"learning_rate": 6.929238472074355e-06,
"loss": 0.4937,
"step": 12125
},
{
"epoch": 1.8735543562066308,
"grad_norm": 12.877251625061035,
"learning_rate": 6.9180596718140925e-06,
"loss": 0.5657,
"step": 12150
},
{
"epoch": 1.8774094063222821,
"grad_norm": 12.50566291809082,
"learning_rate": 6.9068696182072355e-06,
"loss": 0.5991,
"step": 12175
},
{
"epoch": 1.8812644564379337,
"grad_norm": 8.946805953979492,
"learning_rate": 6.895668376906354e-06,
"loss": 0.5373,
"step": 12200
},
{
"epoch": 1.8851195065535853,
"grad_norm": 8.240991592407227,
"learning_rate": 6.884456013629661e-06,
"loss": 0.452,
"step": 12225
},
{
"epoch": 1.8889745566692366,
"grad_norm": 5.963514804840088,
"learning_rate": 6.873232594160623e-06,
"loss": 0.5203,
"step": 12250
},
{
"epoch": 1.8928296067848882,
"grad_norm": 6.990925312042236,
"learning_rate": 6.8619981843475655e-06,
"loss": 0.4542,
"step": 12275
},
{
"epoch": 1.8966846569005398,
"grad_norm": 9.573740005493164,
"learning_rate": 6.850752850103307e-06,
"loss": 0.6374,
"step": 12300
},
{
"epoch": 1.9005397070161911,
"grad_norm": 10.438694953918457,
"learning_rate": 6.839496657404752e-06,
"loss": 0.4875,
"step": 12325
},
{
"epoch": 1.9043947571318427,
"grad_norm": 11.561971664428711,
"learning_rate": 6.828229672292512e-06,
"loss": 0.6264,
"step": 12350
},
{
"epoch": 1.9082498072474943,
"grad_norm": 12.459427833557129,
"learning_rate": 6.816951960870524e-06,
"loss": 0.624,
"step": 12375
},
{
"epoch": 1.9121048573631456,
"grad_norm": 11.697875022888184,
"learning_rate": 6.805663589305651e-06,
"loss": 0.5197,
"step": 12400
},
{
"epoch": 1.9159599074787972,
"grad_norm": 5.736695766448975,
"learning_rate": 6.794364623827302e-06,
"loss": 0.5233,
"step": 12425
},
{
"epoch": 1.9198149575944488,
"grad_norm": 6.249728679656982,
"learning_rate": 6.7830551307270405e-06,
"loss": 0.5375,
"step": 12450
},
{
"epoch": 1.9236700077101,
"grad_norm": 5.488771915435791,
"learning_rate": 6.7717351763581954e-06,
"loss": 0.5137,
"step": 12475
},
{
"epoch": 1.927525057825752,
"grad_norm": 9.779924392700195,
"learning_rate": 6.760404827135474e-06,
"loss": 0.5825,
"step": 12500
},
{
"epoch": 1.9313801079414032,
"grad_norm": 7.1002631187438965,
"learning_rate": 6.74906414953457e-06,
"loss": 0.5519,
"step": 12525
},
{
"epoch": 1.9352351580570546,
"grad_norm": 8.021027565002441,
"learning_rate": 6.7377132100917745e-06,
"loss": 0.5348,
"step": 12550
},
{
"epoch": 1.9390902081727064,
"grad_norm": 9.327821731567383,
"learning_rate": 6.726352075403582e-06,
"loss": 0.5486,
"step": 12575
},
{
"epoch": 1.9429452582883577,
"grad_norm": 7.505486011505127,
"learning_rate": 6.714980812126308e-06,
"loss": 0.5861,
"step": 12600
},
{
"epoch": 1.9468003084040093,
"grad_norm": 7.565356731414795,
"learning_rate": 6.703599486975692e-06,
"loss": 0.4886,
"step": 12625
},
{
"epoch": 1.9506553585196609,
"grad_norm": 10.52251148223877,
"learning_rate": 6.692208166726501e-06,
"loss": 0.5215,
"step": 12650
},
{
"epoch": 1.9545104086353122,
"grad_norm": 8.883962631225586,
"learning_rate": 6.680806918212154e-06,
"loss": 0.5301,
"step": 12675
},
{
"epoch": 1.9583654587509638,
"grad_norm": 7.735800266265869,
"learning_rate": 6.6693958083243095e-06,
"loss": 0.4528,
"step": 12700
},
{
"epoch": 1.9622205088666154,
"grad_norm": 9.604096412658691,
"learning_rate": 6.65797490401249e-06,
"loss": 0.5261,
"step": 12725
},
{
"epoch": 1.9660755589822667,
"grad_norm": 6.650146961212158,
"learning_rate": 6.646544272283682e-06,
"loss": 0.5879,
"step": 12750
},
{
"epoch": 1.9699306090979183,
"grad_norm": 12.350354194641113,
"learning_rate": 6.635103980201936e-06,
"loss": 0.542,
"step": 12775
},
{
"epoch": 1.9737856592135699,
"grad_norm": 8.362752914428711,
"learning_rate": 6.623654094887988e-06,
"loss": 0.4874,
"step": 12800
},
{
"epoch": 1.9776407093292212,
"grad_norm": 11.609392166137695,
"learning_rate": 6.612194683518855e-06,
"loss": 0.4773,
"step": 12825
},
{
"epoch": 1.9814957594448728,
"grad_norm": 12.433268547058105,
"learning_rate": 6.6007258133274465e-06,
"loss": 0.5187,
"step": 12850
},
{
"epoch": 1.9853508095605243,
"grad_norm": 11.452352523803711,
"learning_rate": 6.589247551602164e-06,
"loss": 0.5056,
"step": 12875
},
{
"epoch": 1.9892058596761757,
"grad_norm": 14.193047523498535,
"learning_rate": 6.577759965686509e-06,
"loss": 0.5754,
"step": 12900
},
{
"epoch": 1.9930609097918273,
"grad_norm": 9.333495140075684,
"learning_rate": 6.566263122978689e-06,
"loss": 0.5351,
"step": 12925
},
{
"epoch": 1.9969159599074788,
"grad_norm": 10.230022430419922,
"learning_rate": 6.5547570909312275e-06,
"loss": 0.5947,
"step": 12950
},
{
"epoch": 2.00077101002313,
"grad_norm": 7.866212844848633,
"learning_rate": 6.543241937050553e-06,
"loss": 0.4873,
"step": 12975
},
{
"epoch": 2.004626060138782,
"grad_norm": 6.736771106719971,
"learning_rate": 6.531717728896617e-06,
"loss": 0.2748,
"step": 13000
},
{
"epoch": 2.0084811102544333,
"grad_norm": 7.0699357986450195,
"learning_rate": 6.520184534082494e-06,
"loss": 0.2476,
"step": 13025
},
{
"epoch": 2.0123361603700847,
"grad_norm": 4.682115077972412,
"learning_rate": 6.508642420273984e-06,
"loss": 0.2423,
"step": 13050
},
{
"epoch": 2.0161912104857365,
"grad_norm": 14.793108940124512,
"learning_rate": 6.497091455189209e-06,
"loss": 0.3074,
"step": 13075
},
{
"epoch": 2.020046260601388,
"grad_norm": 4.285236358642578,
"learning_rate": 6.48553170659823e-06,
"loss": 0.2607,
"step": 13100
},
{
"epoch": 2.023901310717039,
"grad_norm": 10.649596214294434,
"learning_rate": 6.473963242322634e-06,
"loss": 0.2847,
"step": 13125
},
{
"epoch": 2.027756360832691,
"grad_norm": 8.940801620483398,
"learning_rate": 6.462386130235149e-06,
"loss": 0.2867,
"step": 13150
},
{
"epoch": 2.0316114109483423,
"grad_norm": 11.055030822753906,
"learning_rate": 6.450800438259237e-06,
"loss": 0.2377,
"step": 13175
},
{
"epoch": 2.0354664610639936,
"grad_norm": 7.941375255584717,
"learning_rate": 6.439206234368701e-06,
"loss": 0.3066,
"step": 13200
},
{
"epoch": 2.0393215111796454,
"grad_norm": 8.227485656738281,
"learning_rate": 6.427603586587281e-06,
"loss": 0.2618,
"step": 13225
},
{
"epoch": 2.043176561295297,
"grad_norm": 8.643232345581055,
"learning_rate": 6.415992562988258e-06,
"loss": 0.3087,
"step": 13250
},
{
"epoch": 2.047031611410948,
"grad_norm": 10.568580627441406,
"learning_rate": 6.404373231694056e-06,
"loss": 0.3075,
"step": 13275
},
{
"epoch": 2.0508866615266,
"grad_norm": 10.652327537536621,
"learning_rate": 6.392745660875841e-06,
"loss": 0.3359,
"step": 13300
},
{
"epoch": 2.0547417116422513,
"grad_norm": 6.572065830230713,
"learning_rate": 6.38110991875312e-06,
"loss": 0.24,
"step": 13325
},
{
"epoch": 2.058596761757903,
"grad_norm": 11.19629955291748,
"learning_rate": 6.369466073593338e-06,
"loss": 0.2646,
"step": 13350
},
{
"epoch": 2.0624518118735544,
"grad_norm": 14.49629020690918,
"learning_rate": 6.357814193711487e-06,
"loss": 0.3042,
"step": 13375
},
{
"epoch": 2.0663068619892058,
"grad_norm": 12.417818069458008,
"learning_rate": 6.346154347469695e-06,
"loss": 0.2873,
"step": 13400
},
{
"epoch": 2.0701619121048576,
"grad_norm": 9.44558334350586,
"learning_rate": 6.3344866032768306e-06,
"loss": 0.2421,
"step": 13425
},
{
"epoch": 2.074016962220509,
"grad_norm": 7.87114953994751,
"learning_rate": 6.3228110295880974e-06,
"loss": 0.2502,
"step": 13450
},
{
"epoch": 2.0778720123361603,
"grad_norm": 6.933884620666504,
"learning_rate": 6.311127694904638e-06,
"loss": 0.2842,
"step": 13475
},
{
"epoch": 2.081727062451812,
"grad_norm": 10.668693542480469,
"learning_rate": 6.299436667773131e-06,
"loss": 0.2911,
"step": 13500
},
{
"epoch": 2.0855821125674634,
"grad_norm": 10.54037857055664,
"learning_rate": 6.287738016785383e-06,
"loss": 0.2803,
"step": 13525
},
{
"epoch": 2.0894371626831147,
"grad_norm": 7.743317127227783,
"learning_rate": 6.276031810577929e-06,
"loss": 0.2466,
"step": 13550
},
{
"epoch": 2.0932922127987665,
"grad_norm": 7.318984031677246,
"learning_rate": 6.264318117831634e-06,
"loss": 0.3117,
"step": 13575
},
{
"epoch": 2.097147262914418,
"grad_norm": 10.879950523376465,
"learning_rate": 6.252597007271287e-06,
"loss": 0.2674,
"step": 13600
},
{
"epoch": 2.1010023130300692,
"grad_norm": 10.071147918701172,
"learning_rate": 6.2408685476651955e-06,
"loss": 0.2976,
"step": 13625
},
{
"epoch": 2.104857363145721,
"grad_norm": 10.434429168701172,
"learning_rate": 6.2291328078247885e-06,
"loss": 0.2732,
"step": 13650
},
{
"epoch": 2.1087124132613724,
"grad_norm": 7.184157848358154,
"learning_rate": 6.2173898566042e-06,
"loss": 0.1922,
"step": 13675
},
{
"epoch": 2.1125674633770237,
"grad_norm": 16.419221878051758,
"learning_rate": 6.205639762899884e-06,
"loss": 0.2835,
"step": 13700
},
{
"epoch": 2.1164225134926755,
"grad_norm": 8.063368797302246,
"learning_rate": 6.193882595650193e-06,
"loss": 0.2831,
"step": 13725
},
{
"epoch": 2.120277563608327,
"grad_norm": 8.21772289276123,
"learning_rate": 6.1821184238349815e-06,
"loss": 0.2613,
"step": 13750
},
{
"epoch": 2.124132613723978,
"grad_norm": 6.443696022033691,
"learning_rate": 6.1703473164752e-06,
"loss": 0.3244,
"step": 13775
},
{
"epoch": 2.12798766383963,
"grad_norm": 8.925273895263672,
"learning_rate": 6.158569342632491e-06,
"loss": 0.2665,
"step": 13800
},
{
"epoch": 2.1318427139552814,
"grad_norm": 14.582781791687012,
"learning_rate": 6.146784571408785e-06,
"loss": 0.2885,
"step": 13825
},
{
"epoch": 2.135697764070933,
"grad_norm": 9.872135162353516,
"learning_rate": 6.13499307194589e-06,
"loss": 0.3011,
"step": 13850
},
{
"epoch": 2.1395528141865845,
"grad_norm": 6.854443550109863,
"learning_rate": 6.123194913425087e-06,
"loss": 0.3061,
"step": 13875
},
{
"epoch": 2.143407864302236,
"grad_norm": 10.82694149017334,
"learning_rate": 6.1113901650667295e-06,
"loss": 0.2892,
"step": 13900
},
{
"epoch": 2.1472629144178876,
"grad_norm": 8.101310729980469,
"learning_rate": 6.0995788961298354e-06,
"loss": 0.2683,
"step": 13925
},
{
"epoch": 2.151117964533539,
"grad_norm": 9.208884239196777,
"learning_rate": 6.087761175911676e-06,
"loss": 0.2887,
"step": 13950
},
{
"epoch": 2.1549730146491903,
"grad_norm": 10.26357364654541,
"learning_rate": 6.0759370737473734e-06,
"loss": 0.2045,
"step": 13975
},
{
"epoch": 2.158828064764842,
"grad_norm": 13.05822467803955,
"learning_rate": 6.064106659009491e-06,
"loss": 0.3116,
"step": 14000
},
{
"epoch": 2.1626831148804935,
"grad_norm": 7.703470230102539,
"learning_rate": 6.052270001107634e-06,
"loss": 0.2889,
"step": 14025
},
{
"epoch": 2.166538164996145,
"grad_norm": 6.503296852111816,
"learning_rate": 6.04042716948803e-06,
"loss": 0.2839,
"step": 14050
},
{
"epoch": 2.1703932151117966,
"grad_norm": 9.92291259765625,
"learning_rate": 6.028578233633131e-06,
"loss": 0.26,
"step": 14075
},
{
"epoch": 2.174248265227448,
"grad_norm": 8.040242195129395,
"learning_rate": 6.016723263061203e-06,
"loss": 0.3093,
"step": 14100
},
{
"epoch": 2.1781033153430993,
"grad_norm": 6.951204299926758,
"learning_rate": 6.004862327325918e-06,
"loss": 0.297,
"step": 14125
},
{
"epoch": 2.181958365458751,
"grad_norm": 8.58872127532959,
"learning_rate": 5.992995496015945e-06,
"loss": 0.2971,
"step": 14150
},
{
"epoch": 2.1858134155744025,
"grad_norm": 3.393548011779785,
"learning_rate": 5.9811228387545465e-06,
"loss": 0.2585,
"step": 14175
},
{
"epoch": 2.189668465690054,
"grad_norm": 6.551548957824707,
"learning_rate": 5.969244425199158e-06,
"loss": 0.3007,
"step": 14200
},
{
"epoch": 2.1935235158057056,
"grad_norm": 7.244775295257568,
"learning_rate": 5.957360325040994e-06,
"loss": 0.3139,
"step": 14225
},
{
"epoch": 2.197378565921357,
"grad_norm": 11.040155410766602,
"learning_rate": 5.945470608004632e-06,
"loss": 0.3068,
"step": 14250
},
{
"epoch": 2.2012336160370083,
"grad_norm": 12.16425609588623,
"learning_rate": 5.933575343847602e-06,
"loss": 0.2748,
"step": 14275
},
{
"epoch": 2.20508866615266,
"grad_norm": 12.443540573120117,
"learning_rate": 5.921674602359982e-06,
"loss": 0.2521,
"step": 14300
},
{
"epoch": 2.2089437162683114,
"grad_norm": 9.918098449707031,
"learning_rate": 5.909768453363979e-06,
"loss": 0.2712,
"step": 14325
},
{
"epoch": 2.212798766383963,
"grad_norm": 5.254764556884766,
"learning_rate": 5.897856966713535e-06,
"loss": 0.3071,
"step": 14350
},
{
"epoch": 2.2166538164996146,
"grad_norm": 8.445236206054688,
"learning_rate": 5.885940212293905e-06,
"loss": 0.3144,
"step": 14375
},
{
"epoch": 2.220508866615266,
"grad_norm": 12.68058967590332,
"learning_rate": 5.874018260021246e-06,
"loss": 0.2997,
"step": 14400
},
{
"epoch": 2.2243639167309173,
"grad_norm": 11.399242401123047,
"learning_rate": 5.862091179842216e-06,
"loss": 0.374,
"step": 14425
},
{
"epoch": 2.228218966846569,
"grad_norm": 4.394013404846191,
"learning_rate": 5.850159041733557e-06,
"loss": 0.2138,
"step": 14450
},
{
"epoch": 2.2320740169622204,
"grad_norm": 8.710460662841797,
"learning_rate": 5.838221915701688e-06,
"loss": 0.2571,
"step": 14475
},
{
"epoch": 2.235929067077872,
"grad_norm": 9.865557670593262,
"learning_rate": 5.82627987178229e-06,
"loss": 0.2784,
"step": 14500
},
{
"epoch": 2.2397841171935235,
"grad_norm": 12.480177879333496,
"learning_rate": 5.814332980039896e-06,
"loss": 0.2484,
"step": 14525
},
{
"epoch": 2.243639167309175,
"grad_norm": 14.494118690490723,
"learning_rate": 5.802381310567484e-06,
"loss": 0.2876,
"step": 14550
},
{
"epoch": 2.2474942174248267,
"grad_norm": 12.612906455993652,
"learning_rate": 5.790424933486065e-06,
"loss": 0.289,
"step": 14575
},
{
"epoch": 2.251349267540478,
"grad_norm": 8.49714183807373,
"learning_rate": 5.778463918944266e-06,
"loss": 0.3188,
"step": 14600
},
{
"epoch": 2.2552043176561294,
"grad_norm": 11.339805603027344,
"learning_rate": 5.766498337117924e-06,
"loss": 0.2372,
"step": 14625
},
{
"epoch": 2.259059367771781,
"grad_norm": 7.394718647003174,
"learning_rate": 5.754528258209671e-06,
"loss": 0.2826,
"step": 14650
},
{
"epoch": 2.2629144178874325,
"grad_norm": 10.278124809265137,
"learning_rate": 5.7425537524485275e-06,
"loss": 0.2472,
"step": 14675
},
{
"epoch": 2.266769468003084,
"grad_norm": 7.263669013977051,
"learning_rate": 5.7305748900894806e-06,
"loss": 0.2792,
"step": 14700
},
{
"epoch": 2.2706245181187357,
"grad_norm": 13.025028228759766,
"learning_rate": 5.718591741413082e-06,
"loss": 0.2945,
"step": 14725
},
{
"epoch": 2.274479568234387,
"grad_norm": 12.685118675231934,
"learning_rate": 5.706604376725033e-06,
"loss": 0.2348,
"step": 14750
},
{
"epoch": 2.2783346183500384,
"grad_norm": 16.206722259521484,
"learning_rate": 5.6946128663557635e-06,
"loss": 0.2808,
"step": 14775
},
{
"epoch": 2.28218966846569,
"grad_norm": 14.512044906616211,
"learning_rate": 5.682617280660033e-06,
"loss": 0.2915,
"step": 14800
},
{
"epoch": 2.2860447185813415,
"grad_norm": 15.623675346374512,
"learning_rate": 5.67061769001651e-06,
"loss": 0.2792,
"step": 14825
},
{
"epoch": 2.2898997686969933,
"grad_norm": 10.42396354675293,
"learning_rate": 5.658614164827358e-06,
"loss": 0.23,
"step": 14850
},
{
"epoch": 2.2937548188126446,
"grad_norm": 9.883713722229004,
"learning_rate": 5.6466067755178226e-06,
"loss": 0.2615,
"step": 14875
},
{
"epoch": 2.297609868928296,
"grad_norm": 12.14278793334961,
"learning_rate": 5.634595592535827e-06,
"loss": 0.2888,
"step": 14900
},
{
"epoch": 2.301464919043948,
"grad_norm": 9.987312316894531,
"learning_rate": 5.622580686351547e-06,
"loss": 0.2429,
"step": 14925
},
{
"epoch": 2.305319969159599,
"grad_norm": 8.568805694580078,
"learning_rate": 5.610562127457007e-06,
"loss": 0.2566,
"step": 14950
},
{
"epoch": 2.3091750192752505,
"grad_norm": 12.70766544342041,
"learning_rate": 5.598539986365654e-06,
"loss": 0.3429,
"step": 14975
},
{
"epoch": 2.3130300693909023,
"grad_norm": 6.177096366882324,
"learning_rate": 5.586514333611961e-06,
"loss": 0.2434,
"step": 15000
},
{
"epoch": 2.3168851195065536,
"grad_norm": 12.66221809387207,
"learning_rate": 5.574485239750998e-06,
"loss": 0.2958,
"step": 15025
},
{
"epoch": 2.320740169622205,
"grad_norm": 10.357071876525879,
"learning_rate": 5.562452775358028e-06,
"loss": 0.2553,
"step": 15050
},
{
"epoch": 2.3245952197378568,
"grad_norm": 9.472440719604492,
"learning_rate": 5.550417011028086e-06,
"loss": 0.3276,
"step": 15075
},
{
"epoch": 2.328450269853508,
"grad_norm": 7.8861212730407715,
"learning_rate": 5.53837801737557e-06,
"loss": 0.3256,
"step": 15100
},
{
"epoch": 2.3323053199691595,
"grad_norm": 10.168670654296875,
"learning_rate": 5.526335865033823e-06,
"loss": 0.336,
"step": 15125
},
{
"epoch": 2.3361603700848113,
"grad_norm": 8.54845905303955,
"learning_rate": 5.514290624654722e-06,
"loss": 0.2719,
"step": 15150
},
{
"epoch": 2.3400154202004626,
"grad_norm": 7.871292591094971,
"learning_rate": 5.50224236690826e-06,
"loss": 0.2878,
"step": 15175
},
{
"epoch": 2.343870470316114,
"grad_norm": 9.581894874572754,
"learning_rate": 5.490191162482133e-06,
"loss": 0.2585,
"step": 15200
},
{
"epoch": 2.3477255204317657,
"grad_norm": 9.766436576843262,
"learning_rate": 5.478137082081328e-06,
"loss": 0.3189,
"step": 15225
},
{
"epoch": 2.351580570547417,
"grad_norm": 10.144207954406738,
"learning_rate": 5.4660801964277015e-06,
"loss": 0.2391,
"step": 15250
},
{
"epoch": 2.3554356206630684,
"grad_norm": 9.748770713806152,
"learning_rate": 5.4540205762595714e-06,
"loss": 0.2635,
"step": 15275
},
{
"epoch": 2.3592906707787202,
"grad_norm": 12.085159301757812,
"learning_rate": 5.441958292331297e-06,
"loss": 0.3298,
"step": 15300
},
{
"epoch": 2.3631457208943716,
"grad_norm": 10.519356727600098,
"learning_rate": 5.42989341541287e-06,
"loss": 0.3364,
"step": 15325
},
{
"epoch": 2.367000771010023,
"grad_norm": 7.1762189865112305,
"learning_rate": 5.417826016289489e-06,
"loss": 0.2631,
"step": 15350
},
{
"epoch": 2.3708558211256747,
"grad_norm": 7.298311710357666,
"learning_rate": 5.405756165761158e-06,
"loss": 0.2925,
"step": 15375
},
{
"epoch": 2.374710871241326,
"grad_norm": 14.725603103637695,
"learning_rate": 5.393683934642257e-06,
"loss": 0.268,
"step": 15400
},
{
"epoch": 2.3785659213569774,
"grad_norm": 12.149480819702148,
"learning_rate": 5.3816093937611344e-06,
"loss": 0.2902,
"step": 15425
},
{
"epoch": 2.382420971472629,
"grad_norm": 6.900524616241455,
"learning_rate": 5.369532613959695e-06,
"loss": 0.2851,
"step": 15450
},
{
"epoch": 2.3862760215882806,
"grad_norm": 13.183350563049316,
"learning_rate": 5.357453666092972e-06,
"loss": 0.2761,
"step": 15475
},
{
"epoch": 2.390131071703932,
"grad_norm": 9.373590469360352,
"learning_rate": 5.345372621028725e-06,
"loss": 0.2982,
"step": 15500
},
{
"epoch": 2.3939861218195837,
"grad_norm": 4.575671195983887,
"learning_rate": 5.333289549647014e-06,
"loss": 0.2896,
"step": 15525
},
{
"epoch": 2.397841171935235,
"grad_norm": 10.887511253356934,
"learning_rate": 5.321204522839789e-06,
"loss": 0.2735,
"step": 15550
},
{
"epoch": 2.401696222050887,
"grad_norm": 8.057476043701172,
"learning_rate": 5.309117611510475e-06,
"loss": 0.2755,
"step": 15575
},
{
"epoch": 2.405551272166538,
"grad_norm": 10.257162094116211,
"learning_rate": 5.2970288865735474e-06,
"loss": 0.2939,
"step": 15600
},
{
"epoch": 2.4094063222821895,
"grad_norm": 6.080315589904785,
"learning_rate": 5.284938418954128e-06,
"loss": 0.2566,
"step": 15625
},
{
"epoch": 2.4132613723978413,
"grad_norm": 6.65130615234375,
"learning_rate": 5.272846279587559e-06,
"loss": 0.2996,
"step": 15650
},
{
"epoch": 2.4171164225134927,
"grad_norm": 4.899411201477051,
"learning_rate": 5.260752539418994e-06,
"loss": 0.2852,
"step": 15675
},
{
"epoch": 2.420971472629144,
"grad_norm": 7.077582359313965,
"learning_rate": 5.248657269402978e-06,
"loss": 0.2844,
"step": 15700
},
{
"epoch": 2.424826522744796,
"grad_norm": 9.571953773498535,
"learning_rate": 5.2365605405030296e-06,
"loss": 0.3084,
"step": 15725
},
{
"epoch": 2.428681572860447,
"grad_norm": 10.793221473693848,
"learning_rate": 5.2244624236912275e-06,
"loss": 0.2872,
"step": 15750
},
{
"epoch": 2.4325366229760985,
"grad_norm": 7.808874607086182,
"learning_rate": 5.212362989947796e-06,
"loss": 0.2708,
"step": 15775
},
{
"epoch": 2.4363916730917503,
"grad_norm": 7.427307605743408,
"learning_rate": 5.200262310260682e-06,
"loss": 0.3697,
"step": 15800
},
{
"epoch": 2.4402467232074017,
"grad_norm": 10.553243637084961,
"learning_rate": 5.188160455625143e-06,
"loss": 0.2778,
"step": 15825
},
{
"epoch": 2.444101773323053,
"grad_norm": 7.460685729980469,
"learning_rate": 5.176057497043336e-06,
"loss": 0.3237,
"step": 15850
},
{
"epoch": 2.447956823438705,
"grad_norm": 12.230719566345215,
"learning_rate": 5.163953505523883e-06,
"loss": 0.315,
"step": 15875
},
{
"epoch": 2.451811873554356,
"grad_norm": 9.193891525268555,
"learning_rate": 5.15184855208148e-06,
"loss": 0.2878,
"step": 15900
},
{
"epoch": 2.455666923670008,
"grad_norm": 21.651145935058594,
"learning_rate": 5.139742707736456e-06,
"loss": 0.2445,
"step": 15925
},
{
"epoch": 2.4595219737856593,
"grad_norm": 10.970038414001465,
"learning_rate": 5.127636043514374e-06,
"loss": 0.2865,
"step": 15950
},
{
"epoch": 2.4633770239013106,
"grad_norm": 11.244210243225098,
"learning_rate": 5.115528630445599e-06,
"loss": 0.2731,
"step": 15975
},
{
"epoch": 2.4672320740169624,
"grad_norm": 9.13939380645752,
"learning_rate": 5.103420539564899e-06,
"loss": 0.2461,
"step": 16000
},
{
"epoch": 2.4710871241326138,
"grad_norm": 9.489975929260254,
"learning_rate": 5.091311841911015e-06,
"loss": 0.2691,
"step": 16025
},
{
"epoch": 2.474942174248265,
"grad_norm": 9.385398864746094,
"learning_rate": 5.079202608526247e-06,
"loss": 0.3043,
"step": 16050
},
{
"epoch": 2.478797224363917,
"grad_norm": 10.858476638793945,
"learning_rate": 5.067092910456035e-06,
"loss": 0.3453,
"step": 16075
},
{
"epoch": 2.4826522744795683,
"grad_norm": 11.217155456542969,
"learning_rate": 5.0549828187485535e-06,
"loss": 0.3237,
"step": 16100
},
{
"epoch": 2.4865073245952196,
"grad_norm": 11.37563419342041,
"learning_rate": 5.04287240445428e-06,
"loss": 0.3099,
"step": 16125
},
{
"epoch": 2.4903623747108714,
"grad_norm": 11.476361274719238,
"learning_rate": 5.030761738625586e-06,
"loss": 0.2882,
"step": 16150
},
{
"epoch": 2.4942174248265228,
"grad_norm": 3.880314588546753,
"learning_rate": 5.01865089231632e-06,
"loss": 0.2557,
"step": 16175
},
{
"epoch": 2.498072474942174,
"grad_norm": 10.355463981628418,
"learning_rate": 5.006539936581389e-06,
"loss": 0.3162,
"step": 16200
},
{
"epoch": 2.501927525057826,
"grad_norm": 13.213906288146973,
"learning_rate": 4.994428942476342e-06,
"loss": 0.3081,
"step": 16225
},
{
"epoch": 2.5057825751734772,
"grad_norm": 10.77859115600586,
"learning_rate": 4.982317981056952e-06,
"loss": 0.3029,
"step": 16250
},
{
"epoch": 2.5096376252891286,
"grad_norm": 9.533126831054688,
"learning_rate": 4.9702071233788024e-06,
"loss": 0.3504,
"step": 16275
},
{
"epoch": 2.5134926754047804,
"grad_norm": 9.308165550231934,
"learning_rate": 4.958096440496864e-06,
"loss": 0.2232,
"step": 16300
},
{
"epoch": 2.5173477255204317,
"grad_norm": 8.081320762634277,
"learning_rate": 4.945986003465088e-06,
"loss": 0.344,
"step": 16325
},
{
"epoch": 2.521202775636083,
"grad_norm": 10.340813636779785,
"learning_rate": 4.9338758833359775e-06,
"loss": 0.2858,
"step": 16350
},
{
"epoch": 2.525057825751735,
"grad_norm": 7.0694732666015625,
"learning_rate": 4.921766151160177e-06,
"loss": 0.2867,
"step": 16375
},
{
"epoch": 2.5289128758673862,
"grad_norm": 9.34334945678711,
"learning_rate": 4.9096568779860615e-06,
"loss": 0.262,
"step": 16400
},
{
"epoch": 2.5327679259830376,
"grad_norm": 8.892664909362793,
"learning_rate": 4.897548134859304e-06,
"loss": 0.2947,
"step": 16425
},
{
"epoch": 2.5366229760986894,
"grad_norm": 8.4851713180542,
"learning_rate": 4.885439992822476e-06,
"loss": 0.306,
"step": 16450
},
{
"epoch": 2.5404780262143407,
"grad_norm": 8.288248062133789,
"learning_rate": 4.873332522914615e-06,
"loss": 0.3815,
"step": 16475
},
{
"epoch": 2.544333076329992,
"grad_norm": 10.876388549804688,
"learning_rate": 4.861225796170818e-06,
"loss": 0.3351,
"step": 16500
},
{
"epoch": 2.548188126445644,
"grad_norm": 9.699716567993164,
"learning_rate": 4.849119883621821e-06,
"loss": 0.2901,
"step": 16525
},
{
"epoch": 2.552043176561295,
"grad_norm": 9.040721893310547,
"learning_rate": 4.8370148562935885e-06,
"loss": 0.3066,
"step": 16550
},
{
"epoch": 2.5558982266769465,
"grad_norm": 13.633066177368164,
"learning_rate": 4.824910785206883e-06,
"loss": 0.3319,
"step": 16575
},
{
"epoch": 2.5597532767925983,
"grad_norm": 11.06735897064209,
"learning_rate": 4.8128077413768635e-06,
"loss": 0.3046,
"step": 16600
},
{
"epoch": 2.5636083269082497,
"grad_norm": 9.440793991088867,
"learning_rate": 4.800705795812655e-06,
"loss": 0.3313,
"step": 16625
},
{
"epoch": 2.567463377023901,
"grad_norm": 9.293932914733887,
"learning_rate": 4.788605019516948e-06,
"loss": 0.3079,
"step": 16650
},
{
"epoch": 2.571318427139553,
"grad_norm": 9.266508102416992,
"learning_rate": 4.7765054834855655e-06,
"loss": 0.2514,
"step": 16675
},
{
"epoch": 2.575173477255204,
"grad_norm": 11.086885452270508,
"learning_rate": 4.764407258707054e-06,
"loss": 0.3122,
"step": 16700
},
{
"epoch": 2.579028527370856,
"grad_norm": 2.9275074005126953,
"learning_rate": 4.752310416162275e-06,
"loss": 0.2531,
"step": 16725
},
{
"epoch": 2.5828835774865073,
"grad_norm": 10.78078556060791,
"learning_rate": 4.74021502682397e-06,
"loss": 0.2942,
"step": 16750
},
{
"epoch": 2.5867386276021587,
"grad_norm": 9.587366104125977,
"learning_rate": 4.728121161656361e-06,
"loss": 0.2574,
"step": 16775
},
{
"epoch": 2.5905936777178105,
"grad_norm": 11.237876892089844,
"learning_rate": 4.716028891614725e-06,
"loss": 0.3185,
"step": 16800
},
{
"epoch": 2.594448727833462,
"grad_norm": 9.442562103271484,
"learning_rate": 4.7039382876449805e-06,
"loss": 0.2534,
"step": 16825
},
{
"epoch": 2.5983037779491136,
"grad_norm": 6.219342231750488,
"learning_rate": 4.691849420683271e-06,
"loss": 0.2573,
"step": 16850
},
{
"epoch": 2.602158828064765,
"grad_norm": 6.631523132324219,
"learning_rate": 4.67976236165555e-06,
"loss": 0.3191,
"step": 16875
},
{
"epoch": 2.6060138781804163,
"grad_norm": 6.616753578186035,
"learning_rate": 4.667677181477164e-06,
"loss": 0.2814,
"step": 16900
},
{
"epoch": 2.609868928296068,
"grad_norm": 8.976804733276367,
"learning_rate": 4.655593951052434e-06,
"loss": 0.2789,
"step": 16925
},
{
"epoch": 2.6137239784117194,
"grad_norm": 12.891767501831055,
"learning_rate": 4.643512741274242e-06,
"loss": 0.3049,
"step": 16950
},
{
"epoch": 2.617579028527371,
"grad_norm": 14.167659759521484,
"learning_rate": 4.6314336230236194e-06,
"loss": 0.3405,
"step": 16975
},
{
"epoch": 2.6214340786430226,
"grad_norm": 11.668992042541504,
"learning_rate": 4.619356667169318e-06,
"loss": 0.2577,
"step": 17000
},
{
"epoch": 2.625289128758674,
"grad_norm": 20.31926918029785,
"learning_rate": 4.607281944567413e-06,
"loss": 0.3177,
"step": 17025
},
{
"epoch": 2.6291441788743253,
"grad_norm": 11.018213272094727,
"learning_rate": 4.595209526060868e-06,
"loss": 0.2608,
"step": 17050
},
{
"epoch": 2.632999228989977,
"grad_norm": 10.879366874694824,
"learning_rate": 4.583139482479134e-06,
"loss": 0.2711,
"step": 17075
},
{
"epoch": 2.6368542791056284,
"grad_norm": 4.431464195251465,
"learning_rate": 4.5710718846377246e-06,
"loss": 0.3065,
"step": 17100
},
{
"epoch": 2.6407093292212798,
"grad_norm": 11.422054290771484,
"learning_rate": 4.559006803337807e-06,
"loss": 0.233,
"step": 17125
},
{
"epoch": 2.6445643793369316,
"grad_norm": 12.025290489196777,
"learning_rate": 4.546944309365782e-06,
"loss": 0.3647,
"step": 17150
},
{
"epoch": 2.648419429452583,
"grad_norm": 4.830737113952637,
"learning_rate": 4.534884473492869e-06,
"loss": 0.3205,
"step": 17175
},
{
"epoch": 2.6522744795682343,
"grad_norm": 5.7897820472717285,
"learning_rate": 4.522827366474698e-06,
"loss": 0.2524,
"step": 17200
},
{
"epoch": 2.656129529683886,
"grad_norm": 4.773533821105957,
"learning_rate": 4.510773059050882e-06,
"loss": 0.2615,
"step": 17225
},
{
"epoch": 2.6599845797995374,
"grad_norm": 7.355379104614258,
"learning_rate": 4.498721621944611e-06,
"loss": 0.2379,
"step": 17250
},
{
"epoch": 2.6638396299151887,
"grad_norm": 2.115938425064087,
"learning_rate": 4.486673125862237e-06,
"loss": 0.2226,
"step": 17275
},
{
"epoch": 2.6676946800308405,
"grad_norm": 9.673666000366211,
"learning_rate": 4.474627641492854e-06,
"loss": 0.342,
"step": 17300
},
{
"epoch": 2.671549730146492,
"grad_norm": 10.405682563781738,
"learning_rate": 4.462585239507886e-06,
"loss": 0.2276,
"step": 17325
},
{
"epoch": 2.6754047802621432,
"grad_norm": 9.802292823791504,
"learning_rate": 4.450545990560677e-06,
"loss": 0.2814,
"step": 17350
},
{
"epoch": 2.679259830377795,
"grad_norm": 5.3616414070129395,
"learning_rate": 4.4385099652860655e-06,
"loss": 0.25,
"step": 17375
},
{
"epoch": 2.6831148804934464,
"grad_norm": 9.817632675170898,
"learning_rate": 4.42647723429998e-06,
"loss": 0.2544,
"step": 17400
},
{
"epoch": 2.6869699306090977,
"grad_norm": 9.161683082580566,
"learning_rate": 4.414447868199023e-06,
"loss": 0.3113,
"step": 17425
},
{
"epoch": 2.6908249807247495,
"grad_norm": 6.944971084594727,
"learning_rate": 4.402421937560052e-06,
"loss": 0.3221,
"step": 17450
},
{
"epoch": 2.694680030840401,
"grad_norm": 13.24308967590332,
"learning_rate": 4.39039951293977e-06,
"loss": 0.3324,
"step": 17475
},
{
"epoch": 2.698535080956052,
"grad_norm": 9.548050880432129,
"learning_rate": 4.378380664874306e-06,
"loss": 0.3273,
"step": 17500
},
{
"epoch": 2.702390131071704,
"grad_norm": 8.773509979248047,
"learning_rate": 4.366365463878814e-06,
"loss": 0.2826,
"step": 17525
},
{
"epoch": 2.7062451811873554,
"grad_norm": 10.149378776550293,
"learning_rate": 4.354353980447042e-06,
"loss": 0.2211,
"step": 17550
},
{
"epoch": 2.7101002313030067,
"grad_norm": 9.664222717285156,
"learning_rate": 4.3423462850509295e-06,
"loss": 0.291,
"step": 17575
},
{
"epoch": 2.7139552814186585,
"grad_norm": 7.333618640899658,
"learning_rate": 4.330342448140193e-06,
"loss": 0.2722,
"step": 17600
},
{
"epoch": 2.71781033153431,
"grad_norm": 12.827810287475586,
"learning_rate": 4.318342540141909e-06,
"loss": 0.2663,
"step": 17625
},
{
"epoch": 2.721665381649961,
"grad_norm": 8.374723434448242,
"learning_rate": 4.3063466314601075e-06,
"loss": 0.2346,
"step": 17650
},
{
"epoch": 2.725520431765613,
"grad_norm": 10.546065330505371,
"learning_rate": 4.294354792475347e-06,
"loss": 0.2341,
"step": 17675
},
{
"epoch": 2.7293754818812643,
"grad_norm": 10.947388648986816,
"learning_rate": 4.282367093544315e-06,
"loss": 0.2571,
"step": 17700
},
{
"epoch": 2.733230531996916,
"grad_norm": 11.971232414245605,
"learning_rate": 4.270383604999404e-06,
"loss": 0.276,
"step": 17725
},
{
"epoch": 2.7370855821125675,
"grad_norm": 9.849080085754395,
"learning_rate": 4.25840439714831e-06,
"loss": 0.2841,
"step": 17750
},
{
"epoch": 2.740940632228219,
"grad_norm": 8.390685081481934,
"learning_rate": 4.246429540273609e-06,
"loss": 0.3115,
"step": 17775
},
{
"epoch": 2.7447956823438706,
"grad_norm": 8.906442642211914,
"learning_rate": 4.234459104632351e-06,
"loss": 0.2755,
"step": 17800
},
{
"epoch": 2.748650732459522,
"grad_norm": 12.104633331298828,
"learning_rate": 4.2224931604556465e-06,
"loss": 0.2833,
"step": 17825
},
{
"epoch": 2.7525057825751738,
"grad_norm": 10.492051124572754,
"learning_rate": 4.210531777948256e-06,
"loss": 0.2916,
"step": 17850
},
{
"epoch": 2.756360832690825,
"grad_norm": 6.358097553253174,
"learning_rate": 4.198575027288174e-06,
"loss": 0.2748,
"step": 17875
},
{
"epoch": 2.7602158828064765,
"grad_norm": 10.532133102416992,
"learning_rate": 4.186622978626222e-06,
"loss": 0.2413,
"step": 17900
},
{
"epoch": 2.7640709329221282,
"grad_norm": 13.972955703735352,
"learning_rate": 4.17467570208563e-06,
"loss": 0.2731,
"step": 17925
},
{
"epoch": 2.7679259830377796,
"grad_norm": 14.427913665771484,
"learning_rate": 4.162733267761635e-06,
"loss": 0.3278,
"step": 17950
},
{
"epoch": 2.771781033153431,
"grad_norm": 13.79145336151123,
"learning_rate": 4.150795745721065e-06,
"loss": 0.2896,
"step": 17975
},
{
"epoch": 2.7756360832690827,
"grad_norm": 8.224387168884277,
"learning_rate": 4.138863206001924e-06,
"loss": 0.292,
"step": 18000
},
{
"epoch": 2.779491133384734,
"grad_norm": 9.952670097351074,
"learning_rate": 4.126935718612985e-06,
"loss": 0.2776,
"step": 18025
},
{
"epoch": 2.7833461835003854,
"grad_norm": 11.750216484069824,
"learning_rate": 4.115013353533378e-06,
"loss": 0.2257,
"step": 18050
},
{
"epoch": 2.787201233616037,
"grad_norm": 8.650907516479492,
"learning_rate": 4.1030961807121835e-06,
"loss": 0.2831,
"step": 18075
},
{
"epoch": 2.7910562837316886,
"grad_norm": 8.431353569030762,
"learning_rate": 4.091184270068016e-06,
"loss": 0.2609,
"step": 18100
},
{
"epoch": 2.79491133384734,
"grad_norm": 11.888062477111816,
"learning_rate": 4.079277691488617e-06,
"loss": 0.3155,
"step": 18125
},
{
"epoch": 2.7987663839629917,
"grad_norm": 12.635673522949219,
"learning_rate": 4.067376514830444e-06,
"loss": 0.2727,
"step": 18150
},
{
"epoch": 2.802621434078643,
"grad_norm": 8.785150527954102,
"learning_rate": 4.055480809918264e-06,
"loss": 0.3571,
"step": 18175
},
{
"epoch": 2.8064764841942944,
"grad_norm": 6.311487674713135,
"learning_rate": 4.043590646544739e-06,
"loss": 0.3036,
"step": 18200
},
{
"epoch": 2.810331534309946,
"grad_norm": 6.0879740715026855,
"learning_rate": 4.031706094470016e-06,
"loss": 0.2803,
"step": 18225
},
{
"epoch": 2.8141865844255975,
"grad_norm": 4.374220371246338,
"learning_rate": 4.0198272234213246e-06,
"loss": 0.3003,
"step": 18250
},
{
"epoch": 2.818041634541249,
"grad_norm": 6.665703296661377,
"learning_rate": 4.007954103092559e-06,
"loss": 0.2732,
"step": 18275
},
{
"epoch": 2.8218966846569007,
"grad_norm": 11.401281356811523,
"learning_rate": 3.9960868031438815e-06,
"loss": 0.2907,
"step": 18300
},
{
"epoch": 2.825751734772552,
"grad_norm": 8.864492416381836,
"learning_rate": 3.984225393201298e-06,
"loss": 0.2591,
"step": 18325
},
{
"epoch": 2.8296067848882034,
"grad_norm": 12.262740135192871,
"learning_rate": 3.972369942856261e-06,
"loss": 0.2947,
"step": 18350
},
{
"epoch": 2.833461835003855,
"grad_norm": 9.077651023864746,
"learning_rate": 3.960520521665256e-06,
"loss": 0.2667,
"step": 18375
},
{
"epoch": 2.8373168851195065,
"grad_norm": 20.405847549438477,
"learning_rate": 3.948677199149396e-06,
"loss": 0.3082,
"step": 18400
},
{
"epoch": 2.841171935235158,
"grad_norm": 11.096578598022461,
"learning_rate": 3.936840044794016e-06,
"loss": 0.2345,
"step": 18425
},
{
"epoch": 2.8450269853508097,
"grad_norm": 10.999439239501953,
"learning_rate": 3.925009128048255e-06,
"loss": 0.2613,
"step": 18450
},
{
"epoch": 2.848882035466461,
"grad_norm": 11.137248992919922,
"learning_rate": 3.913184518324662e-06,
"loss": 0.3123,
"step": 18475
},
{
"epoch": 2.8527370855821124,
"grad_norm": 3.871978521347046,
"learning_rate": 3.90136628499878e-06,
"loss": 0.2916,
"step": 18500
},
{
"epoch": 2.856592135697764,
"grad_norm": 7.061714172363281,
"learning_rate": 3.889554497408742e-06,
"loss": 0.2365,
"step": 18525
},
{
"epoch": 2.8604471858134155,
"grad_norm": 10.275923728942871,
"learning_rate": 3.877749224854862e-06,
"loss": 0.3008,
"step": 18550
},
{
"epoch": 2.864302235929067,
"grad_norm": 9.240686416625977,
"learning_rate": 3.865950536599229e-06,
"loss": 0.2786,
"step": 18575
},
{
"epoch": 2.8681572860447186,
"grad_norm": 11.975162506103516,
"learning_rate": 3.854158501865308e-06,
"loss": 0.2761,
"step": 18600
},
{
"epoch": 2.87201233616037,
"grad_norm": 11.954338073730469,
"learning_rate": 3.842373189837522e-06,
"loss": 0.3105,
"step": 18625
},
{
"epoch": 2.8758673862760213,
"grad_norm": 12.442068099975586,
"learning_rate": 3.830594669660853e-06,
"loss": 0.253,
"step": 18650
},
{
"epoch": 2.879722436391673,
"grad_norm": 7.103127956390381,
"learning_rate": 3.818823010440433e-06,
"loss": 0.2849,
"step": 18675
},
{
"epoch": 2.8835774865073245,
"grad_norm": 4.1993231773376465,
"learning_rate": 3.8070582812411428e-06,
"loss": 0.2837,
"step": 18700
},
{
"epoch": 2.887432536622976,
"grad_norm": 9.685484886169434,
"learning_rate": 3.7953005510872045e-06,
"loss": 0.2793,
"step": 18725
},
{
"epoch": 2.8912875867386276,
"grad_norm": 12.104543685913086,
"learning_rate": 3.783549888961775e-06,
"loss": 0.2784,
"step": 18750
},
{
"epoch": 2.895142636854279,
"grad_norm": 6.225340843200684,
"learning_rate": 3.7718063638065426e-06,
"loss": 0.2401,
"step": 18775
},
{
"epoch": 2.8989976869699308,
"grad_norm": 8.04455280303955,
"learning_rate": 3.7600700445213246e-06,
"loss": 0.2373,
"step": 18800
},
{
"epoch": 2.902852737085582,
"grad_norm": 12.773852348327637,
"learning_rate": 3.74834099996366e-06,
"loss": 0.3124,
"step": 18825
},
{
"epoch": 2.9067077872012335,
"grad_norm": 7.7987751960754395,
"learning_rate": 3.736619298948406e-06,
"loss": 0.2847,
"step": 18850
},
{
"epoch": 2.9105628373168853,
"grad_norm": 10.535285949707031,
"learning_rate": 3.7249050102473365e-06,
"loss": 0.2832,
"step": 18875
},
{
"epoch": 2.9144178874325366,
"grad_norm": 9.952566146850586,
"learning_rate": 3.713198202588733e-06,
"loss": 0.3184,
"step": 18900
},
{
"epoch": 2.9182729375481884,
"grad_norm": 7.76224946975708,
"learning_rate": 3.701498944656993e-06,
"loss": 0.2281,
"step": 18925
},
{
"epoch": 2.9221279876638397,
"grad_norm": 13.121888160705566,
"learning_rate": 3.6898073050922118e-06,
"loss": 0.3241,
"step": 18950
},
{
"epoch": 2.925983037779491,
"grad_norm": 5.2703375816345215,
"learning_rate": 3.6781233524897917e-06,
"loss": 0.2922,
"step": 18975
},
{
"epoch": 2.929838087895143,
"grad_norm": 8.370783805847168,
"learning_rate": 3.666447155400034e-06,
"loss": 0.2813,
"step": 19000
},
{
"epoch": 2.9336931380107942,
"grad_norm": 9.90772533416748,
"learning_rate": 3.6547787823277366e-06,
"loss": 0.227,
"step": 19025
},
{
"epoch": 2.9375481881264456,
"grad_norm": 8.77448844909668,
"learning_rate": 3.6431183017317963e-06,
"loss": 0.3326,
"step": 19050
},
{
"epoch": 2.9414032382420974,
"grad_norm": 9.442300796508789,
"learning_rate": 3.6314657820248016e-06,
"loss": 0.2532,
"step": 19075
},
{
"epoch": 2.9452582883577487,
"grad_norm": 12.113017082214355,
"learning_rate": 3.6198212915726374e-06,
"loss": 0.2846,
"step": 19100
},
{
"epoch": 2.9491133384734,
"grad_norm": 10.078252792358398,
"learning_rate": 3.608184898694075e-06,
"loss": 0.2415,
"step": 19125
},
{
"epoch": 2.952968388589052,
"grad_norm": 9.863677024841309,
"learning_rate": 3.5965566716603846e-06,
"loss": 0.2738,
"step": 19150
},
{
"epoch": 2.956823438704703,
"grad_norm": 12.960750579833984,
"learning_rate": 3.5849366786949203e-06,
"loss": 0.2632,
"step": 19175
},
{
"epoch": 2.9606784888203546,
"grad_norm": 8.359341621398926,
"learning_rate": 3.5733249879727283e-06,
"loss": 0.3223,
"step": 19200
},
{
"epoch": 2.9645335389360064,
"grad_norm": 8.351648330688477,
"learning_rate": 3.5617216676201493e-06,
"loss": 0.2459,
"step": 19225
},
{
"epoch": 2.9683885890516577,
"grad_norm": 12.69826602935791,
"learning_rate": 3.5501267857144102e-06,
"loss": 0.3093,
"step": 19250
},
{
"epoch": 2.972243639167309,
"grad_norm": 6.269009590148926,
"learning_rate": 3.538540410283228e-06,
"loss": 0.2734,
"step": 19275
},
{
"epoch": 2.976098689282961,
"grad_norm": 12.896618843078613,
"learning_rate": 3.526962609304416e-06,
"loss": 0.3411,
"step": 19300
},
{
"epoch": 2.979953739398612,
"grad_norm": 5.591207981109619,
"learning_rate": 3.5153934507054793e-06,
"loss": 0.3198,
"step": 19325
},
{
"epoch": 2.9838087895142635,
"grad_norm": 8.253841400146484,
"learning_rate": 3.503833002363215e-06,
"loss": 0.2514,
"step": 19350
},
{
"epoch": 2.9876638396299153,
"grad_norm": 8.453317642211914,
"learning_rate": 3.492281332103321e-06,
"loss": 0.2653,
"step": 19375
},
{
"epoch": 2.9915188897455667,
"grad_norm": 10.928768157958984,
"learning_rate": 3.4807385076999923e-06,
"loss": 0.3267,
"step": 19400
},
{
"epoch": 2.995373939861218,
"grad_norm": 7.040345668792725,
"learning_rate": 3.4692045968755215e-06,
"loss": 0.2983,
"step": 19425
},
{
"epoch": 2.99922898997687,
"grad_norm": 11.468915939331055,
"learning_rate": 3.457679667299909e-06,
"loss": 0.3196,
"step": 19450
},
{
"epoch": 3.003084040092521,
"grad_norm": 5.387165069580078,
"learning_rate": 3.446163786590462e-06,
"loss": 0.1636,
"step": 19475
},
{
"epoch": 3.0069390902081725,
"grad_norm": 5.879426956176758,
"learning_rate": 3.434657022311394e-06,
"loss": 0.1017,
"step": 19500
},
{
"epoch": 3.0107941403238243,
"grad_norm": 6.856146812438965,
"learning_rate": 3.4231594419734334e-06,
"loss": 0.1097,
"step": 19525
},
{
"epoch": 3.0146491904394757,
"grad_norm": 4.485756874084473,
"learning_rate": 3.411671113033429e-06,
"loss": 0.126,
"step": 19550
},
{
"epoch": 3.018504240555127,
"grad_norm": 11.01356029510498,
"learning_rate": 3.4001921028939476e-06,
"loss": 0.0898,
"step": 19575
},
{
"epoch": 3.022359290670779,
"grad_norm": 7.513741493225098,
"learning_rate": 3.3887224789028815e-06,
"loss": 0.1026,
"step": 19600
},
{
"epoch": 3.02621434078643,
"grad_norm": 8.171934127807617,
"learning_rate": 3.3772623083530598e-06,
"loss": 0.1387,
"step": 19625
},
{
"epoch": 3.030069390902082,
"grad_norm": 7.734066963195801,
"learning_rate": 3.3658116584818412e-06,
"loss": 0.1031,
"step": 19650
},
{
"epoch": 3.0339244410177333,
"grad_norm": 7.977196216583252,
"learning_rate": 3.354370596470727e-06,
"loss": 0.1226,
"step": 19675
},
{
"epoch": 3.0377794911333846,
"grad_norm": 6.4046525955200195,
"learning_rate": 3.3429391894449726e-06,
"loss": 0.1562,
"step": 19700
},
{
"epoch": 3.0416345412490364,
"grad_norm": 8.365998268127441,
"learning_rate": 3.331517504473179e-06,
"loss": 0.1485,
"step": 19725
},
{
"epoch": 3.0454895913646878,
"grad_norm": 4.57725191116333,
"learning_rate": 3.3201056085669113e-06,
"loss": 0.1221,
"step": 19750
},
{
"epoch": 3.049344641480339,
"grad_norm": 6.356462001800537,
"learning_rate": 3.3087035686803017e-06,
"loss": 0.0993,
"step": 19775
},
{
"epoch": 3.053199691595991,
"grad_norm": 3.4471752643585205,
"learning_rate": 3.297311451709656e-06,
"loss": 0.1167,
"step": 19800
},
{
"epoch": 3.0570547417116423,
"grad_norm": 6.51023006439209,
"learning_rate": 3.2859293244930624e-06,
"loss": 0.119,
"step": 19825
},
{
"epoch": 3.0609097918272936,
"grad_norm": 4.519765377044678,
"learning_rate": 3.274557253809996e-06,
"loss": 0.1105,
"step": 19850
},
{
"epoch": 3.0647648419429454,
"grad_norm": 9.077949523925781,
"learning_rate": 3.263195306380936e-06,
"loss": 0.1327,
"step": 19875
},
{
"epoch": 3.0686198920585968,
"grad_norm": 9.671181678771973,
"learning_rate": 3.251843548866962e-06,
"loss": 0.1424,
"step": 19900
},
{
"epoch": 3.072474942174248,
"grad_norm": 4.150481700897217,
"learning_rate": 3.2405020478693705e-06,
"loss": 0.1332,
"step": 19925
},
{
"epoch": 3.0763299922899,
"grad_norm": 8.400745391845703,
"learning_rate": 3.229170869929284e-06,
"loss": 0.118,
"step": 19950
},
{
"epoch": 3.0801850424055512,
"grad_norm": 6.1614155769348145,
"learning_rate": 3.217850081527258e-06,
"loss": 0.1406,
"step": 19975
},
{
"epoch": 3.0840400925212026,
"grad_norm": 8.866299629211426,
"learning_rate": 3.206539749082891e-06,
"loss": 0.1146,
"step": 20000
},
{
"epoch": 3.0878951426368544,
"grad_norm": 11.265491485595703,
"learning_rate": 3.1952399389544386e-06,
"loss": 0.1638,
"step": 20025
},
{
"epoch": 3.0917501927525057,
"grad_norm": 8.28701400756836,
"learning_rate": 3.1839507174384198e-06,
"loss": 0.1231,
"step": 20050
},
{
"epoch": 3.095605242868157,
"grad_norm": 6.407596111297607,
"learning_rate": 3.1726721507692293e-06,
"loss": 0.0932,
"step": 20075
},
{
"epoch": 3.099460292983809,
"grad_norm": 8.92701244354248,
"learning_rate": 3.1614043051187487e-06,
"loss": 0.1304,
"step": 20100
},
{
"epoch": 3.10331534309946,
"grad_norm": 5.273784160614014,
"learning_rate": 3.1501472465959624e-06,
"loss": 0.1359,
"step": 20125
},
{
"epoch": 3.107170393215112,
"grad_norm": 8.544116020202637,
"learning_rate": 3.138901041246562e-06,
"loss": 0.1232,
"step": 20150
},
{
"epoch": 3.1110254433307634,
"grad_norm": 11.999884605407715,
"learning_rate": 3.1276657550525674e-06,
"loss": 0.1427,
"step": 20175
},
{
"epoch": 3.1148804934464147,
"grad_norm": 9.57628059387207,
"learning_rate": 3.116441453931931e-06,
"loss": 0.1199,
"step": 20200
},
{
"epoch": 3.1187355435620665,
"grad_norm": 5.148153305053711,
"learning_rate": 3.1052282037381577e-06,
"loss": 0.1088,
"step": 20225
},
{
"epoch": 3.122590593677718,
"grad_norm": 5.2608537673950195,
"learning_rate": 3.0940260702599145e-06,
"loss": 0.1235,
"step": 20250
},
{
"epoch": 3.126445643793369,
"grad_norm": 4.419956684112549,
"learning_rate": 3.0828351192206487e-06,
"loss": 0.1523,
"step": 20275
},
{
"epoch": 3.130300693909021,
"grad_norm": 5.4912848472595215,
"learning_rate": 3.0716554162781963e-06,
"loss": 0.1298,
"step": 20300
},
{
"epoch": 3.1341557440246723,
"grad_norm": 7.892848014831543,
"learning_rate": 3.0604870270244024e-06,
"loss": 0.1084,
"step": 20325
},
{
"epoch": 3.1380107941403237,
"grad_norm": 6.533749103546143,
"learning_rate": 3.049330016984735e-06,
"loss": 0.0971,
"step": 20350
},
{
"epoch": 3.1418658442559755,
"grad_norm": 7.852967262268066,
"learning_rate": 3.038184451617898e-06,
"loss": 0.1073,
"step": 20375
},
{
"epoch": 3.145720894371627,
"grad_norm": 6.752570152282715,
"learning_rate": 3.0270503963154485e-06,
"loss": 0.1101,
"step": 20400
},
{
"epoch": 3.149575944487278,
"grad_norm": 13.965591430664062,
"learning_rate": 3.0159279164014134e-06,
"loss": 0.1072,
"step": 20425
},
{
"epoch": 3.15343099460293,
"grad_norm": 5.109891891479492,
"learning_rate": 3.0048170771319097e-06,
"loss": 0.0901,
"step": 20450
},
{
"epoch": 3.1572860447185813,
"grad_norm": 9.386459350585938,
"learning_rate": 2.9937179436947515e-06,
"loss": 0.1388,
"step": 20475
},
{
"epoch": 3.1611410948342327,
"grad_norm": 7.110087871551514,
"learning_rate": 2.982630581209084e-06,
"loss": 0.1191,
"step": 20500
},
{
"epoch": 3.1649961449498845,
"grad_norm": 12.44914722442627,
"learning_rate": 2.9715550547249834e-06,
"loss": 0.1249,
"step": 20525
},
{
"epoch": 3.168851195065536,
"grad_norm": 9.878494262695312,
"learning_rate": 2.9604914292230856e-06,
"loss": 0.1389,
"step": 20550
},
{
"epoch": 3.172706245181187,
"grad_norm": 9.951390266418457,
"learning_rate": 2.949439769614203e-06,
"loss": 0.1363,
"step": 20575
},
{
"epoch": 3.176561295296839,
"grad_norm": 9.471918106079102,
"learning_rate": 2.9384001407389462e-06,
"loss": 0.1064,
"step": 20600
},
{
"epoch": 3.1804163454124903,
"grad_norm": 8.348705291748047,
"learning_rate": 2.927372607367337e-06,
"loss": 0.1234,
"step": 20625
},
{
"epoch": 3.1842713955281416,
"grad_norm": 5.135454177856445,
"learning_rate": 2.916357234198434e-06,
"loss": 0.1052,
"step": 20650
},
{
"epoch": 3.1881264456437934,
"grad_norm": 8.725805282592773,
"learning_rate": 2.9053540858599506e-06,
"loss": 0.1326,
"step": 20675
},
{
"epoch": 3.191981495759445,
"grad_norm": 9.94555377960205,
"learning_rate": 2.894363226907879e-06,
"loss": 0.1186,
"step": 20700
},
{
"epoch": 3.1958365458750966,
"grad_norm": 10.841123580932617,
"learning_rate": 2.883384721826108e-06,
"loss": 0.124,
"step": 20725
},
{
"epoch": 3.199691595990748,
"grad_norm": 6.398654937744141,
"learning_rate": 2.8724186350260418e-06,
"loss": 0.1022,
"step": 20750
},
{
"epoch": 3.2035466461063993,
"grad_norm": 7.522919654846191,
"learning_rate": 2.8614650308462313e-06,
"loss": 0.1147,
"step": 20775
},
{
"epoch": 3.207401696222051,
"grad_norm": 7.400803565979004,
"learning_rate": 2.8505239735519878e-06,
"loss": 0.1261,
"step": 20800
},
{
"epoch": 3.2112567463377024,
"grad_norm": 10.420129776000977,
"learning_rate": 2.839595527335014e-06,
"loss": 0.0994,
"step": 20825
},
{
"epoch": 3.2151117964533538,
"grad_norm": 14.275862693786621,
"learning_rate": 2.828679756313014e-06,
"loss": 0.1435,
"step": 20850
},
{
"epoch": 3.2189668465690056,
"grad_norm": 8.413034439086914,
"learning_rate": 2.8177767245293352e-06,
"loss": 0.1212,
"step": 20875
},
{
"epoch": 3.222821896684657,
"grad_norm": 6.920238018035889,
"learning_rate": 2.806886495952581e-06,
"loss": 0.1221,
"step": 20900
},
{
"epoch": 3.2266769468003083,
"grad_norm": 6.907994747161865,
"learning_rate": 2.7960091344762315e-06,
"loss": 0.1155,
"step": 20925
},
{
"epoch": 3.23053199691596,
"grad_norm": 7.724101543426514,
"learning_rate": 2.7851447039182823e-06,
"loss": 0.1337,
"step": 20950
},
{
"epoch": 3.2343870470316114,
"grad_norm": 5.470908164978027,
"learning_rate": 2.7742932680208616e-06,
"loss": 0.1403,
"step": 20975
},
{
"epoch": 3.2382420971472627,
"grad_norm": 9.696648597717285,
"learning_rate": 2.7634548904498528e-06,
"loss": 0.1564,
"step": 21000
},
{
"epoch": 3.2420971472629145,
"grad_norm": 8.626696586608887,
"learning_rate": 2.752629634794529e-06,
"loss": 0.134,
"step": 21025
},
{
"epoch": 3.245952197378566,
"grad_norm": 3.644073247909546,
"learning_rate": 2.7418175645671795e-06,
"loss": 0.1253,
"step": 21050
},
{
"epoch": 3.2498072474942172,
"grad_norm": 8.510153770446777,
"learning_rate": 2.7310187432027256e-06,
"loss": 0.1136,
"step": 21075
},
{
"epoch": 3.253662297609869,
"grad_norm": 4.62969446182251,
"learning_rate": 2.7202332340583647e-06,
"loss": 0.1253,
"step": 21100
},
{
"epoch": 3.2575173477255204,
"grad_norm": 7.307432174682617,
"learning_rate": 2.7094611004131865e-06,
"loss": 0.0996,
"step": 21125
},
{
"epoch": 3.261372397841172,
"grad_norm": 5.4069013595581055,
"learning_rate": 2.69870240546781e-06,
"loss": 0.1099,
"step": 21150
},
{
"epoch": 3.2652274479568235,
"grad_norm": 7.278680801391602,
"learning_rate": 2.6879572123440022e-06,
"loss": 0.1145,
"step": 21175
},
{
"epoch": 3.269082498072475,
"grad_norm": 8.290931701660156,
"learning_rate": 2.6772255840843196e-06,
"loss": 0.107,
"step": 21200
},
{
"epoch": 3.2729375481881267,
"grad_norm": 6.768774032592773,
"learning_rate": 2.6665075836517346e-06,
"loss": 0.0943,
"step": 21225
},
{
"epoch": 3.276792598303778,
"grad_norm": 5.0795087814331055,
"learning_rate": 2.6558032739292565e-06,
"loss": 0.1239,
"step": 21250
},
{
"epoch": 3.2806476484194294,
"grad_norm": 16.384565353393555,
"learning_rate": 2.645112717719578e-06,
"loss": 0.1209,
"step": 21275
},
{
"epoch": 3.284502698535081,
"grad_norm": 8.257716178894043,
"learning_rate": 2.6344359777446988e-06,
"loss": 0.0792,
"step": 21300
},
{
"epoch": 3.2883577486507325,
"grad_norm": 7.083979606628418,
"learning_rate": 2.6237731166455514e-06,
"loss": 0.1176,
"step": 21325
},
{
"epoch": 3.292212798766384,
"grad_norm": 8.932221412658691,
"learning_rate": 2.6131241969816478e-06,
"loss": 0.1282,
"step": 21350
},
{
"epoch": 3.2960678488820356,
"grad_norm": 6.137622356414795,
"learning_rate": 2.602489281230704e-06,
"loss": 0.1086,
"step": 21375
},
{
"epoch": 3.299922898997687,
"grad_norm": 1.592561960220337,
"learning_rate": 2.591868431788268e-06,
"loss": 0.1206,
"step": 21400
},
{
"epoch": 3.3037779491133383,
"grad_norm": 6.968202114105225,
"learning_rate": 2.5812617109673675e-06,
"loss": 0.1242,
"step": 21425
},
{
"epoch": 3.30763299922899,
"grad_norm": 5.464489459991455,
"learning_rate": 2.5706691809981333e-06,
"loss": 0.0887,
"step": 21450
},
{
"epoch": 3.3114880493446415,
"grad_norm": 10.201210021972656,
"learning_rate": 2.5600909040274404e-06,
"loss": 0.1303,
"step": 21475
},
{
"epoch": 3.315343099460293,
"grad_norm": 11.269229888916016,
"learning_rate": 2.5495269421185355e-06,
"loss": 0.1415,
"step": 21500
},
{
"epoch": 3.3191981495759446,
"grad_norm": 11.454387664794922,
"learning_rate": 2.5389773572506825e-06,
"loss": 0.1656,
"step": 21525
},
{
"epoch": 3.323053199691596,
"grad_norm": 15.136128425598145,
"learning_rate": 2.5284422113187967e-06,
"loss": 0.1117,
"step": 21550
},
{
"epoch": 3.3269082498072473,
"grad_norm": 11.155787467956543,
"learning_rate": 2.5179215661330724e-06,
"loss": 0.1337,
"step": 21575
},
{
"epoch": 3.330763299922899,
"grad_norm": 6.526209354400635,
"learning_rate": 2.507415483418633e-06,
"loss": 0.1373,
"step": 21600
},
{
"epoch": 3.3346183500385504,
"grad_norm": 6.626813888549805,
"learning_rate": 2.4969240248151634e-06,
"loss": 0.1154,
"step": 21625
},
{
"epoch": 3.338473400154202,
"grad_norm": 11.29115104675293,
"learning_rate": 2.486447251876542e-06,
"loss": 0.1183,
"step": 21650
},
{
"epoch": 3.3423284502698536,
"grad_norm": 9.870936393737793,
"learning_rate": 2.4759852260704927e-06,
"loss": 0.1288,
"step": 21675
},
{
"epoch": 3.346183500385505,
"grad_norm": 7.866130352020264,
"learning_rate": 2.4655380087782155e-06,
"loss": 0.1452,
"step": 21700
},
{
"epoch": 3.3500385505011563,
"grad_norm": 8.198760986328125,
"learning_rate": 2.455105661294022e-06,
"loss": 0.1538,
"step": 21725
},
{
"epoch": 3.353893600616808,
"grad_norm": 6.8911919593811035,
"learning_rate": 2.4446882448249946e-06,
"loss": 0.1153,
"step": 21750
},
{
"epoch": 3.3577486507324594,
"grad_norm": 8.507146835327148,
"learning_rate": 2.4342858204906023e-06,
"loss": 0.0951,
"step": 21775
},
{
"epoch": 3.3616037008481108,
"grad_norm": 12.867559432983398,
"learning_rate": 2.423898449322362e-06,
"loss": 0.1407,
"step": 21800
},
{
"epoch": 3.3654587509637626,
"grad_norm": 10.051011085510254,
"learning_rate": 2.413526192263468e-06,
"loss": 0.1469,
"step": 21825
},
{
"epoch": 3.369313801079414,
"grad_norm": 5.155115127563477,
"learning_rate": 2.4031691101684423e-06,
"loss": 0.1146,
"step": 21850
},
{
"epoch": 3.3731688511950657,
"grad_norm": 11.490079879760742,
"learning_rate": 2.3928272638027777e-06,
"loss": 0.106,
"step": 21875
},
{
"epoch": 3.377023901310717,
"grad_norm": 11.593327522277832,
"learning_rate": 2.38250071384257e-06,
"loss": 0.1284,
"step": 21900
},
{
"epoch": 3.3808789514263684,
"grad_norm": 16.53786277770996,
"learning_rate": 2.372189520874176e-06,
"loss": 0.1042,
"step": 21925
},
{
"epoch": 3.38473400154202,
"grad_norm": 9.45279312133789,
"learning_rate": 2.3618937453938558e-06,
"loss": 0.1515,
"step": 21950
},
{
"epoch": 3.3885890516576715,
"grad_norm": 2.748750686645508,
"learning_rate": 2.3516134478074043e-06,
"loss": 0.1164,
"step": 21975
},
{
"epoch": 3.392444101773323,
"grad_norm": 4.7133331298828125,
"learning_rate": 2.341348688429817e-06,
"loss": 0.1101,
"step": 22000
},
{
"epoch": 3.3962991518889747,
"grad_norm": 6.6426568031311035,
"learning_rate": 2.3310995274849167e-06,
"loss": 0.1277,
"step": 22025
},
{
"epoch": 3.400154202004626,
"grad_norm": 5.261310577392578,
"learning_rate": 2.320866025105016e-06,
"loss": 0.1201,
"step": 22050
},
{
"epoch": 3.4040092521202774,
"grad_norm": 8.949080467224121,
"learning_rate": 2.3106482413305605e-06,
"loss": 0.1475,
"step": 22075
},
{
"epoch": 3.407864302235929,
"grad_norm": 3.1100921630859375,
"learning_rate": 2.3004462361097645e-06,
"loss": 0.1186,
"step": 22100
},
{
"epoch": 3.4117193523515805,
"grad_norm": 10.295735359191895,
"learning_rate": 2.2902600692982774e-06,
"loss": 0.1169,
"step": 22125
},
{
"epoch": 3.4155744024672323,
"grad_norm": 8.572005271911621,
"learning_rate": 2.2800898006588174e-06,
"loss": 0.1019,
"step": 22150
},
{
"epoch": 3.4194294525828837,
"grad_norm": 6.087860107421875,
"learning_rate": 2.2699354898608315e-06,
"loss": 0.0885,
"step": 22175
},
{
"epoch": 3.423284502698535,
"grad_norm": 3.5558598041534424,
"learning_rate": 2.2597971964801435e-06,
"loss": 0.1173,
"step": 22200
},
{
"epoch": 3.427139552814187,
"grad_norm": 5.779830455780029,
"learning_rate": 2.249674979998594e-06,
"loss": 0.1069,
"step": 22225
},
{
"epoch": 3.430994602929838,
"grad_norm": 7.2625274658203125,
"learning_rate": 2.239568899803707e-06,
"loss": 0.0862,
"step": 22250
},
{
"epoch": 3.4348496530454895,
"grad_norm": 6.926580905914307,
"learning_rate": 2.2294790151883338e-06,
"loss": 0.15,
"step": 22275
},
{
"epoch": 3.4387047031611413,
"grad_norm": 12.504677772521973,
"learning_rate": 2.2194053853502996e-06,
"loss": 0.1336,
"step": 22300
},
{
"epoch": 3.4425597532767926,
"grad_norm": 10.718069076538086,
"learning_rate": 2.20934806939207e-06,
"loss": 0.1141,
"step": 22325
},
{
"epoch": 3.446414803392444,
"grad_norm": 12.015519142150879,
"learning_rate": 2.1993071263203865e-06,
"loss": 0.1305,
"step": 22350
},
{
"epoch": 3.450269853508096,
"grad_norm": 8.437941551208496,
"learning_rate": 2.189282615045941e-06,
"loss": 0.1512,
"step": 22375
},
{
"epoch": 3.454124903623747,
"grad_norm": 2.126424789428711,
"learning_rate": 2.179274594383015e-06,
"loss": 0.1198,
"step": 22400
},
{
"epoch": 3.4579799537393985,
"grad_norm": 6.558461666107178,
"learning_rate": 2.169283123049134e-06,
"loss": 0.1185,
"step": 22425
},
{
"epoch": 3.4618350038550503,
"grad_norm": 8.4487943649292,
"learning_rate": 2.1593082596647347e-06,
"loss": 0.1107,
"step": 22450
},
{
"epoch": 3.4656900539707016,
"grad_norm": 5.250937461853027,
"learning_rate": 2.1493500627528086e-06,
"loss": 0.1076,
"step": 22475
},
{
"epoch": 3.469545104086353,
"grad_norm": 4.514431953430176,
"learning_rate": 2.139408590738568e-06,
"loss": 0.1158,
"step": 22500
},
{
"epoch": 3.4734001542020048,
"grad_norm": 12.771848678588867,
"learning_rate": 2.1294839019491005e-06,
"loss": 0.1057,
"step": 22525
},
{
"epoch": 3.477255204317656,
"grad_norm": 13.102221488952637,
"learning_rate": 2.119576054613019e-06,
"loss": 0.1017,
"step": 22550
},
{
"epoch": 3.4811102544333075,
"grad_norm": 8.106521606445312,
"learning_rate": 2.1096851068601343e-06,
"loss": 0.1342,
"step": 22575
},
{
"epoch": 3.4849653045489593,
"grad_norm": 8.431148529052734,
"learning_rate": 2.099811116721105e-06,
"loss": 0.1303,
"step": 22600
},
{
"epoch": 3.4888203546646106,
"grad_norm": 17.26299476623535,
"learning_rate": 2.089954142127093e-06,
"loss": 0.1094,
"step": 22625
},
{
"epoch": 3.492675404780262,
"grad_norm": 5.573319435119629,
"learning_rate": 2.080114240909437e-06,
"loss": 0.1153,
"step": 22650
},
{
"epoch": 3.4965304548959137,
"grad_norm": 8.843880653381348,
"learning_rate": 2.0702914707992972e-06,
"loss": 0.1157,
"step": 22675
},
{
"epoch": 3.500385505011565,
"grad_norm": 5.568359851837158,
"learning_rate": 2.0604858894273344e-06,
"loss": 0.0939,
"step": 22700
},
{
"epoch": 3.5042405551272164,
"grad_norm": 11.386759757995605,
"learning_rate": 2.0506975543233564e-06,
"loss": 0.1197,
"step": 22725
},
{
"epoch": 3.5080956052428682,
"grad_norm": 4.930935382843018,
"learning_rate": 2.040926522915984e-06,
"loss": 0.1358,
"step": 22750
},
{
"epoch": 3.5119506553585196,
"grad_norm": 7.518167018890381,
"learning_rate": 2.0311728525323233e-06,
"loss": 0.1003,
"step": 22775
},
{
"epoch": 3.515805705474171,
"grad_norm": 6.179995059967041,
"learning_rate": 2.021436600397615e-06,
"loss": 0.1109,
"step": 22800
},
{
"epoch": 3.5196607555898227,
"grad_norm": 5.020857334136963,
"learning_rate": 2.011717823634911e-06,
"loss": 0.1029,
"step": 22825
},
{
"epoch": 3.523515805705474,
"grad_norm": 6.9608588218688965,
"learning_rate": 2.0020165792647357e-06,
"loss": 0.1291,
"step": 22850
},
{
"epoch": 3.5273708558211254,
"grad_norm": 4.67839241027832,
"learning_rate": 1.9923329242047435e-06,
"loss": 0.0876,
"step": 22875
},
{
"epoch": 3.531225905936777,
"grad_norm": 9.877412796020508,
"learning_rate": 1.9826669152693976e-06,
"loss": 0.1003,
"step": 22900
},
{
"epoch": 3.5350809560524286,
"grad_norm": 7.6078596115112305,
"learning_rate": 1.9730186091696303e-06,
"loss": 0.1069,
"step": 22925
},
{
"epoch": 3.5389360061680804,
"grad_norm": 7.267652988433838,
"learning_rate": 1.9633880625125047e-06,
"loss": 0.1025,
"step": 22950
},
{
"epoch": 3.5427910562837317,
"grad_norm": 6.974407196044922,
"learning_rate": 1.9537753318008966e-06,
"loss": 0.1124,
"step": 22975
},
{
"epoch": 3.546646106399383,
"grad_norm": 6.338265419006348,
"learning_rate": 1.944180473433145e-06,
"loss": 0.1152,
"step": 23000
},
{
"epoch": 3.550501156515035,
"grad_norm": 3.4521892070770264,
"learning_rate": 1.9346035437027416e-06,
"loss": 0.0992,
"step": 23025
},
{
"epoch": 3.554356206630686,
"grad_norm": 2.416567802429199,
"learning_rate": 1.925044598797986e-06,
"loss": 0.1611,
"step": 23050
},
{
"epoch": 3.5582112567463375,
"grad_norm": 7.557652473449707,
"learning_rate": 1.9155036948016546e-06,
"loss": 0.1126,
"step": 23075
},
{
"epoch": 3.5620663068619893,
"grad_norm": 9.03351879119873,
"learning_rate": 1.905980887690685e-06,
"loss": 0.106,
"step": 23100
},
{
"epoch": 3.5659213569776407,
"grad_norm": 4.699378967285156,
"learning_rate": 1.8964762333358327e-06,
"loss": 0.1458,
"step": 23125
},
{
"epoch": 3.5697764070932925,
"grad_norm": 8.233428955078125,
"learning_rate": 1.8869897875013548e-06,
"loss": 0.1127,
"step": 23150
},
{
"epoch": 3.573631457208944,
"grad_norm": 15.474151611328125,
"learning_rate": 1.8775216058446783e-06,
"loss": 0.1222,
"step": 23175
},
{
"epoch": 3.577486507324595,
"grad_norm": 11.501723289489746,
"learning_rate": 1.8680717439160679e-06,
"loss": 0.157,
"step": 23200
},
{
"epoch": 3.581341557440247,
"grad_norm": 5.580605506896973,
"learning_rate": 1.8586402571583118e-06,
"loss": 0.1734,
"step": 23225
},
{
"epoch": 3.5851966075558983,
"grad_norm": 9.259316444396973,
"learning_rate": 1.8492272009063894e-06,
"loss": 0.1476,
"step": 23250
},
{
"epoch": 3.5890516576715497,
"grad_norm": 7.23510217666626,
"learning_rate": 1.8398326303871423e-06,
"loss": 0.1305,
"step": 23275
},
{
"epoch": 3.5929067077872014,
"grad_norm": 9.326400756835938,
"learning_rate": 1.8304566007189605e-06,
"loss": 0.1278,
"step": 23300
},
{
"epoch": 3.596761757902853,
"grad_norm": 8.78858470916748,
"learning_rate": 1.8210991669114525e-06,
"loss": 0.0944,
"step": 23325
},
{
"epoch": 3.600616808018504,
"grad_norm": 9.1303071975708,
"learning_rate": 1.8117603838651242e-06,
"loss": 0.1241,
"step": 23350
},
{
"epoch": 3.604471858134156,
"grad_norm": 5.380366802215576,
"learning_rate": 1.8024403063710582e-06,
"loss": 0.1102,
"step": 23375
},
{
"epoch": 3.6083269082498073,
"grad_norm": 13.162090301513672,
"learning_rate": 1.7931389891105856e-06,
"loss": 0.1157,
"step": 23400
},
{
"epoch": 3.6121819583654586,
"grad_norm": 10.518856048583984,
"learning_rate": 1.7838564866549762e-06,
"loss": 0.1129,
"step": 23425
},
{
"epoch": 3.6160370084811104,
"grad_norm": 7.937816143035889,
"learning_rate": 1.7745928534651074e-06,
"loss": 0.1052,
"step": 23450
},
{
"epoch": 3.6198920585967618,
"grad_norm": 6.695240497589111,
"learning_rate": 1.7653481438911535e-06,
"loss": 0.1269,
"step": 23475
},
{
"epoch": 3.623747108712413,
"grad_norm": 10.914417266845703,
"learning_rate": 1.7561224121722636e-06,
"loss": 0.1324,
"step": 23500
},
{
"epoch": 3.627602158828065,
"grad_norm": 6.560018539428711,
"learning_rate": 1.7469157124362374e-06,
"loss": 0.1216,
"step": 23525
},
{
"epoch": 3.6314572089437163,
"grad_norm": 9.865793228149414,
"learning_rate": 1.7377280986992185e-06,
"loss": 0.1126,
"step": 23550
},
{
"epoch": 3.6353122590593676,
"grad_norm": 10.021103858947754,
"learning_rate": 1.728559624865372e-06,
"loss": 0.1308,
"step": 23575
},
{
"epoch": 3.6391673091750194,
"grad_norm": 10.210054397583008,
"learning_rate": 1.7194103447265625e-06,
"loss": 0.103,
"step": 23600
},
{
"epoch": 3.6430223592906708,
"grad_norm": 4.66168212890625,
"learning_rate": 1.710280311962051e-06,
"loss": 0.1094,
"step": 23625
},
{
"epoch": 3.646877409406322,
"grad_norm": 12.833625793457031,
"learning_rate": 1.7011695801381694e-06,
"loss": 0.1143,
"step": 23650
},
{
"epoch": 3.650732459521974,
"grad_norm": 8.13430404663086,
"learning_rate": 1.6920782027080124e-06,
"loss": 0.1141,
"step": 23675
},
{
"epoch": 3.6545875096376252,
"grad_norm": 9.954723358154297,
"learning_rate": 1.6830062330111214e-06,
"loss": 0.1194,
"step": 23700
},
{
"epoch": 3.6584425597532766,
"grad_norm": 11.517424583435059,
"learning_rate": 1.673953724273167e-06,
"loss": 0.1105,
"step": 23725
},
{
"epoch": 3.6622976098689284,
"grad_norm": 8.514795303344727,
"learning_rate": 1.6649207296056479e-06,
"loss": 0.1228,
"step": 23750
},
{
"epoch": 3.6661526599845797,
"grad_norm": 6.922457695007324,
"learning_rate": 1.6559073020055687e-06,
"loss": 0.1399,
"step": 23775
},
{
"epoch": 3.670007710100231,
"grad_norm": 4.010049343109131,
"learning_rate": 1.6469134943551345e-06,
"loss": 0.1424,
"step": 23800
},
{
"epoch": 3.673862760215883,
"grad_norm": 13.174199104309082,
"learning_rate": 1.637939359421441e-06,
"loss": 0.1302,
"step": 23825
},
{
"epoch": 3.677717810331534,
"grad_norm": 2.7568843364715576,
"learning_rate": 1.6289849498561584e-06,
"loss": 0.1342,
"step": 23850
},
{
"epoch": 3.6815728604471856,
"grad_norm": 6.765602111816406,
"learning_rate": 1.6200503181952315e-06,
"loss": 0.1257,
"step": 23875
},
{
"epoch": 3.6854279105628374,
"grad_norm": 4.946558952331543,
"learning_rate": 1.6111355168585674e-06,
"loss": 0.109,
"step": 23900
},
{
"epoch": 3.6892829606784887,
"grad_norm": 5.697614669799805,
"learning_rate": 1.6022405981497213e-06,
"loss": 0.1483,
"step": 23925
},
{
"epoch": 3.69313801079414,
"grad_norm": 10.446500778198242,
"learning_rate": 1.5933656142556075e-06,
"loss": 0.1321,
"step": 23950
},
{
"epoch": 3.696993060909792,
"grad_norm": 7.87567138671875,
"learning_rate": 1.5845106172461705e-06,
"loss": 0.146,
"step": 23975
},
{
"epoch": 3.700848111025443,
"grad_norm": 9.988269805908203,
"learning_rate": 1.5756756590740973e-06,
"loss": 0.1205,
"step": 24000
},
{
"epoch": 3.704703161141095,
"grad_norm": 5.261201858520508,
"learning_rate": 1.5668607915745053e-06,
"loss": 0.1104,
"step": 24025
},
{
"epoch": 3.7085582112567463,
"grad_norm": 11.093449592590332,
"learning_rate": 1.5580660664646358e-06,
"loss": 0.1232,
"step": 24050
},
{
"epoch": 3.7124132613723977,
"grad_norm": 6.619266986846924,
"learning_rate": 1.549291535343559e-06,
"loss": 0.1079,
"step": 24075
},
{
"epoch": 3.7162683114880495,
"grad_norm": 9.868049621582031,
"learning_rate": 1.540537249691859e-06,
"loss": 0.1108,
"step": 24100
},
{
"epoch": 3.720123361603701,
"grad_norm": 2.1785800457000732,
"learning_rate": 1.5318032608713446e-06,
"loss": 0.0916,
"step": 24125
},
{
"epoch": 3.7239784117193526,
"grad_norm": 10.12387466430664,
"learning_rate": 1.523089620124743e-06,
"loss": 0.1375,
"step": 24150
},
{
"epoch": 3.727833461835004,
"grad_norm": 6.709667682647705,
"learning_rate": 1.5143963785753906e-06,
"loss": 0.1147,
"step": 24175
},
{
"epoch": 3.7316885119506553,
"grad_norm": 9.225015640258789,
"learning_rate": 1.5057235872269493e-06,
"loss": 0.1172,
"step": 24200
},
{
"epoch": 3.735543562066307,
"grad_norm": 5.36521577835083,
"learning_rate": 1.4970712969630952e-06,
"loss": 0.1268,
"step": 24225
},
{
"epoch": 3.7393986121819585,
"grad_norm": 10.580253601074219,
"learning_rate": 1.4884395585472194e-06,
"loss": 0.1195,
"step": 24250
},
{
"epoch": 3.74325366229761,
"grad_norm": 7.890168190002441,
"learning_rate": 1.4798284226221448e-06,
"loss": 0.1018,
"step": 24275
},
{
"epoch": 3.7471087124132616,
"grad_norm": 8.8533296585083,
"learning_rate": 1.4712379397098075e-06,
"loss": 0.1377,
"step": 24300
},
{
"epoch": 3.750963762528913,
"grad_norm": 6.100067138671875,
"learning_rate": 1.4626681602109776e-06,
"loss": 0.0932,
"step": 24325
},
{
"epoch": 3.7548188126445643,
"grad_norm": 6.630173206329346,
"learning_rate": 1.454119134404957e-06,
"loss": 0.1223,
"step": 24350
},
{
"epoch": 3.758673862760216,
"grad_norm": 12.35831356048584,
"learning_rate": 1.4455909124492811e-06,
"loss": 0.1239,
"step": 24375
},
{
"epoch": 3.7625289128758674,
"grad_norm": 8.832963943481445,
"learning_rate": 1.4370835443794328e-06,
"loss": 0.131,
"step": 24400
},
{
"epoch": 3.766383962991519,
"grad_norm": 5.409282207489014,
"learning_rate": 1.4285970801085392e-06,
"loss": 0.1353,
"step": 24425
},
{
"epoch": 3.7702390131071706,
"grad_norm": 7.144223213195801,
"learning_rate": 1.4201315694270878e-06,
"loss": 0.1106,
"step": 24450
},
{
"epoch": 3.774094063222822,
"grad_norm": 6.219675540924072,
"learning_rate": 1.4116870620026318e-06,
"loss": 0.1117,
"step": 24475
},
{
"epoch": 3.7779491133384733,
"grad_norm": 3.6316025257110596,
"learning_rate": 1.4032636073794902e-06,
"loss": 0.1103,
"step": 24500
},
{
"epoch": 3.781804163454125,
"grad_norm": 6.3065266609191895,
"learning_rate": 1.3948612549784717e-06,
"loss": 0.1002,
"step": 24525
},
{
"epoch": 3.7856592135697764,
"grad_norm": 9.909113883972168,
"learning_rate": 1.3864800540965735e-06,
"loss": 0.1152,
"step": 24550
},
{
"epoch": 3.7895142636854278,
"grad_norm": 9.750158309936523,
"learning_rate": 1.3781200539066962e-06,
"loss": 0.1139,
"step": 24575
},
{
"epoch": 3.7933693138010796,
"grad_norm": 7.727182388305664,
"learning_rate": 1.3697813034573576e-06,
"loss": 0.1364,
"step": 24600
},
{
"epoch": 3.797224363916731,
"grad_norm": 7.190638542175293,
"learning_rate": 1.361463851672397e-06,
"loss": 0.1173,
"step": 24625
},
{
"epoch": 3.8010794140323823,
"grad_norm": 8.281079292297363,
"learning_rate": 1.3531677473506977e-06,
"loss": 0.145,
"step": 24650
},
{
"epoch": 3.804934464148034,
"grad_norm": 9.231086730957031,
"learning_rate": 1.3448930391658966e-06,
"loss": 0.0883,
"step": 24675
},
{
"epoch": 3.8087895142636854,
"grad_norm": 5.426698207855225,
"learning_rate": 1.3366397756660949e-06,
"loss": 0.1597,
"step": 24700
},
{
"epoch": 3.8126445643793367,
"grad_norm": 4.128628253936768,
"learning_rate": 1.3284080052735804e-06,
"loss": 0.1339,
"step": 24725
},
{
"epoch": 3.8164996144949885,
"grad_norm": 8.45683765411377,
"learning_rate": 1.3201977762845369e-06,
"loss": 0.1228,
"step": 24750
},
{
"epoch": 3.82035466461064,
"grad_norm": 5.29550313949585,
"learning_rate": 1.312009136868766e-06,
"loss": 0.1031,
"step": 24775
},
{
"epoch": 3.8242097147262912,
"grad_norm": 9.903135299682617,
"learning_rate": 1.303842135069403e-06,
"loss": 0.1301,
"step": 24800
},
{
"epoch": 3.828064764841943,
"grad_norm": 7.510071277618408,
"learning_rate": 1.2956968188026298e-06,
"loss": 0.1262,
"step": 24825
},
{
"epoch": 3.8319198149575944,
"grad_norm": 9.486001968383789,
"learning_rate": 1.2875732358574033e-06,
"loss": 0.1227,
"step": 24850
},
{
"epoch": 3.8357748650732457,
"grad_norm": 8.058210372924805,
"learning_rate": 1.2794714338951675e-06,
"loss": 0.121,
"step": 24875
},
{
"epoch": 3.8396299151888975,
"grad_norm": 8.384005546569824,
"learning_rate": 1.2713914604495769e-06,
"loss": 0.1233,
"step": 24900
},
{
"epoch": 3.843484965304549,
"grad_norm": 11.536874771118164,
"learning_rate": 1.2633333629262184e-06,
"loss": 0.1337,
"step": 24925
},
{
"epoch": 3.8473400154202,
"grad_norm": 18.2574462890625,
"learning_rate": 1.255297188602328e-06,
"loss": 0.1084,
"step": 24950
},
{
"epoch": 3.851195065535852,
"grad_norm": 4.979963302612305,
"learning_rate": 1.24728298462652e-06,
"loss": 0.1,
"step": 24975
},
{
"epoch": 3.8550501156515034,
"grad_norm": 5.4192891120910645,
"learning_rate": 1.2392907980185087e-06,
"loss": 0.1501,
"step": 25000
},
{
"epoch": 3.8589051657671547,
"grad_norm": 9.014104843139648,
"learning_rate": 1.2313206756688283e-06,
"loss": 0.125,
"step": 25025
},
{
"epoch": 3.8627602158828065,
"grad_norm": 8.235286712646484,
"learning_rate": 1.2233726643385652e-06,
"loss": 0.1282,
"step": 25050
},
{
"epoch": 3.866615265998458,
"grad_norm": 4.127931118011475,
"learning_rate": 1.2154468106590734e-06,
"loss": 0.1131,
"step": 25075
},
{
"epoch": 3.8704703161141096,
"grad_norm": 7.632324695587158,
"learning_rate": 1.2075431611317124e-06,
"loss": 0.1088,
"step": 25100
},
{
"epoch": 3.874325366229761,
"grad_norm": 6.121814250946045,
"learning_rate": 1.199661762127568e-06,
"loss": 0.1107,
"step": 25125
},
{
"epoch": 3.8781804163454123,
"grad_norm": 6.5897135734558105,
"learning_rate": 1.1918026598871774e-06,
"loss": 0.12,
"step": 25150
},
{
"epoch": 3.882035466461064,
"grad_norm": 4.682891368865967,
"learning_rate": 1.1839659005202652e-06,
"loss": 0.1314,
"step": 25175
},
{
"epoch": 3.8858905165767155,
"grad_norm": 6.911609172821045,
"learning_rate": 1.1761515300054693e-06,
"loss": 0.1249,
"step": 25200
},
{
"epoch": 3.8897455666923673,
"grad_norm": 10.79582691192627,
"learning_rate": 1.1683595941900694e-06,
"loss": 0.1439,
"step": 25225
},
{
"epoch": 3.8936006168080186,
"grad_norm": 7.092615127563477,
"learning_rate": 1.1605901387897229e-06,
"loss": 0.1096,
"step": 25250
},
{
"epoch": 3.89745566692367,
"grad_norm": 7.009284496307373,
"learning_rate": 1.1528432093881869e-06,
"loss": 0.0982,
"step": 25275
},
{
"epoch": 3.9013107170393218,
"grad_norm": 6.315096855163574,
"learning_rate": 1.145118851437066e-06,
"loss": 0.1357,
"step": 25300
},
{
"epoch": 3.905165767154973,
"grad_norm": 9.677032470703125,
"learning_rate": 1.1374171102555292e-06,
"loss": 0.151,
"step": 25325
},
{
"epoch": 3.9090208172706244,
"grad_norm": 9.338630676269531,
"learning_rate": 1.1297380310300571e-06,
"loss": 0.1384,
"step": 25350
},
{
"epoch": 3.9128758673862762,
"grad_norm": 6.915759563446045,
"learning_rate": 1.1220816588141708e-06,
"loss": 0.1201,
"step": 25375
},
{
"epoch": 3.9167309175019276,
"grad_norm": 5.117621898651123,
"learning_rate": 1.1144480385281653e-06,
"loss": 0.1093,
"step": 25400
},
{
"epoch": 3.920585967617579,
"grad_norm": 7.7056427001953125,
"learning_rate": 1.106837214958852e-06,
"loss": 0.1029,
"step": 25425
},
{
"epoch": 3.9244410177332307,
"grad_norm": 5.538569450378418,
"learning_rate": 1.099249232759293e-06,
"loss": 0.0899,
"step": 25450
},
{
"epoch": 3.928296067848882,
"grad_norm": 7.2144694328308105,
"learning_rate": 1.0916841364485358e-06,
"loss": 0.1092,
"step": 25475
},
{
"epoch": 3.9321511179645334,
"grad_norm": 9.859286308288574,
"learning_rate": 1.084141970411358e-06,
"loss": 0.1259,
"step": 25500
},
{
"epoch": 3.936006168080185,
"grad_norm": 7.732922554016113,
"learning_rate": 1.0766227788980038e-06,
"loss": 0.1412,
"step": 25525
},
{
"epoch": 3.9398612181958366,
"grad_norm": 1.3876274824142456,
"learning_rate": 1.0691266060239253e-06,
"loss": 0.1301,
"step": 25550
},
{
"epoch": 3.943716268311488,
"grad_norm": 9.473686218261719,
"learning_rate": 1.061653495769523e-06,
"loss": 0.1518,
"step": 25575
},
{
"epoch": 3.9475713184271397,
"grad_norm": 6.305518627166748,
"learning_rate": 1.0542034919798848e-06,
"loss": 0.1221,
"step": 25600
},
{
"epoch": 3.951426368542791,
"grad_norm": 8.425362586975098,
"learning_rate": 1.0467766383645378e-06,
"loss": 0.1266,
"step": 25625
},
{
"epoch": 3.9552814186584424,
"grad_norm": 7.328820705413818,
"learning_rate": 1.039372978497179e-06,
"loss": 0.1635,
"step": 25650
},
{
"epoch": 3.959136468774094,
"grad_norm": 7.371628284454346,
"learning_rate": 1.031992555815432e-06,
"loss": 0.1263,
"step": 25675
},
{
"epoch": 3.9629915188897455,
"grad_norm": 7.828273296356201,
"learning_rate": 1.024635413620586e-06,
"loss": 0.1195,
"step": 25700
},
{
"epoch": 3.966846569005397,
"grad_norm": 9.037369728088379,
"learning_rate": 1.0173015950773391e-06,
"loss": 0.1118,
"step": 25725
},
{
"epoch": 3.9707016191210487,
"grad_norm": 9.443474769592285,
"learning_rate": 1.0099911432135512e-06,
"loss": 0.1288,
"step": 25750
},
{
"epoch": 3.9745566692367,
"grad_norm": 4.290574550628662,
"learning_rate": 1.002704100919991e-06,
"loss": 0.1065,
"step": 25775
},
{
"epoch": 3.9784117193523514,
"grad_norm": 6.752484321594238,
"learning_rate": 9.954405109500758e-07,
"loss": 0.0988,
"step": 25800
},
{
"epoch": 3.982266769468003,
"grad_norm": 3.236395835876465,
"learning_rate": 9.882004159196324e-07,
"loss": 0.1082,
"step": 25825
},
{
"epoch": 3.9861218195836545,
"grad_norm": 6.216182708740234,
"learning_rate": 9.809838583066394e-07,
"loss": 0.1338,
"step": 25850
},
{
"epoch": 3.989976869699306,
"grad_norm": 9.444483757019043,
"learning_rate": 9.737908804509822e-07,
"loss": 0.1118,
"step": 25875
},
{
"epoch": 3.9938319198149577,
"grad_norm": 9.531264305114746,
"learning_rate": 9.66621524554201e-07,
"loss": 0.119,
"step": 25900
},
{
"epoch": 3.997686969930609,
"grad_norm": 4.927842140197754,
"learning_rate": 9.59475832679243e-07,
"loss": 0.1171,
"step": 25925
},
{
"epoch": 4.00154202004626,
"grad_norm": 5.128291606903076,
"learning_rate": 9.523538467502224e-07,
"loss": 0.0911,
"step": 25950
},
{
"epoch": 4.005397070161912,
"grad_norm": 7.214539527893066,
"learning_rate": 9.452556085521647e-07,
"loss": 0.0553,
"step": 25975
},
{
"epoch": 4.009252120277564,
"grad_norm": 4.325976371765137,
"learning_rate": 9.381811597307683e-07,
"loss": 0.0648,
"step": 26000
},
{
"epoch": 4.013107170393215,
"grad_norm": 2.449784517288208,
"learning_rate": 9.311305417921607e-07,
"loss": 0.0671,
"step": 26025
},
{
"epoch": 4.016962220508867,
"grad_norm": 2.7883942127227783,
"learning_rate": 9.241037961026461e-07,
"loss": 0.0776,
"step": 26050
},
{
"epoch": 4.020817270624518,
"grad_norm": 6.782441139221191,
"learning_rate": 9.171009638884759e-07,
"loss": 0.0549,
"step": 26075
},
{
"epoch": 4.024672320740169,
"grad_norm": 2.380558967590332,
"learning_rate": 9.101220862355975e-07,
"loss": 0.0803,
"step": 26100
},
{
"epoch": 4.028527370855821,
"grad_norm": 3.5904738903045654,
"learning_rate": 9.031672040894112e-07,
"loss": 0.0389,
"step": 26125
},
{
"epoch": 4.032382420971473,
"grad_norm": 7.021317005157471,
"learning_rate": 8.962363582545447e-07,
"loss": 0.0646,
"step": 26150
},
{
"epoch": 4.036237471087124,
"grad_norm": 4.77400541305542,
"learning_rate": 8.89329589394593e-07,
"loss": 0.0403,
"step": 26175
},
{
"epoch": 4.040092521202776,
"grad_norm": 4.667660236358643,
"learning_rate": 8.824469380318967e-07,
"loss": 0.0359,
"step": 26200
},
{
"epoch": 4.043947571318427,
"grad_norm": 4.778940677642822,
"learning_rate": 8.755884445472973e-07,
"loss": 0.0537,
"step": 26225
},
{
"epoch": 4.047802621434078,
"grad_norm": 3.638836622238159,
"learning_rate": 8.687541491798967e-07,
"loss": 0.0352,
"step": 26250
},
{
"epoch": 4.05165767154973,
"grad_norm": 5.079896450042725,
"learning_rate": 8.619440920268307e-07,
"loss": 0.0571,
"step": 26275
},
{
"epoch": 4.055512721665382,
"grad_norm": 10.79236125946045,
"learning_rate": 8.551583130430241e-07,
"loss": 0.0506,
"step": 26300
},
{
"epoch": 4.059367771781033,
"grad_norm": 9.683244705200195,
"learning_rate": 8.483968520409636e-07,
"loss": 0.0458,
"step": 26325
},
{
"epoch": 4.063222821896685,
"grad_norm": 6.350785732269287,
"learning_rate": 8.416597486904609e-07,
"loss": 0.0459,
"step": 26350
},
{
"epoch": 4.067077872012336,
"grad_norm": 8.821796417236328,
"learning_rate": 8.349470425184164e-07,
"loss": 0.0448,
"step": 26375
},
{
"epoch": 4.070932922127987,
"grad_norm": 5.784400939941406,
"learning_rate": 8.282587729085955e-07,
"loss": 0.044,
"step": 26400
},
{
"epoch": 4.074787972243639,
"grad_norm": 2.338291645050049,
"learning_rate": 8.215949791013933e-07,
"loss": 0.0585,
"step": 26425
},
{
"epoch": 4.078643022359291,
"grad_norm": 7.46394681930542,
"learning_rate": 8.149557001935981e-07,
"loss": 0.0556,
"step": 26450
},
{
"epoch": 4.082498072474942,
"grad_norm": 7.44688606262207,
"learning_rate": 8.083409751381777e-07,
"loss": 0.0519,
"step": 26475
},
{
"epoch": 4.086353122590594,
"grad_norm": 3.8436038494110107,
"learning_rate": 8.017508427440318e-07,
"loss": 0.0699,
"step": 26500
},
{
"epoch": 4.090208172706245,
"grad_norm": 6.464486598968506,
"learning_rate": 7.95185341675781e-07,
"loss": 0.0492,
"step": 26525
},
{
"epoch": 4.094063222821896,
"grad_norm": 7.237420082092285,
"learning_rate": 7.886445104535289e-07,
"loss": 0.0455,
"step": 26550
},
{
"epoch": 4.097918272937548,
"grad_norm": 3.3453290462493896,
"learning_rate": 7.821283874526403e-07,
"loss": 0.061,
"step": 26575
},
{
"epoch": 4.1017733230532,
"grad_norm": 5.772636890411377,
"learning_rate": 7.756370109035177e-07,
"loss": 0.058,
"step": 26600
},
{
"epoch": 4.105628373168851,
"grad_norm": 10.061744689941406,
"learning_rate": 7.691704188913718e-07,
"loss": 0.0648,
"step": 26625
},
{
"epoch": 4.109483423284503,
"grad_norm": 6.11315393447876,
"learning_rate": 7.627286493560038e-07,
"loss": 0.0389,
"step": 26650
},
{
"epoch": 4.113338473400154,
"grad_norm": 8.67935848236084,
"learning_rate": 7.563117400915803e-07,
"loss": 0.0423,
"step": 26675
},
{
"epoch": 4.117193523515806,
"grad_norm": 8.060785293579102,
"learning_rate": 7.499197287464094e-07,
"loss": 0.0524,
"step": 26700
},
{
"epoch": 4.121048573631457,
"grad_norm": 2.4734082221984863,
"learning_rate": 7.435526528227238e-07,
"loss": 0.0532,
"step": 26725
},
{
"epoch": 4.124903623747109,
"grad_norm": 8.875064849853516,
"learning_rate": 7.372105496764597e-07,
"loss": 0.0423,
"step": 26750
},
{
"epoch": 4.128758673862761,
"grad_norm": 4.835430145263672,
"learning_rate": 7.308934565170322e-07,
"loss": 0.0434,
"step": 26775
},
{
"epoch": 4.1326137239784115,
"grad_norm": 7.1492180824279785,
"learning_rate": 7.246014104071292e-07,
"loss": 0.0515,
"step": 26800
},
{
"epoch": 4.136468774094063,
"grad_norm": 6.602258682250977,
"learning_rate": 7.183344482624788e-07,
"loss": 0.0507,
"step": 26825
},
{
"epoch": 4.140323824209715,
"grad_norm": 8.508008003234863,
"learning_rate": 7.120926068516443e-07,
"loss": 0.0702,
"step": 26850
},
{
"epoch": 4.144178874325366,
"grad_norm": 2.718414306640625,
"learning_rate": 7.058759227958057e-07,
"loss": 0.0513,
"step": 26875
},
{
"epoch": 4.148033924441018,
"grad_norm": 3.9830615520477295,
"learning_rate": 6.996844325685392e-07,
"loss": 0.0543,
"step": 26900
},
{
"epoch": 4.15188897455667,
"grad_norm": 8.627047538757324,
"learning_rate": 6.93518172495612e-07,
"loss": 0.0601,
"step": 26925
},
{
"epoch": 4.1557440246723205,
"grad_norm": 11.737249374389648,
"learning_rate": 6.873771787547612e-07,
"loss": 0.0629,
"step": 26950
},
{
"epoch": 4.159599074787972,
"grad_norm": 5.22245979309082,
"learning_rate": 6.81261487375487e-07,
"loss": 0.0604,
"step": 26975
},
{
"epoch": 4.163454124903624,
"grad_norm": 3.9162354469299316,
"learning_rate": 6.751711342388412e-07,
"loss": 0.0599,
"step": 27000
},
{
"epoch": 4.167309175019275,
"grad_norm": 4.3824381828308105,
"learning_rate": 6.69106155077211e-07,
"loss": 0.0478,
"step": 27025
},
{
"epoch": 4.171164225134927,
"grad_norm": 8.110997200012207,
"learning_rate": 6.630665854741159e-07,
"loss": 0.0535,
"step": 27050
},
{
"epoch": 4.175019275250579,
"grad_norm": 6.809072017669678,
"learning_rate": 6.570524608639956e-07,
"loss": 0.0491,
"step": 27075
},
{
"epoch": 4.1788743253662295,
"grad_norm": 6.785584926605225,
"learning_rate": 6.510638165320032e-07,
"loss": 0.0501,
"step": 27100
},
{
"epoch": 4.182729375481881,
"grad_norm": 6.9713826179504395,
"learning_rate": 6.451006876137989e-07,
"loss": 0.0491,
"step": 27125
},
{
"epoch": 4.186584425597533,
"grad_norm": 0.8903509974479675,
"learning_rate": 6.391631090953387e-07,
"loss": 0.0641,
"step": 27150
},
{
"epoch": 4.190439475713184,
"grad_norm": 5.400376319885254,
"learning_rate": 6.332511158126776e-07,
"loss": 0.0343,
"step": 27175
},
{
"epoch": 4.194294525828836,
"grad_norm": 0.9980877637863159,
"learning_rate": 6.273647424517592e-07,
"loss": 0.0497,
"step": 27200
},
{
"epoch": 4.198149575944488,
"grad_norm": 5.953235626220703,
"learning_rate": 6.215040235482134e-07,
"loss": 0.0568,
"step": 27225
},
{
"epoch": 4.2020046260601385,
"grad_norm": 3.7557125091552734,
"learning_rate": 6.156689934871552e-07,
"loss": 0.0385,
"step": 27250
},
{
"epoch": 4.20585967617579,
"grad_norm": 1.1404926776885986,
"learning_rate": 6.098596865029793e-07,
"loss": 0.0343,
"step": 27275
},
{
"epoch": 4.209714726291442,
"grad_norm": 5.386295318603516,
"learning_rate": 6.040761366791653e-07,
"loss": 0.049,
"step": 27300
},
{
"epoch": 4.213569776407093,
"grad_norm": 0.9487815499305725,
"learning_rate": 5.983183779480739e-07,
"loss": 0.0414,
"step": 27325
},
{
"epoch": 4.217424826522745,
"grad_norm": 4.312638282775879,
"learning_rate": 5.925864440907453e-07,
"loss": 0.0477,
"step": 27350
},
{
"epoch": 4.2212798766383965,
"grad_norm": 2.526244878768921,
"learning_rate": 5.868803687367064e-07,
"loss": 0.0508,
"step": 27375
},
{
"epoch": 4.2251349267540474,
"grad_norm": 1.8727853298187256,
"learning_rate": 5.812001853637711e-07,
"loss": 0.0347,
"step": 27400
},
{
"epoch": 4.228989976869699,
"grad_norm": 3.879274845123291,
"learning_rate": 5.755459272978431e-07,
"loss": 0.0581,
"step": 27425
},
{
"epoch": 4.232845026985351,
"grad_norm": 3.3455216884613037,
"learning_rate": 5.699176277127221e-07,
"loss": 0.0435,
"step": 27450
},
{
"epoch": 4.236700077101002,
"grad_norm": 3.564358949661255,
"learning_rate": 5.643153196299056e-07,
"loss": 0.0367,
"step": 27475
},
{
"epoch": 4.240555127216654,
"grad_norm": 11.249282836914062,
"learning_rate": 5.587390359183997e-07,
"loss": 0.0453,
"step": 27500
},
{
"epoch": 4.2444101773323055,
"grad_norm": 4.554286956787109,
"learning_rate": 5.531888092945265e-07,
"loss": 0.038,
"step": 27525
},
{
"epoch": 4.248265227447956,
"grad_norm": 3.6033146381378174,
"learning_rate": 5.476646723217244e-07,
"loss": 0.0411,
"step": 27550
},
{
"epoch": 4.252120277563608,
"grad_norm": 5.931132793426514,
"learning_rate": 5.421666574103674e-07,
"loss": 0.0471,
"step": 27575
},
{
"epoch": 4.25597532767926,
"grad_norm": 5.69351863861084,
"learning_rate": 5.366947968175673e-07,
"loss": 0.0446,
"step": 27600
},
{
"epoch": 4.259830377794911,
"grad_norm": 4.139880180358887,
"learning_rate": 5.312491226469891e-07,
"loss": 0.0495,
"step": 27625
},
{
"epoch": 4.263685427910563,
"grad_norm": 8.035470962524414,
"learning_rate": 5.258296668486607e-07,
"loss": 0.0619,
"step": 27650
},
{
"epoch": 4.2675404780262145,
"grad_norm": 2.8856546878814697,
"learning_rate": 5.204364612187828e-07,
"loss": 0.0488,
"step": 27675
},
{
"epoch": 4.271395528141866,
"grad_norm": 6.430315971374512,
"learning_rate": 5.150695373995496e-07,
"loss": 0.0442,
"step": 27700
},
{
"epoch": 4.275250578257517,
"grad_norm": 3.356950044631958,
"learning_rate": 5.097289268789552e-07,
"loss": 0.0281,
"step": 27725
},
{
"epoch": 4.279105628373169,
"grad_norm": 2.2608771324157715,
"learning_rate": 5.044146609906136e-07,
"loss": 0.0422,
"step": 27750
},
{
"epoch": 4.28296067848882,
"grad_norm": 1.8129510879516602,
"learning_rate": 4.991267709135749e-07,
"loss": 0.0453,
"step": 27775
},
{
"epoch": 4.286815728604472,
"grad_norm": 2.924614667892456,
"learning_rate": 4.938652876721378e-07,
"loss": 0.0402,
"step": 27800
},
{
"epoch": 4.2906707787201235,
"grad_norm": 3.2685673236846924,
"learning_rate": 4.886302421356732e-07,
"loss": 0.0682,
"step": 27825
},
{
"epoch": 4.294525828835775,
"grad_norm": 3.3606674671173096,
"learning_rate": 4.834216650184421e-07,
"loss": 0.0516,
"step": 27850
},
{
"epoch": 4.298380878951426,
"grad_norm": 1.944153070449829,
"learning_rate": 4.782395868794087e-07,
"loss": 0.0498,
"step": 27875
},
{
"epoch": 4.302235929067078,
"grad_norm": 3.226762056350708,
"learning_rate": 4.730840381220736e-07,
"loss": 0.0435,
"step": 27900
},
{
"epoch": 4.30609097918273,
"grad_norm": 4.163146495819092,
"learning_rate": 4.679550489942819e-07,
"loss": 0.0501,
"step": 27925
},
{
"epoch": 4.309946029298381,
"grad_norm": 6.545179843902588,
"learning_rate": 4.628526495880553e-07,
"loss": 0.0583,
"step": 27950
},
{
"epoch": 4.3138010794140325,
"grad_norm": 1.5406627655029297,
"learning_rate": 4.577768698394136e-07,
"loss": 0.0542,
"step": 27975
},
{
"epoch": 4.317656129529684,
"grad_norm": 2.5607690811157227,
"learning_rate": 4.5272773952819424e-07,
"loss": 0.0588,
"step": 28000
},
{
"epoch": 4.321511179645335,
"grad_norm": 6.036596298217773,
"learning_rate": 4.4770528827788317e-07,
"loss": 0.0544,
"step": 28025
},
{
"epoch": 4.325366229760987,
"grad_norm": 2.857830762863159,
"learning_rate": 4.4270954555543975e-07,
"loss": 0.0474,
"step": 28050
},
{
"epoch": 4.329221279876639,
"grad_norm": 0.8273962736129761,
"learning_rate": 4.3774054067112157e-07,
"loss": 0.0592,
"step": 28075
},
{
"epoch": 4.33307632999229,
"grad_norm": 2.6081111431121826,
"learning_rate": 4.327983027783161e-07,
"loss": 0.0569,
"step": 28100
},
{
"epoch": 4.336931380107941,
"grad_norm": 8.01623249053955,
"learning_rate": 4.278828608733643e-07,
"loss": 0.0599,
"step": 28125
},
{
"epoch": 4.340786430223593,
"grad_norm": 5.856451511383057,
"learning_rate": 4.22994243795396e-07,
"loss": 0.0543,
"step": 28150
},
{
"epoch": 4.344641480339244,
"grad_norm": 7.028669357299805,
"learning_rate": 4.181324802261605e-07,
"loss": 0.0562,
"step": 28175
},
{
"epoch": 4.348496530454896,
"grad_norm": 5.266681671142578,
"learning_rate": 4.132975986898513e-07,
"loss": 0.0489,
"step": 28200
},
{
"epoch": 4.352351580570548,
"grad_norm": 4.28916072845459,
"learning_rate": 4.084896275529482e-07,
"loss": 0.0385,
"step": 28225
},
{
"epoch": 4.356206630686199,
"grad_norm": 7.336819171905518,
"learning_rate": 4.0370859502404323e-07,
"loss": 0.0511,
"step": 28250
},
{
"epoch": 4.36006168080185,
"grad_norm": 4.3281779289245605,
"learning_rate": 3.989545291536812e-07,
"loss": 0.0368,
"step": 28275
},
{
"epoch": 4.363916730917502,
"grad_norm": 1.2710989713668823,
"learning_rate": 3.942274578341909e-07,
"loss": 0.0499,
"step": 28300
},
{
"epoch": 4.367771781033153,
"grad_norm": 2.805614709854126,
"learning_rate": 3.89527408799521e-07,
"loss": 0.0685,
"step": 28325
},
{
"epoch": 4.371626831148805,
"grad_norm": 3.1615355014801025,
"learning_rate": 3.848544096250828e-07,
"loss": 0.0546,
"step": 28350
},
{
"epoch": 4.375481881264457,
"grad_norm": 2.7767813205718994,
"learning_rate": 3.8020848772758246e-07,
"loss": 0.0439,
"step": 28375
},
{
"epoch": 4.379336931380108,
"grad_norm": 6.944403648376465,
"learning_rate": 3.755896703648626e-07,
"loss": 0.0491,
"step": 28400
},
{
"epoch": 4.383191981495759,
"grad_norm": 1.873370885848999,
"learning_rate": 3.709979846357442e-07,
"loss": 0.061,
"step": 28425
},
{
"epoch": 4.387047031611411,
"grad_norm": 4.76448917388916,
"learning_rate": 3.664334574798617e-07,
"loss": 0.0521,
"step": 28450
},
{
"epoch": 4.390902081727062,
"grad_norm": 3.3911397457122803,
"learning_rate": 3.618961156775125e-07,
"loss": 0.0647,
"step": 28475
},
{
"epoch": 4.394757131842714,
"grad_norm": 4.642607688903809,
"learning_rate": 3.573859858494955e-07,
"loss": 0.0428,
"step": 28500
},
{
"epoch": 4.398612181958366,
"grad_norm": 3.1891379356384277,
"learning_rate": 3.5290309445695394e-07,
"loss": 0.0398,
"step": 28525
},
{
"epoch": 4.402467232074017,
"grad_norm": 8.402615547180176,
"learning_rate": 3.484474678012251e-07,
"loss": 0.0529,
"step": 28550
},
{
"epoch": 4.406322282189668,
"grad_norm": 6.129350662231445,
"learning_rate": 3.4401913202367797e-07,
"loss": 0.0454,
"step": 28575
},
{
"epoch": 4.41017733230532,
"grad_norm": 5.72160005569458,
"learning_rate": 3.396181131055698e-07,
"loss": 0.0532,
"step": 28600
},
{
"epoch": 4.414032382420971,
"grad_norm": 2.8791778087615967,
"learning_rate": 3.3524443686788587e-07,
"loss": 0.041,
"step": 28625
},
{
"epoch": 4.417887432536623,
"grad_norm": 2.173323631286621,
"learning_rate": 3.3089812897118936e-07,
"loss": 0.0578,
"step": 28650
},
{
"epoch": 4.421742482652275,
"grad_norm": 2.1249160766601562,
"learning_rate": 3.265792149154762e-07,
"loss": 0.0391,
"step": 28675
},
{
"epoch": 4.425597532767926,
"grad_norm": 6.4937615394592285,
"learning_rate": 3.2228772004001765e-07,
"loss": 0.0571,
"step": 28700
},
{
"epoch": 4.429452582883577,
"grad_norm": 5.587518692016602,
"learning_rate": 3.180236695232164e-07,
"loss": 0.0469,
"step": 28725
},
{
"epoch": 4.433307632999229,
"grad_norm": 8.642132759094238,
"learning_rate": 3.1378708838245955e-07,
"loss": 0.0528,
"step": 28750
},
{
"epoch": 4.437162683114881,
"grad_norm": 6.956964015960693,
"learning_rate": 3.0957800147396634e-07,
"loss": 0.0514,
"step": 28775
},
{
"epoch": 4.441017733230532,
"grad_norm": 3.9507009983062744,
"learning_rate": 3.0539643349264956e-07,
"loss": 0.0444,
"step": 28800
},
{
"epoch": 4.444872783346184,
"grad_norm": 4.584397315979004,
"learning_rate": 3.012424089719662e-07,
"loss": 0.0626,
"step": 28825
},
{
"epoch": 4.4487278334618345,
"grad_norm": 19.1092472076416,
"learning_rate": 2.97115952283773e-07,
"loss": 0.0619,
"step": 28850
},
{
"epoch": 4.452582883577486,
"grad_norm": 5.3885297775268555,
"learning_rate": 2.930170876381877e-07,
"loss": 0.0439,
"step": 28875
},
{
"epoch": 4.456437933693138,
"grad_norm": 7.875032901763916,
"learning_rate": 2.889458390834404e-07,
"loss": 0.0427,
"step": 28900
},
{
"epoch": 4.46029298380879,
"grad_norm": 2.7519497871398926,
"learning_rate": 2.849022305057397e-07,
"loss": 0.0508,
"step": 28925
},
{
"epoch": 4.464148033924441,
"grad_norm": 6.233062267303467,
"learning_rate": 2.8088628562912837e-07,
"loss": 0.0632,
"step": 28950
},
{
"epoch": 4.468003084040093,
"grad_norm": 4.052979946136475,
"learning_rate": 2.768980280153427e-07,
"loss": 0.04,
"step": 28975
},
{
"epoch": 4.471858134155744,
"grad_norm": 13.528153419494629,
"learning_rate": 2.7293748106368034e-07,
"loss": 0.0458,
"step": 29000
},
{
"epoch": 4.475713184271395,
"grad_norm": 1.890171766281128,
"learning_rate": 2.6900466801085603e-07,
"loss": 0.0561,
"step": 29025
},
{
"epoch": 4.479568234387047,
"grad_norm": 2.2191109657287598,
"learning_rate": 2.650996119308702e-07,
"loss": 0.0515,
"step": 29050
},
{
"epoch": 4.483423284502699,
"grad_norm": 8.668838500976562,
"learning_rate": 2.6122233573487086e-07,
"loss": 0.0513,
"step": 29075
},
{
"epoch": 4.48727833461835,
"grad_norm": 3.5857250690460205,
"learning_rate": 2.5737286217101975e-07,
"loss": 0.0664,
"step": 29100
},
{
"epoch": 4.491133384734002,
"grad_norm": 1.4956778287887573,
"learning_rate": 2.535512138243601e-07,
"loss": 0.0414,
"step": 29125
},
{
"epoch": 4.494988434849653,
"grad_norm": 3.5447378158569336,
"learning_rate": 2.497574131166841e-07,
"loss": 0.0414,
"step": 29150
},
{
"epoch": 4.498843484965304,
"grad_norm": 7.222159385681152,
"learning_rate": 2.459914823063986e-07,
"loss": 0.0491,
"step": 29175
},
{
"epoch": 4.502698535080956,
"grad_norm": 3.686474323272705,
"learning_rate": 2.4225344348839775e-07,
"loss": 0.0441,
"step": 29200
},
{
"epoch": 4.506553585196608,
"grad_norm": 8.481548309326172,
"learning_rate": 2.3854331859393064e-07,
"loss": 0.05,
"step": 29225
},
{
"epoch": 4.510408635312259,
"grad_norm": 8.785652160644531,
"learning_rate": 2.3486112939047623e-07,
"loss": 0.0608,
"step": 29250
},
{
"epoch": 4.514263685427911,
"grad_norm": 5.748463153839111,
"learning_rate": 2.3120689748161175e-07,
"loss": 0.0323,
"step": 29275
},
{
"epoch": 4.518118735543562,
"grad_norm": 8.590340614318848,
"learning_rate": 2.275806443068884e-07,
"loss": 0.0579,
"step": 29300
},
{
"epoch": 4.521973785659213,
"grad_norm": 5.212619304656982,
"learning_rate": 2.239823911417055e-07,
"loss": 0.0367,
"step": 29325
},
{
"epoch": 4.525828835774865,
"grad_norm": 3.7377119064331055,
"learning_rate": 2.2041215909718305e-07,
"loss": 0.0534,
"step": 29350
},
{
"epoch": 4.529683885890517,
"grad_norm": 2.107555866241455,
"learning_rate": 2.1686996912004098e-07,
"loss": 0.0381,
"step": 29375
},
{
"epoch": 4.533538936006168,
"grad_norm": 7.098761558532715,
"learning_rate": 2.1335584199247584e-07,
"loss": 0.0417,
"step": 29400
},
{
"epoch": 4.5373939861218195,
"grad_norm": 1.0062240362167358,
"learning_rate": 2.098697983320358e-07,
"loss": 0.0615,
"step": 29425
},
{
"epoch": 4.541249036237471,
"grad_norm": 6.572259902954102,
"learning_rate": 2.064118585915048e-07,
"loss": 0.0693,
"step": 29450
},
{
"epoch": 4.545104086353122,
"grad_norm": 7.6246819496154785,
"learning_rate": 2.0298204305877867e-07,
"loss": 0.0479,
"step": 29475
},
{
"epoch": 4.548959136468774,
"grad_norm": 10.483940124511719,
"learning_rate": 1.9958037185674517e-07,
"loss": 0.0522,
"step": 29500
},
{
"epoch": 4.552814186584426,
"grad_norm": 4.1152873039245605,
"learning_rate": 1.9620686494317252e-07,
"loss": 0.0598,
"step": 29525
},
{
"epoch": 4.556669236700077,
"grad_norm": 4.319024562835693,
"learning_rate": 1.9286154211058227e-07,
"loss": 0.0361,
"step": 29550
},
{
"epoch": 4.5605242868157285,
"grad_norm": 2.4022743701934814,
"learning_rate": 1.8954442298614206e-07,
"loss": 0.0286,
"step": 29575
},
{
"epoch": 4.56437933693138,
"grad_norm": 3.555640935897827,
"learning_rate": 1.8625552703154748e-07,
"loss": 0.0465,
"step": 29600
},
{
"epoch": 4.568234387047031,
"grad_norm": 2.663213014602661,
"learning_rate": 1.8299487354290491e-07,
"loss": 0.0335,
"step": 29625
},
{
"epoch": 4.572089437162683,
"grad_norm": 7.092471599578857,
"learning_rate": 1.7976248165062325e-07,
"loss": 0.0545,
"step": 29650
},
{
"epoch": 4.575944487278335,
"grad_norm": 2.873157024383545,
"learning_rate": 1.7655837031929802e-07,
"loss": 0.0478,
"step": 29675
},
{
"epoch": 4.579799537393987,
"grad_norm": 5.116678714752197,
"learning_rate": 1.7338255834760064e-07,
"loss": 0.0534,
"step": 29700
},
{
"epoch": 4.5836545875096375,
"grad_norm": 5.352801322937012,
"learning_rate": 1.7023506436817106e-07,
"loss": 0.0472,
"step": 29725
},
{
"epoch": 4.587509637625289,
"grad_norm": 7.377298355102539,
"learning_rate": 1.6711590684750422e-07,
"loss": 0.0362,
"step": 29750
},
{
"epoch": 4.59136468774094,
"grad_norm": 3.2186317443847656,
"learning_rate": 1.6402510408584427e-07,
"loss": 0.0444,
"step": 29775
},
{
"epoch": 4.595219737856592,
"grad_norm": 4.758965492248535,
"learning_rate": 1.6096267421707834e-07,
"loss": 0.0512,
"step": 29800
},
{
"epoch": 4.599074787972244,
"grad_norm": 3.9415712356567383,
"learning_rate": 1.5792863520862457e-07,
"loss": 0.0569,
"step": 29825
},
{
"epoch": 4.602929838087896,
"grad_norm": 4.901118755340576,
"learning_rate": 1.5492300486133537e-07,
"loss": 0.0372,
"step": 29850
},
{
"epoch": 4.6067848882035465,
"grad_norm": 2.336989164352417,
"learning_rate": 1.5194580080938436e-07,
"loss": 0.044,
"step": 29875
},
{
"epoch": 4.610639938319198,
"grad_norm": 1.7713156938552856,
"learning_rate": 1.4899704052016794e-07,
"loss": 0.0385,
"step": 29900
},
{
"epoch": 4.614494988434849,
"grad_norm": 5.644339084625244,
"learning_rate": 1.4607674129420269e-07,
"loss": 0.0497,
"step": 29925
},
{
"epoch": 4.618350038550501,
"grad_norm": 4.766188144683838,
"learning_rate": 1.4318492026502152e-07,
"loss": 0.0475,
"step": 29950
},
{
"epoch": 4.622205088666153,
"grad_norm": 3.597555160522461,
"learning_rate": 1.40321594399076e-07,
"loss": 0.0446,
"step": 29975
},
{
"epoch": 4.6260601387818046,
"grad_norm": 3.8907299041748047,
"learning_rate": 1.3748678049563258e-07,
"loss": 0.0632,
"step": 30000
},
{
"epoch": 4.6299151888974555,
"grad_norm": 10.890676498413086,
"learning_rate": 1.3468049518667868e-07,
"loss": 0.0532,
"step": 30025
},
{
"epoch": 4.633770239013107,
"grad_norm": 3.604957342147827,
"learning_rate": 1.319027549368229e-07,
"loss": 0.048,
"step": 30050
},
{
"epoch": 4.637625289128759,
"grad_norm": 3.338784694671631,
"learning_rate": 1.2915357604319777e-07,
"loss": 0.032,
"step": 30075
},
{
"epoch": 4.64148033924441,
"grad_norm": 8.31986141204834,
"learning_rate": 1.2643297463536597e-07,
"loss": 0.0428,
"step": 30100
},
{
"epoch": 4.645335389360062,
"grad_norm": 6.411100387573242,
"learning_rate": 1.2374096667522484e-07,
"loss": 0.041,
"step": 30125
},
{
"epoch": 4.6491904394757135,
"grad_norm": 9.488824844360352,
"learning_rate": 1.2107756795691095e-07,
"loss": 0.0577,
"step": 30150
},
{
"epoch": 4.653045489591364,
"grad_norm": 2.1697256565093994,
"learning_rate": 1.1844279410671178e-07,
"loss": 0.0447,
"step": 30175
},
{
"epoch": 4.656900539707016,
"grad_norm": 5.731645107269287,
"learning_rate": 1.1583666058296805e-07,
"loss": 0.0405,
"step": 30200
},
{
"epoch": 4.660755589822668,
"grad_norm": 11.044076919555664,
"learning_rate": 1.1325918267598879e-07,
"loss": 0.0589,
"step": 30225
},
{
"epoch": 4.664610639938319,
"grad_norm": 1.4008193016052246,
"learning_rate": 1.1071037550795916e-07,
"loss": 0.0437,
"step": 30250
},
{
"epoch": 4.668465690053971,
"grad_norm": 6.674703121185303,
"learning_rate": 1.0819025403284999e-07,
"loss": 0.0547,
"step": 30275
},
{
"epoch": 4.6723207401696225,
"grad_norm": 4.803305625915527,
"learning_rate": 1.0569883303633455e-07,
"loss": 0.0441,
"step": 30300
},
{
"epoch": 4.676175790285273,
"grad_norm": 2.8617444038391113,
"learning_rate": 1.0323612713569575e-07,
"loss": 0.0513,
"step": 30325
},
{
"epoch": 4.680030840400925,
"grad_norm": 4.508880138397217,
"learning_rate": 1.0080215077974575e-07,
"loss": 0.0466,
"step": 30350
},
{
"epoch": 4.683885890516577,
"grad_norm": 1.8428984880447388,
"learning_rate": 9.839691824873875e-08,
"loss": 0.0541,
"step": 30375
},
{
"epoch": 4.687740940632228,
"grad_norm": 5.859665393829346,
"learning_rate": 9.602044365428776e-08,
"loss": 0.0513,
"step": 30400
},
{
"epoch": 4.69159599074788,
"grad_norm": 5.51134729385376,
"learning_rate": 9.367274093928125e-08,
"loss": 0.0498,
"step": 30425
},
{
"epoch": 4.6954510408635315,
"grad_norm": 1.2680703401565552,
"learning_rate": 9.135382387780168e-08,
"loss": 0.0451,
"step": 30450
},
{
"epoch": 4.699306090979182,
"grad_norm": 2.4471595287323,
"learning_rate": 8.906370607504433e-08,
"loss": 0.058,
"step": 30475
},
{
"epoch": 4.703161141094834,
"grad_norm": 10.326664924621582,
"learning_rate": 8.680240096723969e-08,
"loss": 0.0369,
"step": 30500
},
{
"epoch": 4.707016191210486,
"grad_norm": 4.283406734466553,
"learning_rate": 8.456992182157065e-08,
"loss": 0.0422,
"step": 30525
},
{
"epoch": 4.710871241326137,
"grad_norm": 3.263824701309204,
"learning_rate": 8.236628173609762e-08,
"loss": 0.0374,
"step": 30550
},
{
"epoch": 4.714726291441789,
"grad_norm": 2.9537744522094727,
"learning_rate": 8.019149363968081e-08,
"loss": 0.0449,
"step": 30575
},
{
"epoch": 4.7185813415574405,
"grad_norm": 7.638341426849365,
"learning_rate": 7.804557029190584e-08,
"loss": 0.0513,
"step": 30600
},
{
"epoch": 4.722436391673091,
"grad_norm": 5.652314186096191,
"learning_rate": 7.59285242830049e-08,
"loss": 0.0424,
"step": 30625
},
{
"epoch": 4.726291441788743,
"grad_norm": 10.834895133972168,
"learning_rate": 7.384036803378735e-08,
"loss": 0.0653,
"step": 30650
},
{
"epoch": 4.730146491904395,
"grad_norm": 5.6966552734375,
"learning_rate": 7.17811137955643e-08,
"loss": 0.0583,
"step": 30675
},
{
"epoch": 4.734001542020046,
"grad_norm": 6.812707424163818,
"learning_rate": 6.975077365007799e-08,
"loss": 0.0607,
"step": 30700
},
{
"epoch": 4.737856592135698,
"grad_norm": 3.0957865715026855,
"learning_rate": 6.774935950942918e-08,
"loss": 0.0497,
"step": 30725
},
{
"epoch": 4.7417116422513494,
"grad_norm": 1.1080137491226196,
"learning_rate": 6.577688311600883e-08,
"loss": 0.0391,
"step": 30750
},
{
"epoch": 4.745566692367001,
"grad_norm": 5.256198883056641,
"learning_rate": 6.383335604243035e-08,
"loss": 0.0562,
"step": 30775
},
{
"epoch": 4.749421742482652,
"grad_norm": 6.855607986450195,
"learning_rate": 6.191878969145748e-08,
"loss": 0.0494,
"step": 30800
},
{
"epoch": 4.753276792598304,
"grad_norm": 3.2536323070526123,
"learning_rate": 6.003319529594209e-08,
"loss": 0.0423,
"step": 30825
},
{
"epoch": 4.757131842713955,
"grad_norm": 4.973302364349365,
"learning_rate": 5.81765839187548e-08,
"loss": 0.0382,
"step": 30850
},
{
"epoch": 4.760986892829607,
"grad_norm": 3.9073121547698975,
"learning_rate": 5.634896645272281e-08,
"loss": 0.0594,
"step": 30875
},
{
"epoch": 4.764841942945258,
"grad_norm": 1.932100534439087,
"learning_rate": 5.4550353620563825e-08,
"loss": 0.0398,
"step": 30900
},
{
"epoch": 4.76869699306091,
"grad_norm": 10.511844635009766,
"learning_rate": 5.278075597482391e-08,
"loss": 0.0573,
"step": 30925
},
{
"epoch": 4.772552043176561,
"grad_norm": 5.501337051391602,
"learning_rate": 5.1040183897816954e-08,
"loss": 0.0535,
"step": 30950
},
{
"epoch": 4.776407093292213,
"grad_norm": 5.393566608428955,
"learning_rate": 4.9328647601559756e-08,
"loss": 0.0391,
"step": 30975
},
{
"epoch": 4.780262143407864,
"grad_norm": 4.601632595062256,
"learning_rate": 4.764615712771758e-08,
"loss": 0.0455,
"step": 31000
},
{
"epoch": 4.784117193523516,
"grad_norm": 3.017916440963745,
"learning_rate": 4.599272234754204e-08,
"loss": 0.0549,
"step": 31025
},
{
"epoch": 4.787972243639167,
"grad_norm": 11.536879539489746,
"learning_rate": 4.436835296181163e-08,
"loss": 0.0528,
"step": 31050
},
{
"epoch": 4.791827293754819,
"grad_norm": 10.335283279418945,
"learning_rate": 4.277305850077906e-08,
"loss": 0.0679,
"step": 31075
},
{
"epoch": 4.79568234387047,
"grad_norm": 1.792244791984558,
"learning_rate": 4.1206848324111815e-08,
"loss": 0.0415,
"step": 31100
},
{
"epoch": 4.799537393986122,
"grad_norm": 7.723292827606201,
"learning_rate": 3.966973162083887e-08,
"loss": 0.0525,
"step": 31125
},
{
"epoch": 4.803392444101774,
"grad_norm": 10.610712051391602,
"learning_rate": 3.816171740929686e-08,
"loss": 0.0451,
"step": 31150
},
{
"epoch": 4.807247494217425,
"grad_norm": 5.772136688232422,
"learning_rate": 3.668281453707567e-08,
"loss": 0.0469,
"step": 31175
},
{
"epoch": 4.811102544333076,
"grad_norm": 4.91958475112915,
"learning_rate": 3.5233031680969585e-08,
"loss": 0.0512,
"step": 31200
},
{
"epoch": 4.814957594448728,
"grad_norm": 3.47731351852417,
"learning_rate": 3.381237734692122e-08,
"loss": 0.0448,
"step": 31225
},
{
"epoch": 4.818812644564379,
"grad_norm": 4.344484329223633,
"learning_rate": 3.2420859869977674e-08,
"loss": 0.0603,
"step": 31250
},
{
"epoch": 4.822667694680031,
"grad_norm": 1.0801258087158203,
"learning_rate": 3.105848741423778e-08,
"loss": 0.0416,
"step": 31275
},
{
"epoch": 4.826522744795683,
"grad_norm": 4.1883745193481445,
"learning_rate": 2.972526797280384e-08,
"loss": 0.0428,
"step": 31300
},
{
"epoch": 4.830377794911334,
"grad_norm": 7.557757377624512,
"learning_rate": 2.8421209367738845e-08,
"loss": 0.053,
"step": 31325
},
{
"epoch": 4.834232845026985,
"grad_norm": 3.2704110145568848,
"learning_rate": 2.7146319250014873e-08,
"loss": 0.0439,
"step": 31350
},
{
"epoch": 4.838087895142637,
"grad_norm": 4.630594730377197,
"learning_rate": 2.590060509947312e-08,
"loss": 0.0451,
"step": 31375
},
{
"epoch": 4.841942945258288,
"grad_norm": 4.2274980545043945,
"learning_rate": 2.4684074224776698e-08,
"loss": 0.0413,
"step": 31400
},
{
"epoch": 4.84579799537394,
"grad_norm": 3.1766464710235596,
"learning_rate": 2.3496733763370695e-08,
"loss": 0.0415,
"step": 31425
},
{
"epoch": 4.849653045489592,
"grad_norm": 3.7895710468292236,
"learning_rate": 2.2338590681436068e-08,
"loss": 0.04,
"step": 31450
},
{
"epoch": 4.8535080956052425,
"grad_norm": 5.877419471740723,
"learning_rate": 2.12096517738547e-08,
"loss": 0.0456,
"step": 31475
},
{
"epoch": 4.857363145720894,
"grad_norm": 3.9661877155303955,
"learning_rate": 2.0109923664162757e-08,
"loss": 0.0449,
"step": 31500
},
{
"epoch": 4.861218195836546,
"grad_norm": 5.422728538513184,
"learning_rate": 1.903941280451793e-08,
"loss": 0.0375,
"step": 31525
},
{
"epoch": 4.865073245952197,
"grad_norm": 5.891030311584473,
"learning_rate": 1.7998125475657824e-08,
"loss": 0.0619,
"step": 31550
},
{
"epoch": 4.868928296067849,
"grad_norm": 4.016377925872803,
"learning_rate": 1.6986067786863848e-08,
"loss": 0.0437,
"step": 31575
},
{
"epoch": 4.872783346183501,
"grad_norm": 1.3693541288375854,
"learning_rate": 1.6003245675926816e-08,
"loss": 0.0406,
"step": 31600
},
{
"epoch": 4.8766383962991515,
"grad_norm": 3.7623348236083984,
"learning_rate": 1.5049664909110306e-08,
"loss": 0.0512,
"step": 31625
},
{
"epoch": 4.880493446414803,
"grad_norm": 4.602240085601807,
"learning_rate": 1.4125331081117355e-08,
"loss": 0.0575,
"step": 31650
},
{
"epoch": 4.884348496530455,
"grad_norm": 4.906894683837891,
"learning_rate": 1.323024961505881e-08,
"loss": 0.0429,
"step": 31675
},
{
"epoch": 4.888203546646106,
"grad_norm": 8.693130493164062,
"learning_rate": 1.2364425762418919e-08,
"loss": 0.0345,
"step": 31700
},
{
"epoch": 4.892058596761758,
"grad_norm": 4.49963903427124,
"learning_rate": 1.1527864603027573e-08,
"loss": 0.065,
"step": 31725
},
{
"epoch": 4.89591364687741,
"grad_norm": 9.294318199157715,
"learning_rate": 1.072057104502866e-08,
"loss": 0.0378,
"step": 31750
},
{
"epoch": 4.8997686969930605,
"grad_norm": 7.211676120758057,
"learning_rate": 9.942549824851211e-09,
"loss": 0.0395,
"step": 31775
},
{
"epoch": 4.903623747108712,
"grad_norm": 5.318414211273193,
"learning_rate": 9.193805507183295e-09,
"loss": 0.0394,
"step": 31800
},
{
"epoch": 4.907478797224364,
"grad_norm": 3.5780348777770996,
"learning_rate": 8.474342484942056e-09,
"loss": 0.0589,
"step": 31825
},
{
"epoch": 4.911333847340016,
"grad_norm": 2.642815113067627,
"learning_rate": 7.784164979251496e-09,
"loss": 0.0724,
"step": 31850
},
{
"epoch": 4.915188897455667,
"grad_norm": 11.0081205368042,
"learning_rate": 7.123277039415844e-09,
"loss": 0.0486,
"step": 31875
},
{
"epoch": 4.919043947571319,
"grad_norm": 3.9760489463806152,
"learning_rate": 6.491682542895672e-09,
"loss": 0.045,
"step": 31900
},
{
"epoch": 4.9228989976869695,
"grad_norm": 8.580735206604004,
"learning_rate": 5.889385195285147e-09,
"loss": 0.0531,
"step": 31925
},
{
"epoch": 4.926754047802621,
"grad_norm": 2.2585766315460205,
"learning_rate": 5.316388530292038e-09,
"loss": 0.0406,
"step": 31950
},
{
"epoch": 4.930609097918273,
"grad_norm": 7.158880710601807,
"learning_rate": 4.772695909714409e-09,
"loss": 0.0508,
"step": 31975
},
{
"epoch": 4.934464148033925,
"grad_norm": 1.2747946977615356,
"learning_rate": 4.258310523422293e-09,
"loss": 0.0497,
"step": 32000
},
{
"epoch": 4.938319198149576,
"grad_norm": 6.368330955505371,
"learning_rate": 3.7732353893393805e-09,
"loss": 0.0496,
"step": 32025
},
{
"epoch": 4.9421742482652276,
"grad_norm": 3.998837471008301,
"learning_rate": 3.317473353424139e-09,
"loss": 0.035,
"step": 32050
},
{
"epoch": 4.9460292983808785,
"grad_norm": 3.9956557750701904,
"learning_rate": 2.8910270896548297e-09,
"loss": 0.0388,
"step": 32075
},
{
"epoch": 4.94988434849653,
"grad_norm": 9.725882530212402,
"learning_rate": 2.4938991000100775e-09,
"loss": 0.0498,
"step": 32100
},
{
"epoch": 4.953739398612182,
"grad_norm": 2.3593921661376953,
"learning_rate": 2.126091714459988e-09,
"loss": 0.0659,
"step": 32125
},
{
"epoch": 4.957594448727834,
"grad_norm": 2.1392416954040527,
"learning_rate": 1.7876070909472743e-09,
"loss": 0.0406,
"step": 32150
},
{
"epoch": 4.961449498843485,
"grad_norm": 9.84459114074707,
"learning_rate": 1.4784472153778206e-09,
"loss": 0.0506,
"step": 32175
},
{
"epoch": 4.9653045489591365,
"grad_norm": 6.583609580993652,
"learning_rate": 1.1986139016062492e-09,
"loss": 0.0464,
"step": 32200
},
{
"epoch": 4.969159599074788,
"grad_norm": 7.535484313964844,
"learning_rate": 9.481087914281484e-10,
"loss": 0.048,
"step": 32225
},
{
"epoch": 4.973014649190439,
"grad_norm": 9.076619148254395,
"learning_rate": 7.269333545689705e-10,
"loss": 0.0405,
"step": 32250
},
{
"epoch": 4.976869699306091,
"grad_norm": 1.754348635673523,
"learning_rate": 5.350888886751504e-10,
"loss": 0.0461,
"step": 32275
},
{
"epoch": 4.980724749421743,
"grad_norm": 2.8099377155303955,
"learning_rate": 3.725765193074438e-10,
"loss": 0.0404,
"step": 32300
},
{
"epoch": 4.984579799537394,
"grad_norm": 7.887497425079346,
"learning_rate": 2.3939719993426593e-10,
"loss": 0.0434,
"step": 32325
},
{
"epoch": 4.9884348496530455,
"grad_norm": 4.467358589172363,
"learning_rate": 1.3555171192392024e-10,
"loss": 0.0603,
"step": 32350
},
{
"epoch": 4.992289899768697,
"grad_norm": 5.653683662414551,
"learning_rate": 6.104066454293289e-11,
"loss": 0.0475,
"step": 32375
},
{
"epoch": 4.996144949884348,
"grad_norm": 7.203802108764648,
"learning_rate": 1.5864494951611796e-11,
"loss": 0.0438,
"step": 32400
},
{
"epoch": 5.0,
"grad_norm": 7.300978660583496,
"learning_rate": 2.3468200160969847e-14,
"loss": 0.0481,
"step": 32425
}
],
"logging_steps": 25,
"max_steps": 32425,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 50000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.773942584016896e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}