{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.988593155893536,
"eval_steps": 500,
"global_step": 1970,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025348542458808617,
"grad_norm": 1.1835554838180542,
"learning_rate": 0.0,
"loss": 2.7162,
"step": 1
},
{
"epoch": 0.005069708491761723,
"grad_norm": 1.1406067609786987,
"learning_rate": 4e-05,
"loss": 2.7021,
"step": 2
},
{
"epoch": 0.0076045627376425855,
"grad_norm": 1.1929512023925781,
"learning_rate": 8e-05,
"loss": 2.5728,
"step": 3
},
{
"epoch": 0.010139416983523447,
"grad_norm": 1.523325800895691,
"learning_rate": 0.00012,
"loss": 2.5825,
"step": 4
},
{
"epoch": 0.012674271229404309,
"grad_norm": 1.712708592414856,
"learning_rate": 0.00016,
"loss": 2.1986,
"step": 5
},
{
"epoch": 0.015209125475285171,
"grad_norm": 1.263485312461853,
"learning_rate": 0.0002,
"loss": 2.1478,
"step": 6
},
{
"epoch": 0.017743979721166033,
"grad_norm": 1.2837083339691162,
"learning_rate": 0.00019989821882951655,
"loss": 2.2153,
"step": 7
},
{
"epoch": 0.020278833967046894,
"grad_norm": 1.0831111669540405,
"learning_rate": 0.0001997964376590331,
"loss": 1.9272,
"step": 8
},
{
"epoch": 0.022813688212927757,
"grad_norm": 0.7921498417854309,
"learning_rate": 0.00019969465648854963,
"loss": 1.4929,
"step": 9
},
{
"epoch": 0.025348542458808618,
"grad_norm": 0.9243067502975464,
"learning_rate": 0.00019959287531806617,
"loss": 1.4312,
"step": 10
},
{
"epoch": 0.02788339670468948,
"grad_norm": 1.2378944158554077,
"learning_rate": 0.0001994910941475827,
"loss": 1.1605,
"step": 11
},
{
"epoch": 0.030418250950570342,
"grad_norm": 1.401106834411621,
"learning_rate": 0.00019938931297709925,
"loss": 1.0236,
"step": 12
},
{
"epoch": 0.032953105196451206,
"grad_norm": 1.0503413677215576,
"learning_rate": 0.00019928753180661578,
"loss": 0.8441,
"step": 13
},
{
"epoch": 0.035487959442332066,
"grad_norm": 0.928716778755188,
"learning_rate": 0.00019918575063613232,
"loss": 0.8098,
"step": 14
},
{
"epoch": 0.03802281368821293,
"grad_norm": 0.6546494364738464,
"learning_rate": 0.00019908396946564886,
"loss": 0.5083,
"step": 15
},
{
"epoch": 0.04055766793409379,
"grad_norm": 0.8399775624275208,
"learning_rate": 0.0001989821882951654,
"loss": 0.5798,
"step": 16
},
{
"epoch": 0.043092522179974654,
"grad_norm": 0.6111662983894348,
"learning_rate": 0.00019888040712468194,
"loss": 0.471,
"step": 17
},
{
"epoch": 0.045627376425855515,
"grad_norm": 0.6786199808120728,
"learning_rate": 0.00019877862595419848,
"loss": 0.5124,
"step": 18
},
{
"epoch": 0.048162230671736375,
"grad_norm": 0.7001961469650269,
"learning_rate": 0.00019867684478371502,
"loss": 0.5764,
"step": 19
},
{
"epoch": 0.050697084917617236,
"grad_norm": 0.5670634508132935,
"learning_rate": 0.00019857506361323156,
"loss": 0.5595,
"step": 20
},
{
"epoch": 0.053231939163498096,
"grad_norm": 0.6825580596923828,
"learning_rate": 0.0001984732824427481,
"loss": 0.6601,
"step": 21
},
{
"epoch": 0.05576679340937896,
"grad_norm": 0.5777536630630493,
"learning_rate": 0.00019837150127226464,
"loss": 0.6232,
"step": 22
},
{
"epoch": 0.058301647655259824,
"grad_norm": 0.7791958451271057,
"learning_rate": 0.00019826972010178118,
"loss": 0.4741,
"step": 23
},
{
"epoch": 0.060836501901140684,
"grad_norm": 0.7647196054458618,
"learning_rate": 0.00019816793893129772,
"loss": 0.574,
"step": 24
},
{
"epoch": 0.06337135614702155,
"grad_norm": 0.6175855398178101,
"learning_rate": 0.00019806615776081426,
"loss": 0.6792,
"step": 25
},
{
"epoch": 0.06590621039290241,
"grad_norm": 0.7071298360824585,
"learning_rate": 0.0001979643765903308,
"loss": 0.6333,
"step": 26
},
{
"epoch": 0.06844106463878327,
"grad_norm": 0.7675352692604065,
"learning_rate": 0.00019786259541984734,
"loss": 0.5004,
"step": 27
},
{
"epoch": 0.07097591888466413,
"grad_norm": 0.6224766969680786,
"learning_rate": 0.00019776081424936387,
"loss": 0.5649,
"step": 28
},
{
"epoch": 0.07351077313054499,
"grad_norm": 0.6023550629615784,
"learning_rate": 0.00019765903307888041,
"loss": 0.4004,
"step": 29
},
{
"epoch": 0.07604562737642585,
"grad_norm": 0.6253474354743958,
"learning_rate": 0.00019755725190839695,
"loss": 0.548,
"step": 30
},
{
"epoch": 0.07858048162230671,
"grad_norm": 0.43560266494750977,
"learning_rate": 0.00019745547073791352,
"loss": 0.4721,
"step": 31
},
{
"epoch": 0.08111533586818757,
"grad_norm": 0.6321932077407837,
"learning_rate": 0.00019735368956743003,
"loss": 0.4671,
"step": 32
},
{
"epoch": 0.08365019011406843,
"grad_norm": 0.41977155208587646,
"learning_rate": 0.00019725190839694657,
"loss": 0.3716,
"step": 33
},
{
"epoch": 0.08618504435994931,
"grad_norm": 0.4449223279953003,
"learning_rate": 0.0001971501272264631,
"loss": 0.6045,
"step": 34
},
{
"epoch": 0.08871989860583017,
"grad_norm": 0.5593668222427368,
"learning_rate": 0.00019704834605597965,
"loss": 0.3789,
"step": 35
},
{
"epoch": 0.09125475285171103,
"grad_norm": 0.4293775260448456,
"learning_rate": 0.0001969465648854962,
"loss": 0.3834,
"step": 36
},
{
"epoch": 0.09378960709759189,
"grad_norm": 0.49535441398620605,
"learning_rate": 0.00019684478371501273,
"loss": 0.5504,
"step": 37
},
{
"epoch": 0.09632446134347275,
"grad_norm": 0.4620949625968933,
"learning_rate": 0.00019674300254452927,
"loss": 0.3212,
"step": 38
},
{
"epoch": 0.09885931558935361,
"grad_norm": 0.46665605902671814,
"learning_rate": 0.0001966412213740458,
"loss": 0.4868,
"step": 39
},
{
"epoch": 0.10139416983523447,
"grad_norm": 0.4120428264141083,
"learning_rate": 0.00019653944020356235,
"loss": 0.4926,
"step": 40
},
{
"epoch": 0.10392902408111533,
"grad_norm": 0.41570335626602173,
"learning_rate": 0.00019643765903307889,
"loss": 0.5068,
"step": 41
},
{
"epoch": 0.10646387832699619,
"grad_norm": 0.4141896665096283,
"learning_rate": 0.00019633587786259542,
"loss": 0.4064,
"step": 42
},
{
"epoch": 0.10899873257287707,
"grad_norm": 0.3192928433418274,
"learning_rate": 0.00019623409669211196,
"loss": 0.4581,
"step": 43
},
{
"epoch": 0.11153358681875793,
"grad_norm": 0.4188425838947296,
"learning_rate": 0.00019613231552162853,
"loss": 0.371,
"step": 44
},
{
"epoch": 0.11406844106463879,
"grad_norm": 0.3750368654727936,
"learning_rate": 0.00019603053435114504,
"loss": 0.3728,
"step": 45
},
{
"epoch": 0.11660329531051965,
"grad_norm": 0.5102046728134155,
"learning_rate": 0.00019592875318066158,
"loss": 0.357,
"step": 46
},
{
"epoch": 0.11913814955640051,
"grad_norm": 0.4143039882183075,
"learning_rate": 0.00019582697201017812,
"loss": 0.4373,
"step": 47
},
{
"epoch": 0.12167300380228137,
"grad_norm": 0.42558473348617554,
"learning_rate": 0.00019572519083969466,
"loss": 0.5877,
"step": 48
},
{
"epoch": 0.12420785804816223,
"grad_norm": 0.35768038034439087,
"learning_rate": 0.0001956234096692112,
"loss": 0.3326,
"step": 49
},
{
"epoch": 0.1267427122940431,
"grad_norm": 0.32826319336891174,
"learning_rate": 0.00019552162849872774,
"loss": 0.3521,
"step": 50
},
{
"epoch": 0.12927756653992395,
"grad_norm": 0.3507271409034729,
"learning_rate": 0.00019541984732824428,
"loss": 0.4157,
"step": 51
},
{
"epoch": 0.13181242078580482,
"grad_norm": 0.5069169402122498,
"learning_rate": 0.00019531806615776082,
"loss": 0.4453,
"step": 52
},
{
"epoch": 0.13434727503168567,
"grad_norm": 0.4759957492351532,
"learning_rate": 0.00019521628498727736,
"loss": 0.5131,
"step": 53
},
{
"epoch": 0.13688212927756654,
"grad_norm": 0.4045158326625824,
"learning_rate": 0.0001951145038167939,
"loss": 0.3927,
"step": 54
},
{
"epoch": 0.1394169835234474,
"grad_norm": 0.49629393219947815,
"learning_rate": 0.00019501272264631046,
"loss": 0.4708,
"step": 55
},
{
"epoch": 0.14195183776932827,
"grad_norm": 0.3735599219799042,
"learning_rate": 0.00019491094147582698,
"loss": 0.4076,
"step": 56
},
{
"epoch": 0.1444866920152091,
"grad_norm": 0.4713466763496399,
"learning_rate": 0.00019480916030534354,
"loss": 0.4187,
"step": 57
},
{
"epoch": 0.14702154626108999,
"grad_norm": 0.6454377770423889,
"learning_rate": 0.00019470737913486005,
"loss": 0.4032,
"step": 58
},
{
"epoch": 0.14955640050697086,
"grad_norm": 0.39378786087036133,
"learning_rate": 0.00019460559796437662,
"loss": 0.3508,
"step": 59
},
{
"epoch": 0.1520912547528517,
"grad_norm": 0.3768695592880249,
"learning_rate": 0.00019450381679389313,
"loss": 0.3129,
"step": 60
},
{
"epoch": 0.15462610899873258,
"grad_norm": 0.4250476062297821,
"learning_rate": 0.00019440203562340967,
"loss": 0.3426,
"step": 61
},
{
"epoch": 0.15716096324461343,
"grad_norm": 0.3653964698314667,
"learning_rate": 0.0001943002544529262,
"loss": 0.3339,
"step": 62
},
{
"epoch": 0.1596958174904943,
"grad_norm": 0.4973353445529938,
"learning_rate": 0.00019419847328244275,
"loss": 0.4759,
"step": 63
},
{
"epoch": 0.16223067173637515,
"grad_norm": 0.41738295555114746,
"learning_rate": 0.0001940966921119593,
"loss": 0.3809,
"step": 64
},
{
"epoch": 0.16476552598225602,
"grad_norm": 0.42326119542121887,
"learning_rate": 0.00019399491094147583,
"loss": 0.3399,
"step": 65
},
{
"epoch": 0.16730038022813687,
"grad_norm": 0.4244116246700287,
"learning_rate": 0.00019389312977099237,
"loss": 0.4085,
"step": 66
},
{
"epoch": 0.16983523447401774,
"grad_norm": 0.40235379338264465,
"learning_rate": 0.0001937913486005089,
"loss": 0.3016,
"step": 67
},
{
"epoch": 0.17237008871989862,
"grad_norm": 0.3983120322227478,
"learning_rate": 0.00019368956743002547,
"loss": 0.5101,
"step": 68
},
{
"epoch": 0.17490494296577946,
"grad_norm": 0.4857071042060852,
"learning_rate": 0.00019358778625954199,
"loss": 0.3131,
"step": 69
},
{
"epoch": 0.17743979721166034,
"grad_norm": 0.5238108038902283,
"learning_rate": 0.00019348600508905855,
"loss": 0.5841,
"step": 70
},
{
"epoch": 0.17997465145754118,
"grad_norm": 0.5322052240371704,
"learning_rate": 0.00019338422391857506,
"loss": 0.3895,
"step": 71
},
{
"epoch": 0.18250950570342206,
"grad_norm": 0.4643409252166748,
"learning_rate": 0.00019328244274809163,
"loss": 0.364,
"step": 72
},
{
"epoch": 0.1850443599493029,
"grad_norm": 0.36517271399497986,
"learning_rate": 0.00019318066157760814,
"loss": 0.4092,
"step": 73
},
{
"epoch": 0.18757921419518378,
"grad_norm": 0.49409031867980957,
"learning_rate": 0.00019307888040712468,
"loss": 0.3359,
"step": 74
},
{
"epoch": 0.19011406844106463,
"grad_norm": 0.44665688276290894,
"learning_rate": 0.00019297709923664122,
"loss": 0.3275,
"step": 75
},
{
"epoch": 0.1926489226869455,
"grad_norm": 0.353208065032959,
"learning_rate": 0.00019287531806615776,
"loss": 0.3396,
"step": 76
},
{
"epoch": 0.19518377693282637,
"grad_norm": 0.4061962366104126,
"learning_rate": 0.0001927735368956743,
"loss": 0.4658,
"step": 77
},
{
"epoch": 0.19771863117870722,
"grad_norm": 0.4785591959953308,
"learning_rate": 0.00019267175572519084,
"loss": 0.4705,
"step": 78
},
{
"epoch": 0.2002534854245881,
"grad_norm": 0.44644224643707275,
"learning_rate": 0.00019256997455470738,
"loss": 0.3573,
"step": 79
},
{
"epoch": 0.20278833967046894,
"grad_norm": 0.4554955065250397,
"learning_rate": 0.00019246819338422392,
"loss": 0.3822,
"step": 80
},
{
"epoch": 0.20532319391634982,
"grad_norm": 0.4537349343299866,
"learning_rate": 0.00019236641221374049,
"loss": 0.5222,
"step": 81
},
{
"epoch": 0.20785804816223066,
"grad_norm": 0.32820987701416016,
"learning_rate": 0.000192264631043257,
"loss": 0.3185,
"step": 82
},
{
"epoch": 0.21039290240811154,
"grad_norm": 0.39827391505241394,
"learning_rate": 0.00019216284987277356,
"loss": 0.3693,
"step": 83
},
{
"epoch": 0.21292775665399238,
"grad_norm": 0.4188093841075897,
"learning_rate": 0.00019206106870229008,
"loss": 0.4168,
"step": 84
},
{
"epoch": 0.21546261089987326,
"grad_norm": 0.4770517349243164,
"learning_rate": 0.00019195928753180664,
"loss": 0.4113,
"step": 85
},
{
"epoch": 0.21799746514575413,
"grad_norm": 0.346224844455719,
"learning_rate": 0.00019185750636132315,
"loss": 0.4238,
"step": 86
},
{
"epoch": 0.22053231939163498,
"grad_norm": 0.37398770451545715,
"learning_rate": 0.00019175572519083972,
"loss": 0.4285,
"step": 87
},
{
"epoch": 0.22306717363751585,
"grad_norm": 0.35467982292175293,
"learning_rate": 0.00019165394402035623,
"loss": 0.3201,
"step": 88
},
{
"epoch": 0.2256020278833967,
"grad_norm": 0.3411659002304077,
"learning_rate": 0.00019155216284987277,
"loss": 0.3428,
"step": 89
},
{
"epoch": 0.22813688212927757,
"grad_norm": 0.4002087712287903,
"learning_rate": 0.0001914503816793893,
"loss": 0.5375,
"step": 90
},
{
"epoch": 0.23067173637515842,
"grad_norm": 0.4339190423488617,
"learning_rate": 0.00019134860050890585,
"loss": 0.3355,
"step": 91
},
{
"epoch": 0.2332065906210393,
"grad_norm": 0.43449410796165466,
"learning_rate": 0.00019124681933842242,
"loss": 0.4355,
"step": 92
},
{
"epoch": 0.23574144486692014,
"grad_norm": 0.4565323293209076,
"learning_rate": 0.00019114503816793893,
"loss": 0.3178,
"step": 93
},
{
"epoch": 0.23827629911280102,
"grad_norm": 0.46309894323349,
"learning_rate": 0.0001910432569974555,
"loss": 0.3308,
"step": 94
},
{
"epoch": 0.24081115335868186,
"grad_norm": 0.3554096817970276,
"learning_rate": 0.000190941475826972,
"loss": 0.3358,
"step": 95
},
{
"epoch": 0.24334600760456274,
"grad_norm": 0.39129987359046936,
"learning_rate": 0.00019083969465648857,
"loss": 0.3988,
"step": 96
},
{
"epoch": 0.2458808618504436,
"grad_norm": 0.4193456470966339,
"learning_rate": 0.0001907379134860051,
"loss": 0.4064,
"step": 97
},
{
"epoch": 0.24841571609632446,
"grad_norm": 0.39571425318717957,
"learning_rate": 0.00019063613231552165,
"loss": 0.3213,
"step": 98
},
{
"epoch": 0.2509505703422053,
"grad_norm": 0.48566195368766785,
"learning_rate": 0.00019053435114503817,
"loss": 0.3505,
"step": 99
},
{
"epoch": 0.2534854245880862,
"grad_norm": 0.43266433477401733,
"learning_rate": 0.00019043256997455473,
"loss": 0.3579,
"step": 100
},
{
"epoch": 0.25602027883396705,
"grad_norm": 0.31110769510269165,
"learning_rate": 0.00019033078880407124,
"loss": 0.2832,
"step": 101
},
{
"epoch": 0.2585551330798479,
"grad_norm": 0.40166690945625305,
"learning_rate": 0.00019022900763358778,
"loss": 0.2964,
"step": 102
},
{
"epoch": 0.26108998732572875,
"grad_norm": 0.554072380065918,
"learning_rate": 0.00019012722646310432,
"loss": 0.3661,
"step": 103
},
{
"epoch": 0.26362484157160965,
"grad_norm": 0.45009374618530273,
"learning_rate": 0.00019002544529262086,
"loss": 0.3812,
"step": 104
},
{
"epoch": 0.2661596958174905,
"grad_norm": 0.48349273204803467,
"learning_rate": 0.00018992366412213743,
"loss": 0.4183,
"step": 105
},
{
"epoch": 0.26869455006337134,
"grad_norm": 0.4157555103302002,
"learning_rate": 0.00018982188295165394,
"loss": 0.2962,
"step": 106
},
{
"epoch": 0.27122940430925224,
"grad_norm": 0.3300265073776245,
"learning_rate": 0.0001897201017811705,
"loss": 0.3351,
"step": 107
},
{
"epoch": 0.2737642585551331,
"grad_norm": 0.3690893054008484,
"learning_rate": 0.00018961832061068702,
"loss": 0.3251,
"step": 108
},
{
"epoch": 0.27629911280101394,
"grad_norm": 0.49013710021972656,
"learning_rate": 0.00018951653944020359,
"loss": 0.4757,
"step": 109
},
{
"epoch": 0.2788339670468948,
"grad_norm": 0.4416143000125885,
"learning_rate": 0.0001894147582697201,
"loss": 0.4421,
"step": 110
},
{
"epoch": 0.2813688212927757,
"grad_norm": 0.3613321781158447,
"learning_rate": 0.00018931297709923666,
"loss": 0.3475,
"step": 111
},
{
"epoch": 0.28390367553865653,
"grad_norm": 0.45548489689826965,
"learning_rate": 0.00018921119592875318,
"loss": 0.3587,
"step": 112
},
{
"epoch": 0.2864385297845374,
"grad_norm": 0.49439120292663574,
"learning_rate": 0.00018910941475826974,
"loss": 0.4017,
"step": 113
},
{
"epoch": 0.2889733840304182,
"grad_norm": 0.35214680433273315,
"learning_rate": 0.00018900763358778626,
"loss": 0.2645,
"step": 114
},
{
"epoch": 0.2915082382762991,
"grad_norm": 0.5512099266052246,
"learning_rate": 0.00018890585241730282,
"loss": 0.3736,
"step": 115
},
{
"epoch": 0.29404309252217997,
"grad_norm": 0.4146886467933655,
"learning_rate": 0.00018880407124681936,
"loss": 0.3361,
"step": 116
},
{
"epoch": 0.2965779467680608,
"grad_norm": 0.42954355478286743,
"learning_rate": 0.00018870229007633587,
"loss": 0.3841,
"step": 117
},
{
"epoch": 0.2991128010139417,
"grad_norm": 0.47189798951148987,
"learning_rate": 0.00018860050890585244,
"loss": 0.3591,
"step": 118
},
{
"epoch": 0.30164765525982257,
"grad_norm": 0.5082337260246277,
"learning_rate": 0.00018849872773536895,
"loss": 0.4249,
"step": 119
},
{
"epoch": 0.3041825095057034,
"grad_norm": 0.4005051255226135,
"learning_rate": 0.00018839694656488552,
"loss": 0.4433,
"step": 120
},
{
"epoch": 0.30671736375158426,
"grad_norm": 0.4730987250804901,
"learning_rate": 0.00018829516539440203,
"loss": 0.3575,
"step": 121
},
{
"epoch": 0.30925221799746516,
"grad_norm": 0.5227373242378235,
"learning_rate": 0.0001881933842239186,
"loss": 0.3511,
"step": 122
},
{
"epoch": 0.311787072243346,
"grad_norm": 0.3693684935569763,
"learning_rate": 0.0001880916030534351,
"loss": 0.3097,
"step": 123
},
{
"epoch": 0.31432192648922685,
"grad_norm": 0.45321500301361084,
"learning_rate": 0.00018798982188295168,
"loss": 0.4464,
"step": 124
},
{
"epoch": 0.31685678073510776,
"grad_norm": 0.3797638714313507,
"learning_rate": 0.0001878880407124682,
"loss": 0.328,
"step": 125
},
{
"epoch": 0.3193916349809886,
"grad_norm": 0.3996891975402832,
"learning_rate": 0.00018778625954198475,
"loss": 0.28,
"step": 126
},
{
"epoch": 0.32192648922686945,
"grad_norm": 0.3931027352809906,
"learning_rate": 0.00018768447837150127,
"loss": 0.2439,
"step": 127
},
{
"epoch": 0.3244613434727503,
"grad_norm": 0.4259742200374603,
"learning_rate": 0.00018758269720101783,
"loss": 0.3068,
"step": 128
},
{
"epoch": 0.3269961977186312,
"grad_norm": 0.4267159402370453,
"learning_rate": 0.00018748091603053437,
"loss": 0.3405,
"step": 129
},
{
"epoch": 0.32953105196451205,
"grad_norm": 0.41900908946990967,
"learning_rate": 0.0001873791348600509,
"loss": 0.327,
"step": 130
},
{
"epoch": 0.3320659062103929,
"grad_norm": 0.436499685049057,
"learning_rate": 0.00018727735368956745,
"loss": 0.5089,
"step": 131
},
{
"epoch": 0.33460076045627374,
"grad_norm": 0.43961402773857117,
"learning_rate": 0.00018717557251908396,
"loss": 0.339,
"step": 132
},
{
"epoch": 0.33713561470215464,
"grad_norm": 0.45645856857299805,
"learning_rate": 0.00018707379134860053,
"loss": 0.3738,
"step": 133
},
{
"epoch": 0.3396704689480355,
"grad_norm": 0.36948803067207336,
"learning_rate": 0.00018697201017811704,
"loss": 0.2777,
"step": 134
},
{
"epoch": 0.34220532319391633,
"grad_norm": 0.32040536403656006,
"learning_rate": 0.0001868702290076336,
"loss": 0.3679,
"step": 135
},
{
"epoch": 0.34474017743979724,
"grad_norm": 0.37474381923675537,
"learning_rate": 0.00018676844783715012,
"loss": 0.4282,
"step": 136
},
{
"epoch": 0.3472750316856781,
"grad_norm": 0.4243752360343933,
"learning_rate": 0.0001866666666666667,
"loss": 0.533,
"step": 137
},
{
"epoch": 0.34980988593155893,
"grad_norm": 0.39162227511405945,
"learning_rate": 0.0001865648854961832,
"loss": 0.2989,
"step": 138
},
{
"epoch": 0.3523447401774398,
"grad_norm": 0.3585897386074066,
"learning_rate": 0.00018646310432569977,
"loss": 0.3368,
"step": 139
},
{
"epoch": 0.3548795944233207,
"grad_norm": 0.39330482482910156,
"learning_rate": 0.00018636132315521628,
"loss": 0.4904,
"step": 140
},
{
"epoch": 0.3574144486692015,
"grad_norm": 0.3404198884963989,
"learning_rate": 0.00018625954198473284,
"loss": 0.2684,
"step": 141
},
{
"epoch": 0.35994930291508237,
"grad_norm": 0.34813976287841797,
"learning_rate": 0.00018615776081424938,
"loss": 0.2988,
"step": 142
},
{
"epoch": 0.36248415716096327,
"grad_norm": 0.4100090265274048,
"learning_rate": 0.00018605597964376592,
"loss": 0.3325,
"step": 143
},
{
"epoch": 0.3650190114068441,
"grad_norm": 0.2897261083126068,
"learning_rate": 0.00018595419847328246,
"loss": 0.2487,
"step": 144
},
{
"epoch": 0.36755386565272496,
"grad_norm": 0.43023669719696045,
"learning_rate": 0.00018585241730279897,
"loss": 0.4875,
"step": 145
},
{
"epoch": 0.3700887198986058,
"grad_norm": 0.39708128571510315,
"learning_rate": 0.00018575063613231554,
"loss": 0.3742,
"step": 146
},
{
"epoch": 0.3726235741444867,
"grad_norm": 0.4191845953464508,
"learning_rate": 0.00018564885496183205,
"loss": 0.3253,
"step": 147
},
{
"epoch": 0.37515842839036756,
"grad_norm": 0.3373403549194336,
"learning_rate": 0.00018554707379134862,
"loss": 0.2636,
"step": 148
},
{
"epoch": 0.3776932826362484,
"grad_norm": 0.3522009551525116,
"learning_rate": 0.00018544529262086513,
"loss": 0.2413,
"step": 149
},
{
"epoch": 0.38022813688212925,
"grad_norm": 0.4140997529029846,
"learning_rate": 0.0001853435114503817,
"loss": 0.3663,
"step": 150
},
{
"epoch": 0.38276299112801015,
"grad_norm": 0.3986112177371979,
"learning_rate": 0.0001852417302798982,
"loss": 0.276,
"step": 151
},
{
"epoch": 0.385297845373891,
"grad_norm": 0.46847087144851685,
"learning_rate": 0.00018513994910941478,
"loss": 0.3369,
"step": 152
},
{
"epoch": 0.38783269961977185,
"grad_norm": 0.43623679876327515,
"learning_rate": 0.00018503816793893132,
"loss": 0.37,
"step": 153
},
{
"epoch": 0.39036755386565275,
"grad_norm": 0.4128822684288025,
"learning_rate": 0.00018493638676844785,
"loss": 0.3763,
"step": 154
},
{
"epoch": 0.3929024081115336,
"grad_norm": 0.3352810740470886,
"learning_rate": 0.0001848346055979644,
"loss": 0.2446,
"step": 155
},
{
"epoch": 0.39543726235741444,
"grad_norm": 0.580634355545044,
"learning_rate": 0.00018473282442748093,
"loss": 0.3691,
"step": 156
},
{
"epoch": 0.3979721166032953,
"grad_norm": 0.452499657869339,
"learning_rate": 0.00018463104325699747,
"loss": 0.4361,
"step": 157
},
{
"epoch": 0.4005069708491762,
"grad_norm": 0.4160007834434509,
"learning_rate": 0.000184529262086514,
"loss": 0.4003,
"step": 158
},
{
"epoch": 0.40304182509505704,
"grad_norm": 0.3049513101577759,
"learning_rate": 0.00018442748091603055,
"loss": 0.2167,
"step": 159
},
{
"epoch": 0.4055766793409379,
"grad_norm": 0.38912078738212585,
"learning_rate": 0.00018432569974554706,
"loss": 0.2766,
"step": 160
},
{
"epoch": 0.40811153358681873,
"grad_norm": 0.4433249831199646,
"learning_rate": 0.00018422391857506363,
"loss": 0.3331,
"step": 161
},
{
"epoch": 0.41064638783269963,
"grad_norm": 0.36410561203956604,
"learning_rate": 0.00018412213740458014,
"loss": 0.2719,
"step": 162
},
{
"epoch": 0.4131812420785805,
"grad_norm": 0.47044846415519714,
"learning_rate": 0.0001840203562340967,
"loss": 0.3602,
"step": 163
},
{
"epoch": 0.4157160963244613,
"grad_norm": 0.38755008578300476,
"learning_rate": 0.00018391857506361322,
"loss": 0.2815,
"step": 164
},
{
"epoch": 0.41825095057034223,
"grad_norm": 0.39241930842399597,
"learning_rate": 0.0001838167938931298,
"loss": 0.3642,
"step": 165
},
{
"epoch": 0.4207858048162231,
"grad_norm": 0.37138187885284424,
"learning_rate": 0.00018371501272264633,
"loss": 0.267,
"step": 166
},
{
"epoch": 0.4233206590621039,
"grad_norm": 0.4508083462715149,
"learning_rate": 0.00018361323155216287,
"loss": 0.4093,
"step": 167
},
{
"epoch": 0.42585551330798477,
"grad_norm": 0.4390806257724762,
"learning_rate": 0.0001835114503816794,
"loss": 0.424,
"step": 168
},
{
"epoch": 0.42839036755386567,
"grad_norm": 0.4640062153339386,
"learning_rate": 0.00018340966921119594,
"loss": 0.4065,
"step": 169
},
{
"epoch": 0.4309252217997465,
"grad_norm": 0.37822040915489197,
"learning_rate": 0.00018330788804071248,
"loss": 0.2854,
"step": 170
},
{
"epoch": 0.43346007604562736,
"grad_norm": 0.3658731281757355,
"learning_rate": 0.00018320610687022902,
"loss": 0.2826,
"step": 171
},
{
"epoch": 0.43599493029150826,
"grad_norm": 0.4271928369998932,
"learning_rate": 0.00018310432569974556,
"loss": 0.4538,
"step": 172
},
{
"epoch": 0.4385297845373891,
"grad_norm": 0.33550775051116943,
"learning_rate": 0.00018300254452926207,
"loss": 0.3015,
"step": 173
},
{
"epoch": 0.44106463878326996,
"grad_norm": 0.5374005436897278,
"learning_rate": 0.00018290076335877864,
"loss": 0.2771,
"step": 174
},
{
"epoch": 0.4435994930291508,
"grad_norm": 0.4630737602710724,
"learning_rate": 0.00018279898218829515,
"loss": 0.3786,
"step": 175
},
{
"epoch": 0.4461343472750317,
"grad_norm": 0.4163656234741211,
"learning_rate": 0.00018269720101781172,
"loss": 0.3224,
"step": 176
},
{
"epoch": 0.44866920152091255,
"grad_norm": 0.43972182273864746,
"learning_rate": 0.00018259541984732826,
"loss": 0.4192,
"step": 177
},
{
"epoch": 0.4512040557667934,
"grad_norm": 0.4114130437374115,
"learning_rate": 0.0001824936386768448,
"loss": 0.2979,
"step": 178
},
{
"epoch": 0.45373891001267425,
"grad_norm": 0.5002878308296204,
"learning_rate": 0.00018239185750636134,
"loss": 0.3339,
"step": 179
},
{
"epoch": 0.45627376425855515,
"grad_norm": 0.42383208870887756,
"learning_rate": 0.00018229007633587788,
"loss": 0.2958,
"step": 180
},
{
"epoch": 0.458808618504436,
"grad_norm": 0.3234981894493103,
"learning_rate": 0.00018218829516539442,
"loss": 0.2215,
"step": 181
},
{
"epoch": 0.46134347275031684,
"grad_norm": 0.33356910943984985,
"learning_rate": 0.00018208651399491096,
"loss": 0.3017,
"step": 182
},
{
"epoch": 0.46387832699619774,
"grad_norm": 0.442376047372818,
"learning_rate": 0.0001819847328244275,
"loss": 0.2751,
"step": 183
},
{
"epoch": 0.4664131812420786,
"grad_norm": 0.4563845992088318,
"learning_rate": 0.00018188295165394403,
"loss": 0.3001,
"step": 184
},
{
"epoch": 0.46894803548795944,
"grad_norm": 0.3957296907901764,
"learning_rate": 0.00018178117048346057,
"loss": 0.3864,
"step": 185
},
{
"epoch": 0.4714828897338403,
"grad_norm": 0.32932132482528687,
"learning_rate": 0.0001816793893129771,
"loss": 0.2528,
"step": 186
},
{
"epoch": 0.4740177439797212,
"grad_norm": 0.3960365951061249,
"learning_rate": 0.00018157760814249365,
"loss": 0.3975,
"step": 187
},
{
"epoch": 0.47655259822560203,
"grad_norm": 0.38450995087623596,
"learning_rate": 0.00018147582697201016,
"loss": 0.2552,
"step": 188
},
{
"epoch": 0.4790874524714829,
"grad_norm": 0.4259994626045227,
"learning_rate": 0.00018137404580152673,
"loss": 0.3,
"step": 189
},
{
"epoch": 0.4816223067173637,
"grad_norm": 0.4965859055519104,
"learning_rate": 0.00018127226463104327,
"loss": 0.3099,
"step": 190
},
{
"epoch": 0.4841571609632446,
"grad_norm": 0.38229548931121826,
"learning_rate": 0.0001811704834605598,
"loss": 0.3799,
"step": 191
},
{
"epoch": 0.4866920152091255,
"grad_norm": 0.4622017741203308,
"learning_rate": 0.00018106870229007635,
"loss": 0.4815,
"step": 192
},
{
"epoch": 0.4892268694550063,
"grad_norm": 0.3207991123199463,
"learning_rate": 0.0001809669211195929,
"loss": 0.2534,
"step": 193
},
{
"epoch": 0.4917617237008872,
"grad_norm": 0.3322354555130005,
"learning_rate": 0.00018086513994910943,
"loss": 0.2331,
"step": 194
},
{
"epoch": 0.49429657794676807,
"grad_norm": 0.35752132534980774,
"learning_rate": 0.00018076335877862597,
"loss": 0.3621,
"step": 195
},
{
"epoch": 0.4968314321926489,
"grad_norm": 0.2801353633403778,
"learning_rate": 0.0001806615776081425,
"loss": 0.2198,
"step": 196
},
{
"epoch": 0.49936628643852976,
"grad_norm": 0.5065000057220459,
"learning_rate": 0.00018055979643765905,
"loss": 0.3806,
"step": 197
},
{
"epoch": 0.5019011406844106,
"grad_norm": 0.4308508336544037,
"learning_rate": 0.00018045801526717558,
"loss": 0.4028,
"step": 198
},
{
"epoch": 0.5044359949302915,
"grad_norm": 0.5432320833206177,
"learning_rate": 0.00018035623409669212,
"loss": 0.506,
"step": 199
},
{
"epoch": 0.5069708491761724,
"grad_norm": 0.37079155445098877,
"learning_rate": 0.00018025445292620866,
"loss": 0.2242,
"step": 200
},
{
"epoch": 0.5095057034220533,
"grad_norm": 0.3533012568950653,
"learning_rate": 0.00018015267175572518,
"loss": 0.3462,
"step": 201
},
{
"epoch": 0.5120405576679341,
"grad_norm": 0.37727662920951843,
"learning_rate": 0.00018005089058524174,
"loss": 0.2421,
"step": 202
},
{
"epoch": 0.514575411913815,
"grad_norm": 0.42737269401550293,
"learning_rate": 0.00017994910941475828,
"loss": 0.3338,
"step": 203
},
{
"epoch": 0.5171102661596958,
"grad_norm": 0.41085687279701233,
"learning_rate": 0.00017984732824427482,
"loss": 0.4233,
"step": 204
},
{
"epoch": 0.5196451204055766,
"grad_norm": 0.4871644675731659,
"learning_rate": 0.00017974554707379136,
"loss": 0.3504,
"step": 205
},
{
"epoch": 0.5221799746514575,
"grad_norm": 0.308347225189209,
"learning_rate": 0.0001796437659033079,
"loss": 0.27,
"step": 206
},
{
"epoch": 0.5247148288973384,
"grad_norm": 0.31587716937065125,
"learning_rate": 0.00017954198473282444,
"loss": 0.3161,
"step": 207
},
{
"epoch": 0.5272496831432193,
"grad_norm": 0.471392959356308,
"learning_rate": 0.00017944020356234098,
"loss": 0.3758,
"step": 208
},
{
"epoch": 0.5297845373891001,
"grad_norm": 0.33414778113365173,
"learning_rate": 0.00017933842239185752,
"loss": 0.3095,
"step": 209
},
{
"epoch": 0.532319391634981,
"grad_norm": 0.26553916931152344,
"learning_rate": 0.00017923664122137406,
"loss": 0.232,
"step": 210
},
{
"epoch": 0.5348542458808618,
"grad_norm": 0.27914223074913025,
"learning_rate": 0.0001791348600508906,
"loss": 0.2438,
"step": 211
},
{
"epoch": 0.5373891001267427,
"grad_norm": 0.36625003814697266,
"learning_rate": 0.00017903307888040713,
"loss": 0.2479,
"step": 212
},
{
"epoch": 0.5399239543726235,
"grad_norm": 0.3876325488090515,
"learning_rate": 0.00017893129770992367,
"loss": 0.3428,
"step": 213
},
{
"epoch": 0.5424588086185045,
"grad_norm": 0.5402606129646301,
"learning_rate": 0.0001788295165394402,
"loss": 0.394,
"step": 214
},
{
"epoch": 0.5449936628643853,
"grad_norm": 0.4023256301879883,
"learning_rate": 0.00017872773536895675,
"loss": 0.3348,
"step": 215
},
{
"epoch": 0.5475285171102662,
"grad_norm": 0.4440263509750366,
"learning_rate": 0.0001786259541984733,
"loss": 0.3001,
"step": 216
},
{
"epoch": 0.550063371356147,
"grad_norm": 0.39178457856178284,
"learning_rate": 0.00017852417302798983,
"loss": 0.2561,
"step": 217
},
{
"epoch": 0.5525982256020279,
"grad_norm": 0.5261508226394653,
"learning_rate": 0.00017842239185750637,
"loss": 0.4583,
"step": 218
},
{
"epoch": 0.5551330798479087,
"grad_norm": 0.3981377184391022,
"learning_rate": 0.0001783206106870229,
"loss": 0.265,
"step": 219
},
{
"epoch": 0.5576679340937896,
"grad_norm": 0.3689790666103363,
"learning_rate": 0.00017821882951653945,
"loss": 0.3965,
"step": 220
},
{
"epoch": 0.5602027883396705,
"grad_norm": 0.38442498445510864,
"learning_rate": 0.000178117048346056,
"loss": 0.268,
"step": 221
},
{
"epoch": 0.5627376425855514,
"grad_norm": 0.3051845133304596,
"learning_rate": 0.00017801526717557253,
"loss": 0.2362,
"step": 222
},
{
"epoch": 0.5652724968314322,
"grad_norm": 0.41551336646080017,
"learning_rate": 0.00017791348600508907,
"loss": 0.3428,
"step": 223
},
{
"epoch": 0.5678073510773131,
"grad_norm": 0.2885109484195709,
"learning_rate": 0.0001778117048346056,
"loss": 0.2328,
"step": 224
},
{
"epoch": 0.5703422053231939,
"grad_norm": 0.48813045024871826,
"learning_rate": 0.00017770992366412215,
"loss": 0.3502,
"step": 225
},
{
"epoch": 0.5728770595690748,
"grad_norm": 0.4413661062717438,
"learning_rate": 0.00017760814249363869,
"loss": 0.2687,
"step": 226
},
{
"epoch": 0.5754119138149556,
"grad_norm": 0.422799289226532,
"learning_rate": 0.00017750636132315522,
"loss": 0.4776,
"step": 227
},
{
"epoch": 0.5779467680608364,
"grad_norm": 0.39486098289489746,
"learning_rate": 0.00017740458015267176,
"loss": 0.3551,
"step": 228
},
{
"epoch": 0.5804816223067174,
"grad_norm": 0.366207480430603,
"learning_rate": 0.0001773027989821883,
"loss": 0.2639,
"step": 229
},
{
"epoch": 0.5830164765525983,
"grad_norm": 0.334626704454422,
"learning_rate": 0.00017720101781170484,
"loss": 0.2407,
"step": 230
},
{
"epoch": 0.5855513307984791,
"grad_norm": 0.5580838918685913,
"learning_rate": 0.00017709923664122138,
"loss": 0.3856,
"step": 231
},
{
"epoch": 0.5880861850443599,
"grad_norm": 0.3495747148990631,
"learning_rate": 0.00017699745547073792,
"loss": 0.3113,
"step": 232
},
{
"epoch": 0.5906210392902408,
"grad_norm": 0.38515543937683105,
"learning_rate": 0.00017689567430025446,
"loss": 0.3765,
"step": 233
},
{
"epoch": 0.5931558935361216,
"grad_norm": 0.43240851163864136,
"learning_rate": 0.000176793893129771,
"loss": 0.3094,
"step": 234
},
{
"epoch": 0.5956907477820025,
"grad_norm": 0.42353445291519165,
"learning_rate": 0.00017669211195928754,
"loss": 0.2992,
"step": 235
},
{
"epoch": 0.5982256020278834,
"grad_norm": 0.42463192343711853,
"learning_rate": 0.00017659033078880408,
"loss": 0.2486,
"step": 236
},
{
"epoch": 0.6007604562737643,
"grad_norm": 0.4749039113521576,
"learning_rate": 0.00017648854961832062,
"loss": 0.3742,
"step": 237
},
{
"epoch": 0.6032953105196451,
"grad_norm": 0.5651363730430603,
"learning_rate": 0.00017638676844783716,
"loss": 0.3079,
"step": 238
},
{
"epoch": 0.605830164765526,
"grad_norm": 0.34195011854171753,
"learning_rate": 0.0001762849872773537,
"loss": 0.3236,
"step": 239
},
{
"epoch": 0.6083650190114068,
"grad_norm": 0.5522583723068237,
"learning_rate": 0.00017618320610687024,
"loss": 0.3026,
"step": 240
},
{
"epoch": 0.6108998732572877,
"grad_norm": 0.41445448994636536,
"learning_rate": 0.00017608142493638677,
"loss": 0.32,
"step": 241
},
{
"epoch": 0.6134347275031685,
"grad_norm": 0.5023159384727478,
"learning_rate": 0.00017597964376590331,
"loss": 0.2658,
"step": 242
},
{
"epoch": 0.6159695817490495,
"grad_norm": 0.39539164304733276,
"learning_rate": 0.00017587786259541985,
"loss": 0.2687,
"step": 243
},
{
"epoch": 0.6185044359949303,
"grad_norm": 0.3105890154838562,
"learning_rate": 0.0001757760814249364,
"loss": 0.2224,
"step": 244
},
{
"epoch": 0.6210392902408112,
"grad_norm": 0.3665928840637207,
"learning_rate": 0.00017567430025445293,
"loss": 0.3101,
"step": 245
},
{
"epoch": 0.623574144486692,
"grad_norm": 0.28569111227989197,
"learning_rate": 0.00017557251908396947,
"loss": 0.2316,
"step": 246
},
{
"epoch": 0.6261089987325729,
"grad_norm": 0.24598725140094757,
"learning_rate": 0.000175470737913486,
"loss": 0.2314,
"step": 247
},
{
"epoch": 0.6286438529784537,
"grad_norm": 0.4301004111766815,
"learning_rate": 0.00017536895674300255,
"loss": 0.2606,
"step": 248
},
{
"epoch": 0.6311787072243346,
"grad_norm": 0.36598455905914307,
"learning_rate": 0.0001752671755725191,
"loss": 0.2243,
"step": 249
},
{
"epoch": 0.6337135614702155,
"grad_norm": 0.31714677810668945,
"learning_rate": 0.00017516539440203563,
"loss": 0.2561,
"step": 250
},
{
"epoch": 0.6362484157160964,
"grad_norm": 0.5131182670593262,
"learning_rate": 0.0001750636132315522,
"loss": 0.3216,
"step": 251
},
{
"epoch": 0.6387832699619772,
"grad_norm": 0.4067549407482147,
"learning_rate": 0.0001749618320610687,
"loss": 0.3032,
"step": 252
},
{
"epoch": 0.641318124207858,
"grad_norm": 0.6457440853118896,
"learning_rate": 0.00017486005089058525,
"loss": 0.349,
"step": 253
},
{
"epoch": 0.6438529784537389,
"grad_norm": 0.3759848177433014,
"learning_rate": 0.00017475826972010179,
"loss": 0.2974,
"step": 254
},
{
"epoch": 0.6463878326996197,
"grad_norm": 0.40348076820373535,
"learning_rate": 0.00017465648854961833,
"loss": 0.2781,
"step": 255
},
{
"epoch": 0.6489226869455006,
"grad_norm": 0.2639053463935852,
"learning_rate": 0.00017455470737913486,
"loss": 0.2413,
"step": 256
},
{
"epoch": 0.6514575411913816,
"grad_norm": 0.4014027416706085,
"learning_rate": 0.0001744529262086514,
"loss": 0.2878,
"step": 257
},
{
"epoch": 0.6539923954372624,
"grad_norm": 0.4871384799480438,
"learning_rate": 0.00017435114503816794,
"loss": 0.2527,
"step": 258
},
{
"epoch": 0.6565272496831432,
"grad_norm": 0.28687578439712524,
"learning_rate": 0.00017424936386768448,
"loss": 0.2233,
"step": 259
},
{
"epoch": 0.6590621039290241,
"grad_norm": 0.36948761343955994,
"learning_rate": 0.00017414758269720102,
"loss": 0.3007,
"step": 260
},
{
"epoch": 0.6615969581749049,
"grad_norm": 0.6034134030342102,
"learning_rate": 0.00017404580152671756,
"loss": 0.3054,
"step": 261
},
{
"epoch": 0.6641318124207858,
"grad_norm": 0.3481515645980835,
"learning_rate": 0.0001739440203562341,
"loss": 0.2388,
"step": 262
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.3772611916065216,
"learning_rate": 0.00017384223918575064,
"loss": 0.317,
"step": 263
},
{
"epoch": 0.6692015209125475,
"grad_norm": 0.4693986177444458,
"learning_rate": 0.0001737404580152672,
"loss": 0.3441,
"step": 264
},
{
"epoch": 0.6717363751584284,
"grad_norm": 0.38484400510787964,
"learning_rate": 0.00017363867684478372,
"loss": 0.2637,
"step": 265
},
{
"epoch": 0.6742712294043093,
"grad_norm": 0.3638555407524109,
"learning_rate": 0.00017353689567430026,
"loss": 0.2695,
"step": 266
},
{
"epoch": 0.6768060836501901,
"grad_norm": 0.36848586797714233,
"learning_rate": 0.0001734351145038168,
"loss": 0.3149,
"step": 267
},
{
"epoch": 0.679340937896071,
"grad_norm": 0.31740638613700867,
"learning_rate": 0.00017333333333333334,
"loss": 0.3049,
"step": 268
},
{
"epoch": 0.6818757921419518,
"grad_norm": 0.41415438055992126,
"learning_rate": 0.00017323155216284988,
"loss": 0.231,
"step": 269
},
{
"epoch": 0.6844106463878327,
"grad_norm": 0.41449829936027527,
"learning_rate": 0.00017312977099236641,
"loss": 0.3344,
"step": 270
},
{
"epoch": 0.6869455006337135,
"grad_norm": 0.30683189630508423,
"learning_rate": 0.00017302798982188295,
"loss": 0.283,
"step": 271
},
{
"epoch": 0.6894803548795945,
"grad_norm": 0.29896244406700134,
"learning_rate": 0.0001729262086513995,
"loss": 0.2363,
"step": 272
},
{
"epoch": 0.6920152091254753,
"grad_norm": 0.44181492924690247,
"learning_rate": 0.00017282442748091603,
"loss": 0.3439,
"step": 273
},
{
"epoch": 0.6945500633713562,
"grad_norm": 0.43460434675216675,
"learning_rate": 0.00017272264631043257,
"loss": 0.3004,
"step": 274
},
{
"epoch": 0.697084917617237,
"grad_norm": 0.40781405568122864,
"learning_rate": 0.00017262086513994914,
"loss": 0.2554,
"step": 275
},
{
"epoch": 0.6996197718631179,
"grad_norm": 0.39359861612319946,
"learning_rate": 0.00017251908396946565,
"loss": 0.3094,
"step": 276
},
{
"epoch": 0.7021546261089987,
"grad_norm": 0.4507496953010559,
"learning_rate": 0.00017241730279898222,
"loss": 0.2985,
"step": 277
},
{
"epoch": 0.7046894803548795,
"grad_norm": 0.4513093829154968,
"learning_rate": 0.00017231552162849873,
"loss": 0.4,
"step": 278
},
{
"epoch": 0.7072243346007605,
"grad_norm": 0.3133571147918701,
"learning_rate": 0.0001722137404580153,
"loss": 0.2241,
"step": 279
},
{
"epoch": 0.7097591888466414,
"grad_norm": 0.36957162618637085,
"learning_rate": 0.0001721119592875318,
"loss": 0.2461,
"step": 280
},
{
"epoch": 0.7122940430925222,
"grad_norm": 0.4224545955657959,
"learning_rate": 0.00017201017811704835,
"loss": 0.3178,
"step": 281
},
{
"epoch": 0.714828897338403,
"grad_norm": 0.4696861207485199,
"learning_rate": 0.0001719083969465649,
"loss": 0.3911,
"step": 282
},
{
"epoch": 0.7173637515842839,
"grad_norm": 0.44058746099472046,
"learning_rate": 0.00017180661577608143,
"loss": 0.3169,
"step": 283
},
{
"epoch": 0.7198986058301647,
"grad_norm": 0.32616788148880005,
"learning_rate": 0.00017170483460559797,
"loss": 0.2441,
"step": 284
},
{
"epoch": 0.7224334600760456,
"grad_norm": 0.3941279649734497,
"learning_rate": 0.0001716030534351145,
"loss": 0.3433,
"step": 285
},
{
"epoch": 0.7249683143219265,
"grad_norm": 0.3746216297149658,
"learning_rate": 0.00017150127226463104,
"loss": 0.3993,
"step": 286
},
{
"epoch": 0.7275031685678074,
"grad_norm": 0.3758716881275177,
"learning_rate": 0.00017139949109414758,
"loss": 0.3139,
"step": 287
},
{
"epoch": 0.7300380228136882,
"grad_norm": 0.35631927847862244,
"learning_rate": 0.00017129770992366415,
"loss": 0.2316,
"step": 288
},
{
"epoch": 0.7325728770595691,
"grad_norm": 0.48128026723861694,
"learning_rate": 0.00017119592875318066,
"loss": 0.3306,
"step": 289
},
{
"epoch": 0.7351077313054499,
"grad_norm": 0.3464122414588928,
"learning_rate": 0.00017109414758269723,
"loss": 0.3148,
"step": 290
},
{
"epoch": 0.7376425855513308,
"grad_norm": 0.3772057294845581,
"learning_rate": 0.00017099236641221374,
"loss": 0.274,
"step": 291
},
{
"epoch": 0.7401774397972116,
"grad_norm": 0.2896706759929657,
"learning_rate": 0.0001708905852417303,
"loss": 0.2275,
"step": 292
},
{
"epoch": 0.7427122940430925,
"grad_norm": 0.48482832312583923,
"learning_rate": 0.00017078880407124682,
"loss": 0.2913,
"step": 293
},
{
"epoch": 0.7452471482889734,
"grad_norm": 0.3086034655570984,
"learning_rate": 0.00017068702290076336,
"loss": 0.2453,
"step": 294
},
{
"epoch": 0.7477820025348543,
"grad_norm": 0.42840075492858887,
"learning_rate": 0.0001705852417302799,
"loss": 0.352,
"step": 295
},
{
"epoch": 0.7503168567807351,
"grad_norm": 0.4574609398841858,
"learning_rate": 0.00017048346055979644,
"loss": 0.3698,
"step": 296
},
{
"epoch": 0.752851711026616,
"grad_norm": 0.4295889735221863,
"learning_rate": 0.00017038167938931298,
"loss": 0.3341,
"step": 297
},
{
"epoch": 0.7553865652724968,
"grad_norm": 0.46036672592163086,
"learning_rate": 0.00017027989821882952,
"loss": 0.3175,
"step": 298
},
{
"epoch": 0.7579214195183777,
"grad_norm": 0.45897790789604187,
"learning_rate": 0.00017017811704834608,
"loss": 0.31,
"step": 299
},
{
"epoch": 0.7604562737642585,
"grad_norm": 0.2966432273387909,
"learning_rate": 0.0001700763358778626,
"loss": 0.2439,
"step": 300
},
{
"epoch": 0.7629911280101395,
"grad_norm": 0.32714638113975525,
"learning_rate": 0.00016997455470737916,
"loss": 0.2653,
"step": 301
},
{
"epoch": 0.7655259822560203,
"grad_norm": 0.32264646887779236,
"learning_rate": 0.00016987277353689567,
"loss": 0.2728,
"step": 302
},
{
"epoch": 0.7680608365019012,
"grad_norm": 0.4073767066001892,
"learning_rate": 0.00016977099236641224,
"loss": 0.3501,
"step": 303
},
{
"epoch": 0.770595690747782,
"grad_norm": 0.5493949055671692,
"learning_rate": 0.00016966921119592875,
"loss": 0.3212,
"step": 304
},
{
"epoch": 0.7731305449936628,
"grad_norm": 0.335705429315567,
"learning_rate": 0.00016956743002544532,
"loss": 0.299,
"step": 305
},
{
"epoch": 0.7756653992395437,
"grad_norm": 0.32758405804634094,
"learning_rate": 0.00016946564885496183,
"loss": 0.2547,
"step": 306
},
{
"epoch": 0.7782002534854245,
"grad_norm": 0.32411983609199524,
"learning_rate": 0.0001693638676844784,
"loss": 0.2593,
"step": 307
},
{
"epoch": 0.7807351077313055,
"grad_norm": 0.5713444352149963,
"learning_rate": 0.0001692620865139949,
"loss": 0.3661,
"step": 308
},
{
"epoch": 0.7832699619771863,
"grad_norm": 0.3287065327167511,
"learning_rate": 0.00016916030534351145,
"loss": 0.2559,
"step": 309
},
{
"epoch": 0.7858048162230672,
"grad_norm": 0.3499440550804138,
"learning_rate": 0.000169058524173028,
"loss": 0.3489,
"step": 310
},
{
"epoch": 0.788339670468948,
"grad_norm": 0.259787917137146,
"learning_rate": 0.00016895674300254453,
"loss": 0.2451,
"step": 311
},
{
"epoch": 0.7908745247148289,
"grad_norm": 0.3902716338634491,
"learning_rate": 0.0001688549618320611,
"loss": 0.2821,
"step": 312
},
{
"epoch": 0.7934093789607097,
"grad_norm": 0.4061296582221985,
"learning_rate": 0.0001687531806615776,
"loss": 0.4289,
"step": 313
},
{
"epoch": 0.7959442332065906,
"grad_norm": 0.3062605857849121,
"learning_rate": 0.00016865139949109417,
"loss": 0.2489,
"step": 314
},
{
"epoch": 0.7984790874524715,
"grad_norm": 0.36886945366859436,
"learning_rate": 0.00016854961832061068,
"loss": 0.4049,
"step": 315
},
{
"epoch": 0.8010139416983524,
"grad_norm": 0.25828975439071655,
"learning_rate": 0.00016844783715012725,
"loss": 0.238,
"step": 316
},
{
"epoch": 0.8035487959442332,
"grad_norm": 0.39747142791748047,
"learning_rate": 0.00016834605597964376,
"loss": 0.3928,
"step": 317
},
{
"epoch": 0.8060836501901141,
"grad_norm": 0.3884779214859009,
"learning_rate": 0.00016824427480916033,
"loss": 0.2881,
"step": 318
},
{
"epoch": 0.8086185044359949,
"grad_norm": 0.3687349855899811,
"learning_rate": 0.00016814249363867684,
"loss": 0.3662,
"step": 319
},
{
"epoch": 0.8111533586818758,
"grad_norm": 0.3631541132926941,
"learning_rate": 0.0001680407124681934,
"loss": 0.2657,
"step": 320
},
{
"epoch": 0.8136882129277566,
"grad_norm": 0.3174535930156708,
"learning_rate": 0.00016793893129770992,
"loss": 0.2636,
"step": 321
},
{
"epoch": 0.8162230671736375,
"grad_norm": 0.44168904423713684,
"learning_rate": 0.00016783715012722646,
"loss": 0.2882,
"step": 322
},
{
"epoch": 0.8187579214195184,
"grad_norm": 0.370685875415802,
"learning_rate": 0.000167735368956743,
"loss": 0.3228,
"step": 323
},
{
"epoch": 0.8212927756653993,
"grad_norm": 0.3001299798488617,
"learning_rate": 0.00016763358778625954,
"loss": 0.2256,
"step": 324
},
{
"epoch": 0.8238276299112801,
"grad_norm": 0.37992653250694275,
"learning_rate": 0.0001675318066157761,
"loss": 0.2633,
"step": 325
},
{
"epoch": 0.826362484157161,
"grad_norm": 0.4739125072956085,
"learning_rate": 0.00016743002544529262,
"loss": 0.3044,
"step": 326
},
{
"epoch": 0.8288973384030418,
"grad_norm": 0.36424344778060913,
"learning_rate": 0.00016732824427480918,
"loss": 0.3311,
"step": 327
},
{
"epoch": 0.8314321926489227,
"grad_norm": 0.4474777579307556,
"learning_rate": 0.0001672264631043257,
"loss": 0.4099,
"step": 328
},
{
"epoch": 0.8339670468948035,
"grad_norm": 0.4337301552295685,
"learning_rate": 0.00016712468193384226,
"loss": 0.3567,
"step": 329
},
{
"epoch": 0.8365019011406845,
"grad_norm": 0.37666353583335876,
"learning_rate": 0.00016702290076335877,
"loss": 0.3079,
"step": 330
},
{
"epoch": 0.8390367553865653,
"grad_norm": 0.36810433864593506,
"learning_rate": 0.00016692111959287534,
"loss": 0.414,
"step": 331
},
{
"epoch": 0.8415716096324461,
"grad_norm": 0.3914581537246704,
"learning_rate": 0.00016681933842239185,
"loss": 0.2807,
"step": 332
},
{
"epoch": 0.844106463878327,
"grad_norm": 0.3891938626766205,
"learning_rate": 0.00016671755725190842,
"loss": 0.3101,
"step": 333
},
{
"epoch": 0.8466413181242078,
"grad_norm": 0.4397302269935608,
"learning_rate": 0.00016661577608142493,
"loss": 0.2659,
"step": 334
},
{
"epoch": 0.8491761723700887,
"grad_norm": 0.3152853846549988,
"learning_rate": 0.0001665139949109415,
"loss": 0.308,
"step": 335
},
{
"epoch": 0.8517110266159695,
"grad_norm": 0.2894272208213806,
"learning_rate": 0.00016641221374045804,
"loss": 0.2675,
"step": 336
},
{
"epoch": 0.8542458808618505,
"grad_norm": 0.27995947003364563,
"learning_rate": 0.00016631043256997455,
"loss": 0.2603,
"step": 337
},
{
"epoch": 0.8567807351077313,
"grad_norm": 0.42209070920944214,
"learning_rate": 0.00016620865139949112,
"loss": 0.3417,
"step": 338
},
{
"epoch": 0.8593155893536122,
"grad_norm": 0.3781871795654297,
"learning_rate": 0.00016610687022900763,
"loss": 0.3441,
"step": 339
},
{
"epoch": 0.861850443599493,
"grad_norm": 0.3438952565193176,
"learning_rate": 0.0001660050890585242,
"loss": 0.2249,
"step": 340
},
{
"epoch": 0.8643852978453739,
"grad_norm": 0.32164961099624634,
"learning_rate": 0.0001659033078880407,
"loss": 0.2472,
"step": 341
},
{
"epoch": 0.8669201520912547,
"grad_norm": 0.3517252504825592,
"learning_rate": 0.00016580152671755727,
"loss": 0.2434,
"step": 342
},
{
"epoch": 0.8694550063371356,
"grad_norm": 0.29841092228889465,
"learning_rate": 0.00016569974554707378,
"loss": 0.2536,
"step": 343
},
{
"epoch": 0.8719898605830165,
"grad_norm": 0.3351423144340515,
"learning_rate": 0.00016559796437659035,
"loss": 0.2501,
"step": 344
},
{
"epoch": 0.8745247148288974,
"grad_norm": 0.3979301154613495,
"learning_rate": 0.00016549618320610686,
"loss": 0.2358,
"step": 345
},
{
"epoch": 0.8770595690747782,
"grad_norm": 0.3859489858150482,
"learning_rate": 0.00016539440203562343,
"loss": 0.2675,
"step": 346
},
{
"epoch": 0.8795944233206591,
"grad_norm": 0.3836475908756256,
"learning_rate": 0.00016529262086513994,
"loss": 0.2179,
"step": 347
},
{
"epoch": 0.8821292775665399,
"grad_norm": 0.3986142575740814,
"learning_rate": 0.0001651908396946565,
"loss": 0.2599,
"step": 348
},
{
"epoch": 0.8846641318124208,
"grad_norm": 0.4105628430843353,
"learning_rate": 0.00016508905852417305,
"loss": 0.242,
"step": 349
},
{
"epoch": 0.8871989860583016,
"grad_norm": 0.34334608912467957,
"learning_rate": 0.00016498727735368956,
"loss": 0.2771,
"step": 350
},
{
"epoch": 0.8897338403041825,
"grad_norm": 0.3412443995475769,
"learning_rate": 0.00016488549618320613,
"loss": 0.2289,
"step": 351
},
{
"epoch": 0.8922686945500634,
"grad_norm": 0.3596668541431427,
"learning_rate": 0.00016478371501272264,
"loss": 0.2253,
"step": 352
},
{
"epoch": 0.8948035487959443,
"grad_norm": 0.43112802505493164,
"learning_rate": 0.0001646819338422392,
"loss": 0.3116,
"step": 353
},
{
"epoch": 0.8973384030418251,
"grad_norm": 0.4306243062019348,
"learning_rate": 0.00016458015267175572,
"loss": 0.3099,
"step": 354
},
{
"epoch": 0.899873257287706,
"grad_norm": 0.2773829996585846,
"learning_rate": 0.00016447837150127228,
"loss": 0.2765,
"step": 355
},
{
"epoch": 0.9024081115335868,
"grad_norm": 0.5014198422431946,
"learning_rate": 0.0001643765903307888,
"loss": 0.302,
"step": 356
},
{
"epoch": 0.9049429657794676,
"grad_norm": 0.4376792013645172,
"learning_rate": 0.00016427480916030536,
"loss": 0.2967,
"step": 357
},
{
"epoch": 0.9074778200253485,
"grad_norm": 0.34460946917533875,
"learning_rate": 0.00016417302798982187,
"loss": 0.3678,
"step": 358
},
{
"epoch": 0.9100126742712294,
"grad_norm": 0.23346909880638123,
"learning_rate": 0.00016407124681933844,
"loss": 0.2409,
"step": 359
},
{
"epoch": 0.9125475285171103,
"grad_norm": 0.35633108019828796,
"learning_rate": 0.00016396946564885498,
"loss": 0.3555,
"step": 360
},
{
"epoch": 0.9150823827629911,
"grad_norm": 0.26780250668525696,
"learning_rate": 0.00016386768447837152,
"loss": 0.2543,
"step": 361
},
{
"epoch": 0.917617237008872,
"grad_norm": 0.34583303332328796,
"learning_rate": 0.00016376590330788806,
"loss": 0.2444,
"step": 362
},
{
"epoch": 0.9201520912547528,
"grad_norm": 0.38331279158592224,
"learning_rate": 0.0001636641221374046,
"loss": 0.3549,
"step": 363
},
{
"epoch": 0.9226869455006337,
"grad_norm": 0.37290483713150024,
"learning_rate": 0.00016356234096692114,
"loss": 0.3311,
"step": 364
},
{
"epoch": 0.9252217997465145,
"grad_norm": 0.406568318605423,
"learning_rate": 0.00016346055979643765,
"loss": 0.2774,
"step": 365
},
{
"epoch": 0.9277566539923955,
"grad_norm": 0.35498303174972534,
"learning_rate": 0.00016335877862595422,
"loss": 0.2121,
"step": 366
},
{
"epoch": 0.9302915082382763,
"grad_norm": 0.3682021498680115,
"learning_rate": 0.00016325699745547073,
"loss": 0.2648,
"step": 367
},
{
"epoch": 0.9328263624841572,
"grad_norm": 0.37826359272003174,
"learning_rate": 0.0001631552162849873,
"loss": 0.2214,
"step": 368
},
{
"epoch": 0.935361216730038,
"grad_norm": 0.4018029570579529,
"learning_rate": 0.0001630534351145038,
"loss": 0.2291,
"step": 369
},
{
"epoch": 0.9378960709759189,
"grad_norm": 0.4628411531448364,
"learning_rate": 0.00016295165394402037,
"loss": 0.3486,
"step": 370
},
{
"epoch": 0.9404309252217997,
"grad_norm": 0.5615106821060181,
"learning_rate": 0.00016284987277353689,
"loss": 0.3281,
"step": 371
},
{
"epoch": 0.9429657794676806,
"grad_norm": 0.40337833762168884,
"learning_rate": 0.00016274809160305345,
"loss": 0.22,
"step": 372
},
{
"epoch": 0.9455006337135615,
"grad_norm": 0.4247727692127228,
"learning_rate": 0.00016264631043257,
"loss": 0.2801,
"step": 373
},
{
"epoch": 0.9480354879594424,
"grad_norm": 0.28746598958969116,
"learning_rate": 0.00016254452926208653,
"loss": 0.2349,
"step": 374
},
{
"epoch": 0.9505703422053232,
"grad_norm": 0.3654968738555908,
"learning_rate": 0.00016244274809160307,
"loss": 0.2696,
"step": 375
},
{
"epoch": 0.9531051964512041,
"grad_norm": 0.3999825417995453,
"learning_rate": 0.0001623409669211196,
"loss": 0.4228,
"step": 376
},
{
"epoch": 0.9556400506970849,
"grad_norm": 0.3065613806247711,
"learning_rate": 0.00016223918575063615,
"loss": 0.2505,
"step": 377
},
{
"epoch": 0.9581749049429658,
"grad_norm": 0.3503481149673462,
"learning_rate": 0.0001621374045801527,
"loss": 0.2953,
"step": 378
},
{
"epoch": 0.9607097591888466,
"grad_norm": 0.28918176889419556,
"learning_rate": 0.00016203562340966923,
"loss": 0.2454,
"step": 379
},
{
"epoch": 0.9632446134347274,
"grad_norm": 0.3047085404396057,
"learning_rate": 0.00016193384223918574,
"loss": 0.2639,
"step": 380
},
{
"epoch": 0.9657794676806084,
"grad_norm": 0.3775922358036041,
"learning_rate": 0.0001618320610687023,
"loss": 0.3787,
"step": 381
},
{
"epoch": 0.9683143219264893,
"grad_norm": 0.32147660851478577,
"learning_rate": 0.00016173027989821882,
"loss": 0.2273,
"step": 382
},
{
"epoch": 0.9708491761723701,
"grad_norm": 0.355747252702713,
"learning_rate": 0.00016162849872773538,
"loss": 0.2805,
"step": 383
},
{
"epoch": 0.973384030418251,
"grad_norm": 0.2670198082923889,
"learning_rate": 0.0001615267175572519,
"loss": 0.2393,
"step": 384
},
{
"epoch": 0.9759188846641318,
"grad_norm": 0.3395114839076996,
"learning_rate": 0.00016142493638676846,
"loss": 0.2893,
"step": 385
},
{
"epoch": 0.9784537389100126,
"grad_norm": 0.3189052641391754,
"learning_rate": 0.000161323155216285,
"loss": 0.2442,
"step": 386
},
{
"epoch": 0.9809885931558935,
"grad_norm": 0.49379605054855347,
"learning_rate": 0.00016122137404580154,
"loss": 0.3126,
"step": 387
},
{
"epoch": 0.9835234474017744,
"grad_norm": 0.2787371575832367,
"learning_rate": 0.00016111959287531808,
"loss": 0.2329,
"step": 388
},
{
"epoch": 0.9860583016476553,
"grad_norm": 0.3559485673904419,
"learning_rate": 0.00016101781170483462,
"loss": 0.335,
"step": 389
},
{
"epoch": 0.9885931558935361,
"grad_norm": 0.43041396141052246,
"learning_rate": 0.00016091603053435116,
"loss": 0.3069,
"step": 390
},
{
"epoch": 0.991128010139417,
"grad_norm": 0.3231935203075409,
"learning_rate": 0.0001608142493638677,
"loss": 0.2354,
"step": 391
},
{
"epoch": 0.9936628643852978,
"grad_norm": 0.3676549792289734,
"learning_rate": 0.00016071246819338424,
"loss": 0.2958,
"step": 392
},
{
"epoch": 0.9961977186311787,
"grad_norm": 0.37902191281318665,
"learning_rate": 0.00016061068702290075,
"loss": 0.2792,
"step": 393
},
{
"epoch": 0.9987325728770595,
"grad_norm": 0.47126442193984985,
"learning_rate": 0.00016050890585241732,
"loss": 0.4871,
"step": 394
},
{
"epoch": 1.0,
"grad_norm": 0.4303727447986603,
"learning_rate": 0.00016040712468193383,
"loss": 0.2121,
"step": 395
},
{
"epoch": 1.002534854245881,
"grad_norm": 0.3156070411205292,
"learning_rate": 0.0001603053435114504,
"loss": 0.2528,
"step": 396
},
{
"epoch": 1.0050697084917617,
"grad_norm": 0.3030865788459778,
"learning_rate": 0.00016020356234096693,
"loss": 0.2029,
"step": 397
},
{
"epoch": 1.0076045627376427,
"grad_norm": 0.2900277376174927,
"learning_rate": 0.00016010178117048347,
"loss": 0.2192,
"step": 398
},
{
"epoch": 1.0101394169835234,
"grad_norm": 0.4288582503795624,
"learning_rate": 0.00016,
"loss": 0.308,
"step": 399
},
{
"epoch": 1.0126742712294043,
"grad_norm": 0.3376273214817047,
"learning_rate": 0.00015989821882951655,
"loss": 0.2569,
"step": 400
},
{
"epoch": 1.015209125475285,
"grad_norm": 0.39375385642051697,
"learning_rate": 0.0001597964376590331,
"loss": 0.2104,
"step": 401
},
{
"epoch": 1.017743979721166,
"grad_norm": 0.2907378077507019,
"learning_rate": 0.00015969465648854963,
"loss": 0.2057,
"step": 402
},
{
"epoch": 1.020278833967047,
"grad_norm": 0.3524622917175293,
"learning_rate": 0.00015959287531806617,
"loss": 0.2296,
"step": 403
},
{
"epoch": 1.0228136882129277,
"grad_norm": 0.36487293243408203,
"learning_rate": 0.0001594910941475827,
"loss": 0.2133,
"step": 404
},
{
"epoch": 1.0253485424588087,
"grad_norm": 0.4489257335662842,
"learning_rate": 0.00015938931297709925,
"loss": 0.2162,
"step": 405
},
{
"epoch": 1.0278833967046894,
"grad_norm": 0.41142696142196655,
"learning_rate": 0.0001592875318066158,
"loss": 0.2383,
"step": 406
},
{
"epoch": 1.0304182509505704,
"grad_norm": 0.3364538848400116,
"learning_rate": 0.00015918575063613233,
"loss": 0.2077,
"step": 407
},
{
"epoch": 1.0329531051964511,
"grad_norm": 0.576775312423706,
"learning_rate": 0.00015908396946564884,
"loss": 0.2435,
"step": 408
},
{
"epoch": 1.035487959442332,
"grad_norm": 0.6190880537033081,
"learning_rate": 0.0001589821882951654,
"loss": 0.252,
"step": 409
},
{
"epoch": 1.038022813688213,
"grad_norm": 0.4943700432777405,
"learning_rate": 0.00015888040712468195,
"loss": 0.3275,
"step": 410
},
{
"epoch": 1.0405576679340938,
"grad_norm": 0.3160712420940399,
"learning_rate": 0.00015877862595419848,
"loss": 0.217,
"step": 411
},
{
"epoch": 1.0430925221799747,
"grad_norm": 0.34546172618865967,
"learning_rate": 0.00015867684478371502,
"loss": 0.2509,
"step": 412
},
{
"epoch": 1.0456273764258555,
"grad_norm": 0.3498256802558899,
"learning_rate": 0.00015857506361323156,
"loss": 0.2376,
"step": 413
},
{
"epoch": 1.0481622306717364,
"grad_norm": 0.29526984691619873,
"learning_rate": 0.0001584732824427481,
"loss": 0.2305,
"step": 414
},
{
"epoch": 1.0506970849176172,
"grad_norm": 0.30113956332206726,
"learning_rate": 0.00015837150127226464,
"loss": 0.2205,
"step": 415
},
{
"epoch": 1.053231939163498,
"grad_norm": 0.4007863402366638,
"learning_rate": 0.00015826972010178118,
"loss": 0.2407,
"step": 416
},
{
"epoch": 1.055766793409379,
"grad_norm": 0.2594064176082611,
"learning_rate": 0.00015816793893129772,
"loss": 0.1923,
"step": 417
},
{
"epoch": 1.0583016476552598,
"grad_norm": 0.23412476480007172,
"learning_rate": 0.00015806615776081426,
"loss": 0.2158,
"step": 418
},
{
"epoch": 1.0608365019011408,
"grad_norm": 0.397443562746048,
"learning_rate": 0.0001579643765903308,
"loss": 0.3666,
"step": 419
},
{
"epoch": 1.0633713561470215,
"grad_norm": 0.3756926655769348,
"learning_rate": 0.00015786259541984734,
"loss": 0.2081,
"step": 420
},
{
"epoch": 1.0659062103929025,
"grad_norm": 0.5698515772819519,
"learning_rate": 0.00015776081424936388,
"loss": 0.2265,
"step": 421
},
{
"epoch": 1.0684410646387832,
"grad_norm": 0.3608737289905548,
"learning_rate": 0.00015765903307888042,
"loss": 0.3821,
"step": 422
},
{
"epoch": 1.0709759188846641,
"grad_norm": 0.4109106957912445,
"learning_rate": 0.00015755725190839696,
"loss": 0.3484,
"step": 423
},
{
"epoch": 1.073510773130545,
"grad_norm": 0.38270992040634155,
"learning_rate": 0.0001574554707379135,
"loss": 0.2365,
"step": 424
},
{
"epoch": 1.0760456273764258,
"grad_norm": 0.2857488989830017,
"learning_rate": 0.00015735368956743004,
"loss": 0.263,
"step": 425
},
{
"epoch": 1.0785804816223068,
"grad_norm": 0.25236523151397705,
"learning_rate": 0.00015725190839694657,
"loss": 0.2216,
"step": 426
},
{
"epoch": 1.0811153358681875,
"grad_norm": 0.40370991826057434,
"learning_rate": 0.00015715012722646311,
"loss": 0.3711,
"step": 427
},
{
"epoch": 1.0836501901140685,
"grad_norm": 0.2624306380748749,
"learning_rate": 0.00015704834605597965,
"loss": 0.2082,
"step": 428
},
{
"epoch": 1.0861850443599492,
"grad_norm": 0.4375905692577362,
"learning_rate": 0.0001569465648854962,
"loss": 0.3474,
"step": 429
},
{
"epoch": 1.0887198986058302,
"grad_norm": 0.3287188410758972,
"learning_rate": 0.00015684478371501273,
"loss": 0.3097,
"step": 430
},
{
"epoch": 1.091254752851711,
"grad_norm": 0.2669587731361389,
"learning_rate": 0.00015674300254452927,
"loss": 0.229,
"step": 431
},
{
"epoch": 1.0937896070975919,
"grad_norm": 0.28192129731178284,
"learning_rate": 0.0001566412213740458,
"loss": 0.2226,
"step": 432
},
{
"epoch": 1.0963244613434728,
"grad_norm": 0.30673590302467346,
"learning_rate": 0.00015653944020356235,
"loss": 0.2331,
"step": 433
},
{
"epoch": 1.0988593155893536,
"grad_norm": 0.34343135356903076,
"learning_rate": 0.0001564376590330789,
"loss": 0.2567,
"step": 434
},
{
"epoch": 1.1013941698352345,
"grad_norm": 0.4853306710720062,
"learning_rate": 0.00015633587786259543,
"loss": 0.3688,
"step": 435
},
{
"epoch": 1.1039290240811153,
"grad_norm": 0.42215099930763245,
"learning_rate": 0.00015623409669211197,
"loss": 0.3465,
"step": 436
},
{
"epoch": 1.1064638783269962,
"grad_norm": 0.5882295370101929,
"learning_rate": 0.0001561323155216285,
"loss": 0.4502,
"step": 437
},
{
"epoch": 1.1089987325728772,
"grad_norm": 0.44578316807746887,
"learning_rate": 0.00015603053435114505,
"loss": 0.3345,
"step": 438
},
{
"epoch": 1.111533586818758,
"grad_norm": 0.366653174161911,
"learning_rate": 0.00015592875318066159,
"loss": 0.2111,
"step": 439
},
{
"epoch": 1.1140684410646389,
"grad_norm": 0.4964495003223419,
"learning_rate": 0.00015582697201017812,
"loss": 0.2731,
"step": 440
},
{
"epoch": 1.1166032953105196,
"grad_norm": 0.3171039819717407,
"learning_rate": 0.00015572519083969466,
"loss": 0.2148,
"step": 441
},
{
"epoch": 1.1191381495564006,
"grad_norm": 0.3483026921749115,
"learning_rate": 0.0001556234096692112,
"loss": 0.2481,
"step": 442
},
{
"epoch": 1.1216730038022813,
"grad_norm": 0.37379321455955505,
"learning_rate": 0.00015552162849872774,
"loss": 0.3292,
"step": 443
},
{
"epoch": 1.1242078580481623,
"grad_norm": 0.32108721137046814,
"learning_rate": 0.00015541984732824428,
"loss": 0.3363,
"step": 444
},
{
"epoch": 1.126742712294043,
"grad_norm": 0.3879946768283844,
"learning_rate": 0.00015531806615776082,
"loss": 0.2891,
"step": 445
},
{
"epoch": 1.129277566539924,
"grad_norm": 0.2334345281124115,
"learning_rate": 0.00015521628498727736,
"loss": 0.2183,
"step": 446
},
{
"epoch": 1.131812420785805,
"grad_norm": 0.274795264005661,
"learning_rate": 0.0001551145038167939,
"loss": 0.2002,
"step": 447
},
{
"epoch": 1.1343472750316856,
"grad_norm": 0.45602667331695557,
"learning_rate": 0.00015501272264631044,
"loss": 0.3282,
"step": 448
},
{
"epoch": 1.1368821292775666,
"grad_norm": 0.25433096289634705,
"learning_rate": 0.00015491094147582698,
"loss": 0.2195,
"step": 449
},
{
"epoch": 1.1394169835234473,
"grad_norm": 0.3606742024421692,
"learning_rate": 0.00015480916030534352,
"loss": 0.244,
"step": 450
},
{
"epoch": 1.1419518377693283,
"grad_norm": 0.3597625494003296,
"learning_rate": 0.00015470737913486006,
"loss": 0.2117,
"step": 451
},
{
"epoch": 1.144486692015209,
"grad_norm": 0.32967302203178406,
"learning_rate": 0.0001546055979643766,
"loss": 0.2662,
"step": 452
},
{
"epoch": 1.14702154626109,
"grad_norm": 0.32538869976997375,
"learning_rate": 0.00015450381679389314,
"loss": 0.2439,
"step": 453
},
{
"epoch": 1.149556400506971,
"grad_norm": 0.36263129115104675,
"learning_rate": 0.00015440203562340968,
"loss": 0.2688,
"step": 454
},
{
"epoch": 1.1520912547528517,
"grad_norm": 0.4200229346752167,
"learning_rate": 0.00015430025445292621,
"loss": 0.3201,
"step": 455
},
{
"epoch": 1.1546261089987326,
"grad_norm": 0.35889115929603577,
"learning_rate": 0.00015419847328244275,
"loss": 0.2584,
"step": 456
},
{
"epoch": 1.1571609632446134,
"grad_norm": 0.36060044169425964,
"learning_rate": 0.0001540966921119593,
"loss": 0.2496,
"step": 457
},
{
"epoch": 1.1596958174904943,
"grad_norm": 0.3046696186065674,
"learning_rate": 0.00015399491094147583,
"loss": 0.2102,
"step": 458
},
{
"epoch": 1.162230671736375,
"grad_norm": 0.4576256275177002,
"learning_rate": 0.00015389312977099237,
"loss": 0.3594,
"step": 459
},
{
"epoch": 1.164765525982256,
"grad_norm": 0.3436565697193146,
"learning_rate": 0.0001537913486005089,
"loss": 0.2289,
"step": 460
},
{
"epoch": 1.167300380228137,
"grad_norm": 0.4197808802127838,
"learning_rate": 0.00015368956743002545,
"loss": 0.2863,
"step": 461
},
{
"epoch": 1.1698352344740177,
"grad_norm": 0.3584151566028595,
"learning_rate": 0.000153587786259542,
"loss": 0.2797,
"step": 462
},
{
"epoch": 1.1723700887198987,
"grad_norm": 0.29760056734085083,
"learning_rate": 0.00015348600508905853,
"loss": 0.212,
"step": 463
},
{
"epoch": 1.1749049429657794,
"grad_norm": 0.3856862485408783,
"learning_rate": 0.00015338422391857507,
"loss": 0.2986,
"step": 464
},
{
"epoch": 1.1774397972116604,
"grad_norm": 0.42522993683815,
"learning_rate": 0.0001532824427480916,
"loss": 0.2869,
"step": 465
},
{
"epoch": 1.179974651457541,
"grad_norm": 0.33221253752708435,
"learning_rate": 0.00015318066157760815,
"loss": 0.2236,
"step": 466
},
{
"epoch": 1.182509505703422,
"grad_norm": 0.35414496064186096,
"learning_rate": 0.00015307888040712469,
"loss": 0.2658,
"step": 467
},
{
"epoch": 1.1850443599493028,
"grad_norm": 0.41883930563926697,
"learning_rate": 0.00015297709923664123,
"loss": 0.3939,
"step": 468
},
{
"epoch": 1.1875792141951838,
"grad_norm": 0.3070299029350281,
"learning_rate": 0.00015287531806615776,
"loss": 0.2208,
"step": 469
},
{
"epoch": 1.1901140684410647,
"grad_norm": 0.30749714374542236,
"learning_rate": 0.0001527735368956743,
"loss": 0.242,
"step": 470
},
{
"epoch": 1.1926489226869454,
"grad_norm": 0.2579677104949951,
"learning_rate": 0.00015267175572519084,
"loss": 0.2435,
"step": 471
},
{
"epoch": 1.1951837769328264,
"grad_norm": 0.46220460534095764,
"learning_rate": 0.00015256997455470738,
"loss": 0.2803,
"step": 472
},
{
"epoch": 1.1977186311787071,
"grad_norm": 0.3824957609176636,
"learning_rate": 0.00015246819338422392,
"loss": 0.3143,
"step": 473
},
{
"epoch": 1.200253485424588,
"grad_norm": 0.3049899637699127,
"learning_rate": 0.00015236641221374046,
"loss": 0.2231,
"step": 474
},
{
"epoch": 1.202788339670469,
"grad_norm": 0.4378805458545685,
"learning_rate": 0.000152264631043257,
"loss": 0.2041,
"step": 475
},
{
"epoch": 1.2053231939163498,
"grad_norm": 0.3902495801448822,
"learning_rate": 0.00015216284987277354,
"loss": 0.3055,
"step": 476
},
{
"epoch": 1.2078580481622307,
"grad_norm": 0.3150664269924164,
"learning_rate": 0.00015206106870229008,
"loss": 0.2222,
"step": 477
},
{
"epoch": 1.2103929024081115,
"grad_norm": 0.3551795184612274,
"learning_rate": 0.00015195928753180662,
"loss": 0.2304,
"step": 478
},
{
"epoch": 1.2129277566539924,
"grad_norm": 0.35522422194480896,
"learning_rate": 0.00015185750636132316,
"loss": 0.2636,
"step": 479
},
{
"epoch": 1.2154626108998732,
"grad_norm": 0.35261449217796326,
"learning_rate": 0.0001517557251908397,
"loss": 0.2743,
"step": 480
},
{
"epoch": 1.2179974651457541,
"grad_norm": 0.4755167067050934,
"learning_rate": 0.00015165394402035624,
"loss": 0.321,
"step": 481
},
{
"epoch": 1.2205323193916349,
"grad_norm": 0.36083585023880005,
"learning_rate": 0.0001515521628498728,
"loss": 0.2549,
"step": 482
},
{
"epoch": 1.2230671736375158,
"grad_norm": 0.3213503956794739,
"learning_rate": 0.00015145038167938932,
"loss": 0.2685,
"step": 483
},
{
"epoch": 1.2256020278833968,
"grad_norm": 0.29988422989845276,
"learning_rate": 0.00015134860050890588,
"loss": 0.3253,
"step": 484
},
{
"epoch": 1.2281368821292775,
"grad_norm": 0.3549601435661316,
"learning_rate": 0.0001512468193384224,
"loss": 0.2574,
"step": 485
},
{
"epoch": 1.2306717363751585,
"grad_norm": 0.33347830176353455,
"learning_rate": 0.00015114503816793893,
"loss": 0.3408,
"step": 486
},
{
"epoch": 1.2332065906210392,
"grad_norm": 0.2988692820072174,
"learning_rate": 0.00015104325699745547,
"loss": 0.2583,
"step": 487
},
{
"epoch": 1.2357414448669202,
"grad_norm": 0.2710984945297241,
"learning_rate": 0.000150941475826972,
"loss": 0.2708,
"step": 488
},
{
"epoch": 1.2382762991128011,
"grad_norm": 0.28278592228889465,
"learning_rate": 0.00015083969465648855,
"loss": 0.2345,
"step": 489
},
{
"epoch": 1.2408111533586819,
"grad_norm": 0.31838810443878174,
"learning_rate": 0.0001507379134860051,
"loss": 0.2193,
"step": 490
},
{
"epoch": 1.2433460076045628,
"grad_norm": 0.31196919083595276,
"learning_rate": 0.00015063613231552163,
"loss": 0.2334,
"step": 491
},
{
"epoch": 1.2458808618504436,
"grad_norm": 0.3953218460083008,
"learning_rate": 0.00015053435114503817,
"loss": 0.2716,
"step": 492
},
{
"epoch": 1.2484157160963245,
"grad_norm": 0.4814457297325134,
"learning_rate": 0.0001504325699745547,
"loss": 0.2847,
"step": 493
},
{
"epoch": 1.2509505703422052,
"grad_norm": 0.5870761275291443,
"learning_rate": 0.00015033078880407125,
"loss": 0.3685,
"step": 494
},
{
"epoch": 1.2534854245880862,
"grad_norm": 0.30315646529197693,
"learning_rate": 0.00015022900763358781,
"loss": 0.2112,
"step": 495
},
{
"epoch": 1.256020278833967,
"grad_norm": 0.4358583390712738,
"learning_rate": 0.00015012722646310433,
"loss": 0.279,
"step": 496
},
{
"epoch": 1.258555133079848,
"grad_norm": 0.3699369728565216,
"learning_rate": 0.0001500254452926209,
"loss": 0.2941,
"step": 497
},
{
"epoch": 1.2610899873257289,
"grad_norm": 0.338522344827652,
"learning_rate": 0.0001499236641221374,
"loss": 0.273,
"step": 498
},
{
"epoch": 1.2636248415716096,
"grad_norm": 0.29661208391189575,
"learning_rate": 0.00014982188295165397,
"loss": 0.23,
"step": 499
},
{
"epoch": 1.2661596958174905,
"grad_norm": 0.4247685968875885,
"learning_rate": 0.00014972010178117048,
"loss": 0.3112,
"step": 500
},
{
"epoch": 1.2686945500633713,
"grad_norm": 0.44488340616226196,
"learning_rate": 0.00014961832061068702,
"loss": 0.3796,
"step": 501
},
{
"epoch": 1.2712294043092522,
"grad_norm": 0.30672356486320496,
"learning_rate": 0.00014951653944020356,
"loss": 0.2222,
"step": 502
},
{
"epoch": 1.2737642585551332,
"grad_norm": 0.3291172981262207,
"learning_rate": 0.0001494147582697201,
"loss": 0.2177,
"step": 503
},
{
"epoch": 1.276299112801014,
"grad_norm": 0.4180152118206024,
"learning_rate": 0.00014931297709923664,
"loss": 0.3673,
"step": 504
},
{
"epoch": 1.2788339670468947,
"grad_norm": 0.41350388526916504,
"learning_rate": 0.00014921119592875318,
"loss": 0.2544,
"step": 505
},
{
"epoch": 1.2813688212927756,
"grad_norm": 0.3517690598964691,
"learning_rate": 0.00014910941475826972,
"loss": 0.2139,
"step": 506
},
{
"epoch": 1.2839036755386566,
"grad_norm": 0.4273949861526489,
"learning_rate": 0.00014900763358778626,
"loss": 0.255,
"step": 507
},
{
"epoch": 1.2864385297845373,
"grad_norm": 0.3510381877422333,
"learning_rate": 0.00014890585241730283,
"loss": 0.2503,
"step": 508
},
{
"epoch": 1.2889733840304183,
"grad_norm": 0.4069119393825531,
"learning_rate": 0.00014880407124681934,
"loss": 0.3267,
"step": 509
},
{
"epoch": 1.291508238276299,
"grad_norm": 0.6244072318077087,
"learning_rate": 0.0001487022900763359,
"loss": 0.2519,
"step": 510
},
{
"epoch": 1.29404309252218,
"grad_norm": 0.473450630903244,
"learning_rate": 0.00014860050890585242,
"loss": 0.3093,
"step": 511
},
{
"epoch": 1.296577946768061,
"grad_norm": 0.3139822781085968,
"learning_rate": 0.00014849872773536898,
"loss": 0.2396,
"step": 512
},
{
"epoch": 1.2991128010139417,
"grad_norm": 0.23700624704360962,
"learning_rate": 0.0001483969465648855,
"loss": 0.1945,
"step": 513
},
{
"epoch": 1.3016476552598226,
"grad_norm": 0.42849189043045044,
"learning_rate": 0.00014829516539440203,
"loss": 0.2275,
"step": 514
},
{
"epoch": 1.3041825095057034,
"grad_norm": 0.4083426296710968,
"learning_rate": 0.00014819338422391857,
"loss": 0.3626,
"step": 515
},
{
"epoch": 1.3067173637515843,
"grad_norm": 0.4541410207748413,
"learning_rate": 0.0001480916030534351,
"loss": 0.3102,
"step": 516
},
{
"epoch": 1.3092522179974653,
"grad_norm": 0.6483343839645386,
"learning_rate": 0.00014798982188295165,
"loss": 0.3427,
"step": 517
},
{
"epoch": 1.311787072243346,
"grad_norm": 0.3928525447845459,
"learning_rate": 0.0001478880407124682,
"loss": 0.3155,
"step": 518
},
{
"epoch": 1.3143219264892267,
"grad_norm": 0.319035142660141,
"learning_rate": 0.00014778625954198476,
"loss": 0.2555,
"step": 519
},
{
"epoch": 1.3168567807351077,
"grad_norm": 0.2855183780193329,
"learning_rate": 0.00014768447837150127,
"loss": 0.2115,
"step": 520
},
{
"epoch": 1.3193916349809887,
"grad_norm": 0.3499714136123657,
"learning_rate": 0.00014758269720101784,
"loss": 0.254,
"step": 521
},
{
"epoch": 1.3219264892268694,
"grad_norm": 0.40895748138427734,
"learning_rate": 0.00014748091603053435,
"loss": 0.2975,
"step": 522
},
{
"epoch": 1.3244613434727504,
"grad_norm": 0.30614539980888367,
"learning_rate": 0.00014737913486005091,
"loss": 0.2584,
"step": 523
},
{
"epoch": 1.326996197718631,
"grad_norm": 0.2832574248313904,
"learning_rate": 0.00014727735368956743,
"loss": 0.2259,
"step": 524
},
{
"epoch": 1.329531051964512,
"grad_norm": 0.3444589674472809,
"learning_rate": 0.000147175572519084,
"loss": 0.2608,
"step": 525
},
{
"epoch": 1.332065906210393,
"grad_norm": 0.35170844197273254,
"learning_rate": 0.0001470737913486005,
"loss": 0.3019,
"step": 526
},
{
"epoch": 1.3346007604562737,
"grad_norm": 0.46164563298225403,
"learning_rate": 0.00014697201017811707,
"loss": 0.2024,
"step": 527
},
{
"epoch": 1.3371356147021547,
"grad_norm": 0.2369971126317978,
"learning_rate": 0.00014687022900763358,
"loss": 0.1967,
"step": 528
},
{
"epoch": 1.3396704689480354,
"grad_norm": 0.43180060386657715,
"learning_rate": 0.00014676844783715012,
"loss": 0.2415,
"step": 529
},
{
"epoch": 1.3422053231939164,
"grad_norm": 0.3531292676925659,
"learning_rate": 0.00014666666666666666,
"loss": 0.2283,
"step": 530
},
{
"epoch": 1.3447401774397973,
"grad_norm": 0.49374547600746155,
"learning_rate": 0.0001465648854961832,
"loss": 0.3025,
"step": 531
},
{
"epoch": 1.347275031685678,
"grad_norm": 0.4822668731212616,
"learning_rate": 0.00014646310432569977,
"loss": 0.3498,
"step": 532
},
{
"epoch": 1.3498098859315588,
"grad_norm": 0.4463392496109009,
"learning_rate": 0.00014636132315521628,
"loss": 0.2186,
"step": 533
},
{
"epoch": 1.3523447401774398,
"grad_norm": 0.40042299032211304,
"learning_rate": 0.00014625954198473285,
"loss": 0.2316,
"step": 534
},
{
"epoch": 1.3548795944233207,
"grad_norm": 0.41266927123069763,
"learning_rate": 0.00014615776081424936,
"loss": 0.2324,
"step": 535
},
{
"epoch": 1.3574144486692015,
"grad_norm": 0.46208152174949646,
"learning_rate": 0.00014605597964376593,
"loss": 0.2261,
"step": 536
},
{
"epoch": 1.3599493029150824,
"grad_norm": 0.38895705342292786,
"learning_rate": 0.00014595419847328244,
"loss": 0.2732,
"step": 537
},
{
"epoch": 1.3624841571609632,
"grad_norm": 0.4489743113517761,
"learning_rate": 0.000145852417302799,
"loss": 0.3197,
"step": 538
},
{
"epoch": 1.3650190114068441,
"grad_norm": 0.25082916021347046,
"learning_rate": 0.00014575063613231552,
"loss": 0.2096,
"step": 539
},
{
"epoch": 1.367553865652725,
"grad_norm": 0.3681942820549011,
"learning_rate": 0.00014564885496183208,
"loss": 0.2496,
"step": 540
},
{
"epoch": 1.3700887198986058,
"grad_norm": 0.30986878275871277,
"learning_rate": 0.0001455470737913486,
"loss": 0.2244,
"step": 541
},
{
"epoch": 1.3726235741444868,
"grad_norm": 0.42349961400032043,
"learning_rate": 0.00014544529262086513,
"loss": 0.2315,
"step": 542
},
{
"epoch": 1.3751584283903675,
"grad_norm": 0.29656872153282166,
"learning_rate": 0.00014534351145038167,
"loss": 0.2458,
"step": 543
},
{
"epoch": 1.3776932826362485,
"grad_norm": 0.4033924341201782,
"learning_rate": 0.0001452417302798982,
"loss": 0.3506,
"step": 544
},
{
"epoch": 1.3802281368821292,
"grad_norm": 0.3998583257198334,
"learning_rate": 0.00014513994910941478,
"loss": 0.3108,
"step": 545
},
{
"epoch": 1.3827629911280102,
"grad_norm": 0.3335135281085968,
"learning_rate": 0.0001450381679389313,
"loss": 0.2816,
"step": 546
},
{
"epoch": 1.385297845373891,
"grad_norm": 0.39304816722869873,
"learning_rate": 0.00014493638676844786,
"loss": 0.3968,
"step": 547
},
{
"epoch": 1.3878326996197718,
"grad_norm": 0.34913384914398193,
"learning_rate": 0.00014483460559796437,
"loss": 0.2653,
"step": 548
},
{
"epoch": 1.3903675538656528,
"grad_norm": 0.3312399387359619,
"learning_rate": 0.00014473282442748094,
"loss": 0.2629,
"step": 549
},
{
"epoch": 1.3929024081115335,
"grad_norm": 0.31613558530807495,
"learning_rate": 0.00014463104325699745,
"loss": 0.2033,
"step": 550
},
{
"epoch": 1.3954372623574145,
"grad_norm": 0.2872864603996277,
"learning_rate": 0.00014452926208651402,
"loss": 0.2097,
"step": 551
},
{
"epoch": 1.3979721166032952,
"grad_norm": 0.24432098865509033,
"learning_rate": 0.00014442748091603053,
"loss": 0.2172,
"step": 552
},
{
"epoch": 1.4005069708491762,
"grad_norm": 0.31649062037467957,
"learning_rate": 0.0001443256997455471,
"loss": 0.2255,
"step": 553
},
{
"epoch": 1.4030418250950571,
"grad_norm": 0.2483261376619339,
"learning_rate": 0.0001442239185750636,
"loss": 0.1856,
"step": 554
},
{
"epoch": 1.4055766793409379,
"grad_norm": 0.437757670879364,
"learning_rate": 0.00014412213740458017,
"loss": 0.2713,
"step": 555
},
{
"epoch": 1.4081115335868186,
"grad_norm": 0.43551307916641235,
"learning_rate": 0.0001440203562340967,
"loss": 0.2654,
"step": 556
},
{
"epoch": 1.4106463878326996,
"grad_norm": 0.5781947374343872,
"learning_rate": 0.00014391857506361322,
"loss": 0.3242,
"step": 557
},
{
"epoch": 1.4131812420785805,
"grad_norm": 0.3809725344181061,
"learning_rate": 0.0001438167938931298,
"loss": 0.2176,
"step": 558
},
{
"epoch": 1.4157160963244613,
"grad_norm": 0.38208654522895813,
"learning_rate": 0.0001437150127226463,
"loss": 0.2043,
"step": 559
},
{
"epoch": 1.4182509505703422,
"grad_norm": 0.39930659532546997,
"learning_rate": 0.00014361323155216287,
"loss": 0.2914,
"step": 560
},
{
"epoch": 1.420785804816223,
"grad_norm": 0.3019846975803375,
"learning_rate": 0.00014351145038167938,
"loss": 0.2037,
"step": 561
},
{
"epoch": 1.423320659062104,
"grad_norm": 0.4549913704395294,
"learning_rate": 0.00014340966921119595,
"loss": 0.2308,
"step": 562
},
{
"epoch": 1.4258555133079849,
"grad_norm": 0.38887929916381836,
"learning_rate": 0.00014330788804071246,
"loss": 0.2339,
"step": 563
},
{
"epoch": 1.4283903675538656,
"grad_norm": 0.3481290340423584,
"learning_rate": 0.00014320610687022903,
"loss": 0.2206,
"step": 564
},
{
"epoch": 1.4309252217997466,
"grad_norm": 0.46603840589523315,
"learning_rate": 0.00014310432569974554,
"loss": 0.3006,
"step": 565
},
{
"epoch": 1.4334600760456273,
"grad_norm": 0.3586963713169098,
"learning_rate": 0.0001430025445292621,
"loss": 0.2646,
"step": 566
},
{
"epoch": 1.4359949302915083,
"grad_norm": 0.3106522560119629,
"learning_rate": 0.00014290076335877862,
"loss": 0.2725,
"step": 567
},
{
"epoch": 1.4385297845373892,
"grad_norm": 0.48086050152778625,
"learning_rate": 0.00014279898218829518,
"loss": 0.3007,
"step": 568
},
{
"epoch": 1.44106463878327,
"grad_norm": 0.44636330008506775,
"learning_rate": 0.00014269720101781172,
"loss": 0.3755,
"step": 569
},
{
"epoch": 1.4435994930291507,
"grad_norm": 0.3114064633846283,
"learning_rate": 0.00014259541984732824,
"loss": 0.2606,
"step": 570
},
{
"epoch": 1.4461343472750317,
"grad_norm": 0.358394593000412,
"learning_rate": 0.0001424936386768448,
"loss": 0.27,
"step": 571
},
{
"epoch": 1.4486692015209126,
"grad_norm": 0.3568032681941986,
"learning_rate": 0.00014239185750636131,
"loss": 0.2767,
"step": 572
},
{
"epoch": 1.4512040557667933,
"grad_norm": 0.4407200515270233,
"learning_rate": 0.00014229007633587788,
"loss": 0.3786,
"step": 573
},
{
"epoch": 1.4537389100126743,
"grad_norm": 0.4096840023994446,
"learning_rate": 0.0001421882951653944,
"loss": 0.3199,
"step": 574
},
{
"epoch": 1.456273764258555,
"grad_norm": 0.3343110680580139,
"learning_rate": 0.00014208651399491096,
"loss": 0.2538,
"step": 575
},
{
"epoch": 1.458808618504436,
"grad_norm": 0.27782517671585083,
"learning_rate": 0.00014198473282442747,
"loss": 0.2179,
"step": 576
},
{
"epoch": 1.461343472750317,
"grad_norm": 0.2901310920715332,
"learning_rate": 0.00014188295165394404,
"loss": 0.2552,
"step": 577
},
{
"epoch": 1.4638783269961977,
"grad_norm": 0.3634903132915497,
"learning_rate": 0.00014178117048346055,
"loss": 0.257,
"step": 578
},
{
"epoch": 1.4664131812420786,
"grad_norm": 0.37307262420654297,
"learning_rate": 0.00014167938931297712,
"loss": 0.254,
"step": 579
},
{
"epoch": 1.4689480354879594,
"grad_norm": 0.27726346254348755,
"learning_rate": 0.00014157760814249366,
"loss": 0.1938,
"step": 580
},
{
"epoch": 1.4714828897338403,
"grad_norm": 0.3364371657371521,
"learning_rate": 0.0001414758269720102,
"loss": 0.2094,
"step": 581
},
{
"epoch": 1.4740177439797213,
"grad_norm": 0.4418800473213196,
"learning_rate": 0.00014137404580152673,
"loss": 0.3243,
"step": 582
},
{
"epoch": 1.476552598225602,
"grad_norm": 0.42042022943496704,
"learning_rate": 0.00014127226463104327,
"loss": 0.2333,
"step": 583
},
{
"epoch": 1.4790874524714828,
"grad_norm": 0.36881470680236816,
"learning_rate": 0.0001411704834605598,
"loss": 0.2513,
"step": 584
},
{
"epoch": 1.4816223067173637,
"grad_norm": 0.4009782671928406,
"learning_rate": 0.00014106870229007632,
"loss": 0.3085,
"step": 585
},
{
"epoch": 1.4841571609632447,
"grad_norm": 0.43179744482040405,
"learning_rate": 0.0001409669211195929,
"loss": 0.3189,
"step": 586
},
{
"epoch": 1.4866920152091254,
"grad_norm": 0.3721300959587097,
"learning_rate": 0.0001408651399491094,
"loss": 0.2318,
"step": 587
},
{
"epoch": 1.4892268694550064,
"grad_norm": 0.3875066339969635,
"learning_rate": 0.00014076335877862597,
"loss": 0.2753,
"step": 588
},
{
"epoch": 1.491761723700887,
"grad_norm": 0.35223937034606934,
"learning_rate": 0.00014066157760814248,
"loss": 0.2257,
"step": 589
},
{
"epoch": 1.494296577946768,
"grad_norm": 0.30979710817337036,
"learning_rate": 0.00014055979643765905,
"loss": 0.2149,
"step": 590
},
{
"epoch": 1.496831432192649,
"grad_norm": 0.23923753201961517,
"learning_rate": 0.00014045801526717556,
"loss": 0.1911,
"step": 591
},
{
"epoch": 1.4993662864385298,
"grad_norm": 0.40893304347991943,
"learning_rate": 0.00014035623409669213,
"loss": 0.2756,
"step": 592
},
{
"epoch": 1.5019011406844105,
"grad_norm": 0.2659086585044861,
"learning_rate": 0.00014025445292620867,
"loss": 0.2154,
"step": 593
},
{
"epoch": 1.5044359949302915,
"grad_norm": 0.30749884247779846,
"learning_rate": 0.0001401526717557252,
"loss": 0.2184,
"step": 594
},
{
"epoch": 1.5069708491761724,
"grad_norm": 0.3892879784107208,
"learning_rate": 0.00014005089058524175,
"loss": 0.2849,
"step": 595
},
{
"epoch": 1.5095057034220534,
"grad_norm": 0.5041462779045105,
"learning_rate": 0.00013994910941475828,
"loss": 0.2551,
"step": 596
},
{
"epoch": 1.512040557667934,
"grad_norm": 0.4143123924732208,
"learning_rate": 0.00013984732824427482,
"loss": 0.2485,
"step": 597
},
{
"epoch": 1.5145754119138148,
"grad_norm": 0.5315548181533813,
"learning_rate": 0.00013974554707379136,
"loss": 0.3242,
"step": 598
},
{
"epoch": 1.5171102661596958,
"grad_norm": 0.28680169582366943,
"learning_rate": 0.0001396437659033079,
"loss": 0.227,
"step": 599
},
{
"epoch": 1.5196451204055768,
"grad_norm": 0.3015950620174408,
"learning_rate": 0.00013954198473282441,
"loss": 0.2122,
"step": 600
},
{
"epoch": 1.5221799746514575,
"grad_norm": 0.30785971879959106,
"learning_rate": 0.00013944020356234098,
"loss": 0.2194,
"step": 601
},
{
"epoch": 1.5247148288973384,
"grad_norm": 0.3596206605434418,
"learning_rate": 0.0001393384223918575,
"loss": 0.2574,
"step": 602
},
{
"epoch": 1.5272496831432192,
"grad_norm": 0.18499840795993805,
"learning_rate": 0.00013923664122137406,
"loss": 0.1944,
"step": 603
},
{
"epoch": 1.5297845373891001,
"grad_norm": 0.4346081614494324,
"learning_rate": 0.00013913486005089057,
"loss": 0.3187,
"step": 604
},
{
"epoch": 1.532319391634981,
"grad_norm": 0.46154457330703735,
"learning_rate": 0.00013903307888040714,
"loss": 0.3149,
"step": 605
},
{
"epoch": 1.5348542458808618,
"grad_norm": 0.3444209098815918,
"learning_rate": 0.00013893129770992368,
"loss": 0.2801,
"step": 606
},
{
"epoch": 1.5373891001267426,
"grad_norm": 0.550620436668396,
"learning_rate": 0.00013882951653944022,
"loss": 0.3038,
"step": 607
},
{
"epoch": 1.5399239543726235,
"grad_norm": 0.36603689193725586,
"learning_rate": 0.00013872773536895676,
"loss": 0.3224,
"step": 608
},
{
"epoch": 1.5424588086185045,
"grad_norm": 0.213638037443161,
"learning_rate": 0.0001386259541984733,
"loss": 0.2081,
"step": 609
},
{
"epoch": 1.5449936628643854,
"grad_norm": 0.34508904814720154,
"learning_rate": 0.00013852417302798983,
"loss": 0.2474,
"step": 610
},
{
"epoch": 1.5475285171102662,
"grad_norm": 0.42072099447250366,
"learning_rate": 0.00013842239185750637,
"loss": 0.3049,
"step": 611
},
{
"epoch": 1.550063371356147,
"grad_norm": 0.3760271966457367,
"learning_rate": 0.0001383206106870229,
"loss": 0.2499,
"step": 612
},
{
"epoch": 1.5525982256020279,
"grad_norm": 0.24040678143501282,
"learning_rate": 0.00013821882951653943,
"loss": 0.2134,
"step": 613
},
{
"epoch": 1.5551330798479088,
"grad_norm": 0.458035945892334,
"learning_rate": 0.000138117048346056,
"loss": 0.3375,
"step": 614
},
{
"epoch": 1.5576679340937896,
"grad_norm": 0.30446937680244446,
"learning_rate": 0.0001380152671755725,
"loss": 0.2252,
"step": 615
},
{
"epoch": 1.5602027883396705,
"grad_norm": 0.3036455810070038,
"learning_rate": 0.00013791348600508907,
"loss": 0.2095,
"step": 616
},
{
"epoch": 1.5627376425855513,
"grad_norm": 0.4190979301929474,
"learning_rate": 0.0001378117048346056,
"loss": 0.2932,
"step": 617
},
{
"epoch": 1.5652724968314322,
"grad_norm": 0.27648523449897766,
"learning_rate": 0.00013770992366412215,
"loss": 0.2133,
"step": 618
},
{
"epoch": 1.5678073510773132,
"grad_norm": 0.28326693177223206,
"learning_rate": 0.0001376081424936387,
"loss": 0.2087,
"step": 619
},
{
"epoch": 1.570342205323194,
"grad_norm": 0.3020143508911133,
"learning_rate": 0.00013750636132315523,
"loss": 0.2321,
"step": 620
},
{
"epoch": 1.5728770595690746,
"grad_norm": 0.3246900141239166,
"learning_rate": 0.00013740458015267177,
"loss": 0.2121,
"step": 621
},
{
"epoch": 1.5754119138149556,
"grad_norm": 0.3806106448173523,
"learning_rate": 0.0001373027989821883,
"loss": 0.2856,
"step": 622
},
{
"epoch": 1.5779467680608366,
"grad_norm": 0.3568238317966461,
"learning_rate": 0.00013720101781170485,
"loss": 0.2579,
"step": 623
},
{
"epoch": 1.5804816223067175,
"grad_norm": 0.45590534806251526,
"learning_rate": 0.00013709923664122139,
"loss": 0.2059,
"step": 624
},
{
"epoch": 1.5830164765525983,
"grad_norm": 0.41996893286705017,
"learning_rate": 0.00013699745547073792,
"loss": 0.2154,
"step": 625
},
{
"epoch": 1.585551330798479,
"grad_norm": 0.5142170190811157,
"learning_rate": 0.00013689567430025446,
"loss": 0.2708,
"step": 626
},
{
"epoch": 1.58808618504436,
"grad_norm": 0.36335933208465576,
"learning_rate": 0.000136793893129771,
"loss": 0.2501,
"step": 627
},
{
"epoch": 1.590621039290241,
"grad_norm": 0.3186666667461395,
"learning_rate": 0.00013669211195928752,
"loss": 0.2227,
"step": 628
},
{
"epoch": 1.5931558935361216,
"grad_norm": 0.29709601402282715,
"learning_rate": 0.00013659033078880408,
"loss": 0.2265,
"step": 629
},
{
"epoch": 1.5956907477820024,
"grad_norm": 0.2891612648963928,
"learning_rate": 0.00013648854961832062,
"loss": 0.2298,
"step": 630
},
{
"epoch": 1.5982256020278833,
"grad_norm": 0.2191978096961975,
"learning_rate": 0.00013638676844783716,
"loss": 0.2049,
"step": 631
},
{
"epoch": 1.6007604562737643,
"grad_norm": 0.37781399488449097,
"learning_rate": 0.0001362849872773537,
"loss": 0.3664,
"step": 632
},
{
"epoch": 1.6032953105196452,
"grad_norm": 0.3082154393196106,
"learning_rate": 0.00013618320610687024,
"loss": 0.2063,
"step": 633
},
{
"epoch": 1.605830164765526,
"grad_norm": 0.318317711353302,
"learning_rate": 0.00013608142493638678,
"loss": 0.2085,
"step": 634
},
{
"epoch": 1.6083650190114067,
"grad_norm": 0.45566102862358093,
"learning_rate": 0.00013597964376590332,
"loss": 0.2876,
"step": 635
},
{
"epoch": 1.6108998732572877,
"grad_norm": 0.3186021149158478,
"learning_rate": 0.00013587786259541986,
"loss": 0.2704,
"step": 636
},
{
"epoch": 1.6134347275031686,
"grad_norm": 0.28905680775642395,
"learning_rate": 0.0001357760814249364,
"loss": 0.209,
"step": 637
},
{
"epoch": 1.6159695817490496,
"grad_norm": 0.23341360688209534,
"learning_rate": 0.00013567430025445294,
"loss": 0.1835,
"step": 638
},
{
"epoch": 1.6185044359949303,
"grad_norm": 0.336247056722641,
"learning_rate": 0.00013557251908396947,
"loss": 0.2547,
"step": 639
},
{
"epoch": 1.621039290240811,
"grad_norm": 0.3736225366592407,
"learning_rate": 0.00013547073791348601,
"loss": 0.3053,
"step": 640
},
{
"epoch": 1.623574144486692,
"grad_norm": 0.3983825743198395,
"learning_rate": 0.00013536895674300255,
"loss": 0.2395,
"step": 641
},
{
"epoch": 1.626108998732573,
"grad_norm": 0.35913559794425964,
"learning_rate": 0.0001352671755725191,
"loss": 0.2918,
"step": 642
},
{
"epoch": 1.6286438529784537,
"grad_norm": 0.2984326183795929,
"learning_rate": 0.00013516539440203563,
"loss": 0.2148,
"step": 643
},
{
"epoch": 1.6311787072243344,
"grad_norm": 0.3113880753517151,
"learning_rate": 0.00013506361323155217,
"loss": 0.2044,
"step": 644
},
{
"epoch": 1.6337135614702154,
"grad_norm": 0.5340004563331604,
"learning_rate": 0.0001349618320610687,
"loss": 0.3234,
"step": 645
},
{
"epoch": 1.6362484157160964,
"grad_norm": 0.38927194476127625,
"learning_rate": 0.00013486005089058525,
"loss": 0.2866,
"step": 646
},
{
"epoch": 1.6387832699619773,
"grad_norm": 0.38895881175994873,
"learning_rate": 0.0001347582697201018,
"loss": 0.2324,
"step": 647
},
{
"epoch": 1.641318124207858,
"grad_norm": 0.41959917545318604,
"learning_rate": 0.00013465648854961833,
"loss": 0.2666,
"step": 648
},
{
"epoch": 1.6438529784537388,
"grad_norm": 0.4299626648426056,
"learning_rate": 0.00013455470737913487,
"loss": 0.2905,
"step": 649
},
{
"epoch": 1.6463878326996197,
"grad_norm": 0.4236285090446472,
"learning_rate": 0.0001344529262086514,
"loss": 0.292,
"step": 650
},
{
"epoch": 1.6489226869455007,
"grad_norm": 0.8049849271774292,
"learning_rate": 0.00013435114503816795,
"loss": 0.2351,
"step": 651
},
{
"epoch": 1.6514575411913817,
"grad_norm": 0.3420075476169586,
"learning_rate": 0.00013424936386768449,
"loss": 0.2355,
"step": 652
},
{
"epoch": 1.6539923954372624,
"grad_norm": 0.3632122874259949,
"learning_rate": 0.00013414758269720103,
"loss": 0.2377,
"step": 653
},
{
"epoch": 1.6565272496831431,
"grad_norm": 0.27961722016334534,
"learning_rate": 0.00013404580152671756,
"loss": 0.2299,
"step": 654
},
{
"epoch": 1.659062103929024,
"grad_norm": 0.3043057918548584,
"learning_rate": 0.0001339440203562341,
"loss": 0.2321,
"step": 655
},
{
"epoch": 1.661596958174905,
"grad_norm": 0.3421036899089813,
"learning_rate": 0.00013384223918575064,
"loss": 0.2492,
"step": 656
},
{
"epoch": 1.6641318124207858,
"grad_norm": 0.39606526494026184,
"learning_rate": 0.00013374045801526718,
"loss": 0.3401,
"step": 657
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.35081973671913147,
"learning_rate": 0.00013363867684478372,
"loss": 0.2175,
"step": 658
},
{
"epoch": 1.6692015209125475,
"grad_norm": 0.420175701379776,
"learning_rate": 0.00013353689567430026,
"loss": 0.2813,
"step": 659
},
{
"epoch": 1.6717363751584284,
"grad_norm": 0.24181438982486725,
"learning_rate": 0.0001334351145038168,
"loss": 0.219,
"step": 660
},
{
"epoch": 1.6742712294043094,
"grad_norm": 0.6243584752082825,
"learning_rate": 0.00013333333333333334,
"loss": 0.3087,
"step": 661
},
{
"epoch": 1.6768060836501901,
"grad_norm": 0.4036748707294464,
"learning_rate": 0.00013323155216284988,
"loss": 0.251,
"step": 662
},
{
"epoch": 1.6793409378960709,
"grad_norm": 0.39555415511131287,
"learning_rate": 0.00013312977099236642,
"loss": 0.3279,
"step": 663
},
{
"epoch": 1.6818757921419518,
"grad_norm": 0.4018571674823761,
"learning_rate": 0.00013302798982188296,
"loss": 0.2337,
"step": 664
},
{
"epoch": 1.6844106463878328,
"grad_norm": 0.36354130506515503,
"learning_rate": 0.0001329262086513995,
"loss": 0.2503,
"step": 665
},
{
"epoch": 1.6869455006337135,
"grad_norm": 0.32249706983566284,
"learning_rate": 0.00013282442748091604,
"loss": 0.27,
"step": 666
},
{
"epoch": 1.6894803548795945,
"grad_norm": 0.33560654520988464,
"learning_rate": 0.00013272264631043258,
"loss": 0.203,
"step": 667
},
{
"epoch": 1.6920152091254752,
"grad_norm": 0.39997267723083496,
"learning_rate": 0.00013262086513994911,
"loss": 0.2662,
"step": 668
},
{
"epoch": 1.6945500633713562,
"grad_norm": 0.6739961504936218,
"learning_rate": 0.00013251908396946565,
"loss": 0.2803,
"step": 669
},
{
"epoch": 1.6970849176172371,
"grad_norm": 0.5863606929779053,
"learning_rate": 0.0001324173027989822,
"loss": 0.351,
"step": 670
},
{
"epoch": 1.6996197718631179,
"grad_norm": 0.4408819079399109,
"learning_rate": 0.00013231552162849873,
"loss": 0.1814,
"step": 671
},
{
"epoch": 1.7021546261089986,
"grad_norm": 0.3341253697872162,
"learning_rate": 0.00013221374045801527,
"loss": 0.2156,
"step": 672
},
{
"epoch": 1.7046894803548795,
"grad_norm": 0.3035176992416382,
"learning_rate": 0.0001321119592875318,
"loss": 0.2308,
"step": 673
},
{
"epoch": 1.7072243346007605,
"grad_norm": 0.4395483136177063,
"learning_rate": 0.00013201017811704835,
"loss": 0.3418,
"step": 674
},
{
"epoch": 1.7097591888466415,
"grad_norm": 0.22972792387008667,
"learning_rate": 0.0001319083969465649,
"loss": 0.1873,
"step": 675
},
{
"epoch": 1.7122940430925222,
"grad_norm": 0.47378918528556824,
"learning_rate": 0.00013180661577608143,
"loss": 0.2514,
"step": 676
},
{
"epoch": 1.714828897338403,
"grad_norm": 0.3947070240974426,
"learning_rate": 0.00013170483460559797,
"loss": 0.2289,
"step": 677
},
{
"epoch": 1.717363751584284,
"grad_norm": 0.3789718747138977,
"learning_rate": 0.0001316030534351145,
"loss": 0.2476,
"step": 678
},
{
"epoch": 1.7198986058301649,
"grad_norm": 0.4904823899269104,
"learning_rate": 0.00013150127226463105,
"loss": 0.2163,
"step": 679
},
{
"epoch": 1.7224334600760456,
"grad_norm": 0.3285132646560669,
"learning_rate": 0.0001313994910941476,
"loss": 0.2786,
"step": 680
},
{
"epoch": 1.7249683143219265,
"grad_norm": 0.4326847493648529,
"learning_rate": 0.00013129770992366413,
"loss": 0.2409,
"step": 681
},
{
"epoch": 1.7275031685678073,
"grad_norm": 0.3819947838783264,
"learning_rate": 0.00013119592875318067,
"loss": 0.2076,
"step": 682
},
{
"epoch": 1.7300380228136882,
"grad_norm": 0.4046533703804016,
"learning_rate": 0.0001310941475826972,
"loss": 0.2717,
"step": 683
},
{
"epoch": 1.7325728770595692,
"grad_norm": 0.34681758284568787,
"learning_rate": 0.00013099236641221374,
"loss": 0.2389,
"step": 684
},
{
"epoch": 1.73510773130545,
"grad_norm": 0.35155028104782104,
"learning_rate": 0.00013089058524173028,
"loss": 0.2407,
"step": 685
},
{
"epoch": 1.7376425855513307,
"grad_norm": 0.3306678533554077,
"learning_rate": 0.00013078880407124682,
"loss": 0.2767,
"step": 686
},
{
"epoch": 1.7401774397972116,
"grad_norm": 0.27715572714805603,
"learning_rate": 0.00013068702290076336,
"loss": 0.1955,
"step": 687
},
{
"epoch": 1.7427122940430926,
"grad_norm": 0.3591010272502899,
"learning_rate": 0.0001305852417302799,
"loss": 0.2269,
"step": 688
},
{
"epoch": 1.7452471482889735,
"grad_norm": 0.39104408025741577,
"learning_rate": 0.00013048346055979644,
"loss": 0.2392,
"step": 689
},
{
"epoch": 1.7477820025348543,
"grad_norm": 0.44545605778694153,
"learning_rate": 0.00013038167938931298,
"loss": 0.2823,
"step": 690
},
{
"epoch": 1.750316856780735,
"grad_norm": 0.29502785205841064,
"learning_rate": 0.00013027989821882952,
"loss": 0.1899,
"step": 691
},
{
"epoch": 1.752851711026616,
"grad_norm": 0.40423381328582764,
"learning_rate": 0.00013017811704834606,
"loss": 0.2069,
"step": 692
},
{
"epoch": 1.755386565272497,
"grad_norm": 0.38649502396583557,
"learning_rate": 0.0001300763358778626,
"loss": 0.1938,
"step": 693
},
{
"epoch": 1.7579214195183777,
"grad_norm": 0.40014389157295227,
"learning_rate": 0.00012997455470737914,
"loss": 0.2825,
"step": 694
},
{
"epoch": 1.7604562737642584,
"grad_norm": 0.4783387780189514,
"learning_rate": 0.00012987277353689568,
"loss": 0.2629,
"step": 695
},
{
"epoch": 1.7629911280101394,
"grad_norm": 0.4938651919364929,
"learning_rate": 0.00012977099236641222,
"loss": 0.2976,
"step": 696
},
{
"epoch": 1.7655259822560203,
"grad_norm": 0.32507607340812683,
"learning_rate": 0.00012966921119592875,
"loss": 0.2097,
"step": 697
},
{
"epoch": 1.7680608365019013,
"grad_norm": 0.31158536672592163,
"learning_rate": 0.0001295674300254453,
"loss": 0.223,
"step": 698
},
{
"epoch": 1.770595690747782,
"grad_norm": 0.5594013333320618,
"learning_rate": 0.00012946564885496183,
"loss": 0.3523,
"step": 699
},
{
"epoch": 1.7731305449936627,
"grad_norm": 0.5820282697677612,
"learning_rate": 0.00012936386768447837,
"loss": 0.3181,
"step": 700
},
{
"epoch": 1.7756653992395437,
"grad_norm": 0.3635233938694,
"learning_rate": 0.0001292620865139949,
"loss": 0.2387,
"step": 701
},
{
"epoch": 1.7782002534854247,
"grad_norm": 0.3195054531097412,
"learning_rate": 0.00012916030534351148,
"loss": 0.2046,
"step": 702
},
{
"epoch": 1.7807351077313056,
"grad_norm": 0.3483947217464447,
"learning_rate": 0.000129058524173028,
"loss": 0.2576,
"step": 703
},
{
"epoch": 1.7832699619771863,
"grad_norm": 0.3419065475463867,
"learning_rate": 0.00012895674300254456,
"loss": 0.2361,
"step": 704
},
{
"epoch": 1.785804816223067,
"grad_norm": 0.3142557442188263,
"learning_rate": 0.00012885496183206107,
"loss": 0.2172,
"step": 705
},
{
"epoch": 1.788339670468948,
"grad_norm": 0.3502836227416992,
"learning_rate": 0.0001287531806615776,
"loss": 0.2621,
"step": 706
},
{
"epoch": 1.790874524714829,
"grad_norm": 0.37896937131881714,
"learning_rate": 0.00012865139949109415,
"loss": 0.2374,
"step": 707
},
{
"epoch": 1.7934093789607097,
"grad_norm": 0.3880506455898285,
"learning_rate": 0.0001285496183206107,
"loss": 0.2862,
"step": 708
},
{
"epoch": 1.7959442332065905,
"grad_norm": 0.2648681700229645,
"learning_rate": 0.00012844783715012723,
"loss": 0.206,
"step": 709
},
{
"epoch": 1.7984790874524714,
"grad_norm": 0.25072911381721497,
"learning_rate": 0.00012834605597964377,
"loss": 0.2123,
"step": 710
},
{
"epoch": 1.8010139416983524,
"grad_norm": 0.3076663315296173,
"learning_rate": 0.0001282442748091603,
"loss": 0.2983,
"step": 711
},
{
"epoch": 1.8035487959442333,
"grad_norm": 0.4219549000263214,
"learning_rate": 0.00012814249363867684,
"loss": 0.2213,
"step": 712
},
{
"epoch": 1.806083650190114,
"grad_norm": 0.2831745445728302,
"learning_rate": 0.00012804071246819338,
"loss": 0.2062,
"step": 713
},
{
"epoch": 1.8086185044359948,
"grad_norm": 0.4014468491077423,
"learning_rate": 0.00012793893129770992,
"loss": 0.2945,
"step": 714
},
{
"epoch": 1.8111533586818758,
"grad_norm": 0.2980962097644806,
"learning_rate": 0.0001278371501272265,
"loss": 0.2179,
"step": 715
},
{
"epoch": 1.8136882129277567,
"grad_norm": 0.2338070124387741,
"learning_rate": 0.000127735368956743,
"loss": 0.1664,
"step": 716
},
{
"epoch": 1.8162230671736375,
"grad_norm": 0.6155439615249634,
"learning_rate": 0.00012763358778625957,
"loss": 0.3429,
"step": 717
},
{
"epoch": 1.8187579214195184,
"grad_norm": 0.46969589591026306,
"learning_rate": 0.00012753180661577608,
"loss": 0.2584,
"step": 718
},
{
"epoch": 1.8212927756653992,
"grad_norm": 0.5578194260597229,
"learning_rate": 0.00012743002544529265,
"loss": 0.2695,
"step": 719
},
{
"epoch": 1.8238276299112801,
"grad_norm": 0.34903043508529663,
"learning_rate": 0.00012732824427480916,
"loss": 0.2119,
"step": 720
},
{
"epoch": 1.826362484157161,
"grad_norm": 0.3990432322025299,
"learning_rate": 0.0001272264631043257,
"loss": 0.2487,
"step": 721
},
{
"epoch": 1.8288973384030418,
"grad_norm": 0.3382611572742462,
"learning_rate": 0.00012712468193384224,
"loss": 0.2313,
"step": 722
},
{
"epoch": 1.8314321926489225,
"grad_norm": 0.30938395857810974,
"learning_rate": 0.00012702290076335878,
"loss": 0.2113,
"step": 723
},
{
"epoch": 1.8339670468948035,
"grad_norm": 0.39266690611839294,
"learning_rate": 0.00012692111959287532,
"loss": 0.2609,
"step": 724
},
{
"epoch": 1.8365019011406845,
"grad_norm": 0.4396655261516571,
"learning_rate": 0.00012681933842239186,
"loss": 0.2518,
"step": 725
},
{
"epoch": 1.8390367553865654,
"grad_norm": 0.4134500324726105,
"learning_rate": 0.0001267175572519084,
"loss": 0.3317,
"step": 726
},
{
"epoch": 1.8415716096324461,
"grad_norm": 0.29644638299942017,
"learning_rate": 0.00012661577608142493,
"loss": 0.1912,
"step": 727
},
{
"epoch": 1.8441064638783269,
"grad_norm": 0.3661201596260071,
"learning_rate": 0.0001265139949109415,
"loss": 0.2911,
"step": 728
},
{
"epoch": 1.8466413181242078,
"grad_norm": 0.4504169225692749,
"learning_rate": 0.000126412213740458,
"loss": 0.3409,
"step": 729
},
{
"epoch": 1.8491761723700888,
"grad_norm": 0.28516069054603577,
"learning_rate": 0.00012631043256997458,
"loss": 0.254,
"step": 730
},
{
"epoch": 1.8517110266159695,
"grad_norm": 0.33754590153694153,
"learning_rate": 0.0001262086513994911,
"loss": 0.2275,
"step": 731
},
{
"epoch": 1.8542458808618505,
"grad_norm": 0.26562589406967163,
"learning_rate": 0.00012610687022900766,
"loss": 0.1979,
"step": 732
},
{
"epoch": 1.8567807351077312,
"grad_norm": 0.3081592321395874,
"learning_rate": 0.00012600508905852417,
"loss": 0.2099,
"step": 733
},
{
"epoch": 1.8593155893536122,
"grad_norm": 0.34866124391555786,
"learning_rate": 0.0001259033078880407,
"loss": 0.3038,
"step": 734
},
{
"epoch": 1.8618504435994931,
"grad_norm": 0.2867881953716278,
"learning_rate": 0.00012580152671755725,
"loss": 0.2225,
"step": 735
},
{
"epoch": 1.8643852978453739,
"grad_norm": 0.2374526560306549,
"learning_rate": 0.0001256997455470738,
"loss": 0.1945,
"step": 736
},
{
"epoch": 1.8669201520912546,
"grad_norm": 0.3072168827056885,
"learning_rate": 0.00012559796437659033,
"loss": 0.2135,
"step": 737
},
{
"epoch": 1.8694550063371356,
"grad_norm": 0.36897239089012146,
"learning_rate": 0.00012549618320610687,
"loss": 0.3225,
"step": 738
},
{
"epoch": 1.8719898605830165,
"grad_norm": 0.3114832937717438,
"learning_rate": 0.00012539440203562343,
"loss": 0.2064,
"step": 739
},
{
"epoch": 1.8745247148288975,
"grad_norm": 0.40082940459251404,
"learning_rate": 0.00012529262086513995,
"loss": 0.2145,
"step": 740
},
{
"epoch": 1.8770595690747782,
"grad_norm": 0.28362375497817993,
"learning_rate": 0.0001251908396946565,
"loss": 0.2044,
"step": 741
},
{
"epoch": 1.879594423320659,
"grad_norm": 0.2738857567310333,
"learning_rate": 0.00012508905852417302,
"loss": 0.1852,
"step": 742
},
{
"epoch": 1.88212927756654,
"grad_norm": 0.37283095717430115,
"learning_rate": 0.0001249872773536896,
"loss": 0.248,
"step": 743
},
{
"epoch": 1.8846641318124209,
"grad_norm": 0.3065252900123596,
"learning_rate": 0.0001248854961832061,
"loss": 0.2028,
"step": 744
},
{
"epoch": 1.8871989860583016,
"grad_norm": 0.2891787588596344,
"learning_rate": 0.00012478371501272267,
"loss": 0.1977,
"step": 745
},
{
"epoch": 1.8897338403041823,
"grad_norm": 0.5002029538154602,
"learning_rate": 0.00012468193384223918,
"loss": 0.2731,
"step": 746
},
{
"epoch": 1.8922686945500633,
"grad_norm": 0.34734681248664856,
"learning_rate": 0.00012458015267175575,
"loss": 0.2236,
"step": 747
},
{
"epoch": 1.8948035487959443,
"grad_norm": 0.4372716248035431,
"learning_rate": 0.00012447837150127226,
"loss": 0.3787,
"step": 748
},
{
"epoch": 1.8973384030418252,
"grad_norm": 0.41203773021698,
"learning_rate": 0.0001243765903307888,
"loss": 0.2385,
"step": 749
},
{
"epoch": 1.899873257287706,
"grad_norm": 0.28231269121170044,
"learning_rate": 0.00012427480916030534,
"loss": 0.1966,
"step": 750
},
{
"epoch": 1.9024081115335867,
"grad_norm": 0.3689015209674835,
"learning_rate": 0.00012417302798982188,
"loss": 0.2266,
"step": 751
},
{
"epoch": 1.9049429657794676,
"grad_norm": 0.35862621665000916,
"learning_rate": 0.00012407124681933844,
"loss": 0.2226,
"step": 752
},
{
"epoch": 1.9074778200253486,
"grad_norm": 0.27552056312561035,
"learning_rate": 0.00012396946564885496,
"loss": 0.2049,
"step": 753
},
{
"epoch": 1.9100126742712296,
"grad_norm": 0.3665705919265747,
"learning_rate": 0.00012386768447837152,
"loss": 0.2262,
"step": 754
},
{
"epoch": 1.9125475285171103,
"grad_norm": 0.37812677025794983,
"learning_rate": 0.00012376590330788803,
"loss": 0.2561,
"step": 755
},
{
"epoch": 1.915082382762991,
"grad_norm": 0.34638741612434387,
"learning_rate": 0.0001236641221374046,
"loss": 0.2152,
"step": 756
},
{
"epoch": 1.917617237008872,
"grad_norm": 0.3499183654785156,
"learning_rate": 0.00012356234096692111,
"loss": 0.2823,
"step": 757
},
{
"epoch": 1.920152091254753,
"grad_norm": 0.3274863362312317,
"learning_rate": 0.00012346055979643768,
"loss": 0.202,
"step": 758
},
{
"epoch": 1.9226869455006337,
"grad_norm": 0.4568060338497162,
"learning_rate": 0.0001233587786259542,
"loss": 0.3531,
"step": 759
},
{
"epoch": 1.9252217997465144,
"grad_norm": 0.3351891040802002,
"learning_rate": 0.00012325699745547076,
"loss": 0.3491,
"step": 760
},
{
"epoch": 1.9277566539923954,
"grad_norm": 0.3045225739479065,
"learning_rate": 0.00012315521628498727,
"loss": 0.2412,
"step": 761
},
{
"epoch": 1.9302915082382763,
"grad_norm": 0.4453962445259094,
"learning_rate": 0.0001230534351145038,
"loss": 0.485,
"step": 762
},
{
"epoch": 1.9328263624841573,
"grad_norm": 0.4568649232387543,
"learning_rate": 0.00012295165394402038,
"loss": 0.4203,
"step": 763
},
{
"epoch": 1.935361216730038,
"grad_norm": 0.33376067876815796,
"learning_rate": 0.0001228498727735369,
"loss": 0.2287,
"step": 764
},
{
"epoch": 1.9378960709759188,
"grad_norm": 0.2670106887817383,
"learning_rate": 0.00012274809160305346,
"loss": 0.2265,
"step": 765
},
{
"epoch": 1.9404309252217997,
"grad_norm": 0.25930914282798767,
"learning_rate": 0.00012264631043256997,
"loss": 0.2661,
"step": 766
},
{
"epoch": 1.9429657794676807,
"grad_norm": 0.22364859282970428,
"learning_rate": 0.00012254452926208653,
"loss": 0.1938,
"step": 767
},
{
"epoch": 1.9455006337135616,
"grad_norm": 0.4107860028743744,
"learning_rate": 0.00012244274809160305,
"loss": 0.3227,
"step": 768
},
{
"epoch": 1.9480354879594424,
"grad_norm": 0.24454613029956818,
"learning_rate": 0.0001223409669211196,
"loss": 0.2813,
"step": 769
},
{
"epoch": 1.950570342205323,
"grad_norm": 0.28310418128967285,
"learning_rate": 0.00012223918575063612,
"loss": 0.2065,
"step": 770
},
{
"epoch": 1.953105196451204,
"grad_norm": 0.28080177307128906,
"learning_rate": 0.0001221374045801527,
"loss": 0.1941,
"step": 771
},
{
"epoch": 1.955640050697085,
"grad_norm": 0.365400105714798,
"learning_rate": 0.0001220356234096692,
"loss": 0.2657,
"step": 772
},
{
"epoch": 1.9581749049429658,
"grad_norm": 0.3115444779396057,
"learning_rate": 0.00012193384223918576,
"loss": 0.2117,
"step": 773
},
{
"epoch": 1.9607097591888465,
"grad_norm": 0.30900898575782776,
"learning_rate": 0.00012183206106870228,
"loss": 0.2563,
"step": 774
},
{
"epoch": 1.9632446134347274,
"grad_norm": 0.341789573431015,
"learning_rate": 0.00012173027989821883,
"loss": 0.2396,
"step": 775
},
{
"epoch": 1.9657794676806084,
"grad_norm": 0.39556756615638733,
"learning_rate": 0.00012162849872773539,
"loss": 0.2203,
"step": 776
},
{
"epoch": 1.9683143219264894,
"grad_norm": 0.4282820224761963,
"learning_rate": 0.00012152671755725191,
"loss": 0.2476,
"step": 777
},
{
"epoch": 1.97084917617237,
"grad_norm": 0.3683648109436035,
"learning_rate": 0.00012142493638676847,
"loss": 0.2414,
"step": 778
},
{
"epoch": 1.9733840304182508,
"grad_norm": 0.19751296937465668,
"learning_rate": 0.00012132315521628499,
"loss": 0.1622,
"step": 779
},
{
"epoch": 1.9759188846641318,
"grad_norm": 0.4522268772125244,
"learning_rate": 0.00012122137404580154,
"loss": 0.3372,
"step": 780
},
{
"epoch": 1.9784537389100127,
"grad_norm": 0.3386411666870117,
"learning_rate": 0.00012111959287531807,
"loss": 0.1966,
"step": 781
},
{
"epoch": 1.9809885931558935,
"grad_norm": 0.3266599178314209,
"learning_rate": 0.00012101781170483461,
"loss": 0.2507,
"step": 782
},
{
"epoch": 1.9835234474017744,
"grad_norm": 0.395271897315979,
"learning_rate": 0.00012091603053435115,
"loss": 0.2626,
"step": 783
},
{
"epoch": 1.9860583016476552,
"grad_norm": 0.23269407451152802,
"learning_rate": 0.00012081424936386769,
"loss": 0.1806,
"step": 784
},
{
"epoch": 1.9885931558935361,
"grad_norm": 0.3929823040962219,
"learning_rate": 0.00012071246819338421,
"loss": 0.2912,
"step": 785
},
{
"epoch": 1.991128010139417,
"grad_norm": 0.2597116529941559,
"learning_rate": 0.00012061068702290077,
"loss": 0.1918,
"step": 786
},
{
"epoch": 1.9936628643852978,
"grad_norm": 0.44690757989883423,
"learning_rate": 0.00012050890585241729,
"loss": 0.2644,
"step": 787
},
{
"epoch": 1.9961977186311786,
"grad_norm": 0.4133460819721222,
"learning_rate": 0.00012040712468193385,
"loss": 0.2541,
"step": 788
},
{
"epoch": 1.9987325728770595,
"grad_norm": 0.33399301767349243,
"learning_rate": 0.0001203053435114504,
"loss": 0.2778,
"step": 789
},
{
"epoch": 2.0,
"grad_norm": 0.6268282532691956,
"learning_rate": 0.00012020356234096692,
"loss": 0.3105,
"step": 790
},
{
"epoch": 2.002534854245881,
"grad_norm": 0.38419365882873535,
"learning_rate": 0.00012010178117048348,
"loss": 0.2352,
"step": 791
},
{
"epoch": 2.005069708491762,
"grad_norm": 0.30469566583633423,
"learning_rate": 0.00012,
"loss": 0.2011,
"step": 792
},
{
"epoch": 2.0076045627376424,
"grad_norm": 0.36411482095718384,
"learning_rate": 0.00011989821882951656,
"loss": 0.2324,
"step": 793
},
{
"epoch": 2.0101394169835234,
"grad_norm": 0.40986311435699463,
"learning_rate": 0.00011979643765903308,
"loss": 0.2217,
"step": 794
},
{
"epoch": 2.0126742712294043,
"grad_norm": 0.46682968735694885,
"learning_rate": 0.00011969465648854963,
"loss": 0.2688,
"step": 795
},
{
"epoch": 2.0152091254752853,
"grad_norm": 0.31846344470977783,
"learning_rate": 0.00011959287531806616,
"loss": 0.1984,
"step": 796
},
{
"epoch": 2.017743979721166,
"grad_norm": 0.48346126079559326,
"learning_rate": 0.0001194910941475827,
"loss": 0.2404,
"step": 797
},
{
"epoch": 2.0202788339670468,
"grad_norm": 0.5090253949165344,
"learning_rate": 0.00011938931297709924,
"loss": 0.2363,
"step": 798
},
{
"epoch": 2.0228136882129277,
"grad_norm": 0.4886679947376251,
"learning_rate": 0.00011928753180661578,
"loss": 0.2656,
"step": 799
},
{
"epoch": 2.0253485424588087,
"grad_norm": 0.5652650594711304,
"learning_rate": 0.00011918575063613233,
"loss": 0.2444,
"step": 800
},
{
"epoch": 2.0278833967046896,
"grad_norm": 0.7158893346786499,
"learning_rate": 0.00011908396946564886,
"loss": 0.2362,
"step": 801
},
{
"epoch": 2.03041825095057,
"grad_norm": 0.5168672800064087,
"learning_rate": 0.00011898218829516541,
"loss": 0.2067,
"step": 802
},
{
"epoch": 2.032953105196451,
"grad_norm": 0.7243991494178772,
"learning_rate": 0.00011888040712468194,
"loss": 0.2458,
"step": 803
},
{
"epoch": 2.035487959442332,
"grad_norm": 0.4199936091899872,
"learning_rate": 0.00011877862595419849,
"loss": 0.2009,
"step": 804
},
{
"epoch": 2.038022813688213,
"grad_norm": 0.41791805624961853,
"learning_rate": 0.00011867684478371501,
"loss": 0.2325,
"step": 805
},
{
"epoch": 2.040557667934094,
"grad_norm": 0.6389465928077698,
"learning_rate": 0.00011857506361323157,
"loss": 0.2636,
"step": 806
},
{
"epoch": 2.0430925221799745,
"grad_norm": 0.6254114508628845,
"learning_rate": 0.00011847328244274809,
"loss": 0.2292,
"step": 807
},
{
"epoch": 2.0456273764258555,
"grad_norm": 0.8436942100524902,
"learning_rate": 0.00011837150127226465,
"loss": 0.2913,
"step": 808
},
{
"epoch": 2.0481622306717364,
"grad_norm": 0.42698097229003906,
"learning_rate": 0.00011826972010178117,
"loss": 0.2107,
"step": 809
},
{
"epoch": 2.0506970849176174,
"grad_norm": 0.432607501745224,
"learning_rate": 0.00011816793893129771,
"loss": 0.1851,
"step": 810
},
{
"epoch": 2.053231939163498,
"grad_norm": 0.48241573572158813,
"learning_rate": 0.00011806615776081425,
"loss": 0.2333,
"step": 811
},
{
"epoch": 2.055766793409379,
"grad_norm": 0.3920150101184845,
"learning_rate": 0.00011796437659033079,
"loss": 0.2256,
"step": 812
},
{
"epoch": 2.05830164765526,
"grad_norm": 0.3601329028606415,
"learning_rate": 0.00011786259541984734,
"loss": 0.2428,
"step": 813
},
{
"epoch": 2.0608365019011408,
"grad_norm": 0.428524911403656,
"learning_rate": 0.00011776081424936387,
"loss": 0.3109,
"step": 814
},
{
"epoch": 2.0633713561470217,
"grad_norm": 0.22846737504005432,
"learning_rate": 0.00011765903307888042,
"loss": 0.1715,
"step": 815
},
{
"epoch": 2.0659062103929022,
"grad_norm": 0.3656214475631714,
"learning_rate": 0.00011755725190839695,
"loss": 0.2211,
"step": 816
},
{
"epoch": 2.068441064638783,
"grad_norm": 0.2633965015411377,
"learning_rate": 0.0001174554707379135,
"loss": 0.1933,
"step": 817
},
{
"epoch": 2.070975918884664,
"grad_norm": 0.4318942129611969,
"learning_rate": 0.00011735368956743003,
"loss": 0.2829,
"step": 818
},
{
"epoch": 2.073510773130545,
"grad_norm": 0.2643216848373413,
"learning_rate": 0.00011725190839694658,
"loss": 0.1938,
"step": 819
},
{
"epoch": 2.076045627376426,
"grad_norm": 0.4560074508190155,
"learning_rate": 0.0001171501272264631,
"loss": 0.3017,
"step": 820
},
{
"epoch": 2.0785804816223066,
"grad_norm": 0.380374550819397,
"learning_rate": 0.00011704834605597966,
"loss": 0.2141,
"step": 821
},
{
"epoch": 2.0811153358681875,
"grad_norm": 0.321417897939682,
"learning_rate": 0.00011694656488549618,
"loss": 0.2058,
"step": 822
},
{
"epoch": 2.0836501901140685,
"grad_norm": 0.350496768951416,
"learning_rate": 0.00011684478371501274,
"loss": 0.1761,
"step": 823
},
{
"epoch": 2.0861850443599494,
"grad_norm": 0.35794898867607117,
"learning_rate": 0.00011674300254452927,
"loss": 0.2016,
"step": 824
},
{
"epoch": 2.08871989860583,
"grad_norm": 0.37890860438346863,
"learning_rate": 0.0001166412213740458,
"loss": 0.253,
"step": 825
},
{
"epoch": 2.091254752851711,
"grad_norm": 0.41833457350730896,
"learning_rate": 0.00011653944020356235,
"loss": 0.2012,
"step": 826
},
{
"epoch": 2.093789607097592,
"grad_norm": 0.49572086334228516,
"learning_rate": 0.00011643765903307888,
"loss": 0.214,
"step": 827
},
{
"epoch": 2.096324461343473,
"grad_norm": 0.44266751408576965,
"learning_rate": 0.00011633587786259543,
"loss": 0.2496,
"step": 828
},
{
"epoch": 2.098859315589354,
"grad_norm": 0.7018102407455444,
"learning_rate": 0.00011623409669211196,
"loss": 0.3996,
"step": 829
},
{
"epoch": 2.1013941698352343,
"grad_norm": 0.42781826853752136,
"learning_rate": 0.00011613231552162851,
"loss": 0.2325,
"step": 830
},
{
"epoch": 2.1039290240811153,
"grad_norm": 0.35814788937568665,
"learning_rate": 0.00011603053435114504,
"loss": 0.2003,
"step": 831
},
{
"epoch": 2.106463878326996,
"grad_norm": 0.2381380945444107,
"learning_rate": 0.00011592875318066159,
"loss": 0.1791,
"step": 832
},
{
"epoch": 2.108998732572877,
"grad_norm": 0.3152197003364563,
"learning_rate": 0.00011582697201017811,
"loss": 0.1802,
"step": 833
},
{
"epoch": 2.111533586818758,
"grad_norm": 0.3493264615535736,
"learning_rate": 0.00011572519083969467,
"loss": 0.173,
"step": 834
},
{
"epoch": 2.1140684410646386,
"grad_norm": 0.339036762714386,
"learning_rate": 0.0001156234096692112,
"loss": 0.1875,
"step": 835
},
{
"epoch": 2.1166032953105196,
"grad_norm": 0.3622972369194031,
"learning_rate": 0.00011552162849872775,
"loss": 0.1892,
"step": 836
},
{
"epoch": 2.1191381495564006,
"grad_norm": 0.7021862268447876,
"learning_rate": 0.00011541984732824429,
"loss": 0.272,
"step": 837
},
{
"epoch": 2.1216730038022815,
"grad_norm": 0.4027453064918518,
"learning_rate": 0.00011531806615776081,
"loss": 0.2296,
"step": 838
},
{
"epoch": 2.124207858048162,
"grad_norm": 0.3509223163127899,
"learning_rate": 0.00011521628498727736,
"loss": 0.1812,
"step": 839
},
{
"epoch": 2.126742712294043,
"grad_norm": 0.4156752824783325,
"learning_rate": 0.00011511450381679389,
"loss": 0.2444,
"step": 840
},
{
"epoch": 2.129277566539924,
"grad_norm": 0.3596971035003662,
"learning_rate": 0.00011501272264631044,
"loss": 0.1944,
"step": 841
},
{
"epoch": 2.131812420785805,
"grad_norm": 0.4088239371776581,
"learning_rate": 0.00011491094147582697,
"loss": 0.1892,
"step": 842
},
{
"epoch": 2.134347275031686,
"grad_norm": 0.3603368103504181,
"learning_rate": 0.00011480916030534352,
"loss": 0.1955,
"step": 843
},
{
"epoch": 2.1368821292775664,
"grad_norm": 0.3702489733695984,
"learning_rate": 0.00011470737913486005,
"loss": 0.2401,
"step": 844
},
{
"epoch": 2.1394169835234473,
"grad_norm": 0.427312433719635,
"learning_rate": 0.0001146055979643766,
"loss": 0.2097,
"step": 845
},
{
"epoch": 2.1419518377693283,
"grad_norm": 0.34239426255226135,
"learning_rate": 0.00011450381679389313,
"loss": 0.2055,
"step": 846
},
{
"epoch": 2.1444866920152093,
"grad_norm": 0.522627055644989,
"learning_rate": 0.00011440203562340968,
"loss": 0.2206,
"step": 847
},
{
"epoch": 2.14702154626109,
"grad_norm": 0.5005999207496643,
"learning_rate": 0.0001143002544529262,
"loss": 0.2187,
"step": 848
},
{
"epoch": 2.1495564005069707,
"grad_norm": 0.4834093451499939,
"learning_rate": 0.00011419847328244276,
"loss": 0.2616,
"step": 849
},
{
"epoch": 2.1520912547528517,
"grad_norm": 0.3305776119232178,
"learning_rate": 0.0001140966921119593,
"loss": 0.2193,
"step": 850
},
{
"epoch": 2.1546261089987326,
"grad_norm": 0.3691657781600952,
"learning_rate": 0.00011399491094147584,
"loss": 0.2343,
"step": 851
},
{
"epoch": 2.1571609632446136,
"grad_norm": 0.4711242914199829,
"learning_rate": 0.00011389312977099238,
"loss": 0.2961,
"step": 852
},
{
"epoch": 2.159695817490494,
"grad_norm": 0.4091726839542389,
"learning_rate": 0.0001137913486005089,
"loss": 0.2735,
"step": 853
},
{
"epoch": 2.162230671736375,
"grad_norm": 0.28634020686149597,
"learning_rate": 0.00011368956743002545,
"loss": 0.2026,
"step": 854
},
{
"epoch": 2.164765525982256,
"grad_norm": 0.3120497763156891,
"learning_rate": 0.00011358778625954198,
"loss": 0.1826,
"step": 855
},
{
"epoch": 2.167300380228137,
"grad_norm": 0.3803773522377014,
"learning_rate": 0.00011348600508905853,
"loss": 0.2206,
"step": 856
},
{
"epoch": 2.169835234474018,
"grad_norm": 0.4069412648677826,
"learning_rate": 0.00011338422391857506,
"loss": 0.23,
"step": 857
},
{
"epoch": 2.1723700887198985,
"grad_norm": 0.31032097339630127,
"learning_rate": 0.00011328244274809161,
"loss": 0.1774,
"step": 858
},
{
"epoch": 2.1749049429657794,
"grad_norm": 0.3429819941520691,
"learning_rate": 0.00011318066157760814,
"loss": 0.207,
"step": 859
},
{
"epoch": 2.1774397972116604,
"grad_norm": 0.32155394554138184,
"learning_rate": 0.00011307888040712469,
"loss": 0.1817,
"step": 860
},
{
"epoch": 2.1799746514575413,
"grad_norm": 0.3859189450740814,
"learning_rate": 0.00011297709923664124,
"loss": 0.205,
"step": 861
},
{
"epoch": 2.182509505703422,
"grad_norm": 0.33794042468070984,
"learning_rate": 0.00011287531806615777,
"loss": 0.2002,
"step": 862
},
{
"epoch": 2.185044359949303,
"grad_norm": 0.38762131333351135,
"learning_rate": 0.00011277353689567431,
"loss": 0.206,
"step": 863
},
{
"epoch": 2.1875792141951838,
"grad_norm": 0.35734203457832336,
"learning_rate": 0.00011267175572519085,
"loss": 0.2332,
"step": 864
},
{
"epoch": 2.1901140684410647,
"grad_norm": 0.32456931471824646,
"learning_rate": 0.00011256997455470739,
"loss": 0.1873,
"step": 865
},
{
"epoch": 2.1926489226869457,
"grad_norm": 0.5198532938957214,
"learning_rate": 0.00011246819338422391,
"loss": 0.2408,
"step": 866
},
{
"epoch": 2.195183776932826,
"grad_norm": 0.3863469362258911,
"learning_rate": 0.00011236641221374046,
"loss": 0.1778,
"step": 867
},
{
"epoch": 2.197718631178707,
"grad_norm": 0.39902037382125854,
"learning_rate": 0.00011226463104325699,
"loss": 0.1982,
"step": 868
},
{
"epoch": 2.200253485424588,
"grad_norm": 0.3974783718585968,
"learning_rate": 0.00011216284987277354,
"loss": 0.2157,
"step": 869
},
{
"epoch": 2.202788339670469,
"grad_norm": 0.33785662055015564,
"learning_rate": 0.00011206106870229007,
"loss": 0.2152,
"step": 870
},
{
"epoch": 2.20532319391635,
"grad_norm": 0.4233367145061493,
"learning_rate": 0.00011195928753180662,
"loss": 0.2992,
"step": 871
},
{
"epoch": 2.2078580481622305,
"grad_norm": 0.37665534019470215,
"learning_rate": 0.00011185750636132315,
"loss": 0.2273,
"step": 872
},
{
"epoch": 2.2103929024081115,
"grad_norm": 0.3841243088245392,
"learning_rate": 0.0001117557251908397,
"loss": 0.1991,
"step": 873
},
{
"epoch": 2.2129277566539924,
"grad_norm": 0.3544892966747284,
"learning_rate": 0.00011165394402035625,
"loss": 0.2098,
"step": 874
},
{
"epoch": 2.2154626108998734,
"grad_norm": 0.43662142753601074,
"learning_rate": 0.00011155216284987278,
"loss": 0.2411,
"step": 875
},
{
"epoch": 2.2179974651457544,
"grad_norm": 0.3305199146270752,
"learning_rate": 0.00011145038167938933,
"loss": 0.1803,
"step": 876
},
{
"epoch": 2.220532319391635,
"grad_norm": 0.34674328565597534,
"learning_rate": 0.00011134860050890586,
"loss": 0.2206,
"step": 877
},
{
"epoch": 2.223067173637516,
"grad_norm": 0.39985305070877075,
"learning_rate": 0.0001112468193384224,
"loss": 0.2951,
"step": 878
},
{
"epoch": 2.225602027883397,
"grad_norm": 0.36231693625450134,
"learning_rate": 0.00011114503816793894,
"loss": 0.2601,
"step": 879
},
{
"epoch": 2.2281368821292777,
"grad_norm": 0.4199659526348114,
"learning_rate": 0.00011104325699745548,
"loss": 0.2719,
"step": 880
},
{
"epoch": 2.2306717363751583,
"grad_norm": 0.3472574055194855,
"learning_rate": 0.000110941475826972,
"loss": 0.2437,
"step": 881
},
{
"epoch": 2.233206590621039,
"grad_norm": 0.2765200436115265,
"learning_rate": 0.00011083969465648855,
"loss": 0.1983,
"step": 882
},
{
"epoch": 2.23574144486692,
"grad_norm": 0.4466260075569153,
"learning_rate": 0.00011073791348600508,
"loss": 0.2323,
"step": 883
},
{
"epoch": 2.238276299112801,
"grad_norm": 0.43661364912986755,
"learning_rate": 0.00011063613231552163,
"loss": 0.2957,
"step": 884
},
{
"epoch": 2.240811153358682,
"grad_norm": 0.3262166976928711,
"learning_rate": 0.00011053435114503819,
"loss": 0.195,
"step": 885
},
{
"epoch": 2.2433460076045626,
"grad_norm": 0.5085666179656982,
"learning_rate": 0.00011043256997455471,
"loss": 0.3349,
"step": 886
},
{
"epoch": 2.2458808618504436,
"grad_norm": 0.46551409363746643,
"learning_rate": 0.00011033078880407126,
"loss": 0.3318,
"step": 887
},
{
"epoch": 2.2484157160963245,
"grad_norm": 0.425530344247818,
"learning_rate": 0.00011022900763358779,
"loss": 0.2857,
"step": 888
},
{
"epoch": 2.2509505703422055,
"grad_norm": 0.3377918601036072,
"learning_rate": 0.00011012722646310434,
"loss": 0.2215,
"step": 889
},
{
"epoch": 2.253485424588086,
"grad_norm": 0.3491476774215698,
"learning_rate": 0.00011002544529262087,
"loss": 0.2471,
"step": 890
},
{
"epoch": 2.256020278833967,
"grad_norm": 0.3779531419277191,
"learning_rate": 0.00010992366412213742,
"loss": 0.1984,
"step": 891
},
{
"epoch": 2.258555133079848,
"grad_norm": 0.425077885389328,
"learning_rate": 0.00010982188295165395,
"loss": 0.2535,
"step": 892
},
{
"epoch": 2.261089987325729,
"grad_norm": 0.40296900272369385,
"learning_rate": 0.00010972010178117049,
"loss": 0.1955,
"step": 893
},
{
"epoch": 2.26362484157161,
"grad_norm": 0.4394761919975281,
"learning_rate": 0.00010961832061068703,
"loss": 0.2638,
"step": 894
},
{
"epoch": 2.2661596958174903,
"grad_norm": 0.4743111729621887,
"learning_rate": 0.00010951653944020357,
"loss": 0.1932,
"step": 895
},
{
"epoch": 2.2686945500633713,
"grad_norm": 0.5121330618858337,
"learning_rate": 0.00010941475826972009,
"loss": 0.2541,
"step": 896
},
{
"epoch": 2.2712294043092522,
"grad_norm": 0.2810382544994354,
"learning_rate": 0.00010931297709923664,
"loss": 0.1884,
"step": 897
},
{
"epoch": 2.273764258555133,
"grad_norm": 0.3637334108352661,
"learning_rate": 0.0001092111959287532,
"loss": 0.2208,
"step": 898
},
{
"epoch": 2.2762991128010137,
"grad_norm": 0.4116186201572418,
"learning_rate": 0.00010910941475826972,
"loss": 0.1898,
"step": 899
},
{
"epoch": 2.2788339670468947,
"grad_norm": 0.4166296720504761,
"learning_rate": 0.00010900763358778628,
"loss": 0.2399,
"step": 900
},
{
"epoch": 2.2813688212927756,
"grad_norm": 0.5998784303665161,
"learning_rate": 0.0001089058524173028,
"loss": 0.2926,
"step": 901
},
{
"epoch": 2.2839036755386566,
"grad_norm": 0.6252371668815613,
"learning_rate": 0.00010880407124681935,
"loss": 0.2392,
"step": 902
},
{
"epoch": 2.2864385297845375,
"grad_norm": 0.4495537579059601,
"learning_rate": 0.00010870229007633588,
"loss": 0.2142,
"step": 903
},
{
"epoch": 2.288973384030418,
"grad_norm": 0.5659827589988708,
"learning_rate": 0.00010860050890585243,
"loss": 0.2993,
"step": 904
},
{
"epoch": 2.291508238276299,
"grad_norm": 0.4290786385536194,
"learning_rate": 0.00010849872773536896,
"loss": 0.3127,
"step": 905
},
{
"epoch": 2.29404309252218,
"grad_norm": 0.3835826516151428,
"learning_rate": 0.0001083969465648855,
"loss": 0.1927,
"step": 906
},
{
"epoch": 2.296577946768061,
"grad_norm": 0.4915788769721985,
"learning_rate": 0.00010829516539440204,
"loss": 0.2553,
"step": 907
},
{
"epoch": 2.299112801013942,
"grad_norm": 0.42122524976730347,
"learning_rate": 0.00010819338422391858,
"loss": 0.2133,
"step": 908
},
{
"epoch": 2.3016476552598224,
"grad_norm": 0.3904586732387543,
"learning_rate": 0.0001080916030534351,
"loss": 0.2064,
"step": 909
},
{
"epoch": 2.3041825095057034,
"grad_norm": 0.3680777847766876,
"learning_rate": 0.00010798982188295166,
"loss": 0.1989,
"step": 910
},
{
"epoch": 2.3067173637515843,
"grad_norm": 0.44054466485977173,
"learning_rate": 0.00010788804071246821,
"loss": 0.2386,
"step": 911
},
{
"epoch": 2.3092522179974653,
"grad_norm": 0.28730717301368713,
"learning_rate": 0.00010778625954198473,
"loss": 0.175,
"step": 912
},
{
"epoch": 2.3117870722433462,
"grad_norm": 0.4209315776824951,
"learning_rate": 0.00010768447837150129,
"loss": 0.2197,
"step": 913
},
{
"epoch": 2.3143219264892267,
"grad_norm": 0.41457393765449524,
"learning_rate": 0.00010758269720101781,
"loss": 0.202,
"step": 914
},
{
"epoch": 2.3168567807351077,
"grad_norm": 0.40807071328163147,
"learning_rate": 0.00010748091603053437,
"loss": 0.3087,
"step": 915
},
{
"epoch": 2.3193916349809887,
"grad_norm": 0.42118731141090393,
"learning_rate": 0.00010737913486005089,
"loss": 0.2269,
"step": 916
},
{
"epoch": 2.3219264892268696,
"grad_norm": 0.3436257541179657,
"learning_rate": 0.00010727735368956744,
"loss": 0.1987,
"step": 917
},
{
"epoch": 2.32446134347275,
"grad_norm": 0.3721463978290558,
"learning_rate": 0.00010717557251908397,
"loss": 0.2081,
"step": 918
},
{
"epoch": 2.326996197718631,
"grad_norm": 0.45050719380378723,
"learning_rate": 0.00010707379134860052,
"loss": 0.2199,
"step": 919
},
{
"epoch": 2.329531051964512,
"grad_norm": 0.42665717005729675,
"learning_rate": 0.00010697201017811705,
"loss": 0.2176,
"step": 920
},
{
"epoch": 2.332065906210393,
"grad_norm": 0.35217922925949097,
"learning_rate": 0.00010687022900763359,
"loss": 0.1915,
"step": 921
},
{
"epoch": 2.334600760456274,
"grad_norm": 0.5407602190971375,
"learning_rate": 0.00010676844783715014,
"loss": 0.2309,
"step": 922
},
{
"epoch": 2.3371356147021545,
"grad_norm": 0.6984291076660156,
"learning_rate": 0.00010666666666666667,
"loss": 0.2779,
"step": 923
},
{
"epoch": 2.3396704689480354,
"grad_norm": 0.5333911776542664,
"learning_rate": 0.00010656488549618322,
"loss": 0.2659,
"step": 924
},
{
"epoch": 2.3422053231939164,
"grad_norm": 0.5130952596664429,
"learning_rate": 0.00010646310432569974,
"loss": 0.315,
"step": 925
},
{
"epoch": 2.3447401774397973,
"grad_norm": 0.3874262869358063,
"learning_rate": 0.0001063613231552163,
"loss": 0.294,
"step": 926
},
{
"epoch": 2.347275031685678,
"grad_norm": 0.37864431738853455,
"learning_rate": 0.00010625954198473282,
"loss": 0.1894,
"step": 927
},
{
"epoch": 2.349809885931559,
"grad_norm": 0.406448632478714,
"learning_rate": 0.00010615776081424938,
"loss": 0.1913,
"step": 928
},
{
"epoch": 2.3523447401774398,
"grad_norm": 0.4278213381767273,
"learning_rate": 0.0001060559796437659,
"loss": 0.2136,
"step": 929
},
{
"epoch": 2.3548795944233207,
"grad_norm": 0.3853738009929657,
"learning_rate": 0.00010595419847328246,
"loss": 0.213,
"step": 930
},
{
"epoch": 2.3574144486692017,
"grad_norm": 0.3785664737224579,
"learning_rate": 0.00010585241730279898,
"loss": 0.22,
"step": 931
},
{
"epoch": 2.359949302915082,
"grad_norm": 0.5863676071166992,
"learning_rate": 0.00010575063613231553,
"loss": 0.2305,
"step": 932
},
{
"epoch": 2.362484157160963,
"grad_norm": 0.36629414558410645,
"learning_rate": 0.00010564885496183206,
"loss": 0.2041,
"step": 933
},
{
"epoch": 2.365019011406844,
"grad_norm": 0.44699156284332275,
"learning_rate": 0.0001055470737913486,
"loss": 0.2763,
"step": 934
},
{
"epoch": 2.367553865652725,
"grad_norm": 0.4775685667991638,
"learning_rate": 0.00010544529262086515,
"loss": 0.2779,
"step": 935
},
{
"epoch": 2.3700887198986056,
"grad_norm": 0.3192265033721924,
"learning_rate": 0.00010534351145038168,
"loss": 0.1861,
"step": 936
},
{
"epoch": 2.3726235741444865,
"grad_norm": 0.3589562177658081,
"learning_rate": 0.00010524173027989823,
"loss": 0.2266,
"step": 937
},
{
"epoch": 2.3751584283903675,
"grad_norm": 0.36193573474884033,
"learning_rate": 0.00010513994910941476,
"loss": 0.2105,
"step": 938
},
{
"epoch": 2.3776932826362485,
"grad_norm": 0.4141902029514313,
"learning_rate": 0.00010503816793893131,
"loss": 0.2676,
"step": 939
},
{
"epoch": 2.3802281368821294,
"grad_norm": 0.3118525445461273,
"learning_rate": 0.00010493638676844783,
"loss": 0.1941,
"step": 940
},
{
"epoch": 2.3827629911280104,
"grad_norm": 0.3232119679450989,
"learning_rate": 0.00010483460559796439,
"loss": 0.2065,
"step": 941
},
{
"epoch": 2.385297845373891,
"grad_norm": 0.30440258979797363,
"learning_rate": 0.00010473282442748091,
"loss": 0.1834,
"step": 942
},
{
"epoch": 2.387832699619772,
"grad_norm": 0.5841143131256104,
"learning_rate": 0.00010463104325699747,
"loss": 0.3785,
"step": 943
},
{
"epoch": 2.390367553865653,
"grad_norm": 0.31851619482040405,
"learning_rate": 0.00010452926208651399,
"loss": 0.1798,
"step": 944
},
{
"epoch": 2.3929024081115338,
"grad_norm": 0.3820517361164093,
"learning_rate": 0.00010442748091603054,
"loss": 0.2376,
"step": 945
},
{
"epoch": 2.3954372623574143,
"grad_norm": 0.4379272758960724,
"learning_rate": 0.00010432569974554708,
"loss": 0.2356,
"step": 946
},
{
"epoch": 2.3979721166032952,
"grad_norm": 0.3120323419570923,
"learning_rate": 0.00010422391857506362,
"loss": 0.1936,
"step": 947
},
{
"epoch": 2.400506970849176,
"grad_norm": 0.3143107295036316,
"learning_rate": 0.00010412213740458016,
"loss": 0.184,
"step": 948
},
{
"epoch": 2.403041825095057,
"grad_norm": 0.44618573784828186,
"learning_rate": 0.00010402035623409669,
"loss": 0.2468,
"step": 949
},
{
"epoch": 2.405576679340938,
"grad_norm": 0.3838117718696594,
"learning_rate": 0.00010391857506361324,
"loss": 0.2276,
"step": 950
},
{
"epoch": 2.4081115335868186,
"grad_norm": 0.3427219092845917,
"learning_rate": 0.00010381679389312977,
"loss": 0.2169,
"step": 951
},
{
"epoch": 2.4106463878326996,
"grad_norm": 0.3738270699977875,
"learning_rate": 0.00010371501272264632,
"loss": 0.2447,
"step": 952
},
{
"epoch": 2.4131812420785805,
"grad_norm": 0.33645015954971313,
"learning_rate": 0.00010361323155216285,
"loss": 0.1939,
"step": 953
},
{
"epoch": 2.4157160963244615,
"grad_norm": 0.45420047640800476,
"learning_rate": 0.0001035114503816794,
"loss": 0.242,
"step": 954
},
{
"epoch": 2.418250950570342,
"grad_norm": 0.47141382098197937,
"learning_rate": 0.00010340966921119592,
"loss": 0.2923,
"step": 955
},
{
"epoch": 2.420785804816223,
"grad_norm": 0.42177528142929077,
"learning_rate": 0.00010330788804071248,
"loss": 0.2827,
"step": 956
},
{
"epoch": 2.423320659062104,
"grad_norm": 0.409502774477005,
"learning_rate": 0.000103206106870229,
"loss": 0.2016,
"step": 957
},
{
"epoch": 2.425855513307985,
"grad_norm": 0.47684770822525024,
"learning_rate": 0.00010310432569974556,
"loss": 0.2093,
"step": 958
},
{
"epoch": 2.428390367553866,
"grad_norm": 0.3357095718383789,
"learning_rate": 0.0001030025445292621,
"loss": 0.1744,
"step": 959
},
{
"epoch": 2.4309252217997463,
"grad_norm": 0.4120575487613678,
"learning_rate": 0.00010290076335877863,
"loss": 0.214,
"step": 960
},
{
"epoch": 2.4334600760456273,
"grad_norm": 0.5090222954750061,
"learning_rate": 0.00010279898218829517,
"loss": 0.2427,
"step": 961
},
{
"epoch": 2.4359949302915083,
"grad_norm": 0.4142550528049469,
"learning_rate": 0.0001026972010178117,
"loss": 0.2412,
"step": 962
},
{
"epoch": 2.4385297845373892,
"grad_norm": 0.3446972966194153,
"learning_rate": 0.00010259541984732825,
"loss": 0.1952,
"step": 963
},
{
"epoch": 2.4410646387832697,
"grad_norm": 0.37858110666275024,
"learning_rate": 0.00010249363867684478,
"loss": 0.1964,
"step": 964
},
{
"epoch": 2.4435994930291507,
"grad_norm": 0.3989041745662689,
"learning_rate": 0.00010239185750636133,
"loss": 0.2115,
"step": 965
},
{
"epoch": 2.4461343472750317,
"grad_norm": 0.3948146402835846,
"learning_rate": 0.00010229007633587786,
"loss": 0.2067,
"step": 966
},
{
"epoch": 2.4486692015209126,
"grad_norm": 0.3683820068836212,
"learning_rate": 0.00010218829516539441,
"loss": 0.1881,
"step": 967
},
{
"epoch": 2.4512040557667936,
"grad_norm": 0.36742380261421204,
"learning_rate": 0.00010208651399491094,
"loss": 0.2302,
"step": 968
},
{
"epoch": 2.453738910012674,
"grad_norm": 0.32195988297462463,
"learning_rate": 0.00010198473282442749,
"loss": 0.1994,
"step": 969
},
{
"epoch": 2.456273764258555,
"grad_norm": 0.42296963930130005,
"learning_rate": 0.00010188295165394401,
"loss": 0.2657,
"step": 970
},
{
"epoch": 2.458808618504436,
"grad_norm": 0.3555774688720703,
"learning_rate": 0.00010178117048346057,
"loss": 0.1812,
"step": 971
},
{
"epoch": 2.461343472750317,
"grad_norm": 0.6991668343544006,
"learning_rate": 0.00010167938931297712,
"loss": 0.4318,
"step": 972
},
{
"epoch": 2.463878326996198,
"grad_norm": 0.4290355443954468,
"learning_rate": 0.00010157760814249365,
"loss": 0.1856,
"step": 973
},
{
"epoch": 2.4664131812420784,
"grad_norm": 0.3479045331478119,
"learning_rate": 0.00010147582697201018,
"loss": 0.1844,
"step": 974
},
{
"epoch": 2.4689480354879594,
"grad_norm": 0.3862701952457428,
"learning_rate": 0.00010137404580152672,
"loss": 0.2108,
"step": 975
},
{
"epoch": 2.4714828897338403,
"grad_norm": 0.34411442279815674,
"learning_rate": 0.00010127226463104326,
"loss": 0.1851,
"step": 976
},
{
"epoch": 2.4740177439797213,
"grad_norm": 0.2434609979391098,
"learning_rate": 0.00010117048346055979,
"loss": 0.1757,
"step": 977
},
{
"epoch": 2.4765525982256023,
"grad_norm": 0.3341599106788635,
"learning_rate": 0.00010106870229007634,
"loss": 0.1879,
"step": 978
},
{
"epoch": 2.4790874524714828,
"grad_norm": 0.27678003907203674,
"learning_rate": 0.00010096692111959287,
"loss": 0.1943,
"step": 979
},
{
"epoch": 2.4816223067173637,
"grad_norm": 0.2388005256652832,
"learning_rate": 0.00010086513994910942,
"loss": 0.1804,
"step": 980
},
{
"epoch": 2.4841571609632447,
"grad_norm": 0.5265661478042603,
"learning_rate": 0.00010076335877862595,
"loss": 0.2813,
"step": 981
},
{
"epoch": 2.4866920152091256,
"grad_norm": 0.337007075548172,
"learning_rate": 0.0001006615776081425,
"loss": 0.1976,
"step": 982
},
{
"epoch": 2.489226869455006,
"grad_norm": 0.42700427770614624,
"learning_rate": 0.00010055979643765905,
"loss": 0.2031,
"step": 983
},
{
"epoch": 2.491761723700887,
"grad_norm": 0.3900333642959595,
"learning_rate": 0.00010045801526717558,
"loss": 0.2178,
"step": 984
},
{
"epoch": 2.494296577946768,
"grad_norm": 0.45332932472229004,
"learning_rate": 0.00010035623409669213,
"loss": 0.2537,
"step": 985
},
{
"epoch": 2.496831432192649,
"grad_norm": 0.30331265926361084,
"learning_rate": 0.00010025445292620866,
"loss": 0.2074,
"step": 986
},
{
"epoch": 2.49936628643853,
"grad_norm": 0.3379949927330017,
"learning_rate": 0.0001001526717557252,
"loss": 0.1768,
"step": 987
},
{
"epoch": 2.5019011406844105,
"grad_norm": 0.40859973430633545,
"learning_rate": 0.00010005089058524174,
"loss": 0.1984,
"step": 988
},
{
"epoch": 2.5044359949302915,
"grad_norm": 0.3993757963180542,
"learning_rate": 9.994910941475827e-05,
"loss": 0.2162,
"step": 989
},
{
"epoch": 2.5069708491761724,
"grad_norm": 0.5887713432312012,
"learning_rate": 9.984732824427481e-05,
"loss": 0.2806,
"step": 990
},
{
"epoch": 2.5095057034220534,
"grad_norm": 0.3590678572654724,
"learning_rate": 9.974554707379135e-05,
"loss": 0.2045,
"step": 991
},
{
"epoch": 2.512040557667934,
"grad_norm": 0.3090289831161499,
"learning_rate": 9.964376590330789e-05,
"loss": 0.2151,
"step": 992
},
{
"epoch": 2.514575411913815,
"grad_norm": 0.42125657200813293,
"learning_rate": 9.954198473282443e-05,
"loss": 0.2277,
"step": 993
},
{
"epoch": 2.517110266159696,
"grad_norm": 0.3213401734828949,
"learning_rate": 9.944020356234097e-05,
"loss": 0.1927,
"step": 994
},
{
"epoch": 2.5196451204055768,
"grad_norm": 0.4558688998222351,
"learning_rate": 9.933842239185751e-05,
"loss": 0.2418,
"step": 995
},
{
"epoch": 2.5221799746514577,
"grad_norm": 0.5181113481521606,
"learning_rate": 9.923664122137405e-05,
"loss": 0.2955,
"step": 996
},
{
"epoch": 2.5247148288973387,
"grad_norm": 0.409424751996994,
"learning_rate": 9.913486005089059e-05,
"loss": 0.226,
"step": 997
},
{
"epoch": 2.527249683143219,
"grad_norm": 0.44536876678466797,
"learning_rate": 9.903307888040713e-05,
"loss": 0.2412,
"step": 998
},
{
"epoch": 2.5297845373891,
"grad_norm": 0.5028473734855652,
"learning_rate": 9.893129770992367e-05,
"loss": 0.2658,
"step": 999
},
{
"epoch": 2.532319391634981,
"grad_norm": 0.3157128691673279,
"learning_rate": 9.882951653944021e-05,
"loss": 0.1939,
"step": 1000
},
{
"epoch": 2.5348542458808616,
"grad_norm": 0.3184659481048584,
"learning_rate": 9.872773536895676e-05,
"loss": 0.2113,
"step": 1001
},
{
"epoch": 2.5373891001267426,
"grad_norm": 0.5658953785896301,
"learning_rate": 9.862595419847329e-05,
"loss": 0.2641,
"step": 1002
},
{
"epoch": 2.5399239543726235,
"grad_norm": 0.5306189060211182,
"learning_rate": 9.852417302798982e-05,
"loss": 0.2495,
"step": 1003
},
{
"epoch": 2.5424588086185045,
"grad_norm": 0.5272448062896729,
"learning_rate": 9.842239185750636e-05,
"loss": 0.2212,
"step": 1004
},
{
"epoch": 2.5449936628643854,
"grad_norm": 0.3216992914676666,
"learning_rate": 9.83206106870229e-05,
"loss": 0.2284,
"step": 1005
},
{
"epoch": 2.5475285171102664,
"grad_norm": 0.3573670983314514,
"learning_rate": 9.821882951653944e-05,
"loss": 0.2568,
"step": 1006
},
{
"epoch": 2.550063371356147,
"grad_norm": 0.4088655710220337,
"learning_rate": 9.811704834605598e-05,
"loss": 0.2033,
"step": 1007
},
{
"epoch": 2.552598225602028,
"grad_norm": 0.33729737997055054,
"learning_rate": 9.801526717557252e-05,
"loss": 0.1843,
"step": 1008
},
{
"epoch": 2.555133079847909,
"grad_norm": 0.3298558294773102,
"learning_rate": 9.791348600508906e-05,
"loss": 0.193,
"step": 1009
},
{
"epoch": 2.5576679340937893,
"grad_norm": 0.33454427123069763,
"learning_rate": 9.78117048346056e-05,
"loss": 0.1823,
"step": 1010
},
{
"epoch": 2.5602027883396703,
"grad_norm": 0.3466435670852661,
"learning_rate": 9.770992366412214e-05,
"loss": 0.2204,
"step": 1011
},
{
"epoch": 2.5627376425855513,
"grad_norm": 0.3551004230976105,
"learning_rate": 9.760814249363868e-05,
"loss": 0.2027,
"step": 1012
},
{
"epoch": 2.565272496831432,
"grad_norm": 0.4317062795162201,
"learning_rate": 9.750636132315523e-05,
"loss": 0.2099,
"step": 1013
},
{
"epoch": 2.567807351077313,
"grad_norm": 0.5695217847824097,
"learning_rate": 9.740458015267177e-05,
"loss": 0.2547,
"step": 1014
},
{
"epoch": 2.570342205323194,
"grad_norm": 0.4523742198944092,
"learning_rate": 9.730279898218831e-05,
"loss": 0.2501,
"step": 1015
},
{
"epoch": 2.5728770595690746,
"grad_norm": 0.3191470503807068,
"learning_rate": 9.720101781170484e-05,
"loss": 0.1918,
"step": 1016
},
{
"epoch": 2.5754119138149556,
"grad_norm": 0.36234062910079956,
"learning_rate": 9.709923664122138e-05,
"loss": 0.2081,
"step": 1017
},
{
"epoch": 2.5779467680608366,
"grad_norm": 0.42196425795555115,
"learning_rate": 9.699745547073791e-05,
"loss": 0.2801,
"step": 1018
},
{
"epoch": 2.5804816223067175,
"grad_norm": 0.3382538855075836,
"learning_rate": 9.689567430025445e-05,
"loss": 0.221,
"step": 1019
},
{
"epoch": 2.583016476552598,
"grad_norm": 0.5736209750175476,
"learning_rate": 9.679389312977099e-05,
"loss": 0.2684,
"step": 1020
},
{
"epoch": 2.585551330798479,
"grad_norm": 0.4692763686180115,
"learning_rate": 9.669211195928753e-05,
"loss": 0.244,
"step": 1021
},
{
"epoch": 2.58808618504436,
"grad_norm": 0.4888627827167511,
"learning_rate": 9.659033078880407e-05,
"loss": 0.2493,
"step": 1022
},
{
"epoch": 2.590621039290241,
"grad_norm": 0.29745686054229736,
"learning_rate": 9.648854961832061e-05,
"loss": 0.1757,
"step": 1023
},
{
"epoch": 2.593155893536122,
"grad_norm": 0.476639062166214,
"learning_rate": 9.638676844783715e-05,
"loss": 0.2031,
"step": 1024
},
{
"epoch": 2.5956907477820024,
"grad_norm": 0.4214845895767212,
"learning_rate": 9.628498727735369e-05,
"loss": 0.2588,
"step": 1025
},
{
"epoch": 2.5982256020278833,
"grad_norm": 0.3036046326160431,
"learning_rate": 9.618320610687024e-05,
"loss": 0.2031,
"step": 1026
},
{
"epoch": 2.6007604562737643,
"grad_norm": 0.7941879630088806,
"learning_rate": 9.608142493638678e-05,
"loss": 0.2096,
"step": 1027
},
{
"epoch": 2.6032953105196452,
"grad_norm": 0.36381933093070984,
"learning_rate": 9.597964376590332e-05,
"loss": 0.2102,
"step": 1028
},
{
"epoch": 2.6058301647655258,
"grad_norm": 0.3213381767272949,
"learning_rate": 9.587786259541986e-05,
"loss": 0.1884,
"step": 1029
},
{
"epoch": 2.6083650190114067,
"grad_norm": 0.38559427857398987,
"learning_rate": 9.577608142493639e-05,
"loss": 0.2229,
"step": 1030
},
{
"epoch": 2.6108998732572877,
"grad_norm": 0.4000662863254547,
"learning_rate": 9.567430025445293e-05,
"loss": 0.198,
"step": 1031
},
{
"epoch": 2.6134347275031686,
"grad_norm": 0.3635396659374237,
"learning_rate": 9.557251908396946e-05,
"loss": 0.2267,
"step": 1032
},
{
"epoch": 2.6159695817490496,
"grad_norm": 0.31810763478279114,
"learning_rate": 9.5470737913486e-05,
"loss": 0.1691,
"step": 1033
},
{
"epoch": 2.6185044359949305,
"grad_norm": 0.29606062173843384,
"learning_rate": 9.536895674300254e-05,
"loss": 0.1834,
"step": 1034
},
{
"epoch": 2.621039290240811,
"grad_norm": 0.3528769612312317,
"learning_rate": 9.526717557251908e-05,
"loss": 0.2086,
"step": 1035
},
{
"epoch": 2.623574144486692,
"grad_norm": 0.4795662760734558,
"learning_rate": 9.516539440203562e-05,
"loss": 0.2429,
"step": 1036
},
{
"epoch": 2.626108998732573,
"grad_norm": 0.4627299904823303,
"learning_rate": 9.506361323155216e-05,
"loss": 0.1956,
"step": 1037
},
{
"epoch": 2.6286438529784535,
"grad_norm": 0.3330387473106384,
"learning_rate": 9.496183206106871e-05,
"loss": 0.1891,
"step": 1038
},
{
"epoch": 2.6311787072243344,
"grad_norm": 0.4265390634536743,
"learning_rate": 9.486005089058525e-05,
"loss": 0.2086,
"step": 1039
},
{
"epoch": 2.6337135614702154,
"grad_norm": 0.37214142084121704,
"learning_rate": 9.475826972010179e-05,
"loss": 0.2321,
"step": 1040
},
{
"epoch": 2.6362484157160964,
"grad_norm": 0.4183201491832733,
"learning_rate": 9.465648854961833e-05,
"loss": 0.2029,
"step": 1041
},
{
"epoch": 2.6387832699619773,
"grad_norm": 0.5688794851303101,
"learning_rate": 9.455470737913487e-05,
"loss": 0.2481,
"step": 1042
},
{
"epoch": 2.6413181242078583,
"grad_norm": 0.38355833292007446,
"learning_rate": 9.445292620865141e-05,
"loss": 0.1989,
"step": 1043
},
{
"epoch": 2.643852978453739,
"grad_norm": 0.4998534023761749,
"learning_rate": 9.435114503816794e-05,
"loss": 0.2272,
"step": 1044
},
{
"epoch": 2.6463878326996197,
"grad_norm": 0.2796792685985565,
"learning_rate": 9.424936386768448e-05,
"loss": 0.1694,
"step": 1045
},
{
"epoch": 2.6489226869455007,
"grad_norm": 0.30551543831825256,
"learning_rate": 9.414758269720102e-05,
"loss": 0.1782,
"step": 1046
},
{
"epoch": 2.6514575411913817,
"grad_norm": 0.3933429718017578,
"learning_rate": 9.404580152671755e-05,
"loss": 0.272,
"step": 1047
},
{
"epoch": 2.653992395437262,
"grad_norm": 0.3543720841407776,
"learning_rate": 9.39440203562341e-05,
"loss": 0.2271,
"step": 1048
},
{
"epoch": 2.656527249683143,
"grad_norm": 0.2716831564903259,
"learning_rate": 9.384223918575063e-05,
"loss": 0.1898,
"step": 1049
},
{
"epoch": 2.659062103929024,
"grad_norm": 0.3037743866443634,
"learning_rate": 9.374045801526719e-05,
"loss": 0.1911,
"step": 1050
},
{
"epoch": 2.661596958174905,
"grad_norm": 0.4390093982219696,
"learning_rate": 9.363867684478373e-05,
"loss": 0.2369,
"step": 1051
},
{
"epoch": 2.664131812420786,
"grad_norm": 0.3383953273296356,
"learning_rate": 9.353689567430026e-05,
"loss": 0.2519,
"step": 1052
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.28227975964546204,
"learning_rate": 9.34351145038168e-05,
"loss": 0.1926,
"step": 1053
},
{
"epoch": 2.6692015209125475,
"grad_norm": 0.33451253175735474,
"learning_rate": 9.333333333333334e-05,
"loss": 0.1864,
"step": 1054
},
{
"epoch": 2.6717363751584284,
"grad_norm": 0.4116145372390747,
"learning_rate": 9.323155216284988e-05,
"loss": 0.2462,
"step": 1055
},
{
"epoch": 2.6742712294043094,
"grad_norm": 0.43822887539863586,
"learning_rate": 9.312977099236642e-05,
"loss": 0.2014,
"step": 1056
},
{
"epoch": 2.67680608365019,
"grad_norm": 0.4394984841346741,
"learning_rate": 9.302798982188296e-05,
"loss": 0.2378,
"step": 1057
},
{
"epoch": 2.679340937896071,
"grad_norm": 0.4073251783847809,
"learning_rate": 9.292620865139949e-05,
"loss": 0.2711,
"step": 1058
},
{
"epoch": 2.681875792141952,
"grad_norm": 0.3316657841205597,
"learning_rate": 9.282442748091603e-05,
"loss": 0.214,
"step": 1059
},
{
"epoch": 2.6844106463878328,
"grad_norm": 0.2994216978549957,
"learning_rate": 9.272264631043257e-05,
"loss": 0.1838,
"step": 1060
},
{
"epoch": 2.6869455006337137,
"grad_norm": 0.5388765335083008,
"learning_rate": 9.26208651399491e-05,
"loss": 0.277,
"step": 1061
},
{
"epoch": 2.6894803548795947,
"grad_norm": 0.3714945912361145,
"learning_rate": 9.251908396946566e-05,
"loss": 0.2428,
"step": 1062
},
{
"epoch": 2.692015209125475,
"grad_norm": 0.32202383875846863,
"learning_rate": 9.24173027989822e-05,
"loss": 0.2063,
"step": 1063
},
{
"epoch": 2.694550063371356,
"grad_norm": 0.4116881191730499,
"learning_rate": 9.231552162849874e-05,
"loss": 0.2661,
"step": 1064
},
{
"epoch": 2.697084917617237,
"grad_norm": 0.36626386642456055,
"learning_rate": 9.221374045801528e-05,
"loss": 0.2897,
"step": 1065
},
{
"epoch": 2.6996197718631176,
"grad_norm": 0.33859655261039734,
"learning_rate": 9.211195928753181e-05,
"loss": 0.1959,
"step": 1066
},
{
"epoch": 2.7021546261089986,
"grad_norm": 0.38263705372810364,
"learning_rate": 9.201017811704835e-05,
"loss": 0.2827,
"step": 1067
},
{
"epoch": 2.7046894803548795,
"grad_norm": 0.3557961583137512,
"learning_rate": 9.19083969465649e-05,
"loss": 0.176,
"step": 1068
},
{
"epoch": 2.7072243346007605,
"grad_norm": 0.35334861278533936,
"learning_rate": 9.180661577608143e-05,
"loss": 0.2183,
"step": 1069
},
{
"epoch": 2.7097591888466415,
"grad_norm": 0.4672026038169861,
"learning_rate": 9.170483460559797e-05,
"loss": 0.2715,
"step": 1070
},
{
"epoch": 2.7122940430925224,
"grad_norm": 0.41585099697113037,
"learning_rate": 9.160305343511451e-05,
"loss": 0.1912,
"step": 1071
},
{
"epoch": 2.714828897338403,
"grad_norm": 0.54674232006073,
"learning_rate": 9.150127226463104e-05,
"loss": 0.2493,
"step": 1072
},
{
"epoch": 2.717363751584284,
"grad_norm": 0.30595988035202026,
"learning_rate": 9.139949109414758e-05,
"loss": 0.1843,
"step": 1073
},
{
"epoch": 2.719898605830165,
"grad_norm": 0.3521415889263153,
"learning_rate": 9.129770992366413e-05,
"loss": 0.2047,
"step": 1074
},
{
"epoch": 2.7224334600760454,
"grad_norm": 0.47393590211868286,
"learning_rate": 9.119592875318067e-05,
"loss": 0.3398,
"step": 1075
},
{
"epoch": 2.7249683143219263,
"grad_norm": 0.4672793745994568,
"learning_rate": 9.109414758269721e-05,
"loss": 0.3569,
"step": 1076
},
{
"epoch": 2.7275031685678073,
"grad_norm": 0.41231435537338257,
"learning_rate": 9.099236641221375e-05,
"loss": 0.2323,
"step": 1077
},
{
"epoch": 2.7300380228136882,
"grad_norm": 0.36700156331062317,
"learning_rate": 9.089058524173029e-05,
"loss": 0.2023,
"step": 1078
},
{
"epoch": 2.732572877059569,
"grad_norm": 0.32198184728622437,
"learning_rate": 9.078880407124683e-05,
"loss": 0.1814,
"step": 1079
},
{
"epoch": 2.73510773130545,
"grad_norm": 0.46826303005218506,
"learning_rate": 9.068702290076337e-05,
"loss": 0.2216,
"step": 1080
},
{
"epoch": 2.7376425855513307,
"grad_norm": 0.3026100695133209,
"learning_rate": 9.05852417302799e-05,
"loss": 0.1826,
"step": 1081
},
{
"epoch": 2.7401774397972116,
"grad_norm": 0.2897210717201233,
"learning_rate": 9.048346055979644e-05,
"loss": 0.1853,
"step": 1082
},
{
"epoch": 2.7427122940430926,
"grad_norm": 0.296286940574646,
"learning_rate": 9.038167938931298e-05,
"loss": 0.1776,
"step": 1083
},
{
"epoch": 2.7452471482889735,
"grad_norm": 0.374600887298584,
"learning_rate": 9.027989821882952e-05,
"loss": 0.2031,
"step": 1084
},
{
"epoch": 2.747782002534854,
"grad_norm": 0.5333495140075684,
"learning_rate": 9.017811704834606e-05,
"loss": 0.2798,
"step": 1085
},
{
"epoch": 2.750316856780735,
"grad_norm": 0.43342864513397217,
"learning_rate": 9.007633587786259e-05,
"loss": 0.2063,
"step": 1086
},
{
"epoch": 2.752851711026616,
"grad_norm": 0.5283639430999756,
"learning_rate": 8.997455470737914e-05,
"loss": 0.25,
"step": 1087
},
{
"epoch": 2.755386565272497,
"grad_norm": 0.556190013885498,
"learning_rate": 8.987277353689568e-05,
"loss": 0.2044,
"step": 1088
},
{
"epoch": 2.757921419518378,
"grad_norm": 0.35083258152008057,
"learning_rate": 8.977099236641222e-05,
"loss": 0.188,
"step": 1089
},
{
"epoch": 2.7604562737642584,
"grad_norm": 0.42917102575302124,
"learning_rate": 8.966921119592876e-05,
"loss": 0.2511,
"step": 1090
},
{
"epoch": 2.7629911280101394,
"grad_norm": 0.5665780305862427,
"learning_rate": 8.95674300254453e-05,
"loss": 0.3307,
"step": 1091
},
{
"epoch": 2.7655259822560203,
"grad_norm": 0.40193435549736023,
"learning_rate": 8.946564885496184e-05,
"loss": 0.2453,
"step": 1092
},
{
"epoch": 2.7680608365019013,
"grad_norm": 0.46344733238220215,
"learning_rate": 8.936386768447838e-05,
"loss": 0.2096,
"step": 1093
},
{
"epoch": 2.770595690747782,
"grad_norm": 0.4600921869277954,
"learning_rate": 8.926208651399492e-05,
"loss": 0.2161,
"step": 1094
},
{
"epoch": 2.7731305449936627,
"grad_norm": 0.46053385734558105,
"learning_rate": 8.916030534351145e-05,
"loss": 0.2369,
"step": 1095
},
{
"epoch": 2.7756653992395437,
"grad_norm": 0.45449280738830566,
"learning_rate": 8.9058524173028e-05,
"loss": 0.2344,
"step": 1096
},
{
"epoch": 2.7782002534854247,
"grad_norm": 0.39411383867263794,
"learning_rate": 8.895674300254453e-05,
"loss": 0.2082,
"step": 1097
},
{
"epoch": 2.7807351077313056,
"grad_norm": 0.38967519998550415,
"learning_rate": 8.885496183206107e-05,
"loss": 0.2264,
"step": 1098
},
{
"epoch": 2.7832699619771866,
"grad_norm": 0.3357069194316864,
"learning_rate": 8.875318066157761e-05,
"loss": 0.1896,
"step": 1099
},
{
"epoch": 2.785804816223067,
"grad_norm": 0.4941220283508301,
"learning_rate": 8.865139949109415e-05,
"loss": 0.3003,
"step": 1100
},
{
"epoch": 2.788339670468948,
"grad_norm": 0.3897833526134491,
"learning_rate": 8.854961832061069e-05,
"loss": 0.1907,
"step": 1101
},
{
"epoch": 2.790874524714829,
"grad_norm": 0.4247800409793854,
"learning_rate": 8.844783715012723e-05,
"loss": 0.1843,
"step": 1102
},
{
"epoch": 2.7934093789607095,
"grad_norm": 0.46850237250328064,
"learning_rate": 8.834605597964377e-05,
"loss": 0.2501,
"step": 1103
},
{
"epoch": 2.7959442332065905,
"grad_norm": 0.4753093421459198,
"learning_rate": 8.824427480916031e-05,
"loss": 0.2277,
"step": 1104
},
{
"epoch": 2.7984790874524714,
"grad_norm": 0.3235141932964325,
"learning_rate": 8.814249363867685e-05,
"loss": 0.1817,
"step": 1105
},
{
"epoch": 2.8010139416983524,
"grad_norm": 0.48403674364089966,
"learning_rate": 8.804071246819339e-05,
"loss": 0.2278,
"step": 1106
},
{
"epoch": 2.8035487959442333,
"grad_norm": 0.30417025089263916,
"learning_rate": 8.793893129770993e-05,
"loss": 0.1867,
"step": 1107
},
{
"epoch": 2.8060836501901143,
"grad_norm": 0.30289140343666077,
"learning_rate": 8.783715012722647e-05,
"loss": 0.1898,
"step": 1108
},
{
"epoch": 2.808618504435995,
"grad_norm": 0.47156116366386414,
"learning_rate": 8.7735368956743e-05,
"loss": 0.2381,
"step": 1109
},
{
"epoch": 2.8111533586818758,
"grad_norm": 0.4420924186706543,
"learning_rate": 8.763358778625954e-05,
"loss": 0.251,
"step": 1110
},
{
"epoch": 2.8136882129277567,
"grad_norm": 0.42235851287841797,
"learning_rate": 8.75318066157761e-05,
"loss": 0.2007,
"step": 1111
},
{
"epoch": 2.8162230671736372,
"grad_norm": 0.40069061517715454,
"learning_rate": 8.743002544529262e-05,
"loss": 0.2052,
"step": 1112
},
{
"epoch": 2.818757921419518,
"grad_norm": 0.5213333368301392,
"learning_rate": 8.732824427480916e-05,
"loss": 0.2236,
"step": 1113
},
{
"epoch": 2.821292775665399,
"grad_norm": 0.3919121026992798,
"learning_rate": 8.72264631043257e-05,
"loss": 0.2338,
"step": 1114
},
{
"epoch": 2.82382762991128,
"grad_norm": 0.4295049011707306,
"learning_rate": 8.712468193384224e-05,
"loss": 0.2713,
"step": 1115
},
{
"epoch": 2.826362484157161,
"grad_norm": 0.25834596157073975,
"learning_rate": 8.702290076335878e-05,
"loss": 0.1701,
"step": 1116
},
{
"epoch": 2.828897338403042,
"grad_norm": 0.36217084527015686,
"learning_rate": 8.692111959287532e-05,
"loss": 0.1963,
"step": 1117
},
{
"epoch": 2.8314321926489225,
"grad_norm": 0.39089757204055786,
"learning_rate": 8.681933842239186e-05,
"loss": 0.186,
"step": 1118
},
{
"epoch": 2.8339670468948035,
"grad_norm": 0.45900896191596985,
"learning_rate": 8.67175572519084e-05,
"loss": 0.22,
"step": 1119
},
{
"epoch": 2.8365019011406845,
"grad_norm": 0.2946614623069763,
"learning_rate": 8.661577608142494e-05,
"loss": 0.1771,
"step": 1120
},
{
"epoch": 2.8390367553865654,
"grad_norm": 0.4160090982913971,
"learning_rate": 8.651399491094148e-05,
"loss": 0.2083,
"step": 1121
},
{
"epoch": 2.841571609632446,
"grad_norm": 0.43507587909698486,
"learning_rate": 8.641221374045802e-05,
"loss": 0.2595,
"step": 1122
},
{
"epoch": 2.844106463878327,
"grad_norm": 0.449813574552536,
"learning_rate": 8.631043256997457e-05,
"loss": 0.2982,
"step": 1123
},
{
"epoch": 2.846641318124208,
"grad_norm": 0.33715054392814636,
"learning_rate": 8.620865139949111e-05,
"loss": 0.1851,
"step": 1124
},
{
"epoch": 2.849176172370089,
"grad_norm": 0.4767422676086426,
"learning_rate": 8.610687022900765e-05,
"loss": 0.2865,
"step": 1125
},
{
"epoch": 2.8517110266159698,
"grad_norm": 0.4232870042324066,
"learning_rate": 8.600508905852417e-05,
"loss": 0.2355,
"step": 1126
},
{
"epoch": 2.8542458808618507,
"grad_norm": 0.286565363407135,
"learning_rate": 8.590330788804071e-05,
"loss": 0.188,
"step": 1127
},
{
"epoch": 2.8567807351077312,
"grad_norm": 0.304606169462204,
"learning_rate": 8.580152671755725e-05,
"loss": 0.2367,
"step": 1128
},
{
"epoch": 2.859315589353612,
"grad_norm": 0.4730917811393738,
"learning_rate": 8.569974554707379e-05,
"loss": 0.2925,
"step": 1129
},
{
"epoch": 2.861850443599493,
"grad_norm": 0.348651647567749,
"learning_rate": 8.559796437659033e-05,
"loss": 0.242,
"step": 1130
},
{
"epoch": 2.8643852978453737,
"grad_norm": 0.31156882643699646,
"learning_rate": 8.549618320610687e-05,
"loss": 0.1865,
"step": 1131
},
{
"epoch": 2.8669201520912546,
"grad_norm": 0.4416813254356384,
"learning_rate": 8.539440203562341e-05,
"loss": 0.311,
"step": 1132
},
{
"epoch": 2.8694550063371356,
"grad_norm": 0.2997666895389557,
"learning_rate": 8.529262086513995e-05,
"loss": 0.1956,
"step": 1133
},
{
"epoch": 2.8719898605830165,
"grad_norm": 0.30020904541015625,
"learning_rate": 8.519083969465649e-05,
"loss": 0.206,
"step": 1134
},
{
"epoch": 2.8745247148288975,
"grad_norm": 0.4457029104232788,
"learning_rate": 8.508905852417304e-05,
"loss": 0.2422,
"step": 1135
},
{
"epoch": 2.8770595690747784,
"grad_norm": 0.3519587218761444,
"learning_rate": 8.498727735368958e-05,
"loss": 0.2277,
"step": 1136
},
{
"epoch": 2.879594423320659,
"grad_norm": 0.3482111394405365,
"learning_rate": 8.488549618320612e-05,
"loss": 0.1981,
"step": 1137
},
{
"epoch": 2.88212927756654,
"grad_norm": 0.31978392601013184,
"learning_rate": 8.478371501272266e-05,
"loss": 0.1849,
"step": 1138
},
{
"epoch": 2.884664131812421,
"grad_norm": 0.2380414754152298,
"learning_rate": 8.46819338422392e-05,
"loss": 0.1619,
"step": 1139
},
{
"epoch": 2.8871989860583014,
"grad_norm": 0.25577735900878906,
"learning_rate": 8.458015267175572e-05,
"loss": 0.1594,
"step": 1140
},
{
"epoch": 2.8897338403041823,
"grad_norm": 0.36093661189079285,
"learning_rate": 8.447837150127226e-05,
"loss": 0.1937,
"step": 1141
},
{
"epoch": 2.8922686945500633,
"grad_norm": 0.3542689085006714,
"learning_rate": 8.43765903307888e-05,
"loss": 0.2219,
"step": 1142
},
{
"epoch": 2.8948035487959443,
"grad_norm": 0.3966139853000641,
"learning_rate": 8.427480916030534e-05,
"loss": 0.2427,
"step": 1143
},
{
"epoch": 2.897338403041825,
"grad_norm": 0.3684738278388977,
"learning_rate": 8.417302798982188e-05,
"loss": 0.2093,
"step": 1144
},
{
"epoch": 2.899873257287706,
"grad_norm": 0.430477499961853,
"learning_rate": 8.407124681933842e-05,
"loss": 0.2266,
"step": 1145
},
{
"epoch": 2.9024081115335867,
"grad_norm": 0.32896652817726135,
"learning_rate": 8.396946564885496e-05,
"loss": 0.2447,
"step": 1146
},
{
"epoch": 2.9049429657794676,
"grad_norm": 0.45568832755088806,
"learning_rate": 8.38676844783715e-05,
"loss": 0.2251,
"step": 1147
},
{
"epoch": 2.9074778200253486,
"grad_norm": 0.48290732502937317,
"learning_rate": 8.376590330788805e-05,
"loss": 0.2471,
"step": 1148
},
{
"epoch": 2.9100126742712296,
"grad_norm": 0.40795937180519104,
"learning_rate": 8.366412213740459e-05,
"loss": 0.2031,
"step": 1149
},
{
"epoch": 2.91254752851711,
"grad_norm": 0.362835168838501,
"learning_rate": 8.356234096692113e-05,
"loss": 0.1991,
"step": 1150
},
{
"epoch": 2.915082382762991,
"grad_norm": 0.38601744174957275,
"learning_rate": 8.346055979643767e-05,
"loss": 0.1821,
"step": 1151
},
{
"epoch": 2.917617237008872,
"grad_norm": 0.2641182541847229,
"learning_rate": 8.335877862595421e-05,
"loss": 0.16,
"step": 1152
},
{
"epoch": 2.920152091254753,
"grad_norm": 0.5600478053092957,
"learning_rate": 8.325699745547075e-05,
"loss": 0.2476,
"step": 1153
},
{
"epoch": 2.922686945500634,
"grad_norm": 0.3873019516468048,
"learning_rate": 8.315521628498727e-05,
"loss": 0.2264,
"step": 1154
},
{
"epoch": 2.9252217997465144,
"grad_norm": 0.2946743667125702,
"learning_rate": 8.305343511450381e-05,
"loss": 0.1776,
"step": 1155
},
{
"epoch": 2.9277566539923954,
"grad_norm": 0.3886416554450989,
"learning_rate": 8.295165394402035e-05,
"loss": 0.2123,
"step": 1156
},
{
"epoch": 2.9302915082382763,
"grad_norm": 0.39706671237945557,
"learning_rate": 8.284987277353689e-05,
"loss": 0.2319,
"step": 1157
},
{
"epoch": 2.9328263624841573,
"grad_norm": 0.30693602561950684,
"learning_rate": 8.274809160305343e-05,
"loss": 0.1939,
"step": 1158
},
{
"epoch": 2.935361216730038,
"grad_norm": 0.37277474999427795,
"learning_rate": 8.264631043256997e-05,
"loss": 0.2194,
"step": 1159
},
{
"epoch": 2.9378960709759188,
"grad_norm": 0.442508727312088,
"learning_rate": 8.254452926208652e-05,
"loss": 0.2142,
"step": 1160
},
{
"epoch": 2.9404309252217997,
"grad_norm": 0.275898814201355,
"learning_rate": 8.244274809160306e-05,
"loss": 0.1791,
"step": 1161
},
{
"epoch": 2.9429657794676807,
"grad_norm": 0.4033918082714081,
"learning_rate": 8.23409669211196e-05,
"loss": 0.295,
"step": 1162
},
{
"epoch": 2.9455006337135616,
"grad_norm": 0.46713244915008545,
"learning_rate": 8.223918575063614e-05,
"loss": 0.2662,
"step": 1163
},
{
"epoch": 2.9480354879594426,
"grad_norm": 0.37975406646728516,
"learning_rate": 8.213740458015268e-05,
"loss": 0.1915,
"step": 1164
},
{
"epoch": 2.950570342205323,
"grad_norm": 0.31382545828819275,
"learning_rate": 8.203562340966922e-05,
"loss": 0.1793,
"step": 1165
},
{
"epoch": 2.953105196451204,
"grad_norm": 0.42415499687194824,
"learning_rate": 8.193384223918576e-05,
"loss": 0.2375,
"step": 1166
},
{
"epoch": 2.955640050697085,
"grad_norm": 0.4227803647518158,
"learning_rate": 8.18320610687023e-05,
"loss": 0.213,
"step": 1167
},
{
"epoch": 2.9581749049429655,
"grad_norm": 0.3395853638648987,
"learning_rate": 8.173027989821882e-05,
"loss": 0.1942,
"step": 1168
},
{
"epoch": 2.9607097591888465,
"grad_norm": 0.4627746641635895,
"learning_rate": 8.162849872773536e-05,
"loss": 0.2266,
"step": 1169
},
{
"epoch": 2.9632446134347274,
"grad_norm": 0.36325398087501526,
"learning_rate": 8.15267175572519e-05,
"loss": 0.2176,
"step": 1170
},
{
"epoch": 2.9657794676806084,
"grad_norm": 0.4188767671585083,
"learning_rate": 8.142493638676844e-05,
"loss": 0.1992,
"step": 1171
},
{
"epoch": 2.9683143219264894,
"grad_norm": 0.3149709403514862,
"learning_rate": 8.1323155216285e-05,
"loss": 0.1829,
"step": 1172
},
{
"epoch": 2.9708491761723703,
"grad_norm": 0.26542145013809204,
"learning_rate": 8.122137404580153e-05,
"loss": 0.1801,
"step": 1173
},
{
"epoch": 2.973384030418251,
"grad_norm": 0.28748998045921326,
"learning_rate": 8.111959287531807e-05,
"loss": 0.1764,
"step": 1174
},
{
"epoch": 2.975918884664132,
"grad_norm": 0.3103797733783722,
"learning_rate": 8.101781170483461e-05,
"loss": 0.2047,
"step": 1175
},
{
"epoch": 2.9784537389100127,
"grad_norm": 0.3357256054878235,
"learning_rate": 8.091603053435115e-05,
"loss": 0.2303,
"step": 1176
},
{
"epoch": 2.9809885931558933,
"grad_norm": 0.4399915933609009,
"learning_rate": 8.081424936386769e-05,
"loss": 0.2423,
"step": 1177
},
{
"epoch": 2.983523447401774,
"grad_norm": 0.3486070930957794,
"learning_rate": 8.071246819338423e-05,
"loss": 0.19,
"step": 1178
},
{
"epoch": 2.986058301647655,
"grad_norm": 0.33286648988723755,
"learning_rate": 8.061068702290077e-05,
"loss": 0.1788,
"step": 1179
},
{
"epoch": 2.988593155893536,
"grad_norm": 0.2841028571128845,
"learning_rate": 8.050890585241731e-05,
"loss": 0.167,
"step": 1180
},
{
"epoch": 2.991128010139417,
"grad_norm": 0.44933149218559265,
"learning_rate": 8.040712468193385e-05,
"loss": 0.3098,
"step": 1181
},
{
"epoch": 2.993662864385298,
"grad_norm": 0.2849741280078888,
"learning_rate": 8.030534351145038e-05,
"loss": 0.1896,
"step": 1182
},
{
"epoch": 2.9961977186311786,
"grad_norm": 0.39720216393470764,
"learning_rate": 8.020356234096691e-05,
"loss": 0.2426,
"step": 1183
},
{
"epoch": 2.9987325728770595,
"grad_norm": 0.3838231563568115,
"learning_rate": 8.010178117048347e-05,
"loss": 0.2194,
"step": 1184
},
{
"epoch": 3.0,
"grad_norm": 0.6684709787368774,
"learning_rate": 8e-05,
"loss": 0.2783,
"step": 1185
},
{
"epoch": 3.002534854245881,
"grad_norm": 0.44380757212638855,
"learning_rate": 7.989821882951655e-05,
"loss": 0.2938,
"step": 1186
},
{
"epoch": 3.005069708491762,
"grad_norm": 0.4787996709346771,
"learning_rate": 7.979643765903309e-05,
"loss": 0.2998,
"step": 1187
},
{
"epoch": 3.0076045627376424,
"grad_norm": 0.36355340480804443,
"learning_rate": 7.969465648854962e-05,
"loss": 0.1555,
"step": 1188
},
{
"epoch": 3.0101394169835234,
"grad_norm": 0.37890535593032837,
"learning_rate": 7.959287531806616e-05,
"loss": 0.1743,
"step": 1189
},
{
"epoch": 3.0126742712294043,
"grad_norm": 0.4317542612552643,
"learning_rate": 7.94910941475827e-05,
"loss": 0.1891,
"step": 1190
},
{
"epoch": 3.0152091254752853,
"grad_norm": 0.3477863669395447,
"learning_rate": 7.938931297709924e-05,
"loss": 0.1576,
"step": 1191
},
{
"epoch": 3.017743979721166,
"grad_norm": 0.414050817489624,
"learning_rate": 7.928753180661578e-05,
"loss": 0.2014,
"step": 1192
},
{
"epoch": 3.0202788339670468,
"grad_norm": 0.3596842288970947,
"learning_rate": 7.918575063613232e-05,
"loss": 0.1482,
"step": 1193
},
{
"epoch": 3.0228136882129277,
"grad_norm": 0.49169921875,
"learning_rate": 7.908396946564886e-05,
"loss": 0.1686,
"step": 1194
},
{
"epoch": 3.0253485424588087,
"grad_norm": 0.44806674122810364,
"learning_rate": 7.89821882951654e-05,
"loss": 0.2044,
"step": 1195
},
{
"epoch": 3.0278833967046896,
"grad_norm": 0.43101197481155396,
"learning_rate": 7.888040712468194e-05,
"loss": 0.1911,
"step": 1196
},
{
"epoch": 3.03041825095057,
"grad_norm": 0.5595632195472717,
"learning_rate": 7.877862595419848e-05,
"loss": 0.1823,
"step": 1197
},
{
"epoch": 3.032953105196451,
"grad_norm": 0.5024780035018921,
"learning_rate": 7.867684478371502e-05,
"loss": 0.1789,
"step": 1198
},
{
"epoch": 3.035487959442332,
"grad_norm": 0.4227488934993744,
"learning_rate": 7.857506361323156e-05,
"loss": 0.1539,
"step": 1199
},
{
"epoch": 3.038022813688213,
"grad_norm": 0.43486127257347107,
"learning_rate": 7.84732824427481e-05,
"loss": 0.1577,
"step": 1200
},
{
"epoch": 3.040557667934094,
"grad_norm": 0.47951167821884155,
"learning_rate": 7.837150127226464e-05,
"loss": 0.1975,
"step": 1201
},
{
"epoch": 3.0430925221799745,
"grad_norm": 0.4223075211048126,
"learning_rate": 7.826972010178117e-05,
"loss": 0.1719,
"step": 1202
},
{
"epoch": 3.0456273764258555,
"grad_norm": 0.6699900031089783,
"learning_rate": 7.816793893129771e-05,
"loss": 0.2139,
"step": 1203
},
{
"epoch": 3.0481622306717364,
"grad_norm": 0.6038373708724976,
"learning_rate": 7.806615776081425e-05,
"loss": 0.2163,
"step": 1204
},
{
"epoch": 3.0506970849176174,
"grad_norm": 0.530208945274353,
"learning_rate": 7.796437659033079e-05,
"loss": 0.1482,
"step": 1205
},
{
"epoch": 3.053231939163498,
"grad_norm": 0.6380701661109924,
"learning_rate": 7.786259541984733e-05,
"loss": 0.2191,
"step": 1206
},
{
"epoch": 3.055766793409379,
"grad_norm": 0.6455860137939453,
"learning_rate": 7.776081424936387e-05,
"loss": 0.1812,
"step": 1207
},
{
"epoch": 3.05830164765526,
"grad_norm": 0.5198556184768677,
"learning_rate": 7.765903307888041e-05,
"loss": 0.1602,
"step": 1208
},
{
"epoch": 3.0608365019011408,
"grad_norm": 0.4842750132083893,
"learning_rate": 7.755725190839695e-05,
"loss": 0.1739,
"step": 1209
},
{
"epoch": 3.0633713561470217,
"grad_norm": 0.6345165371894836,
"learning_rate": 7.745547073791349e-05,
"loss": 0.1841,
"step": 1210
},
{
"epoch": 3.0659062103929022,
"grad_norm": 0.551673173904419,
"learning_rate": 7.735368956743003e-05,
"loss": 0.1755,
"step": 1211
},
{
"epoch": 3.068441064638783,
"grad_norm": 0.5332705974578857,
"learning_rate": 7.725190839694657e-05,
"loss": 0.2175,
"step": 1212
},
{
"epoch": 3.070975918884664,
"grad_norm": 0.6630911231040955,
"learning_rate": 7.715012722646311e-05,
"loss": 0.2868,
"step": 1213
},
{
"epoch": 3.073510773130545,
"grad_norm": 0.42508792877197266,
"learning_rate": 7.704834605597965e-05,
"loss": 0.1811,
"step": 1214
},
{
"epoch": 3.076045627376426,
"grad_norm": 0.504231870174408,
"learning_rate": 7.694656488549619e-05,
"loss": 0.1765,
"step": 1215
},
{
"epoch": 3.0785804816223066,
"grad_norm": 0.39370813965797424,
"learning_rate": 7.684478371501273e-05,
"loss": 0.1739,
"step": 1216
},
{
"epoch": 3.0811153358681875,
"grad_norm": 0.5411176085472107,
"learning_rate": 7.674300254452926e-05,
"loss": 0.2015,
"step": 1217
},
{
"epoch": 3.0836501901140685,
"grad_norm": 0.58034348487854,
"learning_rate": 7.66412213740458e-05,
"loss": 0.2293,
"step": 1218
},
{
"epoch": 3.0861850443599494,
"grad_norm": 0.48355352878570557,
"learning_rate": 7.653944020356234e-05,
"loss": 0.1858,
"step": 1219
},
{
"epoch": 3.08871989860583,
"grad_norm": 0.3532313406467438,
"learning_rate": 7.643765903307888e-05,
"loss": 0.1689,
"step": 1220
},
{
"epoch": 3.091254752851711,
"grad_norm": 0.36245197057724,
"learning_rate": 7.633587786259542e-05,
"loss": 0.1744,
"step": 1221
},
{
"epoch": 3.093789607097592,
"grad_norm": 0.4752829372882843,
"learning_rate": 7.623409669211196e-05,
"loss": 0.1733,
"step": 1222
},
{
"epoch": 3.096324461343473,
"grad_norm": 0.3701539933681488,
"learning_rate": 7.61323155216285e-05,
"loss": 0.158,
"step": 1223
},
{
"epoch": 3.098859315589354,
"grad_norm": 0.45548319816589355,
"learning_rate": 7.603053435114504e-05,
"loss": 0.1822,
"step": 1224
},
{
"epoch": 3.1013941698352343,
"grad_norm": 0.376499205827713,
"learning_rate": 7.592875318066158e-05,
"loss": 0.1613,
"step": 1225
},
{
"epoch": 3.1039290240811153,
"grad_norm": 0.4430786967277527,
"learning_rate": 7.582697201017812e-05,
"loss": 0.1691,
"step": 1226
},
{
"epoch": 3.106463878326996,
"grad_norm": 0.44311538338661194,
"learning_rate": 7.572519083969466e-05,
"loss": 0.1853,
"step": 1227
},
{
"epoch": 3.108998732572877,
"grad_norm": 0.5815149545669556,
"learning_rate": 7.56234096692112e-05,
"loss": 0.2039,
"step": 1228
},
{
"epoch": 3.111533586818758,
"grad_norm": 0.5101373195648193,
"learning_rate": 7.552162849872774e-05,
"loss": 0.2022,
"step": 1229
},
{
"epoch": 3.1140684410646386,
"grad_norm": 0.6038093566894531,
"learning_rate": 7.541984732824428e-05,
"loss": 0.1859,
"step": 1230
},
{
"epoch": 3.1166032953105196,
"grad_norm": 0.5133914351463318,
"learning_rate": 7.531806615776081e-05,
"loss": 0.1626,
"step": 1231
},
{
"epoch": 3.1191381495564006,
"grad_norm": 0.40495821833610535,
"learning_rate": 7.521628498727735e-05,
"loss": 0.1739,
"step": 1232
},
{
"epoch": 3.1216730038022815,
"grad_norm": 0.6585063934326172,
"learning_rate": 7.511450381679391e-05,
"loss": 0.2402,
"step": 1233
},
{
"epoch": 3.124207858048162,
"grad_norm": 0.45598068833351135,
"learning_rate": 7.501272264631045e-05,
"loss": 0.1632,
"step": 1234
},
{
"epoch": 3.126742712294043,
"grad_norm": 0.42114904522895813,
"learning_rate": 7.491094147582699e-05,
"loss": 0.1638,
"step": 1235
},
{
"epoch": 3.129277566539924,
"grad_norm": 0.443198561668396,
"learning_rate": 7.480916030534351e-05,
"loss": 0.2148,
"step": 1236
},
{
"epoch": 3.131812420785805,
"grad_norm": 0.5573143362998962,
"learning_rate": 7.470737913486005e-05,
"loss": 0.2219,
"step": 1237
},
{
"epoch": 3.134347275031686,
"grad_norm": 0.6023311614990234,
"learning_rate": 7.460559796437659e-05,
"loss": 0.1987,
"step": 1238
},
{
"epoch": 3.1368821292775664,
"grad_norm": 0.5282934904098511,
"learning_rate": 7.450381679389313e-05,
"loss": 0.2377,
"step": 1239
},
{
"epoch": 3.1394169835234473,
"grad_norm": 0.49694669246673584,
"learning_rate": 7.440203562340967e-05,
"loss": 0.1804,
"step": 1240
},
{
"epoch": 3.1419518377693283,
"grad_norm": 0.43045276403427124,
"learning_rate": 7.430025445292621e-05,
"loss": 0.1635,
"step": 1241
},
{
"epoch": 3.1444866920152093,
"grad_norm": 0.4798453152179718,
"learning_rate": 7.419847328244275e-05,
"loss": 0.1696,
"step": 1242
},
{
"epoch": 3.14702154626109,
"grad_norm": 0.5173293352127075,
"learning_rate": 7.409669211195929e-05,
"loss": 0.1802,
"step": 1243
},
{
"epoch": 3.1495564005069707,
"grad_norm": 0.5398945808410645,
"learning_rate": 7.399491094147583e-05,
"loss": 0.1949,
"step": 1244
},
{
"epoch": 3.1520912547528517,
"grad_norm": 0.5297830700874329,
"learning_rate": 7.389312977099238e-05,
"loss": 0.1987,
"step": 1245
},
{
"epoch": 3.1546261089987326,
"grad_norm": 0.5320866703987122,
"learning_rate": 7.379134860050892e-05,
"loss": 0.1715,
"step": 1246
},
{
"epoch": 3.1571609632446136,
"grad_norm": 0.6132882833480835,
"learning_rate": 7.368956743002546e-05,
"loss": 0.3204,
"step": 1247
},
{
"epoch": 3.159695817490494,
"grad_norm": 0.4120640158653259,
"learning_rate": 7.3587786259542e-05,
"loss": 0.157,
"step": 1248
},
{
"epoch": 3.162230671736375,
"grad_norm": 0.6765384674072266,
"learning_rate": 7.348600508905854e-05,
"loss": 0.2186,
"step": 1249
},
{
"epoch": 3.164765525982256,
"grad_norm": 0.6318830847740173,
"learning_rate": 7.338422391857506e-05,
"loss": 0.2189,
"step": 1250
},
{
"epoch": 3.167300380228137,
"grad_norm": 0.508305013179779,
"learning_rate": 7.32824427480916e-05,
"loss": 0.1962,
"step": 1251
},
{
"epoch": 3.169835234474018,
"grad_norm": 0.603520393371582,
"learning_rate": 7.318066157760814e-05,
"loss": 0.2615,
"step": 1252
},
{
"epoch": 3.1723700887198985,
"grad_norm": 0.7639157176017761,
"learning_rate": 7.307888040712468e-05,
"loss": 0.2982,
"step": 1253
},
{
"epoch": 3.1749049429657794,
"grad_norm": 0.5995659232139587,
"learning_rate": 7.297709923664122e-05,
"loss": 0.2206,
"step": 1254
},
{
"epoch": 3.1774397972116604,
"grad_norm": 0.6512479186058044,
"learning_rate": 7.287531806615776e-05,
"loss": 0.2065,
"step": 1255
},
{
"epoch": 3.1799746514575413,
"grad_norm": 0.4128544330596924,
"learning_rate": 7.27735368956743e-05,
"loss": 0.1589,
"step": 1256
},
{
"epoch": 3.182509505703422,
"grad_norm": 0.5341802835464478,
"learning_rate": 7.267175572519084e-05,
"loss": 0.1812,
"step": 1257
},
{
"epoch": 3.185044359949303,
"grad_norm": 0.38032597303390503,
"learning_rate": 7.256997455470739e-05,
"loss": 0.1773,
"step": 1258
},
{
"epoch": 3.1875792141951838,
"grad_norm": 0.5732728838920593,
"learning_rate": 7.246819338422393e-05,
"loss": 0.2047,
"step": 1259
},
{
"epoch": 3.1901140684410647,
"grad_norm": 0.47396236658096313,
"learning_rate": 7.236641221374047e-05,
"loss": 0.2095,
"step": 1260
},
{
"epoch": 3.1926489226869457,
"grad_norm": 0.4764629304409027,
"learning_rate": 7.226463104325701e-05,
"loss": 0.1802,
"step": 1261
},
{
"epoch": 3.195183776932826,
"grad_norm": 0.5802401304244995,
"learning_rate": 7.216284987277355e-05,
"loss": 0.1821,
"step": 1262
},
{
"epoch": 3.197718631178707,
"grad_norm": 0.47988972067832947,
"learning_rate": 7.206106870229009e-05,
"loss": 0.163,
"step": 1263
},
{
"epoch": 3.200253485424588,
"grad_norm": 0.48500359058380127,
"learning_rate": 7.195928753180661e-05,
"loss": 0.1739,
"step": 1264
},
{
"epoch": 3.202788339670469,
"grad_norm": 0.7479031682014465,
"learning_rate": 7.185750636132315e-05,
"loss": 0.2646,
"step": 1265
},
{
"epoch": 3.20532319391635,
"grad_norm": 0.48695701360702515,
"learning_rate": 7.175572519083969e-05,
"loss": 0.1822,
"step": 1266
},
{
"epoch": 3.2078580481622305,
"grad_norm": 0.712354838848114,
"learning_rate": 7.165394402035623e-05,
"loss": 0.1827,
"step": 1267
},
{
"epoch": 3.2103929024081115,
"grad_norm": 0.4304606020450592,
"learning_rate": 7.155216284987277e-05,
"loss": 0.1759,
"step": 1268
},
{
"epoch": 3.2129277566539924,
"grad_norm": 0.44741392135620117,
"learning_rate": 7.145038167938931e-05,
"loss": 0.1979,
"step": 1269
},
{
"epoch": 3.2154626108998734,
"grad_norm": 0.3691045045852661,
"learning_rate": 7.134860050890586e-05,
"loss": 0.1575,
"step": 1270
},
{
"epoch": 3.2179974651457544,
"grad_norm": 0.4908023476600647,
"learning_rate": 7.12468193384224e-05,
"loss": 0.1854,
"step": 1271
},
{
"epoch": 3.220532319391635,
"grad_norm": 0.3953510820865631,
"learning_rate": 7.114503816793894e-05,
"loss": 0.1821,
"step": 1272
},
{
"epoch": 3.223067173637516,
"grad_norm": 0.35227248072624207,
"learning_rate": 7.104325699745548e-05,
"loss": 0.173,
"step": 1273
},
{
"epoch": 3.225602027883397,
"grad_norm": 0.41285187005996704,
"learning_rate": 7.094147582697202e-05,
"loss": 0.1708,
"step": 1274
},
{
"epoch": 3.2281368821292777,
"grad_norm": 0.5076828002929688,
"learning_rate": 7.083969465648856e-05,
"loss": 0.2128,
"step": 1275
},
{
"epoch": 3.2306717363751583,
"grad_norm": 0.5385151505470276,
"learning_rate": 7.07379134860051e-05,
"loss": 0.2181,
"step": 1276
},
{
"epoch": 3.233206590621039,
"grad_norm": 0.4620850086212158,
"learning_rate": 7.063613231552164e-05,
"loss": 0.212,
"step": 1277
},
{
"epoch": 3.23574144486692,
"grad_norm": 0.6768701672554016,
"learning_rate": 7.053435114503816e-05,
"loss": 0.2704,
"step": 1278
},
{
"epoch": 3.238276299112801,
"grad_norm": 0.43216967582702637,
"learning_rate": 7.04325699745547e-05,
"loss": 0.1633,
"step": 1279
},
{
"epoch": 3.240811153358682,
"grad_norm": 0.3756103813648224,
"learning_rate": 7.033078880407124e-05,
"loss": 0.1767,
"step": 1280
},
{
"epoch": 3.2433460076045626,
"grad_norm": 0.612819254398346,
"learning_rate": 7.022900763358778e-05,
"loss": 0.2563,
"step": 1281
},
{
"epoch": 3.2458808618504436,
"grad_norm": 0.5477813482284546,
"learning_rate": 7.012722646310433e-05,
"loss": 0.2053,
"step": 1282
},
{
"epoch": 3.2484157160963245,
"grad_norm": 0.3412390351295471,
"learning_rate": 7.002544529262087e-05,
"loss": 0.1506,
"step": 1283
},
{
"epoch": 3.2509505703422055,
"grad_norm": 0.34337860345840454,
"learning_rate": 6.992366412213741e-05,
"loss": 0.1612,
"step": 1284
},
{
"epoch": 3.253485424588086,
"grad_norm": 0.37943509221076965,
"learning_rate": 6.982188295165395e-05,
"loss": 0.168,
"step": 1285
},
{
"epoch": 3.256020278833967,
"grad_norm": 0.6030418872833252,
"learning_rate": 6.972010178117049e-05,
"loss": 0.2146,
"step": 1286
},
{
"epoch": 3.258555133079848,
"grad_norm": 0.34367507696151733,
"learning_rate": 6.961832061068703e-05,
"loss": 0.1726,
"step": 1287
},
{
"epoch": 3.261089987325729,
"grad_norm": 0.3952295780181885,
"learning_rate": 6.951653944020357e-05,
"loss": 0.1754,
"step": 1288
},
{
"epoch": 3.26362484157161,
"grad_norm": 0.5151681900024414,
"learning_rate": 6.941475826972011e-05,
"loss": 0.1849,
"step": 1289
},
{
"epoch": 3.2661596958174903,
"grad_norm": 0.496988445520401,
"learning_rate": 6.931297709923665e-05,
"loss": 0.1938,
"step": 1290
},
{
"epoch": 3.2686945500633713,
"grad_norm": 0.45343711972236633,
"learning_rate": 6.921119592875319e-05,
"loss": 0.1845,
"step": 1291
},
{
"epoch": 3.2712294043092522,
"grad_norm": 0.5323635935783386,
"learning_rate": 6.910941475826971e-05,
"loss": 0.177,
"step": 1292
},
{
"epoch": 3.273764258555133,
"grad_norm": 0.39680036902427673,
"learning_rate": 6.900763358778625e-05,
"loss": 0.1843,
"step": 1293
},
{
"epoch": 3.2762991128010137,
"grad_norm": 0.4767110049724579,
"learning_rate": 6.89058524173028e-05,
"loss": 0.2103,
"step": 1294
},
{
"epoch": 3.2788339670468947,
"grad_norm": 0.5565052032470703,
"learning_rate": 6.880407124681934e-05,
"loss": 0.2185,
"step": 1295
},
{
"epoch": 3.2813688212927756,
"grad_norm": 0.5472534894943237,
"learning_rate": 6.870229007633588e-05,
"loss": 0.2237,
"step": 1296
},
{
"epoch": 3.2839036755386566,
"grad_norm": 0.632560133934021,
"learning_rate": 6.860050890585242e-05,
"loss": 0.2213,
"step": 1297
},
{
"epoch": 3.2864385297845375,
"grad_norm": 0.5626386404037476,
"learning_rate": 6.849872773536896e-05,
"loss": 0.2324,
"step": 1298
},
{
"epoch": 3.288973384030418,
"grad_norm": 0.5527671575546265,
"learning_rate": 6.83969465648855e-05,
"loss": 0.227,
"step": 1299
},
{
"epoch": 3.291508238276299,
"grad_norm": 0.6093178391456604,
"learning_rate": 6.829516539440204e-05,
"loss": 0.2368,
"step": 1300
},
{
"epoch": 3.29404309252218,
"grad_norm": 0.3845243453979492,
"learning_rate": 6.819338422391858e-05,
"loss": 0.1804,
"step": 1301
},
{
"epoch": 3.296577946768061,
"grad_norm": 0.6384890079498291,
"learning_rate": 6.809160305343512e-05,
"loss": 0.2598,
"step": 1302
},
{
"epoch": 3.299112801013942,
"grad_norm": 0.5135822892189026,
"learning_rate": 6.798982188295166e-05,
"loss": 0.2142,
"step": 1303
},
{
"epoch": 3.3016476552598224,
"grad_norm": 0.4996071457862854,
"learning_rate": 6.78880407124682e-05,
"loss": 0.2107,
"step": 1304
},
{
"epoch": 3.3041825095057034,
"grad_norm": 0.31445005536079407,
"learning_rate": 6.778625954198474e-05,
"loss": 0.1764,
"step": 1305
},
{
"epoch": 3.3067173637515843,
"grad_norm": 0.544301450252533,
"learning_rate": 6.768447837150128e-05,
"loss": 0.2856,
"step": 1306
},
{
"epoch": 3.3092522179974653,
"grad_norm": 0.5029551982879639,
"learning_rate": 6.758269720101782e-05,
"loss": 0.2374,
"step": 1307
},
{
"epoch": 3.3117870722433462,
"grad_norm": 0.3769523799419403,
"learning_rate": 6.748091603053436e-05,
"loss": 0.1853,
"step": 1308
},
{
"epoch": 3.3143219264892267,
"grad_norm": 0.3540287911891937,
"learning_rate": 6.73791348600509e-05,
"loss": 0.193,
"step": 1309
},
{
"epoch": 3.3168567807351077,
"grad_norm": 0.42674198746681213,
"learning_rate": 6.727735368956743e-05,
"loss": 0.1953,
"step": 1310
},
{
"epoch": 3.3193916349809887,
"grad_norm": 0.5152068138122559,
"learning_rate": 6.717557251908397e-05,
"loss": 0.1871,
"step": 1311
},
{
"epoch": 3.3219264892268696,
"grad_norm": 0.48964372277259827,
"learning_rate": 6.707379134860051e-05,
"loss": 0.2142,
"step": 1312
},
{
"epoch": 3.32446134347275,
"grad_norm": 0.5390191674232483,
"learning_rate": 6.697201017811705e-05,
"loss": 0.1764,
"step": 1313
},
{
"epoch": 3.326996197718631,
"grad_norm": 0.3849482238292694,
"learning_rate": 6.687022900763359e-05,
"loss": 0.1681,
"step": 1314
},
{
"epoch": 3.329531051964512,
"grad_norm": 0.36165010929107666,
"learning_rate": 6.676844783715013e-05,
"loss": 0.148,
"step": 1315
},
{
"epoch": 3.332065906210393,
"grad_norm": 0.47739362716674805,
"learning_rate": 6.666666666666667e-05,
"loss": 0.1748,
"step": 1316
},
{
"epoch": 3.334600760456274,
"grad_norm": 0.41228094696998596,
"learning_rate": 6.656488549618321e-05,
"loss": 0.2006,
"step": 1317
},
{
"epoch": 3.3371356147021545,
"grad_norm": 0.43494951725006104,
"learning_rate": 6.646310432569975e-05,
"loss": 0.1821,
"step": 1318
},
{
"epoch": 3.3396704689480354,
"grad_norm": 0.5502039194107056,
"learning_rate": 6.636132315521629e-05,
"loss": 0.208,
"step": 1319
},
{
"epoch": 3.3422053231939164,
"grad_norm": 0.5151738524436951,
"learning_rate": 6.625954198473283e-05,
"loss": 0.2304,
"step": 1320
},
{
"epoch": 3.3447401774397973,
"grad_norm": 0.3866114914417267,
"learning_rate": 6.615776081424937e-05,
"loss": 0.1738,
"step": 1321
},
{
"epoch": 3.347275031685678,
"grad_norm": 0.5542702674865723,
"learning_rate": 6.60559796437659e-05,
"loss": 0.1885,
"step": 1322
},
{
"epoch": 3.349809885931559,
"grad_norm": 0.5107680559158325,
"learning_rate": 6.595419847328245e-05,
"loss": 0.1856,
"step": 1323
},
{
"epoch": 3.3523447401774398,
"grad_norm": 0.8266568183898926,
"learning_rate": 6.585241730279898e-05,
"loss": 0.2826,
"step": 1324
},
{
"epoch": 3.3548795944233207,
"grad_norm": 0.45209088921546936,
"learning_rate": 6.575063613231552e-05,
"loss": 0.1519,
"step": 1325
},
{
"epoch": 3.3574144486692017,
"grad_norm": 0.4708397388458252,
"learning_rate": 6.564885496183206e-05,
"loss": 0.1834,
"step": 1326
},
{
"epoch": 3.359949302915082,
"grad_norm": 0.39958736300468445,
"learning_rate": 6.55470737913486e-05,
"loss": 0.1444,
"step": 1327
},
{
"epoch": 3.362484157160963,
"grad_norm": 0.5764468312263489,
"learning_rate": 6.544529262086514e-05,
"loss": 0.2024,
"step": 1328
},
{
"epoch": 3.365019011406844,
"grad_norm": 0.4573269188404083,
"learning_rate": 6.534351145038168e-05,
"loss": 0.1857,
"step": 1329
},
{
"epoch": 3.367553865652725,
"grad_norm": 0.598423957824707,
"learning_rate": 6.524173027989822e-05,
"loss": 0.2206,
"step": 1330
},
{
"epoch": 3.3700887198986056,
"grad_norm": 0.5643012523651123,
"learning_rate": 6.513994910941476e-05,
"loss": 0.157,
"step": 1331
},
{
"epoch": 3.3726235741444865,
"grad_norm": 0.6568096876144409,
"learning_rate": 6.50381679389313e-05,
"loss": 0.2588,
"step": 1332
},
{
"epoch": 3.3751584283903675,
"grad_norm": 0.6552339792251587,
"learning_rate": 6.493638676844784e-05,
"loss": 0.2032,
"step": 1333
},
{
"epoch": 3.3776932826362485,
"grad_norm": 0.5274556279182434,
"learning_rate": 6.483460559796438e-05,
"loss": 0.1877,
"step": 1334
},
{
"epoch": 3.3802281368821294,
"grad_norm": 0.43894869089126587,
"learning_rate": 6.473282442748092e-05,
"loss": 0.155,
"step": 1335
},
{
"epoch": 3.3827629911280104,
"grad_norm": 0.6116171479225159,
"learning_rate": 6.463104325699746e-05,
"loss": 0.2978,
"step": 1336
},
{
"epoch": 3.385297845373891,
"grad_norm": 0.4588301479816437,
"learning_rate": 6.4529262086514e-05,
"loss": 0.1765,
"step": 1337
},
{
"epoch": 3.387832699619772,
"grad_norm": 0.4299813508987427,
"learning_rate": 6.442748091603053e-05,
"loss": 0.1725,
"step": 1338
},
{
"epoch": 3.390367553865653,
"grad_norm": 0.4996776580810547,
"learning_rate": 6.432569974554707e-05,
"loss": 0.1815,
"step": 1339
},
{
"epoch": 3.3929024081115338,
"grad_norm": 0.42195963859558105,
"learning_rate": 6.422391857506361e-05,
"loss": 0.1544,
"step": 1340
},
{
"epoch": 3.3954372623574143,
"grad_norm": 0.3918668031692505,
"learning_rate": 6.412213740458015e-05,
"loss": 0.1677,
"step": 1341
},
{
"epoch": 3.3979721166032952,
"grad_norm": 0.5436106324195862,
"learning_rate": 6.402035623409669e-05,
"loss": 0.2624,
"step": 1342
},
{
"epoch": 3.400506970849176,
"grad_norm": 0.5056617856025696,
"learning_rate": 6.391857506361324e-05,
"loss": 0.1735,
"step": 1343
},
{
"epoch": 3.403041825095057,
"grad_norm": 0.497035950422287,
"learning_rate": 6.381679389312978e-05,
"loss": 0.192,
"step": 1344
},
{
"epoch": 3.405576679340938,
"grad_norm": 0.4464019238948822,
"learning_rate": 6.371501272264632e-05,
"loss": 0.165,
"step": 1345
},
{
"epoch": 3.4081115335868186,
"grad_norm": 0.3940610885620117,
"learning_rate": 6.361323155216285e-05,
"loss": 0.1698,
"step": 1346
},
{
"epoch": 3.4106463878326996,
"grad_norm": 0.34197869896888733,
"learning_rate": 6.351145038167939e-05,
"loss": 0.1676,
"step": 1347
},
{
"epoch": 3.4131812420785805,
"grad_norm": 0.5477511286735535,
"learning_rate": 6.340966921119593e-05,
"loss": 0.2913,
"step": 1348
},
{
"epoch": 3.4157160963244615,
"grad_norm": 0.47384947538375854,
"learning_rate": 6.330788804071247e-05,
"loss": 0.1807,
"step": 1349
},
{
"epoch": 3.418250950570342,
"grad_norm": 0.4805784821510315,
"learning_rate": 6.3206106870229e-05,
"loss": 0.1844,
"step": 1350
},
{
"epoch": 3.420785804816223,
"grad_norm": 0.4914521276950836,
"learning_rate": 6.310432569974555e-05,
"loss": 0.21,
"step": 1351
},
{
"epoch": 3.423320659062104,
"grad_norm": 0.42754796147346497,
"learning_rate": 6.300254452926209e-05,
"loss": 0.2003,
"step": 1352
},
{
"epoch": 3.425855513307985,
"grad_norm": 0.5367889404296875,
"learning_rate": 6.290076335877862e-05,
"loss": 0.2126,
"step": 1353
},
{
"epoch": 3.428390367553866,
"grad_norm": 0.5015621781349182,
"learning_rate": 6.279898218829516e-05,
"loss": 0.176,
"step": 1354
},
{
"epoch": 3.4309252217997463,
"grad_norm": 0.4498123228549957,
"learning_rate": 6.269720101781172e-05,
"loss": 0.1963,
"step": 1355
},
{
"epoch": 3.4334600760456273,
"grad_norm": 0.4548507034778595,
"learning_rate": 6.259541984732826e-05,
"loss": 0.185,
"step": 1356
},
{
"epoch": 3.4359949302915083,
"grad_norm": 0.5188789963722229,
"learning_rate": 6.24936386768448e-05,
"loss": 0.2152,
"step": 1357
},
{
"epoch": 3.4385297845373892,
"grad_norm": 0.5717540979385376,
"learning_rate": 6.239185750636133e-05,
"loss": 0.2541,
"step": 1358
},
{
"epoch": 3.4410646387832697,
"grad_norm": 0.43195176124572754,
"learning_rate": 6.229007633587787e-05,
"loss": 0.1841,
"step": 1359
},
{
"epoch": 3.4435994930291507,
"grad_norm": 0.8148223161697388,
"learning_rate": 6.21882951653944e-05,
"loss": 0.1903,
"step": 1360
},
{
"epoch": 3.4461343472750317,
"grad_norm": 0.39928868412971497,
"learning_rate": 6.208651399491094e-05,
"loss": 0.1551,
"step": 1361
},
{
"epoch": 3.4486692015209126,
"grad_norm": 0.8072621822357178,
"learning_rate": 6.198473282442748e-05,
"loss": 0.1973,
"step": 1362
},
{
"epoch": 3.4512040557667936,
"grad_norm": 0.6420927047729492,
"learning_rate": 6.188295165394402e-05,
"loss": 0.2304,
"step": 1363
},
{
"epoch": 3.453738910012674,
"grad_norm": 0.4896611273288727,
"learning_rate": 6.178117048346056e-05,
"loss": 0.1968,
"step": 1364
},
{
"epoch": 3.456273764258555,
"grad_norm": 0.5518379211425781,
"learning_rate": 6.16793893129771e-05,
"loss": 0.2136,
"step": 1365
},
{
"epoch": 3.458808618504436,
"grad_norm": 0.35489922761917114,
"learning_rate": 6.157760814249364e-05,
"loss": 0.1735,
"step": 1366
},
{
"epoch": 3.461343472750317,
"grad_norm": 0.3575512766838074,
"learning_rate": 6.147582697201019e-05,
"loss": 0.1704,
"step": 1367
},
{
"epoch": 3.463878326996198,
"grad_norm": 0.46745261549949646,
"learning_rate": 6.137404580152673e-05,
"loss": 0.1702,
"step": 1368
},
{
"epoch": 3.4664131812420784,
"grad_norm": 0.39378833770751953,
"learning_rate": 6.127226463104327e-05,
"loss": 0.1512,
"step": 1369
},
{
"epoch": 3.4689480354879594,
"grad_norm": 0.5645838975906372,
"learning_rate": 6.11704834605598e-05,
"loss": 0.2053,
"step": 1370
},
{
"epoch": 3.4714828897338403,
"grad_norm": 0.3613208830356598,
"learning_rate": 6.106870229007635e-05,
"loss": 0.1749,
"step": 1371
},
{
"epoch": 3.4740177439797213,
"grad_norm": 0.573124349117279,
"learning_rate": 6.096692111959288e-05,
"loss": 0.2229,
"step": 1372
},
{
"epoch": 3.4765525982256023,
"grad_norm": 0.43110212683677673,
"learning_rate": 6.086513994910942e-05,
"loss": 0.2082,
"step": 1373
},
{
"epoch": 3.4790874524714828,
"grad_norm": 0.6268284320831299,
"learning_rate": 6.076335877862596e-05,
"loss": 0.2826,
"step": 1374
},
{
"epoch": 3.4816223067173637,
"grad_norm": 0.5699491500854492,
"learning_rate": 6.0661577608142496e-05,
"loss": 0.2373,
"step": 1375
},
{
"epoch": 3.4841571609632447,
"grad_norm": 0.451548308134079,
"learning_rate": 6.0559796437659035e-05,
"loss": 0.1782,
"step": 1376
},
{
"epoch": 3.4866920152091256,
"grad_norm": 0.44955211877822876,
"learning_rate": 6.0458015267175575e-05,
"loss": 0.1896,
"step": 1377
},
{
"epoch": 3.489226869455006,
"grad_norm": 0.44076019525527954,
"learning_rate": 6.035623409669211e-05,
"loss": 0.1854,
"step": 1378
},
{
"epoch": 3.491761723700887,
"grad_norm": 0.8012815117835999,
"learning_rate": 6.0254452926208646e-05,
"loss": 0.2067,
"step": 1379
},
{
"epoch": 3.494296577946768,
"grad_norm": 0.5558981895446777,
"learning_rate": 6.01526717557252e-05,
"loss": 0.1913,
"step": 1380
},
{
"epoch": 3.496831432192649,
"grad_norm": 0.42501258850097656,
"learning_rate": 6.005089058524174e-05,
"loss": 0.1781,
"step": 1381
},
{
"epoch": 3.49936628643853,
"grad_norm": 0.3618164658546448,
"learning_rate": 5.994910941475828e-05,
"loss": 0.1472,
"step": 1382
},
{
"epoch": 3.5019011406844105,
"grad_norm": 0.5384409427642822,
"learning_rate": 5.984732824427482e-05,
"loss": 0.2063,
"step": 1383
},
{
"epoch": 3.5044359949302915,
"grad_norm": 0.5103084444999695,
"learning_rate": 5.974554707379135e-05,
"loss": 0.1737,
"step": 1384
},
{
"epoch": 3.5069708491761724,
"grad_norm": 0.37908968329429626,
"learning_rate": 5.964376590330789e-05,
"loss": 0.1599,
"step": 1385
},
{
"epoch": 3.5095057034220534,
"grad_norm": 0.5049726963043213,
"learning_rate": 5.954198473282443e-05,
"loss": 0.1891,
"step": 1386
},
{
"epoch": 3.512040557667934,
"grad_norm": 0.4436114430427551,
"learning_rate": 5.944020356234097e-05,
"loss": 0.1667,
"step": 1387
},
{
"epoch": 3.514575411913815,
"grad_norm": 0.6733534336090088,
"learning_rate": 5.933842239185751e-05,
"loss": 0.2714,
"step": 1388
},
{
"epoch": 3.517110266159696,
"grad_norm": 0.7258228659629822,
"learning_rate": 5.9236641221374046e-05,
"loss": 0.258,
"step": 1389
},
{
"epoch": 3.5196451204055768,
"grad_norm": 0.6425923705101013,
"learning_rate": 5.9134860050890586e-05,
"loss": 0.1791,
"step": 1390
},
{
"epoch": 3.5221799746514577,
"grad_norm": 0.45786988735198975,
"learning_rate": 5.9033078880407125e-05,
"loss": 0.1989,
"step": 1391
},
{
"epoch": 3.5247148288973387,
"grad_norm": 0.43258994817733765,
"learning_rate": 5.893129770992367e-05,
"loss": 0.166,
"step": 1392
},
{
"epoch": 3.527249683143219,
"grad_norm": 0.36486050486564636,
"learning_rate": 5.882951653944021e-05,
"loss": 0.1634,
"step": 1393
},
{
"epoch": 3.5297845373891,
"grad_norm": 0.5883339047431946,
"learning_rate": 5.872773536895675e-05,
"loss": 0.2236,
"step": 1394
},
{
"epoch": 3.532319391634981,
"grad_norm": 0.6296584010124207,
"learning_rate": 5.862595419847329e-05,
"loss": 0.1866,
"step": 1395
},
{
"epoch": 3.5348542458808616,
"grad_norm": 0.4262075126171112,
"learning_rate": 5.852417302798983e-05,
"loss": 0.1707,
"step": 1396
},
{
"epoch": 3.5373891001267426,
"grad_norm": 0.459573894739151,
"learning_rate": 5.842239185750637e-05,
"loss": 0.1654,
"step": 1397
},
{
"epoch": 3.5399239543726235,
"grad_norm": 0.47115570306777954,
"learning_rate": 5.83206106870229e-05,
"loss": 0.1936,
"step": 1398
},
{
"epoch": 3.5424588086185045,
"grad_norm": 0.41362589597702026,
"learning_rate": 5.821882951653944e-05,
"loss": 0.1897,
"step": 1399
},
{
"epoch": 3.5449936628643854,
"grad_norm": 0.4314422607421875,
"learning_rate": 5.811704834605598e-05,
"loss": 0.172,
"step": 1400
},
{
"epoch": 3.5475285171102664,
"grad_norm": 0.48116129636764526,
"learning_rate": 5.801526717557252e-05,
"loss": 0.1721,
"step": 1401
},
{
"epoch": 3.550063371356147,
"grad_norm": 0.3902725279331207,
"learning_rate": 5.791348600508906e-05,
"loss": 0.1886,
"step": 1402
},
{
"epoch": 3.552598225602028,
"grad_norm": 0.37996864318847656,
"learning_rate": 5.78117048346056e-05,
"loss": 0.1705,
"step": 1403
},
{
"epoch": 3.555133079847909,
"grad_norm": 0.589279294013977,
"learning_rate": 5.770992366412214e-05,
"loss": 0.1848,
"step": 1404
},
{
"epoch": 3.5576679340937893,
"grad_norm": 0.4233790636062622,
"learning_rate": 5.760814249363868e-05,
"loss": 0.18,
"step": 1405
},
{
"epoch": 3.5602027883396703,
"grad_norm": 0.3760955333709717,
"learning_rate": 5.750636132315522e-05,
"loss": 0.1743,
"step": 1406
},
{
"epoch": 3.5627376425855513,
"grad_norm": 0.552793562412262,
"learning_rate": 5.740458015267176e-05,
"loss": 0.2315,
"step": 1407
},
{
"epoch": 3.565272496831432,
"grad_norm": 0.5440211892127991,
"learning_rate": 5.73027989821883e-05,
"loss": 0.186,
"step": 1408
},
{
"epoch": 3.567807351077313,
"grad_norm": 0.5183967351913452,
"learning_rate": 5.720101781170484e-05,
"loss": 0.1626,
"step": 1409
},
{
"epoch": 3.570342205323194,
"grad_norm": 0.47962069511413574,
"learning_rate": 5.709923664122138e-05,
"loss": 0.1813,
"step": 1410
},
{
"epoch": 3.5728770595690746,
"grad_norm": 0.8065668940544128,
"learning_rate": 5.699745547073792e-05,
"loss": 0.2537,
"step": 1411
},
{
"epoch": 3.5754119138149556,
"grad_norm": 0.46018585562705994,
"learning_rate": 5.689567430025445e-05,
"loss": 0.1756,
"step": 1412
},
{
"epoch": 3.5779467680608366,
"grad_norm": 0.5229590535163879,
"learning_rate": 5.679389312977099e-05,
"loss": 0.1873,
"step": 1413
},
{
"epoch": 3.5804816223067175,
"grad_norm": 0.510209321975708,
"learning_rate": 5.669211195928753e-05,
"loss": 0.167,
"step": 1414
},
{
"epoch": 3.583016476552598,
"grad_norm": 0.4264031648635864,
"learning_rate": 5.659033078880407e-05,
"loss": 0.1705,
"step": 1415
},
{
"epoch": 3.585551330798479,
"grad_norm": 0.6208323240280151,
"learning_rate": 5.648854961832062e-05,
"loss": 0.2268,
"step": 1416
},
{
"epoch": 3.58808618504436,
"grad_norm": 0.3730670213699341,
"learning_rate": 5.6386768447837154e-05,
"loss": 0.1676,
"step": 1417
},
{
"epoch": 3.590621039290241,
"grad_norm": 0.52936190366745,
"learning_rate": 5.628498727735369e-05,
"loss": 0.2055,
"step": 1418
},
{
"epoch": 3.593155893536122,
"grad_norm": 0.44800981879234314,
"learning_rate": 5.618320610687023e-05,
"loss": 0.1782,
"step": 1419
},
{
"epoch": 3.5956907477820024,
"grad_norm": 0.37429654598236084,
"learning_rate": 5.608142493638677e-05,
"loss": 0.1566,
"step": 1420
},
{
"epoch": 3.5982256020278833,
"grad_norm": 0.5618942975997925,
"learning_rate": 5.597964376590331e-05,
"loss": 0.2249,
"step": 1421
},
{
"epoch": 3.6007604562737643,
"grad_norm": 0.6893648505210876,
"learning_rate": 5.587786259541985e-05,
"loss": 0.2104,
"step": 1422
},
{
"epoch": 3.6032953105196452,
"grad_norm": 0.4185943603515625,
"learning_rate": 5.577608142493639e-05,
"loss": 0.1729,
"step": 1423
},
{
"epoch": 3.6058301647655258,
"grad_norm": 0.46326011419296265,
"learning_rate": 5.567430025445293e-05,
"loss": 0.1888,
"step": 1424
},
{
"epoch": 3.6083650190114067,
"grad_norm": 0.4564262628555298,
"learning_rate": 5.557251908396947e-05,
"loss": 0.1957,
"step": 1425
},
{
"epoch": 3.6108998732572877,
"grad_norm": 0.654411256313324,
"learning_rate": 5.5470737913486e-05,
"loss": 0.2101,
"step": 1426
},
{
"epoch": 3.6134347275031686,
"grad_norm": 0.4059501886367798,
"learning_rate": 5.536895674300254e-05,
"loss": 0.1638,
"step": 1427
},
{
"epoch": 3.6159695817490496,
"grad_norm": 0.4155724346637726,
"learning_rate": 5.526717557251909e-05,
"loss": 0.1799,
"step": 1428
},
{
"epoch": 3.6185044359949305,
"grad_norm": 0.4041290581226349,
"learning_rate": 5.516539440203563e-05,
"loss": 0.1755,
"step": 1429
},
{
"epoch": 3.621039290240811,
"grad_norm": 0.3458746373653412,
"learning_rate": 5.506361323155217e-05,
"loss": 0.1474,
"step": 1430
},
{
"epoch": 3.623574144486692,
"grad_norm": 0.5046303272247314,
"learning_rate": 5.496183206106871e-05,
"loss": 0.2554,
"step": 1431
},
{
"epoch": 3.626108998732573,
"grad_norm": 0.4284549951553345,
"learning_rate": 5.4860050890585244e-05,
"loss": 0.1855,
"step": 1432
},
{
"epoch": 3.6286438529784535,
"grad_norm": 0.5116839408874512,
"learning_rate": 5.475826972010178e-05,
"loss": 0.1777,
"step": 1433
},
{
"epoch": 3.6311787072243344,
"grad_norm": 0.4303711950778961,
"learning_rate": 5.465648854961832e-05,
"loss": 0.1792,
"step": 1434
},
{
"epoch": 3.6337135614702154,
"grad_norm": 0.4602053463459015,
"learning_rate": 5.455470737913486e-05,
"loss": 0.1716,
"step": 1435
},
{
"epoch": 3.6362484157160964,
"grad_norm": 0.47606271505355835,
"learning_rate": 5.44529262086514e-05,
"loss": 0.2063,
"step": 1436
},
{
"epoch": 3.6387832699619773,
"grad_norm": 0.5861607193946838,
"learning_rate": 5.435114503816794e-05,
"loss": 0.2133,
"step": 1437
},
{
"epoch": 3.6413181242078583,
"grad_norm": 0.42663708329200745,
"learning_rate": 5.424936386768448e-05,
"loss": 0.1662,
"step": 1438
},
{
"epoch": 3.643852978453739,
"grad_norm": 0.6255937218666077,
"learning_rate": 5.414758269720102e-05,
"loss": 0.1875,
"step": 1439
},
{
"epoch": 3.6463878326996197,
"grad_norm": 0.5422307252883911,
"learning_rate": 5.404580152671755e-05,
"loss": 0.1624,
"step": 1440
},
{
"epoch": 3.6489226869455007,
"grad_norm": 0.540477991104126,
"learning_rate": 5.3944020356234104e-05,
"loss": 0.2489,
"step": 1441
},
{
"epoch": 3.6514575411913817,
"grad_norm": 0.5656100511550903,
"learning_rate": 5.3842239185750643e-05,
"loss": 0.2289,
"step": 1442
},
{
"epoch": 3.653992395437262,
"grad_norm": 0.5202456712722778,
"learning_rate": 5.374045801526718e-05,
"loss": 0.23,
"step": 1443
},
{
"epoch": 3.656527249683143,
"grad_norm": 0.5069813132286072,
"learning_rate": 5.363867684478372e-05,
"loss": 0.1845,
"step": 1444
},
{
"epoch": 3.659062103929024,
"grad_norm": 0.5711066126823425,
"learning_rate": 5.353689567430026e-05,
"loss": 0.2076,
"step": 1445
},
{
"epoch": 3.661596958174905,
"grad_norm": 0.5115897059440613,
"learning_rate": 5.3435114503816794e-05,
"loss": 0.1696,
"step": 1446
},
{
"epoch": 3.664131812420786,
"grad_norm": 0.6119818687438965,
"learning_rate": 5.333333333333333e-05,
"loss": 0.1905,
"step": 1447
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.7333729863166809,
"learning_rate": 5.323155216284987e-05,
"loss": 0.2208,
"step": 1448
},
{
"epoch": 3.6692015209125475,
"grad_norm": 0.5657917857170105,
"learning_rate": 5.312977099236641e-05,
"loss": 0.218,
"step": 1449
},
{
"epoch": 3.6717363751584284,
"grad_norm": 0.5568459033966064,
"learning_rate": 5.302798982188295e-05,
"loss": 0.1957,
"step": 1450
},
{
"epoch": 3.6742712294043094,
"grad_norm": 0.40060222148895264,
"learning_rate": 5.292620865139949e-05,
"loss": 0.1634,
"step": 1451
},
{
"epoch": 3.67680608365019,
"grad_norm": 0.5395296216011047,
"learning_rate": 5.282442748091603e-05,
"loss": 0.2284,
"step": 1452
},
{
"epoch": 3.679340937896071,
"grad_norm": 0.395298570394516,
"learning_rate": 5.2722646310432576e-05,
"loss": 0.1717,
"step": 1453
},
{
"epoch": 3.681875792141952,
"grad_norm": 0.4693946838378906,
"learning_rate": 5.2620865139949115e-05,
"loss": 0.1719,
"step": 1454
},
{
"epoch": 3.6844106463878328,
"grad_norm": 0.5206104516983032,
"learning_rate": 5.2519083969465654e-05,
"loss": 0.2158,
"step": 1455
},
{
"epoch": 3.6869455006337137,
"grad_norm": 0.5576691031455994,
"learning_rate": 5.2417302798982194e-05,
"loss": 0.2031,
"step": 1456
},
{
"epoch": 3.6894803548795947,
"grad_norm": 0.5826637148857117,
"learning_rate": 5.231552162849873e-05,
"loss": 0.2785,
"step": 1457
},
{
"epoch": 3.692015209125475,
"grad_norm": 0.5928865075111389,
"learning_rate": 5.221374045801527e-05,
"loss": 0.1765,
"step": 1458
},
{
"epoch": 3.694550063371356,
"grad_norm": 0.5932832956314087,
"learning_rate": 5.211195928753181e-05,
"loss": 0.1767,
"step": 1459
},
{
"epoch": 3.697084917617237,
"grad_norm": 0.4178262948989868,
"learning_rate": 5.2010178117048344e-05,
"loss": 0.1636,
"step": 1460
},
{
"epoch": 3.6996197718631176,
"grad_norm": 0.6029627919197083,
"learning_rate": 5.1908396946564884e-05,
"loss": 0.2086,
"step": 1461
},
{
"epoch": 3.7021546261089986,
"grad_norm": 0.48641863465309143,
"learning_rate": 5.180661577608142e-05,
"loss": 0.1613,
"step": 1462
},
{
"epoch": 3.7046894803548795,
"grad_norm": 0.40176740288734436,
"learning_rate": 5.170483460559796e-05,
"loss": 0.1647,
"step": 1463
},
{
"epoch": 3.7072243346007605,
"grad_norm": 0.42600035667419434,
"learning_rate": 5.16030534351145e-05,
"loss": 0.1818,
"step": 1464
},
{
"epoch": 3.7097591888466415,
"grad_norm": 0.48061972856521606,
"learning_rate": 5.150127226463105e-05,
"loss": 0.187,
"step": 1465
},
{
"epoch": 3.7122940430925224,
"grad_norm": 0.4085710346698761,
"learning_rate": 5.139949109414759e-05,
"loss": 0.1562,
"step": 1466
},
{
"epoch": 3.714828897338403,
"grad_norm": 0.4378439486026764,
"learning_rate": 5.1297709923664126e-05,
"loss": 0.1723,
"step": 1467
},
{
"epoch": 3.717363751584284,
"grad_norm": 0.5806863307952881,
"learning_rate": 5.1195928753180665e-05,
"loss": 0.2069,
"step": 1468
},
{
"epoch": 3.719898605830165,
"grad_norm": 0.4711120128631592,
"learning_rate": 5.1094147582697205e-05,
"loss": 0.1851,
"step": 1469
},
{
"epoch": 3.7224334600760454,
"grad_norm": 0.47227099537849426,
"learning_rate": 5.0992366412213744e-05,
"loss": 0.1885,
"step": 1470
},
{
"epoch": 3.7249683143219263,
"grad_norm": 0.4405531585216522,
"learning_rate": 5.0890585241730283e-05,
"loss": 0.1662,
"step": 1471
},
{
"epoch": 3.7275031685678073,
"grad_norm": 0.5168079733848572,
"learning_rate": 5.078880407124682e-05,
"loss": 0.2002,
"step": 1472
},
{
"epoch": 3.7300380228136882,
"grad_norm": 0.3839830160140991,
"learning_rate": 5.068702290076336e-05,
"loss": 0.168,
"step": 1473
},
{
"epoch": 3.732572877059569,
"grad_norm": 0.338012158870697,
"learning_rate": 5.0585241730279895e-05,
"loss": 0.1596,
"step": 1474
},
{
"epoch": 3.73510773130545,
"grad_norm": 0.5466023087501526,
"learning_rate": 5.0483460559796434e-05,
"loss": 0.2379,
"step": 1475
},
{
"epoch": 3.7376425855513307,
"grad_norm": 0.44543328881263733,
"learning_rate": 5.038167938931297e-05,
"loss": 0.1778,
"step": 1476
},
{
"epoch": 3.7401774397972116,
"grad_norm": 0.4166903793811798,
"learning_rate": 5.0279898218829526e-05,
"loss": 0.1554,
"step": 1477
},
{
"epoch": 3.7427122940430926,
"grad_norm": 0.3806212544441223,
"learning_rate": 5.0178117048346065e-05,
"loss": 0.1648,
"step": 1478
},
{
"epoch": 3.7452471482889735,
"grad_norm": 0.5990723967552185,
"learning_rate": 5.00763358778626e-05,
"loss": 0.2348,
"step": 1479
},
{
"epoch": 3.747782002534854,
"grad_norm": 0.715096116065979,
"learning_rate": 4.997455470737914e-05,
"loss": 0.2201,
"step": 1480
},
{
"epoch": 3.750316856780735,
"grad_norm": 0.6297019124031067,
"learning_rate": 4.9872773536895677e-05,
"loss": 0.2398,
"step": 1481
},
{
"epoch": 3.752851711026616,
"grad_norm": 0.6131380200386047,
"learning_rate": 4.9770992366412216e-05,
"loss": 0.2128,
"step": 1482
},
{
"epoch": 3.755386565272497,
"grad_norm": 0.5018277764320374,
"learning_rate": 4.9669211195928755e-05,
"loss": 0.1913,
"step": 1483
},
{
"epoch": 3.757921419518378,
"grad_norm": 0.516939103603363,
"learning_rate": 4.9567430025445294e-05,
"loss": 0.1958,
"step": 1484
},
{
"epoch": 3.7604562737642584,
"grad_norm": 0.4485652446746826,
"learning_rate": 4.9465648854961834e-05,
"loss": 0.1678,
"step": 1485
},
{
"epoch": 3.7629911280101394,
"grad_norm": 0.6227991580963135,
"learning_rate": 4.936386768447838e-05,
"loss": 0.2403,
"step": 1486
},
{
"epoch": 3.7655259822560203,
"grad_norm": 0.42331916093826294,
"learning_rate": 4.926208651399491e-05,
"loss": 0.1673,
"step": 1487
},
{
"epoch": 3.7680608365019013,
"grad_norm": 0.5072351098060608,
"learning_rate": 4.916030534351145e-05,
"loss": 0.204,
"step": 1488
},
{
"epoch": 3.770595690747782,
"grad_norm": 0.445578008890152,
"learning_rate": 4.905852417302799e-05,
"loss": 0.1908,
"step": 1489
},
{
"epoch": 3.7731305449936627,
"grad_norm": 0.49046698212623596,
"learning_rate": 4.895674300254453e-05,
"loss": 0.1615,
"step": 1490
},
{
"epoch": 3.7756653992395437,
"grad_norm": 0.37768882513046265,
"learning_rate": 4.885496183206107e-05,
"loss": 0.1604,
"step": 1491
},
{
"epoch": 3.7782002534854247,
"grad_norm": 0.38343289494514465,
"learning_rate": 4.8753180661577616e-05,
"loss": 0.1709,
"step": 1492
},
{
"epoch": 3.7807351077313056,
"grad_norm": 0.4102202355861664,
"learning_rate": 4.8651399491094155e-05,
"loss": 0.1629,
"step": 1493
},
{
"epoch": 3.7832699619771866,
"grad_norm": 0.4545007050037384,
"learning_rate": 4.854961832061069e-05,
"loss": 0.1709,
"step": 1494
},
{
"epoch": 3.785804816223067,
"grad_norm": 0.48300206661224365,
"learning_rate": 4.844783715012723e-05,
"loss": 0.2211,
"step": 1495
},
{
"epoch": 3.788339670468948,
"grad_norm": 0.5301868319511414,
"learning_rate": 4.8346055979643766e-05,
"loss": 0.2053,
"step": 1496
},
{
"epoch": 3.790874524714829,
"grad_norm": 0.48716598749160767,
"learning_rate": 4.8244274809160306e-05,
"loss": 0.2392,
"step": 1497
},
{
"epoch": 3.7934093789607095,
"grad_norm": 0.6201879978179932,
"learning_rate": 4.8142493638676845e-05,
"loss": 0.2267,
"step": 1498
},
{
"epoch": 3.7959442332065905,
"grad_norm": 0.46254560351371765,
"learning_rate": 4.804071246819339e-05,
"loss": 0.1824,
"step": 1499
},
{
"epoch": 3.7984790874524714,
"grad_norm": 0.6153382658958435,
"learning_rate": 4.793893129770993e-05,
"loss": 0.2095,
"step": 1500
},
{
"epoch": 3.8010139416983524,
"grad_norm": 0.6054911613464355,
"learning_rate": 4.783715012722646e-05,
"loss": 0.2291,
"step": 1501
},
{
"epoch": 3.8035487959442333,
"grad_norm": 0.3899902403354645,
"learning_rate": 4.7735368956743e-05,
"loss": 0.1507,
"step": 1502
},
{
"epoch": 3.8060836501901143,
"grad_norm": 0.4634632170200348,
"learning_rate": 4.763358778625954e-05,
"loss": 0.1436,
"step": 1503
},
{
"epoch": 3.808618504435995,
"grad_norm": 0.6829271912574768,
"learning_rate": 4.753180661577608e-05,
"loss": 0.2611,
"step": 1504
},
{
"epoch": 3.8111533586818758,
"grad_norm": 0.553393542766571,
"learning_rate": 4.743002544529263e-05,
"loss": 0.1862,
"step": 1505
},
{
"epoch": 3.8136882129277567,
"grad_norm": 0.4285520315170288,
"learning_rate": 4.7328244274809166e-05,
"loss": 0.1522,
"step": 1506
},
{
"epoch": 3.8162230671736372,
"grad_norm": 0.5505307912826538,
"learning_rate": 4.7226463104325705e-05,
"loss": 0.2056,
"step": 1507
},
{
"epoch": 3.818757921419518,
"grad_norm": 0.635071873664856,
"learning_rate": 4.712468193384224e-05,
"loss": 0.1899,
"step": 1508
},
{
"epoch": 3.821292775665399,
"grad_norm": 0.4297153353691101,
"learning_rate": 4.702290076335878e-05,
"loss": 0.1632,
"step": 1509
},
{
"epoch": 3.82382762991128,
"grad_norm": 0.5538508892059326,
"learning_rate": 4.6921119592875317e-05,
"loss": 0.1965,
"step": 1510
},
{
"epoch": 3.826362484157161,
"grad_norm": 0.6736975908279419,
"learning_rate": 4.681933842239186e-05,
"loss": 0.2334,
"step": 1511
},
{
"epoch": 3.828897338403042,
"grad_norm": 0.49381881952285767,
"learning_rate": 4.67175572519084e-05,
"loss": 0.2074,
"step": 1512
},
{
"epoch": 3.8314321926489225,
"grad_norm": 0.4285455346107483,
"learning_rate": 4.661577608142494e-05,
"loss": 0.176,
"step": 1513
},
{
"epoch": 3.8339670468948035,
"grad_norm": 0.5771308541297913,
"learning_rate": 4.651399491094148e-05,
"loss": 0.229,
"step": 1514
},
{
"epoch": 3.8365019011406845,
"grad_norm": 0.4749429225921631,
"learning_rate": 4.641221374045801e-05,
"loss": 0.1968,
"step": 1515
},
{
"epoch": 3.8390367553865654,
"grad_norm": 0.48094430565834045,
"learning_rate": 4.631043256997455e-05,
"loss": 0.1982,
"step": 1516
},
{
"epoch": 3.841571609632446,
"grad_norm": 0.49878042936325073,
"learning_rate": 4.62086513994911e-05,
"loss": 0.1552,
"step": 1517
},
{
"epoch": 3.844106463878327,
"grad_norm": 0.4872034192085266,
"learning_rate": 4.610687022900764e-05,
"loss": 0.1808,
"step": 1518
},
{
"epoch": 3.846641318124208,
"grad_norm": 0.4905577600002289,
"learning_rate": 4.600508905852418e-05,
"loss": 0.1703,
"step": 1519
},
{
"epoch": 3.849176172370089,
"grad_norm": 0.49980783462524414,
"learning_rate": 4.5903307888040716e-05,
"loss": 0.1727,
"step": 1520
},
{
"epoch": 3.8517110266159698,
"grad_norm": 0.5426180958747864,
"learning_rate": 4.5801526717557256e-05,
"loss": 0.2192,
"step": 1521
},
{
"epoch": 3.8542458808618507,
"grad_norm": 0.6399853825569153,
"learning_rate": 4.569974554707379e-05,
"loss": 0.2387,
"step": 1522
},
{
"epoch": 3.8567807351077312,
"grad_norm": 0.5311464667320251,
"learning_rate": 4.5597964376590334e-05,
"loss": 0.1976,
"step": 1523
},
{
"epoch": 3.859315589353612,
"grad_norm": 0.5433202981948853,
"learning_rate": 4.5496183206106874e-05,
"loss": 0.1916,
"step": 1524
},
{
"epoch": 3.861850443599493,
"grad_norm": 0.4024597704410553,
"learning_rate": 4.539440203562341e-05,
"loss": 0.1643,
"step": 1525
},
{
"epoch": 3.8643852978453737,
"grad_norm": 0.347566157579422,
"learning_rate": 4.529262086513995e-05,
"loss": 0.1676,
"step": 1526
},
{
"epoch": 3.8669201520912546,
"grad_norm": 0.45405861735343933,
"learning_rate": 4.519083969465649e-05,
"loss": 0.1963,
"step": 1527
},
{
"epoch": 3.8694550063371356,
"grad_norm": 0.6430472731590271,
"learning_rate": 4.508905852417303e-05,
"loss": 0.2322,
"step": 1528
},
{
"epoch": 3.8719898605830165,
"grad_norm": 0.4391939043998718,
"learning_rate": 4.498727735368957e-05,
"loss": 0.1871,
"step": 1529
},
{
"epoch": 3.8745247148288975,
"grad_norm": 0.47301623225212097,
"learning_rate": 4.488549618320611e-05,
"loss": 0.1549,
"step": 1530
},
{
"epoch": 3.8770595690747784,
"grad_norm": 0.4237573742866516,
"learning_rate": 4.478371501272265e-05,
"loss": 0.1548,
"step": 1531
},
{
"epoch": 3.879594423320659,
"grad_norm": 0.5859849452972412,
"learning_rate": 4.468193384223919e-05,
"loss": 0.2023,
"step": 1532
},
{
"epoch": 3.88212927756654,
"grad_norm": 0.45050573348999023,
"learning_rate": 4.458015267175573e-05,
"loss": 0.165,
"step": 1533
},
{
"epoch": 3.884664131812421,
"grad_norm": 0.5347339510917664,
"learning_rate": 4.447837150127227e-05,
"loss": 0.1854,
"step": 1534
},
{
"epoch": 3.8871989860583014,
"grad_norm": 0.375836580991745,
"learning_rate": 4.4376590330788806e-05,
"loss": 0.152,
"step": 1535
},
{
"epoch": 3.8897338403041823,
"grad_norm": 0.5403718948364258,
"learning_rate": 4.4274809160305345e-05,
"loss": 0.2065,
"step": 1536
},
{
"epoch": 3.8922686945500633,
"grad_norm": 0.5624736547470093,
"learning_rate": 4.4173027989821885e-05,
"loss": 0.1857,
"step": 1537
},
{
"epoch": 3.8948035487959443,
"grad_norm": 0.5971560478210449,
"learning_rate": 4.4071246819338424e-05,
"loss": 0.1928,
"step": 1538
},
{
"epoch": 3.897338403041825,
"grad_norm": 0.5225517153739929,
"learning_rate": 4.396946564885496e-05,
"loss": 0.2054,
"step": 1539
},
{
"epoch": 3.899873257287706,
"grad_norm": 0.47341519594192505,
"learning_rate": 4.38676844783715e-05,
"loss": 0.1786,
"step": 1540
},
{
"epoch": 3.9024081115335867,
"grad_norm": 0.3734676241874695,
"learning_rate": 4.376590330788805e-05,
"loss": 0.1447,
"step": 1541
},
{
"epoch": 3.9049429657794676,
"grad_norm": 0.5003755688667297,
"learning_rate": 4.366412213740458e-05,
"loss": 0.1734,
"step": 1542
},
{
"epoch": 3.9074778200253486,
"grad_norm": 0.41165000200271606,
"learning_rate": 4.356234096692112e-05,
"loss": 0.172,
"step": 1543
},
{
"epoch": 3.9100126742712296,
"grad_norm": 0.45096197724342346,
"learning_rate": 4.346055979643766e-05,
"loss": 0.1726,
"step": 1544
},
{
"epoch": 3.91254752851711,
"grad_norm": 0.5445842146873474,
"learning_rate": 4.33587786259542e-05,
"loss": 0.206,
"step": 1545
},
{
"epoch": 3.915082382762991,
"grad_norm": 0.5139321088790894,
"learning_rate": 4.325699745547074e-05,
"loss": 0.1803,
"step": 1546
},
{
"epoch": 3.917617237008872,
"grad_norm": 0.5652433633804321,
"learning_rate": 4.3155216284987285e-05,
"loss": 0.2051,
"step": 1547
},
{
"epoch": 3.920152091254753,
"grad_norm": 0.38091734051704407,
"learning_rate": 4.3053435114503824e-05,
"loss": 0.1541,
"step": 1548
},
{
"epoch": 3.922686945500634,
"grad_norm": 0.3614705801010132,
"learning_rate": 4.2951653944020356e-05,
"loss": 0.147,
"step": 1549
},
{
"epoch": 3.9252217997465144,
"grad_norm": 0.4551761746406555,
"learning_rate": 4.2849872773536896e-05,
"loss": 0.1685,
"step": 1550
},
{
"epoch": 3.9277566539923954,
"grad_norm": 0.5226624011993408,
"learning_rate": 4.2748091603053435e-05,
"loss": 0.1727,
"step": 1551
},
{
"epoch": 3.9302915082382763,
"grad_norm": 0.3541867136955261,
"learning_rate": 4.2646310432569974e-05,
"loss": 0.1488,
"step": 1552
},
{
"epoch": 3.9328263624841573,
"grad_norm": 0.4599204659461975,
"learning_rate": 4.254452926208652e-05,
"loss": 0.1536,
"step": 1553
},
{
"epoch": 3.935361216730038,
"grad_norm": 0.45082637667655945,
"learning_rate": 4.244274809160306e-05,
"loss": 0.1671,
"step": 1554
},
{
"epoch": 3.9378960709759188,
"grad_norm": 0.6053276658058167,
"learning_rate": 4.23409669211196e-05,
"loss": 0.2043,
"step": 1555
},
{
"epoch": 3.9404309252217997,
"grad_norm": 0.506443440914154,
"learning_rate": 4.223918575063613e-05,
"loss": 0.1893,
"step": 1556
},
{
"epoch": 3.9429657794676807,
"grad_norm": 0.6029784679412842,
"learning_rate": 4.213740458015267e-05,
"loss": 0.201,
"step": 1557
},
{
"epoch": 3.9455006337135616,
"grad_norm": 0.3993350863456726,
"learning_rate": 4.203562340966921e-05,
"loss": 0.1637,
"step": 1558
},
{
"epoch": 3.9480354879594426,
"grad_norm": 0.5887712836265564,
"learning_rate": 4.193384223918575e-05,
"loss": 0.2207,
"step": 1559
},
{
"epoch": 3.950570342205323,
"grad_norm": 0.5538966059684753,
"learning_rate": 4.1832061068702296e-05,
"loss": 0.1674,
"step": 1560
},
{
"epoch": 3.953105196451204,
"grad_norm": 0.4831174910068512,
"learning_rate": 4.1730279898218835e-05,
"loss": 0.1694,
"step": 1561
},
{
"epoch": 3.955640050697085,
"grad_norm": 0.39700761437416077,
"learning_rate": 4.1628498727735374e-05,
"loss": 0.1695,
"step": 1562
},
{
"epoch": 3.9581749049429655,
"grad_norm": 0.5388202667236328,
"learning_rate": 4.152671755725191e-05,
"loss": 0.1769,
"step": 1563
},
{
"epoch": 3.9607097591888465,
"grad_norm": 0.5717085599899292,
"learning_rate": 4.1424936386768446e-05,
"loss": 0.2602,
"step": 1564
},
{
"epoch": 3.9632446134347274,
"grad_norm": 0.4135623872280121,
"learning_rate": 4.1323155216284985e-05,
"loss": 0.1512,
"step": 1565
},
{
"epoch": 3.9657794676806084,
"grad_norm": 0.478411465883255,
"learning_rate": 4.122137404580153e-05,
"loss": 0.1967,
"step": 1566
},
{
"epoch": 3.9683143219264894,
"grad_norm": 0.4836915135383606,
"learning_rate": 4.111959287531807e-05,
"loss": 0.2297,
"step": 1567
},
{
"epoch": 3.9708491761723703,
"grad_norm": 0.6355355978012085,
"learning_rate": 4.101781170483461e-05,
"loss": 0.2291,
"step": 1568
},
{
"epoch": 3.973384030418251,
"grad_norm": 0.42811089754104614,
"learning_rate": 4.091603053435115e-05,
"loss": 0.1518,
"step": 1569
},
{
"epoch": 3.975918884664132,
"grad_norm": 0.5778828859329224,
"learning_rate": 4.081424936386768e-05,
"loss": 0.1638,
"step": 1570
},
{
"epoch": 3.9784537389100127,
"grad_norm": 0.4650358259677887,
"learning_rate": 4.071246819338422e-05,
"loss": 0.1658,
"step": 1571
},
{
"epoch": 3.9809885931558933,
"grad_norm": 0.5939072966575623,
"learning_rate": 4.061068702290077e-05,
"loss": 0.2276,
"step": 1572
},
{
"epoch": 3.983523447401774,
"grad_norm": 0.5296881794929504,
"learning_rate": 4.050890585241731e-05,
"loss": 0.1895,
"step": 1573
},
{
"epoch": 3.986058301647655,
"grad_norm": 0.4479645788669586,
"learning_rate": 4.0407124681933846e-05,
"loss": 0.168,
"step": 1574
},
{
"epoch": 3.988593155893536,
"grad_norm": 0.6041486859321594,
"learning_rate": 4.0305343511450385e-05,
"loss": 0.2225,
"step": 1575
},
{
"epoch": 3.991128010139417,
"grad_norm": 1.0764771699905396,
"learning_rate": 4.0203562340966925e-05,
"loss": 0.1736,
"step": 1576
},
{
"epoch": 3.993662864385298,
"grad_norm": 0.4830266535282135,
"learning_rate": 4.010178117048346e-05,
"loss": 0.2017,
"step": 1577
},
{
"epoch": 3.9961977186311786,
"grad_norm": 0.4032004773616791,
"learning_rate": 4e-05,
"loss": 0.1723,
"step": 1578
},
{
"epoch": 3.9987325728770595,
"grad_norm": 0.4441380798816681,
"learning_rate": 3.989821882951654e-05,
"loss": 0.1714,
"step": 1579
},
{
"epoch": 4.0,
"grad_norm": 0.673060953617096,
"learning_rate": 3.979643765903308e-05,
"loss": 0.1651,
"step": 1580
},
{
"epoch": 4.002534854245881,
"grad_norm": 0.5185714960098267,
"learning_rate": 3.969465648854962e-05,
"loss": 0.1877,
"step": 1581
},
{
"epoch": 4.005069708491762,
"grad_norm": 0.4302978217601776,
"learning_rate": 3.959287531806616e-05,
"loss": 0.1575,
"step": 1582
},
{
"epoch": 4.007604562737643,
"grad_norm": 0.45982813835144043,
"learning_rate": 3.94910941475827e-05,
"loss": 0.1615,
"step": 1583
},
{
"epoch": 4.010139416983524,
"grad_norm": 0.4118313789367676,
"learning_rate": 3.938931297709924e-05,
"loss": 0.1508,
"step": 1584
},
{
"epoch": 4.012674271229404,
"grad_norm": 0.6039855480194092,
"learning_rate": 3.928753180661578e-05,
"loss": 0.1782,
"step": 1585
},
{
"epoch": 4.015209125475285,
"grad_norm": 0.4311355948448181,
"learning_rate": 3.918575063613232e-05,
"loss": 0.1488,
"step": 1586
},
{
"epoch": 4.017743979721166,
"grad_norm": 0.7398537993431091,
"learning_rate": 3.908396946564886e-05,
"loss": 0.1879,
"step": 1587
},
{
"epoch": 4.020278833967047,
"grad_norm": 0.37064164876937866,
"learning_rate": 3.8982188295165396e-05,
"loss": 0.1257,
"step": 1588
},
{
"epoch": 4.022813688212928,
"grad_norm": 0.46931344270706177,
"learning_rate": 3.8880407124681936e-05,
"loss": 0.1579,
"step": 1589
},
{
"epoch": 4.025348542458809,
"grad_norm": 0.4544156789779663,
"learning_rate": 3.8778625954198475e-05,
"loss": 0.134,
"step": 1590
},
{
"epoch": 4.02788339670469,
"grad_norm": 0.5562132000923157,
"learning_rate": 3.8676844783715014e-05,
"loss": 0.1488,
"step": 1591
},
{
"epoch": 4.030418250950571,
"grad_norm": 0.5679481625556946,
"learning_rate": 3.8575063613231554e-05,
"loss": 0.1322,
"step": 1592
},
{
"epoch": 4.032953105196452,
"grad_norm": 0.6101714372634888,
"learning_rate": 3.847328244274809e-05,
"loss": 0.1534,
"step": 1593
},
{
"epoch": 4.035487959442332,
"grad_norm": 0.8060622215270996,
"learning_rate": 3.837150127226463e-05,
"loss": 0.1986,
"step": 1594
},
{
"epoch": 4.038022813688213,
"grad_norm": 0.5501425266265869,
"learning_rate": 3.826972010178117e-05,
"loss": 0.1444,
"step": 1595
},
{
"epoch": 4.0405576679340935,
"grad_norm": 0.5117461085319519,
"learning_rate": 3.816793893129771e-05,
"loss": 0.1259,
"step": 1596
},
{
"epoch": 4.0430925221799745,
"grad_norm": 0.571770429611206,
"learning_rate": 3.806615776081425e-05,
"loss": 0.1413,
"step": 1597
},
{
"epoch": 4.0456273764258555,
"grad_norm": 0.7756439447402954,
"learning_rate": 3.796437659033079e-05,
"loss": 0.1874,
"step": 1598
},
{
"epoch": 4.048162230671736,
"grad_norm": 0.6393389701843262,
"learning_rate": 3.786259541984733e-05,
"loss": 0.1226,
"step": 1599
},
{
"epoch": 4.050697084917617,
"grad_norm": 0.7177454233169556,
"learning_rate": 3.776081424936387e-05,
"loss": 0.1382,
"step": 1600
},
{
"epoch": 4.053231939163498,
"grad_norm": 0.6561391353607178,
"learning_rate": 3.765903307888041e-05,
"loss": 0.1557,
"step": 1601
},
{
"epoch": 4.055766793409379,
"grad_norm": 0.8319444060325623,
"learning_rate": 3.7557251908396954e-05,
"loss": 0.1608,
"step": 1602
},
{
"epoch": 4.05830164765526,
"grad_norm": 0.7468693852424622,
"learning_rate": 3.745547073791349e-05,
"loss": 0.1442,
"step": 1603
},
{
"epoch": 4.06083650190114,
"grad_norm": 0.623657763004303,
"learning_rate": 3.7353689567430025e-05,
"loss": 0.1395,
"step": 1604
},
{
"epoch": 4.063371356147021,
"grad_norm": 0.5870152115821838,
"learning_rate": 3.7251908396946565e-05,
"loss": 0.1322,
"step": 1605
},
{
"epoch": 4.065906210392902,
"grad_norm": 0.6840811371803284,
"learning_rate": 3.7150127226463104e-05,
"loss": 0.132,
"step": 1606
},
{
"epoch": 4.068441064638783,
"grad_norm": 0.6177504658699036,
"learning_rate": 3.704834605597964e-05,
"loss": 0.1265,
"step": 1607
},
{
"epoch": 4.070975918884664,
"grad_norm": 0.6908831000328064,
"learning_rate": 3.694656488549619e-05,
"loss": 0.1593,
"step": 1608
},
{
"epoch": 4.073510773130545,
"grad_norm": 0.787434458732605,
"learning_rate": 3.684478371501273e-05,
"loss": 0.1184,
"step": 1609
},
{
"epoch": 4.076045627376426,
"grad_norm": 0.8011195063591003,
"learning_rate": 3.674300254452927e-05,
"loss": 0.1341,
"step": 1610
},
{
"epoch": 4.078580481622307,
"grad_norm": 0.5523831248283386,
"learning_rate": 3.66412213740458e-05,
"loss": 0.1283,
"step": 1611
},
{
"epoch": 4.081115335868188,
"grad_norm": 0.6396963596343994,
"learning_rate": 3.653944020356234e-05,
"loss": 0.1424,
"step": 1612
},
{
"epoch": 4.083650190114068,
"grad_norm": 0.7471883893013,
"learning_rate": 3.643765903307888e-05,
"loss": 0.1627,
"step": 1613
},
{
"epoch": 4.086185044359949,
"grad_norm": 0.5498061776161194,
"learning_rate": 3.633587786259542e-05,
"loss": 0.1478,
"step": 1614
},
{
"epoch": 4.08871989860583,
"grad_norm": 0.6853391528129578,
"learning_rate": 3.6234096692111965e-05,
"loss": 0.1588,
"step": 1615
},
{
"epoch": 4.091254752851711,
"grad_norm": 0.6638361811637878,
"learning_rate": 3.6132315521628504e-05,
"loss": 0.1695,
"step": 1616
},
{
"epoch": 4.093789607097592,
"grad_norm": 0.6155263781547546,
"learning_rate": 3.603053435114504e-05,
"loss": 0.1355,
"step": 1617
},
{
"epoch": 4.096324461343473,
"grad_norm": 0.574590265750885,
"learning_rate": 3.5928753180661576e-05,
"loss": 0.1498,
"step": 1618
},
{
"epoch": 4.098859315589354,
"grad_norm": 0.5972251296043396,
"learning_rate": 3.5826972010178115e-05,
"loss": 0.1684,
"step": 1619
},
{
"epoch": 4.101394169835235,
"grad_norm": 0.668618381023407,
"learning_rate": 3.5725190839694654e-05,
"loss": 0.1377,
"step": 1620
},
{
"epoch": 4.103929024081116,
"grad_norm": 0.6238232851028442,
"learning_rate": 3.56234096692112e-05,
"loss": 0.2025,
"step": 1621
},
{
"epoch": 4.106463878326996,
"grad_norm": 0.9182467460632324,
"learning_rate": 3.552162849872774e-05,
"loss": 0.1539,
"step": 1622
},
{
"epoch": 4.108998732572877,
"grad_norm": 0.6368919014930725,
"learning_rate": 3.541984732824428e-05,
"loss": 0.1421,
"step": 1623
},
{
"epoch": 4.111533586818758,
"grad_norm": 0.7871132493019104,
"learning_rate": 3.531806615776082e-05,
"loss": 0.1482,
"step": 1624
},
{
"epoch": 4.114068441064639,
"grad_norm": 0.7697343230247498,
"learning_rate": 3.521628498727735e-05,
"loss": 0.1607,
"step": 1625
},
{
"epoch": 4.11660329531052,
"grad_norm": 0.5805296897888184,
"learning_rate": 3.511450381679389e-05,
"loss": 0.1497,
"step": 1626
},
{
"epoch": 4.119138149556401,
"grad_norm": 0.6484183073043823,
"learning_rate": 3.5012722646310436e-05,
"loss": 0.1827,
"step": 1627
},
{
"epoch": 4.1216730038022815,
"grad_norm": 1.0351064205169678,
"learning_rate": 3.4910941475826976e-05,
"loss": 0.2331,
"step": 1628
},
{
"epoch": 4.1242078580481625,
"grad_norm": 0.620452344417572,
"learning_rate": 3.4809160305343515e-05,
"loss": 0.1516,
"step": 1629
},
{
"epoch": 4.126742712294043,
"grad_norm": 0.6269112229347229,
"learning_rate": 3.4707379134860054e-05,
"loss": 0.1322,
"step": 1630
},
{
"epoch": 4.129277566539924,
"grad_norm": 0.7780957221984863,
"learning_rate": 3.4605597964376594e-05,
"loss": 0.1974,
"step": 1631
},
{
"epoch": 4.1318124207858045,
"grad_norm": 0.6183624267578125,
"learning_rate": 3.4503816793893126e-05,
"loss": 0.1423,
"step": 1632
},
{
"epoch": 4.134347275031685,
"grad_norm": 0.715943455696106,
"learning_rate": 3.440203562340967e-05,
"loss": 0.1422,
"step": 1633
},
{
"epoch": 4.136882129277566,
"grad_norm": 0.6383997201919556,
"learning_rate": 3.430025445292621e-05,
"loss": 0.1566,
"step": 1634
},
{
"epoch": 4.139416983523447,
"grad_norm": 0.6354379653930664,
"learning_rate": 3.419847328244275e-05,
"loss": 0.14,
"step": 1635
},
{
"epoch": 4.141951837769328,
"grad_norm": 0.5692049264907837,
"learning_rate": 3.409669211195929e-05,
"loss": 0.1315,
"step": 1636
},
{
"epoch": 4.144486692015209,
"grad_norm": 0.5286855697631836,
"learning_rate": 3.399491094147583e-05,
"loss": 0.119,
"step": 1637
},
{
"epoch": 4.14702154626109,
"grad_norm": 0.6007808446884155,
"learning_rate": 3.389312977099237e-05,
"loss": 0.1368,
"step": 1638
},
{
"epoch": 4.149556400506971,
"grad_norm": 0.8727791905403137,
"learning_rate": 3.379134860050891e-05,
"loss": 0.1635,
"step": 1639
},
{
"epoch": 4.152091254752852,
"grad_norm": 0.7203207015991211,
"learning_rate": 3.368956743002545e-05,
"loss": 0.1668,
"step": 1640
},
{
"epoch": 4.154626108998732,
"grad_norm": 0.7178492546081543,
"learning_rate": 3.358778625954199e-05,
"loss": 0.1601,
"step": 1641
},
{
"epoch": 4.157160963244613,
"grad_norm": 0.6133365035057068,
"learning_rate": 3.3486005089058526e-05,
"loss": 0.1438,
"step": 1642
},
{
"epoch": 4.159695817490494,
"grad_norm": 0.690122127532959,
"learning_rate": 3.3384223918575065e-05,
"loss": 0.1592,
"step": 1643
},
{
"epoch": 4.162230671736375,
"grad_norm": 0.5469484925270081,
"learning_rate": 3.3282442748091605e-05,
"loss": 0.1499,
"step": 1644
},
{
"epoch": 4.164765525982256,
"grad_norm": 0.7380850911140442,
"learning_rate": 3.3180661577608144e-05,
"loss": 0.1724,
"step": 1645
},
{
"epoch": 4.167300380228137,
"grad_norm": 0.6949165463447571,
"learning_rate": 3.307888040712468e-05,
"loss": 0.1642,
"step": 1646
},
{
"epoch": 4.169835234474018,
"grad_norm": 0.6445840001106262,
"learning_rate": 3.297709923664122e-05,
"loss": 0.1576,
"step": 1647
},
{
"epoch": 4.172370088719899,
"grad_norm": 0.577178418636322,
"learning_rate": 3.287531806615776e-05,
"loss": 0.1482,
"step": 1648
},
{
"epoch": 4.17490494296578,
"grad_norm": 0.5232000350952148,
"learning_rate": 3.27735368956743e-05,
"loss": 0.1385,
"step": 1649
},
{
"epoch": 4.17743979721166,
"grad_norm": 0.8429796695709229,
"learning_rate": 3.267175572519084e-05,
"loss": 0.2456,
"step": 1650
},
{
"epoch": 4.179974651457541,
"grad_norm": 0.5647293925285339,
"learning_rate": 3.256997455470738e-05,
"loss": 0.1482,
"step": 1651
},
{
"epoch": 4.182509505703422,
"grad_norm": 0.7679947018623352,
"learning_rate": 3.246819338422392e-05,
"loss": 0.1705,
"step": 1652
},
{
"epoch": 4.185044359949303,
"grad_norm": 0.7913497686386108,
"learning_rate": 3.236641221374046e-05,
"loss": 0.2133,
"step": 1653
},
{
"epoch": 4.187579214195184,
"grad_norm": 0.5105036497116089,
"learning_rate": 3.2264631043257e-05,
"loss": 0.1335,
"step": 1654
},
{
"epoch": 4.190114068441065,
"grad_norm": 0.6503207087516785,
"learning_rate": 3.216284987277354e-05,
"loss": 0.1872,
"step": 1655
},
{
"epoch": 4.192648922686946,
"grad_norm": 0.9579104781150818,
"learning_rate": 3.2061068702290076e-05,
"loss": 0.1985,
"step": 1656
},
{
"epoch": 4.195183776932827,
"grad_norm": 0.5334345698356628,
"learning_rate": 3.195928753180662e-05,
"loss": 0.137,
"step": 1657
},
{
"epoch": 4.197718631178708,
"grad_norm": 0.7031605243682861,
"learning_rate": 3.185750636132316e-05,
"loss": 0.1574,
"step": 1658
},
{
"epoch": 4.200253485424588,
"grad_norm": 0.6237590909004211,
"learning_rate": 3.1755725190839694e-05,
"loss": 0.1686,
"step": 1659
},
{
"epoch": 4.202788339670469,
"grad_norm": 0.827680230140686,
"learning_rate": 3.1653944020356234e-05,
"loss": 0.1765,
"step": 1660
},
{
"epoch": 4.20532319391635,
"grad_norm": 0.6170578002929688,
"learning_rate": 3.155216284987277e-05,
"loss": 0.1699,
"step": 1661
},
{
"epoch": 4.2078580481622305,
"grad_norm": 0.600803017616272,
"learning_rate": 3.145038167938931e-05,
"loss": 0.1345,
"step": 1662
},
{
"epoch": 4.2103929024081115,
"grad_norm": 0.5505921840667725,
"learning_rate": 3.134860050890586e-05,
"loss": 0.1418,
"step": 1663
},
{
"epoch": 4.212927756653992,
"grad_norm": 0.5893916487693787,
"learning_rate": 3.12468193384224e-05,
"loss": 0.1414,
"step": 1664
},
{
"epoch": 4.215462610899873,
"grad_norm": 0.7622592449188232,
"learning_rate": 3.114503816793894e-05,
"loss": 0.1568,
"step": 1665
},
{
"epoch": 4.217997465145754,
"grad_norm": 0.6462287306785583,
"learning_rate": 3.104325699745547e-05,
"loss": 0.1641,
"step": 1666
},
{
"epoch": 4.220532319391635,
"grad_norm": 0.4971311092376709,
"learning_rate": 3.094147582697201e-05,
"loss": 0.1276,
"step": 1667
},
{
"epoch": 4.223067173637516,
"grad_norm": 0.7270475029945374,
"learning_rate": 3.083969465648855e-05,
"loss": 0.1603,
"step": 1668
},
{
"epoch": 4.225602027883396,
"grad_norm": 0.5765766501426697,
"learning_rate": 3.0737913486005094e-05,
"loss": 0.1341,
"step": 1669
},
{
"epoch": 4.228136882129277,
"grad_norm": 0.577694296836853,
"learning_rate": 3.0636132315521633e-05,
"loss": 0.1415,
"step": 1670
},
{
"epoch": 4.230671736375158,
"grad_norm": 0.6085098385810852,
"learning_rate": 3.053435114503817e-05,
"loss": 0.1359,
"step": 1671
},
{
"epoch": 4.233206590621039,
"grad_norm": 0.6224119663238525,
"learning_rate": 3.043256997455471e-05,
"loss": 0.1494,
"step": 1672
},
{
"epoch": 4.23574144486692,
"grad_norm": 0.4535973072052002,
"learning_rate": 3.0330788804071248e-05,
"loss": 0.1415,
"step": 1673
},
{
"epoch": 4.238276299112801,
"grad_norm": 0.6283777356147766,
"learning_rate": 3.0229007633587787e-05,
"loss": 0.1569,
"step": 1674
},
{
"epoch": 4.240811153358682,
"grad_norm": 0.6005566120147705,
"learning_rate": 3.0127226463104323e-05,
"loss": 0.1385,
"step": 1675
},
{
"epoch": 4.243346007604563,
"grad_norm": 0.6437854766845703,
"learning_rate": 3.002544529262087e-05,
"loss": 0.1584,
"step": 1676
},
{
"epoch": 4.245880861850444,
"grad_norm": 0.5184986591339111,
"learning_rate": 2.992366412213741e-05,
"loss": 0.1384,
"step": 1677
},
{
"epoch": 4.248415716096324,
"grad_norm": 0.5969160199165344,
"learning_rate": 2.9821882951653945e-05,
"loss": 0.1609,
"step": 1678
},
{
"epoch": 4.250950570342205,
"grad_norm": 0.85272616147995,
"learning_rate": 2.9720101781170484e-05,
"loss": 0.178,
"step": 1679
},
{
"epoch": 4.253485424588086,
"grad_norm": 0.5351912379264832,
"learning_rate": 2.9618320610687023e-05,
"loss": 0.1465,
"step": 1680
},
{
"epoch": 4.256020278833967,
"grad_norm": 0.5821883678436279,
"learning_rate": 2.9516539440203562e-05,
"loss": 0.135,
"step": 1681
},
{
"epoch": 4.258555133079848,
"grad_norm": 0.5453548431396484,
"learning_rate": 2.9414758269720105e-05,
"loss": 0.1287,
"step": 1682
},
{
"epoch": 4.261089987325729,
"grad_norm": 0.6280243396759033,
"learning_rate": 2.9312977099236644e-05,
"loss": 0.152,
"step": 1683
},
{
"epoch": 4.26362484157161,
"grad_norm": 0.5709437131881714,
"learning_rate": 2.9211195928753184e-05,
"loss": 0.1487,
"step": 1684
},
{
"epoch": 4.266159695817491,
"grad_norm": 0.4667048752307892,
"learning_rate": 2.910941475826972e-05,
"loss": 0.129,
"step": 1685
},
{
"epoch": 4.268694550063372,
"grad_norm": 0.5744767189025879,
"learning_rate": 2.900763358778626e-05,
"loss": 0.1668,
"step": 1686
},
{
"epoch": 4.271229404309253,
"grad_norm": 0.552631139755249,
"learning_rate": 2.89058524173028e-05,
"loss": 0.128,
"step": 1687
},
{
"epoch": 4.273764258555133,
"grad_norm": 0.46616679430007935,
"learning_rate": 2.880407124681934e-05,
"loss": 0.1168,
"step": 1688
},
{
"epoch": 4.276299112801014,
"grad_norm": 0.7842658758163452,
"learning_rate": 2.870229007633588e-05,
"loss": 0.1617,
"step": 1689
},
{
"epoch": 4.278833967046895,
"grad_norm": 0.5530945062637329,
"learning_rate": 2.860050890585242e-05,
"loss": 0.1619,
"step": 1690
},
{
"epoch": 4.281368821292776,
"grad_norm": 0.9341786503791809,
"learning_rate": 2.849872773536896e-05,
"loss": 0.231,
"step": 1691
},
{
"epoch": 4.283903675538657,
"grad_norm": 0.8043704032897949,
"learning_rate": 2.8396946564885495e-05,
"loss": 0.1826,
"step": 1692
},
{
"epoch": 4.2864385297845375,
"grad_norm": 0.4446638524532318,
"learning_rate": 2.8295165394402034e-05,
"loss": 0.1413,
"step": 1693
},
{
"epoch": 4.2889733840304185,
"grad_norm": 0.6845833659172058,
"learning_rate": 2.8193384223918577e-05,
"loss": 0.1577,
"step": 1694
},
{
"epoch": 4.2915082382762995,
"grad_norm": 0.6702572107315063,
"learning_rate": 2.8091603053435116e-05,
"loss": 0.1714,
"step": 1695
},
{
"epoch": 4.29404309252218,
"grad_norm": 0.6405001282691956,
"learning_rate": 2.7989821882951656e-05,
"loss": 0.1527,
"step": 1696
},
{
"epoch": 4.2965779467680605,
"grad_norm": 0.6155828833580017,
"learning_rate": 2.7888040712468195e-05,
"loss": 0.1471,
"step": 1697
},
{
"epoch": 4.299112801013941,
"grad_norm": 0.5606924295425415,
"learning_rate": 2.7786259541984734e-05,
"loss": 0.1331,
"step": 1698
},
{
"epoch": 4.301647655259822,
"grad_norm": 0.7498462200164795,
"learning_rate": 2.768447837150127e-05,
"loss": 0.1713,
"step": 1699
},
{
"epoch": 4.304182509505703,
"grad_norm": 0.6262723803520203,
"learning_rate": 2.7582697201017816e-05,
"loss": 0.1585,
"step": 1700
},
{
"epoch": 4.306717363751584,
"grad_norm": 0.6729116439819336,
"learning_rate": 2.7480916030534355e-05,
"loss": 0.1347,
"step": 1701
},
{
"epoch": 4.309252217997465,
"grad_norm": 0.7870539426803589,
"learning_rate": 2.737913486005089e-05,
"loss": 0.1512,
"step": 1702
},
{
"epoch": 4.311787072243346,
"grad_norm": 0.4943903684616089,
"learning_rate": 2.727735368956743e-05,
"loss": 0.1274,
"step": 1703
},
{
"epoch": 4.314321926489227,
"grad_norm": 0.4763108193874359,
"learning_rate": 2.717557251908397e-05,
"loss": 0.1228,
"step": 1704
},
{
"epoch": 4.316856780735108,
"grad_norm": 0.6400578618049622,
"learning_rate": 2.707379134860051e-05,
"loss": 0.1558,
"step": 1705
},
{
"epoch": 4.319391634980988,
"grad_norm": 0.5445212125778198,
"learning_rate": 2.6972010178117052e-05,
"loss": 0.1328,
"step": 1706
},
{
"epoch": 4.321926489226869,
"grad_norm": 0.6329374313354492,
"learning_rate": 2.687022900763359e-05,
"loss": 0.1615,
"step": 1707
},
{
"epoch": 4.32446134347275,
"grad_norm": 0.5299343466758728,
"learning_rate": 2.676844783715013e-05,
"loss": 0.122,
"step": 1708
},
{
"epoch": 4.326996197718631,
"grad_norm": 0.6486507058143616,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.1553,
"step": 1709
},
{
"epoch": 4.329531051964512,
"grad_norm": 0.6306889653205872,
"learning_rate": 2.6564885496183206e-05,
"loss": 0.1638,
"step": 1710
},
{
"epoch": 4.332065906210393,
"grad_norm": 0.6417018175125122,
"learning_rate": 2.6463104325699745e-05,
"loss": 0.1404,
"step": 1711
},
{
"epoch": 4.334600760456274,
"grad_norm": 0.7283552289009094,
"learning_rate": 2.6361323155216288e-05,
"loss": 0.1837,
"step": 1712
},
{
"epoch": 4.337135614702155,
"grad_norm": 0.7142099142074585,
"learning_rate": 2.6259541984732827e-05,
"loss": 0.1535,
"step": 1713
},
{
"epoch": 4.339670468948036,
"grad_norm": 0.6059632897377014,
"learning_rate": 2.6157760814249367e-05,
"loss": 0.1551,
"step": 1714
},
{
"epoch": 4.342205323193916,
"grad_norm": 0.6492133140563965,
"learning_rate": 2.6055979643765906e-05,
"loss": 0.1413,
"step": 1715
},
{
"epoch": 4.344740177439797,
"grad_norm": 0.7166099548339844,
"learning_rate": 2.5954198473282442e-05,
"loss": 0.1534,
"step": 1716
},
{
"epoch": 4.347275031685678,
"grad_norm": 0.6357300877571106,
"learning_rate": 2.585241730279898e-05,
"loss": 0.1445,
"step": 1717
},
{
"epoch": 4.349809885931559,
"grad_norm": 0.6684461236000061,
"learning_rate": 2.5750636132315524e-05,
"loss": 0.1469,
"step": 1718
},
{
"epoch": 4.35234474017744,
"grad_norm": 0.7808713912963867,
"learning_rate": 2.5648854961832063e-05,
"loss": 0.1892,
"step": 1719
},
{
"epoch": 4.354879594423321,
"grad_norm": 0.6660336852073669,
"learning_rate": 2.5547073791348602e-05,
"loss": 0.1545,
"step": 1720
},
{
"epoch": 4.357414448669202,
"grad_norm": 0.7266603112220764,
"learning_rate": 2.5445292620865142e-05,
"loss": 0.1346,
"step": 1721
},
{
"epoch": 4.359949302915083,
"grad_norm": 0.5710493326187134,
"learning_rate": 2.534351145038168e-05,
"loss": 0.1199,
"step": 1722
},
{
"epoch": 4.362484157160964,
"grad_norm": 0.6178765296936035,
"learning_rate": 2.5241730279898217e-05,
"loss": 0.1416,
"step": 1723
},
{
"epoch": 4.365019011406844,
"grad_norm": 0.5881832242012024,
"learning_rate": 2.5139949109414763e-05,
"loss": 0.1389,
"step": 1724
},
{
"epoch": 4.367553865652725,
"grad_norm": 0.5589767694473267,
"learning_rate": 2.50381679389313e-05,
"loss": 0.1356,
"step": 1725
},
{
"epoch": 4.370088719898606,
"grad_norm": 0.611072301864624,
"learning_rate": 2.4936386768447838e-05,
"loss": 0.1618,
"step": 1726
},
{
"epoch": 4.3726235741444865,
"grad_norm": 1.0045723915100098,
"learning_rate": 2.4834605597964378e-05,
"loss": 0.2004,
"step": 1727
},
{
"epoch": 4.3751584283903675,
"grad_norm": 1.0154621601104736,
"learning_rate": 2.4732824427480917e-05,
"loss": 0.1593,
"step": 1728
},
{
"epoch": 4.3776932826362485,
"grad_norm": 0.7933842539787292,
"learning_rate": 2.4631043256997456e-05,
"loss": 0.183,
"step": 1729
},
{
"epoch": 4.380228136882129,
"grad_norm": 0.8141732811927795,
"learning_rate": 2.4529262086513996e-05,
"loss": 0.1412,
"step": 1730
},
{
"epoch": 4.38276299112801,
"grad_norm": 0.6575155854225159,
"learning_rate": 2.4427480916030535e-05,
"loss": 0.1592,
"step": 1731
},
{
"epoch": 4.385297845373891,
"grad_norm": 0.7710108757019043,
"learning_rate": 2.4325699745547078e-05,
"loss": 0.2306,
"step": 1732
},
{
"epoch": 4.387832699619771,
"grad_norm": 0.6438276767730713,
"learning_rate": 2.4223918575063613e-05,
"loss": 0.143,
"step": 1733
},
{
"epoch": 4.390367553865652,
"grad_norm": 0.7019467949867249,
"learning_rate": 2.4122137404580153e-05,
"loss": 0.1641,
"step": 1734
},
{
"epoch": 4.392902408111533,
"grad_norm": 0.598584771156311,
"learning_rate": 2.4020356234096695e-05,
"loss": 0.1456,
"step": 1735
},
{
"epoch": 4.395437262357414,
"grad_norm": 0.6024305820465088,
"learning_rate": 2.391857506361323e-05,
"loss": 0.1287,
"step": 1736
},
{
"epoch": 4.397972116603295,
"grad_norm": 0.8446558713912964,
"learning_rate": 2.381679389312977e-05,
"loss": 0.1705,
"step": 1737
},
{
"epoch": 4.400506970849176,
"grad_norm": 0.5697831511497498,
"learning_rate": 2.3715012722646313e-05,
"loss": 0.1386,
"step": 1738
},
{
"epoch": 4.403041825095057,
"grad_norm": 0.6655327677726746,
"learning_rate": 2.3613231552162853e-05,
"loss": 0.186,
"step": 1739
},
{
"epoch": 4.405576679340938,
"grad_norm": 1.1001065969467163,
"learning_rate": 2.351145038167939e-05,
"loss": 0.2531,
"step": 1740
},
{
"epoch": 4.408111533586819,
"grad_norm": 0.5302372574806213,
"learning_rate": 2.340966921119593e-05,
"loss": 0.1342,
"step": 1741
},
{
"epoch": 4.4106463878327,
"grad_norm": 0.6450605392456055,
"learning_rate": 2.330788804071247e-05,
"loss": 0.1499,
"step": 1742
},
{
"epoch": 4.41318124207858,
"grad_norm": 0.5733135342597961,
"learning_rate": 2.3206106870229007e-05,
"loss": 0.166,
"step": 1743
},
{
"epoch": 4.415716096324461,
"grad_norm": 0.609865665435791,
"learning_rate": 2.310432569974555e-05,
"loss": 0.1306,
"step": 1744
},
{
"epoch": 4.418250950570342,
"grad_norm": 0.5957082509994507,
"learning_rate": 2.300254452926209e-05,
"loss": 0.1309,
"step": 1745
},
{
"epoch": 4.420785804816223,
"grad_norm": 0.5951780080795288,
"learning_rate": 2.2900763358778628e-05,
"loss": 0.1366,
"step": 1746
},
{
"epoch": 4.423320659062104,
"grad_norm": 0.7225191593170166,
"learning_rate": 2.2798982188295167e-05,
"loss": 0.1825,
"step": 1747
},
{
"epoch": 4.425855513307985,
"grad_norm": 0.6427996158599854,
"learning_rate": 2.2697201017811707e-05,
"loss": 0.1326,
"step": 1748
},
{
"epoch": 4.428390367553866,
"grad_norm": 0.49267786741256714,
"learning_rate": 2.2595419847328246e-05,
"loss": 0.1367,
"step": 1749
},
{
"epoch": 4.430925221799747,
"grad_norm": 0.5365452766418457,
"learning_rate": 2.2493638676844785e-05,
"loss": 0.1456,
"step": 1750
},
{
"epoch": 4.433460076045628,
"grad_norm": 0.65265291929245,
"learning_rate": 2.2391857506361324e-05,
"loss": 0.1379,
"step": 1751
},
{
"epoch": 4.435994930291509,
"grad_norm": 0.5401502847671509,
"learning_rate": 2.2290076335877864e-05,
"loss": 0.1293,
"step": 1752
},
{
"epoch": 4.438529784537389,
"grad_norm": 0.6832171678543091,
"learning_rate": 2.2188295165394403e-05,
"loss": 0.1448,
"step": 1753
},
{
"epoch": 4.44106463878327,
"grad_norm": 0.8080681562423706,
"learning_rate": 2.2086513994910942e-05,
"loss": 0.1832,
"step": 1754
},
{
"epoch": 4.443599493029151,
"grad_norm": 0.6201688051223755,
"learning_rate": 2.198473282442748e-05,
"loss": 0.159,
"step": 1755
},
{
"epoch": 4.446134347275032,
"grad_norm": 0.8549275994300842,
"learning_rate": 2.1882951653944024e-05,
"loss": 0.2103,
"step": 1756
},
{
"epoch": 4.448669201520913,
"grad_norm": 0.5879942178726196,
"learning_rate": 2.178117048346056e-05,
"loss": 0.1524,
"step": 1757
},
{
"epoch": 4.451204055766794,
"grad_norm": 0.6592312455177307,
"learning_rate": 2.16793893129771e-05,
"loss": 0.1535,
"step": 1758
},
{
"epoch": 4.4537389100126745,
"grad_norm": 0.6493979096412659,
"learning_rate": 2.1577608142493642e-05,
"loss": 0.1451,
"step": 1759
},
{
"epoch": 4.4562737642585555,
"grad_norm": 0.7973134517669678,
"learning_rate": 2.1475826972010178e-05,
"loss": 0.1519,
"step": 1760
},
{
"epoch": 4.458808618504436,
"grad_norm": 0.7703438401222229,
"learning_rate": 2.1374045801526718e-05,
"loss": 0.1653,
"step": 1761
},
{
"epoch": 4.4613434727503165,
"grad_norm": 1.0013222694396973,
"learning_rate": 2.127226463104326e-05,
"loss": 0.2064,
"step": 1762
},
{
"epoch": 4.4638783269961975,
"grad_norm": 0.7007017135620117,
"learning_rate": 2.11704834605598e-05,
"loss": 0.1401,
"step": 1763
},
{
"epoch": 4.466413181242078,
"grad_norm": 0.5366234183311462,
"learning_rate": 2.1068702290076335e-05,
"loss": 0.1389,
"step": 1764
},
{
"epoch": 4.468948035487959,
"grad_norm": 0.7167120575904846,
"learning_rate": 2.0966921119592875e-05,
"loss": 0.1817,
"step": 1765
},
{
"epoch": 4.47148288973384,
"grad_norm": 0.7901313900947571,
"learning_rate": 2.0865139949109417e-05,
"loss": 0.1817,
"step": 1766
},
{
"epoch": 4.474017743979721,
"grad_norm": 0.6681633591651917,
"learning_rate": 2.0763358778625953e-05,
"loss": 0.1458,
"step": 1767
},
{
"epoch": 4.476552598225602,
"grad_norm": 0.5067597031593323,
"learning_rate": 2.0661577608142493e-05,
"loss": 0.1301,
"step": 1768
},
{
"epoch": 4.479087452471483,
"grad_norm": 0.6582893133163452,
"learning_rate": 2.0559796437659035e-05,
"loss": 0.1576,
"step": 1769
},
{
"epoch": 4.481622306717364,
"grad_norm": 0.6628451943397522,
"learning_rate": 2.0458015267175575e-05,
"loss": 0.168,
"step": 1770
},
{
"epoch": 4.484157160963244,
"grad_norm": 0.5435721278190613,
"learning_rate": 2.035623409669211e-05,
"loss": 0.1476,
"step": 1771
},
{
"epoch": 4.486692015209125,
"grad_norm": 0.6182110905647278,
"learning_rate": 2.0254452926208653e-05,
"loss": 0.1441,
"step": 1772
},
{
"epoch": 4.489226869455006,
"grad_norm": 0.9246516823768616,
"learning_rate": 2.0152671755725193e-05,
"loss": 0.1747,
"step": 1773
},
{
"epoch": 4.491761723700887,
"grad_norm": 0.5967719554901123,
"learning_rate": 2.005089058524173e-05,
"loss": 0.1461,
"step": 1774
},
{
"epoch": 4.494296577946768,
"grad_norm": 0.5998682379722595,
"learning_rate": 1.994910941475827e-05,
"loss": 0.1276,
"step": 1775
},
{
"epoch": 4.496831432192649,
"grad_norm": 0.6168457865715027,
"learning_rate": 1.984732824427481e-05,
"loss": 0.1407,
"step": 1776
},
{
"epoch": 4.49936628643853,
"grad_norm": 0.6580602526664734,
"learning_rate": 1.974554707379135e-05,
"loss": 0.149,
"step": 1777
},
{
"epoch": 4.501901140684411,
"grad_norm": 0.5117031335830688,
"learning_rate": 1.964376590330789e-05,
"loss": 0.1397,
"step": 1778
},
{
"epoch": 4.504435994930292,
"grad_norm": 0.4603317975997925,
"learning_rate": 1.954198473282443e-05,
"loss": 0.1211,
"step": 1779
},
{
"epoch": 4.506970849176172,
"grad_norm": 0.5981631278991699,
"learning_rate": 1.9440203562340968e-05,
"loss": 0.1371,
"step": 1780
},
{
"epoch": 4.509505703422053,
"grad_norm": 0.6693590879440308,
"learning_rate": 1.9338422391857507e-05,
"loss": 0.1495,
"step": 1781
},
{
"epoch": 4.512040557667934,
"grad_norm": 0.5286784172058105,
"learning_rate": 1.9236641221374046e-05,
"loss": 0.1304,
"step": 1782
},
{
"epoch": 4.514575411913815,
"grad_norm": 0.7040352821350098,
"learning_rate": 1.9134860050890586e-05,
"loss": 0.1584,
"step": 1783
},
{
"epoch": 4.517110266159696,
"grad_norm": 0.6396339535713196,
"learning_rate": 1.9033078880407125e-05,
"loss": 0.1529,
"step": 1784
},
{
"epoch": 4.519645120405577,
"grad_norm": 0.6708245873451233,
"learning_rate": 1.8931297709923664e-05,
"loss": 0.1477,
"step": 1785
},
{
"epoch": 4.522179974651458,
"grad_norm": 0.6562108993530273,
"learning_rate": 1.8829516539440204e-05,
"loss": 0.1499,
"step": 1786
},
{
"epoch": 4.524714828897339,
"grad_norm": 0.5181876420974731,
"learning_rate": 1.8727735368956746e-05,
"loss": 0.1398,
"step": 1787
},
{
"epoch": 4.52724968314322,
"grad_norm": 0.5952017307281494,
"learning_rate": 1.8625954198473282e-05,
"loss": 0.1438,
"step": 1788
},
{
"epoch": 4.5297845373891,
"grad_norm": 0.6668636202812195,
"learning_rate": 1.852417302798982e-05,
"loss": 0.1805,
"step": 1789
},
{
"epoch": 4.532319391634981,
"grad_norm": 0.5433321595191956,
"learning_rate": 1.8422391857506364e-05,
"loss": 0.1397,
"step": 1790
},
{
"epoch": 4.534854245880862,
"grad_norm": 0.5353025197982788,
"learning_rate": 1.83206106870229e-05,
"loss": 0.1419,
"step": 1791
},
{
"epoch": 4.537389100126743,
"grad_norm": 0.6123271584510803,
"learning_rate": 1.821882951653944e-05,
"loss": 0.1493,
"step": 1792
},
{
"epoch": 4.5399239543726235,
"grad_norm": 0.6581493616104126,
"learning_rate": 1.8117048346055982e-05,
"loss": 0.1467,
"step": 1793
},
{
"epoch": 4.5424588086185045,
"grad_norm": 0.5537798404693604,
"learning_rate": 1.801526717557252e-05,
"loss": 0.1467,
"step": 1794
},
{
"epoch": 4.544993662864385,
"grad_norm": 0.7163582444190979,
"learning_rate": 1.7913486005089058e-05,
"loss": 0.1736,
"step": 1795
},
{
"epoch": 4.547528517110266,
"grad_norm": 0.694922149181366,
"learning_rate": 1.78117048346056e-05,
"loss": 0.1516,
"step": 1796
},
{
"epoch": 4.550063371356147,
"grad_norm": 0.7119778394699097,
"learning_rate": 1.770992366412214e-05,
"loss": 0.1899,
"step": 1797
},
{
"epoch": 4.552598225602027,
"grad_norm": 0.7570186853408813,
"learning_rate": 1.7608142493638675e-05,
"loss": 0.1951,
"step": 1798
},
{
"epoch": 4.555133079847908,
"grad_norm": 0.6789132356643677,
"learning_rate": 1.7506361323155218e-05,
"loss": 0.1475,
"step": 1799
},
{
"epoch": 4.557667934093789,
"grad_norm": 0.5750378966331482,
"learning_rate": 1.7404580152671757e-05,
"loss": 0.1431,
"step": 1800
},
{
"epoch": 4.56020278833967,
"grad_norm": 0.6066502332687378,
"learning_rate": 1.7302798982188297e-05,
"loss": 0.16,
"step": 1801
},
{
"epoch": 4.562737642585551,
"grad_norm": 0.5730226039886475,
"learning_rate": 1.7201017811704836e-05,
"loss": 0.1455,
"step": 1802
},
{
"epoch": 4.565272496831432,
"grad_norm": 0.5752687454223633,
"learning_rate": 1.7099236641221375e-05,
"loss": 0.1281,
"step": 1803
},
{
"epoch": 4.567807351077313,
"grad_norm": 0.5497205853462219,
"learning_rate": 1.6997455470737915e-05,
"loss": 0.1431,
"step": 1804
},
{
"epoch": 4.570342205323194,
"grad_norm": 0.7738269567489624,
"learning_rate": 1.6895674300254454e-05,
"loss": 0.1523,
"step": 1805
},
{
"epoch": 4.572877059569075,
"grad_norm": 0.5750918388366699,
"learning_rate": 1.6793893129770993e-05,
"loss": 0.1466,
"step": 1806
},
{
"epoch": 4.575411913814955,
"grad_norm": 0.5575040578842163,
"learning_rate": 1.6692111959287533e-05,
"loss": 0.1267,
"step": 1807
},
{
"epoch": 4.577946768060836,
"grad_norm": 0.509616494178772,
"learning_rate": 1.6590330788804072e-05,
"loss": 0.1434,
"step": 1808
},
{
"epoch": 4.580481622306717,
"grad_norm": 0.643009603023529,
"learning_rate": 1.648854961832061e-05,
"loss": 0.136,
"step": 1809
},
{
"epoch": 4.583016476552598,
"grad_norm": 0.5133553743362427,
"learning_rate": 1.638676844783715e-05,
"loss": 0.1223,
"step": 1810
},
{
"epoch": 4.585551330798479,
"grad_norm": 0.7505659461021423,
"learning_rate": 1.628498727735369e-05,
"loss": 0.1607,
"step": 1811
},
{
"epoch": 4.58808618504436,
"grad_norm": 0.6981300711631775,
"learning_rate": 1.618320610687023e-05,
"loss": 0.1525,
"step": 1812
},
{
"epoch": 4.590621039290241,
"grad_norm": 0.4981435537338257,
"learning_rate": 1.608142493638677e-05,
"loss": 0.1236,
"step": 1813
},
{
"epoch": 4.593155893536122,
"grad_norm": 0.6467440724372864,
"learning_rate": 1.597964376590331e-05,
"loss": 0.153,
"step": 1814
},
{
"epoch": 4.595690747782003,
"grad_norm": 0.6843181848526001,
"learning_rate": 1.5877862595419847e-05,
"loss": 0.1604,
"step": 1815
},
{
"epoch": 4.598225602027884,
"grad_norm": 0.49898776412010193,
"learning_rate": 1.5776081424936386e-05,
"loss": 0.1165,
"step": 1816
},
{
"epoch": 4.600760456273765,
"grad_norm": 0.6252351403236389,
"learning_rate": 1.567430025445293e-05,
"loss": 0.1228,
"step": 1817
},
{
"epoch": 4.603295310519645,
"grad_norm": 0.5452350974082947,
"learning_rate": 1.557251908396947e-05,
"loss": 0.1245,
"step": 1818
},
{
"epoch": 4.605830164765526,
"grad_norm": 0.6847854852676392,
"learning_rate": 1.5470737913486004e-05,
"loss": 0.1462,
"step": 1819
},
{
"epoch": 4.608365019011407,
"grad_norm": 0.49941131472587585,
"learning_rate": 1.5368956743002547e-05,
"loss": 0.1268,
"step": 1820
},
{
"epoch": 4.610899873257288,
"grad_norm": 0.581243097782135,
"learning_rate": 1.5267175572519086e-05,
"loss": 0.1296,
"step": 1821
},
{
"epoch": 4.613434727503169,
"grad_norm": 0.8345553874969482,
"learning_rate": 1.5165394402035624e-05,
"loss": 0.1307,
"step": 1822
},
{
"epoch": 4.61596958174905,
"grad_norm": 0.6534408926963806,
"learning_rate": 1.5063613231552162e-05,
"loss": 0.1446,
"step": 1823
},
{
"epoch": 4.6185044359949305,
"grad_norm": 0.7743064165115356,
"learning_rate": 1.4961832061068704e-05,
"loss": 0.2027,
"step": 1824
},
{
"epoch": 4.6210392902408115,
"grad_norm": 0.6709569096565247,
"learning_rate": 1.4860050890585242e-05,
"loss": 0.1427,
"step": 1825
},
{
"epoch": 4.6235741444866925,
"grad_norm": 0.6598264575004578,
"learning_rate": 1.4758269720101781e-05,
"loss": 0.1399,
"step": 1826
},
{
"epoch": 4.6261089987325725,
"grad_norm": 0.49041053652763367,
"learning_rate": 1.4656488549618322e-05,
"loss": 0.133,
"step": 1827
},
{
"epoch": 4.6286438529784535,
"grad_norm": 0.6697686910629272,
"learning_rate": 1.455470737913486e-05,
"loss": 0.1735,
"step": 1828
},
{
"epoch": 4.6311787072243344,
"grad_norm": 0.5481597781181335,
"learning_rate": 1.44529262086514e-05,
"loss": 0.1244,
"step": 1829
},
{
"epoch": 4.633713561470215,
"grad_norm": 0.6251161694526672,
"learning_rate": 1.435114503816794e-05,
"loss": 0.1436,
"step": 1830
},
{
"epoch": 4.636248415716096,
"grad_norm": 0.7515272498130798,
"learning_rate": 1.424936386768448e-05,
"loss": 0.1493,
"step": 1831
},
{
"epoch": 4.638783269961977,
"grad_norm": 0.8478451371192932,
"learning_rate": 1.4147582697201017e-05,
"loss": 0.1519,
"step": 1832
},
{
"epoch": 4.641318124207858,
"grad_norm": 0.5417062640190125,
"learning_rate": 1.4045801526717558e-05,
"loss": 0.1318,
"step": 1833
},
{
"epoch": 4.643852978453739,
"grad_norm": 0.6493893265724182,
"learning_rate": 1.3944020356234097e-05,
"loss": 0.1546,
"step": 1834
},
{
"epoch": 4.64638783269962,
"grad_norm": 0.8475616574287415,
"learning_rate": 1.3842239185750635e-05,
"loss": 0.172,
"step": 1835
},
{
"epoch": 4.6489226869455,
"grad_norm": 0.5484082698822021,
"learning_rate": 1.3740458015267178e-05,
"loss": 0.1203,
"step": 1836
},
{
"epoch": 4.651457541191381,
"grad_norm": 0.6533843874931335,
"learning_rate": 1.3638676844783715e-05,
"loss": 0.1501,
"step": 1837
},
{
"epoch": 4.653992395437262,
"grad_norm": 0.7521854043006897,
"learning_rate": 1.3536895674300255e-05,
"loss": 0.1955,
"step": 1838
},
{
"epoch": 4.656527249683143,
"grad_norm": 0.6500900983810425,
"learning_rate": 1.3435114503816796e-05,
"loss": 0.14,
"step": 1839
},
{
"epoch": 4.659062103929024,
"grad_norm": 0.7133599519729614,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.1707,
"step": 1840
},
{
"epoch": 4.661596958174905,
"grad_norm": 0.7065775394439697,
"learning_rate": 1.3231552162849873e-05,
"loss": 0.144,
"step": 1841
},
{
"epoch": 4.664131812420786,
"grad_norm": 0.7716514468193054,
"learning_rate": 1.3129770992366414e-05,
"loss": 0.1792,
"step": 1842
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.9312828779220581,
"learning_rate": 1.3027989821882953e-05,
"loss": 0.2139,
"step": 1843
},
{
"epoch": 4.669201520912548,
"grad_norm": 0.5163487792015076,
"learning_rate": 1.292620865139949e-05,
"loss": 0.139,
"step": 1844
},
{
"epoch": 4.671736375158428,
"grad_norm": 0.7424818277359009,
"learning_rate": 1.2824427480916032e-05,
"loss": 0.1533,
"step": 1845
},
{
"epoch": 4.674271229404309,
"grad_norm": 0.5935065150260925,
"learning_rate": 1.2722646310432571e-05,
"loss": 0.1319,
"step": 1846
},
{
"epoch": 4.67680608365019,
"grad_norm": 0.7372322678565979,
"learning_rate": 1.2620865139949108e-05,
"loss": 0.1832,
"step": 1847
},
{
"epoch": 4.679340937896071,
"grad_norm": 0.5936238765716553,
"learning_rate": 1.251908396946565e-05,
"loss": 0.1357,
"step": 1848
},
{
"epoch": 4.681875792141952,
"grad_norm": 0.6689032316207886,
"learning_rate": 1.2417302798982189e-05,
"loss": 0.1709,
"step": 1849
},
{
"epoch": 4.684410646387833,
"grad_norm": 0.6519850492477417,
"learning_rate": 1.2315521628498728e-05,
"loss": 0.1438,
"step": 1850
},
{
"epoch": 4.686945500633714,
"grad_norm": 0.5853939056396484,
"learning_rate": 1.2213740458015267e-05,
"loss": 0.134,
"step": 1851
},
{
"epoch": 4.689480354879595,
"grad_norm": 0.5059859752655029,
"learning_rate": 1.2111959287531807e-05,
"loss": 0.1088,
"step": 1852
},
{
"epoch": 4.692015209125476,
"grad_norm": 0.6989784240722656,
"learning_rate": 1.2010178117048348e-05,
"loss": 0.1527,
"step": 1853
},
{
"epoch": 4.694550063371356,
"grad_norm": 0.5851006507873535,
"learning_rate": 1.1908396946564885e-05,
"loss": 0.143,
"step": 1854
},
{
"epoch": 4.697084917617237,
"grad_norm": 0.5606602430343628,
"learning_rate": 1.1806615776081426e-05,
"loss": 0.1288,
"step": 1855
},
{
"epoch": 4.699619771863118,
"grad_norm": 0.6175526976585388,
"learning_rate": 1.1704834605597966e-05,
"loss": 0.1564,
"step": 1856
},
{
"epoch": 4.702154626108999,
"grad_norm": 0.5776654481887817,
"learning_rate": 1.1603053435114503e-05,
"loss": 0.1323,
"step": 1857
},
{
"epoch": 4.7046894803548795,
"grad_norm": 0.5664159059524536,
"learning_rate": 1.1501272264631044e-05,
"loss": 0.1371,
"step": 1858
},
{
"epoch": 4.7072243346007605,
"grad_norm": 0.7187889218330383,
"learning_rate": 1.1399491094147584e-05,
"loss": 0.1476,
"step": 1859
},
{
"epoch": 4.7097591888466415,
"grad_norm": 0.5795005559921265,
"learning_rate": 1.1297709923664123e-05,
"loss": 0.1373,
"step": 1860
},
{
"epoch": 4.712294043092522,
"grad_norm": 0.5491251945495605,
"learning_rate": 1.1195928753180662e-05,
"loss": 0.1192,
"step": 1861
},
{
"epoch": 4.714828897338403,
"grad_norm": 0.4715762734413147,
"learning_rate": 1.1094147582697202e-05,
"loss": 0.1106,
"step": 1862
},
{
"epoch": 4.7173637515842834,
"grad_norm": 0.6300286054611206,
"learning_rate": 1.099236641221374e-05,
"loss": 0.138,
"step": 1863
},
{
"epoch": 4.719898605830164,
"grad_norm": 0.7265313267707825,
"learning_rate": 1.089058524173028e-05,
"loss": 0.2246,
"step": 1864
},
{
"epoch": 4.722433460076045,
"grad_norm": 0.7080928087234497,
"learning_rate": 1.0788804071246821e-05,
"loss": 0.1335,
"step": 1865
},
{
"epoch": 4.724968314321926,
"grad_norm": 0.605714738368988,
"learning_rate": 1.0687022900763359e-05,
"loss": 0.1412,
"step": 1866
},
{
"epoch": 4.727503168567807,
"grad_norm": 0.6648192405700684,
"learning_rate": 1.05852417302799e-05,
"loss": 0.1648,
"step": 1867
},
{
"epoch": 4.730038022813688,
"grad_norm": 0.6057281494140625,
"learning_rate": 1.0483460559796437e-05,
"loss": 0.1266,
"step": 1868
},
{
"epoch": 4.732572877059569,
"grad_norm": 0.6135514974594116,
"learning_rate": 1.0381679389312977e-05,
"loss": 0.1457,
"step": 1869
},
{
"epoch": 4.73510773130545,
"grad_norm": 0.6599459052085876,
"learning_rate": 1.0279898218829518e-05,
"loss": 0.1558,
"step": 1870
},
{
"epoch": 4.737642585551331,
"grad_norm": 0.5975873470306396,
"learning_rate": 1.0178117048346055e-05,
"loss": 0.134,
"step": 1871
},
{
"epoch": 4.740177439797211,
"grad_norm": 0.6581792235374451,
"learning_rate": 1.0076335877862596e-05,
"loss": 0.1463,
"step": 1872
},
{
"epoch": 4.742712294043092,
"grad_norm": 0.5627064108848572,
"learning_rate": 9.974554707379136e-06,
"loss": 0.1238,
"step": 1873
},
{
"epoch": 4.745247148288973,
"grad_norm": 0.6461361050605774,
"learning_rate": 9.872773536895675e-06,
"loss": 0.1621,
"step": 1874
},
{
"epoch": 4.747782002534854,
"grad_norm": 0.5615333914756775,
"learning_rate": 9.770992366412214e-06,
"loss": 0.1387,
"step": 1875
},
{
"epoch": 4.750316856780735,
"grad_norm": 0.6830117702484131,
"learning_rate": 9.669211195928754e-06,
"loss": 0.1397,
"step": 1876
},
{
"epoch": 4.752851711026616,
"grad_norm": 0.731072187423706,
"learning_rate": 9.567430025445293e-06,
"loss": 0.1508,
"step": 1877
},
{
"epoch": 4.755386565272497,
"grad_norm": 0.7469286918640137,
"learning_rate": 9.465648854961832e-06,
"loss": 0.1944,
"step": 1878
},
{
"epoch": 4.757921419518378,
"grad_norm": 0.700532078742981,
"learning_rate": 9.363867684478373e-06,
"loss": 0.1697,
"step": 1879
},
{
"epoch": 4.760456273764259,
"grad_norm": 0.7140323519706726,
"learning_rate": 9.26208651399491e-06,
"loss": 0.1597,
"step": 1880
},
{
"epoch": 4.76299112801014,
"grad_norm": 0.6711133718490601,
"learning_rate": 9.16030534351145e-06,
"loss": 0.1731,
"step": 1881
},
{
"epoch": 4.765525982256021,
"grad_norm": 0.43002957105636597,
"learning_rate": 9.058524173027991e-06,
"loss": 0.1181,
"step": 1882
},
{
"epoch": 4.768060836501901,
"grad_norm": 0.669159471988678,
"learning_rate": 8.956743002544529e-06,
"loss": 0.1578,
"step": 1883
},
{
"epoch": 4.770595690747782,
"grad_norm": 0.5030307769775391,
"learning_rate": 8.85496183206107e-06,
"loss": 0.1213,
"step": 1884
},
{
"epoch": 4.773130544993663,
"grad_norm": 0.7841615080833435,
"learning_rate": 8.753180661577609e-06,
"loss": 0.1619,
"step": 1885
},
{
"epoch": 4.775665399239544,
"grad_norm": 0.5570418834686279,
"learning_rate": 8.651399491094148e-06,
"loss": 0.1308,
"step": 1886
},
{
"epoch": 4.778200253485425,
"grad_norm": 0.6690031886100769,
"learning_rate": 8.549618320610688e-06,
"loss": 0.1413,
"step": 1887
},
{
"epoch": 4.780735107731306,
"grad_norm": 0.524140477180481,
"learning_rate": 8.447837150127227e-06,
"loss": 0.1354,
"step": 1888
},
{
"epoch": 4.783269961977187,
"grad_norm": 0.5612379908561707,
"learning_rate": 8.346055979643766e-06,
"loss": 0.1375,
"step": 1889
},
{
"epoch": 4.7858048162230675,
"grad_norm": 0.851925790309906,
"learning_rate": 8.244274809160306e-06,
"loss": 0.1783,
"step": 1890
},
{
"epoch": 4.7883396704689485,
"grad_norm": 0.8507834672927856,
"learning_rate": 8.142493638676845e-06,
"loss": 0.1743,
"step": 1891
},
{
"epoch": 4.7908745247148286,
"grad_norm": 0.8136033415794373,
"learning_rate": 8.040712468193384e-06,
"loss": 0.1381,
"step": 1892
},
{
"epoch": 4.7934093789607095,
"grad_norm": 0.7247329354286194,
"learning_rate": 7.938931297709924e-06,
"loss": 0.1793,
"step": 1893
},
{
"epoch": 4.7959442332065905,
"grad_norm": 0.5494823455810547,
"learning_rate": 7.837150127226465e-06,
"loss": 0.1231,
"step": 1894
},
{
"epoch": 4.798479087452471,
"grad_norm": 0.6107218861579895,
"learning_rate": 7.735368956743002e-06,
"loss": 0.1358,
"step": 1895
},
{
"epoch": 4.801013941698352,
"grad_norm": 0.6297575235366821,
"learning_rate": 7.633587786259543e-06,
"loss": 0.1699,
"step": 1896
},
{
"epoch": 4.803548795944233,
"grad_norm": 0.8669266700744629,
"learning_rate": 7.531806615776081e-06,
"loss": 0.1982,
"step": 1897
},
{
"epoch": 4.806083650190114,
"grad_norm": 0.583975076675415,
"learning_rate": 7.430025445292621e-06,
"loss": 0.1517,
"step": 1898
},
{
"epoch": 4.808618504435995,
"grad_norm": 0.6059403419494629,
"learning_rate": 7.328244274809161e-06,
"loss": 0.138,
"step": 1899
},
{
"epoch": 4.811153358681876,
"grad_norm": 1.0802148580551147,
"learning_rate": 7.2264631043257e-06,
"loss": 0.1677,
"step": 1900
},
{
"epoch": 4.813688212927756,
"grad_norm": 0.5637528300285339,
"learning_rate": 7.12468193384224e-06,
"loss": 0.1517,
"step": 1901
},
{
"epoch": 4.816223067173637,
"grad_norm": 0.6925719976425171,
"learning_rate": 7.022900763358779e-06,
"loss": 0.1636,
"step": 1902
},
{
"epoch": 4.818757921419518,
"grad_norm": 0.6529707908630371,
"learning_rate": 6.9211195928753175e-06,
"loss": 0.1587,
"step": 1903
},
{
"epoch": 4.821292775665399,
"grad_norm": 1.1477290391921997,
"learning_rate": 6.819338422391858e-06,
"loss": 0.1655,
"step": 1904
},
{
"epoch": 4.82382762991128,
"grad_norm": 0.7867985367774963,
"learning_rate": 6.717557251908398e-06,
"loss": 0.1955,
"step": 1905
},
{
"epoch": 4.826362484157161,
"grad_norm": 0.617871105670929,
"learning_rate": 6.615776081424936e-06,
"loss": 0.1554,
"step": 1906
},
{
"epoch": 4.828897338403042,
"grad_norm": 0.5985192656517029,
"learning_rate": 6.5139949109414765e-06,
"loss": 0.1484,
"step": 1907
},
{
"epoch": 4.831432192648923,
"grad_norm": 0.6069400310516357,
"learning_rate": 6.412213740458016e-06,
"loss": 0.1326,
"step": 1908
},
{
"epoch": 4.833967046894804,
"grad_norm": 0.9009010195732117,
"learning_rate": 6.310432569974554e-06,
"loss": 0.1999,
"step": 1909
},
{
"epoch": 4.836501901140684,
"grad_norm": 0.5913792848587036,
"learning_rate": 6.208651399491094e-06,
"loss": 0.1381,
"step": 1910
},
{
"epoch": 4.839036755386565,
"grad_norm": 0.5730859637260437,
"learning_rate": 6.106870229007634e-06,
"loss": 0.1346,
"step": 1911
},
{
"epoch": 4.841571609632446,
"grad_norm": 0.6579172611236572,
"learning_rate": 6.005089058524174e-06,
"loss": 0.1572,
"step": 1912
},
{
"epoch": 4.844106463878327,
"grad_norm": 0.5854265093803406,
"learning_rate": 5.903307888040713e-06,
"loss": 0.1359,
"step": 1913
},
{
"epoch": 4.846641318124208,
"grad_norm": 0.7668277025222778,
"learning_rate": 5.801526717557252e-06,
"loss": 0.1728,
"step": 1914
},
{
"epoch": 4.849176172370089,
"grad_norm": 0.8092861175537109,
"learning_rate": 5.699745547073792e-06,
"loss": 0.1741,
"step": 1915
},
{
"epoch": 4.85171102661597,
"grad_norm": 0.6868001818656921,
"learning_rate": 5.597964376590331e-06,
"loss": 0.1604,
"step": 1916
},
{
"epoch": 4.854245880861851,
"grad_norm": 0.6506228446960449,
"learning_rate": 5.49618320610687e-06,
"loss": 0.1459,
"step": 1917
},
{
"epoch": 4.856780735107732,
"grad_norm": 0.6033440232276917,
"learning_rate": 5.394402035623411e-06,
"loss": 0.1435,
"step": 1918
},
{
"epoch": 4.859315589353612,
"grad_norm": 0.7446348071098328,
"learning_rate": 5.29262086513995e-06,
"loss": 0.165,
"step": 1919
},
{
"epoch": 4.861850443599493,
"grad_norm": 0.5380656123161316,
"learning_rate": 5.190839694656488e-06,
"loss": 0.1504,
"step": 1920
},
{
"epoch": 4.864385297845374,
"grad_norm": 0.6752755641937256,
"learning_rate": 5.089058524173028e-06,
"loss": 0.1616,
"step": 1921
},
{
"epoch": 4.866920152091255,
"grad_norm": 0.6897322535514832,
"learning_rate": 4.987277353689568e-06,
"loss": 0.1409,
"step": 1922
},
{
"epoch": 4.869455006337136,
"grad_norm": 0.5405673980712891,
"learning_rate": 4.885496183206107e-06,
"loss": 0.1215,
"step": 1923
},
{
"epoch": 4.8719898605830165,
"grad_norm": 0.6921371221542358,
"learning_rate": 4.7837150127226464e-06,
"loss": 0.1554,
"step": 1924
},
{
"epoch": 4.8745247148288975,
"grad_norm": 0.6672477722167969,
"learning_rate": 4.681933842239187e-06,
"loss": 0.1685,
"step": 1925
},
{
"epoch": 4.8770595690747784,
"grad_norm": 0.5887411236763,
"learning_rate": 4.580152671755725e-06,
"loss": 0.1495,
"step": 1926
},
{
"epoch": 4.879594423320659,
"grad_norm": 0.8119281530380249,
"learning_rate": 4.478371501272264e-06,
"loss": 0.1778,
"step": 1927
},
{
"epoch": 4.8821292775665395,
"grad_norm": 0.6423155665397644,
"learning_rate": 4.3765903307888045e-06,
"loss": 0.1532,
"step": 1928
},
{
"epoch": 4.88466413181242,
"grad_norm": 0.576859712600708,
"learning_rate": 4.274809160305344e-06,
"loss": 0.1474,
"step": 1929
},
{
"epoch": 4.887198986058301,
"grad_norm": 0.668792188167572,
"learning_rate": 4.173027989821883e-06,
"loss": 0.1583,
"step": 1930
},
{
"epoch": 4.889733840304182,
"grad_norm": 0.727428138256073,
"learning_rate": 4.0712468193384225e-06,
"loss": 0.1759,
"step": 1931
},
{
"epoch": 4.892268694550063,
"grad_norm": 0.7260742783546448,
"learning_rate": 3.969465648854962e-06,
"loss": 0.1665,
"step": 1932
},
{
"epoch": 4.894803548795944,
"grad_norm": 0.6192269921302795,
"learning_rate": 3.867684478371501e-06,
"loss": 0.1377,
"step": 1933
},
{
"epoch": 4.897338403041825,
"grad_norm": 0.7672135233879089,
"learning_rate": 3.7659033078880404e-06,
"loss": 0.1696,
"step": 1934
},
{
"epoch": 4.899873257287706,
"grad_norm": 0.5162369012832642,
"learning_rate": 3.6641221374045806e-06,
"loss": 0.1384,
"step": 1935
},
{
"epoch": 4.902408111533587,
"grad_norm": 0.6594913601875305,
"learning_rate": 3.56234096692112e-06,
"loss": 0.1714,
"step": 1936
},
{
"epoch": 4.904942965779467,
"grad_norm": 0.7748851776123047,
"learning_rate": 3.4605597964376588e-06,
"loss": 0.2014,
"step": 1937
},
{
"epoch": 4.907477820025348,
"grad_norm": 0.6400601267814636,
"learning_rate": 3.358778625954199e-06,
"loss": 0.1522,
"step": 1938
},
{
"epoch": 4.910012674271229,
"grad_norm": 0.5443174839019775,
"learning_rate": 3.2569974554707382e-06,
"loss": 0.1276,
"step": 1939
},
{
"epoch": 4.91254752851711,
"grad_norm": 0.6544225811958313,
"learning_rate": 3.155216284987277e-06,
"loss": 0.1441,
"step": 1940
},
{
"epoch": 4.915082382762991,
"grad_norm": 0.6579450368881226,
"learning_rate": 3.053435114503817e-06,
"loss": 0.1688,
"step": 1941
},
{
"epoch": 4.917617237008872,
"grad_norm": 0.594393253326416,
"learning_rate": 2.9516539440203566e-06,
"loss": 0.1586,
"step": 1942
},
{
"epoch": 4.920152091254753,
"grad_norm": 0.6417977213859558,
"learning_rate": 2.849872773536896e-06,
"loss": 0.1389,
"step": 1943
},
{
"epoch": 4.922686945500634,
"grad_norm": 0.5247513055801392,
"learning_rate": 2.748091603053435e-06,
"loss": 0.1282,
"step": 1944
},
{
"epoch": 4.925221799746515,
"grad_norm": 0.6372106075286865,
"learning_rate": 2.646310432569975e-06,
"loss": 0.1391,
"step": 1945
},
{
"epoch": 4.927756653992396,
"grad_norm": 0.5967155694961548,
"learning_rate": 2.544529262086514e-06,
"loss": 0.1358,
"step": 1946
},
{
"epoch": 4.930291508238277,
"grad_norm": 0.6050627827644348,
"learning_rate": 2.4427480916030536e-06,
"loss": 0.1449,
"step": 1947
},
{
"epoch": 4.932826362484157,
"grad_norm": 0.7595526576042175,
"learning_rate": 2.3409669211195933e-06,
"loss": 0.1838,
"step": 1948
},
{
"epoch": 4.935361216730038,
"grad_norm": 0.7220463156700134,
"learning_rate": 2.239185750636132e-06,
"loss": 0.1695,
"step": 1949
},
{
"epoch": 4.937896070975919,
"grad_norm": 0.4891555905342102,
"learning_rate": 2.137404580152672e-06,
"loss": 0.1394,
"step": 1950
},
{
"epoch": 4.9404309252218,
"grad_norm": 0.5262938141822815,
"learning_rate": 2.0356234096692112e-06,
"loss": 0.1452,
"step": 1951
},
{
"epoch": 4.942965779467681,
"grad_norm": 0.7193884253501892,
"learning_rate": 1.9338422391857505e-06,
"loss": 0.176,
"step": 1952
},
{
"epoch": 4.945500633713562,
"grad_norm": 0.7117200493812561,
"learning_rate": 1.8320610687022903e-06,
"loss": 0.1697,
"step": 1953
},
{
"epoch": 4.948035487959443,
"grad_norm": 0.7884610891342163,
"learning_rate": 1.7302798982188294e-06,
"loss": 0.1864,
"step": 1954
},
{
"epoch": 4.9505703422053235,
"grad_norm": 0.8606098890304565,
"learning_rate": 1.6284987277353691e-06,
"loss": 0.1568,
"step": 1955
},
{
"epoch": 4.9531051964512045,
"grad_norm": 0.5030885338783264,
"learning_rate": 1.5267175572519084e-06,
"loss": 0.1306,
"step": 1956
},
{
"epoch": 4.955640050697085,
"grad_norm": 0.5155559182167053,
"learning_rate": 1.424936386768448e-06,
"loss": 0.1311,
"step": 1957
},
{
"epoch": 4.9581749049429655,
"grad_norm": 0.4945980906486511,
"learning_rate": 1.3231552162849875e-06,
"loss": 0.1212,
"step": 1958
},
{
"epoch": 4.9607097591888465,
"grad_norm": 0.79302978515625,
"learning_rate": 1.2213740458015268e-06,
"loss": 0.1763,
"step": 1959
},
{
"epoch": 4.9632446134347274,
"grad_norm": 0.6397921442985535,
"learning_rate": 1.119592875318066e-06,
"loss": 0.1416,
"step": 1960
},
{
"epoch": 4.965779467680608,
"grad_norm": 0.6680799722671509,
"learning_rate": 1.0178117048346056e-06,
"loss": 0.1519,
"step": 1961
},
{
"epoch": 4.968314321926489,
"grad_norm": 0.5919336080551147,
"learning_rate": 9.160305343511451e-07,
"loss": 0.16,
"step": 1962
},
{
"epoch": 4.97084917617237,
"grad_norm": 0.5929127335548401,
"learning_rate": 8.142493638676846e-07,
"loss": 0.143,
"step": 1963
},
{
"epoch": 4.973384030418251,
"grad_norm": 0.5678686499595642,
"learning_rate": 7.12468193384224e-07,
"loss": 0.1236,
"step": 1964
},
{
"epoch": 4.975918884664132,
"grad_norm": 0.5478057861328125,
"learning_rate": 6.106870229007634e-07,
"loss": 0.1407,
"step": 1965
},
{
"epoch": 4.978453738910012,
"grad_norm": 0.6003939509391785,
"learning_rate": 5.089058524173028e-07,
"loss": 0.1315,
"step": 1966
},
{
"epoch": 4.980988593155893,
"grad_norm": 0.5943416357040405,
"learning_rate": 4.071246819338423e-07,
"loss": 0.1451,
"step": 1967
},
{
"epoch": 4.983523447401774,
"grad_norm": 0.5419045090675354,
"learning_rate": 3.053435114503817e-07,
"loss": 0.1338,
"step": 1968
},
{
"epoch": 4.986058301647655,
"grad_norm": 0.5665134787559509,
"learning_rate": 2.0356234096692114e-07,
"loss": 0.1347,
"step": 1969
},
{
"epoch": 4.988593155893536,
"grad_norm": 0.5646002292633057,
"learning_rate": 1.0178117048346057e-07,
"loss": 0.1352,
"step": 1970
}
],
"logging_steps": 1,
"max_steps": 1970,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5558390987853286e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}