9b-90 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
3906029 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 1804,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004434589800443459,
"grad_norm": 4.369466304779053,
"learning_rate": 1.098901098901099e-07,
"loss": 1.865166187286377,
"step": 2
},
{
"epoch": 0.008869179600886918,
"grad_norm": 6.864917278289795,
"learning_rate": 3.296703296703297e-07,
"loss": 2.130244493484497,
"step": 4
},
{
"epoch": 0.013303769401330377,
"grad_norm": 3.4614903926849365,
"learning_rate": 5.494505494505495e-07,
"loss": 1.9028818607330322,
"step": 6
},
{
"epoch": 0.017738359201773836,
"grad_norm": 1.170404076576233,
"learning_rate": 7.692307692307694e-07,
"loss": 1.815584659576416,
"step": 8
},
{
"epoch": 0.022172949002217297,
"grad_norm": 4.44560432434082,
"learning_rate": 9.890109890109891e-07,
"loss": 1.616060733795166,
"step": 10
},
{
"epoch": 0.026607538802660754,
"grad_norm": 3.004225730895996,
"learning_rate": 1.2087912087912089e-06,
"loss": 2.0138046741485596,
"step": 12
},
{
"epoch": 0.031042128603104215,
"grad_norm": 6.603450298309326,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.5741312503814697,
"step": 14
},
{
"epoch": 0.03547671840354767,
"grad_norm": 3.626671552658081,
"learning_rate": 1.6483516483516484e-06,
"loss": 1.1557213068008423,
"step": 16
},
{
"epoch": 0.03991130820399113,
"grad_norm": 1.9218900203704834,
"learning_rate": 1.8681318681318684e-06,
"loss": 1.167083978652954,
"step": 18
},
{
"epoch": 0.04434589800443459,
"grad_norm": 27.988502502441406,
"learning_rate": 2.0879120879120883e-06,
"loss": 1.2604156732559204,
"step": 20
},
{
"epoch": 0.04878048780487805,
"grad_norm": 1.9937968254089355,
"learning_rate": 2.307692307692308e-06,
"loss": 1.300764799118042,
"step": 22
},
{
"epoch": 0.05321507760532151,
"grad_norm": 1.050702691078186,
"learning_rate": 2.5274725274725274e-06,
"loss": 1.5187550783157349,
"step": 24
},
{
"epoch": 0.057649667405764965,
"grad_norm": 0.8804575204849243,
"learning_rate": 2.7472527472527476e-06,
"loss": 1.2839621305465698,
"step": 26
},
{
"epoch": 0.06208425720620843,
"grad_norm": 1.4461461305618286,
"learning_rate": 2.9670329670329673e-06,
"loss": 1.0896079540252686,
"step": 28
},
{
"epoch": 0.06651884700665188,
"grad_norm": 1.1735917329788208,
"learning_rate": 3.1868131868131867e-06,
"loss": 1.2101552486419678,
"step": 30
},
{
"epoch": 0.07095343680709534,
"grad_norm": 1.792823076248169,
"learning_rate": 3.406593406593407e-06,
"loss": 1.1142032146453857,
"step": 32
},
{
"epoch": 0.07538802660753881,
"grad_norm": 0.8939670920372009,
"learning_rate": 3.6263736263736266e-06,
"loss": 1.4834434986114502,
"step": 34
},
{
"epoch": 0.07982261640798226,
"grad_norm": 5.579226970672607,
"learning_rate": 3.846153846153847e-06,
"loss": 1.2127963304519653,
"step": 36
},
{
"epoch": 0.08425720620842572,
"grad_norm": 11.267181396484375,
"learning_rate": 4.065934065934066e-06,
"loss": 1.535402774810791,
"step": 38
},
{
"epoch": 0.08869179600886919,
"grad_norm": 1.9921720027923584,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.7790605425834656,
"step": 40
},
{
"epoch": 0.09312638580931264,
"grad_norm": 1.5248279571533203,
"learning_rate": 4.505494505494506e-06,
"loss": 1.3986105918884277,
"step": 42
},
{
"epoch": 0.0975609756097561,
"grad_norm": 0.9806777238845825,
"learning_rate": 4.725274725274726e-06,
"loss": 1.3404041528701782,
"step": 44
},
{
"epoch": 0.10199556541019955,
"grad_norm": 1.6693533658981323,
"learning_rate": 4.945054945054946e-06,
"loss": 1.3681906461715698,
"step": 46
},
{
"epoch": 0.10643015521064302,
"grad_norm": 2.5410993099212646,
"learning_rate": 5.164835164835166e-06,
"loss": 1.6145933866500854,
"step": 48
},
{
"epoch": 0.11086474501108648,
"grad_norm": 1.414282202720642,
"learning_rate": 5.384615384615385e-06,
"loss": 1.4454115629196167,
"step": 50
},
{
"epoch": 0.11529933481152993,
"grad_norm": 1.7944461107254028,
"learning_rate": 5.604395604395605e-06,
"loss": 1.3703702688217163,
"step": 52
},
{
"epoch": 0.1197339246119734,
"grad_norm": 1.0293529033660889,
"learning_rate": 5.824175824175825e-06,
"loss": 1.368741750717163,
"step": 54
},
{
"epoch": 0.12416851441241686,
"grad_norm": 2.983600616455078,
"learning_rate": 6.043956043956044e-06,
"loss": 1.1135094165802002,
"step": 56
},
{
"epoch": 0.1286031042128603,
"grad_norm": 1.0177669525146484,
"learning_rate": 6.2637362637362645e-06,
"loss": 1.3418025970458984,
"step": 58
},
{
"epoch": 0.13303769401330376,
"grad_norm": 0.7596590518951416,
"learning_rate": 6.483516483516485e-06,
"loss": 1.3871124982833862,
"step": 60
},
{
"epoch": 0.13747228381374724,
"grad_norm": 1.77037513256073,
"learning_rate": 6.703296703296703e-06,
"loss": 1.336590051651001,
"step": 62
},
{
"epoch": 0.1419068736141907,
"grad_norm": 1.6162333488464355,
"learning_rate": 6.923076923076923e-06,
"loss": 1.3942538499832153,
"step": 64
},
{
"epoch": 0.14634146341463414,
"grad_norm": 3.2967028617858887,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.4562139511108398,
"step": 66
},
{
"epoch": 0.15077605321507762,
"grad_norm": 0.9108039140701294,
"learning_rate": 7.362637362637364e-06,
"loss": 1.324294924736023,
"step": 68
},
{
"epoch": 0.15521064301552107,
"grad_norm": 3.4899282455444336,
"learning_rate": 7.582417582417583e-06,
"loss": 1.3148343563079834,
"step": 70
},
{
"epoch": 0.15964523281596452,
"grad_norm": 1.4243505001068115,
"learning_rate": 7.802197802197802e-06,
"loss": 1.3413867950439453,
"step": 72
},
{
"epoch": 0.164079822616408,
"grad_norm": 0.9133158922195435,
"learning_rate": 8.021978021978023e-06,
"loss": 1.2512249946594238,
"step": 74
},
{
"epoch": 0.16851441241685144,
"grad_norm": 1.0660244226455688,
"learning_rate": 8.241758241758243e-06,
"loss": 1.3262264728546143,
"step": 76
},
{
"epoch": 0.1729490022172949,
"grad_norm": 1.9149298667907715,
"learning_rate": 8.461538461538462e-06,
"loss": 1.3764103651046753,
"step": 78
},
{
"epoch": 0.17738359201773837,
"grad_norm": 1.6447206735610962,
"learning_rate": 8.681318681318681e-06,
"loss": 1.3489444255828857,
"step": 80
},
{
"epoch": 0.18181818181818182,
"grad_norm": 1.179309368133545,
"learning_rate": 8.9010989010989e-06,
"loss": 0.8915775418281555,
"step": 82
},
{
"epoch": 0.18625277161862527,
"grad_norm": 1.079959511756897,
"learning_rate": 9.120879120879122e-06,
"loss": 1.0530604124069214,
"step": 84
},
{
"epoch": 0.19068736141906872,
"grad_norm": 1.5468087196350098,
"learning_rate": 9.340659340659341e-06,
"loss": 1.0116887092590332,
"step": 86
},
{
"epoch": 0.1951219512195122,
"grad_norm": 1.0850789546966553,
"learning_rate": 9.560439560439562e-06,
"loss": 1.2681747674942017,
"step": 88
},
{
"epoch": 0.19955654101995565,
"grad_norm": 3.765213966369629,
"learning_rate": 9.780219780219781e-06,
"loss": 1.398827314376831,
"step": 90
},
{
"epoch": 0.2039911308203991,
"grad_norm": 1.0919721126556396,
"learning_rate": 1e-05,
"loss": 1.2438007593154907,
"step": 92
},
{
"epoch": 0.20842572062084258,
"grad_norm": 1.347847819328308,
"learning_rate": 9.99996972898091e-06,
"loss": 1.2643123865127563,
"step": 94
},
{
"epoch": 0.21286031042128603,
"grad_norm": 2.1960842609405518,
"learning_rate": 9.999878916330893e-06,
"loss": 1.8062665462493896,
"step": 96
},
{
"epoch": 0.21729490022172948,
"grad_norm": 1.4442038536071777,
"learning_rate": 9.999727563271727e-06,
"loss": 0.9751254320144653,
"step": 98
},
{
"epoch": 0.22172949002217296,
"grad_norm": 1.2550246715545654,
"learning_rate": 9.999515671839682e-06,
"loss": 1.1938117742538452,
"step": 100
},
{
"epoch": 0.2261640798226164,
"grad_norm": 1.0004876852035522,
"learning_rate": 9.999243244885499e-06,
"loss": 1.3510534763336182,
"step": 102
},
{
"epoch": 0.23059866962305986,
"grad_norm": 1.2936842441558838,
"learning_rate": 9.998910286074355e-06,
"loss": 1.0911916494369507,
"step": 104
},
{
"epoch": 0.23503325942350334,
"grad_norm": 1.3704265356063843,
"learning_rate": 9.998516799885806e-06,
"loss": 1.3209298849105835,
"step": 106
},
{
"epoch": 0.2394678492239468,
"grad_norm": 1.4554086923599243,
"learning_rate": 9.998062791613729e-06,
"loss": 1.513480305671692,
"step": 108
},
{
"epoch": 0.24390243902439024,
"grad_norm": 1.170039176940918,
"learning_rate": 9.997548267366255e-06,
"loss": 1.0687693357467651,
"step": 110
},
{
"epoch": 0.24833702882483372,
"grad_norm": 3.054081678390503,
"learning_rate": 9.996973234065685e-06,
"loss": 1.310200810432434,
"step": 112
},
{
"epoch": 0.25277161862527714,
"grad_norm": 1.4177216291427612,
"learning_rate": 9.996337699448392e-06,
"loss": 0.8176467418670654,
"step": 114
},
{
"epoch": 0.2572062084257206,
"grad_norm": 1.1188538074493408,
"learning_rate": 9.995641672064726e-06,
"loss": 1.3813865184783936,
"step": 116
},
{
"epoch": 0.2616407982261641,
"grad_norm": 0.9215674996376038,
"learning_rate": 9.994885161278885e-06,
"loss": 1.1077316999435425,
"step": 118
},
{
"epoch": 0.2660753880266075,
"grad_norm": 1.6228851079940796,
"learning_rate": 9.994068177268807e-06,
"loss": 1.2811754941940308,
"step": 120
},
{
"epoch": 0.270509977827051,
"grad_norm": 1.9863255023956299,
"learning_rate": 9.993190731026024e-06,
"loss": 1.1968728303909302,
"step": 122
},
{
"epoch": 0.2749445676274945,
"grad_norm": 1.8475158214569092,
"learning_rate": 9.992252834355503e-06,
"loss": 1.5891046524047852,
"step": 124
},
{
"epoch": 0.2793791574279379,
"grad_norm": 1.485600471496582,
"learning_rate": 9.99125449987551e-06,
"loss": 0.9440798163414001,
"step": 126
},
{
"epoch": 0.2838137472283814,
"grad_norm": 1.168931484222412,
"learning_rate": 9.990195741017422e-06,
"loss": 1.21555495262146,
"step": 128
},
{
"epoch": 0.28824833702882485,
"grad_norm": 1.28008234500885,
"learning_rate": 9.989076572025554e-06,
"loss": 0.9523183703422546,
"step": 130
},
{
"epoch": 0.2926829268292683,
"grad_norm": 3.5025172233581543,
"learning_rate": 9.987897007956968e-06,
"loss": 0.9987781047821045,
"step": 132
},
{
"epoch": 0.29711751662971175,
"grad_norm": 2.9309399127960205,
"learning_rate": 9.986657064681267e-06,
"loss": 0.9768642783164978,
"step": 134
},
{
"epoch": 0.30155210643015523,
"grad_norm": 6.351615905761719,
"learning_rate": 9.98535675888038e-06,
"loss": 1.2585439682006836,
"step": 136
},
{
"epoch": 0.30598669623059865,
"grad_norm": 5.213984489440918,
"learning_rate": 9.983996108048345e-06,
"loss": 0.7967538833618164,
"step": 138
},
{
"epoch": 0.31042128603104213,
"grad_norm": 1.3048728704452515,
"learning_rate": 9.982575130491068e-06,
"loss": 1.1146520376205444,
"step": 140
},
{
"epoch": 0.3148558758314856,
"grad_norm": 0.8629128336906433,
"learning_rate": 9.981093845326079e-06,
"loss": 0.9639315009117126,
"step": 142
},
{
"epoch": 0.31929046563192903,
"grad_norm": 1.43756902217865,
"learning_rate": 9.979552272482268e-06,
"loss": 0.9869639873504639,
"step": 144
},
{
"epoch": 0.3237250554323725,
"grad_norm": 2.2892510890960693,
"learning_rate": 9.977950432699629e-06,
"loss": 0.9267846345901489,
"step": 146
},
{
"epoch": 0.328159645232816,
"grad_norm": 2.10282301902771,
"learning_rate": 9.976288347528972e-06,
"loss": 1.4183735847473145,
"step": 148
},
{
"epoch": 0.3325942350332594,
"grad_norm": 2.502537488937378,
"learning_rate": 9.974566039331634e-06,
"loss": 1.1835788488388062,
"step": 150
},
{
"epoch": 0.3370288248337029,
"grad_norm": 2.951907157897949,
"learning_rate": 9.972783531279184e-06,
"loss": 1.0829112529754639,
"step": 152
},
{
"epoch": 0.34146341463414637,
"grad_norm": 1.5924861431121826,
"learning_rate": 9.970940847353103e-06,
"loss": 1.2782995700836182,
"step": 154
},
{
"epoch": 0.3458980044345898,
"grad_norm": 3.426605701446533,
"learning_rate": 9.969038012344465e-06,
"loss": 1.2216734886169434,
"step": 156
},
{
"epoch": 0.35033259423503327,
"grad_norm": 2.055274724960327,
"learning_rate": 9.967075051853609e-06,
"loss": 1.3016668558120728,
"step": 158
},
{
"epoch": 0.35476718403547675,
"grad_norm": 0.5714741945266724,
"learning_rate": 9.965051992289782e-06,
"loss": 1.135823130607605,
"step": 160
},
{
"epoch": 0.35920177383592017,
"grad_norm": 4.934542655944824,
"learning_rate": 9.962968860870798e-06,
"loss": 0.8752337098121643,
"step": 162
},
{
"epoch": 0.36363636363636365,
"grad_norm": 2.104079008102417,
"learning_rate": 9.96082568562266e-06,
"loss": 0.904317319393158,
"step": 164
},
{
"epoch": 0.36807095343680707,
"grad_norm": 1.8156555891036987,
"learning_rate": 9.958622495379193e-06,
"loss": 1.284702181816101,
"step": 166
},
{
"epoch": 0.37250554323725055,
"grad_norm": 1.5921517610549927,
"learning_rate": 9.956359319781642e-06,
"loss": 1.2042418718338013,
"step": 168
},
{
"epoch": 0.376940133037694,
"grad_norm": 1.4465107917785645,
"learning_rate": 9.954036189278292e-06,
"loss": 1.2029085159301758,
"step": 170
},
{
"epoch": 0.38137472283813745,
"grad_norm": 2.0260727405548096,
"learning_rate": 9.951653135124045e-06,
"loss": 0.7804557681083679,
"step": 172
},
{
"epoch": 0.3858093126385809,
"grad_norm": 3.316241979598999,
"learning_rate": 9.94921018938e-06,
"loss": 1.6452889442443848,
"step": 174
},
{
"epoch": 0.3902439024390244,
"grad_norm": 1.2219425439834595,
"learning_rate": 9.946707384913027e-06,
"loss": 1.2721954584121704,
"step": 176
},
{
"epoch": 0.3946784922394678,
"grad_norm": 1.0764427185058594,
"learning_rate": 9.944144755395321e-06,
"loss": 1.325971245765686,
"step": 178
},
{
"epoch": 0.3991130820399113,
"grad_norm": 0.585249662399292,
"learning_rate": 9.941522335303955e-06,
"loss": 1.0814615488052368,
"step": 180
},
{
"epoch": 0.4035476718403548,
"grad_norm": 1.7605838775634766,
"learning_rate": 9.938840159920406e-06,
"loss": 1.2727550268173218,
"step": 182
},
{
"epoch": 0.4079822616407982,
"grad_norm": 0.8892675638198853,
"learning_rate": 9.93609826533009e-06,
"loss": 0.8603274822235107,
"step": 184
},
{
"epoch": 0.4124168514412417,
"grad_norm": 1.1300294399261475,
"learning_rate": 9.933296688421872e-06,
"loss": 0.986240565776825,
"step": 186
},
{
"epoch": 0.41685144124168516,
"grad_norm": 1.141814947128296,
"learning_rate": 9.930435466887564e-06,
"loss": 0.99045330286026,
"step": 188
},
{
"epoch": 0.4212860310421286,
"grad_norm": 2.3094711303710938,
"learning_rate": 9.927514639221433e-06,
"loss": 1.0205762386322021,
"step": 190
},
{
"epoch": 0.42572062084257206,
"grad_norm": 3.4740211963653564,
"learning_rate": 9.92453424471967e-06,
"loss": 0.8767862319946289,
"step": 192
},
{
"epoch": 0.43015521064301554,
"grad_norm": 1.3851454257965088,
"learning_rate": 9.921494323479862e-06,
"loss": 1.306305170059204,
"step": 194
},
{
"epoch": 0.43458980044345896,
"grad_norm": 2.2007641792297363,
"learning_rate": 9.918394916400465e-06,
"loss": 1.5621771812438965,
"step": 196
},
{
"epoch": 0.43902439024390244,
"grad_norm": 4.152230262756348,
"learning_rate": 9.915236065180235e-06,
"loss": 1.2867047786712646,
"step": 198
},
{
"epoch": 0.4434589800443459,
"grad_norm": 0.7970097064971924,
"learning_rate": 9.912017812317684e-06,
"loss": 1.1508140563964844,
"step": 200
},
{
"epoch": 0.44789356984478934,
"grad_norm": 1.0820010900497437,
"learning_rate": 9.908740201110497e-06,
"loss": 1.2738271951675415,
"step": 202
},
{
"epoch": 0.4523281596452328,
"grad_norm": 0.833768367767334,
"learning_rate": 9.905403275654951e-06,
"loss": 1.229623556137085,
"step": 204
},
{
"epoch": 0.4567627494456763,
"grad_norm": 1.3860057592391968,
"learning_rate": 9.902007080845336e-06,
"loss": 1.0840635299682617,
"step": 206
},
{
"epoch": 0.4611973392461197,
"grad_norm": 1.5478756427764893,
"learning_rate": 9.898551662373325e-06,
"loss": 1.0657180547714233,
"step": 208
},
{
"epoch": 0.4656319290465632,
"grad_norm": 1.0406877994537354,
"learning_rate": 9.895037066727382e-06,
"loss": 0.7027549743652344,
"step": 210
},
{
"epoch": 0.4700665188470067,
"grad_norm": 4.616578578948975,
"learning_rate": 9.891463341192124e-06,
"loss": 0.8683266043663025,
"step": 212
},
{
"epoch": 0.4745011086474501,
"grad_norm": 0.9766786098480225,
"learning_rate": 9.88783053384769e-06,
"loss": 1.1040089130401611,
"step": 214
},
{
"epoch": 0.4789356984478936,
"grad_norm": 1.0400160551071167,
"learning_rate": 9.884138693569095e-06,
"loss": 1.1147682666778564,
"step": 216
},
{
"epoch": 0.48337028824833705,
"grad_norm": 1.1635631322860718,
"learning_rate": 9.88038787002557e-06,
"loss": 1.1477664709091187,
"step": 218
},
{
"epoch": 0.4878048780487805,
"grad_norm": 3.343935489654541,
"learning_rate": 9.876578113679891e-06,
"loss": 1.24171781539917,
"step": 220
},
{
"epoch": 0.49223946784922396,
"grad_norm": 1.407214879989624,
"learning_rate": 9.872709475787708e-06,
"loss": 1.2408087253570557,
"step": 222
},
{
"epoch": 0.49667405764966743,
"grad_norm": 0.8646455407142639,
"learning_rate": 9.868782008396848e-06,
"loss": 1.2195643186569214,
"step": 224
},
{
"epoch": 0.5011086474501109,
"grad_norm": 1.0053114891052246,
"learning_rate": 9.864795764346615e-06,
"loss": 1.2099742889404297,
"step": 226
},
{
"epoch": 0.5055432372505543,
"grad_norm": 1.0781813859939575,
"learning_rate": 9.860750797267085e-06,
"loss": 1.2584586143493652,
"step": 228
},
{
"epoch": 0.5099778270509978,
"grad_norm": 2.1706690788269043,
"learning_rate": 9.856647161578384e-06,
"loss": 1.8491344451904297,
"step": 230
},
{
"epoch": 0.5144124168514412,
"grad_norm": 1.5501409769058228,
"learning_rate": 9.852484912489946e-06,
"loss": 0.8673834204673767,
"step": 232
},
{
"epoch": 0.5188470066518847,
"grad_norm": 1.1405872106552124,
"learning_rate": 9.848264105999783e-06,
"loss": 1.287527322769165,
"step": 234
},
{
"epoch": 0.5232815964523282,
"grad_norm": 1.5920919179916382,
"learning_rate": 9.843984798893722e-06,
"loss": 0.9661246538162231,
"step": 236
},
{
"epoch": 0.5277161862527716,
"grad_norm": 1.4155439138412476,
"learning_rate": 9.839647048744645e-06,
"loss": 1.0282375812530518,
"step": 238
},
{
"epoch": 0.532150776053215,
"grad_norm": 1.3770828247070312,
"learning_rate": 9.83525091391172e-06,
"loss": 1.2707240581512451,
"step": 240
},
{
"epoch": 0.5365853658536586,
"grad_norm": 1.1959309577941895,
"learning_rate": 9.8307964535396e-06,
"loss": 1.3354653120040894,
"step": 242
},
{
"epoch": 0.541019955654102,
"grad_norm": 5.0889363288879395,
"learning_rate": 9.826283727557644e-06,
"loss": 0.9864997863769531,
"step": 244
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.7636584639549255,
"learning_rate": 9.821712796679106e-06,
"loss": 1.2685621976852417,
"step": 246
},
{
"epoch": 0.549889135254989,
"grad_norm": 1.781443476676941,
"learning_rate": 9.817083722400309e-06,
"loss": 1.4938619136810303,
"step": 248
},
{
"epoch": 0.5543237250554324,
"grad_norm": 2.2715258598327637,
"learning_rate": 9.812396566999832e-06,
"loss": 1.2405654191970825,
"step": 250
},
{
"epoch": 0.5587583148558758,
"grad_norm": 30.650293350219727,
"learning_rate": 9.807651393537659e-06,
"loss": 0.6915596127510071,
"step": 252
},
{
"epoch": 0.5631929046563193,
"grad_norm": 4.594031810760498,
"learning_rate": 9.802848265854343e-06,
"loss": 1.4810152053833008,
"step": 254
},
{
"epoch": 0.5676274944567627,
"grad_norm": 8.177288055419922,
"learning_rate": 9.797987248570137e-06,
"loss": 1.3323289155960083,
"step": 256
},
{
"epoch": 0.5720620842572062,
"grad_norm": 1.162073016166687,
"learning_rate": 9.793068407084125e-06,
"loss": 1.2933639287948608,
"step": 258
},
{
"epoch": 0.5764966740576497,
"grad_norm": 1.2364484071731567,
"learning_rate": 9.78809180757335e-06,
"loss": 1.2908434867858887,
"step": 260
},
{
"epoch": 0.5809312638580931,
"grad_norm": 2.779216766357422,
"learning_rate": 9.783057516991921e-06,
"loss": 0.7582840919494629,
"step": 262
},
{
"epoch": 0.5853658536585366,
"grad_norm": 1.085245966911316,
"learning_rate": 9.777965603070106e-06,
"loss": 1.3473342657089233,
"step": 264
},
{
"epoch": 0.5898004434589801,
"grad_norm": 1.5546146631240845,
"learning_rate": 9.772816134313424e-06,
"loss": 1.0741627216339111,
"step": 266
},
{
"epoch": 0.5942350332594235,
"grad_norm": 1.2348523139953613,
"learning_rate": 9.76760918000173e-06,
"loss": 1.5512841939926147,
"step": 268
},
{
"epoch": 0.5986696230598669,
"grad_norm": 2.3876912593841553,
"learning_rate": 9.762344810188276e-06,
"loss": 1.1929247379302979,
"step": 270
},
{
"epoch": 0.6031042128603105,
"grad_norm": 1.362686038017273,
"learning_rate": 9.757023095698766e-06,
"loss": 1.2366266250610352,
"step": 272
},
{
"epoch": 0.6075388026607539,
"grad_norm": 1.314932107925415,
"learning_rate": 9.751644108130405e-06,
"loss": 1.230374813079834,
"step": 274
},
{
"epoch": 0.6119733924611973,
"grad_norm": 1.0009852647781372,
"learning_rate": 9.746207919850951e-06,
"loss": 1.230873942375183,
"step": 276
},
{
"epoch": 0.6164079822616408,
"grad_norm": 2.000821590423584,
"learning_rate": 9.740714603997712e-06,
"loss": 1.241438627243042,
"step": 278
},
{
"epoch": 0.6208425720620843,
"grad_norm": 1.346825122833252,
"learning_rate": 9.735164234476588e-06,
"loss": 1.3042198419570923,
"step": 280
},
{
"epoch": 0.6252771618625277,
"grad_norm": 1.8797986507415771,
"learning_rate": 9.729556885961064e-06,
"loss": 0.9092460870742798,
"step": 282
},
{
"epoch": 0.6297117516629712,
"grad_norm": 0.9287862777709961,
"learning_rate": 9.72389263389121e-06,
"loss": 1.269888162612915,
"step": 284
},
{
"epoch": 0.6341463414634146,
"grad_norm": 1.3805606365203857,
"learning_rate": 9.718171554472662e-06,
"loss": 1.3446077108383179,
"step": 286
},
{
"epoch": 0.6385809312638581,
"grad_norm": 1.057990312576294,
"learning_rate": 9.712393724675597e-06,
"loss": 1.259419322013855,
"step": 288
},
{
"epoch": 0.6430155210643016,
"grad_norm": 1.940826654434204,
"learning_rate": 9.706559222233704e-06,
"loss": 1.2493295669555664,
"step": 290
},
{
"epoch": 0.647450110864745,
"grad_norm": 1.3292601108551025,
"learning_rate": 9.700668125643132e-06,
"loss": 1.3664789199829102,
"step": 292
},
{
"epoch": 0.6518847006651884,
"grad_norm": 1.5756752490997314,
"learning_rate": 9.694720514161437e-06,
"loss": 0.9199124574661255,
"step": 294
},
{
"epoch": 0.656319290465632,
"grad_norm": 0.7564427256584167,
"learning_rate": 9.688716467806508e-06,
"loss": 1.0202033519744873,
"step": 296
},
{
"epoch": 0.6607538802660754,
"grad_norm": 0.8535528182983398,
"learning_rate": 9.682656067355505e-06,
"loss": 1.2138911485671997,
"step": 298
},
{
"epoch": 0.6651884700665188,
"grad_norm": 1.1474356651306152,
"learning_rate": 9.67653939434376e-06,
"loss": 1.2619802951812744,
"step": 300
},
{
"epoch": 0.6696230598669624,
"grad_norm": 1.2519088983535767,
"learning_rate": 9.670366531063686e-06,
"loss": 1.2084006071090698,
"step": 302
},
{
"epoch": 0.6740576496674058,
"grad_norm": 1.4693647623062134,
"learning_rate": 9.664137560563663e-06,
"loss": 1.2800395488739014,
"step": 304
},
{
"epoch": 0.6784922394678492,
"grad_norm": 0.9270733594894409,
"learning_rate": 9.657852566646929e-06,
"loss": 1.2464381456375122,
"step": 306
},
{
"epoch": 0.6829268292682927,
"grad_norm": 0.7677431106567383,
"learning_rate": 9.651511633870451e-06,
"loss": 0.8917385339736938,
"step": 308
},
{
"epoch": 0.6873614190687362,
"grad_norm": 0.7328879833221436,
"learning_rate": 9.645114847543781e-06,
"loss": 1.248028039932251,
"step": 310
},
{
"epoch": 0.6917960088691796,
"grad_norm": 0.6110285520553589,
"learning_rate": 9.638662293727916e-06,
"loss": 1.2161647081375122,
"step": 312
},
{
"epoch": 0.6962305986696231,
"grad_norm": 0.7204345464706421,
"learning_rate": 9.632154059234137e-06,
"loss": 1.2376904487609863,
"step": 314
},
{
"epoch": 0.7006651884700665,
"grad_norm": 0.8166297078132629,
"learning_rate": 9.625590231622837e-06,
"loss": 1.3569447994232178,
"step": 316
},
{
"epoch": 0.70509977827051,
"grad_norm": 0.9921203255653381,
"learning_rate": 9.618970899202354e-06,
"loss": 1.0280705690383911,
"step": 318
},
{
"epoch": 0.7095343680709535,
"grad_norm": 2.89858078956604,
"learning_rate": 9.612296151027765e-06,
"loss": 0.9978334903717041,
"step": 320
},
{
"epoch": 0.7139689578713969,
"grad_norm": 1.732686996459961,
"learning_rate": 9.605566076899714e-06,
"loss": 0.9863637685775757,
"step": 322
},
{
"epoch": 0.7184035476718403,
"grad_norm": 9.696839332580566,
"learning_rate": 9.598780767363174e-06,
"loss": 1.1041550636291504,
"step": 324
},
{
"epoch": 0.7228381374722838,
"grad_norm": 2.4485864639282227,
"learning_rate": 9.591940313706248e-06,
"loss": 1.1134647130966187,
"step": 326
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.39739716053009033,
"learning_rate": 9.585044807958942e-06,
"loss": 0.8426170349121094,
"step": 328
},
{
"epoch": 0.7317073170731707,
"grad_norm": 1.39254629611969,
"learning_rate": 9.578094342891915e-06,
"loss": 0.8422537446022034,
"step": 330
},
{
"epoch": 0.7361419068736141,
"grad_norm": 1.350459337234497,
"learning_rate": 9.571089012015237e-06,
"loss": 1.4164326190948486,
"step": 332
},
{
"epoch": 0.7405764966740577,
"grad_norm": 1.751500129699707,
"learning_rate": 9.564028909577132e-06,
"loss": 1.2159979343414307,
"step": 334
},
{
"epoch": 0.7450110864745011,
"grad_norm": 1.3023916482925415,
"learning_rate": 9.55691413056271e-06,
"loss": 1.2543408870697021,
"step": 336
},
{
"epoch": 0.7494456762749445,
"grad_norm": 2.021757125854492,
"learning_rate": 9.54974477069269e-06,
"loss": 0.7136483788490295,
"step": 338
},
{
"epoch": 0.753880266075388,
"grad_norm": 2.4734063148498535,
"learning_rate": 9.542520926422105e-06,
"loss": 0.3343808054924011,
"step": 340
},
{
"epoch": 0.7583148558758315,
"grad_norm": 4.090439796447754,
"learning_rate": 9.535242694939011e-06,
"loss": 0.8977051973342896,
"step": 342
},
{
"epoch": 0.7627494456762749,
"grad_norm": 1.0740253925323486,
"learning_rate": 9.527910174163179e-06,
"loss": 1.317376732826233,
"step": 344
},
{
"epoch": 0.7671840354767184,
"grad_norm": 0.9744492173194885,
"learning_rate": 9.520523462744776e-06,
"loss": 1.249900221824646,
"step": 346
},
{
"epoch": 0.7716186252771619,
"grad_norm": 1.6449717283248901,
"learning_rate": 9.51308266006304e-06,
"loss": 1.1062456369400024,
"step": 348
},
{
"epoch": 0.7760532150776053,
"grad_norm": 1.0315587520599365,
"learning_rate": 9.505587866224939e-06,
"loss": 1.2459303140640259,
"step": 350
},
{
"epoch": 0.7804878048780488,
"grad_norm": 1.7614156007766724,
"learning_rate": 9.498039182063828e-06,
"loss": 1.1719058752059937,
"step": 352
},
{
"epoch": 0.7849223946784922,
"grad_norm": 3.0436220169067383,
"learning_rate": 9.49043670913809e-06,
"loss": 1.0381566286087036,
"step": 354
},
{
"epoch": 0.7893569844789357,
"grad_norm": 6.369256019592285,
"learning_rate": 9.48278054972977e-06,
"loss": 1.2438908815383911,
"step": 356
},
{
"epoch": 0.7937915742793792,
"grad_norm": 3.2349908351898193,
"learning_rate": 9.475070806843202e-06,
"loss": 1.323697566986084,
"step": 358
},
{
"epoch": 0.7982261640798226,
"grad_norm": 1.123029351234436,
"learning_rate": 9.467307584203619e-06,
"loss": 1.2511515617370605,
"step": 360
},
{
"epoch": 0.802660753880266,
"grad_norm": 1.8244556188583374,
"learning_rate": 9.459490986255756e-06,
"loss": 0.6574705839157104,
"step": 362
},
{
"epoch": 0.8070953436807096,
"grad_norm": 1.0850750207901,
"learning_rate": 9.451621118162453e-06,
"loss": 1.3474435806274414,
"step": 364
},
{
"epoch": 0.811529933481153,
"grad_norm": 1.0580451488494873,
"learning_rate": 9.443698085803235e-06,
"loss": 1.2190296649932861,
"step": 366
},
{
"epoch": 0.8159645232815964,
"grad_norm": 1.2732537984848022,
"learning_rate": 9.435721995772884e-06,
"loss": 1.001318335533142,
"step": 368
},
{
"epoch": 0.8203991130820399,
"grad_norm": 3.1396641731262207,
"learning_rate": 9.42769295538001e-06,
"loss": 0.8487209677696228,
"step": 370
},
{
"epoch": 0.8248337028824834,
"grad_norm": 1.9196490049362183,
"learning_rate": 9.419611072645608e-06,
"loss": 1.2449017763137817,
"step": 372
},
{
"epoch": 0.8292682926829268,
"grad_norm": 0.878728449344635,
"learning_rate": 9.4114764563016e-06,
"loss": 1.2437989711761475,
"step": 374
},
{
"epoch": 0.8337028824833703,
"grad_norm": 1.0601072311401367,
"learning_rate": 9.403289215789373e-06,
"loss": 1.205723524093628,
"step": 376
},
{
"epoch": 0.8381374722838137,
"grad_norm": 6.103841304779053,
"learning_rate": 9.395049461258318e-06,
"loss": 1.246256709098816,
"step": 378
},
{
"epoch": 0.8425720620842572,
"grad_norm": 2.8056368827819824,
"learning_rate": 9.386757303564323e-06,
"loss": 0.7605912089347839,
"step": 380
},
{
"epoch": 0.8470066518847007,
"grad_norm": 1.2943381071090698,
"learning_rate": 9.37841285426831e-06,
"loss": 1.3198047876358032,
"step": 382
},
{
"epoch": 0.8514412416851441,
"grad_norm": 1.5381852388381958,
"learning_rate": 9.370016225634719e-06,
"loss": 1.1507357358932495,
"step": 384
},
{
"epoch": 0.8558758314855875,
"grad_norm": 4.027266025543213,
"learning_rate": 9.361567530629988e-06,
"loss": 1.253443956375122,
"step": 386
},
{
"epoch": 0.8603104212860311,
"grad_norm": 0.5111842155456543,
"learning_rate": 9.353066882921063e-06,
"loss": 1.1759639978408813,
"step": 388
},
{
"epoch": 0.8647450110864745,
"grad_norm": 0.8026149868965149,
"learning_rate": 9.344514396873837e-06,
"loss": 1.2608635425567627,
"step": 390
},
{
"epoch": 0.8691796008869179,
"grad_norm": 1.007558822631836,
"learning_rate": 9.335910187551628e-06,
"loss": 0.9073533415794373,
"step": 392
},
{
"epoch": 0.8736141906873615,
"grad_norm": 1.0619909763336182,
"learning_rate": 9.327254370713636e-06,
"loss": 1.2199063301086426,
"step": 394
},
{
"epoch": 0.8780487804878049,
"grad_norm": 0.797447681427002,
"learning_rate": 9.31854706281336e-06,
"loss": 1.2162882089614868,
"step": 396
},
{
"epoch": 0.8824833702882483,
"grad_norm": 1.5606045722961426,
"learning_rate": 9.309788380997069e-06,
"loss": 1.208269476890564,
"step": 398
},
{
"epoch": 0.8869179600886918,
"grad_norm": 0.7051414847373962,
"learning_rate": 9.30097844310219e-06,
"loss": 0.5201588869094849,
"step": 400
},
{
"epoch": 0.8913525498891353,
"grad_norm": 1.7164850234985352,
"learning_rate": 9.292117367655749e-06,
"loss": 1.0428240299224854,
"step": 402
},
{
"epoch": 0.8957871396895787,
"grad_norm": 0.5142672657966614,
"learning_rate": 9.283205273872757e-06,
"loss": 0.9249513745307922,
"step": 404
},
{
"epoch": 0.9002217294900222,
"grad_norm": 2.530928373336792,
"learning_rate": 9.274242281654621e-06,
"loss": 1.2720247507095337,
"step": 406
},
{
"epoch": 0.9046563192904656,
"grad_norm": 2.1848092079162598,
"learning_rate": 9.265228511587525e-06,
"loss": 1.130611538887024,
"step": 408
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.4184147119522095,
"learning_rate": 9.2561640849408e-06,
"loss": 1.1104357242584229,
"step": 410
},
{
"epoch": 0.9135254988913526,
"grad_norm": 2.9592013359069824,
"learning_rate": 9.247049123665306e-06,
"loss": 1.2363438606262207,
"step": 412
},
{
"epoch": 0.917960088691796,
"grad_norm": 1.4257206916809082,
"learning_rate": 9.237883750391786e-06,
"loss": 1.2583258152008057,
"step": 414
},
{
"epoch": 0.9223946784922394,
"grad_norm": 4.5751776695251465,
"learning_rate": 9.228668088429212e-06,
"loss": 0.9869507551193237,
"step": 416
},
{
"epoch": 0.926829268292683,
"grad_norm": 0.9538375735282898,
"learning_rate": 9.219402261763129e-06,
"loss": 1.2694066762924194,
"step": 418
},
{
"epoch": 0.9312638580931264,
"grad_norm": 1.5408073663711548,
"learning_rate": 9.210086395053992e-06,
"loss": 0.7773984670639038,
"step": 420
},
{
"epoch": 0.9356984478935698,
"grad_norm": 3.4732754230499268,
"learning_rate": 9.200720613635476e-06,
"loss": 1.4664820432662964,
"step": 422
},
{
"epoch": 0.9401330376940134,
"grad_norm": 2.461186408996582,
"learning_rate": 9.191305043512806e-06,
"loss": 1.0693186521530151,
"step": 424
},
{
"epoch": 0.9445676274944568,
"grad_norm": 3.3022894859313965,
"learning_rate": 9.181839811361048e-06,
"loss": 1.2006292343139648,
"step": 426
},
{
"epoch": 0.9490022172949002,
"grad_norm": 0.8173018097877502,
"learning_rate": 9.172325044523413e-06,
"loss": 1.100250005722046,
"step": 428
},
{
"epoch": 0.9534368070953437,
"grad_norm": 1.222411870956421,
"learning_rate": 9.16276087100954e-06,
"loss": 0.6654449701309204,
"step": 430
},
{
"epoch": 0.9578713968957872,
"grad_norm": 1.2987111806869507,
"learning_rate": 9.153147419493774e-06,
"loss": 1.2298423051834106,
"step": 432
},
{
"epoch": 0.9623059866962306,
"grad_norm": 2.3424274921417236,
"learning_rate": 9.143484819313441e-06,
"loss": 1.1017597913742065,
"step": 434
},
{
"epoch": 0.9667405764966741,
"grad_norm": 1.5677627325057983,
"learning_rate": 9.133773200467095e-06,
"loss": 1.2761000394821167,
"step": 436
},
{
"epoch": 0.9711751662971175,
"grad_norm": 1.3605282306671143,
"learning_rate": 9.12401269361278e-06,
"loss": 1.2378090620040894,
"step": 438
},
{
"epoch": 0.975609756097561,
"grad_norm": 6.334904193878174,
"learning_rate": 9.114203430066273e-06,
"loss": 0.9209675788879395,
"step": 440
},
{
"epoch": 0.9800443458980045,
"grad_norm": 1.7632060050964355,
"learning_rate": 9.104345541799304e-06,
"loss": 1.0448412895202637,
"step": 442
},
{
"epoch": 0.9844789356984479,
"grad_norm": 1.790408730506897,
"learning_rate": 9.094439161437797e-06,
"loss": 1.2859784364700317,
"step": 444
},
{
"epoch": 0.9889135254988913,
"grad_norm": 1.3654117584228516,
"learning_rate": 9.084484422260079e-06,
"loss": 1.2079429626464844,
"step": 446
},
{
"epoch": 0.9933481152993349,
"grad_norm": 1.1531587839126587,
"learning_rate": 9.074481458195077e-06,
"loss": 1.2449597120285034,
"step": 448
},
{
"epoch": 0.9977827050997783,
"grad_norm": 0.8101630806922913,
"learning_rate": 9.064430403820538e-06,
"loss": 1.0602502822875977,
"step": 450
},
{
"epoch": 1.0022172949002217,
"grad_norm": 3.6886227130889893,
"learning_rate": 9.054331394361195e-06,
"loss": 1.0475915670394897,
"step": 452
},
{
"epoch": 1.0066518847006651,
"grad_norm": 1.2973512411117554,
"learning_rate": 9.044184565686963e-06,
"loss": 1.2166190147399902,
"step": 454
},
{
"epoch": 1.0110864745011086,
"grad_norm": 1.0704854726791382,
"learning_rate": 9.033990054311108e-06,
"loss": 1.214512586593628,
"step": 456
},
{
"epoch": 1.0155210643015522,
"grad_norm": 5.711384296417236,
"learning_rate": 9.023747997388409e-06,
"loss": 0.8170270919799805,
"step": 458
},
{
"epoch": 1.0199556541019956,
"grad_norm": 0.5732322931289673,
"learning_rate": 9.013458532713303e-06,
"loss": 0.7988295555114746,
"step": 460
},
{
"epoch": 1.024390243902439,
"grad_norm": 1.6274175643920898,
"learning_rate": 9.003121798718055e-06,
"loss": 0.520905077457428,
"step": 462
},
{
"epoch": 1.0288248337028825,
"grad_norm": 1.794394612312317,
"learning_rate": 8.992737934470875e-06,
"loss": 0.7766972184181213,
"step": 464
},
{
"epoch": 1.033259423503326,
"grad_norm": 1.3309742212295532,
"learning_rate": 8.982307079674051e-06,
"loss": 0.6682877540588379,
"step": 466
},
{
"epoch": 1.0376940133037693,
"grad_norm": 1.3699984550476074,
"learning_rate": 8.971829374662075e-06,
"loss": 0.9524274468421936,
"step": 468
},
{
"epoch": 1.042128603104213,
"grad_norm": 1.1349494457244873,
"learning_rate": 8.961304960399746e-06,
"loss": 0.610037624835968,
"step": 470
},
{
"epoch": 1.0465631929046564,
"grad_norm": 1.1969398260116577,
"learning_rate": 8.950733978480295e-06,
"loss": 0.9021300077438354,
"step": 472
},
{
"epoch": 1.0509977827050998,
"grad_norm": 1.6304041147232056,
"learning_rate": 8.940116571123442e-06,
"loss": 1.0323752164840698,
"step": 474
},
{
"epoch": 1.0554323725055432,
"grad_norm": 1.383133888244629,
"learning_rate": 8.929452881173522e-06,
"loss": 1.06901216506958,
"step": 476
},
{
"epoch": 1.0598669623059866,
"grad_norm": 1.9538545608520508,
"learning_rate": 8.91874305209754e-06,
"loss": 1.087662696838379,
"step": 478
},
{
"epoch": 1.06430155210643,
"grad_norm": 0.9558976888656616,
"learning_rate": 8.907987227983244e-06,
"loss": 0.628630518913269,
"step": 480
},
{
"epoch": 1.0687361419068737,
"grad_norm": 0.9924697875976562,
"learning_rate": 8.897185553537199e-06,
"loss": 1.079522728919983,
"step": 482
},
{
"epoch": 1.0731707317073171,
"grad_norm": 0.4461686313152313,
"learning_rate": 8.886338174082818e-06,
"loss": 0.9315227270126343,
"step": 484
},
{
"epoch": 1.0776053215077606,
"grad_norm": 0.8930976986885071,
"learning_rate": 8.875445235558429e-06,
"loss": 1.030774474143982,
"step": 486
},
{
"epoch": 1.082039911308204,
"grad_norm": 0.9663419723510742,
"learning_rate": 8.864506884515298e-06,
"loss": 0.775393009185791,
"step": 488
},
{
"epoch": 1.0864745011086474,
"grad_norm": 1.8761645555496216,
"learning_rate": 8.853523268115662e-06,
"loss": 0.878838062286377,
"step": 490
},
{
"epoch": 1.0909090909090908,
"grad_norm": 1.990370512008667,
"learning_rate": 8.84249453413075e-06,
"loss": 1.1431140899658203,
"step": 492
},
{
"epoch": 1.0953436807095343,
"grad_norm": 4.04485559463501,
"learning_rate": 8.831420830938787e-06,
"loss": 1.0441185235977173,
"step": 494
},
{
"epoch": 1.099778270509978,
"grad_norm": 3.171431303024292,
"learning_rate": 8.820302307523012e-06,
"loss": 0.8327910304069519,
"step": 496
},
{
"epoch": 1.1042128603104213,
"grad_norm": 0.8679511547088623,
"learning_rate": 8.809139113469664e-06,
"loss": 0.9644469618797302,
"step": 498
},
{
"epoch": 1.1086474501108647,
"grad_norm": 1.1949224472045898,
"learning_rate": 8.797931398965968e-06,
"loss": 0.6914905905723572,
"step": 500
},
{
"epoch": 1.1130820399113082,
"grad_norm": 1.3626770973205566,
"learning_rate": 8.78667931479812e-06,
"loss": 1.0657753944396973,
"step": 502
},
{
"epoch": 1.1175166297117516,
"grad_norm": 1.2040343284606934,
"learning_rate": 8.775383012349255e-06,
"loss": 0.8900744915008545,
"step": 504
},
{
"epoch": 1.1219512195121952,
"grad_norm": 2.972325325012207,
"learning_rate": 8.764042643597413e-06,
"loss": 0.9030492305755615,
"step": 506
},
{
"epoch": 1.1263858093126387,
"grad_norm": 2.5267043113708496,
"learning_rate": 8.75265836111349e-06,
"loss": 0.8307876586914062,
"step": 508
},
{
"epoch": 1.130820399113082,
"grad_norm": 1.421155333518982,
"learning_rate": 8.741230318059188e-06,
"loss": 1.1410354375839233,
"step": 510
},
{
"epoch": 1.1352549889135255,
"grad_norm": 1.9269262552261353,
"learning_rate": 8.72975866818496e-06,
"loss": 0.5328395962715149,
"step": 512
},
{
"epoch": 1.139689578713969,
"grad_norm": 1.2638076543807983,
"learning_rate": 8.718243565827927e-06,
"loss": 0.44444531202316284,
"step": 514
},
{
"epoch": 1.1441241685144123,
"grad_norm": 1.3325319290161133,
"learning_rate": 8.706685165909817e-06,
"loss": 0.9025118350982666,
"step": 516
},
{
"epoch": 1.1485587583148558,
"grad_norm": 1.31704843044281,
"learning_rate": 8.695083623934872e-06,
"loss": 1.0184192657470703,
"step": 518
},
{
"epoch": 1.1529933481152994,
"grad_norm": 0.9442891478538513,
"learning_rate": 8.683439095987758e-06,
"loss": 1.075165033340454,
"step": 520
},
{
"epoch": 1.1574279379157428,
"grad_norm": 2.553899049758911,
"learning_rate": 8.671751738731464e-06,
"loss": 0.545891284942627,
"step": 522
},
{
"epoch": 1.1618625277161863,
"grad_norm": 1.0405535697937012,
"learning_rate": 8.660021709405197e-06,
"loss": 0.9202799201011658,
"step": 524
},
{
"epoch": 1.1662971175166297,
"grad_norm": 1.757952332496643,
"learning_rate": 8.648249165822265e-06,
"loss": 1.147822380065918,
"step": 526
},
{
"epoch": 1.170731707317073,
"grad_norm": 0.8222943544387817,
"learning_rate": 8.636434266367956e-06,
"loss": 0.6507425904273987,
"step": 528
},
{
"epoch": 1.1751662971175167,
"grad_norm": 0.7118900418281555,
"learning_rate": 8.624577169997394e-06,
"loss": 0.8483346104621887,
"step": 530
},
{
"epoch": 1.1796008869179602,
"grad_norm": 0.7820613384246826,
"learning_rate": 8.612678036233428e-06,
"loss": 0.9108617305755615,
"step": 532
},
{
"epoch": 1.1840354767184036,
"grad_norm": 1.182815670967102,
"learning_rate": 8.600737025164454e-06,
"loss": 1.0241936445236206,
"step": 534
},
{
"epoch": 1.188470066518847,
"grad_norm": 0.36487239599227905,
"learning_rate": 8.588754297442288e-06,
"loss": 0.7436657547950745,
"step": 536
},
{
"epoch": 1.1929046563192904,
"grad_norm": 0.996915876865387,
"learning_rate": 8.576730014279982e-06,
"loss": 1.047356128692627,
"step": 538
},
{
"epoch": 1.1973392461197339,
"grad_norm": 0.9109376072883606,
"learning_rate": 8.564664337449677e-06,
"loss": 0.6739315986633301,
"step": 540
},
{
"epoch": 1.2017738359201773,
"grad_norm": 1.8329638242721558,
"learning_rate": 8.552557429280407e-06,
"loss": 0.6435797214508057,
"step": 542
},
{
"epoch": 1.206208425720621,
"grad_norm": 1.6585458517074585,
"learning_rate": 8.540409452655927e-06,
"loss": 1.0112941265106201,
"step": 544
},
{
"epoch": 1.2106430155210643,
"grad_norm": 0.4716590940952301,
"learning_rate": 8.528220571012518e-06,
"loss": 0.7702317237854004,
"step": 546
},
{
"epoch": 1.2150776053215078,
"grad_norm": 1.1359302997589111,
"learning_rate": 8.51599094833679e-06,
"loss": 1.208444356918335,
"step": 548
},
{
"epoch": 1.2195121951219512,
"grad_norm": 1.7960258722305298,
"learning_rate": 8.503720749163472e-06,
"loss": 0.6087892055511475,
"step": 550
},
{
"epoch": 1.2239467849223946,
"grad_norm": 1.232657551765442,
"learning_rate": 8.491410138573201e-06,
"loss": 1.0305365324020386,
"step": 552
},
{
"epoch": 1.2283813747228383,
"grad_norm": 1.60177481174469,
"learning_rate": 8.479059282190298e-06,
"loss": 0.7768429517745972,
"step": 554
},
{
"epoch": 1.2328159645232817,
"grad_norm": 0.3955824077129364,
"learning_rate": 8.466668346180548e-06,
"loss": 0.7353140115737915,
"step": 556
},
{
"epoch": 1.237250554323725,
"grad_norm": 2.8532001972198486,
"learning_rate": 8.454237497248956e-06,
"loss": 0.6795215010643005,
"step": 558
},
{
"epoch": 1.2416851441241685,
"grad_norm": 2.764127016067505,
"learning_rate": 8.441766902637506e-06,
"loss": 1.2555410861968994,
"step": 560
},
{
"epoch": 1.246119733924612,
"grad_norm": 2.7635672092437744,
"learning_rate": 8.429256730122909e-06,
"loss": 0.9878131747245789,
"step": 562
},
{
"epoch": 1.2505543237250554,
"grad_norm": 0.5536887645721436,
"learning_rate": 8.416707148014358e-06,
"loss": 0.6607757806777954,
"step": 564
},
{
"epoch": 1.2549889135254988,
"grad_norm": 2.1173784732818604,
"learning_rate": 8.404118325151245e-06,
"loss": 0.9502476453781128,
"step": 566
},
{
"epoch": 1.2594235033259422,
"grad_norm": 1.0017777681350708,
"learning_rate": 8.391490430900902e-06,
"loss": 0.8671638369560242,
"step": 568
},
{
"epoch": 1.2638580931263859,
"grad_norm": 7.101733207702637,
"learning_rate": 8.378823635156319e-06,
"loss": 1.1196980476379395,
"step": 570
},
{
"epoch": 1.2682926829268293,
"grad_norm": 0.7950246930122375,
"learning_rate": 8.366118108333861e-06,
"loss": 0.7276540398597717,
"step": 572
},
{
"epoch": 1.2727272727272727,
"grad_norm": 2.6005003452301025,
"learning_rate": 8.353374021370967e-06,
"loss": 1.2642651796340942,
"step": 574
},
{
"epoch": 1.2771618625277161,
"grad_norm": 1.111757516860962,
"learning_rate": 8.340591545723861e-06,
"loss": 0.9772793650627136,
"step": 576
},
{
"epoch": 1.2815964523281598,
"grad_norm": 1.9483938217163086,
"learning_rate": 8.327770853365238e-06,
"loss": 1.144858479499817,
"step": 578
},
{
"epoch": 1.2860310421286032,
"grad_norm": 1.3433754444122314,
"learning_rate": 8.314912116781954e-06,
"loss": 1.0109100341796875,
"step": 580
},
{
"epoch": 1.2904656319290466,
"grad_norm": 3.104252815246582,
"learning_rate": 8.302015508972702e-06,
"loss": 1.3024400472640991,
"step": 582
},
{
"epoch": 1.29490022172949,
"grad_norm": 0.7278043031692505,
"learning_rate": 8.289081203445686e-06,
"loss": 0.6852630376815796,
"step": 584
},
{
"epoch": 1.2993348115299335,
"grad_norm": 0.9191624522209167,
"learning_rate": 8.276109374216286e-06,
"loss": 0.7753503322601318,
"step": 586
},
{
"epoch": 1.3037694013303769,
"grad_norm": 2.7969019412994385,
"learning_rate": 8.263100195804722e-06,
"loss": 0.5715383291244507,
"step": 588
},
{
"epoch": 1.3082039911308203,
"grad_norm": 3.702993392944336,
"learning_rate": 8.250053843233704e-06,
"loss": 1.0089302062988281,
"step": 590
},
{
"epoch": 1.3126385809312637,
"grad_norm": 0.8864567279815674,
"learning_rate": 8.236970492026063e-06,
"loss": 0.9880189895629883,
"step": 592
},
{
"epoch": 1.3170731707317074,
"grad_norm": 0.795543372631073,
"learning_rate": 8.223850318202415e-06,
"loss": 1.063581109046936,
"step": 594
},
{
"epoch": 1.3215077605321508,
"grad_norm": 2.8183748722076416,
"learning_rate": 8.210693498278773e-06,
"loss": 1.100631833076477,
"step": 596
},
{
"epoch": 1.3259423503325942,
"grad_norm": 0.9428039789199829,
"learning_rate": 8.197500209264181e-06,
"loss": 1.1460652351379395,
"step": 598
},
{
"epoch": 1.3303769401330376,
"grad_norm": 2.286390781402588,
"learning_rate": 8.18427062865833e-06,
"loss": 0.4320100247859955,
"step": 600
},
{
"epoch": 1.3348115299334813,
"grad_norm": 2.263921022415161,
"learning_rate": 8.171004934449166e-06,
"loss": 0.9320815205574036,
"step": 602
},
{
"epoch": 1.3392461197339247,
"grad_norm": 2.219259023666382,
"learning_rate": 8.157703305110508e-06,
"loss": 0.8366701602935791,
"step": 604
},
{
"epoch": 1.3436807095343681,
"grad_norm": 0.45662444829940796,
"learning_rate": 8.144365919599632e-06,
"loss": 0.7252450585365295,
"step": 606
},
{
"epoch": 1.3481152993348116,
"grad_norm": 3.525024890899658,
"learning_rate": 8.130992957354872e-06,
"loss": 1.1409834623336792,
"step": 608
},
{
"epoch": 1.352549889135255,
"grad_norm": 1.893685221672058,
"learning_rate": 8.117584598293204e-06,
"loss": 1.1217989921569824,
"step": 610
},
{
"epoch": 1.3569844789356984,
"grad_norm": 19.174959182739258,
"learning_rate": 8.104141022807824e-06,
"loss": 0.6694931983947754,
"step": 612
},
{
"epoch": 1.3614190687361418,
"grad_norm": 0.9400426745414734,
"learning_rate": 8.090662411765726e-06,
"loss": 1.087446689605713,
"step": 614
},
{
"epoch": 1.3658536585365852,
"grad_norm": 0.36289501190185547,
"learning_rate": 8.077148946505258e-06,
"loss": 0.6697701215744019,
"step": 616
},
{
"epoch": 1.370288248337029,
"grad_norm": 0.8704696893692017,
"learning_rate": 8.063600808833698e-06,
"loss": 0.8936916589736938,
"step": 618
},
{
"epoch": 1.3747228381374723,
"grad_norm": 0.25890159606933594,
"learning_rate": 8.050018181024788e-06,
"loss": 0.08715429157018661,
"step": 620
},
{
"epoch": 1.3791574279379157,
"grad_norm": 0.6486095190048218,
"learning_rate": 8.036401245816306e-06,
"loss": 0.991193413734436,
"step": 622
},
{
"epoch": 1.3835920177383592,
"grad_norm": 1.103645920753479,
"learning_rate": 8.022750186407586e-06,
"loss": 0.943727970123291,
"step": 624
},
{
"epoch": 1.3880266075388026,
"grad_norm": 2.296745538711548,
"learning_rate": 8.009065186457061e-06,
"loss": 0.9769091606140137,
"step": 626
},
{
"epoch": 1.3924611973392462,
"grad_norm": 2.5007238388061523,
"learning_rate": 7.995346430079799e-06,
"loss": 0.976685106754303,
"step": 628
},
{
"epoch": 1.3968957871396896,
"grad_norm": 1.3621313571929932,
"learning_rate": 7.981594101845012e-06,
"loss": 1.103744387626648,
"step": 630
},
{
"epoch": 1.401330376940133,
"grad_norm": 0.9651110172271729,
"learning_rate": 7.967808386773591e-06,
"loss": 0.7250331044197083,
"step": 632
},
{
"epoch": 1.4057649667405765,
"grad_norm": 1.190347671508789,
"learning_rate": 7.953989470335592e-06,
"loss": 0.9121408462524414,
"step": 634
},
{
"epoch": 1.41019955654102,
"grad_norm": 0.8217507004737854,
"learning_rate": 7.940137538447769e-06,
"loss": 1.0396496057510376,
"step": 636
},
{
"epoch": 1.4146341463414633,
"grad_norm": 7.098917484283447,
"learning_rate": 7.92625277747105e-06,
"loss": 0.8050640225410461,
"step": 638
},
{
"epoch": 1.4190687361419068,
"grad_norm": 1.0582122802734375,
"learning_rate": 7.912335374208043e-06,
"loss": 0.6821240186691284,
"step": 640
},
{
"epoch": 1.4235033259423504,
"grad_norm": 1.3478533029556274,
"learning_rate": 7.898385515900517e-06,
"loss": 0.8159670829772949,
"step": 642
},
{
"epoch": 1.4279379157427938,
"grad_norm": 1.2756816148757935,
"learning_rate": 7.884403390226883e-06,
"loss": 0.7149834036827087,
"step": 644
},
{
"epoch": 1.4323725055432373,
"grad_norm": 0.8082099556922913,
"learning_rate": 7.870389185299672e-06,
"loss": 1.2317858934402466,
"step": 646
},
{
"epoch": 1.4368070953436807,
"grad_norm": 0.9018802046775818,
"learning_rate": 7.856343089663002e-06,
"loss": 1.0831472873687744,
"step": 648
},
{
"epoch": 1.441241685144124,
"grad_norm": 1.004470944404602,
"learning_rate": 7.842265292290039e-06,
"loss": 1.1530892848968506,
"step": 650
},
{
"epoch": 1.4456762749445677,
"grad_norm": 2.8486859798431396,
"learning_rate": 7.828155982580465e-06,
"loss": 0.8907480835914612,
"step": 652
},
{
"epoch": 1.4501108647450112,
"grad_norm": 3.3158531188964844,
"learning_rate": 7.814015350357912e-06,
"loss": 1.065255880355835,
"step": 654
},
{
"epoch": 1.4545454545454546,
"grad_norm": 2.0944631099700928,
"learning_rate": 7.799843585867426e-06,
"loss": 0.9423489570617676,
"step": 656
},
{
"epoch": 1.458980044345898,
"grad_norm": 1.330924391746521,
"learning_rate": 7.785640879772897e-06,
"loss": 0.797570526599884,
"step": 658
},
{
"epoch": 1.4634146341463414,
"grad_norm": 9.296934127807617,
"learning_rate": 7.771407423154498e-06,
"loss": 1.0860297679901123,
"step": 660
},
{
"epoch": 1.4678492239467849,
"grad_norm": 1.0748159885406494,
"learning_rate": 7.757143407506111e-06,
"loss": 1.0053012371063232,
"step": 662
},
{
"epoch": 1.4722838137472283,
"grad_norm": 1.2716701030731201,
"learning_rate": 7.742849024732754e-06,
"loss": 0.7912719249725342,
"step": 664
},
{
"epoch": 1.476718403547672,
"grad_norm": 0.5906314849853516,
"learning_rate": 7.728524467148e-06,
"loss": 0.5228022933006287,
"step": 666
},
{
"epoch": 1.4811529933481153,
"grad_norm": 12.187054634094238,
"learning_rate": 7.714169927471379e-06,
"loss": 0.5079471468925476,
"step": 668
},
{
"epoch": 1.4855875831485588,
"grad_norm": 1.7716224193572998,
"learning_rate": 7.699785598825805e-06,
"loss": 0.903016209602356,
"step": 670
},
{
"epoch": 1.4900221729490022,
"grad_norm": 2.2982170581817627,
"learning_rate": 7.68537167473496e-06,
"loss": 1.0076159238815308,
"step": 672
},
{
"epoch": 1.4944567627494456,
"grad_norm": 0.5563678741455078,
"learning_rate": 7.670928349120699e-06,
"loss": 0.6532096266746521,
"step": 674
},
{
"epoch": 1.4988913525498893,
"grad_norm": 0.7698723077774048,
"learning_rate": 7.656455816300434e-06,
"loss": 0.7211450934410095,
"step": 676
},
{
"epoch": 1.5033259423503327,
"grad_norm": 1.3669579029083252,
"learning_rate": 7.641954270984532e-06,
"loss": 1.0425605773925781,
"step": 678
},
{
"epoch": 1.507760532150776,
"grad_norm": 0.6051411628723145,
"learning_rate": 7.627423908273683e-06,
"loss": 0.5795204639434814,
"step": 680
},
{
"epoch": 1.5121951219512195,
"grad_norm": 1.3261148929595947,
"learning_rate": 7.61286492365628e-06,
"loss": 0.5524274110794067,
"step": 682
},
{
"epoch": 1.516629711751663,
"grad_norm": 2.459198474884033,
"learning_rate": 7.598277513005793e-06,
"loss": 1.0282782316207886,
"step": 684
},
{
"epoch": 1.5210643015521064,
"grad_norm": 0.8493002653121948,
"learning_rate": 7.583661872578124e-06,
"loss": 0.961588978767395,
"step": 686
},
{
"epoch": 1.5254988913525498,
"grad_norm": 1.2923295497894287,
"learning_rate": 7.569018199008976e-06,
"loss": 0.5933117270469666,
"step": 688
},
{
"epoch": 1.5299334811529932,
"grad_norm": 1.0922276973724365,
"learning_rate": 7.554346689311205e-06,
"loss": 0.4136204123497009,
"step": 690
},
{
"epoch": 1.5343680709534369,
"grad_norm": 0.827107310295105,
"learning_rate": 7.539647540872165e-06,
"loss": 0.724818766117096,
"step": 692
},
{
"epoch": 1.5388026607538803,
"grad_norm": 10.415434837341309,
"learning_rate": 7.5249209514510595e-06,
"loss": 0.9134396314620972,
"step": 694
},
{
"epoch": 1.5432372505543237,
"grad_norm": 1.308587908744812,
"learning_rate": 7.510167119176273e-06,
"loss": 0.5761569738388062,
"step": 696
},
{
"epoch": 1.5476718403547673,
"grad_norm": 0.7201342582702637,
"learning_rate": 7.49538624254271e-06,
"loss": 1.1129498481750488,
"step": 698
},
{
"epoch": 1.5521064301552108,
"grad_norm": 0.7750831842422485,
"learning_rate": 7.48057852040913e-06,
"loss": 1.0308855772018433,
"step": 700
},
{
"epoch": 1.5565410199556542,
"grad_norm": 2.1023459434509277,
"learning_rate": 7.465744151995458e-06,
"loss": 1.0691447257995605,
"step": 702
},
{
"epoch": 1.5609756097560976,
"grad_norm": 0.7603687047958374,
"learning_rate": 7.450883336880116e-06,
"loss": 0.8402605652809143,
"step": 704
},
{
"epoch": 1.565410199556541,
"grad_norm": 2.021561861038208,
"learning_rate": 7.435996274997337e-06,
"loss": 0.43568840622901917,
"step": 706
},
{
"epoch": 1.5698447893569845,
"grad_norm": 0.7875716090202332,
"learning_rate": 7.421083166634466e-06,
"loss": 1.1289795637130737,
"step": 708
},
{
"epoch": 1.5742793791574279,
"grad_norm": 3.5039725303649902,
"learning_rate": 7.40614421242928e-06,
"loss": 0.9707282781600952,
"step": 710
},
{
"epoch": 1.5787139689578713,
"grad_norm": 1.424716591835022,
"learning_rate": 7.391179613367272e-06,
"loss": 0.6375144720077515,
"step": 712
},
{
"epoch": 1.5831485587583147,
"grad_norm": 0.6600134968757629,
"learning_rate": 7.37618957077896e-06,
"loss": 0.9794768691062927,
"step": 714
},
{
"epoch": 1.5875831485587582,
"grad_norm": 1.2803093194961548,
"learning_rate": 7.361174286337175e-06,
"loss": 1.127861738204956,
"step": 716
},
{
"epoch": 1.5920177383592018,
"grad_norm": 3.5021018981933594,
"learning_rate": 7.346133962054341e-06,
"loss": 1.2201976776123047,
"step": 718
},
{
"epoch": 1.5964523281596452,
"grad_norm": 1.4853342771530151,
"learning_rate": 7.33106880027977e-06,
"loss": 0.5727948546409607,
"step": 720
},
{
"epoch": 1.6008869179600886,
"grad_norm": 0.3484899401664734,
"learning_rate": 7.315979003696927e-06,
"loss": 0.5201311111450195,
"step": 722
},
{
"epoch": 1.6053215077605323,
"grad_norm": 0.6640962958335876,
"learning_rate": 7.300864775320708e-06,
"loss": 1.0324370861053467,
"step": 724
},
{
"epoch": 1.6097560975609757,
"grad_norm": 0.6866331696510315,
"learning_rate": 7.285726318494717e-06,
"loss": 1.0323240756988525,
"step": 726
},
{
"epoch": 1.6141906873614191,
"grad_norm": 0.8838914632797241,
"learning_rate": 7.2705638368885105e-06,
"loss": 0.708541989326477,
"step": 728
},
{
"epoch": 1.6186252771618626,
"grad_norm": 0.6660059690475464,
"learning_rate": 7.255377534494875e-06,
"loss": 1.0379067659378052,
"step": 730
},
{
"epoch": 1.623059866962306,
"grad_norm": 0.61259925365448,
"learning_rate": 7.240167615627082e-06,
"loss": 1.0389152765274048,
"step": 732
},
{
"epoch": 1.6274944567627494,
"grad_norm": 0.6811412572860718,
"learning_rate": 7.224934284916127e-06,
"loss": 0.9704182744026184,
"step": 734
},
{
"epoch": 1.6319290465631928,
"grad_norm": 0.36625775694847107,
"learning_rate": 7.209677747307982e-06,
"loss": 0.6105785965919495,
"step": 736
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.6093775033950806,
"learning_rate": 7.194398208060848e-06,
"loss": 1.1938239336013794,
"step": 738
},
{
"epoch": 1.6407982261640797,
"grad_norm": 0.6593520045280457,
"learning_rate": 7.179095872742378e-06,
"loss": 1.0288550853729248,
"step": 740
},
{
"epoch": 1.6452328159645233,
"grad_norm": 2.0731823444366455,
"learning_rate": 7.16377094722692e-06,
"loss": 1.107767939567566,
"step": 742
},
{
"epoch": 1.6496674057649667,
"grad_norm": 1.3170353174209595,
"learning_rate": 7.148423637692748e-06,
"loss": 0.8513399362564087,
"step": 744
},
{
"epoch": 1.6541019955654102,
"grad_norm": 0.9616439342498779,
"learning_rate": 7.133054150619282e-06,
"loss": 0.9469323754310608,
"step": 746
},
{
"epoch": 1.6585365853658538,
"grad_norm": 1.0744072198867798,
"learning_rate": 7.117662692784318e-06,
"loss": 0.6854807734489441,
"step": 748
},
{
"epoch": 1.6629711751662972,
"grad_norm": 0.8329254984855652,
"learning_rate": 7.102249471261241e-06,
"loss": 1.0346763134002686,
"step": 750
},
{
"epoch": 1.6674057649667406,
"grad_norm": 1.6085177659988403,
"learning_rate": 7.0868146934162365e-06,
"loss": 0.9381062388420105,
"step": 752
},
{
"epoch": 1.671840354767184,
"grad_norm": 0.6383267641067505,
"learning_rate": 7.071358566905507e-06,
"loss": 0.6153408885002136,
"step": 754
},
{
"epoch": 1.6762749445676275,
"grad_norm": 0.7473850846290588,
"learning_rate": 7.055881299672476e-06,
"loss": 0.9577728509902954,
"step": 756
},
{
"epoch": 1.680709534368071,
"grad_norm": 0.7026475071907043,
"learning_rate": 7.040383099944988e-06,
"loss": 0.9313692450523376,
"step": 758
},
{
"epoch": 1.6851441241685143,
"grad_norm": 13.826199531555176,
"learning_rate": 7.02486417623251e-06,
"loss": 0.8703269362449646,
"step": 760
},
{
"epoch": 1.6895787139689578,
"grad_norm": 1.2666261196136475,
"learning_rate": 7.009324737323325e-06,
"loss": 1.1479053497314453,
"step": 762
},
{
"epoch": 1.6940133037694012,
"grad_norm": 1.4821723699569702,
"learning_rate": 6.993764992281722e-06,
"loss": 1.0106499195098877,
"step": 764
},
{
"epoch": 1.6984478935698448,
"grad_norm": 2.4803268909454346,
"learning_rate": 6.978185150445187e-06,
"loss": 0.9789531230926514,
"step": 766
},
{
"epoch": 1.7028824833702882,
"grad_norm": 1.579527497291565,
"learning_rate": 6.96258542142158e-06,
"loss": 0.6890674233436584,
"step": 768
},
{
"epoch": 1.7073170731707317,
"grad_norm": 1.3111584186553955,
"learning_rate": 6.946966015086321e-06,
"loss": 1.0892305374145508,
"step": 770
},
{
"epoch": 1.7117516629711753,
"grad_norm": 3.6061043739318848,
"learning_rate": 6.931327141579565e-06,
"loss": 1.0220389366149902,
"step": 772
},
{
"epoch": 1.7161862527716187,
"grad_norm": 0.8147404789924622,
"learning_rate": 6.915669011303374e-06,
"loss": 0.9009866118431091,
"step": 774
},
{
"epoch": 1.7206208425720622,
"grad_norm": 0.8462866544723511,
"learning_rate": 6.899991834918884e-06,
"loss": 1.1094865798950195,
"step": 776
},
{
"epoch": 1.7250554323725056,
"grad_norm": 0.7154016494750977,
"learning_rate": 6.884295823343479e-06,
"loss": 1.0449351072311401,
"step": 778
},
{
"epoch": 1.729490022172949,
"grad_norm": 1.2640376091003418,
"learning_rate": 6.868581187747941e-06,
"loss": 0.6298045516014099,
"step": 780
},
{
"epoch": 1.7339246119733924,
"grad_norm": 0.6618129014968872,
"learning_rate": 6.852848139553619e-06,
"loss": 0.9618121981620789,
"step": 782
},
{
"epoch": 1.7383592017738358,
"grad_norm": 0.4092734158039093,
"learning_rate": 6.837096890429582e-06,
"loss": 0.6855942606925964,
"step": 784
},
{
"epoch": 1.7427937915742793,
"grad_norm": 0.6238394379615784,
"learning_rate": 6.821327652289768e-06,
"loss": 0.7413522005081177,
"step": 786
},
{
"epoch": 1.7472283813747227,
"grad_norm": 1.0907618999481201,
"learning_rate": 6.8055406372901344e-06,
"loss": 0.6723021268844604,
"step": 788
},
{
"epoch": 1.7516629711751663,
"grad_norm": 3.868008613586426,
"learning_rate": 6.789736057825812e-06,
"loss": 0.6211203932762146,
"step": 790
},
{
"epoch": 1.7560975609756098,
"grad_norm": 0.46340852975845337,
"learning_rate": 6.77391412652823e-06,
"loss": 0.6684106588363647,
"step": 792
},
{
"epoch": 1.7605321507760532,
"grad_norm": 0.6993800401687622,
"learning_rate": 6.758075056262271e-06,
"loss": 0.8443524837493896,
"step": 794
},
{
"epoch": 1.7649667405764968,
"grad_norm": 0.9882437586784363,
"learning_rate": 6.742219060123403e-06,
"loss": 0.7883599400520325,
"step": 796
},
{
"epoch": 1.7694013303769403,
"grad_norm": 0.7495700716972351,
"learning_rate": 6.7263463514348095e-06,
"loss": 0.9688935875892639,
"step": 798
},
{
"epoch": 1.7738359201773837,
"grad_norm": 0.7344732880592346,
"learning_rate": 6.710457143744519e-06,
"loss": 1.0964137315750122,
"step": 800
},
{
"epoch": 1.778270509977827,
"grad_norm": 1.3385992050170898,
"learning_rate": 6.6945516508225325e-06,
"loss": 0.9830183982849121,
"step": 802
},
{
"epoch": 1.7827050997782705,
"grad_norm": 1.4036580324172974,
"learning_rate": 6.678630086657959e-06,
"loss": 1.1154334545135498,
"step": 804
},
{
"epoch": 1.787139689578714,
"grad_norm": 1.4150375127792358,
"learning_rate": 6.662692665456115e-06,
"loss": 0.7647712230682373,
"step": 806
},
{
"epoch": 1.7915742793791574,
"grad_norm": 1.0048019886016846,
"learning_rate": 6.646739601635661e-06,
"loss": 1.0117907524108887,
"step": 808
},
{
"epoch": 1.7960088691796008,
"grad_norm": 0.8310498595237732,
"learning_rate": 6.6307711098257074e-06,
"loss": 0.5957526564598083,
"step": 810
},
{
"epoch": 1.8004434589800442,
"grad_norm": 1.8356163501739502,
"learning_rate": 6.6147874048629294e-06,
"loss": 0.6535270810127258,
"step": 812
},
{
"epoch": 1.8048780487804879,
"grad_norm": 2.6936192512512207,
"learning_rate": 6.598788701788677e-06,
"loss": 1.2300708293914795,
"step": 814
},
{
"epoch": 1.8093126385809313,
"grad_norm": 1.4539717435836792,
"learning_rate": 6.582775215846082e-06,
"loss": 0.9219212532043457,
"step": 816
},
{
"epoch": 1.8137472283813747,
"grad_norm": 1.1613236665725708,
"learning_rate": 6.566747162477164e-06,
"loss": 0.4706512689590454,
"step": 818
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.9962729811668396,
"learning_rate": 6.5507047573199235e-06,
"loss": 1.0331782102584839,
"step": 820
},
{
"epoch": 1.8226164079822618,
"grad_norm": 1.4629065990447998,
"learning_rate": 6.5346482162054526e-06,
"loss": 1.0425474643707275,
"step": 822
},
{
"epoch": 1.8270509977827052,
"grad_norm": 0.9501014351844788,
"learning_rate": 6.518577755155024e-06,
"loss": 1.1319053173065186,
"step": 824
},
{
"epoch": 1.8314855875831486,
"grad_norm": 2.260728120803833,
"learning_rate": 6.502493590377184e-06,
"loss": 0.9275112748146057,
"step": 826
},
{
"epoch": 1.835920177383592,
"grad_norm": 1.412589192390442,
"learning_rate": 6.48639593826485e-06,
"loss": 0.8285998106002808,
"step": 828
},
{
"epoch": 1.8403547671840355,
"grad_norm": 0.8061251640319824,
"learning_rate": 6.4702850153923915e-06,
"loss": 0.9728037714958191,
"step": 830
},
{
"epoch": 1.8447893569844789,
"grad_norm": 3.3840270042419434,
"learning_rate": 6.45416103851272e-06,
"loss": 0.9926842451095581,
"step": 832
},
{
"epoch": 1.8492239467849223,
"grad_norm": 0.75401771068573,
"learning_rate": 6.438024224554378e-06,
"loss": 0.9589172005653381,
"step": 834
},
{
"epoch": 1.8536585365853657,
"grad_norm": 0.8730403184890747,
"learning_rate": 6.421874790618608e-06,
"loss": 0.9601749181747437,
"step": 836
},
{
"epoch": 1.8580931263858091,
"grad_norm": 7.889377593994141,
"learning_rate": 6.405712953976444e-06,
"loss": 0.9879204034805298,
"step": 838
},
{
"epoch": 1.8625277161862528,
"grad_norm": 1.303679347038269,
"learning_rate": 6.389538932065783e-06,
"loss": 1.1837224960327148,
"step": 840
},
{
"epoch": 1.8669623059866962,
"grad_norm": 2.8485257625579834,
"learning_rate": 6.373352942488455e-06,
"loss": 0.7557274103164673,
"step": 842
},
{
"epoch": 1.8713968957871396,
"grad_norm": 1.183737874031067,
"learning_rate": 6.357155203007307e-06,
"loss": 0.7274500131607056,
"step": 844
},
{
"epoch": 1.8758314855875833,
"grad_norm": 1.8683289289474487,
"learning_rate": 6.340945931543263e-06,
"loss": 0.8478662371635437,
"step": 846
},
{
"epoch": 1.8802660753880267,
"grad_norm": 2.6959264278411865,
"learning_rate": 6.324725346172399e-06,
"loss": 0.8910980820655823,
"step": 848
},
{
"epoch": 1.8847006651884701,
"grad_norm": 1.2880501747131348,
"learning_rate": 6.308493665123e-06,
"loss": 1.2397483587265015,
"step": 850
},
{
"epoch": 1.8891352549889135,
"grad_norm": 2.6425297260284424,
"learning_rate": 6.2922511067726365e-06,
"loss": 0.6860370635986328,
"step": 852
},
{
"epoch": 1.893569844789357,
"grad_norm": 1.1020407676696777,
"learning_rate": 6.2759978896452155e-06,
"loss": 1.0302581787109375,
"step": 854
},
{
"epoch": 1.8980044345898004,
"grad_norm": 1.1593939065933228,
"learning_rate": 6.259734232408047e-06,
"loss": 0.9292960166931152,
"step": 856
},
{
"epoch": 1.9024390243902438,
"grad_norm": 0.899362325668335,
"learning_rate": 6.2434603538688975e-06,
"loss": 0.726291835308075,
"step": 858
},
{
"epoch": 1.9068736141906872,
"grad_norm": 2.7879903316497803,
"learning_rate": 6.2271764729730525e-06,
"loss": 0.7539620995521545,
"step": 860
},
{
"epoch": 1.9113082039911307,
"grad_norm": 2.1077184677124023,
"learning_rate": 6.210882808800366e-06,
"loss": 1.0768671035766602,
"step": 862
},
{
"epoch": 1.9157427937915743,
"grad_norm": 0.9502280354499817,
"learning_rate": 6.19457958056231e-06,
"loss": 0.7523056864738464,
"step": 864
},
{
"epoch": 1.9201773835920177,
"grad_norm": 1.5630991458892822,
"learning_rate": 6.178267007599034e-06,
"loss": 0.6150217652320862,
"step": 866
},
{
"epoch": 1.9246119733924612,
"grad_norm": 1.0712734460830688,
"learning_rate": 6.161945309376409e-06,
"loss": 1.0981650352478027,
"step": 868
},
{
"epoch": 1.9290465631929048,
"grad_norm": 0.8349732160568237,
"learning_rate": 6.145614705483075e-06,
"loss": 1.0270382165908813,
"step": 870
},
{
"epoch": 1.9334811529933482,
"grad_norm": 0.5500652194023132,
"learning_rate": 6.129275415627485e-06,
"loss": 0.7593640685081482,
"step": 872
},
{
"epoch": 1.9379157427937916,
"grad_norm": 1.514795184135437,
"learning_rate": 6.11292765963495e-06,
"loss": 0.6513434052467346,
"step": 874
},
{
"epoch": 1.942350332594235,
"grad_norm": 0.9090774655342102,
"learning_rate": 6.09657165744469e-06,
"loss": 0.6522112488746643,
"step": 876
},
{
"epoch": 1.9467849223946785,
"grad_norm": 0.8675610423088074,
"learning_rate": 6.080207629106859e-06,
"loss": 0.740065336227417,
"step": 878
},
{
"epoch": 1.951219512195122,
"grad_norm": 1.5948195457458496,
"learning_rate": 6.063835794779598e-06,
"loss": 0.8240799307823181,
"step": 880
},
{
"epoch": 1.9556541019955653,
"grad_norm": 3.2498433589935303,
"learning_rate": 6.047456374726067e-06,
"loss": 0.5737804174423218,
"step": 882
},
{
"epoch": 1.9600886917960088,
"grad_norm": 0.8827385902404785,
"learning_rate": 6.031069589311481e-06,
"loss": 0.7567857503890991,
"step": 884
},
{
"epoch": 1.9645232815964522,
"grad_norm": 1.2192054986953735,
"learning_rate": 6.01467565900015e-06,
"loss": 0.9545295238494873,
"step": 886
},
{
"epoch": 1.9689578713968958,
"grad_norm": 0.7741813659667969,
"learning_rate": 5.99827480435251e-06,
"loss": 0.9531633257865906,
"step": 888
},
{
"epoch": 1.9733924611973392,
"grad_norm": 0.8368670344352722,
"learning_rate": 5.981867246022149e-06,
"loss": 1.0802265405654907,
"step": 890
},
{
"epoch": 1.9778270509977827,
"grad_norm": 1.4058384895324707,
"learning_rate": 5.965453204752855e-06,
"loss": 1.0436638593673706,
"step": 892
},
{
"epoch": 1.9822616407982263,
"grad_norm": 1.1106218099594116,
"learning_rate": 5.949032901375627e-06,
"loss": 1.013320803642273,
"step": 894
},
{
"epoch": 1.9866962305986697,
"grad_norm": 5.496564865112305,
"learning_rate": 5.932606556805719e-06,
"loss": 0.8702308535575867,
"step": 896
},
{
"epoch": 1.9911308203991132,
"grad_norm": 0.7846924066543579,
"learning_rate": 5.916174392039659e-06,
"loss": 0.9531219005584717,
"step": 898
},
{
"epoch": 1.9955654101995566,
"grad_norm": 1.4032496213912964,
"learning_rate": 5.899736628152284e-06,
"loss": 0.7455130815505981,
"step": 900
},
{
"epoch": 2.0,
"grad_norm": 0.851958155632019,
"learning_rate": 5.88329348629375e-06,
"loss": 0.9732981324195862,
"step": 902
},
{
"epoch": 2.0044345898004434,
"grad_norm": 0.8989659547805786,
"learning_rate": 5.8668451876865736e-06,
"loss": 0.6848430633544922,
"step": 904
},
{
"epoch": 2.008869179600887,
"grad_norm": 0.9057672023773193,
"learning_rate": 5.850391953622652e-06,
"loss": 0.7127602100372314,
"step": 906
},
{
"epoch": 2.0133037694013303,
"grad_norm": 0.8921846747398376,
"learning_rate": 5.8339340054602775e-06,
"loss": 0.851362943649292,
"step": 908
},
{
"epoch": 2.0177383592017737,
"grad_norm": 0.8631959557533264,
"learning_rate": 5.817471564621169e-06,
"loss": 0.7096256017684937,
"step": 910
},
{
"epoch": 2.022172949002217,
"grad_norm": 1.8130816221237183,
"learning_rate": 5.801004852587485e-06,
"loss": 0.5044680237770081,
"step": 912
},
{
"epoch": 2.0266075388026605,
"grad_norm": 1.5005384683609009,
"learning_rate": 5.784534090898849e-06,
"loss": 0.5559933185577393,
"step": 914
},
{
"epoch": 2.0310421286031044,
"grad_norm": 1.5633256435394287,
"learning_rate": 5.768059501149369e-06,
"loss": 0.6281445622444153,
"step": 916
},
{
"epoch": 2.035476718403548,
"grad_norm": 1.614864706993103,
"learning_rate": 5.751581304984657e-06,
"loss": 0.6711671948432922,
"step": 918
},
{
"epoch": 2.0399113082039912,
"grad_norm": 1.0288786888122559,
"learning_rate": 5.735099724098838e-06,
"loss": 0.5363720655441284,
"step": 920
},
{
"epoch": 2.0443458980044347,
"grad_norm": 6.116771221160889,
"learning_rate": 5.718614980231582e-06,
"loss": 0.5760122537612915,
"step": 922
},
{
"epoch": 2.048780487804878,
"grad_norm": 0.6346896290779114,
"learning_rate": 5.702127295165107e-06,
"loss": 0.1978059560060501,
"step": 924
},
{
"epoch": 2.0532150776053215,
"grad_norm": 1.3448708057403564,
"learning_rate": 5.685636890721205e-06,
"loss": 0.8290249705314636,
"step": 926
},
{
"epoch": 2.057649667405765,
"grad_norm": 2.6560189723968506,
"learning_rate": 5.669143988758253e-06,
"loss": 0.5688458681106567,
"step": 928
},
{
"epoch": 2.0620842572062084,
"grad_norm": 0.2830648124217987,
"learning_rate": 5.652648811168228e-06,
"loss": 0.4656969904899597,
"step": 930
},
{
"epoch": 2.066518847006652,
"grad_norm": 2.9565589427948,
"learning_rate": 5.636151579873726e-06,
"loss": 0.5648703575134277,
"step": 932
},
{
"epoch": 2.070953436807095,
"grad_norm": 0.8213310837745667,
"learning_rate": 5.619652516824967e-06,
"loss": 0.6612739562988281,
"step": 934
},
{
"epoch": 2.0753880266075386,
"grad_norm": 1.1572480201721191,
"learning_rate": 5.603151843996822e-06,
"loss": 0.7529350519180298,
"step": 936
},
{
"epoch": 2.079822616407982,
"grad_norm": 1.7981088161468506,
"learning_rate": 5.586649783385813e-06,
"loss": 0.5522570610046387,
"step": 938
},
{
"epoch": 2.084257206208426,
"grad_norm": 1.3611361980438232,
"learning_rate": 5.570146557007141e-06,
"loss": 0.48875561356544495,
"step": 940
},
{
"epoch": 2.0886917960088693,
"grad_norm": 1.683354139328003,
"learning_rate": 5.553642386891683e-06,
"loss": 0.811143159866333,
"step": 942
},
{
"epoch": 2.0931263858093128,
"grad_norm": 1.7977195978164673,
"learning_rate": 5.537137495083018e-06,
"loss": 0.45032718777656555,
"step": 944
},
{
"epoch": 2.097560975609756,
"grad_norm": 1.0323246717453003,
"learning_rate": 5.5206321036344304e-06,
"loss": 0.703310489654541,
"step": 946
},
{
"epoch": 2.1019955654101996,
"grad_norm": 1.2375712394714355,
"learning_rate": 5.504126434605932e-06,
"loss": 0.7233847379684448,
"step": 948
},
{
"epoch": 2.106430155210643,
"grad_norm": 1.348511815071106,
"learning_rate": 5.487620710061262e-06,
"loss": 0.4606630802154541,
"step": 950
},
{
"epoch": 2.1108647450110865,
"grad_norm": 1.3615354299545288,
"learning_rate": 5.471115152064916e-06,
"loss": 0.5963136553764343,
"step": 952
},
{
"epoch": 2.11529933481153,
"grad_norm": 3.1463687419891357,
"learning_rate": 5.454609982679138e-06,
"loss": 0.5811668038368225,
"step": 954
},
{
"epoch": 2.1197339246119733,
"grad_norm": 1.2822504043579102,
"learning_rate": 5.4381054239609525e-06,
"loss": 0.7643156051635742,
"step": 956
},
{
"epoch": 2.1241685144124167,
"grad_norm": 1.20485258102417,
"learning_rate": 5.421601697959164e-06,
"loss": 0.5839754343032837,
"step": 958
},
{
"epoch": 2.12860310421286,
"grad_norm": 2.490534782409668,
"learning_rate": 5.405099026711374e-06,
"loss": 0.5331164598464966,
"step": 960
},
{
"epoch": 2.1330376940133036,
"grad_norm": 3.638983726501465,
"learning_rate": 5.388597632240994e-06,
"loss": 0.40006810426712036,
"step": 962
},
{
"epoch": 2.1374722838137474,
"grad_norm": 6.238394737243652,
"learning_rate": 5.372097736554261e-06,
"loss": 0.6906276941299438,
"step": 964
},
{
"epoch": 2.141906873614191,
"grad_norm": 0.839712917804718,
"learning_rate": 5.35559956163724e-06,
"loss": 0.6319162249565125,
"step": 966
},
{
"epoch": 2.1463414634146343,
"grad_norm": 0.7551127672195435,
"learning_rate": 5.339103329452856e-06,
"loss": 0.41347965598106384,
"step": 968
},
{
"epoch": 2.1507760532150777,
"grad_norm": 0.8068141937255859,
"learning_rate": 5.322609261937887e-06,
"loss": 0.5399714112281799,
"step": 970
},
{
"epoch": 2.155210643015521,
"grad_norm": 1.4304999113082886,
"learning_rate": 5.306117580999993e-06,
"loss": 0.4987761676311493,
"step": 972
},
{
"epoch": 2.1596452328159645,
"grad_norm": 1.4152289628982544,
"learning_rate": 5.289628508514725e-06,
"loss": 0.7639641165733337,
"step": 974
},
{
"epoch": 2.164079822616408,
"grad_norm": 1.2822636365890503,
"learning_rate": 5.2731422663225385e-06,
"loss": 0.7585563659667969,
"step": 976
},
{
"epoch": 2.1685144124168514,
"grad_norm": 2.3158578872680664,
"learning_rate": 5.256659076225813e-06,
"loss": 0.3914712071418762,
"step": 978
},
{
"epoch": 2.172949002217295,
"grad_norm": 1.916115164756775,
"learning_rate": 5.240179159985866e-06,
"loss": 0.8097031712532043,
"step": 980
},
{
"epoch": 2.1773835920177382,
"grad_norm": 1.6557332277297974,
"learning_rate": 5.2237027393199645e-06,
"loss": 0.37278667092323303,
"step": 982
},
{
"epoch": 2.1818181818181817,
"grad_norm": 1.2614706754684448,
"learning_rate": 5.207230035898356e-06,
"loss": 0.2516429126262665,
"step": 984
},
{
"epoch": 2.186252771618625,
"grad_norm": 1.9887027740478516,
"learning_rate": 5.190761271341268e-06,
"loss": 0.659031331539154,
"step": 986
},
{
"epoch": 2.1906873614190685,
"grad_norm": 3.152275323867798,
"learning_rate": 5.174296667215939e-06,
"loss": 0.28620240092277527,
"step": 988
},
{
"epoch": 2.1951219512195124,
"grad_norm": 0.960594654083252,
"learning_rate": 5.157836445033636e-06,
"loss": 0.702060878276825,
"step": 990
},
{
"epoch": 2.199556541019956,
"grad_norm": 2.6779539585113525,
"learning_rate": 5.141380826246667e-06,
"loss": 0.9569138288497925,
"step": 992
},
{
"epoch": 2.203991130820399,
"grad_norm": 1.3519855737686157,
"learning_rate": 5.124930032245415e-06,
"loss": 0.5782943964004517,
"step": 994
},
{
"epoch": 2.2084257206208426,
"grad_norm": 1.4382355213165283,
"learning_rate": 5.108484284355339e-06,
"loss": 0.7067066431045532,
"step": 996
},
{
"epoch": 2.212860310421286,
"grad_norm": 1.1686522960662842,
"learning_rate": 5.0920438038340194e-06,
"loss": 0.6596247553825378,
"step": 998
},
{
"epoch": 2.2172949002217295,
"grad_norm": 1.4124189615249634,
"learning_rate": 5.075608811868169e-06,
"loss": 0.6456693410873413,
"step": 1000
},
{
"epoch": 2.221729490022173,
"grad_norm": 4.106961727142334,
"learning_rate": 5.059179529570657e-06,
"loss": 0.3303482234477997,
"step": 1002
},
{
"epoch": 2.2261640798226163,
"grad_norm": 0.9121060371398926,
"learning_rate": 5.042756177977534e-06,
"loss": 0.6765180230140686,
"step": 1004
},
{
"epoch": 2.2305986696230597,
"grad_norm": 1.4012293815612793,
"learning_rate": 5.026338978045062e-06,
"loss": 0.5389603972434998,
"step": 1006
},
{
"epoch": 2.235033259423503,
"grad_norm": 0.968221127986908,
"learning_rate": 5.009928150646741e-06,
"loss": 0.6899822950363159,
"step": 1008
},
{
"epoch": 2.2394678492239466,
"grad_norm": 1.1386181116104126,
"learning_rate": 4.993523916570334e-06,
"loss": 0.6064386367797852,
"step": 1010
},
{
"epoch": 2.2439024390243905,
"grad_norm": 1.1713107824325562,
"learning_rate": 4.977126496514902e-06,
"loss": 0.6847143769264221,
"step": 1012
},
{
"epoch": 2.248337028824834,
"grad_norm": 1.6454007625579834,
"learning_rate": 4.960736111087827e-06,
"loss": 0.6909704804420471,
"step": 1014
},
{
"epoch": 2.2527716186252773,
"grad_norm": 0.9556133151054382,
"learning_rate": 4.9443529808018545e-06,
"loss": 0.8876364231109619,
"step": 1016
},
{
"epoch": 2.2572062084257207,
"grad_norm": 1.2127729654312134,
"learning_rate": 4.927977326072115e-06,
"loss": 0.36061155796051025,
"step": 1018
},
{
"epoch": 2.261640798226164,
"grad_norm": 0.925119936466217,
"learning_rate": 4.911609367213168e-06,
"loss": 0.8120240569114685,
"step": 1020
},
{
"epoch": 2.2660753880266076,
"grad_norm": 0.3137432932853699,
"learning_rate": 4.895249324436035e-06,
"loss": 0.4972486197948456,
"step": 1022
},
{
"epoch": 2.270509977827051,
"grad_norm": 1.6952173709869385,
"learning_rate": 4.8788974178452316e-06,
"loss": 0.9327743053436279,
"step": 1024
},
{
"epoch": 2.2749445676274944,
"grad_norm": 0.680587112903595,
"learning_rate": 4.86255386743582e-06,
"loss": 0.524124026298523,
"step": 1026
},
{
"epoch": 2.279379157427938,
"grad_norm": 1.0751386880874634,
"learning_rate": 4.846218893090426e-06,
"loss": 0.7662097215652466,
"step": 1028
},
{
"epoch": 2.2838137472283813,
"grad_norm": 0.8584238290786743,
"learning_rate": 4.829892714576307e-06,
"loss": 0.7474344968795776,
"step": 1030
},
{
"epoch": 2.2882483370288247,
"grad_norm": 2.667839765548706,
"learning_rate": 4.813575551542381e-06,
"loss": 0.6120243072509766,
"step": 1032
},
{
"epoch": 2.292682926829268,
"grad_norm": 2.76969313621521,
"learning_rate": 4.7972676235162714e-06,
"loss": 0.7319304347038269,
"step": 1034
},
{
"epoch": 2.2971175166297115,
"grad_norm": 1.307244896888733,
"learning_rate": 4.780969149901354e-06,
"loss": 0.7238577604293823,
"step": 1036
},
{
"epoch": 2.3015521064301554,
"grad_norm": 2.190412998199463,
"learning_rate": 4.764680349973812e-06,
"loss": 0.732725977897644,
"step": 1038
},
{
"epoch": 2.305986696230599,
"grad_norm": 1.9367870092391968,
"learning_rate": 4.748401442879674e-06,
"loss": 0.6513870358467102,
"step": 1040
},
{
"epoch": 2.3104212860310422,
"grad_norm": 1.6586291790008545,
"learning_rate": 4.732132647631881e-06,
"loss": 0.9142364263534546,
"step": 1042
},
{
"epoch": 2.3148558758314857,
"grad_norm": 0.5376819968223572,
"learning_rate": 4.715874183107324e-06,
"loss": 0.6070502996444702,
"step": 1044
},
{
"epoch": 2.319290465631929,
"grad_norm": 1.9359222650527954,
"learning_rate": 4.699626268043911e-06,
"loss": 0.7449045181274414,
"step": 1046
},
{
"epoch": 2.3237250554323725,
"grad_norm": 1.128960132598877,
"learning_rate": 4.683389121037618e-06,
"loss": 0.6669731736183167,
"step": 1048
},
{
"epoch": 2.328159645232816,
"grad_norm": 0.9983499646186829,
"learning_rate": 4.667162960539552e-06,
"loss": 0.7283903360366821,
"step": 1050
},
{
"epoch": 2.3325942350332594,
"grad_norm": 2.680569887161255,
"learning_rate": 4.650948004853006e-06,
"loss": 0.614159107208252,
"step": 1052
},
{
"epoch": 2.337028824833703,
"grad_norm": 2.4752540588378906,
"learning_rate": 4.634744472130529e-06,
"loss": 0.4821033477783203,
"step": 1054
},
{
"epoch": 2.341463414634146,
"grad_norm": 0.9671631455421448,
"learning_rate": 4.618552580370988e-06,
"loss": 0.6265279054641724,
"step": 1056
},
{
"epoch": 2.3458980044345896,
"grad_norm": 0.278334379196167,
"learning_rate": 4.6023725474166324e-06,
"loss": 0.3953332304954529,
"step": 1058
},
{
"epoch": 2.3503325942350335,
"grad_norm": 1.0874994993209839,
"learning_rate": 4.586204590950169e-06,
"loss": 0.8931505084037781,
"step": 1060
},
{
"epoch": 2.354767184035477,
"grad_norm": 3.5059680938720703,
"learning_rate": 4.570048928491824e-06,
"loss": 0.37978875637054443,
"step": 1062
},
{
"epoch": 2.3592017738359203,
"grad_norm": 1.128374695777893,
"learning_rate": 4.5539057773964316e-06,
"loss": 0.4617552161216736,
"step": 1064
},
{
"epoch": 2.3636363636363638,
"grad_norm": 2.8418750762939453,
"learning_rate": 4.537775354850496e-06,
"loss": 0.6248428821563721,
"step": 1066
},
{
"epoch": 2.368070953436807,
"grad_norm": 1.0952420234680176,
"learning_rate": 4.5216578778692725e-06,
"loss": 0.7397058010101318,
"step": 1068
},
{
"epoch": 2.3725055432372506,
"grad_norm": 1.1010463237762451,
"learning_rate": 4.5055535632938526e-06,
"loss": 0.6921043395996094,
"step": 1070
},
{
"epoch": 2.376940133037694,
"grad_norm": 1.206845998764038,
"learning_rate": 4.489462627788242e-06,
"loss": 0.8171138167381287,
"step": 1072
},
{
"epoch": 2.3813747228381374,
"grad_norm": 0.2211584895849228,
"learning_rate": 4.473385287836448e-06,
"loss": 0.034517209976911545,
"step": 1074
},
{
"epoch": 2.385809312638581,
"grad_norm": 1.39876389503479,
"learning_rate": 4.457321759739567e-06,
"loss": 0.4455287456512451,
"step": 1076
},
{
"epoch": 2.3902439024390243,
"grad_norm": 0.5344479084014893,
"learning_rate": 4.4412722596128686e-06,
"loss": 0.2573848068714142,
"step": 1078
},
{
"epoch": 2.3946784922394677,
"grad_norm": 3.3447608947753906,
"learning_rate": 4.425237003382903e-06,
"loss": 0.6614237427711487,
"step": 1080
},
{
"epoch": 2.399113082039911,
"grad_norm": 0.2269127368927002,
"learning_rate": 4.409216206784577e-06,
"loss": 0.4690076410770416,
"step": 1082
},
{
"epoch": 2.4035476718403546,
"grad_norm": 0.7799621820449829,
"learning_rate": 4.393210085358265e-06,
"loss": 0.5260664224624634,
"step": 1084
},
{
"epoch": 2.4079822616407984,
"grad_norm": 0.9041287899017334,
"learning_rate": 4.3772188544469016e-06,
"loss": 0.71802818775177,
"step": 1086
},
{
"epoch": 2.412416851441242,
"grad_norm": 0.303363174200058,
"learning_rate": 4.3612427291930915e-06,
"loss": 0.20643645524978638,
"step": 1088
},
{
"epoch": 2.4168514412416853,
"grad_norm": 2.20377779006958,
"learning_rate": 4.345281924536208e-06,
"loss": 0.7628622651100159,
"step": 1090
},
{
"epoch": 2.4212860310421287,
"grad_norm": 1.9586279392242432,
"learning_rate": 4.329336655209505e-06,
"loss": 0.6242840886116028,
"step": 1092
},
{
"epoch": 2.425720620842572,
"grad_norm": 2.349154233932495,
"learning_rate": 4.31340713573723e-06,
"loss": 0.4106002748012543,
"step": 1094
},
{
"epoch": 2.4301552106430155,
"grad_norm": 4.811046600341797,
"learning_rate": 4.297493580431732e-06,
"loss": 0.4525107741355896,
"step": 1096
},
{
"epoch": 2.434589800443459,
"grad_norm": 1.139159917831421,
"learning_rate": 4.281596203390582e-06,
"loss": 0.43235841393470764,
"step": 1098
},
{
"epoch": 2.4390243902439024,
"grad_norm": 2.029642343521118,
"learning_rate": 4.265715218493695e-06,
"loss": 0.6632136702537537,
"step": 1100
},
{
"epoch": 2.443458980044346,
"grad_norm": 0.7346834540367126,
"learning_rate": 4.249850839400446e-06,
"loss": 0.8561656475067139,
"step": 1102
},
{
"epoch": 2.4478935698447892,
"grad_norm": 1.7342432737350464,
"learning_rate": 4.2340032795468e-06,
"loss": 0.3280484080314636,
"step": 1104
},
{
"epoch": 2.4523281596452327,
"grad_norm": 1.2294172048568726,
"learning_rate": 4.218172752142442e-06,
"loss": 0.8532360792160034,
"step": 1106
},
{
"epoch": 2.4567627494456765,
"grad_norm": 2.7901058197021484,
"learning_rate": 4.202359470167903e-06,
"loss": 0.6427351236343384,
"step": 1108
},
{
"epoch": 2.4611973392461195,
"grad_norm": 1.3930901288986206,
"learning_rate": 4.186563646371696e-06,
"loss": 0.7979812622070312,
"step": 1110
},
{
"epoch": 2.4656319290465634,
"grad_norm": 0.6308720111846924,
"learning_rate": 4.170785493267463e-06,
"loss": 0.5055820345878601,
"step": 1112
},
{
"epoch": 2.470066518847007,
"grad_norm": 1.5060687065124512,
"learning_rate": 4.155025223131102e-06,
"loss": 0.7073782086372375,
"step": 1114
},
{
"epoch": 2.47450110864745,
"grad_norm": 1.2661110162734985,
"learning_rate": 4.139283047997919e-06,
"loss": 0.2692304849624634,
"step": 1116
},
{
"epoch": 2.4789356984478936,
"grad_norm": 0.8920889496803284,
"learning_rate": 4.123559179659771e-06,
"loss": 0.6808326840400696,
"step": 1118
},
{
"epoch": 2.483370288248337,
"grad_norm": 1.1632484197616577,
"learning_rate": 4.107853829662224e-06,
"loss": 0.6864634156227112,
"step": 1120
},
{
"epoch": 2.4878048780487805,
"grad_norm": 1.0989418029785156,
"learning_rate": 4.0921672093017e-06,
"loss": 0.6558045148849487,
"step": 1122
},
{
"epoch": 2.492239467849224,
"grad_norm": 2.8047521114349365,
"learning_rate": 4.076499529622636e-06,
"loss": 0.8816790580749512,
"step": 1124
},
{
"epoch": 2.4966740576496673,
"grad_norm": 0.8200728297233582,
"learning_rate": 4.0608510014146455e-06,
"loss": 0.7856003642082214,
"step": 1126
},
{
"epoch": 2.5011086474501107,
"grad_norm": 1.6482559442520142,
"learning_rate": 4.045221835209684e-06,
"loss": 0.5203614234924316,
"step": 1128
},
{
"epoch": 2.505543237250554,
"grad_norm": 1.5657063722610474,
"learning_rate": 4.02961224127921e-06,
"loss": 0.5360028147697449,
"step": 1130
},
{
"epoch": 2.5099778270509976,
"grad_norm": 1.1746268272399902,
"learning_rate": 4.014022429631368e-06,
"loss": 0.6573871970176697,
"step": 1132
},
{
"epoch": 2.5144124168514415,
"grad_norm": 1.0907959938049316,
"learning_rate": 3.998452610008147e-06,
"loss": 0.3955955505371094,
"step": 1134
},
{
"epoch": 2.5188470066518844,
"grad_norm": 0.861308217048645,
"learning_rate": 3.982902991882578e-06,
"loss": 0.7564470767974854,
"step": 1136
},
{
"epoch": 2.5232815964523283,
"grad_norm": 0.7468408346176147,
"learning_rate": 3.967373784455896e-06,
"loss": 0.6149483919143677,
"step": 1138
},
{
"epoch": 2.5277161862527717,
"grad_norm": 0.710243284702301,
"learning_rate": 3.951865196654738e-06,
"loss": 0.8047510385513306,
"step": 1140
},
{
"epoch": 2.532150776053215,
"grad_norm": 2.8993804454803467,
"learning_rate": 3.936377437128329e-06,
"loss": 0.41506367921829224,
"step": 1142
},
{
"epoch": 2.5365853658536586,
"grad_norm": 0.2720961272716522,
"learning_rate": 3.920910714245679e-06,
"loss": 0.44911813735961914,
"step": 1144
},
{
"epoch": 2.541019955654102,
"grad_norm": 0.7515284419059753,
"learning_rate": 3.905465236092771e-06,
"loss": 0.7769864201545715,
"step": 1146
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.7892196178436279,
"learning_rate": 3.890041210469765e-06,
"loss": 0.7185046076774597,
"step": 1148
},
{
"epoch": 2.549889135254989,
"grad_norm": 0.7974555492401123,
"learning_rate": 3.8746388448882055e-06,
"loss": 0.6233813166618347,
"step": 1150
},
{
"epoch": 2.5543237250554323,
"grad_norm": 0.8632022142410278,
"learning_rate": 3.859258346568228e-06,
"loss": 0.44528669118881226,
"step": 1152
},
{
"epoch": 2.5587583148558757,
"grad_norm": 0.21908880770206451,
"learning_rate": 3.843899922435767e-06,
"loss": 0.2759099304676056,
"step": 1154
},
{
"epoch": 2.5631929046563195,
"grad_norm": 0.6866997480392456,
"learning_rate": 3.8285637791197815e-06,
"loss": 0.5508578419685364,
"step": 1156
},
{
"epoch": 2.5676274944567625,
"grad_norm": 2.519130229949951,
"learning_rate": 3.8132501229494635e-06,
"loss": 0.5549399852752686,
"step": 1158
},
{
"epoch": 2.5720620842572064,
"grad_norm": 2.8596558570861816,
"learning_rate": 3.7979591599514696e-06,
"loss": 0.6041897535324097,
"step": 1160
},
{
"epoch": 2.57649667405765,
"grad_norm": 0.7755621671676636,
"learning_rate": 3.782691095847151e-06,
"loss": 0.7620603442192078,
"step": 1162
},
{
"epoch": 2.5809312638580932,
"grad_norm": 0.9750792980194092,
"learning_rate": 3.767446136049775e-06,
"loss": 0.939260721206665,
"step": 1164
},
{
"epoch": 2.5853658536585367,
"grad_norm": 1.4371932744979858,
"learning_rate": 3.752224485661775e-06,
"loss": 0.4603317975997925,
"step": 1166
},
{
"epoch": 2.58980044345898,
"grad_norm": 1.9136115312576294,
"learning_rate": 3.7370263494719805e-06,
"loss": 0.9075461626052856,
"step": 1168
},
{
"epoch": 2.5942350332594235,
"grad_norm": 1.7198249101638794,
"learning_rate": 3.721851931952869e-06,
"loss": 0.8085947632789612,
"step": 1170
},
{
"epoch": 2.598669623059867,
"grad_norm": 2.1932644844055176,
"learning_rate": 3.706701437257808e-06,
"loss": 0.28652137517929077,
"step": 1172
},
{
"epoch": 2.6031042128603104,
"grad_norm": 0.7438820600509644,
"learning_rate": 3.691575069218314e-06,
"loss": 0.5113945007324219,
"step": 1174
},
{
"epoch": 2.6075388026607538,
"grad_norm": 0.860270619392395,
"learning_rate": 3.676473031341313e-06,
"loss": 0.6308207511901855,
"step": 1176
},
{
"epoch": 2.611973392461197,
"grad_norm": 1.0259983539581299,
"learning_rate": 3.661395526806395e-06,
"loss": 0.4299200773239136,
"step": 1178
},
{
"epoch": 2.6164079822616406,
"grad_norm": 2.737011194229126,
"learning_rate": 3.6463427584630806e-06,
"loss": 0.5992394089698792,
"step": 1180
},
{
"epoch": 2.6208425720620845,
"grad_norm": 0.8241181373596191,
"learning_rate": 3.631314928828099e-06,
"loss": 0.7023974061012268,
"step": 1182
},
{
"epoch": 2.6252771618625275,
"grad_norm": 0.6770392656326294,
"learning_rate": 3.616312240082659e-06,
"loss": 0.7103127241134644,
"step": 1184
},
{
"epoch": 2.6297117516629713,
"grad_norm": 1.7923256158828735,
"learning_rate": 3.601334894069728e-06,
"loss": 0.8160955309867859,
"step": 1186
},
{
"epoch": 2.6341463414634148,
"grad_norm": 2.411703586578369,
"learning_rate": 3.5863830922913147e-06,
"loss": 0.8449252247810364,
"step": 1188
},
{
"epoch": 2.638580931263858,
"grad_norm": 0.7744673490524292,
"learning_rate": 3.5714570359057676e-06,
"loss": 0.4491943418979645,
"step": 1190
},
{
"epoch": 2.6430155210643016,
"grad_norm": 2.5860702991485596,
"learning_rate": 3.556556925725061e-06,
"loss": 0.5912431478500366,
"step": 1192
},
{
"epoch": 2.647450110864745,
"grad_norm": 3.4122977256774902,
"learning_rate": 3.5416829622120875e-06,
"loss": 0.5446506142616272,
"step": 1194
},
{
"epoch": 2.6518847006651884,
"grad_norm": 1.6620879173278809,
"learning_rate": 3.526835345477978e-06,
"loss": 0.6308864951133728,
"step": 1196
},
{
"epoch": 2.656319290465632,
"grad_norm": 0.7548476457595825,
"learning_rate": 3.5120142752793907e-06,
"loss": 0.21907749772071838,
"step": 1198
},
{
"epoch": 2.6607538802660753,
"grad_norm": 3.578019380569458,
"learning_rate": 3.4972199510158393e-06,
"loss": 0.9215325117111206,
"step": 1200
},
{
"epoch": 2.6651884700665187,
"grad_norm": 0.7092931270599365,
"learning_rate": 3.4824525717269975e-06,
"loss": 0.8297696709632874,
"step": 1202
},
{
"epoch": 2.6696230598669626,
"grad_norm": 0.9896040558815002,
"learning_rate": 3.4677123360900342e-06,
"loss": 0.38038522005081177,
"step": 1204
},
{
"epoch": 2.6740576496674056,
"grad_norm": 2.033034563064575,
"learning_rate": 3.4529994424169233e-06,
"loss": 0.5968571901321411,
"step": 1206
},
{
"epoch": 2.6784922394678494,
"grad_norm": 2.7922661304473877,
"learning_rate": 3.4383140886517953e-06,
"loss": 0.7829728722572327,
"step": 1208
},
{
"epoch": 2.682926829268293,
"grad_norm": 1.3708895444869995,
"learning_rate": 3.423656472368262e-06,
"loss": 0.4674299359321594,
"step": 1210
},
{
"epoch": 2.6873614190687363,
"grad_norm": 1.8650954961776733,
"learning_rate": 3.409026790766756e-06,
"loss": 0.25274747610092163,
"step": 1212
},
{
"epoch": 2.6917960088691797,
"grad_norm": 2.8033974170684814,
"learning_rate": 3.394425240671891e-06,
"loss": 0.4370385706424713,
"step": 1214
},
{
"epoch": 2.696230598669623,
"grad_norm": 2.1162285804748535,
"learning_rate": 3.379852018529799e-06,
"loss": 0.5205950736999512,
"step": 1216
},
{
"epoch": 2.7006651884700665,
"grad_norm": 1.2731273174285889,
"learning_rate": 3.3653073204054942e-06,
"loss": 0.5236338973045349,
"step": 1218
},
{
"epoch": 2.70509977827051,
"grad_norm": 1.8009387254714966,
"learning_rate": 3.3507913419802403e-06,
"loss": 0.7941880822181702,
"step": 1220
},
{
"epoch": 2.7095343680709534,
"grad_norm": 1.7554905414581299,
"learning_rate": 3.336304278548903e-06,
"loss": 0.7005539536476135,
"step": 1222
},
{
"epoch": 2.713968957871397,
"grad_norm": 1.5743012428283691,
"learning_rate": 3.321846325017342e-06,
"loss": 0.7519204616546631,
"step": 1224
},
{
"epoch": 2.7184035476718402,
"grad_norm": 3.2630934715270996,
"learning_rate": 3.3074176758997744e-06,
"loss": 0.37882906198501587,
"step": 1226
},
{
"epoch": 2.7228381374722836,
"grad_norm": 0.9136330485343933,
"learning_rate": 3.2930185253161574e-06,
"loss": 0.8159320950508118,
"step": 1228
},
{
"epoch": 2.7272727272727275,
"grad_norm": 1.3966491222381592,
"learning_rate": 3.2786490669895883e-06,
"loss": 0.6657707095146179,
"step": 1230
},
{
"epoch": 2.7317073170731705,
"grad_norm": 0.7242875099182129,
"learning_rate": 3.2643094942436865e-06,
"loss": 0.7183330655097961,
"step": 1232
},
{
"epoch": 2.7361419068736144,
"grad_norm": 2.514469623565674,
"learning_rate": 3.2500000000000015e-06,
"loss": 0.4106196463108063,
"step": 1234
},
{
"epoch": 2.740576496674058,
"grad_norm": 1.027852177619934,
"learning_rate": 3.2357207767754063e-06,
"loss": 0.6766651272773743,
"step": 1236
},
{
"epoch": 2.745011086474501,
"grad_norm": 1.0152777433395386,
"learning_rate": 3.221472016679521e-06,
"loss": 0.47056448459625244,
"step": 1238
},
{
"epoch": 2.7494456762749446,
"grad_norm": 0.7261103987693787,
"learning_rate": 3.2072539114121188e-06,
"loss": 0.46467670798301697,
"step": 1240
},
{
"epoch": 2.753880266075388,
"grad_norm": 0.899706244468689,
"learning_rate": 3.193066652260547e-06,
"loss": 0.8382993340492249,
"step": 1242
},
{
"epoch": 2.7583148558758315,
"grad_norm": 0.9996641874313354,
"learning_rate": 3.1789104300971603e-06,
"loss": 0.7458208203315735,
"step": 1244
},
{
"epoch": 2.762749445676275,
"grad_norm": 0.36415642499923706,
"learning_rate": 3.164785435376745e-06,
"loss": 0.23638926446437836,
"step": 1246
},
{
"epoch": 2.7671840354767183,
"grad_norm": 1.592871904373169,
"learning_rate": 3.1506918581339583e-06,
"loss": 0.47278061509132385,
"step": 1248
},
{
"epoch": 2.7716186252771617,
"grad_norm": 1.0408028364181519,
"learning_rate": 3.136629887980781e-06,
"loss": 0.5122473835945129,
"step": 1250
},
{
"epoch": 2.776053215077605,
"grad_norm": 0.9867532849311829,
"learning_rate": 3.122599714103949e-06,
"loss": 0.8818725347518921,
"step": 1252
},
{
"epoch": 2.7804878048780486,
"grad_norm": 1.422898769378662,
"learning_rate": 3.1086015252624257e-06,
"loss": 0.8071056604385376,
"step": 1254
},
{
"epoch": 2.7849223946784925,
"grad_norm": 1.5934648513793945,
"learning_rate": 3.0946355097848535e-06,
"loss": 0.7926267385482788,
"step": 1256
},
{
"epoch": 2.7893569844789354,
"grad_norm": 0.5067124962806702,
"learning_rate": 3.0807018555670153e-06,
"loss": 0.13775405287742615,
"step": 1258
},
{
"epoch": 2.7937915742793793,
"grad_norm": 1.111436128616333,
"learning_rate": 3.0668007500693216e-06,
"loss": 0.7149184346199036,
"step": 1260
},
{
"epoch": 2.7982261640798227,
"grad_norm": 2.614287853240967,
"learning_rate": 3.0529323803142697e-06,
"loss": 0.5375425815582275,
"step": 1262
},
{
"epoch": 2.802660753880266,
"grad_norm": 1.180129885673523,
"learning_rate": 3.0390969328839464e-06,
"loss": 0.48145541548728943,
"step": 1264
},
{
"epoch": 2.8070953436807096,
"grad_norm": 1.0282138586044312,
"learning_rate": 3.0252945939175004e-06,
"loss": 0.7159358859062195,
"step": 1266
},
{
"epoch": 2.811529933481153,
"grad_norm": 3.18182635307312,
"learning_rate": 3.0115255491086537e-06,
"loss": 0.8956208229064941,
"step": 1268
},
{
"epoch": 2.8159645232815964,
"grad_norm": 0.7246714234352112,
"learning_rate": 2.9977899837031895e-06,
"loss": 0.7317441701889038,
"step": 1270
},
{
"epoch": 2.82039911308204,
"grad_norm": 1.5273503065109253,
"learning_rate": 2.984088082496469e-06,
"loss": 0.2754761874675751,
"step": 1272
},
{
"epoch": 2.8248337028824833,
"grad_norm": 0.9987423419952393,
"learning_rate": 2.970420029830946e-06,
"loss": 0.4880000650882721,
"step": 1274
},
{
"epoch": 2.8292682926829267,
"grad_norm": 0.7692528367042542,
"learning_rate": 2.9567860095936775e-06,
"loss": 0.9671233892440796,
"step": 1276
},
{
"epoch": 2.8337028824833705,
"grad_norm": 0.9331156015396118,
"learning_rate": 2.9431862052138545e-06,
"loss": 0.8612651824951172,
"step": 1278
},
{
"epoch": 2.8381374722838135,
"grad_norm": 0.9668262600898743,
"learning_rate": 2.929620799660343e-06,
"loss": 0.3867911994457245,
"step": 1280
},
{
"epoch": 2.8425720620842574,
"grad_norm": 2.516414165496826,
"learning_rate": 2.916089975439207e-06,
"loss": 0.47361209988594055,
"step": 1282
},
{
"epoch": 2.847006651884701,
"grad_norm": 0.9498918056488037,
"learning_rate": 2.9025939145912655e-06,
"loss": 0.4672809839248657,
"step": 1284
},
{
"epoch": 2.8514412416851442,
"grad_norm": 0.8674972653388977,
"learning_rate": 2.8891327986896345e-06,
"loss": 0.8502570390701294,
"step": 1286
},
{
"epoch": 2.8558758314855877,
"grad_norm": 2.823939085006714,
"learning_rate": 2.875706808837292e-06,
"loss": 0.1998748630285263,
"step": 1288
},
{
"epoch": 2.860310421286031,
"grad_norm": 1.9392937421798706,
"learning_rate": 2.862316125664636e-06,
"loss": 0.8196284770965576,
"step": 1290
},
{
"epoch": 2.8647450110864745,
"grad_norm": 8.883353233337402,
"learning_rate": 2.848960929327053e-06,
"loss": 0.7396450042724609,
"step": 1292
},
{
"epoch": 2.869179600886918,
"grad_norm": 1.4906433820724487,
"learning_rate": 2.8356413995025044e-06,
"loss": 0.717079758644104,
"step": 1294
},
{
"epoch": 2.8736141906873613,
"grad_norm": 0.8910164833068848,
"learning_rate": 2.8223577153890934e-06,
"loss": 0.7069391012191772,
"step": 1296
},
{
"epoch": 2.8780487804878048,
"grad_norm": 3.354166030883789,
"learning_rate": 2.8091100557026702e-06,
"loss": 0.4838540852069855,
"step": 1298
},
{
"epoch": 2.882483370288248,
"grad_norm": 0.841211199760437,
"learning_rate": 2.795898598674415e-06,
"loss": 0.6780248284339905,
"step": 1300
},
{
"epoch": 2.8869179600886916,
"grad_norm": 1.9939450025558472,
"learning_rate": 2.782723522048444e-06,
"loss": 0.2901532053947449,
"step": 1302
},
{
"epoch": 2.8913525498891355,
"grad_norm": 1.6043404340744019,
"learning_rate": 2.7695850030794293e-06,
"loss": 0.7313271760940552,
"step": 1304
},
{
"epoch": 2.8957871396895785,
"grad_norm": 2.5174219608306885,
"learning_rate": 2.7564832185301915e-06,
"loss": 0.6357086896896362,
"step": 1306
},
{
"epoch": 2.9002217294900223,
"grad_norm": 0.29980650544166565,
"learning_rate": 2.7434183446693397e-06,
"loss": 0.2599072754383087,
"step": 1308
},
{
"epoch": 2.9046563192904657,
"grad_norm": 1.3467071056365967,
"learning_rate": 2.730390557268897e-06,
"loss": 0.33962565660476685,
"step": 1310
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.9841307401657104,
"learning_rate": 2.7174000316019277e-06,
"loss": 0.6833657622337341,
"step": 1312
},
{
"epoch": 2.9135254988913526,
"grad_norm": 1.2662699222564697,
"learning_rate": 2.704446942440191e-06,
"loss": 0.7377205491065979,
"step": 1314
},
{
"epoch": 2.917960088691796,
"grad_norm": 2.137340784072876,
"learning_rate": 2.6915314640517755e-06,
"loss": 0.6367099285125732,
"step": 1316
},
{
"epoch": 2.9223946784922394,
"grad_norm": 1.5056116580963135,
"learning_rate": 2.6786537701987703e-06,
"loss": 0.8529772162437439,
"step": 1318
},
{
"epoch": 2.926829268292683,
"grad_norm": 1.2501715421676636,
"learning_rate": 2.665814034134916e-06,
"loss": 0.5121623873710632,
"step": 1320
},
{
"epoch": 2.9312638580931263,
"grad_norm": 1.0894711017608643,
"learning_rate": 2.6530124286032755e-06,
"loss": 0.7275009155273438,
"step": 1322
},
{
"epoch": 2.9356984478935697,
"grad_norm": 3.4206230640411377,
"learning_rate": 2.640249125833915e-06,
"loss": 0.7550503611564636,
"step": 1324
},
{
"epoch": 2.9401330376940136,
"grad_norm": 0.8373146653175354,
"learning_rate": 2.6275242975415804e-06,
"loss": 0.8755195736885071,
"step": 1326
},
{
"epoch": 2.9445676274944566,
"grad_norm": 1.1842820644378662,
"learning_rate": 2.614838114923394e-06,
"loss": 0.7242047786712646,
"step": 1328
},
{
"epoch": 2.9490022172949004,
"grad_norm": 0.9061135053634644,
"learning_rate": 2.6021907486565447e-06,
"loss": 0.7160645127296448,
"step": 1330
},
{
"epoch": 2.953436807095344,
"grad_norm": 1.9043796062469482,
"learning_rate": 2.589582368895992e-06,
"loss": 0.7991137504577637,
"step": 1332
},
{
"epoch": 2.9578713968957873,
"grad_norm": 0.7662011384963989,
"learning_rate": 2.577013145272185e-06,
"loss": 0.42025405168533325,
"step": 1334
},
{
"epoch": 2.9623059866962307,
"grad_norm": 0.9441994428634644,
"learning_rate": 2.564483246888772e-06,
"loss": 0.6747118830680847,
"step": 1336
},
{
"epoch": 2.966740576496674,
"grad_norm": 1.0354760885238647,
"learning_rate": 2.5519928423203266e-06,
"loss": 0.7479525208473206,
"step": 1338
},
{
"epoch": 2.9711751662971175,
"grad_norm": 0.9520703554153442,
"learning_rate": 2.539542099610084e-06,
"loss": 0.5493751168251038,
"step": 1340
},
{
"epoch": 2.975609756097561,
"grad_norm": 2.1422736644744873,
"learning_rate": 2.5271311862676727e-06,
"loss": 0.5847128033638,
"step": 1342
},
{
"epoch": 2.9800443458980044,
"grad_norm": 0.5576608777046204,
"learning_rate": 2.514760269266871e-06,
"loss": 0.1638008952140808,
"step": 1344
},
{
"epoch": 2.984478935698448,
"grad_norm": 1.6136894226074219,
"learning_rate": 2.50242951504335e-06,
"loss": 0.5379747748374939,
"step": 1346
},
{
"epoch": 2.988913525498891,
"grad_norm": 0.31877100467681885,
"learning_rate": 2.490139089492443e-06,
"loss": 0.42808306217193604,
"step": 1348
},
{
"epoch": 2.9933481152993346,
"grad_norm": 2.9408724308013916,
"learning_rate": 2.4778891579669067e-06,
"loss": 0.42848342657089233,
"step": 1350
},
{
"epoch": 2.9977827050997785,
"grad_norm": 1.016264796257019,
"learning_rate": 2.4656798852747023e-06,
"loss": 0.5269397497177124,
"step": 1352
},
{
"epoch": 3.002217294900222,
"grad_norm": 1.1126813888549805,
"learning_rate": 2.453511435676777e-06,
"loss": 0.557039737701416,
"step": 1354
},
{
"epoch": 3.0066518847006654,
"grad_norm": 1.4688986539840698,
"learning_rate": 2.441383972884848e-06,
"loss": 0.2774271070957184,
"step": 1356
},
{
"epoch": 3.011086474501109,
"grad_norm": 1.5381768941879272,
"learning_rate": 2.4292976600592095e-06,
"loss": 0.45656082034111023,
"step": 1358
},
{
"epoch": 3.015521064301552,
"grad_norm": 0.8177427649497986,
"learning_rate": 2.4172526598065304e-06,
"loss": 0.5471428036689758,
"step": 1360
},
{
"epoch": 3.0199556541019956,
"grad_norm": 1.1968615055084229,
"learning_rate": 2.4052491341776686e-06,
"loss": 0.3854435086250305,
"step": 1362
},
{
"epoch": 3.024390243902439,
"grad_norm": 0.8419128656387329,
"learning_rate": 2.393287244665494e-06,
"loss": 0.4754073917865753,
"step": 1364
},
{
"epoch": 3.0288248337028825,
"grad_norm": 0.12323111295700073,
"learning_rate": 2.3813671522027094e-06,
"loss": 0.22163067758083344,
"step": 1366
},
{
"epoch": 3.033259423503326,
"grad_norm": 1.2366307973861694,
"learning_rate": 2.369489017159692e-06,
"loss": 0.20418155193328857,
"step": 1368
},
{
"epoch": 3.0376940133037693,
"grad_norm": 2.100736141204834,
"learning_rate": 2.357652999342334e-06,
"loss": 0.2910291850566864,
"step": 1370
},
{
"epoch": 3.0421286031042127,
"grad_norm": 4.004879951477051,
"learning_rate": 2.345859257989886e-06,
"loss": 0.47884419560432434,
"step": 1372
},
{
"epoch": 3.046563192904656,
"grad_norm": 1.1280056238174438,
"learning_rate": 2.334107951772826e-06,
"loss": 0.15896357595920563,
"step": 1374
},
{
"epoch": 3.0509977827050996,
"grad_norm": 3.1216893196105957,
"learning_rate": 2.3223992387907137e-06,
"loss": 0.23939193785190582,
"step": 1376
},
{
"epoch": 3.0554323725055434,
"grad_norm": 1.0042521953582764,
"learning_rate": 2.3107332765700733e-06,
"loss": 0.1343676894903183,
"step": 1378
},
{
"epoch": 3.059866962305987,
"grad_norm": 0.09163781255483627,
"learning_rate": 2.2991102220622647e-06,
"loss": 0.16645547747612,
"step": 1380
},
{
"epoch": 3.0643015521064303,
"grad_norm": 2.4105827808380127,
"learning_rate": 2.2875302316413807e-06,
"loss": 0.1257064789533615,
"step": 1382
},
{
"epoch": 3.0687361419068737,
"grad_norm": 1.94559645652771,
"learning_rate": 2.275993461102138e-06,
"loss": 0.3437502384185791,
"step": 1384
},
{
"epoch": 3.073170731707317,
"grad_norm": 0.2491624653339386,
"learning_rate": 2.2645000656577793e-06,
"loss": 0.1117410808801651,
"step": 1386
},
{
"epoch": 3.0776053215077606,
"grad_norm": 0.1076858639717102,
"learning_rate": 2.2530501999379932e-06,
"loss": 0.17695897817611694,
"step": 1388
},
{
"epoch": 3.082039911308204,
"grad_norm": 0.8501622080802917,
"learning_rate": 2.2416440179868236e-06,
"loss": 0.3754989504814148,
"step": 1390
},
{
"epoch": 3.0864745011086474,
"grad_norm": 1.7676734924316406,
"learning_rate": 2.230281673260605e-06,
"loss": 0.15238375961780548,
"step": 1392
},
{
"epoch": 3.090909090909091,
"grad_norm": 6.170825958251953,
"learning_rate": 2.218963318625895e-06,
"loss": 0.22011463344097137,
"step": 1394
},
{
"epoch": 3.0953436807095343,
"grad_norm": 2.093130350112915,
"learning_rate": 2.2076891063574167e-06,
"loss": 0.5483108162879944,
"step": 1396
},
{
"epoch": 3.0997782705099777,
"grad_norm": 0.9943727850914001,
"learning_rate": 2.196459188136014e-06,
"loss": 0.39382025599479675,
"step": 1398
},
{
"epoch": 3.104212860310421,
"grad_norm": 0.4041390120983124,
"learning_rate": 2.1852737150466064e-06,
"loss": 0.37452182173728943,
"step": 1400
},
{
"epoch": 3.1086474501108645,
"grad_norm": 0.41770124435424805,
"learning_rate": 2.174132837576156e-06,
"loss": 0.1812993884086609,
"step": 1402
},
{
"epoch": 3.1130820399113084,
"grad_norm": 3.0523788928985596,
"learning_rate": 2.1630367056116496e-06,
"loss": 0.3220471739768982,
"step": 1404
},
{
"epoch": 3.117516629711752,
"grad_norm": 2.8227522373199463,
"learning_rate": 2.1519854684380724e-06,
"loss": 0.5665332674980164,
"step": 1406
},
{
"epoch": 3.1219512195121952,
"grad_norm": 1.215554118156433,
"learning_rate": 2.1409792747364103e-06,
"loss": 0.6941906809806824,
"step": 1408
},
{
"epoch": 3.1263858093126387,
"grad_norm": 0.9595901370048523,
"learning_rate": 2.1300182725816378e-06,
"loss": 0.34386202692985535,
"step": 1410
},
{
"epoch": 3.130820399113082,
"grad_norm": 1.20671808719635,
"learning_rate": 2.1191026094407386e-06,
"loss": 0.40727710723876953,
"step": 1412
},
{
"epoch": 3.1352549889135255,
"grad_norm": 0.9301721453666687,
"learning_rate": 2.1082324321707075e-06,
"loss": 0.24828168749809265,
"step": 1414
},
{
"epoch": 3.139689578713969,
"grad_norm": 0.29069337248802185,
"learning_rate": 2.0974078870165882e-06,
"loss": 0.02273000217974186,
"step": 1416
},
{
"epoch": 3.1441241685144123,
"grad_norm": 1.1463552713394165,
"learning_rate": 2.086629119609499e-06,
"loss": 0.4149464964866638,
"step": 1418
},
{
"epoch": 3.1485587583148558,
"grad_norm": 0.8834893107414246,
"learning_rate": 2.0758962749646716e-06,
"loss": 0.47038036584854126,
"step": 1420
},
{
"epoch": 3.152993348115299,
"grad_norm": 3.7323062419891357,
"learning_rate": 2.065209497479502e-06,
"loss": 0.34102943539619446,
"step": 1422
},
{
"epoch": 3.1574279379157426,
"grad_norm": 0.32775676250457764,
"learning_rate": 2.0545689309316138e-06,
"loss": 0.06301730126142502,
"step": 1424
},
{
"epoch": 3.1618625277161865,
"grad_norm": 1.2714399099349976,
"learning_rate": 2.043974718476911e-06,
"loss": 0.4001501798629761,
"step": 1426
},
{
"epoch": 3.16629711751663,
"grad_norm": 1.573042631149292,
"learning_rate": 2.033427002647668e-06,
"loss": 0.37112024426460266,
"step": 1428
},
{
"epoch": 3.1707317073170733,
"grad_norm": 1.6631622314453125,
"learning_rate": 2.0229259253505946e-06,
"loss": 0.2901914417743683,
"step": 1430
},
{
"epoch": 3.1751662971175167,
"grad_norm": 1.1970975399017334,
"learning_rate": 2.012471627864943e-06,
"loss": 0.29324379563331604,
"step": 1432
},
{
"epoch": 3.17960088691796,
"grad_norm": 1.13413667678833,
"learning_rate": 2.0020642508405984e-06,
"loss": 0.3578546643257141,
"step": 1434
},
{
"epoch": 3.1840354767184036,
"grad_norm": 2.5074386596679688,
"learning_rate": 1.9917039342961837e-06,
"loss": 0.0840587466955185,
"step": 1436
},
{
"epoch": 3.188470066518847,
"grad_norm": 0.9287826418876648,
"learning_rate": 1.9813908176171857e-06,
"loss": 0.27323436737060547,
"step": 1438
},
{
"epoch": 3.1929046563192904,
"grad_norm": 2.366941213607788,
"learning_rate": 1.97112503955407e-06,
"loss": 0.43178707361221313,
"step": 1440
},
{
"epoch": 3.197339246119734,
"grad_norm": 1.5955890417099,
"learning_rate": 1.9609067382204224e-06,
"loss": 0.47529783844947815,
"step": 1442
},
{
"epoch": 3.2017738359201773,
"grad_norm": 1.0811793804168701,
"learning_rate": 1.950736051091084e-06,
"loss": 0.4838941693305969,
"step": 1444
},
{
"epoch": 3.2062084257206207,
"grad_norm": 1.8544070720672607,
"learning_rate": 1.9406131150003036e-06,
"loss": 0.37333235144615173,
"step": 1446
},
{
"epoch": 3.210643015521064,
"grad_norm": 7.789588451385498,
"learning_rate": 1.930538066139904e-06,
"loss": 0.1161646842956543,
"step": 1448
},
{
"epoch": 3.2150776053215075,
"grad_norm": 0.8985663056373596,
"learning_rate": 1.9205110400574368e-06,
"loss": 0.5245546698570251,
"step": 1450
},
{
"epoch": 3.2195121951219514,
"grad_norm": 1.785593867301941,
"learning_rate": 1.910532171654367e-06,
"loss": 0.23958845436573029,
"step": 1452
},
{
"epoch": 3.223946784922395,
"grad_norm": 1.4889003038406372,
"learning_rate": 1.9006015951842587e-06,
"loss": 0.3209075331687927,
"step": 1454
},
{
"epoch": 3.2283813747228383,
"grad_norm": 3.653893232345581,
"learning_rate": 1.8907194442509642e-06,
"loss": 0.43527886271476746,
"step": 1456
},
{
"epoch": 3.2328159645232817,
"grad_norm": 2.4832544326782227,
"learning_rate": 1.8808858518068312e-06,
"loss": 0.2721869647502899,
"step": 1458
},
{
"epoch": 3.237250554323725,
"grad_norm": 1.3052841424942017,
"learning_rate": 1.8711009501509087e-06,
"loss": 0.44211310148239136,
"step": 1460
},
{
"epoch": 3.2416851441241685,
"grad_norm": 1.3836771249771118,
"learning_rate": 1.8613648709271732e-06,
"loss": 0.26933524012565613,
"step": 1462
},
{
"epoch": 3.246119733924612,
"grad_norm": 3.1456692218780518,
"learning_rate": 1.8516777451227552e-06,
"loss": 0.39066338539123535,
"step": 1464
},
{
"epoch": 3.2505543237250554,
"grad_norm": 1.4456260204315186,
"learning_rate": 1.842039703066172e-06,
"loss": 0.47105956077575684,
"step": 1466
},
{
"epoch": 3.254988913525499,
"grad_norm": 0.20494961738586426,
"learning_rate": 1.8324508744255842e-06,
"loss": 0.039846230298280716,
"step": 1468
},
{
"epoch": 3.259423503325942,
"grad_norm": 2.348210573196411,
"learning_rate": 1.8229113882070398e-06,
"loss": 0.310930460691452,
"step": 1470
},
{
"epoch": 3.2638580931263856,
"grad_norm": 1.5028283596038818,
"learning_rate": 1.8134213727527504e-06,
"loss": 0.420907586812973,
"step": 1472
},
{
"epoch": 3.2682926829268295,
"grad_norm": 0.636045515537262,
"learning_rate": 1.803980955739354e-06,
"loss": 0.08325402438640594,
"step": 1474
},
{
"epoch": 3.2727272727272725,
"grad_norm": 0.7118128538131714,
"learning_rate": 1.7945902641762027e-06,
"loss": 0.3166371285915375,
"step": 1476
},
{
"epoch": 3.2771618625277164,
"grad_norm": 1.0103552341461182,
"learning_rate": 1.785249424403654e-06,
"loss": 0.4005421996116638,
"step": 1478
},
{
"epoch": 3.2815964523281598,
"grad_norm": 0.3991997539997101,
"learning_rate": 1.7759585620913723e-06,
"loss": 0.27822190523147583,
"step": 1480
},
{
"epoch": 3.286031042128603,
"grad_norm": 0.9653980731964111,
"learning_rate": 1.7667178022366294e-06,
"loss": 0.4057963192462921,
"step": 1482
},
{
"epoch": 3.2904656319290466,
"grad_norm": 8.63097095489502,
"learning_rate": 1.757527269162636e-06,
"loss": 0.327663779258728,
"step": 1484
},
{
"epoch": 3.29490022172949,
"grad_norm": 0.9608578681945801,
"learning_rate": 1.7483870865168585e-06,
"loss": 0.3130677342414856,
"step": 1486
},
{
"epoch": 3.2993348115299335,
"grad_norm": 2.4934980869293213,
"learning_rate": 1.739297377269361e-06,
"loss": 0.18216699361801147,
"step": 1488
},
{
"epoch": 3.303769401330377,
"grad_norm": 2.835766553878784,
"learning_rate": 1.730258263711149e-06,
"loss": 0.1408637911081314,
"step": 1490
},
{
"epoch": 3.3082039911308203,
"grad_norm": 1.3449338674545288,
"learning_rate": 1.7212698674525246e-06,
"loss": 0.3613136410713196,
"step": 1492
},
{
"epoch": 3.3126385809312637,
"grad_norm": 1.2930020093917847,
"learning_rate": 1.7123323094214485e-06,
"loss": 0.3133498728275299,
"step": 1494
},
{
"epoch": 3.317073170731707,
"grad_norm": 2.178227663040161,
"learning_rate": 1.7034457098619176e-06,
"loss": 0.329238623380661,
"step": 1496
},
{
"epoch": 3.3215077605321506,
"grad_norm": 0.9758614301681519,
"learning_rate": 1.6946101883323435e-06,
"loss": 0.5130437016487122,
"step": 1498
},
{
"epoch": 3.3259423503325944,
"grad_norm": 0.9882797002792358,
"learning_rate": 1.6858258637039421e-06,
"loss": 0.3322954475879669,
"step": 1500
},
{
"epoch": 3.330376940133038,
"grad_norm": 1.1139705181121826,
"learning_rate": 1.677092854159142e-06,
"loss": 0.37578579783439636,
"step": 1502
},
{
"epoch": 3.3348115299334813,
"grad_norm": 1.0699121952056885,
"learning_rate": 1.6684112771899858e-06,
"loss": 0.3910093903541565,
"step": 1504
},
{
"epoch": 3.3392461197339247,
"grad_norm": 0.19254817068576813,
"learning_rate": 1.6597812495965537e-06,
"loss": 0.17204663157463074,
"step": 1506
},
{
"epoch": 3.343680709534368,
"grad_norm": 1.2403380870819092,
"learning_rate": 1.651202887485394e-06,
"loss": 0.21663136780261993,
"step": 1508
},
{
"epoch": 3.3481152993348116,
"grad_norm": 2.0894668102264404,
"learning_rate": 1.6426763062679553e-06,
"loss": 0.47299203276634216,
"step": 1510
},
{
"epoch": 3.352549889135255,
"grad_norm": 0.15021318197250366,
"learning_rate": 1.63420162065904e-06,
"loss": 0.20796214044094086,
"step": 1512
},
{
"epoch": 3.3569844789356984,
"grad_norm": 1.074012041091919,
"learning_rate": 1.625778944675257e-06,
"loss": 0.4941790997982025,
"step": 1514
},
{
"epoch": 3.361419068736142,
"grad_norm": 6.77968168258667,
"learning_rate": 1.6174083916334877e-06,
"loss": 0.3023959994316101,
"step": 1516
},
{
"epoch": 3.3658536585365852,
"grad_norm": 1.5364946126937866,
"learning_rate": 1.609090074149366e-06,
"loss": 0.30802658200263977,
"step": 1518
},
{
"epoch": 3.3702882483370287,
"grad_norm": 2.5270133018493652,
"learning_rate": 1.6008241041357535e-06,
"loss": 0.5083972811698914,
"step": 1520
},
{
"epoch": 3.374722838137472,
"grad_norm": 0.16544599831104279,
"learning_rate": 1.5926105928012486e-06,
"loss": 0.2425152212381363,
"step": 1522
},
{
"epoch": 3.3791574279379155,
"grad_norm": 1.5787891149520874,
"learning_rate": 1.5844496506486734e-06,
"loss": 0.5767493844032288,
"step": 1524
},
{
"epoch": 3.3835920177383594,
"grad_norm": 24.345401763916016,
"learning_rate": 1.576341387473601e-06,
"loss": 0.3439426124095917,
"step": 1526
},
{
"epoch": 3.388026607538803,
"grad_norm": 1.3219624757766724,
"learning_rate": 1.568285912362872e-06,
"loss": 0.21882264316082,
"step": 1528
},
{
"epoch": 3.3924611973392462,
"grad_norm": 0.47698870301246643,
"learning_rate": 1.5602833336931242e-06,
"loss": 0.209283709526062,
"step": 1530
},
{
"epoch": 3.3968957871396896,
"grad_norm": 0.9787905812263489,
"learning_rate": 1.552333759129344e-06,
"loss": 0.09470443427562714,
"step": 1532
},
{
"epoch": 3.401330376940133,
"grad_norm": 1.2374992370605469,
"learning_rate": 1.5444372956234062e-06,
"loss": 0.3461211919784546,
"step": 1534
},
{
"epoch": 3.4057649667405765,
"grad_norm": 1.2397676706314087,
"learning_rate": 1.5365940494126424e-06,
"loss": 0.46922361850738525,
"step": 1536
},
{
"epoch": 3.41019955654102,
"grad_norm": 1.3618249893188477,
"learning_rate": 1.5288041260184132e-06,
"loss": 0.3622947037220001,
"step": 1538
},
{
"epoch": 3.4146341463414633,
"grad_norm": 1.0507458448410034,
"learning_rate": 1.5210676302446801e-06,
"loss": 0.39398759603500366,
"step": 1540
},
{
"epoch": 3.4190687361419068,
"grad_norm": 1.3180983066558838,
"learning_rate": 1.5133846661766058e-06,
"loss": 0.3517080545425415,
"step": 1542
},
{
"epoch": 3.42350332594235,
"grad_norm": 1.1892797946929932,
"learning_rate": 1.5057553371791461e-06,
"loss": 0.3794390559196472,
"step": 1544
},
{
"epoch": 3.4279379157427936,
"grad_norm": 34.94332504272461,
"learning_rate": 1.4981797458956624e-06,
"loss": 0.10474438220262527,
"step": 1546
},
{
"epoch": 3.4323725055432375,
"grad_norm": 0.027800027281045914,
"learning_rate": 1.490657994246542e-06,
"loss": 0.0768439918756485,
"step": 1548
},
{
"epoch": 3.436807095343681,
"grad_norm": 1.083954095840454,
"learning_rate": 1.4831901834278212e-06,
"loss": 0.42769551277160645,
"step": 1550
},
{
"epoch": 3.4412416851441243,
"grad_norm": 1.0862568616867065,
"learning_rate": 1.4757764139098332e-06,
"loss": 0.3436740040779114,
"step": 1552
},
{
"epoch": 3.4456762749445677,
"grad_norm": 0.10507642477750778,
"learning_rate": 1.468416785435847e-06,
"loss": 0.29862260818481445,
"step": 1554
},
{
"epoch": 3.450110864745011,
"grad_norm": 1.1534576416015625,
"learning_rate": 1.461111397020732e-06,
"loss": 0.4363846778869629,
"step": 1556
},
{
"epoch": 3.4545454545454546,
"grad_norm": 1.3515652418136597,
"learning_rate": 1.4538603469496215e-06,
"loss": 0.4772418439388275,
"step": 1558
},
{
"epoch": 3.458980044345898,
"grad_norm": 0.9060417413711548,
"learning_rate": 1.4466637327765937e-06,
"loss": 0.5036817789077759,
"step": 1560
},
{
"epoch": 3.4634146341463414,
"grad_norm": 1.3972153663635254,
"learning_rate": 1.4395216513233584e-06,
"loss": 0.0940362960100174,
"step": 1562
},
{
"epoch": 3.467849223946785,
"grad_norm": 0.9557284712791443,
"learning_rate": 1.4324341986779527e-06,
"loss": 0.6766175627708435,
"step": 1564
},
{
"epoch": 3.4722838137472283,
"grad_norm": 0.8374095559120178,
"learning_rate": 1.4254014701934481e-06,
"loss": 0.21213091909885406,
"step": 1566
},
{
"epoch": 3.4767184035476717,
"grad_norm": 1.0304781198501587,
"learning_rate": 1.4184235604866725e-06,
"loss": 0.3508773744106293,
"step": 1568
},
{
"epoch": 3.481152993348115,
"grad_norm": 2.0591909885406494,
"learning_rate": 1.4115005634369296e-06,
"loss": 0.34335634112358093,
"step": 1570
},
{
"epoch": 3.4855875831485585,
"grad_norm": 0.2248295098543167,
"learning_rate": 1.4046325721847443e-06,
"loss": 0.3259221315383911,
"step": 1572
},
{
"epoch": 3.4900221729490024,
"grad_norm": 1.0798416137695312,
"learning_rate": 1.397819679130601e-06,
"loss": 0.5986089110374451,
"step": 1574
},
{
"epoch": 3.494456762749446,
"grad_norm": 3.3881709575653076,
"learning_rate": 1.3910619759337074e-06,
"loss": 0.26687368750572205,
"step": 1576
},
{
"epoch": 3.4988913525498893,
"grad_norm": 0.6919270157814026,
"learning_rate": 1.3843595535107587e-06,
"loss": 0.23883309960365295,
"step": 1578
},
{
"epoch": 3.5033259423503327,
"grad_norm": 1.5870050191879272,
"learning_rate": 1.377712502034712e-06,
"loss": 0.5271674394607544,
"step": 1580
},
{
"epoch": 3.507760532150776,
"grad_norm": 1.3132179975509644,
"learning_rate": 1.3711209109335793e-06,
"loss": 0.48455068469047546,
"step": 1582
},
{
"epoch": 3.5121951219512195,
"grad_norm": 0.9779510498046875,
"learning_rate": 1.3645848688892162e-06,
"loss": 0.2770904302597046,
"step": 1584
},
{
"epoch": 3.516629711751663,
"grad_norm": 1.488776445388794,
"learning_rate": 1.3581044638361373e-06,
"loss": 0.2443387657403946,
"step": 1586
},
{
"epoch": 3.5210643015521064,
"grad_norm": 1.3030271530151367,
"learning_rate": 1.3516797829603256e-06,
"loss": 0.44791534543037415,
"step": 1588
},
{
"epoch": 3.52549889135255,
"grad_norm": 2.1599605083465576,
"learning_rate": 1.3453109126980643e-06,
"loss": 0.19074156880378723,
"step": 1590
},
{
"epoch": 3.529933481152993,
"grad_norm": 1.5765827894210815,
"learning_rate": 1.3389979387347743e-06,
"loss": 0.23592326045036316,
"step": 1592
},
{
"epoch": 3.5343680709534366,
"grad_norm": 1.3213473558425903,
"learning_rate": 1.332740946003857e-06,
"loss": 0.4785956144332886,
"step": 1594
},
{
"epoch": 3.5388026607538805,
"grad_norm": 1.8811619281768799,
"learning_rate": 1.3265400186855548e-06,
"loss": 0.07135710120201111,
"step": 1596
},
{
"epoch": 3.5432372505543235,
"grad_norm": 3.579979181289673,
"learning_rate": 1.320395240205819e-06,
"loss": 0.4762045741081238,
"step": 1598
},
{
"epoch": 3.5476718403547673,
"grad_norm": 1.0917787551879883,
"learning_rate": 1.3143066932351856e-06,
"loss": 0.2512458860874176,
"step": 1600
},
{
"epoch": 3.5521064301552108,
"grad_norm": 1.354770302772522,
"learning_rate": 1.308274459687665e-06,
"loss": 0.2989339232444763,
"step": 1602
},
{
"epoch": 3.556541019955654,
"grad_norm": 0.9256249070167542,
"learning_rate": 1.3022986207196367e-06,
"loss": 0.5872430205345154,
"step": 1604
},
{
"epoch": 3.5609756097560976,
"grad_norm": 1.0018621683120728,
"learning_rate": 1.2963792567287617e-06,
"loss": 0.5670958757400513,
"step": 1606
},
{
"epoch": 3.565410199556541,
"grad_norm": 0.1671750247478485,
"learning_rate": 1.290516447352899e-06,
"loss": 0.034741759300231934,
"step": 1608
},
{
"epoch": 3.5698447893569845,
"grad_norm": 1.0988103151321411,
"learning_rate": 1.2847102714690308e-06,
"loss": 0.3017559051513672,
"step": 1610
},
{
"epoch": 3.574279379157428,
"grad_norm": 2.024768352508545,
"learning_rate": 1.2789608071922076e-06,
"loss": 0.09087596833705902,
"step": 1612
},
{
"epoch": 3.5787139689578713,
"grad_norm": 2.462632656097412,
"learning_rate": 1.2732681318744923e-06,
"loss": 0.3814306855201721,
"step": 1614
},
{
"epoch": 3.5831485587583147,
"grad_norm": 1.586369276046753,
"learning_rate": 1.2676323221039236e-06,
"loss": 0.7159979939460754,
"step": 1616
},
{
"epoch": 3.587583148558758,
"grad_norm": 3.622847557067871,
"learning_rate": 1.2620534537034795e-06,
"loss": 0.24280951917171478,
"step": 1618
},
{
"epoch": 3.5920177383592016,
"grad_norm": 2.6759791374206543,
"learning_rate": 1.2565316017300635e-06,
"loss": 0.40450718998908997,
"step": 1620
},
{
"epoch": 3.5964523281596454,
"grad_norm": 0.979674220085144,
"learning_rate": 1.2510668404734924e-06,
"loss": 0.3859134614467621,
"step": 1622
},
{
"epoch": 3.6008869179600884,
"grad_norm": 1.1429744958877563,
"learning_rate": 1.2456592434554963e-06,
"loss": 0.4199633002281189,
"step": 1624
},
{
"epoch": 3.6053215077605323,
"grad_norm": 10.887653350830078,
"learning_rate": 1.2403088834287282e-06,
"loss": 0.11880503594875336,
"step": 1626
},
{
"epoch": 3.6097560975609757,
"grad_norm": 1.1529648303985596,
"learning_rate": 1.2350158323757903e-06,
"loss": 0.3755669593811035,
"step": 1628
},
{
"epoch": 3.614190687361419,
"grad_norm": 2.1950061321258545,
"learning_rate": 1.229780161508259e-06,
"loss": 0.30650004744529724,
"step": 1630
},
{
"epoch": 3.6186252771618626,
"grad_norm": 1.1274499893188477,
"learning_rate": 1.2246019412657319e-06,
"loss": 0.49355947971343994,
"step": 1632
},
{
"epoch": 3.623059866962306,
"grad_norm": 1.7049529552459717,
"learning_rate": 1.2194812413148756e-06,
"loss": 0.49852749705314636,
"step": 1634
},
{
"epoch": 3.6274944567627494,
"grad_norm": 1.2080436944961548,
"learning_rate": 1.214418130548495e-06,
"loss": 0.3345094621181488,
"step": 1636
},
{
"epoch": 3.631929046563193,
"grad_norm": 2.6900432109832764,
"learning_rate": 1.2094126770845986e-06,
"loss": 0.38614609837532043,
"step": 1638
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.9296088814735413,
"learning_rate": 1.2044649482654876e-06,
"loss": 0.5239455103874207,
"step": 1640
},
{
"epoch": 3.6407982261640797,
"grad_norm": 0.42654258012771606,
"learning_rate": 1.1995750106568496e-06,
"loss": 0.24880681931972504,
"step": 1642
},
{
"epoch": 3.6452328159645235,
"grad_norm": 0.9114068746566772,
"learning_rate": 1.1947429300468575e-06,
"loss": 0.42800384759902954,
"step": 1644
},
{
"epoch": 3.6496674057649665,
"grad_norm": 0.22093704342842102,
"learning_rate": 1.1899687714452932e-06,
"loss": 0.24181388318538666,
"step": 1646
},
{
"epoch": 3.6541019955654104,
"grad_norm": 2.1429035663604736,
"learning_rate": 1.1852525990826658e-06,
"loss": 0.16454415023326874,
"step": 1648
},
{
"epoch": 3.658536585365854,
"grad_norm": 1.6719216108322144,
"learning_rate": 1.1805944764093484e-06,
"loss": 0.34569817781448364,
"step": 1650
},
{
"epoch": 3.662971175166297,
"grad_norm": 2.202148199081421,
"learning_rate": 1.1759944660947301e-06,
"loss": 0.44416671991348267,
"step": 1652
},
{
"epoch": 3.6674057649667406,
"grad_norm": 1.0591462850570679,
"learning_rate": 1.171452630026365e-06,
"loss": 0.1786390095949173,
"step": 1654
},
{
"epoch": 3.671840354767184,
"grad_norm": 1.1581944227218628,
"learning_rate": 1.1669690293091452e-06,
"loss": 0.4971603453159332,
"step": 1656
},
{
"epoch": 3.6762749445676275,
"grad_norm": 2.9780452251434326,
"learning_rate": 1.1625437242644772e-06,
"loss": 0.28031206130981445,
"step": 1658
},
{
"epoch": 3.680709534368071,
"grad_norm": 0.9189567565917969,
"learning_rate": 1.1581767744294682e-06,
"loss": 0.325800359249115,
"step": 1660
},
{
"epoch": 3.6851441241685143,
"grad_norm": 0.1943116933107376,
"learning_rate": 1.1538682385561286e-06,
"loss": 0.2070523202419281,
"step": 1662
},
{
"epoch": 3.6895787139689578,
"grad_norm": 1.0158610343933105,
"learning_rate": 1.1496181746105784e-06,
"loss": 0.23334433138370514,
"step": 1664
},
{
"epoch": 3.694013303769401,
"grad_norm": 0.8135693073272705,
"learning_rate": 1.1454266397722707e-06,
"loss": 0.40261054039001465,
"step": 1666
},
{
"epoch": 3.6984478935698446,
"grad_norm": 1.025733470916748,
"learning_rate": 1.1412936904332181e-06,
"loss": 0.3658636212348938,
"step": 1668
},
{
"epoch": 3.7028824833702885,
"grad_norm": 1.5712085962295532,
"learning_rate": 1.1372193821972379e-06,
"loss": 0.4164735674858093,
"step": 1670
},
{
"epoch": 3.7073170731707314,
"grad_norm": 1.251022458076477,
"learning_rate": 1.1332037698792033e-06,
"loss": 0.32476893067359924,
"step": 1672
},
{
"epoch": 3.7117516629711753,
"grad_norm": 1.0968090295791626,
"learning_rate": 1.1292469075043026e-06,
"loss": 0.5027676224708557,
"step": 1674
},
{
"epoch": 3.7161862527716187,
"grad_norm": 1.1787587404251099,
"learning_rate": 1.1253488483073177e-06,
"loss": 0.4444116950035095,
"step": 1676
},
{
"epoch": 3.720620842572062,
"grad_norm": 1.1607528924942017,
"learning_rate": 1.1215096447319038e-06,
"loss": 0.41037842631340027,
"step": 1678
},
{
"epoch": 3.7250554323725056,
"grad_norm": 2.8341286182403564,
"learning_rate": 1.117729348429884e-06,
"loss": 0.19149872660636902,
"step": 1680
},
{
"epoch": 3.729490022172949,
"grad_norm": 1.3583526611328125,
"learning_rate": 1.114008010260558e-06,
"loss": 0.4621056318283081,
"step": 1682
},
{
"epoch": 3.7339246119733924,
"grad_norm": 0.825949490070343,
"learning_rate": 1.1103456802900134e-06,
"loss": 0.18589909374713898,
"step": 1684
},
{
"epoch": 3.738359201773836,
"grad_norm": 2.031763792037964,
"learning_rate": 1.1067424077904555e-06,
"loss": 0.3091331422328949,
"step": 1686
},
{
"epoch": 3.7427937915742793,
"grad_norm": 0.24129338562488556,
"learning_rate": 1.103198241239542e-06,
"loss": 0.07289690524339676,
"step": 1688
},
{
"epoch": 3.7472283813747227,
"grad_norm": 1.1850625276565552,
"learning_rate": 1.0997132283197324e-06,
"loss": 0.5156506896018982,
"step": 1690
},
{
"epoch": 3.7516629711751666,
"grad_norm": 2.6560683250427246,
"learning_rate": 1.0962874159176454e-06,
"loss": 0.5042511820793152,
"step": 1692
},
{
"epoch": 3.7560975609756095,
"grad_norm": 1.1266878843307495,
"learning_rate": 1.0929208501234286e-06,
"loss": 0.5055519938468933,
"step": 1694
},
{
"epoch": 3.7605321507760534,
"grad_norm": 1.4622995853424072,
"learning_rate": 1.0896135762301393e-06,
"loss": 0.46531805396080017,
"step": 1696
},
{
"epoch": 3.764966740576497,
"grad_norm": 1.787441372871399,
"learning_rate": 1.0863656387331328e-06,
"loss": 0.21703627705574036,
"step": 1698
},
{
"epoch": 3.7694013303769403,
"grad_norm": 1.8496445417404175,
"learning_rate": 1.0831770813294668e-06,
"loss": 0.3597804009914398,
"step": 1700
},
{
"epoch": 3.7738359201773837,
"grad_norm": 0.731442928314209,
"learning_rate": 1.0800479469173101e-06,
"loss": 0.6957812309265137,
"step": 1702
},
{
"epoch": 3.778270509977827,
"grad_norm": 0.28417858481407166,
"learning_rate": 1.076978277595369e-06,
"loss": 0.05468475818634033,
"step": 1704
},
{
"epoch": 3.7827050997782705,
"grad_norm": 0.22050651907920837,
"learning_rate": 1.0739681146623185e-06,
"loss": 0.26601287722587585,
"step": 1706
},
{
"epoch": 3.787139689578714,
"grad_norm": 1.0851820707321167,
"learning_rate": 1.0710174986162471e-06,
"loss": 0.2386590987443924,
"step": 1708
},
{
"epoch": 3.7915742793791574,
"grad_norm": 0.8813753724098206,
"learning_rate": 1.0681264691541127e-06,
"loss": 0.6136298775672913,
"step": 1710
},
{
"epoch": 3.796008869179601,
"grad_norm": 2.6262285709381104,
"learning_rate": 1.0652950651712072e-06,
"loss": 0.2965908348560333,
"step": 1712
},
{
"epoch": 3.800443458980044,
"grad_norm": 2.0973598957061768,
"learning_rate": 1.0625233247606348e-06,
"loss": 0.2669585049152374,
"step": 1714
},
{
"epoch": 3.8048780487804876,
"grad_norm": 1.7458832263946533,
"learning_rate": 1.059811285212799e-06,
"loss": 0.5059341192245483,
"step": 1716
},
{
"epoch": 3.8093126385809315,
"grad_norm": 1.2826629877090454,
"learning_rate": 1.0571589830149e-06,
"loss": 0.27312329411506653,
"step": 1718
},
{
"epoch": 3.8137472283813745,
"grad_norm": 3.2964842319488525,
"learning_rate": 1.054566453850444e-06,
"loss": 0.3187982738018036,
"step": 1720
},
{
"epoch": 3.8181818181818183,
"grad_norm": 2.765103340148926,
"learning_rate": 1.0520337325987649e-06,
"loss": 0.5375199913978577,
"step": 1722
},
{
"epoch": 3.8226164079822618,
"grad_norm": 5.0600361824035645,
"learning_rate": 1.049560853334553e-06,
"loss": 0.4722135066986084,
"step": 1724
},
{
"epoch": 3.827050997782705,
"grad_norm": 1.3732205629348755,
"learning_rate": 1.0471478493273976e-06,
"loss": 0.4118424654006958,
"step": 1726
},
{
"epoch": 3.8314855875831486,
"grad_norm": 1.0661771297454834,
"learning_rate": 1.0447947530413389e-06,
"loss": 0.2754386365413666,
"step": 1728
},
{
"epoch": 3.835920177383592,
"grad_norm": 1.2804239988327026,
"learning_rate": 1.042501596134431e-06,
"loss": 0.2026994377374649,
"step": 1730
},
{
"epoch": 3.8403547671840355,
"grad_norm": 1.4092806577682495,
"learning_rate": 1.0402684094583173e-06,
"loss": 0.4652438163757324,
"step": 1732
},
{
"epoch": 3.844789356984479,
"grad_norm": 4.014774322509766,
"learning_rate": 1.0380952230578125e-06,
"loss": 0.379792720079422,
"step": 1734
},
{
"epoch": 3.8492239467849223,
"grad_norm": 1.469107985496521,
"learning_rate": 1.0359820661705042e-06,
"loss": 0.3514306843280792,
"step": 1736
},
{
"epoch": 3.8536585365853657,
"grad_norm": 1.1409002542495728,
"learning_rate": 1.0339289672263519e-06,
"loss": 0.44202250242233276,
"step": 1738
},
{
"epoch": 3.858093126385809,
"grad_norm": 0.8285048604011536,
"learning_rate": 1.0319359538473107e-06,
"loss": 0.23279811441898346,
"step": 1740
},
{
"epoch": 3.8625277161862526,
"grad_norm": 1.168976902961731,
"learning_rate": 1.0300030528469564e-06,
"loss": 0.20990443229675293,
"step": 1742
},
{
"epoch": 3.8669623059866964,
"grad_norm": 1.5003247261047363,
"learning_rate": 1.0281302902301254e-06,
"loss": 0.4064357280731201,
"step": 1744
},
{
"epoch": 3.8713968957871394,
"grad_norm": 0.8279268741607666,
"learning_rate": 1.026317691192567e-06,
"loss": 0.4411630630493164,
"step": 1746
},
{
"epoch": 3.8758314855875833,
"grad_norm": 1.1625535488128662,
"learning_rate": 1.0245652801205999e-06,
"loss": 0.2272336483001709,
"step": 1748
},
{
"epoch": 3.8802660753880267,
"grad_norm": 12.195842742919922,
"learning_rate": 1.0228730805907891e-06,
"loss": 0.3394715189933777,
"step": 1750
},
{
"epoch": 3.88470066518847,
"grad_norm": 3.731518507003784,
"learning_rate": 1.0212411153696247e-06,
"loss": 0.34466421604156494,
"step": 1752
},
{
"epoch": 3.8891352549889135,
"grad_norm": 1.2109719514846802,
"learning_rate": 1.019669406413218e-06,
"loss": 0.3292272388935089,
"step": 1754
},
{
"epoch": 3.893569844789357,
"grad_norm": 3.1141412258148193,
"learning_rate": 1.0181579748670054e-06,
"loss": 0.3200131058692932,
"step": 1756
},
{
"epoch": 3.8980044345898004,
"grad_norm": 0.9708091020584106,
"learning_rate": 1.0167068410654643e-06,
"loss": 0.4250810444355011,
"step": 1758
},
{
"epoch": 3.902439024390244,
"grad_norm": 0.030687185004353523,
"learning_rate": 1.0153160245318384e-06,
"loss": 0.002097110729664564,
"step": 1760
},
{
"epoch": 3.9068736141906872,
"grad_norm": 1.3904731273651123,
"learning_rate": 1.0139855439778766e-06,
"loss": 0.08549664914608002,
"step": 1762
},
{
"epoch": 3.9113082039911307,
"grad_norm": 0.10882235318422318,
"learning_rate": 1.0127154173035787e-06,
"loss": 0.28717437386512756,
"step": 1764
},
{
"epoch": 3.9157427937915745,
"grad_norm": 1.5540599822998047,
"learning_rate": 1.0115056615969584e-06,
"loss": 0.2964329421520233,
"step": 1766
},
{
"epoch": 3.9201773835920175,
"grad_norm": 1.1777206659317017,
"learning_rate": 1.0103562931338105e-06,
"loss": 0.619647741317749,
"step": 1768
},
{
"epoch": 3.9246119733924614,
"grad_norm": 2.035280704498291,
"learning_rate": 1.009267327377492e-06,
"loss": 0.48704907298088074,
"step": 1770
},
{
"epoch": 3.929046563192905,
"grad_norm": 0.8825568556785583,
"learning_rate": 1.008238778978716e-06,
"loss": 0.18097802996635437,
"step": 1772
},
{
"epoch": 3.933481152993348,
"grad_norm": 0.8677657246589661,
"learning_rate": 1.0072706617753528e-06,
"loss": 0.39700010418891907,
"step": 1774
},
{
"epoch": 3.9379157427937916,
"grad_norm": 1.6046772003173828,
"learning_rate": 1.0063629887922441e-06,
"loss": 0.5214118361473083,
"step": 1776
},
{
"epoch": 3.942350332594235,
"grad_norm": 0.15087999403476715,
"learning_rate": 1.0055157722410279e-06,
"loss": 0.03213101252913475,
"step": 1778
},
{
"epoch": 3.9467849223946785,
"grad_norm": 3.6360549926757812,
"learning_rate": 1.0047290235199753e-06,
"loss": 0.32588061690330505,
"step": 1780
},
{
"epoch": 3.951219512195122,
"grad_norm": 0.1825929582118988,
"learning_rate": 1.0040027532138351e-06,
"loss": 0.22568024694919586,
"step": 1782
},
{
"epoch": 3.9556541019955653,
"grad_norm": 0.905255138874054,
"learning_rate": 1.0033369710936928e-06,
"loss": 0.3658754527568817,
"step": 1784
},
{
"epoch": 3.9600886917960088,
"grad_norm": 0.8106067180633545,
"learning_rate": 1.0027316861168388e-06,
"loss": 0.357939213514328,
"step": 1786
},
{
"epoch": 3.964523281596452,
"grad_norm": 0.7470359206199646,
"learning_rate": 1.0021869064266472e-06,
"loss": 0.30442333221435547,
"step": 1788
},
{
"epoch": 3.9689578713968956,
"grad_norm": 0.6582420468330383,
"learning_rate": 1.0017026393524684e-06,
"loss": 0.23345550894737244,
"step": 1790
},
{
"epoch": 3.9733924611973395,
"grad_norm": 1.1834862232208252,
"learning_rate": 1.0012788914095275e-06,
"loss": 0.5164616703987122,
"step": 1792
},
{
"epoch": 3.9778270509977824,
"grad_norm": 0.7614670395851135,
"learning_rate": 1.0009156682988395e-06,
"loss": 0.36194929480552673,
"step": 1794
},
{
"epoch": 3.9822616407982263,
"grad_norm": 1.1429847478866577,
"learning_rate": 1.0006129749071298e-06,
"loss": 0.3539275825023651,
"step": 1796
},
{
"epoch": 3.9866962305986697,
"grad_norm": 1.9152874946594238,
"learning_rate": 1.00037081530677e-06,
"loss": 0.4812496304512024,
"step": 1798
},
{
"epoch": 3.991130820399113,
"grad_norm": 4.806298732757568,
"learning_rate": 1.0001891927557255e-06,
"loss": 0.4314287304878235,
"step": 1800
},
{
"epoch": 3.9955654101995566,
"grad_norm": 1.1714129447937012,
"learning_rate": 1.0000681096975056e-06,
"loss": 0.37641072273254395,
"step": 1802
},
{
"epoch": 4.0,
"grad_norm": 0.90379798412323,
"learning_rate": 1.0000075677611364e-06,
"loss": 0.12868885695934296,
"step": 1804
},
{
"epoch": 4.0,
"step": 1804,
"total_flos": 3.4175049861232067e+18,
"train_loss": 0.7562091423920304,
"train_runtime": 9085.3052,
"train_samples_per_second": 5.957,
"train_steps_per_second": 0.199
}
],
"logging_steps": 2,
"max_steps": 1804,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.4175049861232067e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}