{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 7106,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00028145229383619476,
"grad_norm": 1.9831818342208862,
"learning_rate": 2.8129395218002816e-07,
"loss": 2.1254,
"step": 1
},
{
"epoch": 0.0014072614691809737,
"grad_norm": 1.082022786140442,
"learning_rate": 1.4064697609001406e-06,
"loss": 1.2669,
"step": 5
},
{
"epoch": 0.0028145229383619475,
"grad_norm": 1.065866231918335,
"learning_rate": 2.8129395218002813e-06,
"loss": 1.3653,
"step": 10
},
{
"epoch": 0.004221784407542921,
"grad_norm": 2.5712316036224365,
"learning_rate": 4.219409282700422e-06,
"loss": 1.53,
"step": 15
},
{
"epoch": 0.005629045876723895,
"grad_norm": 2.248335838317871,
"learning_rate": 5.6258790436005626e-06,
"loss": 1.4474,
"step": 20
},
{
"epoch": 0.007036307345904869,
"grad_norm": 1.0500571727752686,
"learning_rate": 7.032348804500703e-06,
"loss": 1.3372,
"step": 25
},
{
"epoch": 0.008443568815085843,
"grad_norm": 1.867329716682434,
"learning_rate": 8.438818565400844e-06,
"loss": 1.2333,
"step": 30
},
{
"epoch": 0.009850830284266816,
"grad_norm": 3.1149637699127197,
"learning_rate": 9.845288326300985e-06,
"loss": 1.2918,
"step": 35
},
{
"epoch": 0.01125809175344779,
"grad_norm": 1.9895963668823242,
"learning_rate": 1.1251758087201125e-05,
"loss": 1.3152,
"step": 40
},
{
"epoch": 0.012665353222628765,
"grad_norm": 2.0947887897491455,
"learning_rate": 1.2658227848101267e-05,
"loss": 1.3231,
"step": 45
},
{
"epoch": 0.014072614691809739,
"grad_norm": 1.4856278896331787,
"learning_rate": 1.4064697609001406e-05,
"loss": 1.2376,
"step": 50
},
{
"epoch": 0.015479876160990712,
"grad_norm": 1.2920206785202026,
"learning_rate": 1.547116736990155e-05,
"loss": 1.1083,
"step": 55
},
{
"epoch": 0.016887137630171686,
"grad_norm": 1.3694531917572021,
"learning_rate": 1.6877637130801688e-05,
"loss": 0.9554,
"step": 60
},
{
"epoch": 0.01829439909935266,
"grad_norm": 1.335752248764038,
"learning_rate": 1.828410689170183e-05,
"loss": 0.8074,
"step": 65
},
{
"epoch": 0.019701660568533633,
"grad_norm": 0.8360928297042847,
"learning_rate": 1.969057665260197e-05,
"loss": 0.9003,
"step": 70
},
{
"epoch": 0.021108922037714608,
"grad_norm": 1.4033712148666382,
"learning_rate": 2.1097046413502112e-05,
"loss": 1.0069,
"step": 75
},
{
"epoch": 0.02251618350689558,
"grad_norm": 3.524489164352417,
"learning_rate": 2.250351617440225e-05,
"loss": 0.8655,
"step": 80
},
{
"epoch": 0.023923444976076555,
"grad_norm": 5.065525054931641,
"learning_rate": 2.3909985935302392e-05,
"loss": 0.8884,
"step": 85
},
{
"epoch": 0.02533070644525753,
"grad_norm": 1.1002086400985718,
"learning_rate": 2.5316455696202533e-05,
"loss": 0.7538,
"step": 90
},
{
"epoch": 0.026737967914438502,
"grad_norm": 1.5529321432113647,
"learning_rate": 2.672292545710267e-05,
"loss": 0.9944,
"step": 95
},
{
"epoch": 0.028145229383619477,
"grad_norm": 2.5230774879455566,
"learning_rate": 2.8129395218002813e-05,
"loss": 0.742,
"step": 100
},
{
"epoch": 0.02955249085280045,
"grad_norm": 1.8407468795776367,
"learning_rate": 2.9535864978902954e-05,
"loss": 0.6106,
"step": 105
},
{
"epoch": 0.030959752321981424,
"grad_norm": 1.8544448614120483,
"learning_rate": 3.09423347398031e-05,
"loss": 0.7784,
"step": 110
},
{
"epoch": 0.0323670137911624,
"grad_norm": 1.5393428802490234,
"learning_rate": 3.234880450070324e-05,
"loss": 0.6225,
"step": 115
},
{
"epoch": 0.03377427526034337,
"grad_norm": 0.9650129675865173,
"learning_rate": 3.3755274261603375e-05,
"loss": 0.466,
"step": 120
},
{
"epoch": 0.03518153672952434,
"grad_norm": 1.2199194431304932,
"learning_rate": 3.516174402250352e-05,
"loss": 0.7885,
"step": 125
},
{
"epoch": 0.03658879819870532,
"grad_norm": 3.1491034030914307,
"learning_rate": 3.656821378340366e-05,
"loss": 0.642,
"step": 130
},
{
"epoch": 0.037996059667886294,
"grad_norm": 1.015199899673462,
"learning_rate": 3.79746835443038e-05,
"loss": 1.0289,
"step": 135
},
{
"epoch": 0.039403321137067265,
"grad_norm": 1.211543321609497,
"learning_rate": 3.938115330520394e-05,
"loss": 0.8134,
"step": 140
},
{
"epoch": 0.040810582606248244,
"grad_norm": 1.6816538572311401,
"learning_rate": 4.078762306610408e-05,
"loss": 0.8157,
"step": 145
},
{
"epoch": 0.042217844075429216,
"grad_norm": 1.9145057201385498,
"learning_rate": 4.2194092827004224e-05,
"loss": 0.6091,
"step": 150
},
{
"epoch": 0.04362510554461019,
"grad_norm": 1.168205976486206,
"learning_rate": 4.3600562587904366e-05,
"loss": 0.5557,
"step": 155
},
{
"epoch": 0.04503236701379116,
"grad_norm": 0.8458957076072693,
"learning_rate": 4.50070323488045e-05,
"loss": 0.4024,
"step": 160
},
{
"epoch": 0.04643962848297214,
"grad_norm": 1.442372441291809,
"learning_rate": 4.641350210970464e-05,
"loss": 0.8014,
"step": 165
},
{
"epoch": 0.04784688995215311,
"grad_norm": 1.6391854286193848,
"learning_rate": 4.7819971870604783e-05,
"loss": 1.012,
"step": 170
},
{
"epoch": 0.04925415142133408,
"grad_norm": 1.334926724433899,
"learning_rate": 4.9226441631504925e-05,
"loss": 0.6832,
"step": 175
},
{
"epoch": 0.05066141289051506,
"grad_norm": 1.0498499870300293,
"learning_rate": 5.0632911392405066e-05,
"loss": 0.6351,
"step": 180
},
{
"epoch": 0.05206867435969603,
"grad_norm": 2.0023510456085205,
"learning_rate": 5.203938115330521e-05,
"loss": 0.6497,
"step": 185
},
{
"epoch": 0.053475935828877004,
"grad_norm": 1.9690536260604858,
"learning_rate": 5.344585091420534e-05,
"loss": 0.7836,
"step": 190
},
{
"epoch": 0.054883197298057976,
"grad_norm": 1.4102208614349365,
"learning_rate": 5.4852320675105484e-05,
"loss": 0.5955,
"step": 195
},
{
"epoch": 0.056290458767238954,
"grad_norm": 0.9214100241661072,
"learning_rate": 5.6258790436005626e-05,
"loss": 0.7519,
"step": 200
},
{
"epoch": 0.057697720236419926,
"grad_norm": 1.3210060596466064,
"learning_rate": 5.766526019690577e-05,
"loss": 0.5468,
"step": 205
},
{
"epoch": 0.0591049817056009,
"grad_norm": 1.723496437072754,
"learning_rate": 5.907172995780591e-05,
"loss": 0.4599,
"step": 210
},
{
"epoch": 0.06051224317478188,
"grad_norm": 1.1883797645568848,
"learning_rate": 6.047819971870605e-05,
"loss": 0.7824,
"step": 215
},
{
"epoch": 0.06191950464396285,
"grad_norm": 1.0189827680587769,
"learning_rate": 6.18846694796062e-05,
"loss": 0.5021,
"step": 220
},
{
"epoch": 0.06332676611314382,
"grad_norm": 1.1384845972061157,
"learning_rate": 6.329113924050633e-05,
"loss": 0.7703,
"step": 225
},
{
"epoch": 0.0647340275823248,
"grad_norm": 2.097339391708374,
"learning_rate": 6.469760900140648e-05,
"loss": 1.0998,
"step": 230
},
{
"epoch": 0.06614128905150576,
"grad_norm": 2.55668044090271,
"learning_rate": 6.610407876230662e-05,
"loss": 0.5333,
"step": 235
},
{
"epoch": 0.06754855052068674,
"grad_norm": 1.1277037858963013,
"learning_rate": 6.751054852320675e-05,
"loss": 0.6361,
"step": 240
},
{
"epoch": 0.06895581198986772,
"grad_norm": 2.0660481452941895,
"learning_rate": 6.89170182841069e-05,
"loss": 0.7486,
"step": 245
},
{
"epoch": 0.07036307345904869,
"grad_norm": 2.1117303371429443,
"learning_rate": 7.032348804500703e-05,
"loss": 0.7103,
"step": 250
},
{
"epoch": 0.07177033492822966,
"grad_norm": 1.1796034574508667,
"learning_rate": 7.172995780590718e-05,
"loss": 0.6379,
"step": 255
},
{
"epoch": 0.07317759639741064,
"grad_norm": 1.470502257347107,
"learning_rate": 7.313642756680732e-05,
"loss": 0.4737,
"step": 260
},
{
"epoch": 0.07458485786659161,
"grad_norm": 1.443248987197876,
"learning_rate": 7.454289732770746e-05,
"loss": 0.812,
"step": 265
},
{
"epoch": 0.07599211933577259,
"grad_norm": 3.0095481872558594,
"learning_rate": 7.59493670886076e-05,
"loss": 0.4456,
"step": 270
},
{
"epoch": 0.07739938080495357,
"grad_norm": 1.157353401184082,
"learning_rate": 7.735583684950773e-05,
"loss": 0.524,
"step": 275
},
{
"epoch": 0.07880664227413453,
"grad_norm": 1.1761438846588135,
"learning_rate": 7.876230661040788e-05,
"loss": 0.7222,
"step": 280
},
{
"epoch": 0.08021390374331551,
"grad_norm": 0.64066082239151,
"learning_rate": 8.016877637130802e-05,
"loss": 0.5886,
"step": 285
},
{
"epoch": 0.08162116521249649,
"grad_norm": 0.9376239776611328,
"learning_rate": 8.157524613220817e-05,
"loss": 0.6901,
"step": 290
},
{
"epoch": 0.08302842668167745,
"grad_norm": 0.9339331388473511,
"learning_rate": 8.29817158931083e-05,
"loss": 0.389,
"step": 295
},
{
"epoch": 0.08443568815085843,
"grad_norm": 1.1914637088775635,
"learning_rate": 8.438818565400845e-05,
"loss": 0.535,
"step": 300
},
{
"epoch": 0.0858429496200394,
"grad_norm": 1.1882398128509521,
"learning_rate": 8.579465541490858e-05,
"loss": 0.3909,
"step": 305
},
{
"epoch": 0.08725021108922038,
"grad_norm": 1.5186290740966797,
"learning_rate": 8.720112517580873e-05,
"loss": 0.6317,
"step": 310
},
{
"epoch": 0.08865747255840135,
"grad_norm": 1.509944200515747,
"learning_rate": 8.860759493670887e-05,
"loss": 0.4739,
"step": 315
},
{
"epoch": 0.09006473402758232,
"grad_norm": 1.4957388639450073,
"learning_rate": 9.0014064697609e-05,
"loss": 0.6078,
"step": 320
},
{
"epoch": 0.0914719954967633,
"grad_norm": 1.8821747303009033,
"learning_rate": 9.142053445850915e-05,
"loss": 0.8152,
"step": 325
},
{
"epoch": 0.09287925696594428,
"grad_norm": 0.9399609565734863,
"learning_rate": 9.282700421940928e-05,
"loss": 0.6356,
"step": 330
},
{
"epoch": 0.09428651843512524,
"grad_norm": 1.4053034782409668,
"learning_rate": 9.423347398030943e-05,
"loss": 0.7405,
"step": 335
},
{
"epoch": 0.09569377990430622,
"grad_norm": 0.9742883443832397,
"learning_rate": 9.563994374120957e-05,
"loss": 0.7251,
"step": 340
},
{
"epoch": 0.0971010413734872,
"grad_norm": 3.047891616821289,
"learning_rate": 9.704641350210972e-05,
"loss": 0.7387,
"step": 345
},
{
"epoch": 0.09850830284266816,
"grad_norm": 0.8324292898178101,
"learning_rate": 9.845288326300985e-05,
"loss": 0.584,
"step": 350
},
{
"epoch": 0.09991556431184914,
"grad_norm": 1.0198436975479126,
"learning_rate": 9.985935302391e-05,
"loss": 0.4691,
"step": 355
},
{
"epoch": 0.10132282578103012,
"grad_norm": 3.0640432834625244,
"learning_rate": 0.00010126582278481013,
"loss": 0.4508,
"step": 360
},
{
"epoch": 0.10273008725021109,
"grad_norm": 0.9727720022201538,
"learning_rate": 0.00010267229254571027,
"loss": 0.4544,
"step": 365
},
{
"epoch": 0.10413734871939206,
"grad_norm": 1.4771376848220825,
"learning_rate": 0.00010407876230661042,
"loss": 0.5085,
"step": 370
},
{
"epoch": 0.10554461018857304,
"grad_norm": 1.5016095638275146,
"learning_rate": 0.00010548523206751055,
"loss": 0.5482,
"step": 375
},
{
"epoch": 0.10695187165775401,
"grad_norm": 1.5180020332336426,
"learning_rate": 0.00010689170182841069,
"loss": 0.7243,
"step": 380
},
{
"epoch": 0.10835913312693499,
"grad_norm": 1.8111554384231567,
"learning_rate": 0.00010829817158931083,
"loss": 0.5539,
"step": 385
},
{
"epoch": 0.10976639459611595,
"grad_norm": 1.488231897354126,
"learning_rate": 0.00010970464135021097,
"loss": 0.4533,
"step": 390
},
{
"epoch": 0.11117365606529693,
"grad_norm": 1.7389737367630005,
"learning_rate": 0.00011111111111111112,
"loss": 0.6554,
"step": 395
},
{
"epoch": 0.11258091753447791,
"grad_norm": 0.9282882213592529,
"learning_rate": 0.00011251758087201125,
"loss": 0.5665,
"step": 400
},
{
"epoch": 0.11398817900365887,
"grad_norm": 1.2808202505111694,
"learning_rate": 0.0001139240506329114,
"loss": 0.8137,
"step": 405
},
{
"epoch": 0.11539544047283985,
"grad_norm": 1.520807147026062,
"learning_rate": 0.00011533052039381153,
"loss": 0.7432,
"step": 410
},
{
"epoch": 0.11680270194202083,
"grad_norm": 1.4392223358154297,
"learning_rate": 0.0001167369901547117,
"loss": 0.478,
"step": 415
},
{
"epoch": 0.1182099634112018,
"grad_norm": 0.8880683779716492,
"learning_rate": 0.00011814345991561182,
"loss": 0.4246,
"step": 420
},
{
"epoch": 0.11961722488038277,
"grad_norm": 0.832594633102417,
"learning_rate": 0.00011954992967651195,
"loss": 0.5505,
"step": 425
},
{
"epoch": 0.12102448634956375,
"grad_norm": 0.4944693148136139,
"learning_rate": 0.0001209563994374121,
"loss": 0.4342,
"step": 430
},
{
"epoch": 0.12243174781874472,
"grad_norm": 0.8733665943145752,
"learning_rate": 0.00012236286919831225,
"loss": 0.5839,
"step": 435
},
{
"epoch": 0.1238390092879257,
"grad_norm": 1.1832093000411987,
"learning_rate": 0.0001237693389592124,
"loss": 0.6976,
"step": 440
},
{
"epoch": 0.12524627075710668,
"grad_norm": 1.0406477451324463,
"learning_rate": 0.00012517580872011252,
"loss": 0.6353,
"step": 445
},
{
"epoch": 0.12665353222628764,
"grad_norm": 0.788364827632904,
"learning_rate": 0.00012658227848101267,
"loss": 0.3272,
"step": 450
},
{
"epoch": 0.1280607936954686,
"grad_norm": 1.2941433191299438,
"learning_rate": 0.00012798874824191281,
"loss": 0.7372,
"step": 455
},
{
"epoch": 0.1294680551646496,
"grad_norm": 0.9147971272468567,
"learning_rate": 0.00012939521800281296,
"loss": 0.5474,
"step": 460
},
{
"epoch": 0.13087531663383056,
"grad_norm": 1.0644923448562622,
"learning_rate": 0.00013080168776371308,
"loss": 0.6286,
"step": 465
},
{
"epoch": 0.13228257810301153,
"grad_norm": 0.8214511275291443,
"learning_rate": 0.00013220815752461323,
"loss": 0.3655,
"step": 470
},
{
"epoch": 0.13368983957219252,
"grad_norm": 0.7348743677139282,
"learning_rate": 0.00013361462728551338,
"loss": 0.5278,
"step": 475
},
{
"epoch": 0.13509710104137349,
"grad_norm": 1.0437523126602173,
"learning_rate": 0.0001350210970464135,
"loss": 0.4665,
"step": 480
},
{
"epoch": 0.13650436251055445,
"grad_norm": 1.6613603830337524,
"learning_rate": 0.00013642756680731365,
"loss": 0.7575,
"step": 485
},
{
"epoch": 0.13791162397973544,
"grad_norm": 1.0844550132751465,
"learning_rate": 0.0001378340365682138,
"loss": 0.6744,
"step": 490
},
{
"epoch": 0.1393188854489164,
"grad_norm": 1.3651305437088013,
"learning_rate": 0.00013924050632911395,
"loss": 0.8377,
"step": 495
},
{
"epoch": 0.14072614691809737,
"grad_norm": 1.256631851196289,
"learning_rate": 0.00014064697609001407,
"loss": 0.6523,
"step": 500
},
{
"epoch": 0.14213340838727836,
"grad_norm": 1.7894726991653442,
"learning_rate": 0.0001420534458509142,
"loss": 0.5191,
"step": 505
},
{
"epoch": 0.14354066985645933,
"grad_norm": 0.8206887245178223,
"learning_rate": 0.00014345991561181436,
"loss": 0.378,
"step": 510
},
{
"epoch": 0.1449479313256403,
"grad_norm": 1.6677026748657227,
"learning_rate": 0.00014486638537271449,
"loss": 0.326,
"step": 515
},
{
"epoch": 0.1463551927948213,
"grad_norm": 1.4679995775222778,
"learning_rate": 0.00014627285513361463,
"loss": 0.6619,
"step": 520
},
{
"epoch": 0.14776245426400225,
"grad_norm": 0.829093337059021,
"learning_rate": 0.00014767932489451478,
"loss": 0.7372,
"step": 525
},
{
"epoch": 0.14916971573318322,
"grad_norm": 1.6188422441482544,
"learning_rate": 0.00014908579465541493,
"loss": 0.5666,
"step": 530
},
{
"epoch": 0.1505769772023642,
"grad_norm": 1.319091558456421,
"learning_rate": 0.00015049226441631505,
"loss": 0.8461,
"step": 535
},
{
"epoch": 0.15198423867154517,
"grad_norm": 1.7154995203018188,
"learning_rate": 0.0001518987341772152,
"loss": 0.6463,
"step": 540
},
{
"epoch": 0.15339150014072614,
"grad_norm": 1.4643100500106812,
"learning_rate": 0.00015330520393811535,
"loss": 0.6149,
"step": 545
},
{
"epoch": 0.15479876160990713,
"grad_norm": 1.554081916809082,
"learning_rate": 0.00015471167369901547,
"loss": 0.7509,
"step": 550
},
{
"epoch": 0.1562060230790881,
"grad_norm": 1.040045976638794,
"learning_rate": 0.00015611814345991562,
"loss": 0.6607,
"step": 555
},
{
"epoch": 0.15761328454826906,
"grad_norm": 1.9093159437179565,
"learning_rate": 0.00015752461322081577,
"loss": 0.6108,
"step": 560
},
{
"epoch": 0.15902054601745005,
"grad_norm": 0.8650393486022949,
"learning_rate": 0.0001589310829817159,
"loss": 0.629,
"step": 565
},
{
"epoch": 0.16042780748663102,
"grad_norm": 1.011257529258728,
"learning_rate": 0.00016033755274261603,
"loss": 0.2586,
"step": 570
},
{
"epoch": 0.16183506895581198,
"grad_norm": 0.8653711676597595,
"learning_rate": 0.00016174402250351618,
"loss": 0.6063,
"step": 575
},
{
"epoch": 0.16324233042499298,
"grad_norm": 1.7408281564712524,
"learning_rate": 0.00016315049226441633,
"loss": 0.4728,
"step": 580
},
{
"epoch": 0.16464959189417394,
"grad_norm": 0.7200327515602112,
"learning_rate": 0.00016455696202531648,
"loss": 0.6803,
"step": 585
},
{
"epoch": 0.1660568533633549,
"grad_norm": 2.032118320465088,
"learning_rate": 0.0001659634317862166,
"loss": 0.6615,
"step": 590
},
{
"epoch": 0.1674641148325359,
"grad_norm": 1.1240061521530151,
"learning_rate": 0.00016736990154711675,
"loss": 0.4675,
"step": 595
},
{
"epoch": 0.16887137630171686,
"grad_norm": 0.8609156012535095,
"learning_rate": 0.0001687763713080169,
"loss": 0.6737,
"step": 600
},
{
"epoch": 0.17027863777089783,
"grad_norm": 1.4271563291549683,
"learning_rate": 0.00017018284106891702,
"loss": 0.6479,
"step": 605
},
{
"epoch": 0.1716858992400788,
"grad_norm": 0.8409131765365601,
"learning_rate": 0.00017158931082981717,
"loss": 0.5877,
"step": 610
},
{
"epoch": 0.17309316070925979,
"grad_norm": 1.002172827720642,
"learning_rate": 0.00017299578059071731,
"loss": 0.5572,
"step": 615
},
{
"epoch": 0.17450042217844075,
"grad_norm": 0.7729489207267761,
"learning_rate": 0.00017440225035161746,
"loss": 0.64,
"step": 620
},
{
"epoch": 0.17590768364762172,
"grad_norm": 1.3359206914901733,
"learning_rate": 0.00017580872011251758,
"loss": 0.5652,
"step": 625
},
{
"epoch": 0.1773149451168027,
"grad_norm": 2.492105722427368,
"learning_rate": 0.00017721518987341773,
"loss": 0.584,
"step": 630
},
{
"epoch": 0.17872220658598367,
"grad_norm": 1.271020770072937,
"learning_rate": 0.00017862165963431788,
"loss": 0.4011,
"step": 635
},
{
"epoch": 0.18012946805516464,
"grad_norm": 0.8744266629219055,
"learning_rate": 0.000180028129395218,
"loss": 0.616,
"step": 640
},
{
"epoch": 0.18153672952434563,
"grad_norm": 1.2818926572799683,
"learning_rate": 0.00018143459915611815,
"loss": 0.463,
"step": 645
},
{
"epoch": 0.1829439909935266,
"grad_norm": 1.3106176853179932,
"learning_rate": 0.0001828410689170183,
"loss": 0.4851,
"step": 650
},
{
"epoch": 0.18435125246270756,
"grad_norm": 1.068864345550537,
"learning_rate": 0.00018424753867791845,
"loss": 0.6297,
"step": 655
},
{
"epoch": 0.18575851393188855,
"grad_norm": 1.879895567893982,
"learning_rate": 0.00018565400843881857,
"loss": 0.6638,
"step": 660
},
{
"epoch": 0.18716577540106952,
"grad_norm": 1.4671173095703125,
"learning_rate": 0.00018706047819971872,
"loss": 0.7588,
"step": 665
},
{
"epoch": 0.18857303687025048,
"grad_norm": 1.5851764678955078,
"learning_rate": 0.00018846694796061886,
"loss": 0.7505,
"step": 670
},
{
"epoch": 0.18998029833943147,
"grad_norm": 0.7149075269699097,
"learning_rate": 0.00018987341772151899,
"loss": 0.3806,
"step": 675
},
{
"epoch": 0.19138755980861244,
"grad_norm": 1.049310326576233,
"learning_rate": 0.00019127988748241913,
"loss": 0.5908,
"step": 680
},
{
"epoch": 0.1927948212777934,
"grad_norm": 0.950442373752594,
"learning_rate": 0.00019268635724331928,
"loss": 0.6755,
"step": 685
},
{
"epoch": 0.1942020827469744,
"grad_norm": 0.9287855625152588,
"learning_rate": 0.00019409282700421943,
"loss": 0.5703,
"step": 690
},
{
"epoch": 0.19560934421615536,
"grad_norm": 0.7228776216506958,
"learning_rate": 0.00019549929676511955,
"loss": 0.5971,
"step": 695
},
{
"epoch": 0.19701660568533633,
"grad_norm": 1.04582941532135,
"learning_rate": 0.0001969057665260197,
"loss": 0.9477,
"step": 700
},
{
"epoch": 0.19842386715451732,
"grad_norm": 1.6367225646972656,
"learning_rate": 0.00019831223628691985,
"loss": 0.5875,
"step": 705
},
{
"epoch": 0.19983112862369828,
"grad_norm": 0.724415123462677,
"learning_rate": 0.00019971870604782,
"loss": 0.5531,
"step": 710
},
{
"epoch": 0.20123839009287925,
"grad_norm": 1.1167938709259033,
"learning_rate": 0.00019999980693280142,
"loss": 0.4568,
"step": 715
},
{
"epoch": 0.20264565156206024,
"grad_norm": 3.7291440963745117,
"learning_rate": 0.00019999902259858484,
"loss": 0.4796,
"step": 720
},
{
"epoch": 0.2040529130312412,
"grad_norm": 1.0626037120819092,
"learning_rate": 0.00019999763493537887,
"loss": 0.5454,
"step": 725
},
{
"epoch": 0.20546017450042217,
"grad_norm": 1.1673458814620972,
"learning_rate": 0.00019999564395155577,
"loss": 0.6261,
"step": 730
},
{
"epoch": 0.20686743596960316,
"grad_norm": 1.1592299938201904,
"learning_rate": 0.00019999304965912784,
"loss": 0.6726,
"step": 735
},
{
"epoch": 0.20827469743878413,
"grad_norm": 1.117803692817688,
"learning_rate": 0.00019998985207374736,
"loss": 0.8504,
"step": 740
},
{
"epoch": 0.2096819589079651,
"grad_norm": 0.8449244499206543,
"learning_rate": 0.00019998605121470645,
"loss": 0.4394,
"step": 745
},
{
"epoch": 0.2110892203771461,
"grad_norm": 0.9696683883666992,
"learning_rate": 0.00019998164710493705,
"loss": 0.3861,
"step": 750
},
{
"epoch": 0.21249648184632705,
"grad_norm": 1.5206379890441895,
"learning_rate": 0.00019997663977101068,
"loss": 0.6289,
"step": 755
},
{
"epoch": 0.21390374331550802,
"grad_norm": 1.5071372985839844,
"learning_rate": 0.00019997102924313836,
"loss": 0.8584,
"step": 760
},
{
"epoch": 0.215311004784689,
"grad_norm": 0.9600889086723328,
"learning_rate": 0.00019996481555517028,
"loss": 0.3949,
"step": 765
},
{
"epoch": 0.21671826625386997,
"grad_norm": 0.8249372839927673,
"learning_rate": 0.00019995799874459585,
"loss": 0.559,
"step": 770
},
{
"epoch": 0.21812552772305094,
"grad_norm": 1.2509324550628662,
"learning_rate": 0.00019995057885254333,
"loss": 0.5327,
"step": 775
},
{
"epoch": 0.2195327891922319,
"grad_norm": 0.8242643475532532,
"learning_rate": 0.00019994255592377936,
"loss": 0.4605,
"step": 780
},
{
"epoch": 0.2209400506614129,
"grad_norm": 0.7586041688919067,
"learning_rate": 0.00019993393000670916,
"loss": 0.4722,
"step": 785
},
{
"epoch": 0.22234731213059386,
"grad_norm": 1.2805287837982178,
"learning_rate": 0.00019992470115337592,
"loss": 0.2861,
"step": 790
},
{
"epoch": 0.22375457359977483,
"grad_norm": 1.6375665664672852,
"learning_rate": 0.00019991486941946048,
"loss": 0.5846,
"step": 795
},
{
"epoch": 0.22516183506895582,
"grad_norm": 0.8348977565765381,
"learning_rate": 0.00019990443486428118,
"loss": 0.4657,
"step": 800
},
{
"epoch": 0.22656909653813678,
"grad_norm": 1.1735246181488037,
"learning_rate": 0.0001998933975507933,
"loss": 0.6255,
"step": 805
},
{
"epoch": 0.22797635800731775,
"grad_norm": 1.1627134084701538,
"learning_rate": 0.00019988175754558874,
"loss": 0.7479,
"step": 810
},
{
"epoch": 0.22938361947649874,
"grad_norm": 1.916646957397461,
"learning_rate": 0.00019986951491889578,
"loss": 0.4814,
"step": 815
},
{
"epoch": 0.2307908809456797,
"grad_norm": 1.5607751607894897,
"learning_rate": 0.00019985666974457847,
"loss": 0.5807,
"step": 820
},
{
"epoch": 0.23219814241486067,
"grad_norm": 0.759840726852417,
"learning_rate": 0.0001998432221001362,
"loss": 0.5299,
"step": 825
},
{
"epoch": 0.23360540388404166,
"grad_norm": 0.8141459226608276,
"learning_rate": 0.0001998291720667033,
"loss": 0.584,
"step": 830
},
{
"epoch": 0.23501266535322263,
"grad_norm": 1.113457441329956,
"learning_rate": 0.00019981451972904854,
"loss": 0.5733,
"step": 835
},
{
"epoch": 0.2364199268224036,
"grad_norm": 1.1313204765319824,
"learning_rate": 0.00019979926517557458,
"loss": 0.6995,
"step": 840
},
{
"epoch": 0.23782718829158458,
"grad_norm": 0.8379271626472473,
"learning_rate": 0.00019978340849831743,
"loss": 0.3914,
"step": 845
},
{
"epoch": 0.23923444976076555,
"grad_norm": 0.8467435240745544,
"learning_rate": 0.00019976694979294596,
"loss": 0.6813,
"step": 850
},
{
"epoch": 0.24064171122994651,
"grad_norm": 1.30973219871521,
"learning_rate": 0.00019974988915876134,
"loss": 0.4174,
"step": 855
},
{
"epoch": 0.2420489726991275,
"grad_norm": 0.9715356230735779,
"learning_rate": 0.0001997322266986963,
"loss": 0.4208,
"step": 860
},
{
"epoch": 0.24345623416830847,
"grad_norm": 1.0101361274719238,
"learning_rate": 0.0001997139625193146,
"loss": 0.602,
"step": 865
},
{
"epoch": 0.24486349563748944,
"grad_norm": 0.9341487288475037,
"learning_rate": 0.0001996950967308104,
"loss": 0.3989,
"step": 870
},
{
"epoch": 0.24627075710667043,
"grad_norm": 1.2196135520935059,
"learning_rate": 0.00019967562944700763,
"loss": 0.4883,
"step": 875
},
{
"epoch": 0.2476780185758514,
"grad_norm": 1.2374253273010254,
"learning_rate": 0.00019965556078535917,
"loss": 0.7397,
"step": 880
},
{
"epoch": 0.24908528004503236,
"grad_norm": 0.561997652053833,
"learning_rate": 0.00019963489086694626,
"loss": 0.7548,
"step": 885
},
{
"epoch": 0.25049254151421335,
"grad_norm": 0.8023036122322083,
"learning_rate": 0.00019961361981647775,
"loss": 0.4486,
"step": 890
},
{
"epoch": 0.2518998029833943,
"grad_norm": 0.9484225511550903,
"learning_rate": 0.00019959174776228928,
"loss": 0.4158,
"step": 895
},
{
"epoch": 0.2533070644525753,
"grad_norm": 1.119430661201477,
"learning_rate": 0.0001995692748363426,
"loss": 0.7553,
"step": 900
},
{
"epoch": 0.25471432592175625,
"grad_norm": 1.4776628017425537,
"learning_rate": 0.0001995462011742247,
"loss": 0.2808,
"step": 905
},
{
"epoch": 0.2561215873909372,
"grad_norm": 1.370290756225586,
"learning_rate": 0.00019952252691514706,
"loss": 0.4522,
"step": 910
},
{
"epoch": 0.25752884886011823,
"grad_norm": 1.1513909101486206,
"learning_rate": 0.00019949825220194468,
"loss": 0.5382,
"step": 915
},
{
"epoch": 0.2589361103292992,
"grad_norm": 1.0892587900161743,
"learning_rate": 0.00019947337718107547,
"loss": 0.5407,
"step": 920
},
{
"epoch": 0.26034337179848016,
"grad_norm": 1.1014186143875122,
"learning_rate": 0.00019944790200261903,
"loss": 0.5723,
"step": 925
},
{
"epoch": 0.2617506332676611,
"grad_norm": 1.4293971061706543,
"learning_rate": 0.000199421826820276,
"loss": 0.7333,
"step": 930
},
{
"epoch": 0.2631578947368421,
"grad_norm": 0.5284586548805237,
"learning_rate": 0.00019939515179136713,
"loss": 0.6351,
"step": 935
},
{
"epoch": 0.26456515620602306,
"grad_norm": 0.7904505133628845,
"learning_rate": 0.0001993678770768321,
"loss": 0.6792,
"step": 940
},
{
"epoch": 0.2659724176752041,
"grad_norm": 0.5654340982437134,
"learning_rate": 0.0001993400028412288,
"loss": 0.4223,
"step": 945
},
{
"epoch": 0.26737967914438504,
"grad_norm": 0.9616327285766602,
"learning_rate": 0.00019931152925273225,
"loss": 0.4585,
"step": 950
},
{
"epoch": 0.268786940613566,
"grad_norm": 1.3930063247680664,
"learning_rate": 0.00019928245648313347,
"loss": 0.7828,
"step": 955
},
{
"epoch": 0.27019420208274697,
"grad_norm": 1.6367273330688477,
"learning_rate": 0.00019925278470783866,
"loss": 0.6883,
"step": 960
},
{
"epoch": 0.27160146355192794,
"grad_norm": 0.9764294028282166,
"learning_rate": 0.00019922251410586802,
"loss": 0.4474,
"step": 965
},
{
"epoch": 0.2730087250211089,
"grad_norm": 0.7450019121170044,
"learning_rate": 0.00019919164485985463,
"loss": 0.436,
"step": 970
},
{
"epoch": 0.2744159864902899,
"grad_norm": 0.774627149105072,
"learning_rate": 0.0001991601771560434,
"loss": 0.3708,
"step": 975
},
{
"epoch": 0.2758232479594709,
"grad_norm": 1.1829273700714111,
"learning_rate": 0.00019912811118429,
"loss": 0.4453,
"step": 980
},
{
"epoch": 0.27723050942865185,
"grad_norm": 1.0340484380722046,
"learning_rate": 0.0001990954471380596,
"loss": 0.3123,
"step": 985
},
{
"epoch": 0.2786377708978328,
"grad_norm": 0.6128121018409729,
"learning_rate": 0.00019906218521442576,
"loss": 0.3459,
"step": 990
},
{
"epoch": 0.2800450323670138,
"grad_norm": 0.8443979024887085,
"learning_rate": 0.00019902832561406934,
"loss": 0.7583,
"step": 995
},
{
"epoch": 0.28145229383619474,
"grad_norm": 1.4136847257614136,
"learning_rate": 0.00019899386854127705,
"loss": 0.6206,
"step": 1000
},
{
"epoch": 0.28285955530537576,
"grad_norm": 0.7922631502151489,
"learning_rate": 0.00019895881420394052,
"loss": 0.5676,
"step": 1005
},
{
"epoch": 0.28426681677455673,
"grad_norm": 1.7876763343811035,
"learning_rate": 0.0001989231628135547,
"loss": 0.5216,
"step": 1010
},
{
"epoch": 0.2856740782437377,
"grad_norm": 1.3975410461425781,
"learning_rate": 0.00019888691458521692,
"loss": 0.5053,
"step": 1015
},
{
"epoch": 0.28708133971291866,
"grad_norm": 1.0760260820388794,
"learning_rate": 0.00019885006973762535,
"loss": 0.3415,
"step": 1020
},
{
"epoch": 0.2884886011820996,
"grad_norm": 1.2067842483520508,
"learning_rate": 0.00019881262849307785,
"loss": 0.4352,
"step": 1025
},
{
"epoch": 0.2898958626512806,
"grad_norm": 0.8484588265419006,
"learning_rate": 0.0001987745910774705,
"loss": 0.6558,
"step": 1030
},
{
"epoch": 0.29130312412046155,
"grad_norm": 1.2854669094085693,
"learning_rate": 0.00019873595772029628,
"loss": 0.5144,
"step": 1035
},
{
"epoch": 0.2927103855896426,
"grad_norm": 0.8903659582138062,
"learning_rate": 0.00019869672865464373,
"loss": 0.7212,
"step": 1040
},
{
"epoch": 0.29411764705882354,
"grad_norm": 1.1273301839828491,
"learning_rate": 0.00019865690411719546,
"loss": 0.5763,
"step": 1045
},
{
"epoch": 0.2955249085280045,
"grad_norm": 1.6163692474365234,
"learning_rate": 0.00019861648434822687,
"loss": 0.8076,
"step": 1050
},
{
"epoch": 0.29693216999718547,
"grad_norm": 1.0796860456466675,
"learning_rate": 0.00019857546959160444,
"loss": 0.8208,
"step": 1055
},
{
"epoch": 0.29833943146636643,
"grad_norm": 0.8399056792259216,
"learning_rate": 0.00019853386009478454,
"loss": 0.5939,
"step": 1060
},
{
"epoch": 0.2997466929355474,
"grad_norm": 1.2428550720214844,
"learning_rate": 0.0001984916561088118,
"loss": 0.2594,
"step": 1065
},
{
"epoch": 0.3011539544047284,
"grad_norm": 2.2983717918395996,
"learning_rate": 0.00019844885788831756,
"loss": 0.7697,
"step": 1070
},
{
"epoch": 0.3025612158739094,
"grad_norm": 1.0774344205856323,
"learning_rate": 0.0001984054656915184,
"loss": 0.6441,
"step": 1075
},
{
"epoch": 0.30396847734309035,
"grad_norm": 0.6637004613876343,
"learning_rate": 0.00019836147978021467,
"loss": 0.4219,
"step": 1080
},
{
"epoch": 0.3053757388122713,
"grad_norm": 0.9496357440948486,
"learning_rate": 0.00019831690041978862,
"loss": 0.6518,
"step": 1085
},
{
"epoch": 0.3067830002814523,
"grad_norm": 1.3843315839767456,
"learning_rate": 0.00019827172787920315,
"loss": 0.6269,
"step": 1090
},
{
"epoch": 0.30819026175063324,
"grad_norm": 0.9899942278862,
"learning_rate": 0.0001982259624309999,
"loss": 0.5791,
"step": 1095
},
{
"epoch": 0.30959752321981426,
"grad_norm": 0.8998156785964966,
"learning_rate": 0.00019817960435129778,
"loss": 0.7362,
"step": 1100
},
{
"epoch": 0.31100478468899523,
"grad_norm": 0.615544319152832,
"learning_rate": 0.00019813265391979137,
"loss": 0.457,
"step": 1105
},
{
"epoch": 0.3124120461581762,
"grad_norm": 1.026685118675232,
"learning_rate": 0.00019808511141974886,
"loss": 0.5494,
"step": 1110
},
{
"epoch": 0.31381930762735716,
"grad_norm": 1.0256643295288086,
"learning_rate": 0.00019803697713801084,
"loss": 0.3588,
"step": 1115
},
{
"epoch": 0.3152265690965381,
"grad_norm": 0.8720577359199524,
"learning_rate": 0.00019798825136498814,
"loss": 0.5563,
"step": 1120
},
{
"epoch": 0.3166338305657191,
"grad_norm": 0.8864659667015076,
"learning_rate": 0.00019793893439466043,
"loss": 0.3091,
"step": 1125
},
{
"epoch": 0.3180410920349001,
"grad_norm": 1.0853145122528076,
"learning_rate": 0.00019788902652457412,
"loss": 0.6204,
"step": 1130
},
{
"epoch": 0.3194483535040811,
"grad_norm": 1.6496775150299072,
"learning_rate": 0.0001978385280558409,
"loss": 0.4948,
"step": 1135
},
{
"epoch": 0.32085561497326204,
"grad_norm": 1.668879508972168,
"learning_rate": 0.00019778743929313555,
"loss": 0.7545,
"step": 1140
},
{
"epoch": 0.322262876442443,
"grad_norm": 0.7751437425613403,
"learning_rate": 0.00019773576054469446,
"loss": 0.4416,
"step": 1145
},
{
"epoch": 0.32367013791162397,
"grad_norm": 1.3606644868850708,
"learning_rate": 0.0001976834921223135,
"loss": 0.4837,
"step": 1150
},
{
"epoch": 0.32507739938080493,
"grad_norm": 0.5276009440422058,
"learning_rate": 0.0001976306343413463,
"loss": 0.2264,
"step": 1155
},
{
"epoch": 0.32648466084998595,
"grad_norm": 1.034682035446167,
"learning_rate": 0.00019757718752070239,
"loss": 0.5388,
"step": 1160
},
{
"epoch": 0.3278919223191669,
"grad_norm": 0.9205548763275146,
"learning_rate": 0.00019752315198284497,
"loss": 0.7432,
"step": 1165
},
{
"epoch": 0.3292991837883479,
"grad_norm": 0.2892135977745056,
"learning_rate": 0.00019746852805378932,
"loss": 0.2681,
"step": 1170
},
{
"epoch": 0.33070644525752885,
"grad_norm": 1.4844127893447876,
"learning_rate": 0.0001974133160631007,
"loss": 0.4837,
"step": 1175
},
{
"epoch": 0.3321137067267098,
"grad_norm": 0.7771471738815308,
"learning_rate": 0.00019735751634389226,
"loss": 0.7133,
"step": 1180
},
{
"epoch": 0.3335209681958908,
"grad_norm": 1.23273766040802,
"learning_rate": 0.00019730112923282321,
"loss": 0.789,
"step": 1185
},
{
"epoch": 0.3349282296650718,
"grad_norm": 1.751483678817749,
"learning_rate": 0.0001972441550700966,
"loss": 0.7569,
"step": 1190
},
{
"epoch": 0.33633549113425276,
"grad_norm": 0.31647899746894836,
"learning_rate": 0.00019718659419945756,
"loss": 0.4276,
"step": 1195
},
{
"epoch": 0.3377427526034337,
"grad_norm": 1.3560551404953003,
"learning_rate": 0.00019712844696819076,
"loss": 0.4853,
"step": 1200
},
{
"epoch": 0.3391500140726147,
"grad_norm": 1.571906328201294,
"learning_rate": 0.00019706971372711882,
"loss": 0.3889,
"step": 1205
},
{
"epoch": 0.34055727554179566,
"grad_norm": 1.2469801902770996,
"learning_rate": 0.00019701039483059981,
"loss": 0.5063,
"step": 1210
},
{
"epoch": 0.3419645370109766,
"grad_norm": 0.660874605178833,
"learning_rate": 0.00019695049063652543,
"loss": 0.4789,
"step": 1215
},
{
"epoch": 0.3433717984801576,
"grad_norm": 0.9069953560829163,
"learning_rate": 0.00019689000150631845,
"loss": 0.393,
"step": 1220
},
{
"epoch": 0.3447790599493386,
"grad_norm": 1.9359229803085327,
"learning_rate": 0.000196828927804931,
"loss": 0.4297,
"step": 1225
},
{
"epoch": 0.34618632141851957,
"grad_norm": 1.063952088356018,
"learning_rate": 0.00019676726990084195,
"loss": 0.5455,
"step": 1230
},
{
"epoch": 0.34759358288770054,
"grad_norm": 1.7802363634109497,
"learning_rate": 0.000196705028166055,
"loss": 0.5684,
"step": 1235
},
{
"epoch": 0.3490008443568815,
"grad_norm": 1.1787841320037842,
"learning_rate": 0.00019664220297609624,
"loss": 0.6942,
"step": 1240
},
{
"epoch": 0.35040810582606247,
"grad_norm": 1.146467924118042,
"learning_rate": 0.00019657879471001195,
"loss": 0.6188,
"step": 1245
},
{
"epoch": 0.35181536729524343,
"grad_norm": 1.322690486907959,
"learning_rate": 0.0001965148037503663,
"loss": 0.5142,
"step": 1250
},
{
"epoch": 0.35322262876442445,
"grad_norm": 0.8079725503921509,
"learning_rate": 0.0001964502304832391,
"loss": 0.4729,
"step": 1255
},
{
"epoch": 0.3546298902336054,
"grad_norm": 1.8152616024017334,
"learning_rate": 0.0001963850752982234,
"loss": 0.7246,
"step": 1260
},
{
"epoch": 0.3560371517027864,
"grad_norm": 1.4570809602737427,
"learning_rate": 0.00019631933858842317,
"loss": 0.8113,
"step": 1265
},
{
"epoch": 0.35744441317196735,
"grad_norm": 1.1229805946350098,
"learning_rate": 0.00019625302075045088,
"loss": 0.5401,
"step": 1270
},
{
"epoch": 0.3588516746411483,
"grad_norm": 0.693499743938446,
"learning_rate": 0.00019618612218442517,
"loss": 0.3536,
"step": 1275
},
{
"epoch": 0.3602589361103293,
"grad_norm": 1.592119574546814,
"learning_rate": 0.00019611864329396853,
"loss": 0.5994,
"step": 1280
},
{
"epoch": 0.3616661975795103,
"grad_norm": 1.087098479270935,
"learning_rate": 0.00019605058448620452,
"loss": 0.5211,
"step": 1285
},
{
"epoch": 0.36307345904869126,
"grad_norm": 1.002854585647583,
"learning_rate": 0.0001959819461717557,
"loss": 0.6473,
"step": 1290
},
{
"epoch": 0.3644807205178722,
"grad_norm": 1.2526451349258423,
"learning_rate": 0.00019591272876474106,
"loss": 0.4721,
"step": 1295
},
{
"epoch": 0.3658879819870532,
"grad_norm": 0.9391024112701416,
"learning_rate": 0.00019584293268277324,
"loss": 0.5849,
"step": 1300
},
{
"epoch": 0.36729524345623416,
"grad_norm": 1.1725986003875732,
"learning_rate": 0.00019577255834695643,
"loss": 0.4718,
"step": 1305
},
{
"epoch": 0.3687025049254151,
"grad_norm": 1.1449577808380127,
"learning_rate": 0.00019570160618188353,
"loss": 0.5429,
"step": 1310
},
{
"epoch": 0.37010976639459614,
"grad_norm": 1.8632793426513672,
"learning_rate": 0.00019563007661563367,
"loss": 0.5791,
"step": 1315
},
{
"epoch": 0.3715170278637771,
"grad_norm": 0.6620994210243225,
"learning_rate": 0.00019555797007976975,
"loss": 0.4016,
"step": 1320
},
{
"epoch": 0.37292428933295807,
"grad_norm": 1.7540533542633057,
"learning_rate": 0.00019548528700933559,
"loss": 0.5039,
"step": 1325
},
{
"epoch": 0.37433155080213903,
"grad_norm": 0.9329980611801147,
"learning_rate": 0.00019541202784285352,
"loss": 0.403,
"step": 1330
},
{
"epoch": 0.37573881227132,
"grad_norm": 0.4586445689201355,
"learning_rate": 0.00019533819302232168,
"loss": 0.3944,
"step": 1335
},
{
"epoch": 0.37714607374050096,
"grad_norm": 1.575636863708496,
"learning_rate": 0.00019526378299321127,
"loss": 0.5372,
"step": 1340
},
{
"epoch": 0.378553335209682,
"grad_norm": 1.2038066387176514,
"learning_rate": 0.00019518879820446398,
"loss": 0.4409,
"step": 1345
},
{
"epoch": 0.37996059667886295,
"grad_norm": 0.9737414121627808,
"learning_rate": 0.0001951132391084892,
"loss": 0.7155,
"step": 1350
},
{
"epoch": 0.3813678581480439,
"grad_norm": 1.0166410207748413,
"learning_rate": 0.00019503710616116128,
"loss": 0.6772,
"step": 1355
},
{
"epoch": 0.3827751196172249,
"grad_norm": 1.1660302877426147,
"learning_rate": 0.0001949603998218169,
"loss": 0.7076,
"step": 1360
},
{
"epoch": 0.38418238108640584,
"grad_norm": 0.576275110244751,
"learning_rate": 0.0001948831205532521,
"loss": 0.5392,
"step": 1365
},
{
"epoch": 0.3855896425555868,
"grad_norm": 1.453596830368042,
"learning_rate": 0.00019480526882171976,
"loss": 0.7963,
"step": 1370
},
{
"epoch": 0.38699690402476783,
"grad_norm": 0.7829164862632751,
"learning_rate": 0.00019472684509692646,
"loss": 0.3505,
"step": 1375
},
{
"epoch": 0.3884041654939488,
"grad_norm": 0.9208312630653381,
"learning_rate": 0.0001946478498520299,
"loss": 0.5539,
"step": 1380
},
{
"epoch": 0.38981142696312976,
"grad_norm": 1.0814006328582764,
"learning_rate": 0.00019456828356363598,
"loss": 0.3839,
"step": 1385
},
{
"epoch": 0.3912186884323107,
"grad_norm": 1.592490553855896,
"learning_rate": 0.00019448814671179585,
"loss": 0.6688,
"step": 1390
},
{
"epoch": 0.3926259499014917,
"grad_norm": 0.880333662033081,
"learning_rate": 0.00019440743978000312,
"loss": 0.6542,
"step": 1395
},
{
"epoch": 0.39403321137067265,
"grad_norm": 0.516769528388977,
"learning_rate": 0.00019432616325519084,
"loss": 0.4571,
"step": 1400
},
{
"epoch": 0.3954404728398536,
"grad_norm": 1.1296850442886353,
"learning_rate": 0.00019424431762772866,
"loss": 0.5596,
"step": 1405
},
{
"epoch": 0.39684773430903464,
"grad_norm": 0.8967404365539551,
"learning_rate": 0.00019416190339141976,
"loss": 0.4144,
"step": 1410
},
{
"epoch": 0.3982549957782156,
"grad_norm": 1.983446478843689,
"learning_rate": 0.00019407892104349804,
"loss": 0.2378,
"step": 1415
},
{
"epoch": 0.39966225724739657,
"grad_norm": 0.868871808052063,
"learning_rate": 0.00019399537108462494,
"loss": 0.8016,
"step": 1420
},
{
"epoch": 0.40106951871657753,
"grad_norm": 1.9956140518188477,
"learning_rate": 0.00019391125401888644,
"loss": 0.5541,
"step": 1425
},
{
"epoch": 0.4024767801857585,
"grad_norm": 1.437330961227417,
"learning_rate": 0.00019382657035379026,
"loss": 0.299,
"step": 1430
},
{
"epoch": 0.40388404165493946,
"grad_norm": 1.0055358409881592,
"learning_rate": 0.00019374132060026242,
"loss": 0.5419,
"step": 1435
},
{
"epoch": 0.4052913031241205,
"grad_norm": 1.3034961223602295,
"learning_rate": 0.00019365550527264443,
"loss": 0.7488,
"step": 1440
},
{
"epoch": 0.40669856459330145,
"grad_norm": 1.9104148149490356,
"learning_rate": 0.0001935691248886901,
"loss": 0.4039,
"step": 1445
},
{
"epoch": 0.4081058260624824,
"grad_norm": 1.3824232816696167,
"learning_rate": 0.00019348217996956245,
"loss": 0.5864,
"step": 1450
},
{
"epoch": 0.4095130875316634,
"grad_norm": 0.18742340803146362,
"learning_rate": 0.00019339467103983044,
"loss": 0.3931,
"step": 1455
},
{
"epoch": 0.41092034900084434,
"grad_norm": 1.0197157859802246,
"learning_rate": 0.00019330659862746603,
"loss": 0.4888,
"step": 1460
},
{
"epoch": 0.4123276104700253,
"grad_norm": 1.248344898223877,
"learning_rate": 0.00019321796326384082,
"loss": 0.4607,
"step": 1465
},
{
"epoch": 0.41373487193920633,
"grad_norm": 0.8360584378242493,
"learning_rate": 0.00019312876548372286,
"loss": 0.5113,
"step": 1470
},
{
"epoch": 0.4151421334083873,
"grad_norm": 1.7348827123641968,
"learning_rate": 0.00019303900582527344,
"loss": 0.511,
"step": 1475
},
{
"epoch": 0.41654939487756826,
"grad_norm": 1.2273963689804077,
"learning_rate": 0.00019294868483004396,
"loss": 0.3603,
"step": 1480
},
{
"epoch": 0.4179566563467492,
"grad_norm": 1.0628288984298706,
"learning_rate": 0.00019285780304297245,
"loss": 0.5377,
"step": 1485
},
{
"epoch": 0.4193639178159302,
"grad_norm": 1.1135960817337036,
"learning_rate": 0.00019276636101238045,
"loss": 0.3928,
"step": 1490
},
{
"epoch": 0.42077117928511115,
"grad_norm": 0.8842063546180725,
"learning_rate": 0.00019267435928996962,
"loss": 0.4252,
"step": 1495
},
{
"epoch": 0.4221784407542922,
"grad_norm": 0.56885826587677,
"learning_rate": 0.00019258179843081847,
"loss": 0.5456,
"step": 1500
},
{
"epoch": 0.42358570222347314,
"grad_norm": 0.5579463243484497,
"learning_rate": 0.00019248867899337896,
"loss": 0.3585,
"step": 1505
},
{
"epoch": 0.4249929636926541,
"grad_norm": 1.1640398502349854,
"learning_rate": 0.00019239500153947305,
"loss": 0.5048,
"step": 1510
},
{
"epoch": 0.42640022516183507,
"grad_norm": 0.8812012672424316,
"learning_rate": 0.00019230076663428962,
"loss": 0.4503,
"step": 1515
},
{
"epoch": 0.42780748663101603,
"grad_norm": 1.1245768070220947,
"learning_rate": 0.0001922059748463807,
"loss": 0.364,
"step": 1520
},
{
"epoch": 0.429214748100197,
"grad_norm": 1.0180691480636597,
"learning_rate": 0.00019211062674765817,
"loss": 0.4229,
"step": 1525
},
{
"epoch": 0.430622009569378,
"grad_norm": 1.3053510189056396,
"learning_rate": 0.0001920147229133904,
"loss": 0.4794,
"step": 1530
},
{
"epoch": 0.432029271038559,
"grad_norm": 0.8506336808204651,
"learning_rate": 0.00019191826392219867,
"loss": 0.5524,
"step": 1535
},
{
"epoch": 0.43343653250773995,
"grad_norm": 1.0151127576828003,
"learning_rate": 0.00019182125035605376,
"loss": 0.5024,
"step": 1540
},
{
"epoch": 0.4348437939769209,
"grad_norm": 1.094344973564148,
"learning_rate": 0.00019172368280027233,
"loss": 0.5535,
"step": 1545
},
{
"epoch": 0.4362510554461019,
"grad_norm": 1.0190297365188599,
"learning_rate": 0.00019162556184351348,
"loss": 0.393,
"step": 1550
},
{
"epoch": 0.43765831691528284,
"grad_norm": 1.502398133277893,
"learning_rate": 0.00019152688807777516,
"loss": 0.4018,
"step": 1555
},
{
"epoch": 0.4390655783844638,
"grad_norm": 0.8518544435501099,
"learning_rate": 0.00019142766209839064,
"loss": 0.5682,
"step": 1560
},
{
"epoch": 0.4404728398536448,
"grad_norm": 0.42057764530181885,
"learning_rate": 0.0001913278845040249,
"loss": 0.2624,
"step": 1565
},
{
"epoch": 0.4418801013228258,
"grad_norm": 0.8204036951065063,
"learning_rate": 0.00019122755589667093,
"loss": 0.6987,
"step": 1570
},
{
"epoch": 0.44328736279200676,
"grad_norm": 1.2145869731903076,
"learning_rate": 0.00019112667688164626,
"loss": 0.575,
"step": 1575
},
{
"epoch": 0.4446946242611877,
"grad_norm": 1.5361616611480713,
"learning_rate": 0.0001910252480675891,
"loss": 0.466,
"step": 1580
},
{
"epoch": 0.4461018857303687,
"grad_norm": 1.8853634595870972,
"learning_rate": 0.00019092327006645497,
"loss": 0.4938,
"step": 1585
},
{
"epoch": 0.44750914719954965,
"grad_norm": 1.2990604639053345,
"learning_rate": 0.00019082074349351268,
"loss": 0.5759,
"step": 1590
},
{
"epoch": 0.44891640866873067,
"grad_norm": 1.3845807313919067,
"learning_rate": 0.0001907176689673408,
"loss": 0.6341,
"step": 1595
},
{
"epoch": 0.45032367013791164,
"grad_norm": 0.8449406027793884,
"learning_rate": 0.0001906140471098239,
"loss": 0.546,
"step": 1600
},
{
"epoch": 0.4517309316070926,
"grad_norm": 1.2000244855880737,
"learning_rate": 0.00019050987854614886,
"loss": 0.5149,
"step": 1605
},
{
"epoch": 0.45313819307627357,
"grad_norm": 0.8644974827766418,
"learning_rate": 0.0001904051639048009,
"loss": 0.5419,
"step": 1610
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.4699718654155731,
"learning_rate": 0.00019029990381756002,
"loss": 0.3501,
"step": 1615
},
{
"epoch": 0.4559527160146355,
"grad_norm": 0.6143896579742432,
"learning_rate": 0.00019019409891949703,
"loss": 0.4732,
"step": 1620
},
{
"epoch": 0.4573599774838165,
"grad_norm": 1.4060841798782349,
"learning_rate": 0.0001900877498489698,
"loss": 0.6648,
"step": 1625
},
{
"epoch": 0.4587672389529975,
"grad_norm": 1.3622968196868896,
"learning_rate": 0.00018998085724761935,
"loss": 0.3465,
"step": 1630
},
{
"epoch": 0.46017450042217845,
"grad_norm": 0.6618224382400513,
"learning_rate": 0.00018987342176036607,
"loss": 0.5135,
"step": 1635
},
{
"epoch": 0.4615817618913594,
"grad_norm": 1.253423810005188,
"learning_rate": 0.0001897654440354057,
"loss": 0.5411,
"step": 1640
},
{
"epoch": 0.4629890233605404,
"grad_norm": 1.0359442234039307,
"learning_rate": 0.00018965692472420554,
"loss": 0.5266,
"step": 1645
},
{
"epoch": 0.46439628482972134,
"grad_norm": 1.4265358448028564,
"learning_rate": 0.00018954786448150047,
"loss": 0.481,
"step": 1650
},
{
"epoch": 0.46580354629890236,
"grad_norm": 0.6981240510940552,
"learning_rate": 0.00018943826396528897,
"loss": 0.287,
"step": 1655
},
{
"epoch": 0.4672108077680833,
"grad_norm": 0.8274213671684265,
"learning_rate": 0.00018932812383682917,
"loss": 0.4081,
"step": 1660
},
{
"epoch": 0.4686180692372643,
"grad_norm": 0.7835836410522461,
"learning_rate": 0.0001892174447606349,
"loss": 0.344,
"step": 1665
},
{
"epoch": 0.47002533070644525,
"grad_norm": 1.9255175590515137,
"learning_rate": 0.00018910622740447167,
"loss": 0.6834,
"step": 1670
},
{
"epoch": 0.4714325921756262,
"grad_norm": 1.7480101585388184,
"learning_rate": 0.00018899447243935256,
"loss": 0.4431,
"step": 1675
},
{
"epoch": 0.4728398536448072,
"grad_norm": 0.7691779136657715,
"learning_rate": 0.00018888218053953425,
"loss": 0.5831,
"step": 1680
},
{
"epoch": 0.4742471151139882,
"grad_norm": 0.6671115756034851,
"learning_rate": 0.00018876935238251296,
"loss": 0.3096,
"step": 1685
},
{
"epoch": 0.47565437658316917,
"grad_norm": 0.7756052613258362,
"learning_rate": 0.00018865598864902035,
"loss": 0.4505,
"step": 1690
},
{
"epoch": 0.47706163805235013,
"grad_norm": 0.7612590193748474,
"learning_rate": 0.00018854209002301932,
"loss": 0.5595,
"step": 1695
},
{
"epoch": 0.4784688995215311,
"grad_norm": 0.9925332069396973,
"learning_rate": 0.00018842765719170006,
"loss": 0.3256,
"step": 1700
},
{
"epoch": 0.47987616099071206,
"grad_norm": 1.4211307764053345,
"learning_rate": 0.00018831269084547574,
"loss": 0.3897,
"step": 1705
},
{
"epoch": 0.48128342245989303,
"grad_norm": 0.8699591159820557,
"learning_rate": 0.00018819719167797842,
"loss": 0.348,
"step": 1710
},
{
"epoch": 0.48269068392907405,
"grad_norm": 1.1962676048278809,
"learning_rate": 0.00018808116038605493,
"loss": 0.6022,
"step": 1715
},
{
"epoch": 0.484097945398255,
"grad_norm": 1.0962321758270264,
"learning_rate": 0.00018796459766976247,
"loss": 0.4853,
"step": 1720
},
{
"epoch": 0.485505206867436,
"grad_norm": 1.8502682447433472,
"learning_rate": 0.00018784750423236462,
"loss": 0.5438,
"step": 1725
},
{
"epoch": 0.48691246833661694,
"grad_norm": 0.8780159950256348,
"learning_rate": 0.0001877298807803269,
"loss": 0.4728,
"step": 1730
},
{
"epoch": 0.4883197298057979,
"grad_norm": 1.3143213987350464,
"learning_rate": 0.00018761172802331263,
"loss": 0.648,
"step": 1735
},
{
"epoch": 0.4897269912749789,
"grad_norm": 1.3124626874923706,
"learning_rate": 0.00018749304667417863,
"loss": 0.568,
"step": 1740
},
{
"epoch": 0.49113425274415984,
"grad_norm": 1.2247035503387451,
"learning_rate": 0.0001873738374489709,
"loss": 0.3325,
"step": 1745
},
{
"epoch": 0.49254151421334086,
"grad_norm": 0.8056420683860779,
"learning_rate": 0.00018725410106692025,
"loss": 0.5355,
"step": 1750
},
{
"epoch": 0.4939487756825218,
"grad_norm": 1.782456636428833,
"learning_rate": 0.00018713383825043806,
"loss": 0.3927,
"step": 1755
},
{
"epoch": 0.4953560371517028,
"grad_norm": 0.9671362638473511,
"learning_rate": 0.00018701304972511187,
"loss": 0.4428,
"step": 1760
},
{
"epoch": 0.49676329862088375,
"grad_norm": 0.8646135330200195,
"learning_rate": 0.00018689173621970096,
"loss": 0.396,
"step": 1765
},
{
"epoch": 0.4981705600900647,
"grad_norm": 1.406186580657959,
"learning_rate": 0.00018676989846613205,
"loss": 0.4296,
"step": 1770
},
{
"epoch": 0.4995778215592457,
"grad_norm": 1.2148306369781494,
"learning_rate": 0.00018664753719949478,
"loss": 0.3217,
"step": 1775
},
{
"epoch": 0.5009850830284267,
"grad_norm": 2.317777395248413,
"learning_rate": 0.00018652465315803745,
"loss": 0.5039,
"step": 1780
},
{
"epoch": 0.5023923444976076,
"grad_norm": 2.461662530899048,
"learning_rate": 0.00018640124708316225,
"loss": 0.5716,
"step": 1785
},
{
"epoch": 0.5037996059667886,
"grad_norm": 1.3684732913970947,
"learning_rate": 0.0001862773197194211,
"loss": 0.3489,
"step": 1790
},
{
"epoch": 0.5052068674359697,
"grad_norm": 0.7968658208847046,
"learning_rate": 0.00018615287181451108,
"loss": 0.4202,
"step": 1795
},
{
"epoch": 0.5066141289051506,
"grad_norm": 1.1133559942245483,
"learning_rate": 0.00018602790411926975,
"loss": 0.4799,
"step": 1800
},
{
"epoch": 0.5080213903743316,
"grad_norm": 1.4438867568969727,
"learning_rate": 0.0001859024173876709,
"loss": 0.5841,
"step": 1805
},
{
"epoch": 0.5094286518435125,
"grad_norm": 0.5369459986686707,
"learning_rate": 0.0001857764123768196,
"loss": 0.4793,
"step": 1810
},
{
"epoch": 0.5108359133126935,
"grad_norm": 0.7949886918067932,
"learning_rate": 0.0001856498898469482,
"loss": 0.4041,
"step": 1815
},
{
"epoch": 0.5122431747818744,
"grad_norm": 0.5967936515808105,
"learning_rate": 0.00018552285056141124,
"loss": 0.3951,
"step": 1820
},
{
"epoch": 0.5136504362510554,
"grad_norm": 0.32833540439605713,
"learning_rate": 0.00018539529528668094,
"loss": 0.2362,
"step": 1825
},
{
"epoch": 0.5150576977202365,
"grad_norm": 0.7846612334251404,
"learning_rate": 0.00018526722479234286,
"loss": 0.4279,
"step": 1830
},
{
"epoch": 0.5164649591894174,
"grad_norm": 1.5786385536193848,
"learning_rate": 0.00018513863985109095,
"loss": 0.429,
"step": 1835
},
{
"epoch": 0.5178722206585984,
"grad_norm": 1.2571947574615479,
"learning_rate": 0.00018500954123872303,
"loss": 0.6325,
"step": 1840
},
{
"epoch": 0.5192794821277793,
"grad_norm": 0.807839035987854,
"learning_rate": 0.00018487992973413605,
"loss": 0.3732,
"step": 1845
},
{
"epoch": 0.5206867435969603,
"grad_norm": 0.9321346282958984,
"learning_rate": 0.00018474980611932144,
"loss": 0.5329,
"step": 1850
},
{
"epoch": 0.5220940050661413,
"grad_norm": 1.1516450643539429,
"learning_rate": 0.0001846191711793604,
"loss": 0.553,
"step": 1855
},
{
"epoch": 0.5235012665353223,
"grad_norm": 1.2552000284194946,
"learning_rate": 0.000184488025702419,
"loss": 0.5088,
"step": 1860
},
{
"epoch": 0.5249085280045033,
"grad_norm": 0.7412288188934326,
"learning_rate": 0.00018435637047974375,
"loss": 0.623,
"step": 1865
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.7325606942176819,
"learning_rate": 0.0001842242063056565,
"loss": 0.4663,
"step": 1870
},
{
"epoch": 0.5277230509428652,
"grad_norm": 0.7041971683502197,
"learning_rate": 0.0001840915339775498,
"loss": 0.3317,
"step": 1875
},
{
"epoch": 0.5291303124120461,
"grad_norm": 0.8097009062767029,
"learning_rate": 0.00018395835429588215,
"loss": 0.5374,
"step": 1880
},
{
"epoch": 0.5305375738812271,
"grad_norm": 0.5471770763397217,
"learning_rate": 0.000183824668064173,
"loss": 0.6708,
"step": 1885
},
{
"epoch": 0.5319448353504082,
"grad_norm": 0.9955052137374878,
"learning_rate": 0.00018369047608899798,
"loss": 0.3958,
"step": 1890
},
{
"epoch": 0.5333520968195891,
"grad_norm": 0.980060875415802,
"learning_rate": 0.00018355577917998414,
"loss": 0.5356,
"step": 1895
},
{
"epoch": 0.5347593582887701,
"grad_norm": 0.8592010736465454,
"learning_rate": 0.00018342057814980494,
"loss": 0.5253,
"step": 1900
},
{
"epoch": 0.536166619757951,
"grad_norm": 0.8325905799865723,
"learning_rate": 0.00018328487381417532,
"loss": 0.5743,
"step": 1905
},
{
"epoch": 0.537573881227132,
"grad_norm": 1.0972857475280762,
"learning_rate": 0.00018314866699184687,
"loss": 0.6613,
"step": 1910
},
{
"epoch": 0.5389811426963129,
"grad_norm": 0.9051984548568726,
"learning_rate": 0.00018301195850460293,
"loss": 0.5146,
"step": 1915
},
{
"epoch": 0.5403884041654939,
"grad_norm": 0.8490184545516968,
"learning_rate": 0.00018287474917725343,
"loss": 0.6052,
"step": 1920
},
{
"epoch": 0.541795665634675,
"grad_norm": 0.9744853377342224,
"learning_rate": 0.00018273703983763017,
"loss": 0.556,
"step": 1925
},
{
"epoch": 0.5432029271038559,
"grad_norm": 0.9393332600593567,
"learning_rate": 0.0001825988313165816,
"loss": 0.6805,
"step": 1930
},
{
"epoch": 0.5446101885730369,
"grad_norm": 0.786738932132721,
"learning_rate": 0.0001824601244479679,
"loss": 0.5313,
"step": 1935
},
{
"epoch": 0.5460174500422178,
"grad_norm": 1.7297477722167969,
"learning_rate": 0.00018232092006865606,
"loss": 0.6627,
"step": 1940
},
{
"epoch": 0.5474247115113988,
"grad_norm": 0.8226016759872437,
"learning_rate": 0.00018218121901851468,
"loss": 0.4177,
"step": 1945
},
{
"epoch": 0.5488319729805798,
"grad_norm": 1.1636661291122437,
"learning_rate": 0.0001820410221404089,
"loss": 0.5303,
"step": 1950
},
{
"epoch": 0.5502392344497608,
"grad_norm": 1.3004634380340576,
"learning_rate": 0.00018190033028019534,
"loss": 0.5114,
"step": 1955
},
{
"epoch": 0.5516464959189418,
"grad_norm": 1.512581706047058,
"learning_rate": 0.00018175914428671716,
"loss": 0.5918,
"step": 1960
},
{
"epoch": 0.5530537573881227,
"grad_norm": 0.7482631206512451,
"learning_rate": 0.0001816174650117987,
"loss": 0.6304,
"step": 1965
},
{
"epoch": 0.5544610188573037,
"grad_norm": 1.3120630979537964,
"learning_rate": 0.00018147529331024044,
"loss": 0.5008,
"step": 1970
},
{
"epoch": 0.5558682803264846,
"grad_norm": 0.9526933431625366,
"learning_rate": 0.00018133263003981384,
"loss": 0.6951,
"step": 1975
},
{
"epoch": 0.5572755417956656,
"grad_norm": 0.8142489194869995,
"learning_rate": 0.0001811894760612562,
"loss": 0.478,
"step": 1980
},
{
"epoch": 0.5586828032648467,
"grad_norm": 1.5639302730560303,
"learning_rate": 0.0001810458322382654,
"loss": 0.6378,
"step": 1985
},
{
"epoch": 0.5600900647340276,
"grad_norm": 0.6878836154937744,
"learning_rate": 0.00018090169943749476,
"loss": 0.6067,
"step": 1990
},
{
"epoch": 0.5614973262032086,
"grad_norm": 1.1296664476394653,
"learning_rate": 0.0001807570785285477,
"loss": 0.6044,
"step": 1995
},
{
"epoch": 0.5629045876723895,
"grad_norm": 0.837823748588562,
"learning_rate": 0.00018061197038397268,
"loss": 0.4684,
"step": 2000
},
{
"epoch": 0.5643118491415705,
"grad_norm": 1.2144043445587158,
"learning_rate": 0.0001804663758792577,
"loss": 0.3649,
"step": 2005
},
{
"epoch": 0.5657191106107515,
"grad_norm": 0.8372750878334045,
"learning_rate": 0.00018032029589282525,
"loss": 0.4253,
"step": 2010
},
{
"epoch": 0.5671263720799324,
"grad_norm": 0.8684276342391968,
"learning_rate": 0.00018017373130602683,
"loss": 0.3992,
"step": 2015
},
{
"epoch": 0.5685336335491135,
"grad_norm": 0.9675285816192627,
"learning_rate": 0.0001800266830031377,
"loss": 0.5995,
"step": 2020
},
{
"epoch": 0.5699408950182944,
"grad_norm": 0.9824860692024231,
"learning_rate": 0.00017987915187135157,
"loss": 0.2531,
"step": 2025
},
{
"epoch": 0.5713481564874754,
"grad_norm": 2.90608549118042,
"learning_rate": 0.0001797311388007753,
"loss": 0.6474,
"step": 2030
},
{
"epoch": 0.5727554179566563,
"grad_norm": 0.922585666179657,
"learning_rate": 0.00017958264468442332,
"loss": 0.4685,
"step": 2035
},
{
"epoch": 0.5741626794258373,
"grad_norm": 1.4679278135299683,
"learning_rate": 0.00017943367041821243,
"loss": 0.4786,
"step": 2040
},
{
"epoch": 0.5755699408950183,
"grad_norm": 0.8750627040863037,
"learning_rate": 0.00017928421690095636,
"loss": 0.317,
"step": 2045
},
{
"epoch": 0.5769772023641992,
"grad_norm": 1.1974796056747437,
"learning_rate": 0.00017913428503436035,
"loss": 0.496,
"step": 2050
},
{
"epoch": 0.5783844638333803,
"grad_norm": 0.8931379914283752,
"learning_rate": 0.00017898387572301563,
"loss": 0.6886,
"step": 2055
},
{
"epoch": 0.5797917253025612,
"grad_norm": 1.0573607683181763,
"learning_rate": 0.00017883298987439404,
"loss": 0.5887,
"step": 2060
},
{
"epoch": 0.5811989867717422,
"grad_norm": 1.1087405681610107,
"learning_rate": 0.00017868162839884254,
"loss": 0.5817,
"step": 2065
},
{
"epoch": 0.5826062482409231,
"grad_norm": 0.5602430701255798,
"learning_rate": 0.00017852979220957775,
"loss": 0.4194,
"step": 2070
},
{
"epoch": 0.5840135097101041,
"grad_norm": 0.9328368306159973,
"learning_rate": 0.00017837748222268037,
"loss": 0.3816,
"step": 2075
},
{
"epoch": 0.5854207711792851,
"grad_norm": 1.4052832126617432,
"learning_rate": 0.00017822469935708965,
"loss": 0.7981,
"step": 2080
},
{
"epoch": 0.5868280326484661,
"grad_norm": 1.0276223421096802,
"learning_rate": 0.00017807144453459793,
"loss": 0.4105,
"step": 2085
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.257156491279602,
"learning_rate": 0.00017791771867984503,
"loss": 0.5565,
"step": 2090
},
{
"epoch": 0.589642555586828,
"grad_norm": 1.0978988409042358,
"learning_rate": 0.00017776352272031264,
"loss": 0.5929,
"step": 2095
},
{
"epoch": 0.591049817056009,
"grad_norm": 0.8809897303581238,
"learning_rate": 0.0001776088575863188,
"loss": 0.3527,
"step": 2100
},
{
"epoch": 0.59245707852519,
"grad_norm": 0.6997563242912292,
"learning_rate": 0.00017745372421101223,
"loss": 0.5211,
"step": 2105
},
{
"epoch": 0.5938643399943709,
"grad_norm": 0.9955636262893677,
"learning_rate": 0.00017729812353036668,
"loss": 0.5267,
"step": 2110
},
{
"epoch": 0.595271601463552,
"grad_norm": 0.8788183927536011,
"learning_rate": 0.00017714205648317535,
"loss": 0.5372,
"step": 2115
},
{
"epoch": 0.5966788629327329,
"grad_norm": 1.0072330236434937,
"learning_rate": 0.00017698552401104517,
"loss": 0.5234,
"step": 2120
},
{
"epoch": 0.5980861244019139,
"grad_norm": 1.6254470348358154,
"learning_rate": 0.00017682852705839115,
"loss": 0.4621,
"step": 2125
},
{
"epoch": 0.5994933858710948,
"grad_norm": 1.0389853715896606,
"learning_rate": 0.00017667106657243072,
"loss": 0.5439,
"step": 2130
},
{
"epoch": 0.6009006473402758,
"grad_norm": 0.9769371151924133,
"learning_rate": 0.00017651314350317787,
"loss": 0.6171,
"step": 2135
},
{
"epoch": 0.6023079088094568,
"grad_norm": 1.7502343654632568,
"learning_rate": 0.0001763547588034376,
"loss": 0.612,
"step": 2140
},
{
"epoch": 0.6037151702786377,
"grad_norm": 1.1023430824279785,
"learning_rate": 0.00017619591342880005,
"loss": 0.4228,
"step": 2145
},
{
"epoch": 0.6051224317478188,
"grad_norm": 2.0511550903320312,
"learning_rate": 0.00017603660833763476,
"loss": 0.3462,
"step": 2150
},
{
"epoch": 0.6065296932169997,
"grad_norm": 0.7986024022102356,
"learning_rate": 0.00017587684449108497,
"loss": 0.4616,
"step": 2155
},
{
"epoch": 0.6079369546861807,
"grad_norm": 0.7450430989265442,
"learning_rate": 0.00017571662285306166,
"loss": 0.5481,
"step": 2160
},
{
"epoch": 0.6093442161553617,
"grad_norm": 1.1748677492141724,
"learning_rate": 0.00017555594439023787,
"loss": 0.5419,
"step": 2165
},
{
"epoch": 0.6107514776245426,
"grad_norm": 0.7183251976966858,
"learning_rate": 0.0001753948100720429,
"loss": 0.4122,
"step": 2170
},
{
"epoch": 0.6121587390937236,
"grad_norm": 0.7296462655067444,
"learning_rate": 0.00017523322087065614,
"loss": 0.3651,
"step": 2175
},
{
"epoch": 0.6135660005629046,
"grad_norm": 0.5904517769813538,
"learning_rate": 0.00017507117776100178,
"loss": 0.3728,
"step": 2180
},
{
"epoch": 0.6149732620320856,
"grad_norm": 1.5718715190887451,
"learning_rate": 0.00017490868172074232,
"loss": 0.4729,
"step": 2185
},
{
"epoch": 0.6163805235012665,
"grad_norm": 1.053885579109192,
"learning_rate": 0.00017474573373027315,
"loss": 0.4341,
"step": 2190
},
{
"epoch": 0.6177877849704475,
"grad_norm": 0.723726212978363,
"learning_rate": 0.00017458233477271628,
"loss": 0.4755,
"step": 2195
},
{
"epoch": 0.6191950464396285,
"grad_norm": 1.133907437324524,
"learning_rate": 0.00017441848583391463,
"loss": 0.7399,
"step": 2200
},
{
"epoch": 0.6206023079088094,
"grad_norm": 0.5922422409057617,
"learning_rate": 0.00017425418790242606,
"loss": 0.4381,
"step": 2205
},
{
"epoch": 0.6220095693779905,
"grad_norm": 0.534817636013031,
"learning_rate": 0.0001740894419695172,
"loss": 0.4668,
"step": 2210
},
{
"epoch": 0.6234168308471714,
"grad_norm": 0.5950006246566772,
"learning_rate": 0.00017392424902915786,
"loss": 0.3497,
"step": 2215
},
{
"epoch": 0.6248240923163524,
"grad_norm": 3.878748655319214,
"learning_rate": 0.00017375861007801465,
"loss": 0.2247,
"step": 2220
},
{
"epoch": 0.6262313537855334,
"grad_norm": 1.3402066230773926,
"learning_rate": 0.00017359252611544505,
"loss": 0.3214,
"step": 2225
},
{
"epoch": 0.6276386152547143,
"grad_norm": 1.3445652723312378,
"learning_rate": 0.0001734259981434917,
"loss": 0.4757,
"step": 2230
},
{
"epoch": 0.6290458767238953,
"grad_norm": 0.801052987575531,
"learning_rate": 0.00017325902716687578,
"loss": 0.542,
"step": 2235
},
{
"epoch": 0.6304531381930762,
"grad_norm": 0.6313127279281616,
"learning_rate": 0.0001730916141929916,
"loss": 0.6026,
"step": 2240
},
{
"epoch": 0.6318603996622573,
"grad_norm": 0.7048347592353821,
"learning_rate": 0.00017292376023189996,
"loss": 0.4769,
"step": 2245
},
{
"epoch": 0.6332676611314382,
"grad_norm": 1.3377580642700195,
"learning_rate": 0.00017275546629632235,
"loss": 0.3727,
"step": 2250
},
{
"epoch": 0.6346749226006192,
"grad_norm": 1.3854931592941284,
"learning_rate": 0.00017258673340163485,
"loss": 0.4537,
"step": 2255
},
{
"epoch": 0.6360821840698002,
"grad_norm": 1.5850138664245605,
"learning_rate": 0.00017241756256586183,
"loss": 0.5933,
"step": 2260
},
{
"epoch": 0.6374894455389811,
"grad_norm": 1.3591883182525635,
"learning_rate": 0.00017224795480967,
"loss": 0.3786,
"step": 2265
},
{
"epoch": 0.6388967070081621,
"grad_norm": 0.685483992099762,
"learning_rate": 0.00017207791115636206,
"loss": 0.3562,
"step": 2270
},
{
"epoch": 0.640303968477343,
"grad_norm": 1.1758111715316772,
"learning_rate": 0.00017190743263187076,
"loss": 0.3506,
"step": 2275
},
{
"epoch": 0.6417112299465241,
"grad_norm": 0.9146699905395508,
"learning_rate": 0.00017173652026475247,
"loss": 0.4753,
"step": 2280
},
{
"epoch": 0.643118491415705,
"grad_norm": 0.6895302534103394,
"learning_rate": 0.00017156517508618116,
"loss": 0.2637,
"step": 2285
},
{
"epoch": 0.644525752884886,
"grad_norm": 1.011983036994934,
"learning_rate": 0.00017139339812994204,
"loss": 0.551,
"step": 2290
},
{
"epoch": 0.645933014354067,
"grad_norm": 1.5470740795135498,
"learning_rate": 0.0001712211904324254,
"loss": 0.6397,
"step": 2295
},
{
"epoch": 0.6473402758232479,
"grad_norm": 0.8334661722183228,
"learning_rate": 0.0001710485530326204,
"loss": 0.3297,
"step": 2300
},
{
"epoch": 0.648747537292429,
"grad_norm": 1.3184936046600342,
"learning_rate": 0.00017087548697210868,
"loss": 0.2933,
"step": 2305
},
{
"epoch": 0.6501547987616099,
"grad_norm": 0.6180691719055176,
"learning_rate": 0.00017070199329505815,
"loss": 0.316,
"step": 2310
},
{
"epoch": 0.6515620602307909,
"grad_norm": 1.5314627885818481,
"learning_rate": 0.00017052807304821673,
"loss": 0.4908,
"step": 2315
},
{
"epoch": 0.6529693216999719,
"grad_norm": 0.2867351472377777,
"learning_rate": 0.0001703537272809059,
"loss": 0.4078,
"step": 2320
},
{
"epoch": 0.6543765831691528,
"grad_norm": 1.513857126235962,
"learning_rate": 0.00017017895704501447,
"loss": 0.5121,
"step": 2325
},
{
"epoch": 0.6557838446383338,
"grad_norm": 0.7989262938499451,
"learning_rate": 0.00017000376339499233,
"loss": 0.4578,
"step": 2330
},
{
"epoch": 0.6571911061075147,
"grad_norm": 1.8081159591674805,
"learning_rate": 0.00016982814738784386,
"loss": 0.3809,
"step": 2335
},
{
"epoch": 0.6585983675766958,
"grad_norm": 1.2163859605789185,
"learning_rate": 0.0001696521100831216,
"loss": 0.3293,
"step": 2340
},
{
"epoch": 0.6600056290458767,
"grad_norm": 1.5051732063293457,
"learning_rate": 0.00016947565254292016,
"loss": 0.33,
"step": 2345
},
{
"epoch": 0.6614128905150577,
"grad_norm": 0.6793294548988342,
"learning_rate": 0.00016929877583186936,
"loss": 0.5292,
"step": 2350
},
{
"epoch": 0.6628201519842387,
"grad_norm": 1.8864996433258057,
"learning_rate": 0.00016912148101712814,
"loss": 0.1853,
"step": 2355
},
{
"epoch": 0.6642274134534196,
"grad_norm": 1.2697969675064087,
"learning_rate": 0.00016894376916837795,
"loss": 0.4886,
"step": 2360
},
{
"epoch": 0.6656346749226006,
"grad_norm": 1.4264556169509888,
"learning_rate": 0.00016876564135781638,
"loss": 0.5061,
"step": 2365
},
{
"epoch": 0.6670419363917816,
"grad_norm": 0.5291624665260315,
"learning_rate": 0.00016858709866015065,
"loss": 0.4241,
"step": 2370
},
{
"epoch": 0.6684491978609626,
"grad_norm": 1.5842996835708618,
"learning_rate": 0.00016840814215259112,
"loss": 0.4321,
"step": 2375
},
{
"epoch": 0.6698564593301436,
"grad_norm": 0.7339175939559937,
"learning_rate": 0.0001682287729148449,
"loss": 0.4975,
"step": 2380
},
{
"epoch": 0.6712637207993245,
"grad_norm": 0.6193541884422302,
"learning_rate": 0.00016804899202910907,
"loss": 0.1977,
"step": 2385
},
{
"epoch": 0.6726709822685055,
"grad_norm": 1.8930505514144897,
"learning_rate": 0.00016786880058006453,
"loss": 0.6117,
"step": 2390
},
{
"epoch": 0.6740782437376864,
"grad_norm": 1.268921971321106,
"learning_rate": 0.0001676881996548691,
"loss": 0.5449,
"step": 2395
},
{
"epoch": 0.6754855052068675,
"grad_norm": 1.5368669033050537,
"learning_rate": 0.00016750719034315121,
"loss": 0.4734,
"step": 2400
},
{
"epoch": 0.6768927666760484,
"grad_norm": 0.8705158233642578,
"learning_rate": 0.00016732577373700314,
"loss": 0.4644,
"step": 2405
},
{
"epoch": 0.6783000281452294,
"grad_norm": 0.3128531873226166,
"learning_rate": 0.00016714395093097458,
"loss": 0.4438,
"step": 2410
},
{
"epoch": 0.6797072896144104,
"grad_norm": 1.795952558517456,
"learning_rate": 0.00016696172302206597,
"loss": 0.463,
"step": 2415
},
{
"epoch": 0.6811145510835913,
"grad_norm": 0.8031005263328552,
"learning_rate": 0.00016677909110972183,
"loss": 0.727,
"step": 2420
},
{
"epoch": 0.6825218125527723,
"grad_norm": 1.083425760269165,
"learning_rate": 0.00016659605629582418,
"loss": 0.6498,
"step": 2425
},
{
"epoch": 0.6839290740219532,
"grad_norm": 0.9262056350708008,
"learning_rate": 0.00016641261968468598,
"loss": 0.3122,
"step": 2430
},
{
"epoch": 0.6853363354911343,
"grad_norm": 0.27757611870765686,
"learning_rate": 0.00016622878238304424,
"loss": 0.3477,
"step": 2435
},
{
"epoch": 0.6867435969603152,
"grad_norm": 0.6037611365318298,
"learning_rate": 0.00016604454550005356,
"loss": 0.2896,
"step": 2440
},
{
"epoch": 0.6881508584294962,
"grad_norm": 0.7902546525001526,
"learning_rate": 0.00016585991014727932,
"loss": 0.6687,
"step": 2445
},
{
"epoch": 0.6895581198986772,
"grad_norm": 0.8998187184333801,
"learning_rate": 0.000165674877438691,
"loss": 0.5168,
"step": 2450
},
{
"epoch": 0.6909653813678581,
"grad_norm": 0.9715900421142578,
"learning_rate": 0.0001654894484906555,
"loss": 0.6263,
"step": 2455
},
{
"epoch": 0.6923726428370391,
"grad_norm": 1.390411138534546,
"learning_rate": 0.00016530362442193037,
"loss": 0.4905,
"step": 2460
},
{
"epoch": 0.69377990430622,
"grad_norm": 0.8985224366188049,
"learning_rate": 0.00016511740635365705,
"loss": 0.5525,
"step": 2465
},
{
"epoch": 0.6951871657754011,
"grad_norm": 0.8099625110626221,
"learning_rate": 0.00016493079540935406,
"loss": 0.3906,
"step": 2470
},
{
"epoch": 0.6965944272445821,
"grad_norm": 1.9844683408737183,
"learning_rate": 0.00016474379271491033,
"loss": 0.5456,
"step": 2475
},
{
"epoch": 0.698001688713763,
"grad_norm": 1.053562045097351,
"learning_rate": 0.00016455639939857842,
"loss": 0.2934,
"step": 2480
},
{
"epoch": 0.699408950182944,
"grad_norm": 1.4200698137283325,
"learning_rate": 0.00016436861659096752,
"loss": 0.6771,
"step": 2485
},
{
"epoch": 0.7008162116521249,
"grad_norm": 0.7813885807991028,
"learning_rate": 0.00016418044542503685,
"loss": 0.357,
"step": 2490
},
{
"epoch": 0.702223473121306,
"grad_norm": 1.131839632987976,
"learning_rate": 0.00016399188703608867,
"loss": 0.528,
"step": 2495
},
{
"epoch": 0.7036307345904869,
"grad_norm": 0.7668808698654175,
"learning_rate": 0.00016380294256176155,
"loss": 0.4434,
"step": 2500
},
{
"epoch": 0.7050379960596679,
"grad_norm": 2.0037477016448975,
"learning_rate": 0.00016361361314202343,
"loss": 0.5884,
"step": 2505
},
{
"epoch": 0.7064452575288489,
"grad_norm": 0.726494550704956,
"learning_rate": 0.0001634238999191647,
"loss": 0.4555,
"step": 2510
},
{
"epoch": 0.7078525189980298,
"grad_norm": 0.5868455171585083,
"learning_rate": 0.0001632338040377915,
"loss": 0.4513,
"step": 2515
},
{
"epoch": 0.7092597804672108,
"grad_norm": 0.8666847348213196,
"learning_rate": 0.00016304332664481848,
"loss": 0.7028,
"step": 2520
},
{
"epoch": 0.7106670419363917,
"grad_norm": 1.0513399839401245,
"learning_rate": 0.00016285246888946234,
"loss": 0.3972,
"step": 2525
},
{
"epoch": 0.7120743034055728,
"grad_norm": 0.765617847442627,
"learning_rate": 0.0001626612319232344,
"loss": 0.4364,
"step": 2530
},
{
"epoch": 0.7134815648747538,
"grad_norm": 0.7804258465766907,
"learning_rate": 0.00016246961689993404,
"loss": 0.6756,
"step": 2535
},
{
"epoch": 0.7148888263439347,
"grad_norm": 1.0644882917404175,
"learning_rate": 0.00016227762497564153,
"loss": 0.4398,
"step": 2540
},
{
"epoch": 0.7162960878131157,
"grad_norm": 1.0868752002716064,
"learning_rate": 0.0001620852573087111,
"loss": 0.4097,
"step": 2545
},
{
"epoch": 0.7177033492822966,
"grad_norm": 0.877193033695221,
"learning_rate": 0.00016189251505976403,
"loss": 0.4445,
"step": 2550
},
{
"epoch": 0.7191106107514776,
"grad_norm": 1.735767126083374,
"learning_rate": 0.00016169939939168155,
"loss": 0.4002,
"step": 2555
},
{
"epoch": 0.7205178722206586,
"grad_norm": 0.679560124874115,
"learning_rate": 0.00016150591146959787,
"loss": 0.4376,
"step": 2560
},
{
"epoch": 0.7219251336898396,
"grad_norm": 0.7569028735160828,
"learning_rate": 0.00016131205246089304,
"loss": 0.5988,
"step": 2565
},
{
"epoch": 0.7233323951590206,
"grad_norm": 0.7681282758712769,
"learning_rate": 0.00016111782353518624,
"loss": 0.6736,
"step": 2570
},
{
"epoch": 0.7247396566282015,
"grad_norm": 0.9109302759170532,
"learning_rate": 0.0001609232258643282,
"loss": 0.4269,
"step": 2575
},
{
"epoch": 0.7261469180973825,
"grad_norm": 1.033499002456665,
"learning_rate": 0.00016072826062239458,
"loss": 0.4186,
"step": 2580
},
{
"epoch": 0.7275541795665634,
"grad_norm": 0.765438437461853,
"learning_rate": 0.00016053292898567876,
"loss": 0.4688,
"step": 2585
},
{
"epoch": 0.7289614410357445,
"grad_norm": 1.352359414100647,
"learning_rate": 0.00016033723213268464,
"loss": 0.4242,
"step": 2590
},
{
"epoch": 0.7303687025049254,
"grad_norm": 0.9118134379386902,
"learning_rate": 0.00016014117124411954,
"loss": 0.4915,
"step": 2595
},
{
"epoch": 0.7317759639741064,
"grad_norm": 1.1372839212417603,
"learning_rate": 0.00015994474750288725,
"loss": 0.3128,
"step": 2600
},
{
"epoch": 0.7331832254432874,
"grad_norm": 0.23089000582695007,
"learning_rate": 0.00015974796209408071,
"loss": 0.4923,
"step": 2605
},
{
"epoch": 0.7345904869124683,
"grad_norm": 1.543110728263855,
"learning_rate": 0.00015955081620497497,
"loss": 0.5901,
"step": 2610
},
{
"epoch": 0.7359977483816493,
"grad_norm": 1.474463939666748,
"learning_rate": 0.00015935331102501994,
"loss": 0.5367,
"step": 2615
},
{
"epoch": 0.7374050098508302,
"grad_norm": 0.7584693431854248,
"learning_rate": 0.00015915544774583324,
"loss": 0.6098,
"step": 2620
},
{
"epoch": 0.7388122713200113,
"grad_norm": 0.6778565645217896,
"learning_rate": 0.0001589572275611931,
"loss": 0.4514,
"step": 2625
},
{
"epoch": 0.7402195327891923,
"grad_norm": 0.7713000178337097,
"learning_rate": 0.00015875865166703105,
"loss": 0.2646,
"step": 2630
},
{
"epoch": 0.7416267942583732,
"grad_norm": 1.2152999639511108,
"learning_rate": 0.0001585597212614247,
"loss": 0.5909,
"step": 2635
},
{
"epoch": 0.7430340557275542,
"grad_norm": 1.4983125925064087,
"learning_rate": 0.00015836043754459064,
"loss": 0.4621,
"step": 2640
},
{
"epoch": 0.7444413171967351,
"grad_norm": 1.0301270484924316,
"learning_rate": 0.000158160801718877,
"loss": 0.2372,
"step": 2645
},
{
"epoch": 0.7458485786659161,
"grad_norm": 1.2305338382720947,
"learning_rate": 0.0001579608149887564,
"loss": 0.3397,
"step": 2650
},
{
"epoch": 0.747255840135097,
"grad_norm": 1.1948976516723633,
"learning_rate": 0.00015776047856081853,
"loss": 0.3388,
"step": 2655
},
{
"epoch": 0.7486631016042781,
"grad_norm": 1.539473295211792,
"learning_rate": 0.00015755979364376295,
"loss": 0.239,
"step": 2660
},
{
"epoch": 0.7500703630734591,
"grad_norm": 2.136974811553955,
"learning_rate": 0.0001573587614483918,
"loss": 0.5409,
"step": 2665
},
{
"epoch": 0.75147762454264,
"grad_norm": 1.2603963613510132,
"learning_rate": 0.0001571573831876024,
"loss": 0.3763,
"step": 2670
},
{
"epoch": 0.752884886011821,
"grad_norm": 0.9054425954818726,
"learning_rate": 0.00015695566007638013,
"loss": 0.4531,
"step": 2675
},
{
"epoch": 0.7542921474810019,
"grad_norm": 0.6948245763778687,
"learning_rate": 0.0001567535933317908,
"loss": 0.3894,
"step": 2680
},
{
"epoch": 0.755699408950183,
"grad_norm": 1.3231799602508545,
"learning_rate": 0.00015655118417297366,
"loss": 0.4352,
"step": 2685
},
{
"epoch": 0.757106670419364,
"grad_norm": 0.8093194365501404,
"learning_rate": 0.00015634843382113372,
"loss": 0.5505,
"step": 2690
},
{
"epoch": 0.7585139318885449,
"grad_norm": 0.7088418006896973,
"learning_rate": 0.0001561453434995346,
"loss": 0.4232,
"step": 2695
},
{
"epoch": 0.7599211933577259,
"grad_norm": 0.48376569151878357,
"learning_rate": 0.00015594191443349105,
"loss": 0.5123,
"step": 2700
},
{
"epoch": 0.7613284548269068,
"grad_norm": 1.2853504419326782,
"learning_rate": 0.00015573814785036164,
"loss": 0.3733,
"step": 2705
},
{
"epoch": 0.7627357162960878,
"grad_norm": 0.7034462690353394,
"learning_rate": 0.00015553404497954117,
"loss": 0.4144,
"step": 2710
},
{
"epoch": 0.7641429777652687,
"grad_norm": 1.340484380722046,
"learning_rate": 0.00015532960705245356,
"loss": 0.4388,
"step": 2715
},
{
"epoch": 0.7655502392344498,
"grad_norm": 0.7512633204460144,
"learning_rate": 0.00015512483530254412,
"loss": 0.4672,
"step": 2720
},
{
"epoch": 0.7669575007036308,
"grad_norm": 2.1453585624694824,
"learning_rate": 0.00015491973096527217,
"loss": 0.8132,
"step": 2725
},
{
"epoch": 0.7683647621728117,
"grad_norm": 1.0686702728271484,
"learning_rate": 0.00015471429527810383,
"loss": 0.3679,
"step": 2730
},
{
"epoch": 0.7697720236419927,
"grad_norm": 1.7490125894546509,
"learning_rate": 0.00015450852948050426,
"loss": 0.3288,
"step": 2735
},
{
"epoch": 0.7711792851111736,
"grad_norm": 1.7581394910812378,
"learning_rate": 0.00015430243481393024,
"loss": 0.6833,
"step": 2740
},
{
"epoch": 0.7725865465803546,
"grad_norm": 1.5255379676818848,
"learning_rate": 0.00015409601252182285,
"loss": 0.4711,
"step": 2745
},
{
"epoch": 0.7739938080495357,
"grad_norm": 1.7117855548858643,
"learning_rate": 0.00015388926384959976,
"loss": 0.6609,
"step": 2750
},
{
"epoch": 0.7754010695187166,
"grad_norm": 0.5109424591064453,
"learning_rate": 0.00015368219004464786,
"loss": 0.3426,
"step": 2755
},
{
"epoch": 0.7768083309878976,
"grad_norm": 1.3394129276275635,
"learning_rate": 0.0001534747923563156,
"loss": 0.4882,
"step": 2760
},
{
"epoch": 0.7782155924570785,
"grad_norm": 1.1809154748916626,
"learning_rate": 0.00015326707203590568,
"loss": 0.262,
"step": 2765
},
{
"epoch": 0.7796228539262595,
"grad_norm": 0.6428471207618713,
"learning_rate": 0.0001530590303366672,
"loss": 0.3657,
"step": 2770
},
{
"epoch": 0.7810301153954404,
"grad_norm": 0.5726737976074219,
"learning_rate": 0.0001528506685137883,
"loss": 0.4514,
"step": 2775
},
{
"epoch": 0.7824373768646214,
"grad_norm": 0.589094877243042,
"learning_rate": 0.00015264198782438858,
"loss": 0.5539,
"step": 2780
},
{
"epoch": 0.7838446383338025,
"grad_norm": 0.7207341194152832,
"learning_rate": 0.00015243298952751145,
"loss": 0.3529,
"step": 2785
},
{
"epoch": 0.7852518998029834,
"grad_norm": 1.0593701601028442,
"learning_rate": 0.0001522236748841165,
"loss": 0.317,
"step": 2790
},
{
"epoch": 0.7866591612721644,
"grad_norm": 1.1395798921585083,
"learning_rate": 0.000152014045157072,
"loss": 0.5062,
"step": 2795
},
{
"epoch": 0.7880664227413453,
"grad_norm": 1.3966251611709595,
"learning_rate": 0.00015180410161114724,
"loss": 0.4887,
"step": 2800
},
{
"epoch": 0.7894736842105263,
"grad_norm": 0.7492479681968689,
"learning_rate": 0.00015159384551300493,
"loss": 0.3919,
"step": 2805
},
{
"epoch": 0.7908809456797072,
"grad_norm": 1.2680071592330933,
"learning_rate": 0.00015138327813119337,
"loss": 0.3053,
"step": 2810
},
{
"epoch": 0.7922882071488883,
"grad_norm": 1.4319703578948975,
"learning_rate": 0.00015117240073613908,
"loss": 0.3683,
"step": 2815
},
{
"epoch": 0.7936954686180693,
"grad_norm": 1.0931735038757324,
"learning_rate": 0.00015096121460013895,
"loss": 0.5054,
"step": 2820
},
{
"epoch": 0.7951027300872502,
"grad_norm": 0.627133309841156,
"learning_rate": 0.00015074972099735266,
"loss": 0.4424,
"step": 2825
},
{
"epoch": 0.7965099915564312,
"grad_norm": 0.90239417552948,
"learning_rate": 0.00015053792120379476,
"loss": 0.5346,
"step": 2830
},
{
"epoch": 0.7979172530256121,
"grad_norm": 1.3932188749313354,
"learning_rate": 0.0001503258164973274,
"loss": 0.5265,
"step": 2835
},
{
"epoch": 0.7993245144947931,
"grad_norm": 1.2821606397628784,
"learning_rate": 0.0001501134081576523,
"loss": 0.3778,
"step": 2840
},
{
"epoch": 0.8007317759639742,
"grad_norm": 0.8399055600166321,
"learning_rate": 0.00014990069746630299,
"loss": 0.5459,
"step": 2845
},
{
"epoch": 0.8021390374331551,
"grad_norm": 2.0415430068969727,
"learning_rate": 0.00014968768570663735,
"loss": 0.534,
"step": 2850
},
{
"epoch": 0.8035462989023361,
"grad_norm": 1.1202126741409302,
"learning_rate": 0.00014947437416382956,
"loss": 0.3913,
"step": 2855
},
{
"epoch": 0.804953560371517,
"grad_norm": 1.3579108715057373,
"learning_rate": 0.00014926076412486263,
"loss": 0.3769,
"step": 2860
},
{
"epoch": 0.806360821840698,
"grad_norm": 1.1060523986816406,
"learning_rate": 0.00014904685687852043,
"loss": 0.4045,
"step": 2865
},
{
"epoch": 0.8077680833098789,
"grad_norm": 1.785001277923584,
"learning_rate": 0.00014883265371538,
"loss": 0.4895,
"step": 2870
},
{
"epoch": 0.80917534477906,
"grad_norm": 0.7138920426368713,
"learning_rate": 0.00014861815592780378,
"loss": 0.2431,
"step": 2875
},
{
"epoch": 0.810582606248241,
"grad_norm": 1.0932033061981201,
"learning_rate": 0.00014840336480993172,
"loss": 0.4196,
"step": 2880
},
{
"epoch": 0.8119898677174219,
"grad_norm": 1.47943115234375,
"learning_rate": 0.00014818828165767355,
"loss": 0.4288,
"step": 2885
},
{
"epoch": 0.8133971291866029,
"grad_norm": 1.5669611692428589,
"learning_rate": 0.00014797290776870101,
"loss": 0.7103,
"step": 2890
},
{
"epoch": 0.8148043906557838,
"grad_norm": 1.002616047859192,
"learning_rate": 0.0001477572444424399,
"loss": 0.2174,
"step": 2895
},
{
"epoch": 0.8162116521249648,
"grad_norm": 1.2607040405273438,
"learning_rate": 0.00014754129298006228,
"loss": 0.3312,
"step": 2900
},
{
"epoch": 0.8176189135941458,
"grad_norm": 1.2113310098648071,
"learning_rate": 0.00014732505468447867,
"loss": 0.309,
"step": 2905
},
{
"epoch": 0.8190261750633268,
"grad_norm": 0.6215373277664185,
"learning_rate": 0.00014710853086033013,
"loss": 0.3802,
"step": 2910
},
{
"epoch": 0.8204334365325078,
"grad_norm": 0.9997283220291138,
"learning_rate": 0.00014689172281398042,
"loss": 0.5467,
"step": 2915
},
{
"epoch": 0.8218406980016887,
"grad_norm": 0.7299907803535461,
"learning_rate": 0.0001466746318535082,
"loss": 0.4039,
"step": 2920
},
{
"epoch": 0.8232479594708697,
"grad_norm": 0.8940709829330444,
"learning_rate": 0.00014645725928869892,
"loss": 0.282,
"step": 2925
},
{
"epoch": 0.8246552209400506,
"grad_norm": 1.1947124004364014,
"learning_rate": 0.00014623960643103705,
"loss": 0.4364,
"step": 2930
},
{
"epoch": 0.8260624824092316,
"grad_norm": 0.6835992932319641,
"learning_rate": 0.00014602167459369826,
"loss": 0.4539,
"step": 2935
},
{
"epoch": 0.8274697438784127,
"grad_norm": 0.7021106481552124,
"learning_rate": 0.00014580346509154136,
"loss": 0.2876,
"step": 2940
},
{
"epoch": 0.8288770053475936,
"grad_norm": 1.7289482355117798,
"learning_rate": 0.00014558497924110038,
"loss": 0.4377,
"step": 2945
},
{
"epoch": 0.8302842668167746,
"grad_norm": 1.0549077987670898,
"learning_rate": 0.00014536621836057665,
"loss": 0.5667,
"step": 2950
},
{
"epoch": 0.8316915282859555,
"grad_norm": 0.5255772471427917,
"learning_rate": 0.000145147183769831,
"loss": 0.4976,
"step": 2955
},
{
"epoch": 0.8330987897551365,
"grad_norm": 2.376354694366455,
"learning_rate": 0.00014492787679037537,
"loss": 0.8001,
"step": 2960
},
{
"epoch": 0.8345060512243174,
"grad_norm": 0.8916311264038086,
"learning_rate": 0.0001447082987453654,
"loss": 0.4217,
"step": 2965
},
{
"epoch": 0.8359133126934984,
"grad_norm": 0.5236600637435913,
"learning_rate": 0.00014448845095959192,
"loss": 0.4531,
"step": 2970
},
{
"epoch": 0.8373205741626795,
"grad_norm": 1.5615344047546387,
"learning_rate": 0.00014426833475947345,
"loss": 0.3796,
"step": 2975
},
{
"epoch": 0.8387278356318604,
"grad_norm": 0.6851219534873962,
"learning_rate": 0.00014404795147304774,
"loss": 0.3966,
"step": 2980
},
{
"epoch": 0.8401350971010414,
"grad_norm": 1.6611498594284058,
"learning_rate": 0.00014382730242996404,
"loss": 0.6284,
"step": 2985
},
{
"epoch": 0.8415423585702223,
"grad_norm": 2.139336109161377,
"learning_rate": 0.00014360638896147501,
"loss": 0.4697,
"step": 2990
},
{
"epoch": 0.8429496200394033,
"grad_norm": 1.0581591129302979,
"learning_rate": 0.00014338521240042873,
"loss": 0.5119,
"step": 2995
},
{
"epoch": 0.8443568815085843,
"grad_norm": 0.885945200920105,
"learning_rate": 0.00014316377408126046,
"loss": 0.4225,
"step": 3000
},
{
"epoch": 0.8457641429777653,
"grad_norm": 2.1063387393951416,
"learning_rate": 0.00014294207533998486,
"loss": 0.4308,
"step": 3005
},
{
"epoch": 0.8471714044469463,
"grad_norm": 0.6381533741950989,
"learning_rate": 0.00014272011751418782,
"loss": 0.4063,
"step": 3010
},
{
"epoch": 0.8485786659161272,
"grad_norm": 0.740987241268158,
"learning_rate": 0.00014249790194301832,
"loss": 0.2807,
"step": 3015
},
{
"epoch": 0.8499859273853082,
"grad_norm": 0.8399060964584351,
"learning_rate": 0.0001422754299671804,
"loss": 0.3904,
"step": 3020
},
{
"epoch": 0.8513931888544891,
"grad_norm": 1.4542044401168823,
"learning_rate": 0.00014205270292892512,
"loss": 0.5098,
"step": 3025
},
{
"epoch": 0.8528004503236701,
"grad_norm": 0.8759632706642151,
"learning_rate": 0.00014182972217204238,
"loss": 0.438,
"step": 3030
},
{
"epoch": 0.8542077117928512,
"grad_norm": 1.2544376850128174,
"learning_rate": 0.00014160648904185295,
"loss": 0.3654,
"step": 3035
},
{
"epoch": 0.8556149732620321,
"grad_norm": 0.9191109538078308,
"learning_rate": 0.00014138300488520007,
"loss": 0.4855,
"step": 3040
},
{
"epoch": 0.8570222347312131,
"grad_norm": 1.2452969551086426,
"learning_rate": 0.00014115927105044172,
"loss": 0.1865,
"step": 3045
},
{
"epoch": 0.858429496200394,
"grad_norm": 1.0692249536514282,
"learning_rate": 0.00014093528888744212,
"loss": 0.3869,
"step": 3050
},
{
"epoch": 0.859836757669575,
"grad_norm": 0.9611905217170715,
"learning_rate": 0.00014071105974756382,
"loss": 0.4429,
"step": 3055
},
{
"epoch": 0.861244019138756,
"grad_norm": 1.419103741645813,
"learning_rate": 0.00014048658498365946,
"loss": 0.3828,
"step": 3060
},
{
"epoch": 0.8626512806079369,
"grad_norm": 0.70958012342453,
"learning_rate": 0.00014026186595006356,
"loss": 0.4098,
"step": 3065
},
{
"epoch": 0.864058542077118,
"grad_norm": 0.7273248434066772,
"learning_rate": 0.0001400369040025845,
"loss": 0.3795,
"step": 3070
},
{
"epoch": 0.8654658035462989,
"grad_norm": 1.2816479206085205,
"learning_rate": 0.00013981170049849614,
"loss": 0.3648,
"step": 3075
},
{
"epoch": 0.8668730650154799,
"grad_norm": 1.0046167373657227,
"learning_rate": 0.00013958625679652982,
"loss": 0.3949,
"step": 3080
},
{
"epoch": 0.8682803264846608,
"grad_norm": 0.45679983496665955,
"learning_rate": 0.000139360574256866,
"loss": 0.3828,
"step": 3085
},
{
"epoch": 0.8696875879538418,
"grad_norm": 0.7042393684387207,
"learning_rate": 0.00013913465424112627,
"loss": 0.3163,
"step": 3090
},
{
"epoch": 0.8710948494230228,
"grad_norm": 0.7769744992256165,
"learning_rate": 0.00013890849811236478,
"loss": 0.275,
"step": 3095
},
{
"epoch": 0.8725021108922038,
"grad_norm": 0.5500330328941345,
"learning_rate": 0.0001386821072350604,
"loss": 0.36,
"step": 3100
},
{
"epoch": 0.8739093723613848,
"grad_norm": 1.508569359779358,
"learning_rate": 0.00013845548297510834,
"loss": 0.3744,
"step": 3105
},
{
"epoch": 0.8753166338305657,
"grad_norm": 1.6323150396347046,
"learning_rate": 0.0001382286266998117,
"loss": 0.5385,
"step": 3110
},
{
"epoch": 0.8767238952997467,
"grad_norm": 1.0691790580749512,
"learning_rate": 0.00013800153977787364,
"loss": 0.4918,
"step": 3115
},
{
"epoch": 0.8781311567689276,
"grad_norm": 0.8545736074447632,
"learning_rate": 0.0001377742235793887,
"loss": 0.327,
"step": 3120
},
{
"epoch": 0.8795384182381086,
"grad_norm": 1.2977032661437988,
"learning_rate": 0.00013754667947583486,
"loss": 0.3627,
"step": 3125
},
{
"epoch": 0.8809456797072897,
"grad_norm": 0.8414074778556824,
"learning_rate": 0.00013731890884006507,
"loss": 0.4126,
"step": 3130
},
{
"epoch": 0.8823529411764706,
"grad_norm": 1.2440998554229736,
"learning_rate": 0.00013709091304629903,
"loss": 0.5402,
"step": 3135
},
{
"epoch": 0.8837602026456516,
"grad_norm": 1.1474038362503052,
"learning_rate": 0.00013686269347011487,
"loss": 0.4402,
"step": 3140
},
{
"epoch": 0.8851674641148325,
"grad_norm": 1.9769107103347778,
"learning_rate": 0.00013663425148844097,
"loss": 0.5528,
"step": 3145
},
{
"epoch": 0.8865747255840135,
"grad_norm": 1.071049451828003,
"learning_rate": 0.00013640558847954746,
"loss": 0.3496,
"step": 3150
},
{
"epoch": 0.8879819870531945,
"grad_norm": 1.002313494682312,
"learning_rate": 0.00013617670582303804,
"loss": 0.4351,
"step": 3155
},
{
"epoch": 0.8893892485223754,
"grad_norm": 0.8908954858779907,
"learning_rate": 0.00013594760489984167,
"loss": 0.3371,
"step": 3160
},
{
"epoch": 0.8907965099915565,
"grad_norm": 0.9060853123664856,
"learning_rate": 0.00013571828709220413,
"loss": 0.2489,
"step": 3165
},
{
"epoch": 0.8922037714607374,
"grad_norm": 0.7479000687599182,
"learning_rate": 0.00013548875378367972,
"loss": 0.2874,
"step": 3170
},
{
"epoch": 0.8936110329299184,
"grad_norm": 0.9289246201515198,
"learning_rate": 0.00013525900635912299,
"loss": 0.466,
"step": 3175
},
{
"epoch": 0.8950182943990993,
"grad_norm": 1.428377628326416,
"learning_rate": 0.0001350290462046803,
"loss": 0.5203,
"step": 3180
},
{
"epoch": 0.8964255558682803,
"grad_norm": 0.7524283528327942,
"learning_rate": 0.00013479887470778149,
"loss": 0.365,
"step": 3185
},
{
"epoch": 0.8978328173374613,
"grad_norm": 1.021815299987793,
"learning_rate": 0.0001345684932571315,
"loss": 0.5084,
"step": 3190
},
{
"epoch": 0.8992400788066423,
"grad_norm": 0.7522305846214294,
"learning_rate": 0.00013433790324270199,
"loss": 0.2659,
"step": 3195
},
{
"epoch": 0.9006473402758233,
"grad_norm": 1.3865163326263428,
"learning_rate": 0.00013410710605572294,
"loss": 0.2533,
"step": 3200
},
{
"epoch": 0.9020546017450042,
"grad_norm": 1.8485382795333862,
"learning_rate": 0.00013387610308867437,
"loss": 0.3675,
"step": 3205
},
{
"epoch": 0.9034618632141852,
"grad_norm": 1.203482985496521,
"learning_rate": 0.0001336448957352777,
"loss": 0.3284,
"step": 3210
},
{
"epoch": 0.9048691246833662,
"grad_norm": 0.9714936017990112,
"learning_rate": 0.00013341348539048752,
"loss": 0.2657,
"step": 3215
},
{
"epoch": 0.9062763861525471,
"grad_norm": 1.062326192855835,
"learning_rate": 0.00013318187345048328,
"loss": 0.3837,
"step": 3220
},
{
"epoch": 0.9076836476217282,
"grad_norm": 1.3822613954544067,
"learning_rate": 0.00013295006131266055,
"loss": 0.3584,
"step": 3225
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.2804548740386963,
"learning_rate": 0.0001327180503756228,
"loss": 0.4558,
"step": 3230
},
{
"epoch": 0.9104981705600901,
"grad_norm": 0.6253718137741089,
"learning_rate": 0.00013248584203917298,
"loss": 0.2871,
"step": 3235
},
{
"epoch": 0.911905432029271,
"grad_norm": 0.8237050175666809,
"learning_rate": 0.00013225343770430502,
"loss": 0.4014,
"step": 3240
},
{
"epoch": 0.913312693498452,
"grad_norm": 0.9199953675270081,
"learning_rate": 0.00013202083877319538,
"loss": 0.597,
"step": 3245
},
{
"epoch": 0.914719954967633,
"grad_norm": 1.0530214309692383,
"learning_rate": 0.00013178804664919444,
"loss": 0.5745,
"step": 3250
},
{
"epoch": 0.9161272164368139,
"grad_norm": 1.0369855165481567,
"learning_rate": 0.00013155506273681837,
"loss": 0.2493,
"step": 3255
},
{
"epoch": 0.917534477905995,
"grad_norm": 0.37017834186553955,
"learning_rate": 0.00013132188844174042,
"loss": 0.5125,
"step": 3260
},
{
"epoch": 0.9189417393751759,
"grad_norm": 0.5272582769393921,
"learning_rate": 0.0001310885251707824,
"loss": 0.2099,
"step": 3265
},
{
"epoch": 0.9203490008443569,
"grad_norm": 1.3228068351745605,
"learning_rate": 0.00013085497433190635,
"loss": 0.3625,
"step": 3270
},
{
"epoch": 0.9217562623135379,
"grad_norm": 1.2980788946151733,
"learning_rate": 0.000130621237334206,
"loss": 0.3258,
"step": 3275
},
{
"epoch": 0.9231635237827188,
"grad_norm": 0.7955147624015808,
"learning_rate": 0.00013038731558789816,
"loss": 0.331,
"step": 3280
},
{
"epoch": 0.9245707852518998,
"grad_norm": 0.33198082447052,
"learning_rate": 0.00013015321050431435,
"loss": 0.2828,
"step": 3285
},
{
"epoch": 0.9259780467210807,
"grad_norm": 1.193824052810669,
"learning_rate": 0.0001299189234958922,
"loss": 0.5299,
"step": 3290
},
{
"epoch": 0.9273853081902618,
"grad_norm": 0.6841180324554443,
"learning_rate": 0.00012968445597616695,
"loss": 0.2236,
"step": 3295
},
{
"epoch": 0.9287925696594427,
"grad_norm": 1.009793758392334,
"learning_rate": 0.00012944980935976295,
"loss": 0.4583,
"step": 3300
},
{
"epoch": 0.9301998311286237,
"grad_norm": 1.1918591260910034,
"learning_rate": 0.00012921498506238512,
"loss": 0.4523,
"step": 3305
},
{
"epoch": 0.9316070925978047,
"grad_norm": 0.7123336791992188,
"learning_rate": 0.00012897998450081037,
"loss": 0.3185,
"step": 3310
},
{
"epoch": 0.9330143540669856,
"grad_norm": 0.6820237040519714,
"learning_rate": 0.00012874480909287904,
"loss": 0.4963,
"step": 3315
},
{
"epoch": 0.9344216155361666,
"grad_norm": 0.6030889749526978,
"learning_rate": 0.00012850946025748643,
"loss": 0.3238,
"step": 3320
},
{
"epoch": 0.9358288770053476,
"grad_norm": 0.3159545958042145,
"learning_rate": 0.00012827393941457416,
"loss": 0.1804,
"step": 3325
},
{
"epoch": 0.9372361384745286,
"grad_norm": 0.500643789768219,
"learning_rate": 0.00012803824798512166,
"loss": 0.4421,
"step": 3330
},
{
"epoch": 0.9386433999437095,
"grad_norm": 1.0271189212799072,
"learning_rate": 0.00012780238739113755,
"loss": 0.4825,
"step": 3335
},
{
"epoch": 0.9400506614128905,
"grad_norm": 1.3835067749023438,
"learning_rate": 0.000127566359055651,
"loss": 0.5109,
"step": 3340
},
{
"epoch": 0.9414579228820715,
"grad_norm": 0.6945546269416809,
"learning_rate": 0.00012733016440270344,
"loss": 0.3438,
"step": 3345
},
{
"epoch": 0.9428651843512524,
"grad_norm": 0.5347813367843628,
"learning_rate": 0.0001270938048573395,
"loss": 0.2245,
"step": 3350
},
{
"epoch": 0.9442724458204335,
"grad_norm": 0.5110495090484619,
"learning_rate": 0.00012685728184559878,
"loss": 0.3236,
"step": 3355
},
{
"epoch": 0.9456797072896144,
"grad_norm": 1.1028776168823242,
"learning_rate": 0.00012662059679450715,
"loss": 0.3656,
"step": 3360
},
{
"epoch": 0.9470869687587954,
"grad_norm": 1.0305935144424438,
"learning_rate": 0.0001263837511320681,
"loss": 0.2271,
"step": 3365
},
{
"epoch": 0.9484942302279764,
"grad_norm": 1.1044567823410034,
"learning_rate": 0.0001261467462872541,
"loss": 0.3901,
"step": 3370
},
{
"epoch": 0.9499014916971573,
"grad_norm": 1.0489617586135864,
"learning_rate": 0.00012590958368999817,
"loss": 0.3906,
"step": 3375
},
{
"epoch": 0.9513087531663383,
"grad_norm": 0.9781221747398376,
"learning_rate": 0.0001256722647711849,
"loss": 0.3616,
"step": 3380
},
{
"epoch": 0.9527160146355192,
"grad_norm": 1.1387841701507568,
"learning_rate": 0.0001254347909626421,
"loss": 0.2382,
"step": 3385
},
{
"epoch": 0.9541232761047003,
"grad_norm": 1.3473316431045532,
"learning_rate": 0.00012519716369713214,
"loss": 0.446,
"step": 3390
},
{
"epoch": 0.9555305375738812,
"grad_norm": 1.1464128494262695,
"learning_rate": 0.00012495938440834327,
"loss": 0.341,
"step": 3395
},
{
"epoch": 0.9569377990430622,
"grad_norm": 0.9990252256393433,
"learning_rate": 0.0001247214545308808,
"loss": 0.4666,
"step": 3400
},
{
"epoch": 0.9583450605122432,
"grad_norm": 1.9256302118301392,
"learning_rate": 0.0001244833755002587,
"loss": 0.4555,
"step": 3405
},
{
"epoch": 0.9597523219814241,
"grad_norm": 0.8169670104980469,
"learning_rate": 0.00012424514875289088,
"loss": 0.6558,
"step": 3410
},
{
"epoch": 0.9611595834506051,
"grad_norm": 1.60161554813385,
"learning_rate": 0.0001240067757260824,
"loss": 0.4544,
"step": 3415
},
{
"epoch": 0.9625668449197861,
"grad_norm": 0.7437291741371155,
"learning_rate": 0.0001237682578580208,
"loss": 0.3022,
"step": 3420
},
{
"epoch": 0.9639741063889671,
"grad_norm": 0.9030975699424744,
"learning_rate": 0.00012352959658776767,
"loss": 0.4267,
"step": 3425
},
{
"epoch": 0.9653813678581481,
"grad_norm": 1.0298916101455688,
"learning_rate": 0.00012329079335524973,
"loss": 0.5084,
"step": 3430
},
{
"epoch": 0.966788629327329,
"grad_norm": 1.4346392154693604,
"learning_rate": 0.0001230518496012502,
"loss": 0.5032,
"step": 3435
},
{
"epoch": 0.96819589079651,
"grad_norm": 1.988788366317749,
"learning_rate": 0.00012281276676739996,
"loss": 0.5206,
"step": 3440
},
{
"epoch": 0.9696031522656909,
"grad_norm": 0.627189040184021,
"learning_rate": 0.00012257354629616933,
"loss": 0.3927,
"step": 3445
},
{
"epoch": 0.971010413734872,
"grad_norm": 1.1982104778289795,
"learning_rate": 0.0001223341896308588,
"loss": 0.4134,
"step": 3450
},
{
"epoch": 0.9724176752040529,
"grad_norm": 1.1405185461044312,
"learning_rate": 0.00012209469821559062,
"loss": 0.314,
"step": 3455
},
{
"epoch": 0.9738249366732339,
"grad_norm": 1.0637789964675903,
"learning_rate": 0.00012185507349530006,
"loss": 0.4855,
"step": 3460
},
{
"epoch": 0.9752321981424149,
"grad_norm": 1.1884607076644897,
"learning_rate": 0.00012161531691572665,
"loss": 0.4043,
"step": 3465
},
{
"epoch": 0.9766394596115958,
"grad_norm": 0.7082695960998535,
"learning_rate": 0.00012137542992340552,
"loss": 0.3864,
"step": 3470
},
{
"epoch": 0.9780467210807768,
"grad_norm": 1.400940179824829,
"learning_rate": 0.0001211354139656585,
"loss": 0.3179,
"step": 3475
},
{
"epoch": 0.9794539825499577,
"grad_norm": 1.0918678045272827,
"learning_rate": 0.00012089527049058566,
"loss": 0.3724,
"step": 3480
},
{
"epoch": 0.9808612440191388,
"grad_norm": 0.8317002654075623,
"learning_rate": 0.00012065500094705635,
"loss": 0.4669,
"step": 3485
},
{
"epoch": 0.9822685054883197,
"grad_norm": 2.4732000827789307,
"learning_rate": 0.00012041460678470057,
"loss": 0.536,
"step": 3490
},
{
"epoch": 0.9836757669575007,
"grad_norm": 0.4239155650138855,
"learning_rate": 0.00012017408945390009,
"loss": 0.4178,
"step": 3495
},
{
"epoch": 0.9850830284266817,
"grad_norm": 1.0096583366394043,
"learning_rate": 0.00011993345040577995,
"loss": 0.5533,
"step": 3500
},
{
"epoch": 0.9864902898958626,
"grad_norm": 1.6637718677520752,
"learning_rate": 0.00011969269109219945,
"loss": 0.1999,
"step": 3505
},
{
"epoch": 0.9878975513650436,
"grad_norm": 1.4339228868484497,
"learning_rate": 0.0001194518129657435,
"loss": 0.2913,
"step": 3510
},
{
"epoch": 0.9893048128342246,
"grad_norm": 0.9473050236701965,
"learning_rate": 0.00011921081747971392,
"loss": 0.4202,
"step": 3515
},
{
"epoch": 0.9907120743034056,
"grad_norm": 1.5468287467956543,
"learning_rate": 0.00011896970608812053,
"loss": 0.2755,
"step": 3520
},
{
"epoch": 0.9921193357725866,
"grad_norm": 1.0197608470916748,
"learning_rate": 0.00011872848024567245,
"loss": 0.399,
"step": 3525
},
{
"epoch": 0.9935265972417675,
"grad_norm": 1.9030907154083252,
"learning_rate": 0.00011848714140776936,
"loss": 0.3538,
"step": 3530
},
{
"epoch": 0.9949338587109485,
"grad_norm": 1.1370608806610107,
"learning_rate": 0.00011824569103049264,
"loss": 0.6243,
"step": 3535
},
{
"epoch": 0.9963411201801294,
"grad_norm": 0.7336493134498596,
"learning_rate": 0.0001180041305705967,
"loss": 0.287,
"step": 3540
},
{
"epoch": 0.9977483816493105,
"grad_norm": 0.8091352581977844,
"learning_rate": 0.0001177624614855,
"loss": 0.4314,
"step": 3545
},
{
"epoch": 0.9991556431184914,
"grad_norm": 0.8396396636962891,
"learning_rate": 0.0001175206852332765,
"loss": 0.243,
"step": 3550
},
{
"epoch": 1.0005629045876725,
"grad_norm": 0.4893011152744293,
"learning_rate": 0.00011727880327264667,
"loss": 0.4008,
"step": 3555
},
{
"epoch": 1.0019701660568534,
"grad_norm": 0.5934264659881592,
"learning_rate": 0.00011703681706296871,
"loss": 0.197,
"step": 3560
},
{
"epoch": 1.0033774275260343,
"grad_norm": 0.9697572588920593,
"learning_rate": 0.00011679472806422991,
"loss": 0.2565,
"step": 3565
},
{
"epoch": 1.0047846889952152,
"grad_norm": 0.6383791565895081,
"learning_rate": 0.00011655253773703763,
"loss": 0.1732,
"step": 3570
},
{
"epoch": 1.0061919504643964,
"grad_norm": 2.7294044494628906,
"learning_rate": 0.00011631024754261057,
"loss": 0.344,
"step": 3575
},
{
"epoch": 1.0075992119335773,
"grad_norm": 0.7987744212150574,
"learning_rate": 0.00011606785894277002,
"loss": 0.2462,
"step": 3580
},
{
"epoch": 1.0090064734027582,
"grad_norm": 1.0963287353515625,
"learning_rate": 0.00011582537339993102,
"loss": 0.2017,
"step": 3585
},
{
"epoch": 1.0104137348719393,
"grad_norm": 0.2937074303627014,
"learning_rate": 0.00011558279237709337,
"loss": 0.2587,
"step": 3590
},
{
"epoch": 1.0118209963411202,
"grad_norm": 1.1680563688278198,
"learning_rate": 0.00011534011733783303,
"loss": 0.3315,
"step": 3595
},
{
"epoch": 1.0132282578103011,
"grad_norm": 0.8227936029434204,
"learning_rate": 0.00011509734974629316,
"loss": 0.1936,
"step": 3600
},
{
"epoch": 1.014635519279482,
"grad_norm": 1.266236424446106,
"learning_rate": 0.0001148544910671754,
"loss": 0.283,
"step": 3605
},
{
"epoch": 1.0160427807486632,
"grad_norm": 0.4134606122970581,
"learning_rate": 0.0001146115427657308,
"loss": 0.1711,
"step": 3610
},
{
"epoch": 1.017450042217844,
"grad_norm": 0.5949440598487854,
"learning_rate": 0.00011436850630775127,
"loss": 0.2659,
"step": 3615
},
{
"epoch": 1.018857303687025,
"grad_norm": 1.2255134582519531,
"learning_rate": 0.00011412538315956051,
"loss": 0.331,
"step": 3620
},
{
"epoch": 1.0202645651562061,
"grad_norm": 0.7793748378753662,
"learning_rate": 0.00011388217478800536,
"loss": 0.3107,
"step": 3625
},
{
"epoch": 1.021671826625387,
"grad_norm": 1.5764113664627075,
"learning_rate": 0.00011363888266044668,
"loss": 0.2801,
"step": 3630
},
{
"epoch": 1.023079088094568,
"grad_norm": 0.7818349599838257,
"learning_rate": 0.0001133955082447508,
"loss": 0.4592,
"step": 3635
},
{
"epoch": 1.0244863495637488,
"grad_norm": 0.8325141072273254,
"learning_rate": 0.00011315205300928047,
"loss": 0.2221,
"step": 3640
},
{
"epoch": 1.02589361103293,
"grad_norm": 0.8759342432022095,
"learning_rate": 0.0001129085184228861,
"loss": 0.2282,
"step": 3645
},
{
"epoch": 1.0273008725021109,
"grad_norm": 0.8269652724266052,
"learning_rate": 0.00011266490595489672,
"loss": 0.288,
"step": 3650
},
{
"epoch": 1.0287081339712918,
"grad_norm": 0.9182637929916382,
"learning_rate": 0.0001124212170751114,
"loss": 0.2124,
"step": 3655
},
{
"epoch": 1.030115395440473,
"grad_norm": 0.7247250080108643,
"learning_rate": 0.00011217745325379017,
"loss": 0.2818,
"step": 3660
},
{
"epoch": 1.0315226569096538,
"grad_norm": 1.1736894845962524,
"learning_rate": 0.00011193361596164517,
"loss": 0.2349,
"step": 3665
},
{
"epoch": 1.0329299183788347,
"grad_norm": 0.3809513747692108,
"learning_rate": 0.00011168970666983184,
"loss": 0.158,
"step": 3670
},
{
"epoch": 1.0343371798480159,
"grad_norm": 1.4163240194320679,
"learning_rate": 0.0001114457268499401,
"loss": 0.3035,
"step": 3675
},
{
"epoch": 1.0357444413171968,
"grad_norm": 1.8142826557159424,
"learning_rate": 0.00011120167797398527,
"loss": 0.3572,
"step": 3680
},
{
"epoch": 1.0371517027863777,
"grad_norm": 0.9238508343696594,
"learning_rate": 0.00011095756151439934,
"loss": 0.2104,
"step": 3685
},
{
"epoch": 1.0385589642555586,
"grad_norm": 1.3922544717788696,
"learning_rate": 0.0001107133789440221,
"loss": 0.3846,
"step": 3690
},
{
"epoch": 1.0399662257247397,
"grad_norm": 0.5761235952377319,
"learning_rate": 0.00011046913173609217,
"loss": 0.1728,
"step": 3695
},
{
"epoch": 1.0413734871939206,
"grad_norm": 1.3399313688278198,
"learning_rate": 0.0001102248213642382,
"loss": 0.2158,
"step": 3700
},
{
"epoch": 1.0427807486631016,
"grad_norm": 0.5189816355705261,
"learning_rate": 0.00010998044930246985,
"loss": 0.2724,
"step": 3705
},
{
"epoch": 1.0441880101322827,
"grad_norm": 1.0454604625701904,
"learning_rate": 0.00010973601702516903,
"loss": 0.3016,
"step": 3710
},
{
"epoch": 1.0455952716014636,
"grad_norm": 0.9476893544197083,
"learning_rate": 0.00010949152600708096,
"loss": 0.161,
"step": 3715
},
{
"epoch": 1.0470025330706445,
"grad_norm": 1.1760029792785645,
"learning_rate": 0.00010924697772330525,
"loss": 0.3402,
"step": 3720
},
{
"epoch": 1.0484097945398254,
"grad_norm": 0.7986089587211609,
"learning_rate": 0.000109002373649287,
"loss": 0.3381,
"step": 3725
},
{
"epoch": 1.0498170560090065,
"grad_norm": 0.46115541458129883,
"learning_rate": 0.00010875771526080791,
"loss": 0.2121,
"step": 3730
},
{
"epoch": 1.0512243174781875,
"grad_norm": 0.8159217238426208,
"learning_rate": 0.00010851300403397741,
"loss": 0.1618,
"step": 3735
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.9532806277275085,
"learning_rate": 0.00010826824144522369,
"loss": 0.2001,
"step": 3740
},
{
"epoch": 1.0540388404165495,
"grad_norm": 0.987647294998169,
"learning_rate": 0.00010802342897128484,
"loss": 0.1255,
"step": 3745
},
{
"epoch": 1.0554461018857304,
"grad_norm": 0.5456539988517761,
"learning_rate": 0.00010777856808919993,
"loss": 0.1738,
"step": 3750
},
{
"epoch": 1.0568533633549113,
"grad_norm": 1.2354178428649902,
"learning_rate": 0.00010753366027630005,
"loss": 0.1968,
"step": 3755
},
{
"epoch": 1.0582606248240922,
"grad_norm": 1.5054504871368408,
"learning_rate": 0.00010728870701019952,
"loss": 0.3881,
"step": 3760
},
{
"epoch": 1.0596678862932734,
"grad_norm": 0.33300110697746277,
"learning_rate": 0.00010704370976878683,
"loss": 0.3455,
"step": 3765
},
{
"epoch": 1.0610751477624543,
"grad_norm": 0.28057172894477844,
"learning_rate": 0.00010679867003021582,
"loss": 0.3676,
"step": 3770
},
{
"epoch": 1.0624824092316352,
"grad_norm": 0.78326416015625,
"learning_rate": 0.0001065535892728967,
"loss": 0.2051,
"step": 3775
},
{
"epoch": 1.0638896707008163,
"grad_norm": 0.30371785163879395,
"learning_rate": 0.00010630846897548719,
"loss": 0.2172,
"step": 3780
},
{
"epoch": 1.0652969321699972,
"grad_norm": 0.951871931552887,
"learning_rate": 0.00010606331061688352,
"loss": 0.2731,
"step": 3785
},
{
"epoch": 1.0667041936391781,
"grad_norm": 0.9194802641868591,
"learning_rate": 0.00010581811567621165,
"loss": 0.437,
"step": 3790
},
{
"epoch": 1.068111455108359,
"grad_norm": 1.3185656070709229,
"learning_rate": 0.00010557288563281819,
"loss": 0.1762,
"step": 3795
},
{
"epoch": 1.0695187165775402,
"grad_norm": 0.6637858152389526,
"learning_rate": 0.00010532762196626151,
"loss": 0.3499,
"step": 3800
},
{
"epoch": 1.070925978046721,
"grad_norm": 0.5646357536315918,
"learning_rate": 0.00010508232615630291,
"loss": 0.1794,
"step": 3805
},
{
"epoch": 1.072333239515902,
"grad_norm": 0.7347474694252014,
"learning_rate": 0.00010483699968289754,
"loss": 0.2088,
"step": 3810
},
{
"epoch": 1.0737405009850831,
"grad_norm": 0.7603871822357178,
"learning_rate": 0.00010459164402618567,
"loss": 0.2723,
"step": 3815
},
{
"epoch": 1.075147762454264,
"grad_norm": 1.574090838432312,
"learning_rate": 0.0001043462606664835,
"loss": 0.3175,
"step": 3820
},
{
"epoch": 1.076555023923445,
"grad_norm": 1.8480275869369507,
"learning_rate": 0.00010410085108427448,
"loss": 0.3903,
"step": 3825
},
{
"epoch": 1.0779622853926258,
"grad_norm": 3.3462395668029785,
"learning_rate": 0.00010385541676020026,
"loss": 0.2867,
"step": 3830
},
{
"epoch": 1.079369546861807,
"grad_norm": 1.0282424688339233,
"learning_rate": 0.00010360995917505167,
"loss": 0.3542,
"step": 3835
},
{
"epoch": 1.0807768083309879,
"grad_norm": 1.081586241722107,
"learning_rate": 0.00010336447980976,
"loss": 0.1933,
"step": 3840
},
{
"epoch": 1.0821840698001688,
"grad_norm": 0.7061908841133118,
"learning_rate": 0.00010311898014538788,
"loss": 0.3673,
"step": 3845
},
{
"epoch": 1.08359133126935,
"grad_norm": 1.0589807033538818,
"learning_rate": 0.00010287346166312048,
"loss": 0.2017,
"step": 3850
},
{
"epoch": 1.0849985927385308,
"grad_norm": 0.7850357890129089,
"learning_rate": 0.0001026279258442564,
"loss": 0.3781,
"step": 3855
},
{
"epoch": 1.0864058542077117,
"grad_norm": 0.8800612688064575,
"learning_rate": 0.00010238237417019889,
"loss": 0.2454,
"step": 3860
},
{
"epoch": 1.0878131156768927,
"grad_norm": 0.8004993796348572,
"learning_rate": 0.00010213680812244693,
"loss": 0.3253,
"step": 3865
},
{
"epoch": 1.0892203771460738,
"grad_norm": 1.0395301580429077,
"learning_rate": 0.00010189122918258611,
"loss": 0.3023,
"step": 3870
},
{
"epoch": 1.0906276386152547,
"grad_norm": 0.7087461352348328,
"learning_rate": 0.00010164563883227982,
"loss": 0.258,
"step": 3875
},
{
"epoch": 1.0920349000844356,
"grad_norm": 1.0742789506912231,
"learning_rate": 0.00010140003855326034,
"loss": 0.1768,
"step": 3880
},
{
"epoch": 1.0934421615536167,
"grad_norm": 1.7721843719482422,
"learning_rate": 0.00010115442982731988,
"loss": 0.2673,
"step": 3885
},
{
"epoch": 1.0948494230227976,
"grad_norm": 0.5749943256378174,
"learning_rate": 0.00010090881413630154,
"loss": 0.2943,
"step": 3890
},
{
"epoch": 1.0962566844919786,
"grad_norm": 1.210871696472168,
"learning_rate": 0.00010066319296209043,
"loss": 0.2569,
"step": 3895
},
{
"epoch": 1.0976639459611597,
"grad_norm": 0.7546014189720154,
"learning_rate": 0.00010041756778660483,
"loss": 0.1277,
"step": 3900
},
{
"epoch": 1.0990712074303406,
"grad_norm": 0.45546409487724304,
"learning_rate": 0.0001001719400917871,
"loss": 0.2447,
"step": 3905
},
{
"epoch": 1.1004784688995215,
"grad_norm": 0.9810652136802673,
"learning_rate": 9.992631135959484e-05,
"loss": 0.1891,
"step": 3910
},
{
"epoch": 1.1018857303687024,
"grad_norm": 0.26853448152542114,
"learning_rate": 9.96806830719918e-05,
"loss": 0.2793,
"step": 3915
},
{
"epoch": 1.1032929918378835,
"grad_norm": 0.815556526184082,
"learning_rate": 9.943505671093923e-05,
"loss": 0.1589,
"step": 3920
},
{
"epoch": 1.1047002533070645,
"grad_norm": 1.1649208068847656,
"learning_rate": 9.918943375838658e-05,
"loss": 0.1692,
"step": 3925
},
{
"epoch": 1.1061075147762454,
"grad_norm": 1.3160449266433716,
"learning_rate": 9.894381569626286e-05,
"loss": 0.1748,
"step": 3930
},
{
"epoch": 1.1075147762454265,
"grad_norm": 0.7906925082206726,
"learning_rate": 9.869820400646752e-05,
"loss": 0.2706,
"step": 3935
},
{
"epoch": 1.1089220377146074,
"grad_norm": 1.7690831422805786,
"learning_rate": 9.845260017086152e-05,
"loss": 0.4101,
"step": 3940
},
{
"epoch": 1.1103292991837883,
"grad_norm": 0.7361578941345215,
"learning_rate": 9.820700567125855e-05,
"loss": 0.2352,
"step": 3945
},
{
"epoch": 1.1117365606529692,
"grad_norm": 0.7984316945075989,
"learning_rate": 9.79614219894159e-05,
"loss": 0.2466,
"step": 3950
},
{
"epoch": 1.1131438221221504,
"grad_norm": 1.6478660106658936,
"learning_rate": 9.771585060702551e-05,
"loss": 0.2434,
"step": 3955
},
{
"epoch": 1.1145510835913313,
"grad_norm": 0.8288646936416626,
"learning_rate": 9.747029300570528e-05,
"loss": 0.1954,
"step": 3960
},
{
"epoch": 1.1159583450605122,
"grad_norm": 1.0649809837341309,
"learning_rate": 9.722475066698992e-05,
"loss": 0.1995,
"step": 3965
},
{
"epoch": 1.1173656065296933,
"grad_norm": 1.0399101972579956,
"learning_rate": 9.697922507232194e-05,
"loss": 0.2972,
"step": 3970
},
{
"epoch": 1.1187728679988742,
"grad_norm": 0.9969576001167297,
"learning_rate": 9.673371770304291e-05,
"loss": 0.2133,
"step": 3975
},
{
"epoch": 1.1201801294680551,
"grad_norm": 0.7914555072784424,
"learning_rate": 9.648823004038452e-05,
"loss": 0.2006,
"step": 3980
},
{
"epoch": 1.1215873909372363,
"grad_norm": 0.8462080359458923,
"learning_rate": 9.62427635654594e-05,
"loss": 0.1759,
"step": 3985
},
{
"epoch": 1.1229946524064172,
"grad_norm": 1.5257298946380615,
"learning_rate": 9.599731975925248e-05,
"loss": 0.2961,
"step": 3990
},
{
"epoch": 1.124401913875598,
"grad_norm": 0.918910562992096,
"learning_rate": 9.575190010261179e-05,
"loss": 0.2468,
"step": 3995
},
{
"epoch": 1.125809175344779,
"grad_norm": 0.9318897128105164,
"learning_rate": 9.550650607623982e-05,
"loss": 0.2609,
"step": 4000
},
{
"epoch": 1.12721643681396,
"grad_norm": 0.49596425890922546,
"learning_rate": 9.526113916068431e-05,
"loss": 0.2369,
"step": 4005
},
{
"epoch": 1.128623698283141,
"grad_norm": 0.6530629396438599,
"learning_rate": 9.501580083632946e-05,
"loss": 0.1354,
"step": 4010
},
{
"epoch": 1.130030959752322,
"grad_norm": 0.39932572841644287,
"learning_rate": 9.477049258338694e-05,
"loss": 0.2277,
"step": 4015
},
{
"epoch": 1.131438221221503,
"grad_norm": 0.8406773805618286,
"learning_rate": 9.452521588188711e-05,
"loss": 0.1472,
"step": 4020
},
{
"epoch": 1.132845482690684,
"grad_norm": 0.7629873752593994,
"learning_rate": 9.427997221166978e-05,
"loss": 0.2421,
"step": 4025
},
{
"epoch": 1.1342527441598649,
"grad_norm": 1.1697338819503784,
"learning_rate": 9.40347630523756e-05,
"loss": 0.2181,
"step": 4030
},
{
"epoch": 1.1356600056290458,
"grad_norm": 0.924167811870575,
"learning_rate": 9.378958988343702e-05,
"loss": 0.3934,
"step": 4035
},
{
"epoch": 1.137067267098227,
"grad_norm": 0.8078356385231018,
"learning_rate": 9.354445418406924e-05,
"loss": 0.1403,
"step": 4040
},
{
"epoch": 1.1384745285674078,
"grad_norm": 0.520318329334259,
"learning_rate": 9.329935743326144e-05,
"loss": 0.2916,
"step": 4045
},
{
"epoch": 1.1398817900365887,
"grad_norm": 0.45882686972618103,
"learning_rate": 9.305430110976793e-05,
"loss": 0.1297,
"step": 4050
},
{
"epoch": 1.1412890515057699,
"grad_norm": 0.5139206051826477,
"learning_rate": 9.280928669209887e-05,
"loss": 0.2342,
"step": 4055
},
{
"epoch": 1.1426963129749508,
"grad_norm": 0.9370526671409607,
"learning_rate": 9.256431565851181e-05,
"loss": 0.1581,
"step": 4060
},
{
"epoch": 1.1441035744441317,
"grad_norm": 1.525415301322937,
"learning_rate": 9.23193894870024e-05,
"loss": 0.255,
"step": 4065
},
{
"epoch": 1.1455108359133126,
"grad_norm": 1.745328426361084,
"learning_rate": 9.207450965529571e-05,
"loss": 0.1585,
"step": 4070
},
{
"epoch": 1.1469180973824937,
"grad_norm": 0.5603808760643005,
"learning_rate": 9.18296776408372e-05,
"loss": 0.2085,
"step": 4075
},
{
"epoch": 1.1483253588516746,
"grad_norm": 0.24650625884532928,
"learning_rate": 9.158489492078381e-05,
"loss": 0.2441,
"step": 4080
},
{
"epoch": 1.1497326203208555,
"grad_norm": 1.2769076824188232,
"learning_rate": 9.134016297199506e-05,
"loss": 0.1923,
"step": 4085
},
{
"epoch": 1.1511398817900367,
"grad_norm": 0.6759532690048218,
"learning_rate": 9.109548327102424e-05,
"loss": 0.1818,
"step": 4090
},
{
"epoch": 1.1525471432592176,
"grad_norm": 1.7534480094909668,
"learning_rate": 9.085085729410928e-05,
"loss": 0.2677,
"step": 4095
},
{
"epoch": 1.1539544047283985,
"grad_norm": 1.578730583190918,
"learning_rate": 9.060628651716409e-05,
"loss": 0.3868,
"step": 4100
},
{
"epoch": 1.1553616661975794,
"grad_norm": 1.5693743228912354,
"learning_rate": 9.036177241576949e-05,
"loss": 0.4238,
"step": 4105
},
{
"epoch": 1.1567689276667605,
"grad_norm": 0.7190649509429932,
"learning_rate": 9.011731646516429e-05,
"loss": 0.2943,
"step": 4110
},
{
"epoch": 1.1581761891359414,
"grad_norm": 1.3021358251571655,
"learning_rate": 8.987292014023658e-05,
"loss": 0.282,
"step": 4115
},
{
"epoch": 1.1595834506051224,
"grad_norm": 0.7299554944038391,
"learning_rate": 8.962858491551467e-05,
"loss": 0.2086,
"step": 4120
},
{
"epoch": 1.1609907120743035,
"grad_norm": 0.8138667345046997,
"learning_rate": 8.938431226515813e-05,
"loss": 0.3847,
"step": 4125
},
{
"epoch": 1.1623979735434844,
"grad_norm": 1.6948626041412354,
"learning_rate": 8.914010366294917e-05,
"loss": 0.2519,
"step": 4130
},
{
"epoch": 1.1638052350126653,
"grad_norm": 0.4518921971321106,
"learning_rate": 8.889596058228339e-05,
"loss": 0.1481,
"step": 4135
},
{
"epoch": 1.1652124964818462,
"grad_norm": 0.9538673162460327,
"learning_rate": 8.865188449616124e-05,
"loss": 0.2342,
"step": 4140
},
{
"epoch": 1.1666197579510273,
"grad_norm": 1.5478556156158447,
"learning_rate": 8.84078768771789e-05,
"loss": 0.2741,
"step": 4145
},
{
"epoch": 1.1680270194202083,
"grad_norm": 0.8891351222991943,
"learning_rate": 8.816393919751937e-05,
"loss": 0.2279,
"step": 4150
},
{
"epoch": 1.1694342808893892,
"grad_norm": 1.0661555528640747,
"learning_rate": 8.792007292894387e-05,
"loss": 0.2588,
"step": 4155
},
{
"epoch": 1.1708415423585703,
"grad_norm": 1.0529447793960571,
"learning_rate": 8.767627954278267e-05,
"loss": 0.3593,
"step": 4160
},
{
"epoch": 1.1722488038277512,
"grad_norm": 1.0678569078445435,
"learning_rate": 8.743256050992623e-05,
"loss": 0.1596,
"step": 4165
},
{
"epoch": 1.1736560652969321,
"grad_norm": 0.7005488276481628,
"learning_rate": 8.71889173008166e-05,
"loss": 0.2517,
"step": 4170
},
{
"epoch": 1.175063326766113,
"grad_norm": 0.4683868885040283,
"learning_rate": 8.69453513854382e-05,
"loss": 0.1622,
"step": 4175
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.8689951300621033,
"learning_rate": 8.67018642333092e-05,
"loss": 0.1776,
"step": 4180
},
{
"epoch": 1.177877849704475,
"grad_norm": 0.7526000738143921,
"learning_rate": 8.645845731347248e-05,
"loss": 0.1588,
"step": 4185
},
{
"epoch": 1.179285111173656,
"grad_norm": 1.2025400400161743,
"learning_rate": 8.621513209448701e-05,
"loss": 0.197,
"step": 4190
},
{
"epoch": 1.180692372642837,
"grad_norm": 1.2456661462783813,
"learning_rate": 8.597189004441863e-05,
"loss": 0.2185,
"step": 4195
},
{
"epoch": 1.182099634112018,
"grad_norm": 0.26599639654159546,
"learning_rate": 8.572873263083152e-05,
"loss": 0.1736,
"step": 4200
},
{
"epoch": 1.183506895581199,
"grad_norm": 0.6946321725845337,
"learning_rate": 8.548566132077916e-05,
"loss": 0.2439,
"step": 4205
},
{
"epoch": 1.18491415705038,
"grad_norm": 0.8973987102508545,
"learning_rate": 8.524267758079557e-05,
"loss": 0.2171,
"step": 4210
},
{
"epoch": 1.186321418519561,
"grad_norm": 0.653135359287262,
"learning_rate": 8.499978287688648e-05,
"loss": 0.1822,
"step": 4215
},
{
"epoch": 1.1877286799887419,
"grad_norm": 1.1294854879379272,
"learning_rate": 8.475697867452028e-05,
"loss": 0.3998,
"step": 4220
},
{
"epoch": 1.189135941457923,
"grad_norm": 0.7260348200798035,
"learning_rate": 8.451426643861946e-05,
"loss": 0.3177,
"step": 4225
},
{
"epoch": 1.190543202927104,
"grad_norm": 0.9421544075012207,
"learning_rate": 8.427164763355169e-05,
"loss": 0.3644,
"step": 4230
},
{
"epoch": 1.1919504643962848,
"grad_norm": 1.8454887866973877,
"learning_rate": 8.402912372312076e-05,
"loss": 0.2601,
"step": 4235
},
{
"epoch": 1.1933577258654657,
"grad_norm": 0.7556844353675842,
"learning_rate": 8.378669617055806e-05,
"loss": 0.1539,
"step": 4240
},
{
"epoch": 1.1947649873346469,
"grad_norm": 1.1138182878494263,
"learning_rate": 8.354436643851365e-05,
"loss": 0.2221,
"step": 4245
},
{
"epoch": 1.1961722488038278,
"grad_norm": 1.7039527893066406,
"learning_rate": 8.330213598904726e-05,
"loss": 0.3543,
"step": 4250
},
{
"epoch": 1.1975795102730087,
"grad_norm": 1.6566787958145142,
"learning_rate": 8.306000628361972e-05,
"loss": 0.1975,
"step": 4255
},
{
"epoch": 1.1989867717421898,
"grad_norm": 1.0765029191970825,
"learning_rate": 8.281797878308406e-05,
"loss": 0.1358,
"step": 4260
},
{
"epoch": 1.2003940332113707,
"grad_norm": 0.7748456001281738,
"learning_rate": 8.257605494767654e-05,
"loss": 0.1821,
"step": 4265
},
{
"epoch": 1.2018012946805516,
"grad_norm": 0.32174113392829895,
"learning_rate": 8.233423623700816e-05,
"loss": 0.1391,
"step": 4270
},
{
"epoch": 1.2032085561497325,
"grad_norm": 0.5359024405479431,
"learning_rate": 8.209252411005548e-05,
"loss": 0.1476,
"step": 4275
},
{
"epoch": 1.2046158176189137,
"grad_norm": 0.9815373420715332,
"learning_rate": 8.185092002515209e-05,
"loss": 0.3173,
"step": 4280
},
{
"epoch": 1.2060230790880946,
"grad_norm": 0.6186626553535461,
"learning_rate": 8.16094254399798e-05,
"loss": 0.3268,
"step": 4285
},
{
"epoch": 1.2074303405572755,
"grad_norm": 1.598221778869629,
"learning_rate": 8.136804181155961e-05,
"loss": 0.2788,
"step": 4290
},
{
"epoch": 1.2088376020264566,
"grad_norm": 0.409020334482193,
"learning_rate": 8.112677059624316e-05,
"loss": 0.2455,
"step": 4295
},
{
"epoch": 1.2102448634956375,
"grad_norm": 1.0623451471328735,
"learning_rate": 8.088561324970396e-05,
"loss": 0.2883,
"step": 4300
},
{
"epoch": 1.2116521249648184,
"grad_norm": 0.9107158780097961,
"learning_rate": 8.064457122692828e-05,
"loss": 0.191,
"step": 4305
},
{
"epoch": 1.2130593864339994,
"grad_norm": 1.021278738975525,
"learning_rate": 8.040364598220682e-05,
"loss": 0.2287,
"step": 4310
},
{
"epoch": 1.2144666479031805,
"grad_norm": 1.0348402261734009,
"learning_rate": 8.016283896912563e-05,
"loss": 0.1455,
"step": 4315
},
{
"epoch": 1.2158739093723614,
"grad_norm": 1.06684410572052,
"learning_rate": 7.992215164055737e-05,
"loss": 0.1786,
"step": 4320
},
{
"epoch": 1.2172811708415423,
"grad_norm": 0.45586028695106506,
"learning_rate": 7.968158544865272e-05,
"loss": 0.2625,
"step": 4325
},
{
"epoch": 1.2186884323107234,
"grad_norm": 1.0333331823349,
"learning_rate": 7.944114184483144e-05,
"loss": 0.1766,
"step": 4330
},
{
"epoch": 1.2200956937799043,
"grad_norm": 1.477582335472107,
"learning_rate": 7.920082227977361e-05,
"loss": 0.2547,
"step": 4335
},
{
"epoch": 1.2215029552490853,
"grad_norm": 0.732683539390564,
"learning_rate": 7.89606282034111e-05,
"loss": 0.1894,
"step": 4340
},
{
"epoch": 1.2229102167182662,
"grad_norm": 1.199336290359497,
"learning_rate": 7.872056106491846e-05,
"loss": 0.3359,
"step": 4345
},
{
"epoch": 1.2243174781874473,
"grad_norm": 2.6119384765625,
"learning_rate": 7.848062231270458e-05,
"loss": 0.3301,
"step": 4350
},
{
"epoch": 1.2257247396566282,
"grad_norm": 1.0260940790176392,
"learning_rate": 7.824081339440364e-05,
"loss": 0.1735,
"step": 4355
},
{
"epoch": 1.2271320011258091,
"grad_norm": 0.7368533611297607,
"learning_rate": 7.800113575686643e-05,
"loss": 0.1741,
"step": 4360
},
{
"epoch": 1.2285392625949902,
"grad_norm": 0.8837445378303528,
"learning_rate": 7.776159084615183e-05,
"loss": 0.2789,
"step": 4365
},
{
"epoch": 1.2299465240641712,
"grad_norm": 1.0234431028366089,
"learning_rate": 7.752218010751786e-05,
"loss": 0.1811,
"step": 4370
},
{
"epoch": 1.231353785533352,
"grad_norm": 1.1849218606948853,
"learning_rate": 7.728290498541297e-05,
"loss": 0.2951,
"step": 4375
},
{
"epoch": 1.232761047002533,
"grad_norm": 1.1420046091079712,
"learning_rate": 7.704376692346748e-05,
"loss": 0.2964,
"step": 4380
},
{
"epoch": 1.234168308471714,
"grad_norm": 0.44826436042785645,
"learning_rate": 7.680476736448477e-05,
"loss": 0.165,
"step": 4385
},
{
"epoch": 1.235575569940895,
"grad_norm": 0.6397153735160828,
"learning_rate": 7.656590775043249e-05,
"loss": 0.138,
"step": 4390
},
{
"epoch": 1.236982831410076,
"grad_norm": 1.1096476316452026,
"learning_rate": 7.632718952243404e-05,
"loss": 0.2673,
"step": 4395
},
{
"epoch": 1.238390092879257,
"grad_norm": 0.7769279479980469,
"learning_rate": 7.608861412075987e-05,
"loss": 0.1631,
"step": 4400
},
{
"epoch": 1.239797354348438,
"grad_norm": 0.8061667084693909,
"learning_rate": 7.585018298481849e-05,
"loss": 0.1851,
"step": 4405
},
{
"epoch": 1.2412046158176189,
"grad_norm": 1.618454098701477,
"learning_rate": 7.561189755314817e-05,
"loss": 0.2377,
"step": 4410
},
{
"epoch": 1.2426118772867998,
"grad_norm": 1.1752551794052124,
"learning_rate": 7.537375926340802e-05,
"loss": 0.1806,
"step": 4415
},
{
"epoch": 1.244019138755981,
"grad_norm": 0.29463231563568115,
"learning_rate": 7.513576955236944e-05,
"loss": 0.1611,
"step": 4420
},
{
"epoch": 1.2454264002251618,
"grad_norm": 0.7407804131507874,
"learning_rate": 7.489792985590743e-05,
"loss": 0.3176,
"step": 4425
},
{
"epoch": 1.2468336616943427,
"grad_norm": 0.8456223011016846,
"learning_rate": 7.466024160899173e-05,
"loss": 0.2742,
"step": 4430
},
{
"epoch": 1.2482409231635239,
"grad_norm": 1.3502225875854492,
"learning_rate": 7.442270624567856e-05,
"loss": 0.2477,
"step": 4435
},
{
"epoch": 1.2496481846327048,
"grad_norm": 1.0241039991378784,
"learning_rate": 7.418532519910162e-05,
"loss": 0.2415,
"step": 4440
},
{
"epoch": 1.2510554461018857,
"grad_norm": 0.570637047290802,
"learning_rate": 7.394809990146356e-05,
"loss": 0.2094,
"step": 4445
},
{
"epoch": 1.2524627075710666,
"grad_norm": 0.4012211859226227,
"learning_rate": 7.371103178402731e-05,
"loss": 0.2591,
"step": 4450
},
{
"epoch": 1.2538699690402477,
"grad_norm": 1.1546359062194824,
"learning_rate": 7.347412227710766e-05,
"loss": 0.2837,
"step": 4455
},
{
"epoch": 1.2552772305094286,
"grad_norm": 0.8672778606414795,
"learning_rate": 7.32373728100622e-05,
"loss": 0.298,
"step": 4460
},
{
"epoch": 1.2566844919786098,
"grad_norm": 0.4911658465862274,
"learning_rate": 7.300078481128306e-05,
"loss": 0.1921,
"step": 4465
},
{
"epoch": 1.2580917534477907,
"grad_norm": 1.1717147827148438,
"learning_rate": 7.276435970818824e-05,
"loss": 0.1687,
"step": 4470
},
{
"epoch": 1.2594990149169716,
"grad_norm": 0.5286734104156494,
"learning_rate": 7.252809892721282e-05,
"loss": 0.2104,
"step": 4475
},
{
"epoch": 1.2609062763861525,
"grad_norm": 2.43472957611084,
"learning_rate": 7.229200389380056e-05,
"loss": 0.2763,
"step": 4480
},
{
"epoch": 1.2623135378553334,
"grad_norm": 0.9692918062210083,
"learning_rate": 7.205607603239508e-05,
"loss": 0.1913,
"step": 4485
},
{
"epoch": 1.2637207993245145,
"grad_norm": 0.8969650268554688,
"learning_rate": 7.182031676643153e-05,
"loss": 0.4249,
"step": 4490
},
{
"epoch": 1.2651280607936954,
"grad_norm": 0.7135694026947021,
"learning_rate": 7.158472751832783e-05,
"loss": 0.1957,
"step": 4495
},
{
"epoch": 1.2665353222628766,
"grad_norm": 2.911539077758789,
"learning_rate": 7.134930970947607e-05,
"loss": 0.3644,
"step": 4500
},
{
"epoch": 1.2679425837320575,
"grad_norm": 1.8338284492492676,
"learning_rate": 7.111406476023398e-05,
"loss": 0.2941,
"step": 4505
},
{
"epoch": 1.2693498452012384,
"grad_norm": 0.736365020275116,
"learning_rate": 7.087899408991651e-05,
"loss": 0.2541,
"step": 4510
},
{
"epoch": 1.2707571066704193,
"grad_norm": 1.269327163696289,
"learning_rate": 7.06440991167869e-05,
"loss": 0.2847,
"step": 4515
},
{
"epoch": 1.2721643681396002,
"grad_norm": 0.6774185299873352,
"learning_rate": 7.040938125804858e-05,
"loss": 0.2047,
"step": 4520
},
{
"epoch": 1.2735716296087813,
"grad_norm": 1.0028345584869385,
"learning_rate": 7.017484192983623e-05,
"loss": 0.2327,
"step": 4525
},
{
"epoch": 1.2749788910779623,
"grad_norm": 0.9345621466636658,
"learning_rate": 6.99404825472074e-05,
"loss": 0.2574,
"step": 4530
},
{
"epoch": 1.2763861525471434,
"grad_norm": 1.2837140560150146,
"learning_rate": 6.970630452413407e-05,
"loss": 0.298,
"step": 4535
},
{
"epoch": 1.2777934140163243,
"grad_norm": 0.5337740182876587,
"learning_rate": 6.947230927349396e-05,
"loss": 0.1538,
"step": 4540
},
{
"epoch": 1.2792006754855052,
"grad_norm": 0.5805062651634216,
"learning_rate": 6.923849820706194e-05,
"loss": 0.1483,
"step": 4545
},
{
"epoch": 1.280607936954686,
"grad_norm": 0.8201838135719299,
"learning_rate": 6.900487273550187e-05,
"loss": 0.163,
"step": 4550
},
{
"epoch": 1.282015198423867,
"grad_norm": 0.5184070467948914,
"learning_rate": 6.877143426835764e-05,
"loss": 0.2611,
"step": 4555
},
{
"epoch": 1.2834224598930482,
"grad_norm": 1.0877232551574707,
"learning_rate": 6.853818421404496e-05,
"loss": 0.3085,
"step": 4560
},
{
"epoch": 1.284829721362229,
"grad_norm": 1.616977572441101,
"learning_rate": 6.830512397984288e-05,
"loss": 0.3108,
"step": 4565
},
{
"epoch": 1.2862369828314102,
"grad_norm": 0.6340872049331665,
"learning_rate": 6.807225497188496e-05,
"loss": 0.177,
"step": 4570
},
{
"epoch": 1.287644244300591,
"grad_norm": 0.8518214821815491,
"learning_rate": 6.783957859515127e-05,
"loss": 0.1805,
"step": 4575
},
{
"epoch": 1.289051505769772,
"grad_norm": 1.280093789100647,
"learning_rate": 6.760709625345953e-05,
"loss": 0.2854,
"step": 4580
},
{
"epoch": 1.290458767238953,
"grad_norm": 0.7486845850944519,
"learning_rate": 6.737480934945677e-05,
"loss": 0.1399,
"step": 4585
},
{
"epoch": 1.291866028708134,
"grad_norm": 1.3590744733810425,
"learning_rate": 6.714271928461097e-05,
"loss": 0.1735,
"step": 4590
},
{
"epoch": 1.293273290177315,
"grad_norm": 0.6231881380081177,
"learning_rate": 6.691082745920247e-05,
"loss": 0.2083,
"step": 4595
},
{
"epoch": 1.2946805516464959,
"grad_norm": 1.0750889778137207,
"learning_rate": 6.667913527231549e-05,
"loss": 0.2304,
"step": 4600
},
{
"epoch": 1.296087813115677,
"grad_norm": 1.3983303308486938,
"learning_rate": 6.644764412182986e-05,
"loss": 0.3285,
"step": 4605
},
{
"epoch": 1.297495074584858,
"grad_norm": 0.5835619568824768,
"learning_rate": 6.621635540441249e-05,
"loss": 0.2651,
"step": 4610
},
{
"epoch": 1.2989023360540388,
"grad_norm": 0.7869633436203003,
"learning_rate": 6.598527051550882e-05,
"loss": 0.2144,
"step": 4615
},
{
"epoch": 1.3003095975232197,
"grad_norm": 0.4034360945224762,
"learning_rate": 6.575439084933468e-05,
"loss": 0.1919,
"step": 4620
},
{
"epoch": 1.3017168589924009,
"grad_norm": 1.0225868225097656,
"learning_rate": 6.552371779886756e-05,
"loss": 0.2942,
"step": 4625
},
{
"epoch": 1.3031241204615818,
"grad_norm": 1.8515701293945312,
"learning_rate": 6.52932527558385e-05,
"loss": 0.2579,
"step": 4630
},
{
"epoch": 1.3045313819307627,
"grad_norm": 1.13215172290802,
"learning_rate": 6.506299711072353e-05,
"loss": 0.189,
"step": 4635
},
{
"epoch": 1.3059386433999438,
"grad_norm": 1.1587252616882324,
"learning_rate": 6.483295225273521e-05,
"loss": 0.2055,
"step": 4640
},
{
"epoch": 1.3073459048691247,
"grad_norm": 1.6920759677886963,
"learning_rate": 6.460311956981444e-05,
"loss": 0.3108,
"step": 4645
},
{
"epoch": 1.3087531663383056,
"grad_norm": 0.5736072659492493,
"learning_rate": 6.437350044862207e-05,
"loss": 0.2675,
"step": 4650
},
{
"epoch": 1.3101604278074865,
"grad_norm": 0.9719104170799255,
"learning_rate": 6.414409627453025e-05,
"loss": 0.1933,
"step": 4655
},
{
"epoch": 1.3115676892766677,
"grad_norm": 0.8271322250366211,
"learning_rate": 6.391490843161442e-05,
"loss": 0.0908,
"step": 4660
},
{
"epoch": 1.3129749507458486,
"grad_norm": 1.2622920274734497,
"learning_rate": 6.368593830264485e-05,
"loss": 0.1837,
"step": 4665
},
{
"epoch": 1.3143822122150295,
"grad_norm": 1.0141448974609375,
"learning_rate": 6.345718726907815e-05,
"loss": 0.1396,
"step": 4670
},
{
"epoch": 1.3157894736842106,
"grad_norm": 0.5923504829406738,
"learning_rate": 6.322865671104909e-05,
"loss": 0.1631,
"step": 4675
},
{
"epoch": 1.3171967351533915,
"grad_norm": 1.8866256475448608,
"learning_rate": 6.300034800736233e-05,
"loss": 0.1407,
"step": 4680
},
{
"epoch": 1.3186039966225724,
"grad_norm": 0.8495520353317261,
"learning_rate": 6.277226253548385e-05,
"loss": 0.2345,
"step": 4685
},
{
"epoch": 1.3200112580917533,
"grad_norm": 0.8851481080055237,
"learning_rate": 6.254440167153295e-05,
"loss": 0.2431,
"step": 4690
},
{
"epoch": 1.3214185195609345,
"grad_norm": 0.5228270292282104,
"learning_rate": 6.231676679027364e-05,
"loss": 0.1606,
"step": 4695
},
{
"epoch": 1.3228257810301154,
"grad_norm": 1.2752258777618408,
"learning_rate": 6.208935926510659e-05,
"loss": 0.2588,
"step": 4700
},
{
"epoch": 1.3242330424992963,
"grad_norm": 1.6664029359817505,
"learning_rate": 6.186218046806078e-05,
"loss": 0.2418,
"step": 4705
},
{
"epoch": 1.3256403039684774,
"grad_norm": 0.7116133570671082,
"learning_rate": 6.16352317697851e-05,
"loss": 0.1839,
"step": 4710
},
{
"epoch": 1.3270475654376583,
"grad_norm": 1.6506725549697876,
"learning_rate": 6.140851453954021e-05,
"loss": 0.2076,
"step": 4715
},
{
"epoch": 1.3284548269068392,
"grad_norm": 1.0681225061416626,
"learning_rate": 6.118203014519034e-05,
"loss": 0.2491,
"step": 4720
},
{
"epoch": 1.3298620883760202,
"grad_norm": 0.969599723815918,
"learning_rate": 6.095577995319476e-05,
"loss": 0.273,
"step": 4725
},
{
"epoch": 1.3312693498452013,
"grad_norm": 1.4593223333358765,
"learning_rate": 6.072976532859982e-05,
"loss": 0.358,
"step": 4730
},
{
"epoch": 1.3326766113143822,
"grad_norm": 0.29552891850471497,
"learning_rate": 6.0503987635030656e-05,
"loss": 0.2655,
"step": 4735
},
{
"epoch": 1.334083872783563,
"grad_norm": 2.189373731613159,
"learning_rate": 6.0278448234682784e-05,
"loss": 0.2624,
"step": 4740
},
{
"epoch": 1.3354911342527442,
"grad_norm": 0.28230440616607666,
"learning_rate": 6.005314848831415e-05,
"loss": 0.1886,
"step": 4745
},
{
"epoch": 1.3368983957219251,
"grad_norm": 0.5569413304328918,
"learning_rate": 5.9828089755236714e-05,
"loss": 0.231,
"step": 4750
},
{
"epoch": 1.338305657191106,
"grad_norm": 0.8192738890647888,
"learning_rate": 5.960327339330828e-05,
"loss": 0.23,
"step": 4755
},
{
"epoch": 1.339712918660287,
"grad_norm": 1.0859158039093018,
"learning_rate": 5.9378700758924466e-05,
"loss": 0.3275,
"step": 4760
},
{
"epoch": 1.341120180129468,
"grad_norm": 0.8077869415283203,
"learning_rate": 5.915437320701025e-05,
"loss": 0.0847,
"step": 4765
},
{
"epoch": 1.342527441598649,
"grad_norm": 1.8826837539672852,
"learning_rate": 5.8930292091012015e-05,
"loss": 0.2158,
"step": 4770
},
{
"epoch": 1.3439347030678301,
"grad_norm": 0.6470653414726257,
"learning_rate": 5.870645876288938e-05,
"loss": 0.3325,
"step": 4775
},
{
"epoch": 1.345341964537011,
"grad_norm": 0.7090429067611694,
"learning_rate": 5.848287457310681e-05,
"loss": 0.2083,
"step": 4780
},
{
"epoch": 1.346749226006192,
"grad_norm": 0.1886598914861679,
"learning_rate": 5.825954087062579e-05,
"loss": 0.2118,
"step": 4785
},
{
"epoch": 1.3481564874753729,
"grad_norm": 0.5092473030090332,
"learning_rate": 5.8036459002896473e-05,
"loss": 0.253,
"step": 4790
},
{
"epoch": 1.3495637489445538,
"grad_norm": 0.9652419686317444,
"learning_rate": 5.78136303158495e-05,
"loss": 0.1499,
"step": 4795
},
{
"epoch": 1.350971010413735,
"grad_norm": 0.6111290454864502,
"learning_rate": 5.759105615388814e-05,
"loss": 0.1805,
"step": 4800
},
{
"epoch": 1.3523782718829158,
"grad_norm": 2.2469632625579834,
"learning_rate": 5.736873785987997e-05,
"loss": 0.3536,
"step": 4805
},
{
"epoch": 1.353785533352097,
"grad_norm": 0.9734948873519897,
"learning_rate": 5.714667677514882e-05,
"loss": 0.2784,
"step": 4810
},
{
"epoch": 1.3551927948212779,
"grad_norm": 1.076882243156433,
"learning_rate": 5.692487423946662e-05,
"loss": 0.1953,
"step": 4815
},
{
"epoch": 1.3566000562904588,
"grad_norm": 0.7746699452400208,
"learning_rate": 5.6703331591045524e-05,
"loss": 0.2175,
"step": 4820
},
{
"epoch": 1.3580073177596397,
"grad_norm": 0.7650654315948486,
"learning_rate": 5.6482050166529546e-05,
"loss": 0.1676,
"step": 4825
},
{
"epoch": 1.3594145792288206,
"grad_norm": 0.6610764861106873,
"learning_rate": 5.62610313009868e-05,
"loss": 0.1721,
"step": 4830
},
{
"epoch": 1.3608218406980017,
"grad_norm": 0.8137916326522827,
"learning_rate": 5.604027632790112e-05,
"loss": 0.1374,
"step": 4835
},
{
"epoch": 1.3622291021671826,
"grad_norm": 0.6320801377296448,
"learning_rate": 5.581978657916431e-05,
"loss": 0.209,
"step": 4840
},
{
"epoch": 1.3636363636363638,
"grad_norm": 1.4471935033798218,
"learning_rate": 5.5599563385067996e-05,
"loss": 0.1163,
"step": 4845
},
{
"epoch": 1.3650436251055447,
"grad_norm": 0.9794873595237732,
"learning_rate": 5.537960807429547e-05,
"loss": 0.2077,
"step": 4850
},
{
"epoch": 1.3664508865747256,
"grad_norm": 1.3119271993637085,
"learning_rate": 5.5159921973913866e-05,
"loss": 0.2667,
"step": 4855
},
{
"epoch": 1.3678581480439065,
"grad_norm": 1.156152367591858,
"learning_rate": 5.49405064093661e-05,
"loss": 0.1734,
"step": 4860
},
{
"epoch": 1.3692654095130874,
"grad_norm": 0.06259223818778992,
"learning_rate": 5.472136270446275e-05,
"loss": 0.2067,
"step": 4865
},
{
"epoch": 1.3706726709822685,
"grad_norm": 0.6296875476837158,
"learning_rate": 5.4502492181374284e-05,
"loss": 0.229,
"step": 4870
},
{
"epoch": 1.3720799324514494,
"grad_norm": 1.3139517307281494,
"learning_rate": 5.428389616062298e-05,
"loss": 0.286,
"step": 4875
},
{
"epoch": 1.3734871939206306,
"grad_norm": 0.5777654051780701,
"learning_rate": 5.40655759610748e-05,
"loss": 0.2024,
"step": 4880
},
{
"epoch": 1.3748944553898115,
"grad_norm": 0.5422516465187073,
"learning_rate": 5.384753289993173e-05,
"loss": 0.2453,
"step": 4885
},
{
"epoch": 1.3763017168589924,
"grad_norm": 1.2088871002197266,
"learning_rate": 5.3629768292723614e-05,
"loss": 0.1644,
"step": 4890
},
{
"epoch": 1.3777089783281733,
"grad_norm": 0.6206454634666443,
"learning_rate": 5.341228345330025e-05,
"loss": 0.3293,
"step": 4895
},
{
"epoch": 1.3791162397973544,
"grad_norm": 1.0353143215179443,
"learning_rate": 5.3195079693823624e-05,
"loss": 0.2197,
"step": 4900
},
{
"epoch": 1.3805235012665353,
"grad_norm": 1.076452612876892,
"learning_rate": 5.297815832475971e-05,
"loss": 0.1435,
"step": 4905
},
{
"epoch": 1.3819307627357162,
"grad_norm": 0.7797285914421082,
"learning_rate": 5.2761520654870846e-05,
"loss": 0.1499,
"step": 4910
},
{
"epoch": 1.3833380242048974,
"grad_norm": 3.2293171882629395,
"learning_rate": 5.25451679912077e-05,
"loss": 0.4037,
"step": 4915
},
{
"epoch": 1.3847452856740783,
"grad_norm": 0.7513951659202576,
"learning_rate": 5.232910163910132e-05,
"loss": 0.136,
"step": 4920
},
{
"epoch": 1.3861525471432592,
"grad_norm": 0.43260759115219116,
"learning_rate": 5.211332290215543e-05,
"loss": 0.2419,
"step": 4925
},
{
"epoch": 1.38755980861244,
"grad_norm": 0.7441173791885376,
"learning_rate": 5.189783308223841e-05,
"loss": 0.1678,
"step": 4930
},
{
"epoch": 1.3889670700816212,
"grad_norm": 0.4429182708263397,
"learning_rate": 5.1682633479475484e-05,
"loss": 0.1767,
"step": 4935
},
{
"epoch": 1.3903743315508021,
"grad_norm": 1.6440355777740479,
"learning_rate": 5.146772539224094e-05,
"loss": 0.2831,
"step": 4940
},
{
"epoch": 1.391781593019983,
"grad_norm": 1.1421854496002197,
"learning_rate": 5.1253110117150314e-05,
"loss": 0.157,
"step": 4945
},
{
"epoch": 1.3931888544891642,
"grad_norm": 1.013460397720337,
"learning_rate": 5.1038788949052344e-05,
"loss": 0.3537,
"step": 4950
},
{
"epoch": 1.394596115958345,
"grad_norm": 1.2984402179718018,
"learning_rate": 5.082476318102144e-05,
"loss": 0.2869,
"step": 4955
},
{
"epoch": 1.396003377427526,
"grad_norm": 0.8296849727630615,
"learning_rate": 5.061103410434978e-05,
"loss": 0.2029,
"step": 4960
},
{
"epoch": 1.397410638896707,
"grad_norm": 1.1972373723983765,
"learning_rate": 5.0397603008539374e-05,
"loss": 0.182,
"step": 4965
},
{
"epoch": 1.398817900365888,
"grad_norm": 1.5300724506378174,
"learning_rate": 5.0184471181294515e-05,
"loss": 0.1537,
"step": 4970
},
{
"epoch": 1.400225161835069,
"grad_norm": 0.9540086984634399,
"learning_rate": 4.997163990851381e-05,
"loss": 0.1679,
"step": 4975
},
{
"epoch": 1.4016324233042499,
"grad_norm": 0.15063901245594025,
"learning_rate": 4.975911047428263e-05,
"loss": 0.1512,
"step": 4980
},
{
"epoch": 1.403039684773431,
"grad_norm": 1.925596833229065,
"learning_rate": 4.954688416086524e-05,
"loss": 0.2077,
"step": 4985
},
{
"epoch": 1.404446946242612,
"grad_norm": 1.4239457845687866,
"learning_rate": 4.9334962248696934e-05,
"loss": 0.2464,
"step": 4990
},
{
"epoch": 1.4058542077117928,
"grad_norm": 0.3618084490299225,
"learning_rate": 4.912334601637658e-05,
"loss": 0.1579,
"step": 4995
},
{
"epoch": 1.4072614691809737,
"grad_norm": 0.8101370334625244,
"learning_rate": 4.8912036740658776e-05,
"loss": 0.2682,
"step": 5000
},
{
"epoch": 1.4086687306501549,
"grad_norm": 0.7149579524993896,
"learning_rate": 4.8701035696446064e-05,
"loss": 0.3497,
"step": 5005
},
{
"epoch": 1.4100759921193358,
"grad_norm": 1.0598907470703125,
"learning_rate": 4.849034415678131e-05,
"loss": 0.2342,
"step": 5010
},
{
"epoch": 1.4114832535885167,
"grad_norm": 1.2105034589767456,
"learning_rate": 4.8279963392840156e-05,
"loss": 0.2693,
"step": 5015
},
{
"epoch": 1.4128905150576978,
"grad_norm": 0.6534488201141357,
"learning_rate": 4.8069894673923064e-05,
"loss": 0.2475,
"step": 5020
},
{
"epoch": 1.4142977765268787,
"grad_norm": 1.4907587766647339,
"learning_rate": 4.7860139267447956e-05,
"loss": 0.2958,
"step": 5025
},
{
"epoch": 1.4157050379960596,
"grad_norm": 1.1340523958206177,
"learning_rate": 4.765069843894239e-05,
"loss": 0.1087,
"step": 5030
},
{
"epoch": 1.4171122994652405,
"grad_norm": 0.6139047145843506,
"learning_rate": 4.744157345203588e-05,
"loss": 0.1827,
"step": 5035
},
{
"epoch": 1.4185195609344217,
"grad_norm": 1.5109590291976929,
"learning_rate": 4.723276556845252e-05,
"loss": 0.1851,
"step": 5040
},
{
"epoch": 1.4199268224036026,
"grad_norm": 0.593103289604187,
"learning_rate": 4.702427604800307e-05,
"loss": 0.2019,
"step": 5045
},
{
"epoch": 1.4213340838727835,
"grad_norm": 1.3064155578613281,
"learning_rate": 4.681610614857749e-05,
"loss": 0.1086,
"step": 5050
},
{
"epoch": 1.4227413453419646,
"grad_norm": 1.4465229511260986,
"learning_rate": 4.66082571261375e-05,
"loss": 0.099,
"step": 5055
},
{
"epoch": 1.4241486068111455,
"grad_norm": 1.0164941549301147,
"learning_rate": 4.6400730234708676e-05,
"loss": 0.2006,
"step": 5060
},
{
"epoch": 1.4255558682803264,
"grad_norm": 1.600894808769226,
"learning_rate": 4.61935267263732e-05,
"loss": 0.2938,
"step": 5065
},
{
"epoch": 1.4269631297495073,
"grad_norm": 0.8022120594978333,
"learning_rate": 4.598664785126217e-05,
"loss": 0.2981,
"step": 5070
},
{
"epoch": 1.4283703912186885,
"grad_norm": 0.6564612984657288,
"learning_rate": 4.578009485754791e-05,
"loss": 0.1266,
"step": 5075
},
{
"epoch": 1.4297776526878694,
"grad_norm": 0.7073236107826233,
"learning_rate": 4.557386899143678e-05,
"loss": 0.2229,
"step": 5080
},
{
"epoch": 1.4311849141570505,
"grad_norm": 0.9632103443145752,
"learning_rate": 4.536797149716133e-05,
"loss": 0.1511,
"step": 5085
},
{
"epoch": 1.4325921756262314,
"grad_norm": 1.1304622888565063,
"learning_rate": 4.5162403616972945e-05,
"loss": 0.2341,
"step": 5090
},
{
"epoch": 1.4339994370954123,
"grad_norm": 1.135055422782898,
"learning_rate": 4.4957166591134405e-05,
"loss": 0.3898,
"step": 5095
},
{
"epoch": 1.4354066985645932,
"grad_norm": 0.6786003112792969,
"learning_rate": 4.475226165791231e-05,
"loss": 0.2129,
"step": 5100
},
{
"epoch": 1.4368139600337742,
"grad_norm": 1.3296654224395752,
"learning_rate": 4.454769005356955e-05,
"loss": 0.3128,
"step": 5105
},
{
"epoch": 1.4382212215029553,
"grad_norm": 0.7507737278938293,
"learning_rate": 4.434345301235802e-05,
"loss": 0.1069,
"step": 5110
},
{
"epoch": 1.4396284829721362,
"grad_norm": 1.4222168922424316,
"learning_rate": 4.4139551766511e-05,
"loss": 0.1529,
"step": 5115
},
{
"epoch": 1.4410357444413173,
"grad_norm": 0.21092858910560608,
"learning_rate": 4.39359875462359e-05,
"loss": 0.2159,
"step": 5120
},
{
"epoch": 1.4424430059104982,
"grad_norm": 1.0862993001937866,
"learning_rate": 4.373276157970665e-05,
"loss": 0.1262,
"step": 5125
},
{
"epoch": 1.4438502673796791,
"grad_norm": 1.6479579210281372,
"learning_rate": 4.352987509305635e-05,
"loss": 0.2165,
"step": 5130
},
{
"epoch": 1.44525752884886,
"grad_norm": 0.11600520461797714,
"learning_rate": 4.3327329310370016e-05,
"loss": 0.1696,
"step": 5135
},
{
"epoch": 1.446664790318041,
"grad_norm": 0.9424710869789124,
"learning_rate": 4.312512545367702e-05,
"loss": 0.3328,
"step": 5140
},
{
"epoch": 1.448072051787222,
"grad_norm": 0.6428975462913513,
"learning_rate": 4.292326474294372e-05,
"loss": 0.1069,
"step": 5145
},
{
"epoch": 1.449479313256403,
"grad_norm": 0.8455730676651001,
"learning_rate": 4.272174839606628e-05,
"loss": 0.3006,
"step": 5150
},
{
"epoch": 1.4508865747255841,
"grad_norm": 0.6467002034187317,
"learning_rate": 4.252057762886305e-05,
"loss": 0.1345,
"step": 5155
},
{
"epoch": 1.452293836194765,
"grad_norm": 0.7402626276016235,
"learning_rate": 4.2319753655067505e-05,
"loss": 0.1928,
"step": 5160
},
{
"epoch": 1.453701097663946,
"grad_norm": 1.142514705657959,
"learning_rate": 4.211927768632068e-05,
"loss": 0.3225,
"step": 5165
},
{
"epoch": 1.4551083591331269,
"grad_norm": 0.9843090772628784,
"learning_rate": 4.191915093216411e-05,
"loss": 0.1223,
"step": 5170
},
{
"epoch": 1.4565156206023078,
"grad_norm": 0.9305518865585327,
"learning_rate": 4.171937460003223e-05,
"loss": 0.1518,
"step": 5175
},
{
"epoch": 1.457922882071489,
"grad_norm": 0.9245863556861877,
"learning_rate": 4.1519949895245435e-05,
"loss": 0.161,
"step": 5180
},
{
"epoch": 1.4593301435406698,
"grad_norm": 0.5494176149368286,
"learning_rate": 4.1320878021002466e-05,
"loss": 0.1645,
"step": 5185
},
{
"epoch": 1.460737405009851,
"grad_norm": 0.454455703496933,
"learning_rate": 4.112216017837346e-05,
"loss": 0.1784,
"step": 5190
},
{
"epoch": 1.4621446664790319,
"grad_norm": 0.8797675967216492,
"learning_rate": 4.092379756629244e-05,
"loss": 0.1915,
"step": 5195
},
{
"epoch": 1.4635519279482128,
"grad_norm": 0.5059092044830322,
"learning_rate": 4.072579138155024e-05,
"loss": 0.1533,
"step": 5200
},
{
"epoch": 1.4649591894173937,
"grad_norm": 1.5164445638656616,
"learning_rate": 4.052814281878725e-05,
"loss": 0.3054,
"step": 5205
},
{
"epoch": 1.4663664508865748,
"grad_norm": 0.8489431738853455,
"learning_rate": 4.033085307048626e-05,
"loss": 0.1573,
"step": 5210
},
{
"epoch": 1.4677737123557557,
"grad_norm": 0.8418503999710083,
"learning_rate": 4.0133923326965073e-05,
"loss": 0.2269,
"step": 5215
},
{
"epoch": 1.4691809738249366,
"grad_norm": 0.4309021830558777,
"learning_rate": 3.9937354776369565e-05,
"loss": 0.1621,
"step": 5220
},
{
"epoch": 1.4705882352941178,
"grad_norm": 1.8004333972930908,
"learning_rate": 3.974114860466641e-05,
"loss": 0.1821,
"step": 5225
},
{
"epoch": 1.4719954967632987,
"grad_norm": 0.5034974217414856,
"learning_rate": 3.954530599563586e-05,
"loss": 0.1586,
"step": 5230
},
{
"epoch": 1.4734027582324796,
"grad_norm": 1.8636256456375122,
"learning_rate": 3.934982813086466e-05,
"loss": 0.1778,
"step": 5235
},
{
"epoch": 1.4748100197016605,
"grad_norm": 0.7782198190689087,
"learning_rate": 3.915471618973905e-05,
"loss": 0.2362,
"step": 5240
},
{
"epoch": 1.4762172811708416,
"grad_norm": 0.5170087218284607,
"learning_rate": 3.895997134943735e-05,
"loss": 0.1389,
"step": 5245
},
{
"epoch": 1.4776245426400225,
"grad_norm": 0.6563436388969421,
"learning_rate": 3.876559478492319e-05,
"loss": 0.1972,
"step": 5250
},
{
"epoch": 1.4790318041092034,
"grad_norm": 0.6524726748466492,
"learning_rate": 3.857158766893814e-05,
"loss": 0.2123,
"step": 5255
},
{
"epoch": 1.4804390655783846,
"grad_norm": 0.8341132402420044,
"learning_rate": 3.837795117199483e-05,
"loss": 0.2374,
"step": 5260
},
{
"epoch": 1.4818463270475655,
"grad_norm": 0.37632039189338684,
"learning_rate": 3.818468646236984e-05,
"loss": 0.114,
"step": 5265
},
{
"epoch": 1.4832535885167464,
"grad_norm": 2.116046190261841,
"learning_rate": 3.799179470609656e-05,
"loss": 0.3048,
"step": 5270
},
{
"epoch": 1.4846608499859273,
"grad_norm": 2.3138134479522705,
"learning_rate": 3.7799277066958205e-05,
"loss": 0.1414,
"step": 5275
},
{
"epoch": 1.4860681114551084,
"grad_norm": 1.4033293724060059,
"learning_rate": 3.760713470648093e-05,
"loss": 0.1972,
"step": 5280
},
{
"epoch": 1.4874753729242893,
"grad_norm": 0.9336678981781006,
"learning_rate": 3.741536878392654e-05,
"loss": 0.1519,
"step": 5285
},
{
"epoch": 1.4888826343934702,
"grad_norm": 1.4050379991531372,
"learning_rate": 3.7223980456285813e-05,
"loss": 0.1493,
"step": 5290
},
{
"epoch": 1.4902898958626514,
"grad_norm": 0.4991312623023987,
"learning_rate": 3.70329708782713e-05,
"loss": 0.157,
"step": 5295
},
{
"epoch": 1.4916971573318323,
"grad_norm": 1.6823819875717163,
"learning_rate": 3.6842341202310374e-05,
"loss": 0.2532,
"step": 5300
},
{
"epoch": 1.4931044188010132,
"grad_norm": 0.81031733751297,
"learning_rate": 3.665209257853843e-05,
"loss": 0.3201,
"step": 5305
},
{
"epoch": 1.494511680270194,
"grad_norm": 1.287041425704956,
"learning_rate": 3.646222615479177e-05,
"loss": 0.1398,
"step": 5310
},
{
"epoch": 1.4959189417393752,
"grad_norm": 0.4528125822544098,
"learning_rate": 3.62727430766007e-05,
"loss": 0.2131,
"step": 5315
},
{
"epoch": 1.4973262032085561,
"grad_norm": 1.0578283071517944,
"learning_rate": 3.608364448718283e-05,
"loss": 0.1415,
"step": 5320
},
{
"epoch": 1.498733464677737,
"grad_norm": 0.4122551679611206,
"learning_rate": 3.589493152743585e-05,
"loss": 0.0914,
"step": 5325
},
{
"epoch": 1.5001407261469182,
"grad_norm": 0.6634222269058228,
"learning_rate": 3.570660533593091e-05,
"loss": 0.1269,
"step": 5330
},
{
"epoch": 1.501547987616099,
"grad_norm": 0.27888017892837524,
"learning_rate": 3.551866704890564e-05,
"loss": 0.1288,
"step": 5335
},
{
"epoch": 1.50295524908528,
"grad_norm": 1.0966591835021973,
"learning_rate": 3.533111780025725e-05,
"loss": 0.1822,
"step": 5340
},
{
"epoch": 1.504362510554461,
"grad_norm": 1.1912025213241577,
"learning_rate": 3.514395872153584e-05,
"loss": 0.2205,
"step": 5345
},
{
"epoch": 1.505769772023642,
"grad_norm": 0.34254777431488037,
"learning_rate": 3.49571909419374e-05,
"loss": 0.1333,
"step": 5350
},
{
"epoch": 1.507177033492823,
"grad_norm": 0.7154930233955383,
"learning_rate": 3.4770815588297054e-05,
"loss": 0.1758,
"step": 5355
},
{
"epoch": 1.508584294962004,
"grad_norm": 0.7776800394058228,
"learning_rate": 3.4584833785082385e-05,
"loss": 0.1721,
"step": 5360
},
{
"epoch": 1.509991556431185,
"grad_norm": 1.0347821712493896,
"learning_rate": 3.43992466543865e-05,
"loss": 0.1735,
"step": 5365
},
{
"epoch": 1.511398817900366,
"grad_norm": 0.773311972618103,
"learning_rate": 3.4214055315921245e-05,
"loss": 0.1798,
"step": 5370
},
{
"epoch": 1.5128060793695468,
"grad_norm": 0.15166114270687103,
"learning_rate": 3.402926088701062e-05,
"loss": 0.2025,
"step": 5375
},
{
"epoch": 1.5142133408387277,
"grad_norm": 0.4494927227497101,
"learning_rate": 3.38448644825839e-05,
"loss": 0.1211,
"step": 5380
},
{
"epoch": 1.5156206023079088,
"grad_norm": 1.2481530904769897,
"learning_rate": 3.36608672151689e-05,
"loss": 0.1325,
"step": 5385
},
{
"epoch": 1.5170278637770898,
"grad_norm": 0.7955223321914673,
"learning_rate": 3.347727019488531e-05,
"loss": 0.1334,
"step": 5390
},
{
"epoch": 1.518435125246271,
"grad_norm": 1.1012686491012573,
"learning_rate": 3.329407452943799e-05,
"loss": 0.1978,
"step": 5395
},
{
"epoch": 1.5198423867154518,
"grad_norm": 2.147088050842285,
"learning_rate": 3.311128132411031e-05,
"loss": 0.1742,
"step": 5400
},
{
"epoch": 1.5212496481846327,
"grad_norm": 1.0812978744506836,
"learning_rate": 3.292889168175751e-05,
"loss": 0.1237,
"step": 5405
},
{
"epoch": 1.5226569096538136,
"grad_norm": 0.8602486848831177,
"learning_rate": 3.274690670279984e-05,
"loss": 0.1628,
"step": 5410
},
{
"epoch": 1.5240641711229945,
"grad_norm": 0.4767683446407318,
"learning_rate": 3.25653274852162e-05,
"loss": 0.0893,
"step": 5415
},
{
"epoch": 1.5254714325921757,
"grad_norm": 1.434166431427002,
"learning_rate": 3.238415512453741e-05,
"loss": 0.3905,
"step": 5420
},
{
"epoch": 1.5268786940613566,
"grad_norm": 3.7128000259399414,
"learning_rate": 3.220339071383948e-05,
"loss": 0.336,
"step": 5425
},
{
"epoch": 1.5282859555305377,
"grad_norm": 0.9743013381958008,
"learning_rate": 3.202303534373712e-05,
"loss": 0.17,
"step": 5430
},
{
"epoch": 1.5296932169997186,
"grad_norm": 0.4060254991054535,
"learning_rate": 3.184309010237728e-05,
"loss": 0.1817,
"step": 5435
},
{
"epoch": 1.5311004784688995,
"grad_norm": 1.3302080631256104,
"learning_rate": 3.16635560754323e-05,
"loss": 0.2442,
"step": 5440
},
{
"epoch": 1.5325077399380804,
"grad_norm": 1.5643320083618164,
"learning_rate": 3.148443434609367e-05,
"loss": 0.3225,
"step": 5445
},
{
"epoch": 1.5339150014072613,
"grad_norm": 1.2559304237365723,
"learning_rate": 3.1305725995065205e-05,
"loss": 0.1861,
"step": 5450
},
{
"epoch": 1.5353222628764425,
"grad_norm": 1.1454960107803345,
"learning_rate": 3.112743210055677e-05,
"loss": 0.1262,
"step": 5455
},
{
"epoch": 1.5367295243456234,
"grad_norm": 0.46115657687187195,
"learning_rate": 3.0949553738277634e-05,
"loss": 0.1827,
"step": 5460
},
{
"epoch": 1.5381367858148045,
"grad_norm": 1.2840021848678589,
"learning_rate": 3.077209198143002e-05,
"loss": 0.1399,
"step": 5465
},
{
"epoch": 1.5395440472839854,
"grad_norm": 1.189970850944519,
"learning_rate": 3.0595047900702564e-05,
"loss": 0.2078,
"step": 5470
},
{
"epoch": 1.5409513087531663,
"grad_norm": 0.5335509181022644,
"learning_rate": 3.041842256426404e-05,
"loss": 0.1423,
"step": 5475
},
{
"epoch": 1.5423585702223472,
"grad_norm": 0.8606838583946228,
"learning_rate": 3.024221703775665e-05,
"loss": 0.1468,
"step": 5480
},
{
"epoch": 1.5437658316915281,
"grad_norm": 1.3679966926574707,
"learning_rate": 3.0066432384289844e-05,
"loss": 0.1247,
"step": 5485
},
{
"epoch": 1.5451730931607093,
"grad_norm": 1.2723866701126099,
"learning_rate": 2.989106966443379e-05,
"loss": 0.1482,
"step": 5490
},
{
"epoch": 1.5465803546298902,
"grad_norm": 0.8712704181671143,
"learning_rate": 2.97161299362129e-05,
"loss": 0.2848,
"step": 5495
},
{
"epoch": 1.5479876160990713,
"grad_norm": 0.6967242360115051,
"learning_rate": 2.9541614255099625e-05,
"loss": 0.1604,
"step": 5500
},
{
"epoch": 1.5493948775682522,
"grad_norm": 1.0415253639221191,
"learning_rate": 2.9367523674007947e-05,
"loss": 0.1876,
"step": 5505
},
{
"epoch": 1.5508021390374331,
"grad_norm": 0.5861086845397949,
"learning_rate": 2.9193859243287036e-05,
"loss": 0.1835,
"step": 5510
},
{
"epoch": 1.552209400506614,
"grad_norm": 1.444682002067566,
"learning_rate": 2.902062201071505e-05,
"loss": 0.1588,
"step": 5515
},
{
"epoch": 1.553616661975795,
"grad_norm": 1.0231586694717407,
"learning_rate": 2.8847813021492574e-05,
"loss": 0.3833,
"step": 5520
},
{
"epoch": 1.555023923444976,
"grad_norm": 1.2998064756393433,
"learning_rate": 2.8675433318236567e-05,
"loss": 0.1849,
"step": 5525
},
{
"epoch": 1.556431184914157,
"grad_norm": 0.8349362015724182,
"learning_rate": 2.8503483940973952e-05,
"loss": 0.1391,
"step": 5530
},
{
"epoch": 1.5578384463833381,
"grad_norm": 0.9555754661560059,
"learning_rate": 2.8331965927135274e-05,
"loss": 0.2073,
"step": 5535
},
{
"epoch": 1.559245707852519,
"grad_norm": 1.703472375869751,
"learning_rate": 2.8160880311548522e-05,
"loss": 0.2548,
"step": 5540
},
{
"epoch": 1.5606529693217,
"grad_norm": 0.39019107818603516,
"learning_rate": 2.799022812643295e-05,
"loss": 0.1277,
"step": 5545
},
{
"epoch": 1.5620602307908809,
"grad_norm": 1.0451160669326782,
"learning_rate": 2.782001040139267e-05,
"loss": 0.3046,
"step": 5550
},
{
"epoch": 1.5634674922600618,
"grad_norm": 0.8136467337608337,
"learning_rate": 2.765022816341063e-05,
"loss": 0.197,
"step": 5555
},
{
"epoch": 1.564874753729243,
"grad_norm": 0.6249985098838806,
"learning_rate": 2.7480882436842335e-05,
"loss": 0.1592,
"step": 5560
},
{
"epoch": 1.566282015198424,
"grad_norm": 0.5969499945640564,
"learning_rate": 2.7311974243409565e-05,
"loss": 0.2353,
"step": 5565
},
{
"epoch": 1.567689276667605,
"grad_norm": 0.5542153716087341,
"learning_rate": 2.7143504602194448e-05,
"loss": 0.1407,
"step": 5570
},
{
"epoch": 1.5690965381367858,
"grad_norm": 0.40066176652908325,
"learning_rate": 2.697547452963307e-05,
"loss": 0.1318,
"step": 5575
},
{
"epoch": 1.5705037996059668,
"grad_norm": 0.4262009859085083,
"learning_rate": 2.680788503950944e-05,
"loss": 0.171,
"step": 5580
},
{
"epoch": 1.5719110610751477,
"grad_norm": 0.7851074934005737,
"learning_rate": 2.664073714294948e-05,
"loss": 0.2443,
"step": 5585
},
{
"epoch": 1.5733183225443286,
"grad_norm": 0.39711621403694153,
"learning_rate": 2.6474031848414704e-05,
"loss": 0.2419,
"step": 5590
},
{
"epoch": 1.5747255840135097,
"grad_norm": 0.4387623369693756,
"learning_rate": 2.6307770161696354e-05,
"loss": 0.0821,
"step": 5595
},
{
"epoch": 1.5761328454826908,
"grad_norm": 0.9057246446609497,
"learning_rate": 2.6141953085909198e-05,
"loss": 0.2652,
"step": 5600
},
{
"epoch": 1.5775401069518717,
"grad_norm": 0.7787453532218933,
"learning_rate": 2.597658162148544e-05,
"loss": 0.2335,
"step": 5605
},
{
"epoch": 1.5789473684210527,
"grad_norm": 1.116365909576416,
"learning_rate": 2.5811656766168902e-05,
"loss": 0.2092,
"step": 5610
},
{
"epoch": 1.5803546298902336,
"grad_norm": 0.741118848323822,
"learning_rate": 2.5647179515008724e-05,
"loss": 0.18,
"step": 5615
},
{
"epoch": 1.5817618913594145,
"grad_norm": 0.9240850806236267,
"learning_rate": 2.548315086035351e-05,
"loss": 0.2047,
"step": 5620
},
{
"epoch": 1.5831691528285956,
"grad_norm": 1.0324885845184326,
"learning_rate": 2.5319571791845408e-05,
"loss": 0.1117,
"step": 5625
},
{
"epoch": 1.5845764142977765,
"grad_norm": 1.108396053314209,
"learning_rate": 2.5156443296414013e-05,
"loss": 0.1582,
"step": 5630
},
{
"epoch": 1.5859836757669576,
"grad_norm": 1.0466639995574951,
"learning_rate": 2.4993766358270388e-05,
"loss": 0.2145,
"step": 5635
},
{
"epoch": 1.5873909372361386,
"grad_norm": 1.1003303527832031,
"learning_rate": 2.4831541958901293e-05,
"loss": 0.1401,
"step": 5640
},
{
"epoch": 1.5887981987053195,
"grad_norm": 0.7945972084999084,
"learning_rate": 2.4669771077063152e-05,
"loss": 0.101,
"step": 5645
},
{
"epoch": 1.5902054601745004,
"grad_norm": 1.6851614713668823,
"learning_rate": 2.4508454688776105e-05,
"loss": 0.2356,
"step": 5650
},
{
"epoch": 1.5916127216436813,
"grad_norm": 0.708411693572998,
"learning_rate": 2.434759376731819e-05,
"loss": 0.2346,
"step": 5655
},
{
"epoch": 1.5930199831128624,
"grad_norm": 0.9913239479064941,
"learning_rate": 2.4187189283219446e-05,
"loss": 0.1195,
"step": 5660
},
{
"epoch": 1.5944272445820433,
"grad_norm": 1.0097897052764893,
"learning_rate": 2.4027242204256108e-05,
"loss": 0.1723,
"step": 5665
},
{
"epoch": 1.5958345060512245,
"grad_norm": 0.8258925080299377,
"learning_rate": 2.3867753495444723e-05,
"loss": 0.1539,
"step": 5670
},
{
"epoch": 1.5972417675204054,
"grad_norm": 0.5283498764038086,
"learning_rate": 2.3708724119036262e-05,
"loss": 0.1165,
"step": 5675
},
{
"epoch": 1.5986490289895863,
"grad_norm": 1.170369267463684,
"learning_rate": 2.355015503451048e-05,
"loss": 0.1951,
"step": 5680
},
{
"epoch": 1.6000562904587672,
"grad_norm": 0.8622944355010986,
"learning_rate": 2.339204719856998e-05,
"loss": 0.153,
"step": 5685
},
{
"epoch": 1.601463551927948,
"grad_norm": 0.6249514818191528,
"learning_rate": 2.323440156513448e-05,
"loss": 0.0686,
"step": 5690
},
{
"epoch": 1.6028708133971292,
"grad_norm": 0.2732272446155548,
"learning_rate": 2.3077219085335054e-05,
"loss": 0.1054,
"step": 5695
},
{
"epoch": 1.6042780748663101,
"grad_norm": 1.5117753744125366,
"learning_rate": 2.2920500707508496e-05,
"loss": 0.1682,
"step": 5700
},
{
"epoch": 1.6056853363354913,
"grad_norm": 1.9940603971481323,
"learning_rate": 2.2764247377191405e-05,
"loss": 0.2375,
"step": 5705
},
{
"epoch": 1.6070925978046722,
"grad_norm": 1.0817060470581055,
"learning_rate": 2.2608460037114642e-05,
"loss": 0.2294,
"step": 5710
},
{
"epoch": 1.608499859273853,
"grad_norm": 0.4378751814365387,
"learning_rate": 2.2453139627197618e-05,
"loss": 0.1674,
"step": 5715
},
{
"epoch": 1.609907120743034,
"grad_norm": 0.5405195951461792,
"learning_rate": 2.22982870845425e-05,
"loss": 0.3422,
"step": 5720
},
{
"epoch": 1.611314382212215,
"grad_norm": 1.4159220457077026,
"learning_rate": 2.214390334342875e-05,
"loss": 0.2116,
"step": 5725
},
{
"epoch": 1.612721643681396,
"grad_norm": 1.1930686235427856,
"learning_rate": 2.1989989335307304e-05,
"loss": 0.0965,
"step": 5730
},
{
"epoch": 1.614128905150577,
"grad_norm": 1.2334959506988525,
"learning_rate": 2.1836545988795054e-05,
"loss": 0.1547,
"step": 5735
},
{
"epoch": 1.615536166619758,
"grad_norm": 0.7615369558334351,
"learning_rate": 2.168357422966928e-05,
"loss": 0.2468,
"step": 5740
},
{
"epoch": 1.616943428088939,
"grad_norm": 0.7710257172584534,
"learning_rate": 2.153107498086193e-05,
"loss": 0.1674,
"step": 5745
},
{
"epoch": 1.61835068955812,
"grad_norm": 0.464054673910141,
"learning_rate": 2.137904916245419e-05,
"loss": 0.2004,
"step": 5750
},
{
"epoch": 1.6197579510273008,
"grad_norm": 0.3523075580596924,
"learning_rate": 2.1227497691670894e-05,
"loss": 0.2314,
"step": 5755
},
{
"epoch": 1.6211652124964817,
"grad_norm": 0.8045745491981506,
"learning_rate": 2.1076421482874877e-05,
"loss": 0.1431,
"step": 5760
},
{
"epoch": 1.6225724739656628,
"grad_norm": 0.7054654955863953,
"learning_rate": 2.0925821447561665e-05,
"loss": 0.1056,
"step": 5765
},
{
"epoch": 1.6239797354348438,
"grad_norm": 1.5930366516113281,
"learning_rate": 2.077569849435379e-05,
"loss": 0.2394,
"step": 5770
},
{
"epoch": 1.6253869969040249,
"grad_norm": 0.678402304649353,
"learning_rate": 2.062605352899537e-05,
"loss": 0.1482,
"step": 5775
},
{
"epoch": 1.6267942583732058,
"grad_norm": 1.009436845779419,
"learning_rate": 2.0476887454346716e-05,
"loss": 0.2381,
"step": 5780
},
{
"epoch": 1.6282015198423867,
"grad_norm": 0.5717734098434448,
"learning_rate": 2.0328201170378813e-05,
"loss": 0.1877,
"step": 5785
},
{
"epoch": 1.6296087813115676,
"grad_norm": 1.0021076202392578,
"learning_rate": 2.0179995574167842e-05,
"loss": 0.1836,
"step": 5790
},
{
"epoch": 1.6310160427807485,
"grad_norm": 0.5409684777259827,
"learning_rate": 2.0032271559889915e-05,
"loss": 0.21,
"step": 5795
},
{
"epoch": 1.6324233042499297,
"grad_norm": 1.6268481016159058,
"learning_rate": 1.9885030018815487e-05,
"loss": 0.1786,
"step": 5800
},
{
"epoch": 1.6338305657191106,
"grad_norm": 1.0220392942428589,
"learning_rate": 1.9738271839304213e-05,
"loss": 0.2016,
"step": 5805
},
{
"epoch": 1.6352378271882917,
"grad_norm": 0.8178629875183105,
"learning_rate": 1.959199790679934e-05,
"loss": 0.1491,
"step": 5810
},
{
"epoch": 1.6366450886574726,
"grad_norm": 2.1935439109802246,
"learning_rate": 1.944620910382252e-05,
"loss": 0.1966,
"step": 5815
},
{
"epoch": 1.6380523501266535,
"grad_norm": 1.1369730234146118,
"learning_rate": 1.930090630996849e-05,
"loss": 0.2084,
"step": 5820
},
{
"epoch": 1.6394596115958344,
"grad_norm": 0.8570969104766846,
"learning_rate": 1.915609040189972e-05,
"loss": 0.1779,
"step": 5825
},
{
"epoch": 1.6408668730650153,
"grad_norm": 0.8881973624229431,
"learning_rate": 1.901176225334105e-05,
"loss": 0.2334,
"step": 5830
},
{
"epoch": 1.6422741345341965,
"grad_norm": 1.057015299797058,
"learning_rate": 1.886792273507457e-05,
"loss": 0.2208,
"step": 5835
},
{
"epoch": 1.6436813960033776,
"grad_norm": 0.40783455967903137,
"learning_rate": 1.8724572714934307e-05,
"loss": 0.0648,
"step": 5840
},
{
"epoch": 1.6450886574725585,
"grad_norm": 0.8724305629730225,
"learning_rate": 1.8581713057800933e-05,
"loss": 0.2695,
"step": 5845
},
{
"epoch": 1.6464959189417394,
"grad_norm": 1.3229783773422241,
"learning_rate": 1.8439344625596534e-05,
"loss": 0.1555,
"step": 5850
},
{
"epoch": 1.6479031804109203,
"grad_norm": 0.7381983399391174,
"learning_rate": 1.8297468277279618e-05,
"loss": 0.177,
"step": 5855
},
{
"epoch": 1.6493104418801012,
"grad_norm": 0.4356767535209656,
"learning_rate": 1.8156084868839617e-05,
"loss": 0.094,
"step": 5860
},
{
"epoch": 1.6507177033492821,
"grad_norm": 2.0452256202697754,
"learning_rate": 1.8015195253292016e-05,
"loss": 0.3872,
"step": 5865
},
{
"epoch": 1.6521249648184633,
"grad_norm": 0.7345725297927856,
"learning_rate": 1.7874800280672953e-05,
"loss": 0.3794,
"step": 5870
},
{
"epoch": 1.6535322262876444,
"grad_norm": 0.5564286112785339,
"learning_rate": 1.773490079803436e-05,
"loss": 0.194,
"step": 5875
},
{
"epoch": 1.6549394877568253,
"grad_norm": 1.4534375667572021,
"learning_rate": 1.7595497649438565e-05,
"loss": 0.2468,
"step": 5880
},
{
"epoch": 1.6563467492260062,
"grad_norm": 1.159037709236145,
"learning_rate": 1.745659167595337e-05,
"loss": 0.2072,
"step": 5885
},
{
"epoch": 1.6577540106951871,
"grad_norm": 0.9856454133987427,
"learning_rate": 1.7318183715647017e-05,
"loss": 0.2057,
"step": 5890
},
{
"epoch": 1.659161272164368,
"grad_norm": 0.9816296696662903,
"learning_rate": 1.7180274603583035e-05,
"loss": 0.0591,
"step": 5895
},
{
"epoch": 1.660568533633549,
"grad_norm": 0.6953201293945312,
"learning_rate": 1.7042865171815158e-05,
"loss": 0.1549,
"step": 5900
},
{
"epoch": 1.66197579510273,
"grad_norm": 0.9859986901283264,
"learning_rate": 1.6905956249382448e-05,
"loss": 0.1446,
"step": 5905
},
{
"epoch": 1.6633830565719112,
"grad_norm": 2.2135300636291504,
"learning_rate": 1.6769548662304224e-05,
"loss": 0.2074,
"step": 5910
},
{
"epoch": 1.6647903180410921,
"grad_norm": 0.7724807858467102,
"learning_rate": 1.6633643233575014e-05,
"loss": 0.1867,
"step": 5915
},
{
"epoch": 1.666197579510273,
"grad_norm": 0.6000497341156006,
"learning_rate": 1.6498240783159656e-05,
"loss": 0.3259,
"step": 5920
},
{
"epoch": 1.667604840979454,
"grad_norm": 1.0605989694595337,
"learning_rate": 1.6363342127988435e-05,
"loss": 0.2042,
"step": 5925
},
{
"epoch": 1.6690121024486348,
"grad_norm": 0.4106568396091461,
"learning_rate": 1.6228948081951943e-05,
"loss": 0.1073,
"step": 5930
},
{
"epoch": 1.670419363917816,
"grad_norm": 0.9518342614173889,
"learning_rate": 1.6095059455896387e-05,
"loss": 0.1523,
"step": 5935
},
{
"epoch": 1.671826625386997,
"grad_norm": 0.7186952829360962,
"learning_rate": 1.596167705761852e-05,
"loss": 0.1052,
"step": 5940
},
{
"epoch": 1.673233886856178,
"grad_norm": 0.5331084728240967,
"learning_rate": 1.5828801691860895e-05,
"loss": 0.1007,
"step": 5945
},
{
"epoch": 1.674641148325359,
"grad_norm": 0.530546247959137,
"learning_rate": 1.5696434160306983e-05,
"loss": 0.0948,
"step": 5950
},
{
"epoch": 1.6760484097945398,
"grad_norm": 0.9805326461791992,
"learning_rate": 1.5564575261576254e-05,
"loss": 0.2097,
"step": 5955
},
{
"epoch": 1.6774556712637207,
"grad_norm": 0.8919891715049744,
"learning_rate": 1.5433225791219407e-05,
"loss": 0.1409,
"step": 5960
},
{
"epoch": 1.6788629327329017,
"grad_norm": 0.8015194535255432,
"learning_rate": 1.5302386541713687e-05,
"loss": 0.126,
"step": 5965
},
{
"epoch": 1.6802701942020828,
"grad_norm": 0.47212257981300354,
"learning_rate": 1.5172058302457881e-05,
"loss": 0.1573,
"step": 5970
},
{
"epoch": 1.6816774556712637,
"grad_norm": 0.6983383297920227,
"learning_rate": 1.5042241859767735e-05,
"loss": 0.1209,
"step": 5975
},
{
"epoch": 1.6830847171404448,
"grad_norm": 1.2159236669540405,
"learning_rate": 1.4912937996871168e-05,
"loss": 0.1802,
"step": 5980
},
{
"epoch": 1.6844919786096257,
"grad_norm": 0.764870822429657,
"learning_rate": 1.4784147493903455e-05,
"loss": 0.2714,
"step": 5985
},
{
"epoch": 1.6858992400788066,
"grad_norm": 0.9790758490562439,
"learning_rate": 1.4655871127902655e-05,
"loss": 0.2561,
"step": 5990
},
{
"epoch": 1.6873065015479876,
"grad_norm": 2.1390011310577393,
"learning_rate": 1.4528109672804835e-05,
"loss": 0.23,
"step": 5995
},
{
"epoch": 1.6887137630171685,
"grad_norm": 0.39941343665122986,
"learning_rate": 1.4400863899439387e-05,
"loss": 0.2019,
"step": 6000
},
{
"epoch": 1.6901210244863496,
"grad_norm": 0.6225385069847107,
"learning_rate": 1.42741345755245e-05,
"loss": 0.1884,
"step": 6005
},
{
"epoch": 1.6915282859555305,
"grad_norm": 0.7307006120681763,
"learning_rate": 1.4147922465662367e-05,
"loss": 0.1126,
"step": 6010
},
{
"epoch": 1.6929355474247116,
"grad_norm": 1.095548152923584,
"learning_rate": 1.4022228331334675e-05,
"loss": 0.1279,
"step": 6015
},
{
"epoch": 1.6943428088938925,
"grad_norm": 0.45030713081359863,
"learning_rate": 1.3897052930898035e-05,
"loss": 0.1378,
"step": 6020
},
{
"epoch": 1.6957500703630735,
"grad_norm": 1.7270435094833374,
"learning_rate": 1.3772397019579242e-05,
"loss": 0.2399,
"step": 6025
},
{
"epoch": 1.6971573318322544,
"grad_norm": 1.0650115013122559,
"learning_rate": 1.3648261349470948e-05,
"loss": 0.1895,
"step": 6030
},
{
"epoch": 1.6985645933014353,
"grad_norm": 1.0545300245285034,
"learning_rate": 1.352464666952694e-05,
"loss": 0.1122,
"step": 6035
},
{
"epoch": 1.6999718547706164,
"grad_norm": 1.0150022506713867,
"learning_rate": 1.3401553725557681e-05,
"loss": 0.1585,
"step": 6040
},
{
"epoch": 1.7013791162397973,
"grad_norm": 0.5082919001579285,
"learning_rate": 1.3278983260225875e-05,
"loss": 0.2291,
"step": 6045
},
{
"epoch": 1.7027863777089784,
"grad_norm": 0.9131124019622803,
"learning_rate": 1.3156936013041898e-05,
"loss": 0.1303,
"step": 6050
},
{
"epoch": 1.7041936391781594,
"grad_norm": 0.6868187189102173,
"learning_rate": 1.3035412720359353e-05,
"loss": 0.1357,
"step": 6055
},
{
"epoch": 1.7056009006473403,
"grad_norm": 0.8841606378555298,
"learning_rate": 1.2914414115370666e-05,
"loss": 0.1271,
"step": 6060
},
{
"epoch": 1.7070081621165212,
"grad_norm": 0.7348530888557434,
"learning_rate": 1.2793940928102654e-05,
"loss": 0.1773,
"step": 6065
},
{
"epoch": 1.708415423585702,
"grad_norm": 0.7667552828788757,
"learning_rate": 1.2673993885412073e-05,
"loss": 0.2278,
"step": 6070
},
{
"epoch": 1.7098226850548832,
"grad_norm": 1.5741273164749146,
"learning_rate": 1.2554573710981276e-05,
"loss": 0.1607,
"step": 6075
},
{
"epoch": 1.7112299465240641,
"grad_norm": 1.1054571866989136,
"learning_rate": 1.2435681125313803e-05,
"loss": 0.1732,
"step": 6080
},
{
"epoch": 1.7126372079932453,
"grad_norm": 1.193298101425171,
"learning_rate": 1.2317316845730131e-05,
"loss": 0.2668,
"step": 6085
},
{
"epoch": 1.7140444694624262,
"grad_norm": 0.5256794691085815,
"learning_rate": 1.2199481586363281e-05,
"loss": 0.1741,
"step": 6090
},
{
"epoch": 1.715451730931607,
"grad_norm": 1.2280601263046265,
"learning_rate": 1.2082176058154426e-05,
"loss": 0.1479,
"step": 6095
},
{
"epoch": 1.716858992400788,
"grad_norm": 1.0573979616165161,
"learning_rate": 1.196540096884876e-05,
"loss": 0.1264,
"step": 6100
},
{
"epoch": 1.718266253869969,
"grad_norm": 1.5370665788650513,
"learning_rate": 1.1849157022991163e-05,
"loss": 0.2142,
"step": 6105
},
{
"epoch": 1.71967351533915,
"grad_norm": 0.7827951312065125,
"learning_rate": 1.1733444921921899e-05,
"loss": 0.2057,
"step": 6110
},
{
"epoch": 1.721080776808331,
"grad_norm": 1.3667113780975342,
"learning_rate": 1.1618265363772407e-05,
"loss": 0.2746,
"step": 6115
},
{
"epoch": 1.722488038277512,
"grad_norm": 1.506797432899475,
"learning_rate": 1.15036190434612e-05,
"loss": 0.1855,
"step": 6120
},
{
"epoch": 1.723895299746693,
"grad_norm": 0.9613803029060364,
"learning_rate": 1.1389506652689474e-05,
"loss": 0.1031,
"step": 6125
},
{
"epoch": 1.7253025612158739,
"grad_norm": 1.2002402544021606,
"learning_rate": 1.1275928879937114e-05,
"loss": 0.1781,
"step": 6130
},
{
"epoch": 1.7267098226850548,
"grad_norm": 0.5957798361778259,
"learning_rate": 1.1162886410458462e-05,
"loss": 0.1176,
"step": 6135
},
{
"epoch": 1.7281170841542357,
"grad_norm": 0.9620370268821716,
"learning_rate": 1.1050379926278132e-05,
"loss": 0.1515,
"step": 6140
},
{
"epoch": 1.7295243456234168,
"grad_norm": 0.9195571541786194,
"learning_rate": 1.0938410106187046e-05,
"loss": 0.1121,
"step": 6145
},
{
"epoch": 1.730931607092598,
"grad_norm": 0.4538973867893219,
"learning_rate": 1.0826977625738155e-05,
"loss": 0.1129,
"step": 6150
},
{
"epoch": 1.7323388685617789,
"grad_norm": 1.3514046669006348,
"learning_rate": 1.0716083157242484e-05,
"loss": 0.1743,
"step": 6155
},
{
"epoch": 1.7337461300309598,
"grad_norm": 0.8769412636756897,
"learning_rate": 1.0605727369765072e-05,
"loss": 0.1615,
"step": 6160
},
{
"epoch": 1.7351533915001407,
"grad_norm": 1.3082162141799927,
"learning_rate": 1.0495910929120866e-05,
"loss": 0.1344,
"step": 6165
},
{
"epoch": 1.7365606529693216,
"grad_norm": 0.8667125105857849,
"learning_rate": 1.0386634497870751e-05,
"loss": 0.2135,
"step": 6170
},
{
"epoch": 1.7379679144385025,
"grad_norm": 0.7873309850692749,
"learning_rate": 1.0277898735317614e-05,
"loss": 0.1445,
"step": 6175
},
{
"epoch": 1.7393751759076836,
"grad_norm": 1.0749235153198242,
"learning_rate": 1.016970429750218e-05,
"loss": 0.1792,
"step": 6180
},
{
"epoch": 1.7407824373768648,
"grad_norm": 0.7576783299446106,
"learning_rate": 1.0062051837199282e-05,
"loss": 0.1597,
"step": 6185
},
{
"epoch": 1.7421896988460457,
"grad_norm": 0.7447710037231445,
"learning_rate": 9.954942003913758e-06,
"loss": 0.1363,
"step": 6190
},
{
"epoch": 1.7435969603152266,
"grad_norm": 0.756251335144043,
"learning_rate": 9.848375443876578e-06,
"loss": 0.1474,
"step": 6195
},
{
"epoch": 1.7450042217844075,
"grad_norm": 0.45274704694747925,
"learning_rate": 9.742352800040988e-06,
"loss": 0.065,
"step": 6200
},
{
"epoch": 1.7464114832535884,
"grad_norm": 1.0789294242858887,
"learning_rate": 9.636874712078603e-06,
"loss": 0.2623,
"step": 6205
},
{
"epoch": 1.7478187447227693,
"grad_norm": 1.4076869487762451,
"learning_rate": 9.531941816375501e-06,
"loss": 0.2516,
"step": 6210
},
{
"epoch": 1.7492260061919505,
"grad_norm": 2.701754331588745,
"learning_rate": 9.427554746028478e-06,
"loss": 0.2951,
"step": 6215
},
{
"epoch": 1.7506332676611316,
"grad_norm": 0.36146071553230286,
"learning_rate": 9.3237141308411e-06,
"loss": 0.0842,
"step": 6220
},
{
"epoch": 1.7520405291303125,
"grad_norm": 1.120956540107727,
"learning_rate": 9.22042059732008e-06,
"loss": 0.2894,
"step": 6225
},
{
"epoch": 1.7534477905994934,
"grad_norm": 0.5138603448867798,
"learning_rate": 9.117674768671313e-06,
"loss": 0.0713,
"step": 6230
},
{
"epoch": 1.7548550520686743,
"grad_norm": 0.8469157814979553,
"learning_rate": 9.015477264796202e-06,
"loss": 0.2038,
"step": 6235
},
{
"epoch": 1.7562623135378552,
"grad_norm": 1.5071958303451538,
"learning_rate": 8.913828702287974e-06,
"loss": 0.3285,
"step": 6240
},
{
"epoch": 1.7576695750070364,
"grad_norm": 1.6233199834823608,
"learning_rate": 8.812729694427879e-06,
"loss": 0.1192,
"step": 6245
},
{
"epoch": 1.7590768364762173,
"grad_norm": 0.884638786315918,
"learning_rate": 8.712180851181462e-06,
"loss": 0.1612,
"step": 6250
},
{
"epoch": 1.7604840979453984,
"grad_norm": 1.5049396753311157,
"learning_rate": 8.612182779195021e-06,
"loss": 0.1233,
"step": 6255
},
{
"epoch": 1.7618913594145793,
"grad_norm": 1.0843751430511475,
"learning_rate": 8.512736081791772e-06,
"loss": 0.2496,
"step": 6260
},
{
"epoch": 1.7632986208837602,
"grad_norm": 0.9301806688308716,
"learning_rate": 8.413841358968332e-06,
"loss": 0.2379,
"step": 6265
},
{
"epoch": 1.7647058823529411,
"grad_norm": 1.611035943031311,
"learning_rate": 8.315499207391075e-06,
"loss": 0.1856,
"step": 6270
},
{
"epoch": 1.766113143822122,
"grad_norm": 1.3043655157089233,
"learning_rate": 8.217710220392526e-06,
"loss": 0.1456,
"step": 6275
},
{
"epoch": 1.7675204052913032,
"grad_norm": 1.800098180770874,
"learning_rate": 8.12047498796773e-06,
"loss": 0.2416,
"step": 6280
},
{
"epoch": 1.768927666760484,
"grad_norm": 0.7097885608673096,
"learning_rate": 8.023794096770808e-06,
"loss": 0.141,
"step": 6285
},
{
"epoch": 1.7703349282296652,
"grad_norm": 1.1929750442504883,
"learning_rate": 7.927668130111243e-06,
"loss": 0.3433,
"step": 6290
},
{
"epoch": 1.7717421896988461,
"grad_norm": 1.647980809211731,
"learning_rate": 7.832097667950588e-06,
"loss": 0.2052,
"step": 6295
},
{
"epoch": 1.773149451168027,
"grad_norm": 0.43591317534446716,
"learning_rate": 7.737083286898749e-06,
"loss": 0.2104,
"step": 6300
},
{
"epoch": 1.774556712637208,
"grad_norm": 1.241782546043396,
"learning_rate": 7.642625560210637e-06,
"loss": 0.1109,
"step": 6305
},
{
"epoch": 1.7759639741063888,
"grad_norm": 0.9579405784606934,
"learning_rate": 7.548725057782658e-06,
"loss": 0.1786,
"step": 6310
},
{
"epoch": 1.77737123557557,
"grad_norm": 0.7312494516372681,
"learning_rate": 7.455382346149342e-06,
"loss": 0.1228,
"step": 6315
},
{
"epoch": 1.7787784970447509,
"grad_norm": 0.7087497711181641,
"learning_rate": 7.36259798847978e-06,
"loss": 0.1424,
"step": 6320
},
{
"epoch": 1.780185758513932,
"grad_norm": 1.6807194948196411,
"learning_rate": 7.2703725445744105e-06,
"loss": 0.1199,
"step": 6325
},
{
"epoch": 1.781593019983113,
"grad_norm": 1.101808786392212,
"learning_rate": 7.178706570861515e-06,
"loss": 0.0979,
"step": 6330
},
{
"epoch": 1.7830002814522938,
"grad_norm": 1.7121551036834717,
"learning_rate": 7.087600620393864e-06,
"loss": 0.101,
"step": 6335
},
{
"epoch": 1.7844075429214747,
"grad_norm": 0.6395900845527649,
"learning_rate": 6.997055242845441e-06,
"loss": 0.2197,
"step": 6340
},
{
"epoch": 1.7858148043906557,
"grad_norm": 0.9732767343521118,
"learning_rate": 6.907070984508124e-06,
"loss": 0.1321,
"step": 6345
},
{
"epoch": 1.7872220658598368,
"grad_norm": 1.2426737546920776,
"learning_rate": 6.8176483882883e-06,
"loss": 0.2246,
"step": 6350
},
{
"epoch": 1.7886293273290177,
"grad_norm": 1.6869935989379883,
"learning_rate": 6.728787993703733e-06,
"loss": 0.2733,
"step": 6355
},
{
"epoch": 1.7900365887981988,
"grad_norm": 0.49518850445747375,
"learning_rate": 6.640490336880134e-06,
"loss": 0.1142,
"step": 6360
},
{
"epoch": 1.7914438502673797,
"grad_norm": 0.7494794726371765,
"learning_rate": 6.552755950548095e-06,
"loss": 0.2115,
"step": 6365
},
{
"epoch": 1.7928511117365606,
"grad_norm": 0.7595309019088745,
"learning_rate": 6.465585364039795e-06,
"loss": 0.1135,
"step": 6370
},
{
"epoch": 1.7942583732057416,
"grad_norm": 0.7823693752288818,
"learning_rate": 6.378979103285765e-06,
"loss": 0.1422,
"step": 6375
},
{
"epoch": 1.7956656346749225,
"grad_norm": 1.9872539043426514,
"learning_rate": 6.292937690811795e-06,
"loss": 0.22,
"step": 6380
},
{
"epoch": 1.7970728961441036,
"grad_norm": 0.46582117676734924,
"learning_rate": 6.207461645735746e-06,
"loss": 0.1519,
"step": 6385
},
{
"epoch": 1.7984801576132845,
"grad_norm": 0.40433597564697266,
"learning_rate": 6.122551483764416e-06,
"loss": 0.2422,
"step": 6390
},
{
"epoch": 1.7998874190824656,
"grad_norm": 1.4909939765930176,
"learning_rate": 6.038207717190436e-06,
"loss": 0.1638,
"step": 6395
},
{
"epoch": 1.8012946805516465,
"grad_norm": 0.7252668738365173,
"learning_rate": 5.954430854889182e-06,
"loss": 0.1053,
"step": 6400
},
{
"epoch": 1.8027019420208275,
"grad_norm": 1.4477570056915283,
"learning_rate": 5.871221402315674e-06,
"loss": 0.1934,
"step": 6405
},
{
"epoch": 1.8041092034900084,
"grad_norm": 0.43066859245300293,
"learning_rate": 5.788579861501597e-06,
"loss": 0.114,
"step": 6410
},
{
"epoch": 1.8055164649591893,
"grad_norm": 1.1360474824905396,
"learning_rate": 5.706506731052175e-06,
"loss": 0.1447,
"step": 6415
},
{
"epoch": 1.8069237264283704,
"grad_norm": 0.6951930522918701,
"learning_rate": 5.625002506143218e-06,
"loss": 0.1401,
"step": 6420
},
{
"epoch": 1.8083309878975513,
"grad_norm": 1.213666319847107,
"learning_rate": 5.544067678518194e-06,
"loss": 0.1737,
"step": 6425
},
{
"epoch": 1.8097382493667324,
"grad_norm": 0.9512806534767151,
"learning_rate": 5.46370273648511e-06,
"loss": 0.1517,
"step": 6430
},
{
"epoch": 1.8111455108359134,
"grad_norm": 1.4045182466506958,
"learning_rate": 5.3839081649137205e-06,
"loss": 0.1899,
"step": 6435
},
{
"epoch": 1.8125527723050943,
"grad_norm": 0.579311192035675,
"learning_rate": 5.304684445232522e-06,
"loss": 0.1442,
"step": 6440
},
{
"epoch": 1.8139600337742752,
"grad_norm": 1.6119418144226074,
"learning_rate": 5.2260320554258225e-06,
"loss": 0.1473,
"step": 6445
},
{
"epoch": 1.815367295243456,
"grad_norm": 1.2963722944259644,
"learning_rate": 5.147951470030976e-06,
"loss": 0.227,
"step": 6450
},
{
"epoch": 1.8167745567126372,
"grad_norm": 1.3112095594406128,
"learning_rate": 5.070443160135352e-06,
"loss": 0.116,
"step": 6455
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.49451136589050293,
"learning_rate": 4.993507593373625e-06,
"loss": 0.2077,
"step": 6460
},
{
"epoch": 1.8195890796509993,
"grad_norm": 1.0468064546585083,
"learning_rate": 4.917145233924924e-06,
"loss": 0.246,
"step": 6465
},
{
"epoch": 1.8209963411201802,
"grad_norm": 0.5947392582893372,
"learning_rate": 4.841356542510022e-06,
"loss": 0.1534,
"step": 6470
},
{
"epoch": 1.822403602589361,
"grad_norm": 0.3909468352794647,
"learning_rate": 4.766141976388494e-06,
"loss": 0.1792,
"step": 6475
},
{
"epoch": 1.823810864058542,
"grad_norm": 0.911483645439148,
"learning_rate": 4.691501989356084e-06,
"loss": 0.2147,
"step": 6480
},
{
"epoch": 1.825218125527723,
"grad_norm": 0.5338053703308105,
"learning_rate": 4.617437031741867e-06,
"loss": 0.0811,
"step": 6485
},
{
"epoch": 1.826625386996904,
"grad_norm": 0.5877882242202759,
"learning_rate": 4.54394755040558e-06,
"loss": 0.1473,
"step": 6490
},
{
"epoch": 1.8280326484660852,
"grad_norm": 0.21510696411132812,
"learning_rate": 4.471033988734885e-06,
"loss": 0.2545,
"step": 6495
},
{
"epoch": 1.829439909935266,
"grad_norm": 1.325976014137268,
"learning_rate": 4.398696786642731e-06,
"loss": 0.1934,
"step": 6500
},
{
"epoch": 1.830847171404447,
"grad_norm": 0.5961741805076599,
"learning_rate": 4.326936380564705e-06,
"loss": 0.1732,
"step": 6505
},
{
"epoch": 1.8322544328736279,
"grad_norm": 1.4790091514587402,
"learning_rate": 4.255753203456392e-06,
"loss": 0.1699,
"step": 6510
},
{
"epoch": 1.8336616943428088,
"grad_norm": 0.5095391869544983,
"learning_rate": 4.185147684790691e-06,
"loss": 0.1335,
"step": 6515
},
{
"epoch": 1.83506895581199,
"grad_norm": 0.5565084218978882,
"learning_rate": 4.115120250555349e-06,
"loss": 0.1748,
"step": 6520
},
{
"epoch": 1.8364762172811708,
"grad_norm": 1.2198169231414795,
"learning_rate": 4.045671323250333e-06,
"loss": 0.2197,
"step": 6525
},
{
"epoch": 1.837883478750352,
"grad_norm": 0.4299394488334656,
"learning_rate": 3.976801321885215e-06,
"loss": 0.1229,
"step": 6530
},
{
"epoch": 1.8392907402195329,
"grad_norm": 0.8082312345504761,
"learning_rate": 3.908510661976739e-06,
"loss": 0.2784,
"step": 6535
},
{
"epoch": 1.8406980016887138,
"grad_norm": 0.7714455723762512,
"learning_rate": 3.840799755546298e-06,
"loss": 0.1128,
"step": 6540
},
{
"epoch": 1.8421052631578947,
"grad_norm": 1.8380225896835327,
"learning_rate": 3.773669011117398e-06,
"loss": 0.2196,
"step": 6545
},
{
"epoch": 1.8435125246270756,
"grad_norm": 1.4072784185409546,
"learning_rate": 3.707118833713241e-06,
"loss": 0.1164,
"step": 6550
},
{
"epoch": 1.8449197860962567,
"grad_norm": 2.7376558780670166,
"learning_rate": 3.6411496248542897e-06,
"loss": 0.1715,
"step": 6555
},
{
"epoch": 1.8463270475654376,
"grad_norm": 1.3996756076812744,
"learning_rate": 3.5757617825557533e-06,
"loss": 0.1792,
"step": 6560
},
{
"epoch": 1.8477343090346188,
"grad_norm": 1.6355584859848022,
"learning_rate": 3.5109557013253357e-06,
"loss": 0.1213,
"step": 6565
},
{
"epoch": 1.8491415705037997,
"grad_norm": 0.6846399903297424,
"learning_rate": 3.446731772160716e-06,
"loss": 0.1431,
"step": 6570
},
{
"epoch": 1.8505488319729806,
"grad_norm": 1.0300202369689941,
"learning_rate": 3.3830903825472493e-06,
"loss": 0.1996,
"step": 6575
},
{
"epoch": 1.8519560934421615,
"grad_norm": 0.8449344038963318,
"learning_rate": 3.3200319164556683e-06,
"loss": 0.1457,
"step": 6580
},
{
"epoch": 1.8533633549113424,
"grad_norm": 0.8704646825790405,
"learning_rate": 3.2575567543396746e-06,
"loss": 0.1493,
"step": 6585
},
{
"epoch": 1.8547706163805235,
"grad_norm": 1.0447726249694824,
"learning_rate": 3.195665273133719e-06,
"loss": 0.2999,
"step": 6590
},
{
"epoch": 1.8561778778497044,
"grad_norm": 0.6128522157669067,
"learning_rate": 3.134357846250735e-06,
"loss": 0.0989,
"step": 6595
},
{
"epoch": 1.8575851393188856,
"grad_norm": 0.7889478802680969,
"learning_rate": 3.073634843579776e-06,
"loss": 0.1107,
"step": 6600
},
{
"epoch": 1.8589924007880665,
"grad_norm": 1.114986777305603,
"learning_rate": 3.0134966314839144e-06,
"loss": 0.0739,
"step": 6605
},
{
"epoch": 1.8603996622572474,
"grad_norm": 0.4977349638938904,
"learning_rate": 2.953943572797968e-06,
"loss": 0.0591,
"step": 6610
},
{
"epoch": 1.8618069237264283,
"grad_norm": 0.6706826686859131,
"learning_rate": 2.8949760268263017e-06,
"loss": 0.1383,
"step": 6615
},
{
"epoch": 1.8632141851956092,
"grad_norm": 0.6721628308296204,
"learning_rate": 2.8365943493406934e-06,
"loss": 0.1539,
"step": 6620
},
{
"epoch": 1.8646214466647903,
"grad_norm": 0.6661956310272217,
"learning_rate": 2.7787988925782048e-06,
"loss": 0.1833,
"step": 6625
},
{
"epoch": 1.8660287081339713,
"grad_norm": 1.3089790344238281,
"learning_rate": 2.7215900052389497e-06,
"loss": 0.1368,
"step": 6630
},
{
"epoch": 1.8674359696031524,
"grad_norm": 1.6742780208587646,
"learning_rate": 2.6649680324841166e-06,
"loss": 0.2486,
"step": 6635
},
{
"epoch": 1.8688432310723333,
"grad_norm": 0.8076462745666504,
"learning_rate": 2.608933315933837e-06,
"loss": 0.115,
"step": 6640
},
{
"epoch": 1.8702504925415142,
"grad_norm": 1.4497947692871094,
"learning_rate": 2.5534861936650665e-06,
"loss": 0.1807,
"step": 6645
},
{
"epoch": 1.8716577540106951,
"grad_norm": 0.8782854676246643,
"learning_rate": 2.4986270002096747e-06,
"loss": 0.1052,
"step": 6650
},
{
"epoch": 1.873065015479876,
"grad_norm": 0.6687735915184021,
"learning_rate": 2.4443560665523e-06,
"loss": 0.2023,
"step": 6655
},
{
"epoch": 1.8744722769490572,
"grad_norm": 0.698962390422821,
"learning_rate": 2.3906737201284002e-06,
"loss": 0.1023,
"step": 6660
},
{
"epoch": 1.875879538418238,
"grad_norm": 1.2811174392700195,
"learning_rate": 2.3375802848223385e-06,
"loss": 0.1374,
"step": 6665
},
{
"epoch": 1.8772867998874192,
"grad_norm": 0.8447235226631165,
"learning_rate": 2.285076080965287e-06,
"loss": 0.1569,
"step": 6670
},
{
"epoch": 1.8786940613566,
"grad_norm": 0.6996911764144897,
"learning_rate": 2.233161425333474e-06,
"loss": 0.1395,
"step": 6675
},
{
"epoch": 1.880101322825781,
"grad_norm": 1.388584852218628,
"learning_rate": 2.1818366311460946e-06,
"loss": 0.1692,
"step": 6680
},
{
"epoch": 1.881508584294962,
"grad_norm": 0.5281504988670349,
"learning_rate": 2.1311020080635346e-06,
"loss": 0.1218,
"step": 6685
},
{
"epoch": 1.8829158457641428,
"grad_norm": 0.8310534954071045,
"learning_rate": 2.080957862185484e-06,
"loss": 0.2253,
"step": 6690
},
{
"epoch": 1.884323107233324,
"grad_norm": 0.5924013257026672,
"learning_rate": 2.031404496049072e-06,
"loss": 0.0862,
"step": 6695
},
{
"epoch": 1.8857303687025049,
"grad_norm": 0.445305198431015,
"learning_rate": 1.982442208627033e-06,
"loss": 0.2208,
"step": 6700
},
{
"epoch": 1.887137630171686,
"grad_norm": 0.66776442527771,
"learning_rate": 1.9340712953259565e-06,
"loss": 0.159,
"step": 6705
},
{
"epoch": 1.888544891640867,
"grad_norm": 0.8003804683685303,
"learning_rate": 1.886292047984395e-06,
"loss": 0.1276,
"step": 6710
},
{
"epoch": 1.8899521531100478,
"grad_norm": 1.1968119144439697,
"learning_rate": 1.839104754871257e-06,
"loss": 0.1147,
"step": 6715
},
{
"epoch": 1.8913594145792287,
"grad_norm": 2.06772518157959,
"learning_rate": 1.7925097006839198e-06,
"loss": 0.1263,
"step": 6720
},
{
"epoch": 1.8927666760484096,
"grad_norm": 0.8591898083686829,
"learning_rate": 1.746507166546596e-06,
"loss": 0.1612,
"step": 6725
},
{
"epoch": 1.8941739375175908,
"grad_norm": 1.3790104389190674,
"learning_rate": 1.7010974300086358e-06,
"loss": 0.1714,
"step": 6730
},
{
"epoch": 1.8955811989867717,
"grad_norm": 0.6857600808143616,
"learning_rate": 1.656280765042828e-06,
"loss": 0.1331,
"step": 6735
},
{
"epoch": 1.8969884604559528,
"grad_norm": 0.9561905860900879,
"learning_rate": 1.612057442043724e-06,
"loss": 0.13,
"step": 6740
},
{
"epoch": 1.8983957219251337,
"grad_norm": 1.3840196132659912,
"learning_rate": 1.5684277278260718e-06,
"loss": 0.2562,
"step": 6745
},
{
"epoch": 1.8998029833943146,
"grad_norm": 0.6963467001914978,
"learning_rate": 1.525391885623173e-06,
"loss": 0.1882,
"step": 6750
},
{
"epoch": 1.9012102448634955,
"grad_norm": 0.9500248432159424,
"learning_rate": 1.4829501750852626e-06,
"loss": 0.131,
"step": 6755
},
{
"epoch": 1.9026175063326765,
"grad_norm": 0.8108523488044739,
"learning_rate": 1.4411028522779757e-06,
"loss": 0.1891,
"step": 6760
},
{
"epoch": 1.9040247678018576,
"grad_norm": 0.6868911981582642,
"learning_rate": 1.3998501696808274e-06,
"loss": 0.1761,
"step": 6765
},
{
"epoch": 1.9054320292710387,
"grad_norm": 1.8471946716308594,
"learning_rate": 1.3591923761856363e-06,
"loss": 0.2683,
"step": 6770
},
{
"epoch": 1.9068392907402196,
"grad_norm": 0.5496200919151306,
"learning_rate": 1.3191297170950578e-06,
"loss": 0.1627,
"step": 6775
},
{
"epoch": 1.9082465522094005,
"grad_norm": 0.7432734370231628,
"learning_rate": 1.2796624341210873e-06,
"loss": 0.1406,
"step": 6780
},
{
"epoch": 1.9096538136785814,
"grad_norm": 0.773916482925415,
"learning_rate": 1.2407907653836038e-06,
"loss": 0.1308,
"step": 6785
},
{
"epoch": 1.9110610751477624,
"grad_norm": 1.0941839218139648,
"learning_rate": 1.2025149454089723e-06,
"loss": 0.1269,
"step": 6790
},
{
"epoch": 1.9124683366169433,
"grad_norm": 0.5930225253105164,
"learning_rate": 1.1648352051285448e-06,
"loss": 0.1393,
"step": 6795
},
{
"epoch": 1.9138755980861244,
"grad_norm": 0.38355159759521484,
"learning_rate": 1.127751771877339e-06,
"loss": 0.128,
"step": 6800
},
{
"epoch": 1.9152828595553055,
"grad_norm": 0.8687125444412231,
"learning_rate": 1.0912648693926497e-06,
"loss": 0.128,
"step": 6805
},
{
"epoch": 1.9166901210244864,
"grad_norm": 0.9181435704231262,
"learning_rate": 1.055374717812696e-06,
"loss": 0.2078,
"step": 6810
},
{
"epoch": 1.9180973824936673,
"grad_norm": 1.5709048509597778,
"learning_rate": 1.0200815336752657e-06,
"loss": 0.1745,
"step": 6815
},
{
"epoch": 1.9195046439628483,
"grad_norm": 0.8740848302841187,
"learning_rate": 9.853855299164717e-07,
"loss": 0.1209,
"step": 6820
},
{
"epoch": 1.9209119054320292,
"grad_norm": 0.46822214126586914,
"learning_rate": 9.512869158693982e-07,
"loss": 0.1031,
"step": 6825
},
{
"epoch": 1.9223191669012103,
"grad_norm": 0.6493380665779114,
"learning_rate": 9.177858972628794e-07,
"loss": 0.1665,
"step": 6830
},
{
"epoch": 1.9237264283703912,
"grad_norm": 0.628223180770874,
"learning_rate": 8.848826762202556e-07,
"loss": 0.1375,
"step": 6835
},
{
"epoch": 1.9251336898395723,
"grad_norm": 0.8677277565002441,
"learning_rate": 8.525774512581297e-07,
"loss": 0.1193,
"step": 6840
},
{
"epoch": 1.9265409513087532,
"grad_norm": 0.34191542863845825,
"learning_rate": 8.208704172851911e-07,
"loss": 0.1605,
"step": 6845
},
{
"epoch": 1.9279482127779342,
"grad_norm": 0.3965689539909363,
"learning_rate": 7.897617656010381e-07,
"loss": 0.2008,
"step": 6850
},
{
"epoch": 1.929355474247115,
"grad_norm": 1.651140809059143,
"learning_rate": 7.592516838950348e-07,
"loss": 0.259,
"step": 6855
},
{
"epoch": 1.930762735716296,
"grad_norm": 1.2457526922225952,
"learning_rate": 7.293403562451229e-07,
"loss": 0.1243,
"step": 6860
},
{
"epoch": 1.932169997185477,
"grad_norm": 0.42919033765792847,
"learning_rate": 7.000279631168005e-07,
"loss": 0.0686,
"step": 6865
},
{
"epoch": 1.933577258654658,
"grad_norm": 1.004384160041809,
"learning_rate": 6.713146813619564e-07,
"loss": 0.1132,
"step": 6870
},
{
"epoch": 1.9349845201238391,
"grad_norm": 0.7319831252098083,
"learning_rate": 6.432006842178262e-07,
"loss": 0.0594,
"step": 6875
},
{
"epoch": 1.93639178159302,
"grad_norm": 0.9444944262504578,
"learning_rate": 6.156861413059601e-07,
"loss": 0.1181,
"step": 6880
},
{
"epoch": 1.937799043062201,
"grad_norm": 1.6310319900512695,
"learning_rate": 5.887712186312011e-07,
"loss": 0.2333,
"step": 6885
},
{
"epoch": 1.9392063045313819,
"grad_norm": 0.7760756015777588,
"learning_rate": 5.624560785806754e-07,
"loss": 0.1101,
"step": 6890
},
{
"epoch": 1.9406135660005628,
"grad_norm": 1.4316829442977905,
"learning_rate": 5.367408799227925e-07,
"loss": 0.1512,
"step": 6895
},
{
"epoch": 1.942020827469744,
"grad_norm": 0.6632144451141357,
"learning_rate": 5.116257778063238e-07,
"loss": 0.176,
"step": 6900
},
{
"epoch": 1.9434280889389248,
"grad_norm": 0.4353666603565216,
"learning_rate": 4.871109237594373e-07,
"loss": 0.1293,
"step": 6905
},
{
"epoch": 1.944835350408106,
"grad_norm": 2.0593976974487305,
"learning_rate": 4.631964656888088e-07,
"loss": 0.4206,
"step": 6910
},
{
"epoch": 1.9462426118772869,
"grad_norm": 0.8553899526596069,
"learning_rate": 4.3988254787868945e-07,
"loss": 0.2033,
"step": 6915
},
{
"epoch": 1.9476498733464678,
"grad_norm": 2.4069225788116455,
"learning_rate": 4.171693109900954e-07,
"loss": 0.1747,
"step": 6920
},
{
"epoch": 1.9490571348156487,
"grad_norm": 1.0317012071609497,
"learning_rate": 3.950568920598974e-07,
"loss": 0.1857,
"step": 6925
},
{
"epoch": 1.9504643962848296,
"grad_norm": 0.16559715569019318,
"learning_rate": 3.735454245000436e-07,
"loss": 0.1506,
"step": 6930
},
{
"epoch": 1.9518716577540107,
"grad_norm": 1.008353590965271,
"learning_rate": 3.526350380967047e-07,
"loss": 0.1661,
"step": 6935
},
{
"epoch": 1.9532789192231916,
"grad_norm": 0.8605316877365112,
"learning_rate": 3.323258590095635e-07,
"loss": 0.1547,
"step": 6940
},
{
"epoch": 1.9546861806923728,
"grad_norm": 0.8140857815742493,
"learning_rate": 3.126180097709597e-07,
"loss": 0.204,
"step": 6945
},
{
"epoch": 1.9560934421615537,
"grad_norm": 0.250213086605072,
"learning_rate": 2.9351160928522416e-07,
"loss": 0.1531,
"step": 6950
},
{
"epoch": 1.9575007036307346,
"grad_norm": 2.0146706104278564,
"learning_rate": 2.7500677282795704e-07,
"loss": 0.135,
"step": 6955
},
{
"epoch": 1.9589079650999155,
"grad_norm": 0.43031638860702515,
"learning_rate": 2.57103612045273e-07,
"loss": 0.1118,
"step": 6960
},
{
"epoch": 1.9603152265690964,
"grad_norm": 1.1351455450057983,
"learning_rate": 2.3980223495319034e-07,
"loss": 0.1474,
"step": 6965
},
{
"epoch": 1.9617224880382775,
"grad_norm": 0.6760854721069336,
"learning_rate": 2.231027459369539e-07,
"loss": 0.1577,
"step": 6970
},
{
"epoch": 1.9631297495074584,
"grad_norm": 0.6344230771064758,
"learning_rate": 2.0700524575041347e-07,
"loss": 0.0911,
"step": 6975
},
{
"epoch": 1.9645370109766396,
"grad_norm": 0.8816024661064148,
"learning_rate": 1.915098315153907e-07,
"loss": 0.1711,
"step": 6980
},
{
"epoch": 1.9659442724458205,
"grad_norm": 1.2508419752120972,
"learning_rate": 1.766165967211464e-07,
"loss": 0.2165,
"step": 6985
},
{
"epoch": 1.9673515339150014,
"grad_norm": 0.9682034254074097,
"learning_rate": 1.6232563122373645e-07,
"loss": 0.1176,
"step": 6990
},
{
"epoch": 1.9687587953841823,
"grad_norm": 0.5194812417030334,
"learning_rate": 1.4863702124554567e-07,
"loss": 0.1792,
"step": 6995
},
{
"epoch": 1.9701660568533632,
"grad_norm": 0.7501698136329651,
"learning_rate": 1.3555084937475483e-07,
"loss": 0.1375,
"step": 7000
},
{
"epoch": 1.9715733183225443,
"grad_norm": 0.8848897218704224,
"learning_rate": 1.2306719456478544e-07,
"loss": 0.1218,
"step": 7005
},
{
"epoch": 1.9729805797917253,
"grad_norm": 0.5296036601066589,
"learning_rate": 1.1118613213388918e-07,
"loss": 0.0949,
"step": 7010
},
{
"epoch": 1.9743878412609064,
"grad_norm": 0.5823400616645813,
"learning_rate": 9.990773376464812e-08,
"loss": 0.1266,
"step": 7015
},
{
"epoch": 1.9757951027300873,
"grad_norm": 1.2051528692245483,
"learning_rate": 8.923206750359736e-08,
"loss": 0.1841,
"step": 7020
},
{
"epoch": 1.9772023641992682,
"grad_norm": 2.1660141944885254,
"learning_rate": 7.915919776073644e-08,
"loss": 0.1758,
"step": 7025
},
{
"epoch": 1.9786096256684491,
"grad_norm": 0.9142996072769165,
"learning_rate": 6.968918530920742e-08,
"loss": 0.2226,
"step": 7030
},
{
"epoch": 1.98001688713763,
"grad_norm": 2.0500295162200928,
"learning_rate": 6.082208728490635e-08,
"loss": 0.1638,
"step": 7035
},
{
"epoch": 1.9814241486068112,
"grad_norm": 0.7084165811538696,
"learning_rate": 5.255795718611678e-08,
"loss": 0.1535,
"step": 7040
},
{
"epoch": 1.9828314100759923,
"grad_norm": 0.5557725429534912,
"learning_rate": 4.489684487322121e-08,
"loss": 0.1053,
"step": 7045
},
{
"epoch": 1.9842386715451732,
"grad_norm": 0.3313843905925751,
"learning_rate": 3.783879656840128e-08,
"loss": 0.1593,
"step": 7050
},
{
"epoch": 1.985645933014354,
"grad_norm": 2.084636688232422,
"learning_rate": 3.1383854855304705e-08,
"loss": 0.1938,
"step": 7055
},
{
"epoch": 1.987053194483535,
"grad_norm": 0.47041577100753784,
"learning_rate": 2.553205867884545e-08,
"loss": 0.0875,
"step": 7060
},
{
"epoch": 1.988460455952716,
"grad_norm": 0.6036000847816467,
"learning_rate": 2.0283443344959464e-08,
"loss": 0.064,
"step": 7065
},
{
"epoch": 1.9898677174218968,
"grad_norm": 0.40105298161506653,
"learning_rate": 1.5638040520382646e-08,
"loss": 0.1467,
"step": 7070
},
{
"epoch": 1.991274978891078,
"grad_norm": 0.8283329606056213,
"learning_rate": 1.1595878232428803e-08,
"loss": 0.1675,
"step": 7075
},
{
"epoch": 1.992682240360259,
"grad_norm": 0.612358570098877,
"learning_rate": 8.15698086888972e-09,
"loss": 0.1813,
"step": 7080
},
{
"epoch": 1.99408950182944,
"grad_norm": 0.3482489287853241,
"learning_rate": 5.321369177835323e-09,
"loss": 0.1543,
"step": 7085
},
{
"epoch": 1.995496763298621,
"grad_norm": 0.9294025301933289,
"learning_rate": 3.089060267480459e-09,
"loss": 0.1197,
"step": 7090
},
{
"epoch": 1.9969040247678018,
"grad_norm": 1.7287979125976562,
"learning_rate": 1.4600676061404805e-09,
"loss": 0.1638,
"step": 7095
},
{
"epoch": 1.9983112862369827,
"grad_norm": 0.451955109834671,
"learning_rate": 4.344010220980188e-10,
"loss": 0.2378,
"step": 7100
},
{
"epoch": 1.9997185477061636,
"grad_norm": 0.541246771812439,
"learning_rate": 1.20667035474753e-11,
"loss": 0.1537,
"step": 7105
},
{
"epoch": 2.0,
"step": 7106,
"total_flos": 1.54790643235396e+18,
"train_loss": 0.3593486731773929,
"train_runtime": 16225.5696,
"train_samples_per_second": 3.503,
"train_steps_per_second": 0.438
}
],
"logging_steps": 5,
"max_steps": 7106,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.54790643235396e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}