{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.004443325581607889,
"eval_steps": 335,
"global_step": 1337,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 3.323354960065736e-06,
"grad_norm": 2.593838930130005,
"learning_rate": 2e-05,
"loss": 1.1307,
"step": 1
},
{
"epoch": 3.323354960065736e-06,
"eval_loss": 0.82511967420578,
"eval_runtime": 3228.7238,
"eval_samples_per_second": 39.24,
"eval_steps_per_second": 19.62,
"step": 1
},
{
"epoch": 6.646709920131472e-06,
"grad_norm": 1.9493608474731445,
"learning_rate": 4e-05,
"loss": 0.8578,
"step": 2
},
{
"epoch": 9.970064880197207e-06,
"grad_norm": 2.583665370941162,
"learning_rate": 6e-05,
"loss": 0.6069,
"step": 3
},
{
"epoch": 1.3293419840262944e-05,
"grad_norm": 3.607430934906006,
"learning_rate": 8e-05,
"loss": 0.7483,
"step": 4
},
{
"epoch": 1.661677480032868e-05,
"grad_norm": 2.763707399368286,
"learning_rate": 0.0001,
"loss": 1.0399,
"step": 5
},
{
"epoch": 1.9940129760394414e-05,
"grad_norm": 6.582963466644287,
"learning_rate": 0.00012,
"loss": 1.3938,
"step": 6
},
{
"epoch": 2.326348472046015e-05,
"grad_norm": 3.642049551010132,
"learning_rate": 0.00014,
"loss": 0.6619,
"step": 7
},
{
"epoch": 2.6586839680525888e-05,
"grad_norm": 2.402776002883911,
"learning_rate": 0.00016,
"loss": 0.9859,
"step": 8
},
{
"epoch": 2.9910194640591625e-05,
"grad_norm": 3.6133182048797607,
"learning_rate": 0.00018,
"loss": 1.3822,
"step": 9
},
{
"epoch": 3.323354960065736e-05,
"grad_norm": 2.660515546798706,
"learning_rate": 0.0002,
"loss": 0.9685,
"step": 10
},
{
"epoch": 3.65569045607231e-05,
"grad_norm": 3.0625710487365723,
"learning_rate": 0.0001999997197615636,
"loss": 1.1394,
"step": 11
},
{
"epoch": 3.988025952078883e-05,
"grad_norm": 3.345867872238159,
"learning_rate": 0.0001999988790478251,
"loss": 0.7105,
"step": 12
},
{
"epoch": 4.3203614480854566e-05,
"grad_norm": 3.0021090507507324,
"learning_rate": 0.00019999747786349646,
"loss": 0.7867,
"step": 13
},
{
"epoch": 4.65269694409203e-05,
"grad_norm": 2.038267135620117,
"learning_rate": 0.000199995516216431,
"loss": 0.7967,
"step": 14
},
{
"epoch": 4.985032440098604e-05,
"grad_norm": 2.1804399490356445,
"learning_rate": 0.00019999299411762334,
"loss": 0.6677,
"step": 15
},
{
"epoch": 5.3173679361051776e-05,
"grad_norm": 2.740802526473999,
"learning_rate": 0.0001999899115812092,
"loss": 0.6612,
"step": 16
},
{
"epoch": 5.649703432111751e-05,
"grad_norm": 2.2491555213928223,
"learning_rate": 0.00019998626862446556,
"loss": 0.7864,
"step": 17
},
{
"epoch": 5.982038928118325e-05,
"grad_norm": 2.0781562328338623,
"learning_rate": 0.0001999820652678103,
"loss": 0.9538,
"step": 18
},
{
"epoch": 6.314374424124898e-05,
"grad_norm": 1.8750240802764893,
"learning_rate": 0.00019997730153480228,
"loss": 0.6619,
"step": 19
},
{
"epoch": 6.646709920131472e-05,
"grad_norm": 1.6040363311767578,
"learning_rate": 0.00019997197745214108,
"loss": 0.7326,
"step": 20
},
{
"epoch": 6.979045416138045e-05,
"grad_norm": 2.0579588413238525,
"learning_rate": 0.000199966093049667,
"loss": 0.7618,
"step": 21
},
{
"epoch": 7.31138091214462e-05,
"grad_norm": 2.5934383869171143,
"learning_rate": 0.00019995964836036075,
"loss": 1.2188,
"step": 22
},
{
"epoch": 7.643716408151193e-05,
"grad_norm": 1.8951530456542969,
"learning_rate": 0.00019995264342034328,
"loss": 0.8864,
"step": 23
},
{
"epoch": 7.976051904157766e-05,
"grad_norm": 2.207963705062866,
"learning_rate": 0.00019994507826887574,
"loss": 0.6334,
"step": 24
},
{
"epoch": 8.30838740016434e-05,
"grad_norm": 1.5225898027420044,
"learning_rate": 0.00019993695294835898,
"loss": 0.5909,
"step": 25
},
{
"epoch": 8.640722896170913e-05,
"grad_norm": 2.108294725418091,
"learning_rate": 0.00019992826750433356,
"loss": 0.7502,
"step": 26
},
{
"epoch": 8.973058392177487e-05,
"grad_norm": 2.806995153427124,
"learning_rate": 0.00019991902198547942,
"loss": 0.9968,
"step": 27
},
{
"epoch": 9.30539388818406e-05,
"grad_norm": 2.3967955112457275,
"learning_rate": 0.00019990921644361546,
"loss": 0.5702,
"step": 28
},
{
"epoch": 9.637729384190635e-05,
"grad_norm": 3.112006187438965,
"learning_rate": 0.0001998988509336996,
"loss": 0.9814,
"step": 29
},
{
"epoch": 9.970064880197208e-05,
"grad_norm": 3.5534136295318604,
"learning_rate": 0.00019988792551382806,
"loss": 1.1793,
"step": 30
},
{
"epoch": 0.00010302400376203781,
"grad_norm": 2.224628448486328,
"learning_rate": 0.0001998764402452353,
"loss": 0.7663,
"step": 31
},
{
"epoch": 0.00010634735872210355,
"grad_norm": 4.616275310516357,
"learning_rate": 0.0001998643951922936,
"loss": 1.038,
"step": 32
},
{
"epoch": 0.00010967071368216928,
"grad_norm": 2.729283332824707,
"learning_rate": 0.00019985179042251267,
"loss": 1.1306,
"step": 33
},
{
"epoch": 0.00011299406864223503,
"grad_norm": 2.5778391361236572,
"learning_rate": 0.00019983862600653936,
"loss": 0.9213,
"step": 34
},
{
"epoch": 0.00011631742360230076,
"grad_norm": 2.805159568786621,
"learning_rate": 0.00019982490201815716,
"loss": 1.2159,
"step": 35
},
{
"epoch": 0.0001196407785623665,
"grad_norm": 3.2575483322143555,
"learning_rate": 0.00019981061853428588,
"loss": 1.14,
"step": 36
},
{
"epoch": 0.00012296413352243224,
"grad_norm": 2.3567512035369873,
"learning_rate": 0.00019979577563498108,
"loss": 0.5381,
"step": 37
},
{
"epoch": 0.00012628748848249796,
"grad_norm": 2.3204903602600098,
"learning_rate": 0.00019978037340343384,
"loss": 0.7389,
"step": 38
},
{
"epoch": 0.0001296108434425637,
"grad_norm": 2.385420083999634,
"learning_rate": 0.00019976441192597012,
"loss": 0.9223,
"step": 39
},
{
"epoch": 0.00013293419840262945,
"grad_norm": 2.0204904079437256,
"learning_rate": 0.00019974789129205024,
"loss": 1.0554,
"step": 40
},
{
"epoch": 0.00013625755336269516,
"grad_norm": 3.0869300365448,
"learning_rate": 0.00019973081159426856,
"loss": 1.0152,
"step": 41
},
{
"epoch": 0.0001395809083227609,
"grad_norm": 2.0119707584381104,
"learning_rate": 0.0001997131729283529,
"loss": 0.7922,
"step": 42
},
{
"epoch": 0.00014290426328282665,
"grad_norm": 2.9331247806549072,
"learning_rate": 0.00019969497539316374,
"loss": 0.9397,
"step": 43
},
{
"epoch": 0.0001462276182428924,
"grad_norm": 1.8829960823059082,
"learning_rate": 0.00019967621909069424,
"loss": 0.7462,
"step": 44
},
{
"epoch": 0.0001495509732029581,
"grad_norm": 1.8125828504562378,
"learning_rate": 0.00019965690412606906,
"loss": 0.9771,
"step": 45
},
{
"epoch": 0.00015287432816302385,
"grad_norm": 1.8464730978012085,
"learning_rate": 0.00019963703060754407,
"loss": 1.0249,
"step": 46
},
{
"epoch": 0.0001561976831230896,
"grad_norm": 1.6242536306381226,
"learning_rate": 0.0001996165986465058,
"loss": 0.7156,
"step": 47
},
{
"epoch": 0.00015952103808315532,
"grad_norm": 2.4028480052948,
"learning_rate": 0.00019959560835747066,
"loss": 0.7822,
"step": 48
},
{
"epoch": 0.00016284439304322106,
"grad_norm": 1.5756831169128418,
"learning_rate": 0.00019957405985808436,
"loss": 1.0894,
"step": 49
},
{
"epoch": 0.0001661677480032868,
"grad_norm": 1.5813134908676147,
"learning_rate": 0.00019955195326912123,
"loss": 0.9756,
"step": 50
},
{
"epoch": 0.00016949110296335255,
"grad_norm": 2.885624408721924,
"learning_rate": 0.00019952928871448363,
"loss": 1.2237,
"step": 51
},
{
"epoch": 0.00017281445792341826,
"grad_norm": 2.5551652908325195,
"learning_rate": 0.00019950606632120112,
"loss": 0.8768,
"step": 52
},
{
"epoch": 0.000176137812883484,
"grad_norm": 1.6047443151474,
"learning_rate": 0.00019948228621942984,
"loss": 1.289,
"step": 53
},
{
"epoch": 0.00017946116784354975,
"grad_norm": 3.01027250289917,
"learning_rate": 0.00019945794854245178,
"loss": 0.8267,
"step": 54
},
{
"epoch": 0.00018278452280361547,
"grad_norm": 2.2793943881988525,
"learning_rate": 0.000199433053426674,
"loss": 0.7214,
"step": 55
},
{
"epoch": 0.0001861078777636812,
"grad_norm": 1.9147419929504395,
"learning_rate": 0.00019940760101162783,
"loss": 1.3664,
"step": 56
},
{
"epoch": 0.00018943123272374695,
"grad_norm": 1.797659993171692,
"learning_rate": 0.0001993815914399682,
"loss": 0.9188,
"step": 57
},
{
"epoch": 0.0001927545876838127,
"grad_norm": 1.291646122932434,
"learning_rate": 0.00019935502485747273,
"loss": 1.1547,
"step": 58
},
{
"epoch": 0.00019607794264387841,
"grad_norm": 4.202106475830078,
"learning_rate": 0.00019932790141304096,
"loss": 0.4626,
"step": 59
},
{
"epoch": 0.00019940129760394416,
"grad_norm": 2.5573880672454834,
"learning_rate": 0.00019930022125869357,
"loss": 0.6246,
"step": 60
},
{
"epoch": 0.0002027246525640099,
"grad_norm": 2.0912930965423584,
"learning_rate": 0.00019927198454957137,
"loss": 0.9089,
"step": 61
},
{
"epoch": 0.00020604800752407562,
"grad_norm": 2.2763779163360596,
"learning_rate": 0.0001992431914439346,
"loss": 0.7671,
"step": 62
},
{
"epoch": 0.00020937136248414136,
"grad_norm": 1.7758549451828003,
"learning_rate": 0.00019921384210316197,
"loss": 0.6254,
"step": 63
},
{
"epoch": 0.0002126947174442071,
"grad_norm": 2.030876398086548,
"learning_rate": 0.0001991839366917497,
"loss": 1.4996,
"step": 64
},
{
"epoch": 0.00021601807240427285,
"grad_norm": 3.710184097290039,
"learning_rate": 0.0001991534753773108,
"loss": 1.0784,
"step": 65
},
{
"epoch": 0.00021934142736433857,
"grad_norm": 2.3565335273742676,
"learning_rate": 0.0001991224583305738,
"loss": 1.3819,
"step": 66
},
{
"epoch": 0.0002226647823244043,
"grad_norm": 1.755561351776123,
"learning_rate": 0.00019909088572538214,
"loss": 0.605,
"step": 67
},
{
"epoch": 0.00022598813728447005,
"grad_norm": 2.827974319458008,
"learning_rate": 0.00019905875773869292,
"loss": 1.2538,
"step": 68
},
{
"epoch": 0.00022931149224453577,
"grad_norm": 1.7999393939971924,
"learning_rate": 0.00019902607455057612,
"loss": 0.8563,
"step": 69
},
{
"epoch": 0.0002326348472046015,
"grad_norm": 1.5709620714187622,
"learning_rate": 0.00019899283634421342,
"loss": 0.89,
"step": 70
},
{
"epoch": 0.00023595820216466726,
"grad_norm": 2.1778719425201416,
"learning_rate": 0.00019895904330589724,
"loss": 0.641,
"step": 71
},
{
"epoch": 0.000239281557124733,
"grad_norm": 5.8329176902771,
"learning_rate": 0.00019892469562502984,
"loss": 1.005,
"step": 72
},
{
"epoch": 0.00024260491208479872,
"grad_norm": 2.6885933876037598,
"learning_rate": 0.00019888979349412197,
"loss": 0.6898,
"step": 73
},
{
"epoch": 0.0002459282670448645,
"grad_norm": 3.8890256881713867,
"learning_rate": 0.000198854337108792,
"loss": 1.1845,
"step": 74
},
{
"epoch": 0.0002492516220049302,
"grad_norm": 2.175297260284424,
"learning_rate": 0.0001988183266677648,
"loss": 0.5369,
"step": 75
},
{
"epoch": 0.0002525749769649959,
"grad_norm": 3.844905138015747,
"learning_rate": 0.00019878176237287054,
"loss": 1.4376,
"step": 76
},
{
"epoch": 0.00025589833192506166,
"grad_norm": 2.4426724910736084,
"learning_rate": 0.00019874464442904362,
"loss": 0.7001,
"step": 77
},
{
"epoch": 0.0002592216868851274,
"grad_norm": 1.2583106756210327,
"learning_rate": 0.00019870697304432154,
"loss": 0.5028,
"step": 78
},
{
"epoch": 0.00026254504184519315,
"grad_norm": 2.751328706741333,
"learning_rate": 0.00019866874842984372,
"loss": 0.6537,
"step": 79
},
{
"epoch": 0.0002658683968052589,
"grad_norm": 3.057467222213745,
"learning_rate": 0.0001986299707998503,
"loss": 0.8409,
"step": 80
},
{
"epoch": 0.00026919175176532464,
"grad_norm": 3.0413947105407715,
"learning_rate": 0.0001985906403716809,
"loss": 0.7247,
"step": 81
},
{
"epoch": 0.00027251510672539033,
"grad_norm": 1.5867135524749756,
"learning_rate": 0.00019855075736577345,
"loss": 0.8003,
"step": 82
},
{
"epoch": 0.00027583846168545607,
"grad_norm": 2.737339496612549,
"learning_rate": 0.00019851032200566301,
"loss": 0.9347,
"step": 83
},
{
"epoch": 0.0002791618166455218,
"grad_norm": 2.1772475242614746,
"learning_rate": 0.00019846933451798043,
"loss": 0.9581,
"step": 84
},
{
"epoch": 0.00028248517160558756,
"grad_norm": 2.1294620037078857,
"learning_rate": 0.00019842779513245105,
"loss": 0.6898,
"step": 85
},
{
"epoch": 0.0002858085265656533,
"grad_norm": 1.3660725355148315,
"learning_rate": 0.00019838570408189357,
"loss": 0.7278,
"step": 86
},
{
"epoch": 0.00028913188152571905,
"grad_norm": 1.9248489141464233,
"learning_rate": 0.00019834306160221857,
"loss": 0.4563,
"step": 87
},
{
"epoch": 0.0002924552364857848,
"grad_norm": 2.1242995262145996,
"learning_rate": 0.00019829986793242727,
"loss": 1.1302,
"step": 88
},
{
"epoch": 0.0002957785914458505,
"grad_norm": 1.845307469367981,
"learning_rate": 0.00019825612331461027,
"loss": 0.7694,
"step": 89
},
{
"epoch": 0.0002991019464059162,
"grad_norm": 2.346177339553833,
"learning_rate": 0.00019821182799394595,
"loss": 0.9094,
"step": 90
},
{
"epoch": 0.00030242530136598197,
"grad_norm": 1.6282812356948853,
"learning_rate": 0.0001981669822186994,
"loss": 0.3128,
"step": 91
},
{
"epoch": 0.0003057486563260477,
"grad_norm": 1.948099970817566,
"learning_rate": 0.00019812158624022075,
"loss": 1.2383,
"step": 92
},
{
"epoch": 0.00030907201128611345,
"grad_norm": 2.2840962409973145,
"learning_rate": 0.000198075640312944,
"loss": 0.9425,
"step": 93
},
{
"epoch": 0.0003123953662461792,
"grad_norm": 2.1497271060943604,
"learning_rate": 0.0001980291446943855,
"loss": 1.0945,
"step": 94
},
{
"epoch": 0.00031571872120624494,
"grad_norm": 3.017632246017456,
"learning_rate": 0.00019798209964514237,
"loss": 0.8373,
"step": 95
},
{
"epoch": 0.00031904207616631063,
"grad_norm": 2.1620545387268066,
"learning_rate": 0.00019793450542889124,
"loss": 0.9743,
"step": 96
},
{
"epoch": 0.0003223654311263764,
"grad_norm": 2.597017526626587,
"learning_rate": 0.0001978863623123867,
"loss": 0.8165,
"step": 97
},
{
"epoch": 0.0003256887860864421,
"grad_norm": 1.6289966106414795,
"learning_rate": 0.00019783767056545976,
"loss": 0.6275,
"step": 98
},
{
"epoch": 0.00032901214104650786,
"grad_norm": 2.287369966506958,
"learning_rate": 0.00019778843046101643,
"loss": 0.717,
"step": 99
},
{
"epoch": 0.0003323354960065736,
"grad_norm": 3.0594425201416016,
"learning_rate": 0.00019773864227503612,
"loss": 1.1728,
"step": 100
},
{
"epoch": 0.00033565885096663935,
"grad_norm": 2.5068864822387695,
"learning_rate": 0.00019768830628657004,
"loss": 1.0075,
"step": 101
},
{
"epoch": 0.0003389822059267051,
"grad_norm": 2.005697727203369,
"learning_rate": 0.0001976374227777398,
"loss": 0.6371,
"step": 102
},
{
"epoch": 0.0003423055608867708,
"grad_norm": 1.5179638862609863,
"learning_rate": 0.00019758599203373574,
"loss": 0.5727,
"step": 103
},
{
"epoch": 0.0003456289158468365,
"grad_norm": 2.4509246349334717,
"learning_rate": 0.0001975340143428152,
"loss": 1.027,
"step": 104
},
{
"epoch": 0.00034895227080690227,
"grad_norm": 1.57172429561615,
"learning_rate": 0.00019748148999630116,
"loss": 0.6933,
"step": 105
},
{
"epoch": 0.000352275625766968,
"grad_norm": 2.8001174926757812,
"learning_rate": 0.00019742841928858048,
"loss": 0.5884,
"step": 106
},
{
"epoch": 0.00035559898072703376,
"grad_norm": 1.4794889688491821,
"learning_rate": 0.00019737480251710209,
"loss": 0.5695,
"step": 107
},
{
"epoch": 0.0003589223356870995,
"grad_norm": 1.9946357011795044,
"learning_rate": 0.0001973206399823757,
"loss": 1.0238,
"step": 108
},
{
"epoch": 0.00036224569064716524,
"grad_norm": 1.5432323217391968,
"learning_rate": 0.00019726593198796975,
"loss": 0.8537,
"step": 109
},
{
"epoch": 0.00036556904560723093,
"grad_norm": 2.5868325233459473,
"learning_rate": 0.0001972106788405099,
"loss": 1.4079,
"step": 110
},
{
"epoch": 0.0003688924005672967,
"grad_norm": 1.8962173461914062,
"learning_rate": 0.00019715488084967727,
"loss": 0.6017,
"step": 111
},
{
"epoch": 0.0003722157555273624,
"grad_norm": 1.5898348093032837,
"learning_rate": 0.00019709853832820665,
"loss": 0.9094,
"step": 112
},
{
"epoch": 0.00037553911048742816,
"grad_norm": 2.134303569793701,
"learning_rate": 0.0001970416515918849,
"loss": 0.7571,
"step": 113
},
{
"epoch": 0.0003788624654474939,
"grad_norm": 2.37445068359375,
"learning_rate": 0.000196984220959549,
"loss": 0.8375,
"step": 114
},
{
"epoch": 0.00038218582040755965,
"grad_norm": 2.937535285949707,
"learning_rate": 0.00019692624675308435,
"loss": 0.9742,
"step": 115
},
{
"epoch": 0.0003855091753676254,
"grad_norm": 2.049798011779785,
"learning_rate": 0.000196867729297423,
"loss": 0.8425,
"step": 116
},
{
"epoch": 0.0003888325303276911,
"grad_norm": 2.130037546157837,
"learning_rate": 0.00019680866892054174,
"loss": 0.7941,
"step": 117
},
{
"epoch": 0.00039215588528775683,
"grad_norm": 1.7274867296218872,
"learning_rate": 0.00019674906595346027,
"loss": 0.635,
"step": 118
},
{
"epoch": 0.00039547924024782257,
"grad_norm": 2.4574530124664307,
"learning_rate": 0.00019668892073023954,
"loss": 1.0262,
"step": 119
},
{
"epoch": 0.0003988025952078883,
"grad_norm": 3.3932619094848633,
"learning_rate": 0.00019662823358797951,
"loss": 1.021,
"step": 120
},
{
"epoch": 0.00040212595016795406,
"grad_norm": 3.7460312843322754,
"learning_rate": 0.0001965670048668177,
"loss": 0.666,
"step": 121
},
{
"epoch": 0.0004054493051280198,
"grad_norm": 2.45210337638855,
"learning_rate": 0.00019650523490992683,
"loss": 0.7837,
"step": 122
},
{
"epoch": 0.00040877266008808555,
"grad_norm": 1.7856807708740234,
"learning_rate": 0.00019644292406351322,
"loss": 0.7383,
"step": 123
},
{
"epoch": 0.00041209601504815124,
"grad_norm": 1.8551324605941772,
"learning_rate": 0.0001963800726768148,
"loss": 0.7183,
"step": 124
},
{
"epoch": 0.000415419370008217,
"grad_norm": 1.6712207794189453,
"learning_rate": 0.00019631668110209907,
"loss": 1.2395,
"step": 125
},
{
"epoch": 0.0004187427249682827,
"grad_norm": 2.197774648666382,
"learning_rate": 0.00019625274969466106,
"loss": 0.5523,
"step": 126
},
{
"epoch": 0.00042206607992834847,
"grad_norm": 2.847198009490967,
"learning_rate": 0.00019618827881282168,
"loss": 1.0058,
"step": 127
},
{
"epoch": 0.0004253894348884142,
"grad_norm": 4.338975429534912,
"learning_rate": 0.00019612326881792512,
"loss": 1.3433,
"step": 128
},
{
"epoch": 0.00042871278984847995,
"grad_norm": 1.8630313873291016,
"learning_rate": 0.0001960577200743375,
"loss": 0.586,
"step": 129
},
{
"epoch": 0.0004320361448085457,
"grad_norm": 2.66900634765625,
"learning_rate": 0.0001959916329494443,
"loss": 0.9871,
"step": 130
},
{
"epoch": 0.0004353594997686114,
"grad_norm": 2.0445942878723145,
"learning_rate": 0.00019592500781364866,
"loss": 0.8108,
"step": 131
},
{
"epoch": 0.00043868285472867713,
"grad_norm": 2.0626654624938965,
"learning_rate": 0.00019585784504036894,
"loss": 1.0747,
"step": 132
},
{
"epoch": 0.0004420062096887429,
"grad_norm": 1.622671127319336,
"learning_rate": 0.00019579014500603704,
"loss": 1.1958,
"step": 133
},
{
"epoch": 0.0004453295646488086,
"grad_norm": 1.9433460235595703,
"learning_rate": 0.00019572190809009595,
"loss": 0.855,
"step": 134
},
{
"epoch": 0.00044865291960887436,
"grad_norm": 1.5850857496261597,
"learning_rate": 0.00019565313467499783,
"loss": 0.842,
"step": 135
},
{
"epoch": 0.0004519762745689401,
"grad_norm": 1.862047553062439,
"learning_rate": 0.00019558382514620176,
"loss": 1.0577,
"step": 136
},
{
"epoch": 0.00045529962952900585,
"grad_norm": 2.423488140106201,
"learning_rate": 0.00019551397989217158,
"loss": 0.9635,
"step": 137
},
{
"epoch": 0.00045862298448907154,
"grad_norm": 1.421524167060852,
"learning_rate": 0.00019544359930437382,
"loss": 0.6248,
"step": 138
},
{
"epoch": 0.0004619463394491373,
"grad_norm": 2.424243211746216,
"learning_rate": 0.0001953726837772754,
"loss": 0.8793,
"step": 139
},
{
"epoch": 0.000465269694409203,
"grad_norm": 1.9087179899215698,
"learning_rate": 0.0001953012337083415,
"loss": 1.0438,
"step": 140
},
{
"epoch": 0.00046859304936926877,
"grad_norm": 1.9860990047454834,
"learning_rate": 0.0001952292494980331,
"loss": 0.8836,
"step": 141
},
{
"epoch": 0.0004719164043293345,
"grad_norm": 1.922594428062439,
"learning_rate": 0.00019515673154980515,
"loss": 0.7314,
"step": 142
},
{
"epoch": 0.00047523975928940026,
"grad_norm": 2.4743568897247314,
"learning_rate": 0.00019508368027010395,
"loss": 0.7213,
"step": 143
},
{
"epoch": 0.000478563114249466,
"grad_norm": 2.92726731300354,
"learning_rate": 0.00019501009606836503,
"loss": 1.0991,
"step": 144
},
{
"epoch": 0.0004818864692095317,
"grad_norm": 3.6583187580108643,
"learning_rate": 0.00019493597935701086,
"loss": 1.1298,
"step": 145
},
{
"epoch": 0.00048520982416959743,
"grad_norm": 1.9078856706619263,
"learning_rate": 0.00019486133055144839,
"loss": 0.9963,
"step": 146
},
{
"epoch": 0.0004885331791296632,
"grad_norm": 2.1639506816864014,
"learning_rate": 0.00019478615007006698,
"loss": 0.9563,
"step": 147
},
{
"epoch": 0.000491856534089729,
"grad_norm": 3.626147985458374,
"learning_rate": 0.0001947104383342358,
"loss": 0.5332,
"step": 148
},
{
"epoch": 0.0004951798890497946,
"grad_norm": 2.0015130043029785,
"learning_rate": 0.00019463419576830164,
"loss": 0.4274,
"step": 149
},
{
"epoch": 0.0004985032440098604,
"grad_norm": 2.0864193439483643,
"learning_rate": 0.00019455742279958645,
"loss": 0.5563,
"step": 150
},
{
"epoch": 0.0005018265989699261,
"grad_norm": 2.014056921005249,
"learning_rate": 0.00019448011985838496,
"loss": 0.8894,
"step": 151
},
{
"epoch": 0.0005051499539299918,
"grad_norm": 2.897721529006958,
"learning_rate": 0.00019440228737796226,
"loss": 0.7464,
"step": 152
},
{
"epoch": 0.0005084733088900576,
"grad_norm": 1.6992590427398682,
"learning_rate": 0.00019432392579455142,
"loss": 1.0114,
"step": 153
},
{
"epoch": 0.0005117966638501233,
"grad_norm": 2.061690092086792,
"learning_rate": 0.000194245035547351,
"loss": 0.8523,
"step": 154
},
{
"epoch": 0.0005151200188101891,
"grad_norm": 1.9756520986557007,
"learning_rate": 0.00019416561707852255,
"loss": 0.8939,
"step": 155
},
{
"epoch": 0.0005184433737702548,
"grad_norm": 3.0679516792297363,
"learning_rate": 0.00019408567083318827,
"loss": 0.8342,
"step": 156
},
{
"epoch": 0.0005217667287303206,
"grad_norm": 2.8347623348236084,
"learning_rate": 0.00019400519725942835,
"loss": 0.9611,
"step": 157
},
{
"epoch": 0.0005250900836903863,
"grad_norm": 1.632202386856079,
"learning_rate": 0.00019392419680827857,
"loss": 0.523,
"step": 158
},
{
"epoch": 0.000528413438650452,
"grad_norm": 1.9706271886825562,
"learning_rate": 0.00019384266993372772,
"loss": 1.2523,
"step": 159
},
{
"epoch": 0.0005317367936105178,
"grad_norm": 1.5293477773666382,
"learning_rate": 0.0001937606170927151,
"loss": 0.9098,
"step": 160
},
{
"epoch": 0.0005350601485705835,
"grad_norm": 2.504441976547241,
"learning_rate": 0.00019367803874512785,
"loss": 0.6785,
"step": 161
},
{
"epoch": 0.0005383835035306493,
"grad_norm": 2.0011444091796875,
"learning_rate": 0.00019359493535379857,
"loss": 0.7819,
"step": 162
},
{
"epoch": 0.0005417068584907149,
"grad_norm": 2.698025941848755,
"learning_rate": 0.0001935113073845025,
"loss": 1.2558,
"step": 163
},
{
"epoch": 0.0005450302134507807,
"grad_norm": 1.6795804500579834,
"learning_rate": 0.0001934271553059551,
"loss": 0.9447,
"step": 164
},
{
"epoch": 0.0005483535684108464,
"grad_norm": 2.05863094329834,
"learning_rate": 0.0001933424795898093,
"loss": 0.9352,
"step": 165
},
{
"epoch": 0.0005516769233709121,
"grad_norm": 2.321521520614624,
"learning_rate": 0.0001932572807106529,
"loss": 0.8903,
"step": 166
},
{
"epoch": 0.0005550002783309779,
"grad_norm": 1.3765385150909424,
"learning_rate": 0.00019317155914600593,
"loss": 1.0314,
"step": 167
},
{
"epoch": 0.0005583236332910436,
"grad_norm": 2.1478731632232666,
"learning_rate": 0.0001930853153763179,
"loss": 1.0972,
"step": 168
},
{
"epoch": 0.0005616469882511094,
"grad_norm": 1.5712205171585083,
"learning_rate": 0.00019299854988496525,
"loss": 0.6613,
"step": 169
},
{
"epoch": 0.0005649703432111751,
"grad_norm": 2.106233596801758,
"learning_rate": 0.00019291126315824846,
"loss": 0.674,
"step": 170
},
{
"epoch": 0.0005682936981712409,
"grad_norm": 2.7500600814819336,
"learning_rate": 0.00019282345568538939,
"loss": 0.7301,
"step": 171
},
{
"epoch": 0.0005716170531313066,
"grad_norm": 2.4417264461517334,
"learning_rate": 0.00019273512795852873,
"loss": 0.8071,
"step": 172
},
{
"epoch": 0.0005749404080913723,
"grad_norm": 2.697852849960327,
"learning_rate": 0.00019264628047272289,
"loss": 0.7752,
"step": 173
},
{
"epoch": 0.0005782637630514381,
"grad_norm": 2.2733891010284424,
"learning_rate": 0.00019255691372594148,
"loss": 0.7162,
"step": 174
},
{
"epoch": 0.0005815871180115038,
"grad_norm": 2.801717758178711,
"learning_rate": 0.0001924670282190645,
"loss": 0.9177,
"step": 175
},
{
"epoch": 0.0005849104729715696,
"grad_norm": 2.3935840129852295,
"learning_rate": 0.00019237662445587936,
"loss": 1.0933,
"step": 176
},
{
"epoch": 0.0005882338279316352,
"grad_norm": 3.074425220489502,
"learning_rate": 0.00019228570294307828,
"loss": 0.7686,
"step": 177
},
{
"epoch": 0.000591557182891701,
"grad_norm": 2.155723810195923,
"learning_rate": 0.00019219426419025534,
"loss": 0.9334,
"step": 178
},
{
"epoch": 0.0005948805378517667,
"grad_norm": 2.251619577407837,
"learning_rate": 0.00019210230870990352,
"loss": 0.7433,
"step": 179
},
{
"epoch": 0.0005982038928118324,
"grad_norm": 1.779147982597351,
"learning_rate": 0.0001920098370174121,
"loss": 0.8434,
"step": 180
},
{
"epoch": 0.0006015272477718982,
"grad_norm": 1.5720431804656982,
"learning_rate": 0.00019191684963106349,
"loss": 1.1236,
"step": 181
},
{
"epoch": 0.0006048506027319639,
"grad_norm": 2.1575276851654053,
"learning_rate": 0.0001918233470720305,
"loss": 0.6812,
"step": 182
},
{
"epoch": 0.0006081739576920297,
"grad_norm": 2.104848623275757,
"learning_rate": 0.00019172932986437333,
"loss": 0.8865,
"step": 183
},
{
"epoch": 0.0006114973126520954,
"grad_norm": 2.6855051517486572,
"learning_rate": 0.00019163479853503672,
"loss": 0.864,
"step": 184
},
{
"epoch": 0.0006148206676121612,
"grad_norm": 6.135538578033447,
"learning_rate": 0.00019153975361384687,
"loss": 1.0433,
"step": 185
},
{
"epoch": 0.0006181440225722269,
"grad_norm": 1.9011839628219604,
"learning_rate": 0.00019144419563350858,
"loss": 1.0538,
"step": 186
},
{
"epoch": 0.0006214673775322927,
"grad_norm": 1.792335033416748,
"learning_rate": 0.00019134812512960233,
"loss": 0.9006,
"step": 187
},
{
"epoch": 0.0006247907324923584,
"grad_norm": 2.9424028396606445,
"learning_rate": 0.00019125154264058094,
"loss": 1.0332,
"step": 188
},
{
"epoch": 0.0006281140874524241,
"grad_norm": 2.4385499954223633,
"learning_rate": 0.00019115444870776695,
"loss": 0.3531,
"step": 189
},
{
"epoch": 0.0006314374424124899,
"grad_norm": 1.7163375616073608,
"learning_rate": 0.00019105684387534948,
"loss": 1.258,
"step": 190
},
{
"epoch": 0.0006347607973725555,
"grad_norm": 2.8776745796203613,
"learning_rate": 0.00019095872869038098,
"loss": 0.5629,
"step": 191
},
{
"epoch": 0.0006380841523326213,
"grad_norm": 2.397474765777588,
"learning_rate": 0.00019086010370277437,
"loss": 1.0474,
"step": 192
},
{
"epoch": 0.000641407507292687,
"grad_norm": 2.938826084136963,
"learning_rate": 0.00019076096946529992,
"loss": 1.0489,
"step": 193
},
{
"epoch": 0.0006447308622527527,
"grad_norm": 1.9875274896621704,
"learning_rate": 0.0001906613265335821,
"loss": 1.1448,
"step": 194
},
{
"epoch": 0.0006480542172128185,
"grad_norm": 2.6492698192596436,
"learning_rate": 0.00019056117546609647,
"loss": 1.0026,
"step": 195
},
{
"epoch": 0.0006513775721728842,
"grad_norm": 1.7564932107925415,
"learning_rate": 0.00019046051682416662,
"loss": 1.268,
"step": 196
},
{
"epoch": 0.00065470092713295,
"grad_norm": 2.032909631729126,
"learning_rate": 0.00019035935117196097,
"loss": 0.6964,
"step": 197
},
{
"epoch": 0.0006580242820930157,
"grad_norm": 3.4252007007598877,
"learning_rate": 0.00019025767907648958,
"loss": 1.0756,
"step": 198
},
{
"epoch": 0.0006613476370530815,
"grad_norm": 2.259429454803467,
"learning_rate": 0.00019015550110760106,
"loss": 1.1985,
"step": 199
},
{
"epoch": 0.0006646709920131472,
"grad_norm": 2.154845714569092,
"learning_rate": 0.00019005281783797927,
"loss": 0.8484,
"step": 200
},
{
"epoch": 0.000667994346973213,
"grad_norm": 1.7857868671417236,
"learning_rate": 0.0001899496298431402,
"loss": 1.178,
"step": 201
},
{
"epoch": 0.0006713177019332787,
"grad_norm": 1.5021724700927734,
"learning_rate": 0.0001898459377014287,
"loss": 0.9586,
"step": 202
},
{
"epoch": 0.0006746410568933444,
"grad_norm": 2.9662904739379883,
"learning_rate": 0.00018974174199401525,
"loss": 1.0109,
"step": 203
},
{
"epoch": 0.0006779644118534102,
"grad_norm": 2.3649721145629883,
"learning_rate": 0.00018963704330489262,
"loss": 0.7838,
"step": 204
},
{
"epoch": 0.0006812877668134758,
"grad_norm": 2.1048192977905273,
"learning_rate": 0.00018953184222087285,
"loss": 1.3349,
"step": 205
},
{
"epoch": 0.0006846111217735416,
"grad_norm": 2.5267252922058105,
"learning_rate": 0.00018942613933158365,
"loss": 1.2447,
"step": 206
},
{
"epoch": 0.0006879344767336073,
"grad_norm": 2.2604713439941406,
"learning_rate": 0.00018931993522946526,
"loss": 0.9698,
"step": 207
},
{
"epoch": 0.000691257831693673,
"grad_norm": 2.060702085494995,
"learning_rate": 0.00018921323050976712,
"loss": 1.1335,
"step": 208
},
{
"epoch": 0.0006945811866537388,
"grad_norm": 1.4325411319732666,
"learning_rate": 0.0001891060257705445,
"loss": 0.8489,
"step": 209
},
{
"epoch": 0.0006979045416138045,
"grad_norm": 1.696060299873352,
"learning_rate": 0.00018899832161265514,
"loss": 1.0021,
"step": 210
},
{
"epoch": 0.0007012278965738703,
"grad_norm": 2.194124221801758,
"learning_rate": 0.000188890118639756,
"loss": 0.8379,
"step": 211
},
{
"epoch": 0.000704551251533936,
"grad_norm": 2.39493727684021,
"learning_rate": 0.00018878141745829965,
"loss": 0.8481,
"step": 212
},
{
"epoch": 0.0007078746064940018,
"grad_norm": 2.1452245712280273,
"learning_rate": 0.0001886722186775311,
"loss": 0.7282,
"step": 213
},
{
"epoch": 0.0007111979614540675,
"grad_norm": 4.161812782287598,
"learning_rate": 0.0001885625229094843,
"loss": 1.1232,
"step": 214
},
{
"epoch": 0.0007145213164141333,
"grad_norm": 2.658095359802246,
"learning_rate": 0.00018845233076897864,
"loss": 0.7978,
"step": 215
},
{
"epoch": 0.000717844671374199,
"grad_norm": 2.6832375526428223,
"learning_rate": 0.00018834164287361553,
"loss": 1.0845,
"step": 216
},
{
"epoch": 0.0007211680263342647,
"grad_norm": 1.4918650388717651,
"learning_rate": 0.0001882304598437751,
"loss": 1.1464,
"step": 217
},
{
"epoch": 0.0007244913812943305,
"grad_norm": 2.185304641723633,
"learning_rate": 0.00018811878230261245,
"loss": 0.9031,
"step": 218
},
{
"epoch": 0.0007278147362543961,
"grad_norm": 2.20607590675354,
"learning_rate": 0.00018800661087605444,
"loss": 0.7687,
"step": 219
},
{
"epoch": 0.0007311380912144619,
"grad_norm": 1.7606959342956543,
"learning_rate": 0.0001878939461927959,
"loss": 0.7721,
"step": 220
},
{
"epoch": 0.0007344614461745276,
"grad_norm": 1.9390268325805664,
"learning_rate": 0.0001877807888842964,
"loss": 0.7985,
"step": 221
},
{
"epoch": 0.0007377848011345934,
"grad_norm": 1.9534977674484253,
"learning_rate": 0.00018766713958477644,
"loss": 0.7436,
"step": 222
},
{
"epoch": 0.0007411081560946591,
"grad_norm": 2.0108580589294434,
"learning_rate": 0.00018755299893121404,
"loss": 0.8744,
"step": 223
},
{
"epoch": 0.0007444315110547248,
"grad_norm": 2.3469605445861816,
"learning_rate": 0.0001874383675633412,
"loss": 0.8103,
"step": 224
},
{
"epoch": 0.0007477548660147906,
"grad_norm": 2.6010794639587402,
"learning_rate": 0.00018732324612364022,
"loss": 0.6754,
"step": 225
},
{
"epoch": 0.0007510782209748563,
"grad_norm": 2.374936819076538,
"learning_rate": 0.00018720763525734017,
"loss": 0.9731,
"step": 226
},
{
"epoch": 0.0007544015759349221,
"grad_norm": 1.8234738111495972,
"learning_rate": 0.00018709153561241316,
"loss": 0.9814,
"step": 227
},
{
"epoch": 0.0007577249308949878,
"grad_norm": 2.046567678451538,
"learning_rate": 0.0001869749478395709,
"loss": 1.3241,
"step": 228
},
{
"epoch": 0.0007610482858550536,
"grad_norm": 2.928182601928711,
"learning_rate": 0.00018685787259226086,
"loss": 0.8543,
"step": 229
},
{
"epoch": 0.0007643716408151193,
"grad_norm": 3.1443798542022705,
"learning_rate": 0.00018674031052666272,
"loss": 0.7922,
"step": 230
},
{
"epoch": 0.000767694995775185,
"grad_norm": 3.132957935333252,
"learning_rate": 0.00018662226230168472,
"loss": 0.732,
"step": 231
},
{
"epoch": 0.0007710183507352508,
"grad_norm": 4.542943477630615,
"learning_rate": 0.0001865037285789598,
"loss": 1.0421,
"step": 232
},
{
"epoch": 0.0007743417056953165,
"grad_norm": 2.511564016342163,
"learning_rate": 0.0001863847100228421,
"loss": 1.0499,
"step": 233
},
{
"epoch": 0.0007776650606553822,
"grad_norm": 2.294689655303955,
"learning_rate": 0.0001862652073004031,
"loss": 0.731,
"step": 234
},
{
"epoch": 0.0007809884156154479,
"grad_norm": 2.206822395324707,
"learning_rate": 0.00018614522108142788,
"loss": 1.0678,
"step": 235
},
{
"epoch": 0.0007843117705755137,
"grad_norm": 1.5954980850219727,
"learning_rate": 0.00018602475203841152,
"loss": 1.0332,
"step": 236
},
{
"epoch": 0.0007876351255355794,
"grad_norm": 2.328080177307129,
"learning_rate": 0.0001859038008465551,
"loss": 0.7923,
"step": 237
},
{
"epoch": 0.0007909584804956451,
"grad_norm": 3.6944761276245117,
"learning_rate": 0.00018578236818376207,
"loss": 0.68,
"step": 238
},
{
"epoch": 0.0007942818354557109,
"grad_norm": 2.6158041954040527,
"learning_rate": 0.00018566045473063442,
"loss": 0.9602,
"step": 239
},
{
"epoch": 0.0007976051904157766,
"grad_norm": 3.4564719200134277,
"learning_rate": 0.0001855380611704689,
"loss": 0.8778,
"step": 240
},
{
"epoch": 0.0008009285453758424,
"grad_norm": 2.708306074142456,
"learning_rate": 0.00018541518818925308,
"loss": 0.9371,
"step": 241
},
{
"epoch": 0.0008042519003359081,
"grad_norm": 2.519165515899658,
"learning_rate": 0.0001852918364756616,
"loss": 0.8222,
"step": 242
},
{
"epoch": 0.0008075752552959739,
"grad_norm": 2.298029899597168,
"learning_rate": 0.00018516800672105228,
"loss": 0.7957,
"step": 243
},
{
"epoch": 0.0008108986102560396,
"grad_norm": 3.7880380153656006,
"learning_rate": 0.00018504369961946227,
"loss": 0.6957,
"step": 244
},
{
"epoch": 0.0008142219652161053,
"grad_norm": 1.516716718673706,
"learning_rate": 0.00018491891586760412,
"loss": 1.2474,
"step": 245
},
{
"epoch": 0.0008175453201761711,
"grad_norm": 2.2521932125091553,
"learning_rate": 0.00018479365616486194,
"loss": 1.1017,
"step": 246
},
{
"epoch": 0.0008208686751362368,
"grad_norm": 2.2281289100646973,
"learning_rate": 0.0001846679212132873,
"loss": 0.8114,
"step": 247
},
{
"epoch": 0.0008241920300963025,
"grad_norm": 2.63161301612854,
"learning_rate": 0.00018454171171759562,
"loss": 0.562,
"step": 248
},
{
"epoch": 0.0008275153850563682,
"grad_norm": 2.087157964706421,
"learning_rate": 0.0001844150283851619,
"loss": 0.9418,
"step": 249
},
{
"epoch": 0.000830838740016434,
"grad_norm": 2.3542277812957764,
"learning_rate": 0.00018428787192601692,
"loss": 0.9547,
"step": 250
},
{
"epoch": 0.0008341620949764997,
"grad_norm": 2.7823688983917236,
"learning_rate": 0.00018416024305284318,
"loss": 1.1647,
"step": 251
},
{
"epoch": 0.0008374854499365654,
"grad_norm": 3.1965949535369873,
"learning_rate": 0.00018403214248097108,
"loss": 0.9733,
"step": 252
},
{
"epoch": 0.0008408088048966312,
"grad_norm": 2.3127360343933105,
"learning_rate": 0.00018390357092837464,
"loss": 0.6744,
"step": 253
},
{
"epoch": 0.0008441321598566969,
"grad_norm": 2.6081366539001465,
"learning_rate": 0.0001837745291156677,
"loss": 0.4712,
"step": 254
},
{
"epoch": 0.0008474555148167627,
"grad_norm": 2.655521869659424,
"learning_rate": 0.00018364501776609978,
"loss": 0.8526,
"step": 255
},
{
"epoch": 0.0008507788697768284,
"grad_norm": 2.5998477935791016,
"learning_rate": 0.000183515037605552,
"loss": 0.6903,
"step": 256
},
{
"epoch": 0.0008541022247368942,
"grad_norm": 1.3501770496368408,
"learning_rate": 0.00018338458936253323,
"loss": 0.6203,
"step": 257
},
{
"epoch": 0.0008574255796969599,
"grad_norm": 2.938946008682251,
"learning_rate": 0.00018325367376817553,
"loss": 1.0738,
"step": 258
},
{
"epoch": 0.0008607489346570257,
"grad_norm": 1.3566104173660278,
"learning_rate": 0.00018312229155623063,
"loss": 0.8905,
"step": 259
},
{
"epoch": 0.0008640722896170914,
"grad_norm": 1.6955430507659912,
"learning_rate": 0.0001829904434630654,
"loss": 0.8945,
"step": 260
},
{
"epoch": 0.0008673956445771571,
"grad_norm": 2.2695062160491943,
"learning_rate": 0.00018285813022765794,
"loss": 1.3401,
"step": 261
},
{
"epoch": 0.0008707189995372228,
"grad_norm": 2.285875082015991,
"learning_rate": 0.00018272535259159333,
"loss": 0.5588,
"step": 262
},
{
"epoch": 0.0008740423544972885,
"grad_norm": 2.5898141860961914,
"learning_rate": 0.0001825921112990595,
"loss": 0.5749,
"step": 263
},
{
"epoch": 0.0008773657094573543,
"grad_norm": 1.8657327890396118,
"learning_rate": 0.0001824584070968431,
"loss": 1.1243,
"step": 264
},
{
"epoch": 0.00088068906441742,
"grad_norm": 3.04009747505188,
"learning_rate": 0.00018232424073432523,
"loss": 0.6605,
"step": 265
},
{
"epoch": 0.0008840124193774857,
"grad_norm": 1.8760170936584473,
"learning_rate": 0.0001821896129634773,
"loss": 0.8554,
"step": 266
},
{
"epoch": 0.0008873357743375515,
"grad_norm": 3.241586685180664,
"learning_rate": 0.00018205452453885692,
"loss": 1.3335,
"step": 267
},
{
"epoch": 0.0008906591292976172,
"grad_norm": 2.223585367202759,
"learning_rate": 0.0001819189762176034,
"loss": 0.7643,
"step": 268
},
{
"epoch": 0.000893982484257683,
"grad_norm": 2.3313605785369873,
"learning_rate": 0.00018178296875943377,
"loss": 0.9619,
"step": 269
},
{
"epoch": 0.0008973058392177487,
"grad_norm": 2.2432057857513428,
"learning_rate": 0.00018164650292663837,
"loss": 0.9027,
"step": 270
},
{
"epoch": 0.0009006291941778145,
"grad_norm": 2.1724648475646973,
"learning_rate": 0.00018150957948407655,
"loss": 0.8207,
"step": 271
},
{
"epoch": 0.0009039525491378802,
"grad_norm": 1.8221163749694824,
"learning_rate": 0.00018137219919917268,
"loss": 0.564,
"step": 272
},
{
"epoch": 0.000907275904097946,
"grad_norm": 2.9687201976776123,
"learning_rate": 0.0001812343628419114,
"loss": 0.6431,
"step": 273
},
{
"epoch": 0.0009105992590580117,
"grad_norm": 1.6138179302215576,
"learning_rate": 0.0001810960711848336,
"loss": 0.8986,
"step": 274
},
{
"epoch": 0.0009139226140180774,
"grad_norm": 2.7505204677581787,
"learning_rate": 0.0001809573250030321,
"loss": 0.559,
"step": 275
},
{
"epoch": 0.0009172459689781431,
"grad_norm": 1.9008560180664062,
"learning_rate": 0.00018081812507414707,
"loss": 0.7741,
"step": 276
},
{
"epoch": 0.0009205693239382088,
"grad_norm": 1.8047677278518677,
"learning_rate": 0.00018067847217836204,
"loss": 0.8689,
"step": 277
},
{
"epoch": 0.0009238926788982746,
"grad_norm": 2.580085039138794,
"learning_rate": 0.00018053836709839907,
"loss": 1.1604,
"step": 278
},
{
"epoch": 0.0009272160338583403,
"grad_norm": 2.055727243423462,
"learning_rate": 0.00018039781061951482,
"loss": 0.8672,
"step": 279
},
{
"epoch": 0.000930539388818406,
"grad_norm": 3.0733115673065186,
"learning_rate": 0.0001802568035294958,
"loss": 0.9887,
"step": 280
},
{
"epoch": 0.0009338627437784718,
"grad_norm": 2.3724474906921387,
"learning_rate": 0.00018011534661865414,
"loss": 0.8757,
"step": 281
},
{
"epoch": 0.0009371860987385375,
"grad_norm": 2.3270065784454346,
"learning_rate": 0.00017997344067982314,
"loss": 0.8757,
"step": 282
},
{
"epoch": 0.0009405094536986033,
"grad_norm": 1.917435646057129,
"learning_rate": 0.00017983108650835273,
"loss": 1.0723,
"step": 283
},
{
"epoch": 0.000943832808658669,
"grad_norm": 2.466601848602295,
"learning_rate": 0.00017968828490210517,
"loss": 0.8578,
"step": 284
},
{
"epoch": 0.0009471561636187348,
"grad_norm": 1.9273431301116943,
"learning_rate": 0.00017954503666145038,
"loss": 0.5289,
"step": 285
},
{
"epoch": 0.0009504795185788005,
"grad_norm": 1.703373670578003,
"learning_rate": 0.00017940134258926166,
"loss": 0.9778,
"step": 286
},
{
"epoch": 0.0009538028735388663,
"grad_norm": 2.1906208992004395,
"learning_rate": 0.00017925720349091107,
"loss": 1.0434,
"step": 287
},
{
"epoch": 0.000957126228498932,
"grad_norm": 1.8268455266952515,
"learning_rate": 0.00017911262017426482,
"loss": 0.5991,
"step": 288
},
{
"epoch": 0.0009604495834589977,
"grad_norm": 2.182431697845459,
"learning_rate": 0.00017896759344967906,
"loss": 1.0621,
"step": 289
},
{
"epoch": 0.0009637729384190634,
"grad_norm": 2.1958398818969727,
"learning_rate": 0.00017882212412999505,
"loss": 1.0184,
"step": 290
},
{
"epoch": 0.0009670962933791291,
"grad_norm": 2.3882758617401123,
"learning_rate": 0.0001786762130305346,
"loss": 0.9169,
"step": 291
},
{
"epoch": 0.0009704196483391949,
"grad_norm": 2.064112901687622,
"learning_rate": 0.00017852986096909574,
"loss": 1.1755,
"step": 292
},
{
"epoch": 0.0009737430032992606,
"grad_norm": 1.9005621671676636,
"learning_rate": 0.00017838306876594788,
"loss": 0.9455,
"step": 293
},
{
"epoch": 0.0009770663582593265,
"grad_norm": 2.704106330871582,
"learning_rate": 0.00017823583724382744,
"loss": 0.8015,
"step": 294
},
{
"epoch": 0.000980389713219392,
"grad_norm": 1.8319103717803955,
"learning_rate": 0.000178088167227933,
"loss": 1.1968,
"step": 295
},
{
"epoch": 0.000983713068179458,
"grad_norm": 2.681913137435913,
"learning_rate": 0.00017794005954592084,
"loss": 0.7402,
"step": 296
},
{
"epoch": 0.0009870364231395236,
"grad_norm": 1.8737455606460571,
"learning_rate": 0.00017779151502790027,
"loss": 1.0434,
"step": 297
},
{
"epoch": 0.0009903597780995892,
"grad_norm": 2.2911226749420166,
"learning_rate": 0.00017764253450642902,
"loss": 0.562,
"step": 298
},
{
"epoch": 0.000993683133059655,
"grad_norm": 2.5499420166015625,
"learning_rate": 0.00017749311881650837,
"loss": 1.1258,
"step": 299
},
{
"epoch": 0.0009970064880197207,
"grad_norm": 1.6716042757034302,
"learning_rate": 0.00017734326879557876,
"loss": 0.5659,
"step": 300
},
{
"epoch": 0.0010003298429797866,
"grad_norm": 4.81651496887207,
"learning_rate": 0.00017719298528351489,
"loss": 0.9533,
"step": 301
},
{
"epoch": 0.0010036531979398522,
"grad_norm": 2.0948708057403564,
"learning_rate": 0.00017704226912262108,
"loss": 0.9046,
"step": 302
},
{
"epoch": 0.001006976552899918,
"grad_norm": 3.2504518032073975,
"learning_rate": 0.00017689112115762656,
"loss": 1.1282,
"step": 303
},
{
"epoch": 0.0010102999078599837,
"grad_norm": 2.3687779903411865,
"learning_rate": 0.00017673954223568073,
"loss": 1.3698,
"step": 304
},
{
"epoch": 0.0010136232628200495,
"grad_norm": 1.7902189493179321,
"learning_rate": 0.00017658753320634835,
"loss": 1.0331,
"step": 305
},
{
"epoch": 0.0010169466177801152,
"grad_norm": 2.5034868717193604,
"learning_rate": 0.00017643509492160493,
"loss": 0.9221,
"step": 306
},
{
"epoch": 0.001020269972740181,
"grad_norm": 2.9678382873535156,
"learning_rate": 0.00017628222823583175,
"loss": 1.1739,
"step": 307
},
{
"epoch": 0.0010235933277002467,
"grad_norm": 2.8975117206573486,
"learning_rate": 0.0001761289340058113,
"loss": 0.8452,
"step": 308
},
{
"epoch": 0.0010269166826603125,
"grad_norm": 2.479252576828003,
"learning_rate": 0.0001759752130907222,
"loss": 0.6775,
"step": 309
},
{
"epoch": 0.0010302400376203781,
"grad_norm": 1.6846801042556763,
"learning_rate": 0.0001758210663521347,
"loss": 0.8358,
"step": 310
},
{
"epoch": 0.0010335633925804438,
"grad_norm": 1.4105985164642334,
"learning_rate": 0.00017566649465400555,
"loss": 1.1666,
"step": 311
},
{
"epoch": 0.0010368867475405096,
"grad_norm": 1.7234609127044678,
"learning_rate": 0.00017551149886267347,
"loss": 1.0065,
"step": 312
},
{
"epoch": 0.0010402101025005753,
"grad_norm": 2.2938060760498047,
"learning_rate": 0.00017535607984685392,
"loss": 0.7772,
"step": 313
},
{
"epoch": 0.0010435334574606411,
"grad_norm": 1.7764406204223633,
"learning_rate": 0.00017520023847763456,
"loss": 0.5399,
"step": 314
},
{
"epoch": 0.0010468568124207068,
"grad_norm": 3.057908058166504,
"learning_rate": 0.0001750439756284703,
"loss": 0.7325,
"step": 315
},
{
"epoch": 0.0010501801673807726,
"grad_norm": 2.3120014667510986,
"learning_rate": 0.00017488729217517818,
"loss": 0.7843,
"step": 316
},
{
"epoch": 0.0010535035223408382,
"grad_norm": 2.835329055786133,
"learning_rate": 0.00017473018899593276,
"loss": 0.7952,
"step": 317
},
{
"epoch": 0.001056826877300904,
"grad_norm": 3.063936233520508,
"learning_rate": 0.00017457266697126103,
"loss": 1.0372,
"step": 318
},
{
"epoch": 0.0010601502322609697,
"grad_norm": 2.0668740272521973,
"learning_rate": 0.0001744147269840375,
"loss": 1.2189,
"step": 319
},
{
"epoch": 0.0010634735872210356,
"grad_norm": 1.467910885810852,
"learning_rate": 0.00017425636991947926,
"loss": 0.4099,
"step": 320
},
{
"epoch": 0.0010667969421811012,
"grad_norm": 2.1655707359313965,
"learning_rate": 0.00017409759666514108,
"loss": 1.064,
"step": 321
},
{
"epoch": 0.001070120297141167,
"grad_norm": 2.4154369831085205,
"learning_rate": 0.00017393840811091025,
"loss": 0.827,
"step": 322
},
{
"epoch": 0.0010734436521012327,
"grad_norm": 1.3061712980270386,
"learning_rate": 0.00017377880514900186,
"loss": 0.4219,
"step": 323
},
{
"epoch": 0.0010767670070612986,
"grad_norm": 2.6192286014556885,
"learning_rate": 0.00017361878867395357,
"loss": 0.8404,
"step": 324
},
{
"epoch": 0.0010800903620213642,
"grad_norm": 2.863708257675171,
"learning_rate": 0.00017345835958262076,
"loss": 0.7554,
"step": 325
},
{
"epoch": 0.0010834137169814298,
"grad_norm": 3.0085036754608154,
"learning_rate": 0.00017329751877417132,
"loss": 0.8845,
"step": 326
},
{
"epoch": 0.0010867370719414957,
"grad_norm": 2.4132509231567383,
"learning_rate": 0.00017313626715008087,
"loss": 0.7935,
"step": 327
},
{
"epoch": 0.0010900604269015613,
"grad_norm": 1.584530234336853,
"learning_rate": 0.00017297460561412737,
"loss": 0.8238,
"step": 328
},
{
"epoch": 0.0010933837818616272,
"grad_norm": 3.4459643363952637,
"learning_rate": 0.0001728125350723864,
"loss": 0.7411,
"step": 329
},
{
"epoch": 0.0010967071368216928,
"grad_norm": 1.6007535457611084,
"learning_rate": 0.00017265005643322584,
"loss": 1.0634,
"step": 330
},
{
"epoch": 0.0011000304917817587,
"grad_norm": 2.6975879669189453,
"learning_rate": 0.00017248717060730094,
"loss": 1.0681,
"step": 331
},
{
"epoch": 0.0011033538467418243,
"grad_norm": 2.289832353591919,
"learning_rate": 0.00017232387850754904,
"loss": 0.6341,
"step": 332
},
{
"epoch": 0.0011066772017018901,
"grad_norm": 1.5257370471954346,
"learning_rate": 0.00017216018104918455,
"loss": 0.5519,
"step": 333
},
{
"epoch": 0.0011100005566619558,
"grad_norm": 3.0421066284179688,
"learning_rate": 0.00017199607914969396,
"loss": 1.0758,
"step": 334
},
{
"epoch": 0.0011133239116220216,
"grad_norm": 2.0122852325439453,
"learning_rate": 0.0001718315737288304,
"loss": 0.7425,
"step": 335
},
{
"epoch": 0.0011133239116220216,
"eval_loss": 0.8063258528709412,
"eval_runtime": 3233.4173,
"eval_samples_per_second": 39.183,
"eval_steps_per_second": 19.592,
"step": 335
},
{
"epoch": 0.0011166472665820873,
"grad_norm": 2.3147132396698,
"learning_rate": 0.0001716666657086087,
"loss": 1.1424,
"step": 336
},
{
"epoch": 0.0011199706215421531,
"grad_norm": 3.1851842403411865,
"learning_rate": 0.00017150135601330023,
"loss": 0.7141,
"step": 337
},
{
"epoch": 0.0011232939765022187,
"grad_norm": 2.0654189586639404,
"learning_rate": 0.00017133564556942753,
"loss": 1.054,
"step": 338
},
{
"epoch": 0.0011266173314622846,
"grad_norm": 2.193204402923584,
"learning_rate": 0.0001711695353057594,
"loss": 0.4185,
"step": 339
},
{
"epoch": 0.0011299406864223502,
"grad_norm": 2.66573166847229,
"learning_rate": 0.00017100302615330538,
"loss": 1.0129,
"step": 340
},
{
"epoch": 0.0011332640413824159,
"grad_norm": 1.7640445232391357,
"learning_rate": 0.00017083611904531077,
"loss": 0.9847,
"step": 341
},
{
"epoch": 0.0011365873963424817,
"grad_norm": 2.9297056198120117,
"learning_rate": 0.00017066881491725137,
"loss": 0.5878,
"step": 342
},
{
"epoch": 0.0011399107513025474,
"grad_norm": 2.2123494148254395,
"learning_rate": 0.00017050111470682806,
"loss": 1.0239,
"step": 343
},
{
"epoch": 0.0011432341062626132,
"grad_norm": 3.016484498977661,
"learning_rate": 0.00017033301935396176,
"loss": 0.8833,
"step": 344
},
{
"epoch": 0.0011465574612226788,
"grad_norm": 2.979496717453003,
"learning_rate": 0.00017016452980078803,
"loss": 0.9933,
"step": 345
},
{
"epoch": 0.0011498808161827447,
"grad_norm": 3.2114806175231934,
"learning_rate": 0.00016999564699165184,
"loss": 1.1358,
"step": 346
},
{
"epoch": 0.0011532041711428103,
"grad_norm": 2.6296467781066895,
"learning_rate": 0.00016982637187310236,
"loss": 0.975,
"step": 347
},
{
"epoch": 0.0011565275261028762,
"grad_norm": 1.9657410383224487,
"learning_rate": 0.0001696567053938874,
"loss": 1.032,
"step": 348
},
{
"epoch": 0.0011598508810629418,
"grad_norm": 2.1968419551849365,
"learning_rate": 0.0001694866485049483,
"loss": 1.0387,
"step": 349
},
{
"epoch": 0.0011631742360230077,
"grad_norm": 1.9351694583892822,
"learning_rate": 0.0001693162021594147,
"loss": 1.0679,
"step": 350
},
{
"epoch": 0.0011664975909830733,
"grad_norm": 2.5958166122436523,
"learning_rate": 0.0001691453673125989,
"loss": 0.8852,
"step": 351
},
{
"epoch": 0.0011698209459431392,
"grad_norm": 2.6189615726470947,
"learning_rate": 0.00016897414492199068,
"loss": 0.8748,
"step": 352
},
{
"epoch": 0.0011731443009032048,
"grad_norm": 2.212649345397949,
"learning_rate": 0.00016880253594725195,
"loss": 0.8637,
"step": 353
},
{
"epoch": 0.0011764676558632704,
"grad_norm": 2.4734244346618652,
"learning_rate": 0.0001686305413502114,
"loss": 0.6893,
"step": 354
},
{
"epoch": 0.0011797910108233363,
"grad_norm": 2.0774455070495605,
"learning_rate": 0.00016845816209485885,
"loss": 1.0388,
"step": 355
},
{
"epoch": 0.001183114365783402,
"grad_norm": 2.198000192642212,
"learning_rate": 0.00016828539914734024,
"loss": 0.3543,
"step": 356
},
{
"epoch": 0.0011864377207434678,
"grad_norm": 3.1604952812194824,
"learning_rate": 0.0001681122534759519,
"loss": 1.2221,
"step": 357
},
{
"epoch": 0.0011897610757035334,
"grad_norm": 1.7176005840301514,
"learning_rate": 0.0001679387260511353,
"loss": 1.0015,
"step": 358
},
{
"epoch": 0.0011930844306635993,
"grad_norm": 1.6220836639404297,
"learning_rate": 0.0001677648178454715,
"loss": 1.1473,
"step": 359
},
{
"epoch": 0.0011964077856236649,
"grad_norm": 1.8516310453414917,
"learning_rate": 0.0001675905298336758,
"loss": 1.0743,
"step": 360
},
{
"epoch": 0.0011997311405837307,
"grad_norm": 1.4496151208877563,
"learning_rate": 0.00016741586299259215,
"loss": 1.1718,
"step": 361
},
{
"epoch": 0.0012030544955437964,
"grad_norm": 2.137094497680664,
"learning_rate": 0.00016724081830118786,
"loss": 0.9015,
"step": 362
},
{
"epoch": 0.0012063778505038622,
"grad_norm": 2.0566606521606445,
"learning_rate": 0.0001670653967405479,
"loss": 0.6874,
"step": 363
},
{
"epoch": 0.0012097012054639279,
"grad_norm": 1.8700917959213257,
"learning_rate": 0.00016688959929386958,
"loss": 0.9722,
"step": 364
},
{
"epoch": 0.0012130245604239937,
"grad_norm": 2.447117328643799,
"learning_rate": 0.0001667134269464569,
"loss": 0.7384,
"step": 365
},
{
"epoch": 0.0012163479153840594,
"grad_norm": 1.885358452796936,
"learning_rate": 0.00016653688068571514,
"loss": 1.0383,
"step": 366
},
{
"epoch": 0.0012196712703441252,
"grad_norm": 2.7440185546875,
"learning_rate": 0.00016635996150114526,
"loss": 0.916,
"step": 367
},
{
"epoch": 0.0012229946253041908,
"grad_norm": 2.909104347229004,
"learning_rate": 0.00016618267038433836,
"loss": 1.2791,
"step": 368
},
{
"epoch": 0.0012263179802642565,
"grad_norm": 2.072293519973755,
"learning_rate": 0.00016600500832897016,
"loss": 0.9197,
"step": 369
},
{
"epoch": 0.0012296413352243223,
"grad_norm": 2.057619333267212,
"learning_rate": 0.0001658269763307954,
"loss": 1.1913,
"step": 370
},
{
"epoch": 0.001232964690184388,
"grad_norm": 2.2635629177093506,
"learning_rate": 0.00016564857538764222,
"loss": 0.4376,
"step": 371
},
{
"epoch": 0.0012362880451444538,
"grad_norm": 2.7420809268951416,
"learning_rate": 0.00016546980649940668,
"loss": 0.9921,
"step": 372
},
{
"epoch": 0.0012396114001045194,
"grad_norm": 2.0869131088256836,
"learning_rate": 0.0001652906706680471,
"loss": 0.8394,
"step": 373
},
{
"epoch": 0.0012429347550645853,
"grad_norm": 2.557790756225586,
"learning_rate": 0.00016511116889757825,
"loss": 0.444,
"step": 374
},
{
"epoch": 0.001246258110024651,
"grad_norm": 3.326352596282959,
"learning_rate": 0.00016493130219406615,
"loss": 1.2891,
"step": 375
},
{
"epoch": 0.0012495814649847168,
"grad_norm": 2.856938123703003,
"learning_rate": 0.00016475107156562206,
"loss": 0.7033,
"step": 376
},
{
"epoch": 0.0012529048199447824,
"grad_norm": 2.09859037399292,
"learning_rate": 0.00016457047802239698,
"loss": 0.7436,
"step": 377
},
{
"epoch": 0.0012562281749048483,
"grad_norm": 2.375894069671631,
"learning_rate": 0.0001643895225765759,
"loss": 0.7112,
"step": 378
},
{
"epoch": 0.001259551529864914,
"grad_norm": 2.7644810676574707,
"learning_rate": 0.00016420820624237227,
"loss": 0.9923,
"step": 379
},
{
"epoch": 0.0012628748848249798,
"grad_norm": 1.7642817497253418,
"learning_rate": 0.0001640265300360222,
"loss": 0.6766,
"step": 380
},
{
"epoch": 0.0012661982397850454,
"grad_norm": 1.8854440450668335,
"learning_rate": 0.00016384449497577888,
"loss": 1.0343,
"step": 381
},
{
"epoch": 0.001269521594745111,
"grad_norm": 2.546734571456909,
"learning_rate": 0.00016366210208190664,
"loss": 1.1507,
"step": 382
},
{
"epoch": 0.0012728449497051769,
"grad_norm": 2.9949944019317627,
"learning_rate": 0.00016347935237667546,
"loss": 0.5915,
"step": 383
},
{
"epoch": 0.0012761683046652425,
"grad_norm": 2.4460384845733643,
"learning_rate": 0.00016329624688435527,
"loss": 0.7061,
"step": 384
},
{
"epoch": 0.0012794916596253084,
"grad_norm": 2.156528949737549,
"learning_rate": 0.0001631127866312099,
"loss": 1.0332,
"step": 385
},
{
"epoch": 0.001282815014585374,
"grad_norm": 4.042215824127197,
"learning_rate": 0.00016292897264549172,
"loss": 0.9354,
"step": 386
},
{
"epoch": 0.0012861383695454399,
"grad_norm": 2.446014165878296,
"learning_rate": 0.00016274480595743554,
"loss": 1.0042,
"step": 387
},
{
"epoch": 0.0012894617245055055,
"grad_norm": 3.4464094638824463,
"learning_rate": 0.00016256028759925313,
"loss": 0.9816,
"step": 388
},
{
"epoch": 0.0012927850794655713,
"grad_norm": 2.033539295196533,
"learning_rate": 0.00016237541860512713,
"loss": 0.7215,
"step": 389
},
{
"epoch": 0.001296108434425637,
"grad_norm": 2.1186728477478027,
"learning_rate": 0.00016219020001120556,
"loss": 1.2443,
"step": 390
},
{
"epoch": 0.0012994317893857028,
"grad_norm": 2.625006675720215,
"learning_rate": 0.00016200463285559574,
"loss": 1.0826,
"step": 391
},
{
"epoch": 0.0013027551443457685,
"grad_norm": 2.270512104034424,
"learning_rate": 0.00016181871817835876,
"loss": 0.8646,
"step": 392
},
{
"epoch": 0.0013060784993058343,
"grad_norm": 1.2360811233520508,
"learning_rate": 0.0001616324570215033,
"loss": 0.8564,
"step": 393
},
{
"epoch": 0.0013094018542659,
"grad_norm": 3.1289613246917725,
"learning_rate": 0.0001614458504289801,
"loss": 0.7,
"step": 394
},
{
"epoch": 0.0013127252092259658,
"grad_norm": 2.8300228118896484,
"learning_rate": 0.00016125889944667597,
"loss": 0.7768,
"step": 395
},
{
"epoch": 0.0013160485641860314,
"grad_norm": 1.623810887336731,
"learning_rate": 0.00016107160512240793,
"loss": 1.2405,
"step": 396
},
{
"epoch": 0.001319371919146097,
"grad_norm": 1.4173606634140015,
"learning_rate": 0.0001608839685059173,
"loss": 0.8267,
"step": 397
},
{
"epoch": 0.001322695274106163,
"grad_norm": 2.3122427463531494,
"learning_rate": 0.00016069599064886397,
"loss": 0.7878,
"step": 398
},
{
"epoch": 0.0013260186290662286,
"grad_norm": 2.017801284790039,
"learning_rate": 0.00016050767260482034,
"loss": 0.7762,
"step": 399
},
{
"epoch": 0.0013293419840262944,
"grad_norm": 1.9952462911605835,
"learning_rate": 0.00016031901542926547,
"loss": 0.9665,
"step": 400
},
{
"epoch": 0.00133266533898636,
"grad_norm": 1.493338704109192,
"learning_rate": 0.00016013002017957923,
"loss": 0.6265,
"step": 401
},
{
"epoch": 0.001335988693946426,
"grad_norm": 2.571161985397339,
"learning_rate": 0.00015994068791503629,
"loss": 0.601,
"step": 402
},
{
"epoch": 0.0013393120489064915,
"grad_norm": 2.7708725929260254,
"learning_rate": 0.00015975101969680018,
"loss": 0.8486,
"step": 403
},
{
"epoch": 0.0013426354038665574,
"grad_norm": 2.8990213871002197,
"learning_rate": 0.0001595610165879174,
"loss": 0.7763,
"step": 404
},
{
"epoch": 0.001345958758826623,
"grad_norm": 3.1342248916625977,
"learning_rate": 0.00015937067965331148,
"loss": 0.9712,
"step": 405
},
{
"epoch": 0.0013492821137866889,
"grad_norm": 2.5845744609832764,
"learning_rate": 0.00015918000995977685,
"loss": 0.5654,
"step": 406
},
{
"epoch": 0.0013526054687467545,
"grad_norm": 2.0698084831237793,
"learning_rate": 0.00015898900857597305,
"loss": 1.1681,
"step": 407
},
{
"epoch": 0.0013559288237068204,
"grad_norm": 1.965105652809143,
"learning_rate": 0.00015879767657241874,
"loss": 0.9746,
"step": 408
},
{
"epoch": 0.001359252178666886,
"grad_norm": 3.326399564743042,
"learning_rate": 0.00015860601502148549,
"loss": 0.5961,
"step": 409
},
{
"epoch": 0.0013625755336269516,
"grad_norm": 2.5008418560028076,
"learning_rate": 0.00015841402499739197,
"loss": 0.9563,
"step": 410
},
{
"epoch": 0.0013658988885870175,
"grad_norm": 2.0401337146759033,
"learning_rate": 0.00015822170757619789,
"loss": 1.0539,
"step": 411
},
{
"epoch": 0.0013692222435470831,
"grad_norm": 2.2083499431610107,
"learning_rate": 0.00015802906383579788,
"loss": 0.8539,
"step": 412
},
{
"epoch": 0.001372545598507149,
"grad_norm": 2.2163169384002686,
"learning_rate": 0.00015783609485591556,
"loss": 0.9977,
"step": 413
},
{
"epoch": 0.0013758689534672146,
"grad_norm": 1.962361454963684,
"learning_rate": 0.00015764280171809747,
"loss": 0.8299,
"step": 414
},
{
"epoch": 0.0013791923084272805,
"grad_norm": 1.924648404121399,
"learning_rate": 0.0001574491855057069,
"loss": 0.8988,
"step": 415
},
{
"epoch": 0.001382515663387346,
"grad_norm": 2.6560895442962646,
"learning_rate": 0.00015725524730391796,
"loss": 0.7632,
"step": 416
},
{
"epoch": 0.001385839018347412,
"grad_norm": 2.312373399734497,
"learning_rate": 0.00015706098819970942,
"loss": 1.2274,
"step": 417
},
{
"epoch": 0.0013891623733074776,
"grad_norm": 2.8376545906066895,
"learning_rate": 0.00015686640928185862,
"loss": 1.0709,
"step": 418
},
{
"epoch": 0.0013924857282675434,
"grad_norm": 1.7969470024108887,
"learning_rate": 0.00015667151164093545,
"loss": 0.6818,
"step": 419
},
{
"epoch": 0.001395809083227609,
"grad_norm": 2.3826024532318115,
"learning_rate": 0.00015647629636929606,
"loss": 0.991,
"step": 420
},
{
"epoch": 0.001399132438187675,
"grad_norm": 1.66337251663208,
"learning_rate": 0.00015628076456107687,
"loss": 0.5347,
"step": 421
},
{
"epoch": 0.0014024557931477406,
"grad_norm": 2.6910243034362793,
"learning_rate": 0.0001560849173121885,
"loss": 1.0328,
"step": 422
},
{
"epoch": 0.0014057791481078064,
"grad_norm": 3.5220468044281006,
"learning_rate": 0.0001558887557203095,
"loss": 0.8558,
"step": 423
},
{
"epoch": 0.001409102503067872,
"grad_norm": 1.8120965957641602,
"learning_rate": 0.00015569228088488016,
"loss": 0.7371,
"step": 424
},
{
"epoch": 0.0014124258580279377,
"grad_norm": 3.545508861541748,
"learning_rate": 0.00015549549390709652,
"loss": 0.8421,
"step": 425
},
{
"epoch": 0.0014157492129880035,
"grad_norm": 2.4664230346679688,
"learning_rate": 0.00015529839588990411,
"loss": 0.4116,
"step": 426
},
{
"epoch": 0.0014190725679480692,
"grad_norm": 2.0859506130218506,
"learning_rate": 0.0001551009879379917,
"loss": 1.0685,
"step": 427
},
{
"epoch": 0.001422395922908135,
"grad_norm": 2.197068452835083,
"learning_rate": 0.00015490327115778524,
"loss": 1.0368,
"step": 428
},
{
"epoch": 0.0014257192778682007,
"grad_norm": 2.6117873191833496,
"learning_rate": 0.0001547052466574415,
"loss": 0.8388,
"step": 429
},
{
"epoch": 0.0014290426328282665,
"grad_norm": 1.4545451402664185,
"learning_rate": 0.00015450691554684204,
"loss": 0.5815,
"step": 430
},
{
"epoch": 0.0014323659877883321,
"grad_norm": 1.6503546237945557,
"learning_rate": 0.00015430827893758687,
"loss": 1.2627,
"step": 431
},
{
"epoch": 0.001435689342748398,
"grad_norm": 5.260714530944824,
"learning_rate": 0.00015410933794298827,
"loss": 0.5394,
"step": 432
},
{
"epoch": 0.0014390126977084636,
"grad_norm": 1.7504278421401978,
"learning_rate": 0.00015391009367806446,
"loss": 0.8148,
"step": 433
},
{
"epoch": 0.0014423360526685295,
"grad_norm": 1.788711428642273,
"learning_rate": 0.00015371054725953347,
"loss": 0.6245,
"step": 434
},
{
"epoch": 0.0014456594076285951,
"grad_norm": 1.9077836275100708,
"learning_rate": 0.00015351069980580682,
"loss": 0.5702,
"step": 435
},
{
"epoch": 0.001448982762588661,
"grad_norm": 1.910592794418335,
"learning_rate": 0.00015331055243698332,
"loss": 0.7232,
"step": 436
},
{
"epoch": 0.0014523061175487266,
"grad_norm": 2.3402035236358643,
"learning_rate": 0.00015311010627484266,
"loss": 0.7282,
"step": 437
},
{
"epoch": 0.0014556294725087922,
"grad_norm": 2.232607841491699,
"learning_rate": 0.00015290936244283918,
"loss": 1.2627,
"step": 438
},
{
"epoch": 0.001458952827468858,
"grad_norm": 1.9575645923614502,
"learning_rate": 0.00015270832206609568,
"loss": 1.0406,
"step": 439
},
{
"epoch": 0.0014622761824289237,
"grad_norm": 2.150543212890625,
"learning_rate": 0.00015250698627139697,
"loss": 0.9826,
"step": 440
},
{
"epoch": 0.0014655995373889896,
"grad_norm": 4.266528606414795,
"learning_rate": 0.00015230535618718357,
"loss": 0.8738,
"step": 441
},
{
"epoch": 0.0014689228923490552,
"grad_norm": 2.0403382778167725,
"learning_rate": 0.00015210343294354557,
"loss": 0.7778,
"step": 442
},
{
"epoch": 0.001472246247309121,
"grad_norm": 1.740125060081482,
"learning_rate": 0.00015190121767221594,
"loss": 0.9677,
"step": 443
},
{
"epoch": 0.0014755696022691867,
"grad_norm": 2.5546271800994873,
"learning_rate": 0.00015169871150656457,
"loss": 1.2232,
"step": 444
},
{
"epoch": 0.0014788929572292526,
"grad_norm": 2.9307358264923096,
"learning_rate": 0.00015149591558159166,
"loss": 0.5073,
"step": 445
},
{
"epoch": 0.0014822163121893182,
"grad_norm": 1.6386537551879883,
"learning_rate": 0.0001512928310339215,
"loss": 0.5687,
"step": 446
},
{
"epoch": 0.001485539667149384,
"grad_norm": 2.8957998752593994,
"learning_rate": 0.00015108945900179602,
"loss": 0.767,
"step": 447
},
{
"epoch": 0.0014888630221094497,
"grad_norm": 2.1520543098449707,
"learning_rate": 0.00015088580062506835,
"loss": 0.8657,
"step": 448
},
{
"epoch": 0.0014921863770695155,
"grad_norm": 1.4322130680084229,
"learning_rate": 0.00015068185704519667,
"loss": 0.9973,
"step": 449
},
{
"epoch": 0.0014955097320295812,
"grad_norm": 1.5489296913146973,
"learning_rate": 0.0001504776294052375,
"loss": 0.7799,
"step": 450
},
{
"epoch": 0.001498833086989647,
"grad_norm": 2.5609688758850098,
"learning_rate": 0.00015027311884983964,
"loss": 0.9496,
"step": 451
},
{
"epoch": 0.0015021564419497127,
"grad_norm": 2.324089288711548,
"learning_rate": 0.00015006832652523735,
"loss": 0.9417,
"step": 452
},
{
"epoch": 0.0015054797969097783,
"grad_norm": 1.6559861898422241,
"learning_rate": 0.00014986325357924432,
"loss": 1.0362,
"step": 453
},
{
"epoch": 0.0015088031518698441,
"grad_norm": 1.5636197328567505,
"learning_rate": 0.00014965790116124692,
"loss": 1.0802,
"step": 454
},
{
"epoch": 0.0015121265068299098,
"grad_norm": 1.774526834487915,
"learning_rate": 0.00014945227042219804,
"loss": 0.706,
"step": 455
},
{
"epoch": 0.0015154498617899756,
"grad_norm": 2.7354817390441895,
"learning_rate": 0.00014924636251461033,
"loss": 0.5476,
"step": 456
},
{
"epoch": 0.0015187732167500413,
"grad_norm": 1.8853740692138672,
"learning_rate": 0.00014904017859255006,
"loss": 1.112,
"step": 457
},
{
"epoch": 0.0015220965717101071,
"grad_norm": 3.530038356781006,
"learning_rate": 0.0001488337198116304,
"loss": 0.9669,
"step": 458
},
{
"epoch": 0.0015254199266701728,
"grad_norm": 2.1883273124694824,
"learning_rate": 0.00014862698732900508,
"loss": 1.0206,
"step": 459
},
{
"epoch": 0.0015287432816302386,
"grad_norm": 2.3160412311553955,
"learning_rate": 0.0001484199823033618,
"loss": 0.9228,
"step": 460
},
{
"epoch": 0.0015320666365903042,
"grad_norm": 1.904786467552185,
"learning_rate": 0.00014821270589491592,
"loss": 1.1042,
"step": 461
},
{
"epoch": 0.00153538999155037,
"grad_norm": 2.208003044128418,
"learning_rate": 0.00014800515926540375,
"loss": 0.7847,
"step": 462
},
{
"epoch": 0.0015387133465104357,
"grad_norm": 2.0928616523742676,
"learning_rate": 0.00014779734357807614,
"loss": 0.5636,
"step": 463
},
{
"epoch": 0.0015420367014705016,
"grad_norm": 2.1676676273345947,
"learning_rate": 0.000147589259997692,
"loss": 0.9899,
"step": 464
},
{
"epoch": 0.0015453600564305672,
"grad_norm": 1.9746110439300537,
"learning_rate": 0.00014738090969051163,
"loss": 0.8615,
"step": 465
},
{
"epoch": 0.001548683411390633,
"grad_norm": 2.691312789916992,
"learning_rate": 0.00014717229382429028,
"loss": 0.9095,
"step": 466
},
{
"epoch": 0.0015520067663506987,
"grad_norm": 2.8194122314453125,
"learning_rate": 0.0001469634135682717,
"loss": 1.0815,
"step": 467
},
{
"epoch": 0.0015553301213107643,
"grad_norm": 2.1670734882354736,
"learning_rate": 0.0001467542700931814,
"loss": 1.1359,
"step": 468
},
{
"epoch": 0.0015586534762708302,
"grad_norm": 3.914708375930786,
"learning_rate": 0.0001465448645712202,
"loss": 0.7727,
"step": 469
},
{
"epoch": 0.0015619768312308958,
"grad_norm": 1.640748381614685,
"learning_rate": 0.00014633519817605758,
"loss": 0.333,
"step": 470
},
{
"epoch": 0.0015653001861909617,
"grad_norm": 2.2876367568969727,
"learning_rate": 0.00014612527208282522,
"loss": 0.7169,
"step": 471
},
{
"epoch": 0.0015686235411510273,
"grad_norm": 1.4500001668930054,
"learning_rate": 0.00014591508746811032,
"loss": 0.9855,
"step": 472
},
{
"epoch": 0.0015719468961110932,
"grad_norm": 3.451514482498169,
"learning_rate": 0.0001457046455099491,
"loss": 0.8154,
"step": 473
},
{
"epoch": 0.0015752702510711588,
"grad_norm": 2.2254014015197754,
"learning_rate": 0.00014549394738781993,
"loss": 0.9397,
"step": 474
},
{
"epoch": 0.0015785936060312246,
"grad_norm": 2.63394832611084,
"learning_rate": 0.00014528299428263712,
"loss": 0.709,
"step": 475
},
{
"epoch": 0.0015819169609912903,
"grad_norm": 2.099602460861206,
"learning_rate": 0.0001450717873767441,
"loss": 1.0745,
"step": 476
},
{
"epoch": 0.0015852403159513561,
"grad_norm": 2.0736916065216064,
"learning_rate": 0.00014486032785390667,
"loss": 1.2907,
"step": 477
},
{
"epoch": 0.0015885636709114218,
"grad_norm": 2.344252347946167,
"learning_rate": 0.0001446486168993065,
"loss": 0.9257,
"step": 478
},
{
"epoch": 0.0015918870258714876,
"grad_norm": 2.2529079914093018,
"learning_rate": 0.00014443665569953466,
"loss": 0.5821,
"step": 479
},
{
"epoch": 0.0015952103808315533,
"grad_norm": 2.390965223312378,
"learning_rate": 0.00014422444544258454,
"loss": 0.9467,
"step": 480
},
{
"epoch": 0.001598533735791619,
"grad_norm": 2.4676353931427,
"learning_rate": 0.0001440119873178456,
"loss": 1.1481,
"step": 481
},
{
"epoch": 0.0016018570907516847,
"grad_norm": 2.723836660385132,
"learning_rate": 0.0001437992825160965,
"loss": 1.0146,
"step": 482
},
{
"epoch": 0.0016051804457117504,
"grad_norm": 2.4351353645324707,
"learning_rate": 0.00014358633222949843,
"loss": 1.0365,
"step": 483
},
{
"epoch": 0.0016085038006718162,
"grad_norm": 2.6160218715667725,
"learning_rate": 0.0001433731376515885,
"loss": 0.8535,
"step": 484
},
{
"epoch": 0.0016118271556318819,
"grad_norm": 1.719913125038147,
"learning_rate": 0.00014315969997727305,
"loss": 0.7818,
"step": 485
},
{
"epoch": 0.0016151505105919477,
"grad_norm": 1.9794899225234985,
"learning_rate": 0.00014294602040282087,
"loss": 0.6927,
"step": 486
},
{
"epoch": 0.0016184738655520134,
"grad_norm": 2.530505657196045,
"learning_rate": 0.00014273210012585651,
"loss": 1.0023,
"step": 487
},
{
"epoch": 0.0016217972205120792,
"grad_norm": 3.295813798904419,
"learning_rate": 0.00014251794034535374,
"loss": 1.0951,
"step": 488
},
{
"epoch": 0.0016251205754721448,
"grad_norm": 1.7799811363220215,
"learning_rate": 0.00014230354226162849,
"loss": 1.0959,
"step": 489
},
{
"epoch": 0.0016284439304322107,
"grad_norm": 3.7634365558624268,
"learning_rate": 0.00014208890707633252,
"loss": 0.7226,
"step": 490
},
{
"epoch": 0.0016317672853922763,
"grad_norm": 2.918199062347412,
"learning_rate": 0.00014187403599244639,
"loss": 0.9726,
"step": 491
},
{
"epoch": 0.0016350906403523422,
"grad_norm": 1.9790452718734741,
"learning_rate": 0.00014165893021427276,
"loss": 0.7381,
"step": 492
},
{
"epoch": 0.0016384139953124078,
"grad_norm": 3.541562557220459,
"learning_rate": 0.00014144359094742985,
"loss": 1.0652,
"step": 493
},
{
"epoch": 0.0016417373502724737,
"grad_norm": 2.492720603942871,
"learning_rate": 0.00014122801939884445,
"loss": 0.6157,
"step": 494
},
{
"epoch": 0.0016450607052325393,
"grad_norm": 3.0369317531585693,
"learning_rate": 0.0001410122167767452,
"loss": 1.0933,
"step": 495
},
{
"epoch": 0.001648384060192605,
"grad_norm": 3.2473337650299072,
"learning_rate": 0.00014079618429065587,
"loss": 0.9726,
"step": 496
},
{
"epoch": 0.0016517074151526708,
"grad_norm": 2.1378862857818604,
"learning_rate": 0.0001405799231513886,
"loss": 1.0105,
"step": 497
},
{
"epoch": 0.0016550307701127364,
"grad_norm": 2.4640915393829346,
"learning_rate": 0.0001403634345710371,
"loss": 0.7305,
"step": 498
},
{
"epoch": 0.0016583541250728023,
"grad_norm": 2.5675315856933594,
"learning_rate": 0.00014014671976296975,
"loss": 0.9592,
"step": 499
},
{
"epoch": 0.001661677480032868,
"grad_norm": 2.35866117477417,
"learning_rate": 0.00013992977994182297,
"loss": 1.2306,
"step": 500
},
{
"epoch": 0.0016650008349929338,
"grad_norm": 1.8059924840927124,
"learning_rate": 0.00013971261632349423,
"loss": 0.9314,
"step": 501
},
{
"epoch": 0.0016683241899529994,
"grad_norm": 2.0500893592834473,
"learning_rate": 0.0001394952301251354,
"loss": 0.9083,
"step": 502
},
{
"epoch": 0.0016716475449130653,
"grad_norm": 1.9817473888397217,
"learning_rate": 0.00013927762256514588,
"loss": 1.1285,
"step": 503
},
{
"epoch": 0.0016749708998731309,
"grad_norm": 2.559354782104492,
"learning_rate": 0.00013905979486316568,
"loss": 0.9594,
"step": 504
},
{
"epoch": 0.0016782942548331967,
"grad_norm": 2.1031811237335205,
"learning_rate": 0.00013884174824006873,
"loss": 0.9267,
"step": 505
},
{
"epoch": 0.0016816176097932624,
"grad_norm": 2.237828254699707,
"learning_rate": 0.0001386234839179559,
"loss": 0.7867,
"step": 506
},
{
"epoch": 0.0016849409647533282,
"grad_norm": 3.0826919078826904,
"learning_rate": 0.00013840500312014826,
"loss": 0.9198,
"step": 507
},
{
"epoch": 0.0016882643197133939,
"grad_norm": 1.821419358253479,
"learning_rate": 0.00013818630707118014,
"loss": 0.877,
"step": 508
},
{
"epoch": 0.0016915876746734595,
"grad_norm": 2.015178918838501,
"learning_rate": 0.00013796739699679228,
"loss": 0.7542,
"step": 509
},
{
"epoch": 0.0016949110296335254,
"grad_norm": 2.8974575996398926,
"learning_rate": 0.00013774827412392507,
"loss": 0.9196,
"step": 510
},
{
"epoch": 0.001698234384593591,
"grad_norm": 2.040759801864624,
"learning_rate": 0.00013752893968071149,
"loss": 0.8026,
"step": 511
},
{
"epoch": 0.0017015577395536568,
"grad_norm": 1.8575818538665771,
"learning_rate": 0.00013730939489647043,
"loss": 0.7079,
"step": 512
},
{
"epoch": 0.0017048810945137225,
"grad_norm": 2.015794038772583,
"learning_rate": 0.00013708964100169957,
"loss": 1.1416,
"step": 513
},
{
"epoch": 0.0017082044494737883,
"grad_norm": 2.1127171516418457,
"learning_rate": 0.0001368696792280687,
"loss": 0.9396,
"step": 514
},
{
"epoch": 0.001711527804433854,
"grad_norm": 2.133606195449829,
"learning_rate": 0.00013664951080841268,
"loss": 1.1934,
"step": 515
},
{
"epoch": 0.0017148511593939198,
"grad_norm": 1.995360016822815,
"learning_rate": 0.00013642913697672462,
"loss": 0.7406,
"step": 516
},
{
"epoch": 0.0017181745143539854,
"grad_norm": 2.1345937252044678,
"learning_rate": 0.00013620855896814878,
"loss": 1.0595,
"step": 517
},
{
"epoch": 0.0017214978693140513,
"grad_norm": 2.163135051727295,
"learning_rate": 0.000135987778018974,
"loss": 0.8264,
"step": 518
},
{
"epoch": 0.001724821224274117,
"grad_norm": 1.7292685508728027,
"learning_rate": 0.00013576679536662638,
"loss": 0.7376,
"step": 519
},
{
"epoch": 0.0017281445792341828,
"grad_norm": 1.779617190361023,
"learning_rate": 0.0001355456122496626,
"loss": 0.2976,
"step": 520
},
{
"epoch": 0.0017314679341942484,
"grad_norm": 2.1159169673919678,
"learning_rate": 0.00013532422990776287,
"loss": 1.1358,
"step": 521
},
{
"epoch": 0.0017347912891543143,
"grad_norm": 2.535615921020508,
"learning_rate": 0.00013510264958172398,
"loss": 0.9962,
"step": 522
},
{
"epoch": 0.00173811464411438,
"grad_norm": 2.4328877925872803,
"learning_rate": 0.00013488087251345245,
"loss": 0.7934,
"step": 523
},
{
"epoch": 0.0017414379990744455,
"grad_norm": 3.112191915512085,
"learning_rate": 0.00013465889994595747,
"loss": 0.8322,
"step": 524
},
{
"epoch": 0.0017447613540345114,
"grad_norm": 1.626070499420166,
"learning_rate": 0.00013443673312334392,
"loss": 0.4358,
"step": 525
},
{
"epoch": 0.001748084708994577,
"grad_norm": 1.7329679727554321,
"learning_rate": 0.00013421437329080548,
"loss": 0.6419,
"step": 526
},
{
"epoch": 0.0017514080639546429,
"grad_norm": 2.1917316913604736,
"learning_rate": 0.00013399182169461757,
"loss": 1.0182,
"step": 527
},
{
"epoch": 0.0017547314189147085,
"grad_norm": 2.6607604026794434,
"learning_rate": 0.0001337690795821304,
"loss": 0.7061,
"step": 528
},
{
"epoch": 0.0017580547738747744,
"grad_norm": 2.403113603591919,
"learning_rate": 0.00013354614820176205,
"loss": 0.9044,
"step": 529
},
{
"epoch": 0.00176137812883484,
"grad_norm": 1.7576875686645508,
"learning_rate": 0.0001333230288029913,
"loss": 0.8398,
"step": 530
},
{
"epoch": 0.0017647014837949059,
"grad_norm": 1.7336602210998535,
"learning_rate": 0.00013309972263635076,
"loss": 1.3434,
"step": 531
},
{
"epoch": 0.0017680248387549715,
"grad_norm": 1.9603030681610107,
"learning_rate": 0.00013287623095341991,
"loss": 0.7383,
"step": 532
},
{
"epoch": 0.0017713481937150373,
"grad_norm": 2.057232141494751,
"learning_rate": 0.00013265255500681796,
"loss": 1.0643,
"step": 533
},
{
"epoch": 0.001774671548675103,
"grad_norm": 1.9266340732574463,
"learning_rate": 0.00013242869605019676,
"loss": 0.9176,
"step": 534
},
{
"epoch": 0.0017779949036351688,
"grad_norm": 2.640422821044922,
"learning_rate": 0.00013220465533823406,
"loss": 0.8926,
"step": 535
},
{
"epoch": 0.0017813182585952345,
"grad_norm": 2.180840015411377,
"learning_rate": 0.00013198043412662627,
"loss": 0.9526,
"step": 536
},
{
"epoch": 0.0017846416135553,
"grad_norm": 2.1570563316345215,
"learning_rate": 0.00013175603367208134,
"loss": 0.7101,
"step": 537
},
{
"epoch": 0.001787964968515366,
"grad_norm": 2.518481731414795,
"learning_rate": 0.00013153145523231197,
"loss": 0.7608,
"step": 538
},
{
"epoch": 0.0017912883234754316,
"grad_norm": 2.682793140411377,
"learning_rate": 0.00013130670006602837,
"loss": 0.9513,
"step": 539
},
{
"epoch": 0.0017946116784354974,
"grad_norm": 2.468629837036133,
"learning_rate": 0.00013108176943293126,
"loss": 0.8469,
"step": 540
},
{
"epoch": 0.001797935033395563,
"grad_norm": 2.705810308456421,
"learning_rate": 0.0001308566645937048,
"loss": 1.0462,
"step": 541
},
{
"epoch": 0.001801258388355629,
"grad_norm": 1.938127875328064,
"learning_rate": 0.00013063138681000962,
"loss": 0.9872,
"step": 542
},
{
"epoch": 0.0018045817433156946,
"grad_norm": 1.6162564754486084,
"learning_rate": 0.00013040593734447555,
"loss": 0.8097,
"step": 543
},
{
"epoch": 0.0018079050982757604,
"grad_norm": 1.9742683172225952,
"learning_rate": 0.00013018031746069467,
"loss": 0.7124,
"step": 544
},
{
"epoch": 0.001811228453235826,
"grad_norm": 2.3947043418884277,
"learning_rate": 0.0001299545284232143,
"loss": 0.54,
"step": 545
},
{
"epoch": 0.001814551808195892,
"grad_norm": 3.0326342582702637,
"learning_rate": 0.00012972857149752976,
"loss": 0.6806,
"step": 546
},
{
"epoch": 0.0018178751631559575,
"grad_norm": 2.3208701610565186,
"learning_rate": 0.0001295024479500774,
"loss": 0.8071,
"step": 547
},
{
"epoch": 0.0018211985181160234,
"grad_norm": 2.1220314502716064,
"learning_rate": 0.00012927615904822736,
"loss": 0.8911,
"step": 548
},
{
"epoch": 0.001824521873076089,
"grad_norm": 4.058460712432861,
"learning_rate": 0.00012904970606027657,
"loss": 0.6629,
"step": 549
},
{
"epoch": 0.0018278452280361549,
"grad_norm": 1.9857665300369263,
"learning_rate": 0.00012882309025544173,
"loss": 1.1803,
"step": 550
},
{
"epoch": 0.0018311685829962205,
"grad_norm": 2.9957692623138428,
"learning_rate": 0.000128596312903852,
"loss": 0.8082,
"step": 551
},
{
"epoch": 0.0018344919379562862,
"grad_norm": 3.009734869003296,
"learning_rate": 0.00012836937527654194,
"loss": 0.6776,
"step": 552
},
{
"epoch": 0.001837815292916352,
"grad_norm": 2.615323066711426,
"learning_rate": 0.00012814227864544453,
"loss": 1.3133,
"step": 553
},
{
"epoch": 0.0018411386478764176,
"grad_norm": 2.3974363803863525,
"learning_rate": 0.0001279150242833838,
"loss": 0.5645,
"step": 554
},
{
"epoch": 0.0018444620028364835,
"grad_norm": 2.873770236968994,
"learning_rate": 0.00012768761346406793,
"loss": 0.4937,
"step": 555
},
{
"epoch": 0.0018477853577965491,
"grad_norm": 2.761345863342285,
"learning_rate": 0.000127460047462082,
"loss": 0.6471,
"step": 556
},
{
"epoch": 0.001851108712756615,
"grad_norm": 2.254004955291748,
"learning_rate": 0.00012723232755288075,
"loss": 0.5895,
"step": 557
},
{
"epoch": 0.0018544320677166806,
"grad_norm": 3.007963180541992,
"learning_rate": 0.0001270044550127816,
"loss": 1.0542,
"step": 558
},
{
"epoch": 0.0018577554226767465,
"grad_norm": 2.213001251220703,
"learning_rate": 0.00012677643111895756,
"loss": 0.8269,
"step": 559
},
{
"epoch": 0.001861078777636812,
"grad_norm": 2.1694979667663574,
"learning_rate": 0.00012654825714942972,
"loss": 0.7593,
"step": 560
},
{
"epoch": 0.001864402132596878,
"grad_norm": 2.7547688484191895,
"learning_rate": 0.00012631993438306043,
"loss": 1.9703,
"step": 561
},
{
"epoch": 0.0018677254875569436,
"grad_norm": 2.476234197616577,
"learning_rate": 0.00012609146409954598,
"loss": 0.8481,
"step": 562
},
{
"epoch": 0.0018710488425170094,
"grad_norm": 3.1867456436157227,
"learning_rate": 0.00012586284757940946,
"loss": 0.9079,
"step": 563
},
{
"epoch": 0.001874372197477075,
"grad_norm": 2.296586513519287,
"learning_rate": 0.0001256340861039936,
"loss": 0.7001,
"step": 564
},
{
"epoch": 0.001877695552437141,
"grad_norm": 1.6829555034637451,
"learning_rate": 0.00012540518095545365,
"loss": 0.7896,
"step": 565
},
{
"epoch": 0.0018810189073972066,
"grad_norm": 3.4515936374664307,
"learning_rate": 0.00012517613341674988,
"loss": 0.8062,
"step": 566
},
{
"epoch": 0.0018843422623572722,
"grad_norm": 1.9574947357177734,
"learning_rate": 0.00012494694477164085,
"loss": 0.945,
"step": 567
},
{
"epoch": 0.001887665617317338,
"grad_norm": 2.0612564086914062,
"learning_rate": 0.00012471761630467593,
"loss": 0.82,
"step": 568
},
{
"epoch": 0.0018909889722774037,
"grad_norm": 1.8187974691390991,
"learning_rate": 0.0001244881493011881,
"loss": 0.6127,
"step": 569
},
{
"epoch": 0.0018943123272374695,
"grad_norm": 2.5469536781311035,
"learning_rate": 0.00012425854504728687,
"loss": 1.0473,
"step": 570
},
{
"epoch": 0.0018976356821975352,
"grad_norm": 2.1312432289123535,
"learning_rate": 0.00012402880482985097,
"loss": 0.6154,
"step": 571
},
{
"epoch": 0.001900959037157601,
"grad_norm": 2.154513359069824,
"learning_rate": 0.0001237989299365212,
"loss": 0.851,
"step": 572
},
{
"epoch": 0.0019042823921176667,
"grad_norm": 2.3860623836517334,
"learning_rate": 0.00012356892165569315,
"loss": 0.9685,
"step": 573
},
{
"epoch": 0.0019076057470777325,
"grad_norm": 1.7248111963272095,
"learning_rate": 0.00012333878127651006,
"loss": 0.8971,
"step": 574
},
{
"epoch": 0.0019109291020377981,
"grad_norm": 2.4173145294189453,
"learning_rate": 0.00012310851008885551,
"loss": 0.3288,
"step": 575
},
{
"epoch": 0.001914252456997864,
"grad_norm": 2.560663938522339,
"learning_rate": 0.00012287810938334628,
"loss": 0.9095,
"step": 576
},
{
"epoch": 0.0019175758119579296,
"grad_norm": 2.4700520038604736,
"learning_rate": 0.00012264758045132503,
"loss": 0.8354,
"step": 577
},
{
"epoch": 0.0019208991669179955,
"grad_norm": 3.193286895751953,
"learning_rate": 0.0001224169245848531,
"loss": 0.9986,
"step": 578
},
{
"epoch": 0.0019242225218780611,
"grad_norm": 2.184664249420166,
"learning_rate": 0.00012218614307670325,
"loss": 0.7368,
"step": 579
},
{
"epoch": 0.0019275458768381268,
"grad_norm": 2.0635182857513428,
"learning_rate": 0.00012195523722035251,
"loss": 1.0547,
"step": 580
},
{
"epoch": 0.0019308692317981926,
"grad_norm": 2.0145578384399414,
"learning_rate": 0.00012172420830997477,
"loss": 0.8455,
"step": 581
},
{
"epoch": 0.0019341925867582582,
"grad_norm": 2.438504219055176,
"learning_rate": 0.00012149305764043368,
"loss": 0.9734,
"step": 582
},
{
"epoch": 0.001937515941718324,
"grad_norm": 2.3351311683654785,
"learning_rate": 0.00012126178650727527,
"loss": 0.4809,
"step": 583
},
{
"epoch": 0.0019408392966783897,
"grad_norm": 2.146331548690796,
"learning_rate": 0.0001210303962067207,
"loss": 1.2014,
"step": 584
},
{
"epoch": 0.0019441626516384556,
"grad_norm": 2.121534824371338,
"learning_rate": 0.00012079888803565916,
"loss": 0.7174,
"step": 585
},
{
"epoch": 0.0019474860065985212,
"grad_norm": 2.191533088684082,
"learning_rate": 0.00012056726329164036,
"loss": 0.579,
"step": 586
},
{
"epoch": 0.001950809361558587,
"grad_norm": 3.195350170135498,
"learning_rate": 0.00012033552327286748,
"loss": 0.7127,
"step": 587
},
{
"epoch": 0.001954132716518653,
"grad_norm": 2.790376663208008,
"learning_rate": 0.00012010366927818966,
"loss": 0.7117,
"step": 588
},
{
"epoch": 0.0019574560714787186,
"grad_norm": 2.871413469314575,
"learning_rate": 0.00011987170260709497,
"loss": 0.8917,
"step": 589
},
{
"epoch": 0.001960779426438784,
"grad_norm": 2.7113215923309326,
"learning_rate": 0.00011963962455970292,
"loss": 0.9844,
"step": 590
},
{
"epoch": 0.00196410278139885,
"grad_norm": 2.5417799949645996,
"learning_rate": 0.00011940743643675733,
"loss": 0.7809,
"step": 591
},
{
"epoch": 0.001967426136358916,
"grad_norm": 2.755305528640747,
"learning_rate": 0.00011917513953961892,
"loss": 0.5988,
"step": 592
},
{
"epoch": 0.0019707494913189815,
"grad_norm": 3.3853535652160645,
"learning_rate": 0.00011894273517025802,
"loss": 0.7682,
"step": 593
},
{
"epoch": 0.001974072846279047,
"grad_norm": 2.008025884628296,
"learning_rate": 0.00011871022463124743,
"loss": 0.9777,
"step": 594
},
{
"epoch": 0.001977396201239113,
"grad_norm": 2.048288106918335,
"learning_rate": 0.00011847760922575495,
"loss": 1.1554,
"step": 595
},
{
"epoch": 0.0019807195561991784,
"grad_norm": 2.378748893737793,
"learning_rate": 0.00011824489025753611,
"loss": 0.7135,
"step": 596
},
{
"epoch": 0.0019840429111592445,
"grad_norm": 1.7821803092956543,
"learning_rate": 0.0001180120690309269,
"loss": 0.7473,
"step": 597
},
{
"epoch": 0.00198736626611931,
"grad_norm": 2.244962453842163,
"learning_rate": 0.00011777914685083649,
"loss": 0.9208,
"step": 598
},
{
"epoch": 0.0019906896210793758,
"grad_norm": 3.296584367752075,
"learning_rate": 0.00011754612502273976,
"loss": 0.6115,
"step": 599
},
{
"epoch": 0.0019940129760394414,
"grad_norm": 2.3598029613494873,
"learning_rate": 0.00011731300485267023,
"loss": 0.4963,
"step": 600
},
{
"epoch": 0.0019973363309995075,
"grad_norm": 1.8640865087509155,
"learning_rate": 0.00011707978764721253,
"loss": 1.0341,
"step": 601
},
{
"epoch": 0.002000659685959573,
"grad_norm": 2.823103666305542,
"learning_rate": 0.0001168464747134951,
"loss": 0.5334,
"step": 602
},
{
"epoch": 0.0020039830409196388,
"grad_norm": 2.356562614440918,
"learning_rate": 0.00011661306735918303,
"loss": 1.1563,
"step": 603
},
{
"epoch": 0.0020073063958797044,
"grad_norm": 1.7850453853607178,
"learning_rate": 0.00011637956689247058,
"loss": 0.6332,
"step": 604
},
{
"epoch": 0.0020106297508397705,
"grad_norm": 2.6029694080352783,
"learning_rate": 0.00011614597462207382,
"loss": 0.5515,
"step": 605
},
{
"epoch": 0.002013953105799836,
"grad_norm": 2.554975986480713,
"learning_rate": 0.00011591229185722339,
"loss": 1.0411,
"step": 606
},
{
"epoch": 0.0020172764607599017,
"grad_norm": 2.723120927810669,
"learning_rate": 0.0001156785199076572,
"loss": 0.9845,
"step": 607
},
{
"epoch": 0.0020205998157199674,
"grad_norm": 2.5275723934173584,
"learning_rate": 0.00011544466008361295,
"loss": 0.7504,
"step": 608
},
{
"epoch": 0.002023923170680033,
"grad_norm": 2.436131000518799,
"learning_rate": 0.00011521071369582079,
"loss": 0.834,
"step": 609
},
{
"epoch": 0.002027246525640099,
"grad_norm": 1.6245455741882324,
"learning_rate": 0.00011497668205549621,
"loss": 1.1013,
"step": 610
},
{
"epoch": 0.0020305698806001647,
"grad_norm": 1.9569754600524902,
"learning_rate": 0.00011474256647433239,
"loss": 0.7834,
"step": 611
},
{
"epoch": 0.0020338932355602303,
"grad_norm": 2.2459635734558105,
"learning_rate": 0.00011450836826449305,
"loss": 0.7966,
"step": 612
},
{
"epoch": 0.002037216590520296,
"grad_norm": 2.3233418464660645,
"learning_rate": 0.00011427408873860495,
"loss": 0.6925,
"step": 613
},
{
"epoch": 0.002040539945480362,
"grad_norm": 2.3365554809570312,
"learning_rate": 0.00011403972920975066,
"loss": 1.1375,
"step": 614
},
{
"epoch": 0.0020438633004404277,
"grad_norm": 2.217907667160034,
"learning_rate": 0.00011380529099146114,
"loss": 0.7229,
"step": 615
},
{
"epoch": 0.0020471866554004933,
"grad_norm": 2.2354955673217773,
"learning_rate": 0.0001135707753977084,
"loss": 0.9491,
"step": 616
},
{
"epoch": 0.002050510010360559,
"grad_norm": 1.5122630596160889,
"learning_rate": 0.00011333618374289812,
"loss": 0.9403,
"step": 617
},
{
"epoch": 0.002053833365320625,
"grad_norm": 1.8312718868255615,
"learning_rate": 0.00011310151734186221,
"loss": 0.718,
"step": 618
},
{
"epoch": 0.0020571567202806906,
"grad_norm": 4.088409900665283,
"learning_rate": 0.00011286677750985164,
"loss": 1.0302,
"step": 619
},
{
"epoch": 0.0020604800752407563,
"grad_norm": 1.87000572681427,
"learning_rate": 0.00011263196556252882,
"loss": 0.7077,
"step": 620
},
{
"epoch": 0.002063803430200822,
"grad_norm": 2.2027151584625244,
"learning_rate": 0.00011239708281596048,
"loss": 0.8584,
"step": 621
},
{
"epoch": 0.0020671267851608876,
"grad_norm": 1.6802822351455688,
"learning_rate": 0.00011216213058661005,
"loss": 0.4266,
"step": 622
},
{
"epoch": 0.0020704501401209536,
"grad_norm": 1.5808541774749756,
"learning_rate": 0.00011192711019133043,
"loss": 1.0296,
"step": 623
},
{
"epoch": 0.0020737734950810193,
"grad_norm": 2.912220001220703,
"learning_rate": 0.0001116920229473566,
"loss": 1.05,
"step": 624
},
{
"epoch": 0.002077096850041085,
"grad_norm": 2.2467291355133057,
"learning_rate": 0.00011145687017229817,
"loss": 0.9888,
"step": 625
},
{
"epoch": 0.0020804202050011505,
"grad_norm": 2.39890718460083,
"learning_rate": 0.00011122165318413212,
"loss": 0.9139,
"step": 626
},
{
"epoch": 0.0020837435599612166,
"grad_norm": 1.633890151977539,
"learning_rate": 0.0001109863733011952,
"loss": 0.9724,
"step": 627
},
{
"epoch": 0.0020870669149212822,
"grad_norm": 1.9472523927688599,
"learning_rate": 0.00011075103184217676,
"loss": 1.016,
"step": 628
},
{
"epoch": 0.002090390269881348,
"grad_norm": 2.565326452255249,
"learning_rate": 0.00011051563012611126,
"loss": 1.1732,
"step": 629
},
{
"epoch": 0.0020937136248414135,
"grad_norm": 2.2524020671844482,
"learning_rate": 0.00011028016947237089,
"loss": 0.4214,
"step": 630
},
{
"epoch": 0.0020970369798014796,
"grad_norm": 3.7713682651519775,
"learning_rate": 0.00011004465120065814,
"loss": 0.9991,
"step": 631
},
{
"epoch": 0.002100360334761545,
"grad_norm": 2.0051424503326416,
"learning_rate": 0.00010980907663099844,
"loss": 0.8795,
"step": 632
},
{
"epoch": 0.002103683689721611,
"grad_norm": 1.7558032274246216,
"learning_rate": 0.00010957344708373278,
"loss": 0.9593,
"step": 633
},
{
"epoch": 0.0021070070446816765,
"grad_norm": 1.7094303369522095,
"learning_rate": 0.00010933776387951033,
"loss": 1.0733,
"step": 634
},
{
"epoch": 0.0021103303996417425,
"grad_norm": 2.2374813556671143,
"learning_rate": 0.00010910202833928089,
"loss": 0.8686,
"step": 635
},
{
"epoch": 0.002113653754601808,
"grad_norm": 3.0024795532226562,
"learning_rate": 0.00010886624178428762,
"loss": 0.8538,
"step": 636
},
{
"epoch": 0.002116977109561874,
"grad_norm": 1.9901098012924194,
"learning_rate": 0.0001086304055360597,
"loss": 0.8818,
"step": 637
},
{
"epoch": 0.0021203004645219395,
"grad_norm": 1.961928367614746,
"learning_rate": 0.00010839452091640469,
"loss": 1.0803,
"step": 638
},
{
"epoch": 0.002123623819482005,
"grad_norm": 3.1986398696899414,
"learning_rate": 0.00010815858924740139,
"loss": 0.6213,
"step": 639
},
{
"epoch": 0.002126947174442071,
"grad_norm": 2.1918091773986816,
"learning_rate": 0.00010792261185139221,
"loss": 0.8908,
"step": 640
},
{
"epoch": 0.002130270529402137,
"grad_norm": 2.5319199562072754,
"learning_rate": 0.00010768659005097585,
"loss": 0.9992,
"step": 641
},
{
"epoch": 0.0021335938843622024,
"grad_norm": 2.3293981552124023,
"learning_rate": 0.00010745052516899996,
"loss": 1.0111,
"step": 642
},
{
"epoch": 0.002136917239322268,
"grad_norm": 2.176724910736084,
"learning_rate": 0.0001072144185285536,
"loss": 0.86,
"step": 643
},
{
"epoch": 0.002140240594282334,
"grad_norm": 1.5879802703857422,
"learning_rate": 0.00010697827145295991,
"loss": 0.6246,
"step": 644
},
{
"epoch": 0.0021435639492423998,
"grad_norm": 1.701689600944519,
"learning_rate": 0.00010674208526576857,
"loss": 0.5481,
"step": 645
},
{
"epoch": 0.0021468873042024654,
"grad_norm": 2.197331190109253,
"learning_rate": 0.00010650586129074857,
"loss": 0.6809,
"step": 646
},
{
"epoch": 0.002150210659162531,
"grad_norm": 2.1718311309814453,
"learning_rate": 0.00010626960085188067,
"loss": 1.0801,
"step": 647
},
{
"epoch": 0.002153534014122597,
"grad_norm": 4.355485916137695,
"learning_rate": 0.00010603330527334998,
"loss": 1.2853,
"step": 648
},
{
"epoch": 0.0021568573690826627,
"grad_norm": 2.317735195159912,
"learning_rate": 0.00010579697587953858,
"loss": 0.9521,
"step": 649
},
{
"epoch": 0.0021601807240427284,
"grad_norm": 1.9010009765625,
"learning_rate": 0.00010556061399501802,
"loss": 0.82,
"step": 650
},
{
"epoch": 0.002163504079002794,
"grad_norm": 1.3154947757720947,
"learning_rate": 0.00010532422094454204,
"loss": 1.2106,
"step": 651
},
{
"epoch": 0.0021668274339628596,
"grad_norm": 2.130319833755493,
"learning_rate": 0.00010508779805303905,
"loss": 0.9213,
"step": 652
},
{
"epoch": 0.0021701507889229257,
"grad_norm": 1.8011289834976196,
"learning_rate": 0.00010485134664560461,
"loss": 1.0765,
"step": 653
},
{
"epoch": 0.0021734741438829914,
"grad_norm": 2.1197900772094727,
"learning_rate": 0.00010461486804749418,
"loss": 0.9027,
"step": 654
},
{
"epoch": 0.002176797498843057,
"grad_norm": 1.9010964632034302,
"learning_rate": 0.00010437836358411568,
"loss": 0.6381,
"step": 655
},
{
"epoch": 0.0021801208538031226,
"grad_norm": 1.9410107135772705,
"learning_rate": 0.00010414183458102186,
"loss": 0.7964,
"step": 656
},
{
"epoch": 0.0021834442087631887,
"grad_norm": 2.4888432025909424,
"learning_rate": 0.0001039052823639031,
"loss": 0.9221,
"step": 657
},
{
"epoch": 0.0021867675637232543,
"grad_norm": 2.1054494380950928,
"learning_rate": 0.00010366870825857989,
"loss": 0.7669,
"step": 658
},
{
"epoch": 0.00219009091868332,
"grad_norm": 1.6643797159194946,
"learning_rate": 0.00010343211359099532,
"loss": 0.9111,
"step": 659
},
{
"epoch": 0.0021934142736433856,
"grad_norm": 1.8980937004089355,
"learning_rate": 0.00010319549968720787,
"loss": 1.103,
"step": 660
},
{
"epoch": 0.0021967376286034517,
"grad_norm": 2.2484617233276367,
"learning_rate": 0.00010295886787338369,
"loss": 0.93,
"step": 661
},
{
"epoch": 0.0022000609835635173,
"grad_norm": 2.264021873474121,
"learning_rate": 0.00010272221947578937,
"loss": 0.8636,
"step": 662
},
{
"epoch": 0.002203384338523583,
"grad_norm": 2.902461051940918,
"learning_rate": 0.00010248555582078445,
"loss": 1.0284,
"step": 663
},
{
"epoch": 0.0022067076934836486,
"grad_norm": 1.2113059759140015,
"learning_rate": 0.00010224887823481402,
"loss": 0.7502,
"step": 664
},
{
"epoch": 0.002210031048443714,
"grad_norm": 3.4769859313964844,
"learning_rate": 0.0001020121880444012,
"loss": 0.4294,
"step": 665
},
{
"epoch": 0.0022133544034037803,
"grad_norm": 3.4410877227783203,
"learning_rate": 0.00010177548657613971,
"loss": 0.9235,
"step": 666
},
{
"epoch": 0.002216677758363846,
"grad_norm": 3.706423044204712,
"learning_rate": 0.00010153877515668662,
"loss": 0.9124,
"step": 667
},
{
"epoch": 0.0022200011133239115,
"grad_norm": 1.6656898260116577,
"learning_rate": 0.00010130205511275464,
"loss": 1.0897,
"step": 668
},
{
"epoch": 0.002223324468283977,
"grad_norm": 1.8999181985855103,
"learning_rate": 0.00010106532777110486,
"loss": 0.9479,
"step": 669
},
{
"epoch": 0.0022266478232440432,
"grad_norm": 2.498556137084961,
"learning_rate": 0.00010082859445853934,
"loss": 0.8578,
"step": 670
},
{
"epoch": 0.0022266478232440432,
"eval_loss": 0.7900153994560242,
"eval_runtime": 3240.8445,
"eval_samples_per_second": 39.094,
"eval_steps_per_second": 19.547,
"step": 670
},
{
"epoch": 0.002229971178204109,
"grad_norm": 1.8369241952896118,
"learning_rate": 0.00010059185650189348,
"loss": 0.6654,
"step": 671
},
{
"epoch": 0.0022332945331641745,
"grad_norm": 1.9373723268508911,
"learning_rate": 0.0001003551152280288,
"loss": 0.5924,
"step": 672
},
{
"epoch": 0.00223661788812424,
"grad_norm": 2.284693956375122,
"learning_rate": 0.00010011837196382543,
"loss": 0.7141,
"step": 673
},
{
"epoch": 0.0022399412430843062,
"grad_norm": 1.8173497915267944,
"learning_rate": 9.988162803617458e-05,
"loss": 0.8533,
"step": 674
},
{
"epoch": 0.002243264598044372,
"grad_norm": 2.2727203369140625,
"learning_rate": 9.96448847719712e-05,
"loss": 1.0409,
"step": 675
},
{
"epoch": 0.0022465879530044375,
"grad_norm": 1.7468798160552979,
"learning_rate": 9.940814349810653e-05,
"loss": 0.9378,
"step": 676
},
{
"epoch": 0.002249911307964503,
"grad_norm": 2.3897597789764404,
"learning_rate": 9.917140554146067e-05,
"loss": 0.9678,
"step": 677
},
{
"epoch": 0.002253234662924569,
"grad_norm": 1.4605112075805664,
"learning_rate": 9.893467222889516e-05,
"loss": 0.8242,
"step": 678
},
{
"epoch": 0.002256558017884635,
"grad_norm": 2.9371156692504883,
"learning_rate": 9.869794488724537e-05,
"loss": 0.97,
"step": 679
},
{
"epoch": 0.0022598813728447005,
"grad_norm": 1.554976224899292,
"learning_rate": 9.846122484331343e-05,
"loss": 0.6133,
"step": 680
},
{
"epoch": 0.002263204727804766,
"grad_norm": 1.649769902229309,
"learning_rate": 9.822451342386031e-05,
"loss": 1.0024,
"step": 681
},
{
"epoch": 0.0022665280827648317,
"grad_norm": 2.1759161949157715,
"learning_rate": 9.798781195559883e-05,
"loss": 0.4422,
"step": 682
},
{
"epoch": 0.002269851437724898,
"grad_norm": 2.7241005897521973,
"learning_rate": 9.7751121765186e-05,
"loss": 0.6119,
"step": 683
},
{
"epoch": 0.0022731747926849634,
"grad_norm": 2.4562528133392334,
"learning_rate": 9.751444417921555e-05,
"loss": 0.9385,
"step": 684
},
{
"epoch": 0.002276498147645029,
"grad_norm": 4.837019920349121,
"learning_rate": 9.727778052421064e-05,
"loss": 0.4189,
"step": 685
},
{
"epoch": 0.0022798215026050947,
"grad_norm": 1.7064907550811768,
"learning_rate": 9.704113212661635e-05,
"loss": 1.1713,
"step": 686
},
{
"epoch": 0.0022831448575651608,
"grad_norm": 3.0644757747650146,
"learning_rate": 9.680450031279216e-05,
"loss": 0.8532,
"step": 687
},
{
"epoch": 0.0022864682125252264,
"grad_norm": 2.6350553035736084,
"learning_rate": 9.65678864090047e-05,
"loss": 0.7621,
"step": 688
},
{
"epoch": 0.002289791567485292,
"grad_norm": 1.530420184135437,
"learning_rate": 9.633129174142014e-05,
"loss": 0.8986,
"step": 689
},
{
"epoch": 0.0022931149224453577,
"grad_norm": 1.633294939994812,
"learning_rate": 9.609471763609692e-05,
"loss": 0.4471,
"step": 690
},
{
"epoch": 0.0022964382774054238,
"grad_norm": 2.1451008319854736,
"learning_rate": 9.585816541897816e-05,
"loss": 0.7268,
"step": 691
},
{
"epoch": 0.0022997616323654894,
"grad_norm": 2.0823404788970947,
"learning_rate": 9.562163641588437e-05,
"loss": 0.6338,
"step": 692
},
{
"epoch": 0.002303084987325555,
"grad_norm": 1.8083529472351074,
"learning_rate": 9.538513195250583e-05,
"loss": 0.6431,
"step": 693
},
{
"epoch": 0.0023064083422856207,
"grad_norm": 2.6677329540252686,
"learning_rate": 9.514865335439541e-05,
"loss": 0.7905,
"step": 694
},
{
"epoch": 0.0023097316972456863,
"grad_norm": 3.05940580368042,
"learning_rate": 9.491220194696096e-05,
"loss": 0.7371,
"step": 695
},
{
"epoch": 0.0023130550522057524,
"grad_norm": 1.8619906902313232,
"learning_rate": 9.467577905545795e-05,
"loss": 1.0022,
"step": 696
},
{
"epoch": 0.002316378407165818,
"grad_norm": 2.6758852005004883,
"learning_rate": 9.443938600498196e-05,
"loss": 1.0803,
"step": 697
},
{
"epoch": 0.0023197017621258836,
"grad_norm": 3.5494561195373535,
"learning_rate": 9.420302412046145e-05,
"loss": 0.9802,
"step": 698
},
{
"epoch": 0.0023230251170859493,
"grad_norm": 1.923540472984314,
"learning_rate": 9.396669472665004e-05,
"loss": 1.0205,
"step": 699
},
{
"epoch": 0.0023263484720460153,
"grad_norm": 2.2519900798797607,
"learning_rate": 9.373039914811935e-05,
"loss": 0.9522,
"step": 700
},
{
"epoch": 0.002329671827006081,
"grad_norm": 2.2550559043884277,
"learning_rate": 9.349413870925144e-05,
"loss": 1.0383,
"step": 701
},
{
"epoch": 0.0023329951819661466,
"grad_norm": 2.535550355911255,
"learning_rate": 9.325791473423144e-05,
"loss": 0.7763,
"step": 702
},
{
"epoch": 0.0023363185369262122,
"grad_norm": 2.5544021129608154,
"learning_rate": 9.30217285470401e-05,
"loss": 0.6332,
"step": 703
},
{
"epoch": 0.0023396418918862783,
"grad_norm": 2.3451223373413086,
"learning_rate": 9.278558147144642e-05,
"loss": 0.958,
"step": 704
},
{
"epoch": 0.002342965246846344,
"grad_norm": 3.874997138977051,
"learning_rate": 9.254947483100006e-05,
"loss": 1.3982,
"step": 705
},
{
"epoch": 0.0023462886018064096,
"grad_norm": 1.9717652797698975,
"learning_rate": 9.231340994902417e-05,
"loss": 0.8799,
"step": 706
},
{
"epoch": 0.0023496119567664752,
"grad_norm": 2.044839382171631,
"learning_rate": 9.207738814860783e-05,
"loss": 1.2354,
"step": 707
},
{
"epoch": 0.002352935311726541,
"grad_norm": 2.498375415802002,
"learning_rate": 9.184141075259863e-05,
"loss": 0.8478,
"step": 708
},
{
"epoch": 0.002356258666686607,
"grad_norm": 1.493669867515564,
"learning_rate": 9.160547908359532e-05,
"loss": 1.0594,
"step": 709
},
{
"epoch": 0.0023595820216466726,
"grad_norm": 2.704451084136963,
"learning_rate": 9.136959446394034e-05,
"loss": 0.9091,
"step": 710
},
{
"epoch": 0.002362905376606738,
"grad_norm": 1.4863560199737549,
"learning_rate": 9.113375821571239e-05,
"loss": 1.3247,
"step": 711
},
{
"epoch": 0.002366228731566804,
"grad_norm": 2.655653715133667,
"learning_rate": 9.089797166071914e-05,
"loss": 0.6064,
"step": 712
},
{
"epoch": 0.00236955208652687,
"grad_norm": 2.025148868560791,
"learning_rate": 9.066223612048969e-05,
"loss": 0.4657,
"step": 713
},
{
"epoch": 0.0023728754414869355,
"grad_norm": 2.5605967044830322,
"learning_rate": 9.04265529162672e-05,
"loss": 1.1225,
"step": 714
},
{
"epoch": 0.002376198796447001,
"grad_norm": 1.5156453847885132,
"learning_rate": 9.019092336900156e-05,
"loss": 0.5417,
"step": 715
},
{
"epoch": 0.002379522151407067,
"grad_norm": 1.3675994873046875,
"learning_rate": 8.99553487993419e-05,
"loss": 0.7597,
"step": 716
},
{
"epoch": 0.002382845506367133,
"grad_norm": 1.4033820629119873,
"learning_rate": 8.971983052762913e-05,
"loss": 0.8266,
"step": 717
},
{
"epoch": 0.0023861688613271985,
"grad_norm": 3.831524133682251,
"learning_rate": 8.948436987388876e-05,
"loss": 1.0312,
"step": 718
},
{
"epoch": 0.002389492216287264,
"grad_norm": 2.5193397998809814,
"learning_rate": 8.924896815782326e-05,
"loss": 0.388,
"step": 719
},
{
"epoch": 0.0023928155712473298,
"grad_norm": 2.961416482925415,
"learning_rate": 8.901362669880481e-05,
"loss": 0.9089,
"step": 720
},
{
"epoch": 0.0023961389262073954,
"grad_norm": 2.4046037197113037,
"learning_rate": 8.87783468158679e-05,
"loss": 0.674,
"step": 721
},
{
"epoch": 0.0023994622811674615,
"grad_norm": 2.0201237201690674,
"learning_rate": 8.854312982770185e-05,
"loss": 0.4603,
"step": 722
},
{
"epoch": 0.002402785636127527,
"grad_norm": 1.9336488246917725,
"learning_rate": 8.830797705264344e-05,
"loss": 1.2791,
"step": 723
},
{
"epoch": 0.0024061089910875928,
"grad_norm": 2.432431936264038,
"learning_rate": 8.80728898086696e-05,
"loss": 0.555,
"step": 724
},
{
"epoch": 0.0024094323460476584,
"grad_norm": 2.049858331680298,
"learning_rate": 8.783786941338997e-05,
"loss": 1.2561,
"step": 725
},
{
"epoch": 0.0024127557010077245,
"grad_norm": 1.9385666847229004,
"learning_rate": 8.760291718403955e-05,
"loss": 1.2627,
"step": 726
},
{
"epoch": 0.00241607905596779,
"grad_norm": 2.3782167434692383,
"learning_rate": 8.736803443747117e-05,
"loss": 0.493,
"step": 727
},
{
"epoch": 0.0024194024109278557,
"grad_norm": 2.8201656341552734,
"learning_rate": 8.713322249014841e-05,
"loss": 1.219,
"step": 728
},
{
"epoch": 0.0024227257658879214,
"grad_norm": 1.9326146841049194,
"learning_rate": 8.689848265813782e-05,
"loss": 0.671,
"step": 729
},
{
"epoch": 0.0024260491208479874,
"grad_norm": 2.4845776557922363,
"learning_rate": 8.666381625710192e-05,
"loss": 1.1306,
"step": 730
},
{
"epoch": 0.002429372475808053,
"grad_norm": 1.897662878036499,
"learning_rate": 8.642922460229161e-05,
"loss": 1.0483,
"step": 731
},
{
"epoch": 0.0024326958307681187,
"grad_norm": 2.1385698318481445,
"learning_rate": 8.619470900853887e-05,
"loss": 0.8237,
"step": 732
},
{
"epoch": 0.0024360191857281843,
"grad_norm": 2.609041690826416,
"learning_rate": 8.596027079024935e-05,
"loss": 1.3002,
"step": 733
},
{
"epoch": 0.0024393425406882504,
"grad_norm": 2.091109037399292,
"learning_rate": 8.57259112613951e-05,
"loss": 1.1823,
"step": 734
},
{
"epoch": 0.002442665895648316,
"grad_norm": 4.9723711013793945,
"learning_rate": 8.549163173550698e-05,
"loss": 0.9495,
"step": 735
},
{
"epoch": 0.0024459892506083817,
"grad_norm": 2.0263137817382812,
"learning_rate": 8.525743352566764e-05,
"loss": 0.945,
"step": 736
},
{
"epoch": 0.0024493126055684473,
"grad_norm": 2.4975996017456055,
"learning_rate": 8.50233179445038e-05,
"loss": 0.5585,
"step": 737
},
{
"epoch": 0.002452635960528513,
"grad_norm": 1.869718313217163,
"learning_rate": 8.478928630417921e-05,
"loss": 1.1208,
"step": 738
},
{
"epoch": 0.002455959315488579,
"grad_norm": 2.218090295791626,
"learning_rate": 8.455533991638712e-05,
"loss": 0.9037,
"step": 739
},
{
"epoch": 0.0024592826704486447,
"grad_norm": 1.9609618186950684,
"learning_rate": 8.432148009234284e-05,
"loss": 0.6614,
"step": 740
},
{
"epoch": 0.0024626060254087103,
"grad_norm": 2.544095277786255,
"learning_rate": 8.408770814277663e-05,
"loss": 0.7293,
"step": 741
},
{
"epoch": 0.002465929380368776,
"grad_norm": 2.0442986488342285,
"learning_rate": 8.385402537792621e-05,
"loss": 0.797,
"step": 742
},
{
"epoch": 0.002469252735328842,
"grad_norm": 1.888458251953125,
"learning_rate": 8.362043310752943e-05,
"loss": 0.8964,
"step": 743
},
{
"epoch": 0.0024725760902889076,
"grad_norm": 1.9157485961914062,
"learning_rate": 8.338693264081697e-05,
"loss": 0.8633,
"step": 744
},
{
"epoch": 0.0024758994452489733,
"grad_norm": 3.2599165439605713,
"learning_rate": 8.315352528650495e-05,
"loss": 1.0014,
"step": 745
},
{
"epoch": 0.002479222800209039,
"grad_norm": 2.07326340675354,
"learning_rate": 8.292021235278753e-05,
"loss": 0.8836,
"step": 746
},
{
"epoch": 0.002482546155169105,
"grad_norm": 2.074749231338501,
"learning_rate": 8.268699514732979e-05,
"loss": 1.0107,
"step": 747
},
{
"epoch": 0.0024858695101291706,
"grad_norm": 2.413445472717285,
"learning_rate": 8.245387497726027e-05,
"loss": 1.0935,
"step": 748
},
{
"epoch": 0.0024891928650892362,
"grad_norm": 2.770019769668579,
"learning_rate": 8.222085314916355e-05,
"loss": 0.6449,
"step": 749
},
{
"epoch": 0.002492516220049302,
"grad_norm": 1.7852829694747925,
"learning_rate": 8.198793096907309e-05,
"loss": 0.658,
"step": 750
},
{
"epoch": 0.0024958395750093675,
"grad_norm": 1.9140889644622803,
"learning_rate": 8.175510974246391e-05,
"loss": 0.7751,
"step": 751
},
{
"epoch": 0.0024991629299694336,
"grad_norm": 1.8890185356140137,
"learning_rate": 8.152239077424507e-05,
"loss": 0.854,
"step": 752
},
{
"epoch": 0.002502486284929499,
"grad_norm": 2.610456943511963,
"learning_rate": 8.12897753687526e-05,
"loss": 0.6235,
"step": 753
},
{
"epoch": 0.002505809639889565,
"grad_norm": 1.5817598104476929,
"learning_rate": 8.105726482974199e-05,
"loss": 0.6668,
"step": 754
},
{
"epoch": 0.0025091329948496305,
"grad_norm": 2.368863344192505,
"learning_rate": 8.082486046038111e-05,
"loss": 0.8256,
"step": 755
},
{
"epoch": 0.0025124563498096966,
"grad_norm": 2.1765031814575195,
"learning_rate": 8.059256356324268e-05,
"loss": 0.908,
"step": 756
},
{
"epoch": 0.002515779704769762,
"grad_norm": 1.7256314754486084,
"learning_rate": 8.036037544029709e-05,
"loss": 0.577,
"step": 757
},
{
"epoch": 0.002519103059729828,
"grad_norm": 1.280367136001587,
"learning_rate": 8.012829739290508e-05,
"loss": 0.6491,
"step": 758
},
{
"epoch": 0.0025224264146898935,
"grad_norm": 1.6887412071228027,
"learning_rate": 7.989633072181037e-05,
"loss": 1.1735,
"step": 759
},
{
"epoch": 0.0025257497696499595,
"grad_norm": 2.1677637100219727,
"learning_rate": 7.966447672713254e-05,
"loss": 0.9427,
"step": 760
},
{
"epoch": 0.002529073124610025,
"grad_norm": 2.13200044631958,
"learning_rate": 7.943273670835966e-05,
"loss": 0.5779,
"step": 761
},
{
"epoch": 0.002532396479570091,
"grad_norm": 1.6203086376190186,
"learning_rate": 7.920111196434085e-05,
"loss": 0.8072,
"step": 762
},
{
"epoch": 0.0025357198345301564,
"grad_norm": 3.0133414268493652,
"learning_rate": 7.896960379327934e-05,
"loss": 0.7679,
"step": 763
},
{
"epoch": 0.002539043189490222,
"grad_norm": 1.7745375633239746,
"learning_rate": 7.873821349272478e-05,
"loss": 0.8275,
"step": 764
},
{
"epoch": 0.002542366544450288,
"grad_norm": 2.3216776847839355,
"learning_rate": 7.850694235956633e-05,
"loss": 0.6882,
"step": 765
},
{
"epoch": 0.0025456898994103538,
"grad_norm": 1.5814892053604126,
"learning_rate": 7.827579169002524e-05,
"loss": 0.8818,
"step": 766
},
{
"epoch": 0.0025490132543704194,
"grad_norm": 2.209172487258911,
"learning_rate": 7.80447627796475e-05,
"loss": 0.5781,
"step": 767
},
{
"epoch": 0.002552336609330485,
"grad_norm": 2.160571813583374,
"learning_rate": 7.781385692329675e-05,
"loss": 0.8048,
"step": 768
},
{
"epoch": 0.002555659964290551,
"grad_norm": 2.0445640087127686,
"learning_rate": 7.758307541514695e-05,
"loss": 1.2058,
"step": 769
},
{
"epoch": 0.0025589833192506167,
"grad_norm": 2.0943262577056885,
"learning_rate": 7.7352419548675e-05,
"loss": 1.0538,
"step": 770
},
{
"epoch": 0.0025623066742106824,
"grad_norm": 4.018516540527344,
"learning_rate": 7.712189061665375e-05,
"loss": 0.4696,
"step": 771
},
{
"epoch": 0.002565630029170748,
"grad_norm": 1.7685644626617432,
"learning_rate": 7.68914899111445e-05,
"loss": 1.128,
"step": 772
},
{
"epoch": 0.002568953384130814,
"grad_norm": 1.4300886392593384,
"learning_rate": 7.666121872348995e-05,
"loss": 0.8966,
"step": 773
},
{
"epoch": 0.0025722767390908797,
"grad_norm": 1.8490028381347656,
"learning_rate": 7.643107834430686e-05,
"loss": 0.9422,
"step": 774
},
{
"epoch": 0.0025756000940509454,
"grad_norm": 1.758457064628601,
"learning_rate": 7.620107006347883e-05,
"loss": 1.0323,
"step": 775
},
{
"epoch": 0.002578923449011011,
"grad_norm": 2.2684803009033203,
"learning_rate": 7.597119517014905e-05,
"loss": 0.7242,
"step": 776
},
{
"epoch": 0.0025822468039710766,
"grad_norm": 1.4495296478271484,
"learning_rate": 7.574145495271315e-05,
"loss": 0.9589,
"step": 777
},
{
"epoch": 0.0025855701589311427,
"grad_norm": 1.9905943870544434,
"learning_rate": 7.55118506988119e-05,
"loss": 0.7841,
"step": 778
},
{
"epoch": 0.0025888935138912083,
"grad_norm": 1.9174848794937134,
"learning_rate": 7.52823836953241e-05,
"loss": 0.5076,
"step": 779
},
{
"epoch": 0.002592216868851274,
"grad_norm": 1.315011978149414,
"learning_rate": 7.505305522835916e-05,
"loss": 0.3087,
"step": 780
},
{
"epoch": 0.0025955402238113396,
"grad_norm": 2.5037121772766113,
"learning_rate": 7.482386658325018e-05,
"loss": 1.1691,
"step": 781
},
{
"epoch": 0.0025988635787714057,
"grad_norm": 1.9181026220321655,
"learning_rate": 7.459481904454642e-05,
"loss": 0.6995,
"step": 782
},
{
"epoch": 0.0026021869337314713,
"grad_norm": 1.811360478401184,
"learning_rate": 7.43659138960064e-05,
"loss": 0.9057,
"step": 783
},
{
"epoch": 0.002605510288691537,
"grad_norm": 3.2156643867492676,
"learning_rate": 7.413715242059058e-05,
"loss": 0.6813,
"step": 784
},
{
"epoch": 0.0026088336436516026,
"grad_norm": 2.809575080871582,
"learning_rate": 7.390853590045406e-05,
"loss": 0.8422,
"step": 785
},
{
"epoch": 0.0026121569986116686,
"grad_norm": 3.667475461959839,
"learning_rate": 7.368006561693959e-05,
"loss": 0.9714,
"step": 786
},
{
"epoch": 0.0026154803535717343,
"grad_norm": 1.7948848009109497,
"learning_rate": 7.345174285057032e-05,
"loss": 0.7066,
"step": 787
},
{
"epoch": 0.0026188037085318,
"grad_norm": 2.3704257011413574,
"learning_rate": 7.322356888104247e-05,
"loss": 0.7432,
"step": 788
},
{
"epoch": 0.0026221270634918655,
"grad_norm": 1.6095038652420044,
"learning_rate": 7.299554498721839e-05,
"loss": 1.0732,
"step": 789
},
{
"epoch": 0.0026254504184519316,
"grad_norm": 1.8659001588821411,
"learning_rate": 7.276767244711929e-05,
"loss": 1.3812,
"step": 790
},
{
"epoch": 0.0026287737734119973,
"grad_norm": 2.7053465843200684,
"learning_rate": 7.253995253791803e-05,
"loss": 0.8775,
"step": 791
},
{
"epoch": 0.002632097128372063,
"grad_norm": 1.9934276342391968,
"learning_rate": 7.231238653593208e-05,
"loss": 1.1197,
"step": 792
},
{
"epoch": 0.0026354204833321285,
"grad_norm": 2.0284664630889893,
"learning_rate": 7.208497571661625e-05,
"loss": 0.9916,
"step": 793
},
{
"epoch": 0.002638743838292194,
"grad_norm": 2.268935203552246,
"learning_rate": 7.185772135455553e-05,
"loss": 0.9107,
"step": 794
},
{
"epoch": 0.0026420671932522602,
"grad_norm": 1.5755196809768677,
"learning_rate": 7.163062472345807e-05,
"loss": 0.8286,
"step": 795
},
{
"epoch": 0.002645390548212326,
"grad_norm": 2.701737403869629,
"learning_rate": 7.140368709614804e-05,
"loss": 0.6317,
"step": 796
},
{
"epoch": 0.0026487139031723915,
"grad_norm": 1.6806485652923584,
"learning_rate": 7.117690974455828e-05,
"loss": 1.0092,
"step": 797
},
{
"epoch": 0.002652037258132457,
"grad_norm": 2.387301445007324,
"learning_rate": 7.095029393972341e-05,
"loss": 0.9981,
"step": 798
},
{
"epoch": 0.002655360613092523,
"grad_norm": 2.0945944786071777,
"learning_rate": 7.072384095177269e-05,
"loss": 0.8833,
"step": 799
},
{
"epoch": 0.002658683968052589,
"grad_norm": 2.2880685329437256,
"learning_rate": 7.049755204992262e-05,
"loss": 0.6492,
"step": 800
},
{
"epoch": 0.0026620073230126545,
"grad_norm": 2.2428207397460938,
"learning_rate": 7.027142850247023e-05,
"loss": 1.0001,
"step": 801
},
{
"epoch": 0.00266533067797272,
"grad_norm": 2.195579767227173,
"learning_rate": 7.00454715767857e-05,
"loss": 0.8971,
"step": 802
},
{
"epoch": 0.002668654032932786,
"grad_norm": 2.64233660697937,
"learning_rate": 6.981968253930532e-05,
"loss": 0.9304,
"step": 803
},
{
"epoch": 0.002671977387892852,
"grad_norm": 1.573728322982788,
"learning_rate": 6.959406265552446e-05,
"loss": 0.7636,
"step": 804
},
{
"epoch": 0.0026753007428529174,
"grad_norm": 2.323046922683716,
"learning_rate": 6.936861318999039e-05,
"loss": 0.8272,
"step": 805
},
{
"epoch": 0.002678624097812983,
"grad_norm": 1.2111873626708984,
"learning_rate": 6.914333540629521e-05,
"loss": 0.8412,
"step": 806
},
{
"epoch": 0.0026819474527730487,
"grad_norm": 2.6255946159362793,
"learning_rate": 6.891823056706877e-05,
"loss": 1.2073,
"step": 807
},
{
"epoch": 0.002685270807733115,
"grad_norm": 2.2799253463745117,
"learning_rate": 6.869329993397165e-05,
"loss": 1.0982,
"step": 808
},
{
"epoch": 0.0026885941626931804,
"grad_norm": 2.6791117191314697,
"learning_rate": 6.846854476768804e-05,
"loss": 0.8172,
"step": 809
},
{
"epoch": 0.002691917517653246,
"grad_norm": 1.7381466627120972,
"learning_rate": 6.824396632791867e-05,
"loss": 1.0518,
"step": 810
},
{
"epoch": 0.0026952408726133117,
"grad_norm": 2.056283712387085,
"learning_rate": 6.801956587337378e-05,
"loss": 0.7701,
"step": 811
},
{
"epoch": 0.0026985642275733778,
"grad_norm": 2.5469250679016113,
"learning_rate": 6.779534466176595e-05,
"loss": 0.77,
"step": 812
},
{
"epoch": 0.0027018875825334434,
"grad_norm": 1.608278512954712,
"learning_rate": 6.757130394980324e-05,
"loss": 1.0236,
"step": 813
},
{
"epoch": 0.002705210937493509,
"grad_norm": 2.222339153289795,
"learning_rate": 6.734744499318209e-05,
"loss": 0.7726,
"step": 814
},
{
"epoch": 0.0027085342924535747,
"grad_norm": 1.7149579524993896,
"learning_rate": 6.712376904658008e-05,
"loss": 0.8896,
"step": 815
},
{
"epoch": 0.0027118576474136407,
"grad_norm": 1.6997548341751099,
"learning_rate": 6.690027736364922e-05,
"loss": 0.9059,
"step": 816
},
{
"epoch": 0.0027151810023737064,
"grad_norm": 2.0666792392730713,
"learning_rate": 6.667697119700876e-05,
"loss": 0.9987,
"step": 817
},
{
"epoch": 0.002718504357333772,
"grad_norm": 2.7630252838134766,
"learning_rate": 6.645385179823798e-05,
"loss": 0.8782,
"step": 818
},
{
"epoch": 0.0027218277122938376,
"grad_norm": 1.464687466621399,
"learning_rate": 6.623092041786963e-05,
"loss": 0.7638,
"step": 819
},
{
"epoch": 0.0027251510672539033,
"grad_norm": 2.7846906185150146,
"learning_rate": 6.600817830538244e-05,
"loss": 1.2357,
"step": 820
},
{
"epoch": 0.0027284744222139693,
"grad_norm": 1.8688273429870605,
"learning_rate": 6.578562670919453e-05,
"loss": 0.7761,
"step": 821
},
{
"epoch": 0.002731797777174035,
"grad_norm": 2.552701473236084,
"learning_rate": 6.556326687665608e-05,
"loss": 0.8297,
"step": 822
},
{
"epoch": 0.0027351211321341006,
"grad_norm": 2.456305503845215,
"learning_rate": 6.534110005404255e-05,
"loss": 0.7151,
"step": 823
},
{
"epoch": 0.0027384444870941663,
"grad_norm": 2.453496217727661,
"learning_rate": 6.511912748654759e-05,
"loss": 0.59,
"step": 824
},
{
"epoch": 0.0027417678420542323,
"grad_norm": 1.7119109630584717,
"learning_rate": 6.489735041827605e-05,
"loss": 1.0155,
"step": 825
},
{
"epoch": 0.002745091197014298,
"grad_norm": 1.677862524986267,
"learning_rate": 6.467577009223717e-05,
"loss": 0.7779,
"step": 826
},
{
"epoch": 0.0027484145519743636,
"grad_norm": 2.11332106590271,
"learning_rate": 6.445438775033743e-05,
"loss": 1.1967,
"step": 827
},
{
"epoch": 0.0027517379069344292,
"grad_norm": 1.6521720886230469,
"learning_rate": 6.423320463337363e-05,
"loss": 0.9303,
"step": 828
},
{
"epoch": 0.0027550612618944953,
"grad_norm": 1.7343634366989136,
"learning_rate": 6.401222198102603e-05,
"loss": 1.0159,
"step": 829
},
{
"epoch": 0.002758384616854561,
"grad_norm": 1.9725018739700317,
"learning_rate": 6.379144103185123e-05,
"loss": 0.7249,
"step": 830
},
{
"epoch": 0.0027617079718146266,
"grad_norm": 1.848870873451233,
"learning_rate": 6.357086302327542e-05,
"loss": 0.8119,
"step": 831
},
{
"epoch": 0.002765031326774692,
"grad_norm": 2.3503406047821045,
"learning_rate": 6.335048919158733e-05,
"loss": 0.6752,
"step": 832
},
{
"epoch": 0.0027683546817347583,
"grad_norm": 1.880751132965088,
"learning_rate": 6.31303207719313e-05,
"loss": 0.8967,
"step": 833
},
{
"epoch": 0.002771678036694824,
"grad_norm": 2.5321950912475586,
"learning_rate": 6.291035899830043e-05,
"loss": 1.4429,
"step": 834
},
{
"epoch": 0.0027750013916548895,
"grad_norm": 2.1685030460357666,
"learning_rate": 6.26906051035296e-05,
"loss": 0.8103,
"step": 835
},
{
"epoch": 0.002778324746614955,
"grad_norm": 2.246095895767212,
"learning_rate": 6.247106031928854e-05,
"loss": 0.9691,
"step": 836
},
{
"epoch": 0.002781648101575021,
"grad_norm": 2.2341558933258057,
"learning_rate": 6.225172587607496e-05,
"loss": 0.9346,
"step": 837
},
{
"epoch": 0.002784971456535087,
"grad_norm": 1.7885407209396362,
"learning_rate": 6.203260300320773e-05,
"loss": 0.7622,
"step": 838
},
{
"epoch": 0.0027882948114951525,
"grad_norm": 2.0753839015960693,
"learning_rate": 6.181369292881987e-05,
"loss": 0.9163,
"step": 839
},
{
"epoch": 0.002791618166455218,
"grad_norm": 2.8468055725097656,
"learning_rate": 6.159499687985175e-05,
"loss": 1.0923,
"step": 840
},
{
"epoch": 0.002794941521415284,
"grad_norm": 1.672538161277771,
"learning_rate": 6.137651608204411e-05,
"loss": 0.8471,
"step": 841
},
{
"epoch": 0.00279826487637535,
"grad_norm": 1.954453945159912,
"learning_rate": 6.115825175993129e-05,
"loss": 1.0673,
"step": 842
},
{
"epoch": 0.0028015882313354155,
"grad_norm": 2.2643611431121826,
"learning_rate": 6.0940205136834314e-05,
"loss": 0.8641,
"step": 843
},
{
"epoch": 0.002804911586295481,
"grad_norm": 2.73639178276062,
"learning_rate": 6.0722377434854115e-05,
"loss": 0.3082,
"step": 844
},
{
"epoch": 0.0028082349412555468,
"grad_norm": 1.2507221698760986,
"learning_rate": 6.0504769874864606e-05,
"loss": 0.6386,
"step": 845
},
{
"epoch": 0.002811558296215613,
"grad_norm": 2.7501723766326904,
"learning_rate": 6.0287383676505796e-05,
"loss": 0.675,
"step": 846
},
{
"epoch": 0.0028148816511756785,
"grad_norm": 2.5215868949890137,
"learning_rate": 6.0070220058177083e-05,
"loss": 1.4006,
"step": 847
},
{
"epoch": 0.002818205006135744,
"grad_norm": 2.644489288330078,
"learning_rate": 5.985328023703026e-05,
"loss": 1.0212,
"step": 848
},
{
"epoch": 0.0028215283610958097,
"grad_norm": 2.0919172763824463,
"learning_rate": 5.963656542896292e-05,
"loss": 0.9604,
"step": 849
},
{
"epoch": 0.0028248517160558754,
"grad_norm": 1.9078031778335571,
"learning_rate": 5.942007684861141e-05,
"loss": 0.6867,
"step": 850
},
{
"epoch": 0.0028281750710159414,
"grad_norm": 1.341930866241455,
"learning_rate": 5.920381570934415e-05,
"loss": 0.4825,
"step": 851
},
{
"epoch": 0.002831498425976007,
"grad_norm": 1.4135596752166748,
"learning_rate": 5.898778322325482e-05,
"loss": 0.9042,
"step": 852
},
{
"epoch": 0.0028348217809360727,
"grad_norm": 3.0637757778167725,
"learning_rate": 5.8771980601155584e-05,
"loss": 0.8,
"step": 853
},
{
"epoch": 0.0028381451358961383,
"grad_norm": 1.1350816488265991,
"learning_rate": 5.8556409052570184e-05,
"loss": 0.6352,
"step": 854
},
{
"epoch": 0.0028414684908562044,
"grad_norm": 2.974297523498535,
"learning_rate": 5.834106978572726e-05,
"loss": 0.953,
"step": 855
},
{
"epoch": 0.00284479184581627,
"grad_norm": 1.6131561994552612,
"learning_rate": 5.812596400755368e-05,
"loss": 1.1661,
"step": 856
},
{
"epoch": 0.0028481152007763357,
"grad_norm": 1.42369544506073,
"learning_rate": 5.791109292366749e-05,
"loss": 1.1922,
"step": 857
},
{
"epoch": 0.0028514385557364013,
"grad_norm": 2.268709421157837,
"learning_rate": 5.769645773837156e-05,
"loss": 0.7496,
"step": 858
},
{
"epoch": 0.0028547619106964674,
"grad_norm": 1.760610580444336,
"learning_rate": 5.7482059654646304e-05,
"loss": 0.8179,
"step": 859
},
{
"epoch": 0.002858085265656533,
"grad_norm": 1.400349736213684,
"learning_rate": 5.7267899874143495e-05,
"loss": 0.9362,
"step": 860
},
{
"epoch": 0.0028614086206165987,
"grad_norm": 1.8297139406204224,
"learning_rate": 5.7053979597179175e-05,
"loss": 0.8381,
"step": 861
},
{
"epoch": 0.0028647319755766643,
"grad_norm": 1.8903241157531738,
"learning_rate": 5.684030002272694e-05,
"loss": 1.1997,
"step": 862
},
{
"epoch": 0.00286805533053673,
"grad_norm": 1.5943444967269897,
"learning_rate": 5.66268623484115e-05,
"loss": 0.7329,
"step": 863
},
{
"epoch": 0.002871378685496796,
"grad_norm": 1.3991549015045166,
"learning_rate": 5.641366777050159e-05,
"loss": 1.0264,
"step": 864
},
{
"epoch": 0.0028747020404568616,
"grad_norm": 3.293590784072876,
"learning_rate": 5.6200717483903545e-05,
"loss": 1.2638,
"step": 865
},
{
"epoch": 0.0028780253954169273,
"grad_norm": 2.305203914642334,
"learning_rate": 5.598801268215443e-05,
"loss": 0.7039,
"step": 866
},
{
"epoch": 0.002881348750376993,
"grad_norm": 1.9197697639465332,
"learning_rate": 5.5775554557415465e-05,
"loss": 0.7887,
"step": 867
},
{
"epoch": 0.002884672105337059,
"grad_norm": 1.9447755813598633,
"learning_rate": 5.556334430046537e-05,
"loss": 0.9385,
"step": 868
},
{
"epoch": 0.0028879954602971246,
"grad_norm": 2.6985504627227783,
"learning_rate": 5.53513831006935e-05,
"loss": 0.5904,
"step": 869
},
{
"epoch": 0.0028913188152571902,
"grad_norm": 1.594799518585205,
"learning_rate": 5.5139672146093376e-05,
"loss": 1.0022,
"step": 870
},
{
"epoch": 0.002894642170217256,
"grad_norm": 2.485776901245117,
"learning_rate": 5.492821262325595e-05,
"loss": 0.8321,
"step": 871
},
{
"epoch": 0.002897965525177322,
"grad_norm": 2.4602620601654053,
"learning_rate": 5.471700571736287e-05,
"loss": 0.8259,
"step": 872
},
{
"epoch": 0.0029012888801373876,
"grad_norm": 1.3913767337799072,
"learning_rate": 5.450605261218009e-05,
"loss": 0.9241,
"step": 873
},
{
"epoch": 0.002904612235097453,
"grad_norm": 2.930417537689209,
"learning_rate": 5.429535449005096e-05,
"loss": 0.68,
"step": 874
},
{
"epoch": 0.002907935590057519,
"grad_norm": 1.7724130153656006,
"learning_rate": 5.408491253188965e-05,
"loss": 0.719,
"step": 875
},
{
"epoch": 0.0029112589450175845,
"grad_norm": 1.5435396432876587,
"learning_rate": 5.3874727917174805e-05,
"loss": 0.8378,
"step": 876
},
{
"epoch": 0.0029145822999776506,
"grad_norm": 2.9708385467529297,
"learning_rate": 5.366480182394242e-05,
"loss": 0.583,
"step": 877
},
{
"epoch": 0.002917905654937716,
"grad_norm": 2.3581833839416504,
"learning_rate": 5.3455135428779826e-05,
"loss": 0.8518,
"step": 878
},
{
"epoch": 0.002921229009897782,
"grad_norm": 1.5053647756576538,
"learning_rate": 5.324572990681862e-05,
"loss": 0.5481,
"step": 879
},
{
"epoch": 0.0029245523648578475,
"grad_norm": 2.0453686714172363,
"learning_rate": 5.303658643172828e-05,
"loss": 1.4061,
"step": 880
},
{
"epoch": 0.0029278757198179135,
"grad_norm": 2.7578492164611816,
"learning_rate": 5.282770617570973e-05,
"loss": 0.9779,
"step": 881
},
{
"epoch": 0.002931199074777979,
"grad_norm": 2.1508371829986572,
"learning_rate": 5.2619090309488416e-05,
"loss": 1.0649,
"step": 882
},
{
"epoch": 0.002934522429738045,
"grad_norm": 2.0045487880706787,
"learning_rate": 5.2410740002308035e-05,
"loss": 0.8897,
"step": 883
},
{
"epoch": 0.0029378457846981104,
"grad_norm": 3.2993271350860596,
"learning_rate": 5.2202656421923876e-05,
"loss": 1.004,
"step": 884
},
{
"epoch": 0.0029411691396581765,
"grad_norm": 2.9482834339141846,
"learning_rate": 5.1994840734596264e-05,
"loss": 0.9076,
"step": 885
},
{
"epoch": 0.002944492494618242,
"grad_norm": 1.955139398574829,
"learning_rate": 5.1787294105084095e-05,
"loss": 0.9145,
"step": 886
},
{
"epoch": 0.0029478158495783078,
"grad_norm": 2.491614580154419,
"learning_rate": 5.1580017696638226e-05,
"loss": 0.662,
"step": 887
},
{
"epoch": 0.0029511392045383734,
"grad_norm": 2.719644069671631,
"learning_rate": 5.137301267099498e-05,
"loss": 1.1147,
"step": 888
},
{
"epoch": 0.0029544625594984395,
"grad_norm": 2.648268699645996,
"learning_rate": 5.1166280188369655e-05,
"loss": 0.6992,
"step": 889
},
{
"epoch": 0.002957785914458505,
"grad_norm": 1.7123095989227295,
"learning_rate": 5.095982140744995e-05,
"loss": 0.8088,
"step": 890
},
{
"epoch": 0.0029611092694185707,
"grad_norm": 1.9856046438217163,
"learning_rate": 5.0753637485389685e-05,
"loss": 0.7292,
"step": 891
},
{
"epoch": 0.0029644326243786364,
"grad_norm": 1.634596586227417,
"learning_rate": 5.0547729577802004e-05,
"loss": 1.0181,
"step": 892
},
{
"epoch": 0.002967755979338702,
"grad_norm": 2.8171486854553223,
"learning_rate": 5.034209883875307e-05,
"loss": 1.1075,
"step": 893
},
{
"epoch": 0.002971079334298768,
"grad_norm": 1.6797412633895874,
"learning_rate": 5.013674642075573e-05,
"loss": 0.9165,
"step": 894
},
{
"epoch": 0.0029744026892588337,
"grad_norm": 1.9247677326202393,
"learning_rate": 4.9931673474762666e-05,
"loss": 1.1835,
"step": 895
},
{
"epoch": 0.0029777260442188994,
"grad_norm": 2.5648396015167236,
"learning_rate": 4.972688115016039e-05,
"loss": 0.8235,
"step": 896
},
{
"epoch": 0.002981049399178965,
"grad_norm": 3.0656604766845703,
"learning_rate": 4.952237059476251e-05,
"loss": 1.1572,
"step": 897
},
{
"epoch": 0.002984372754139031,
"grad_norm": 1.6321872472763062,
"learning_rate": 4.931814295480335e-05,
"loss": 1.1099,
"step": 898
},
{
"epoch": 0.0029876961090990967,
"grad_norm": 1.9796544313430786,
"learning_rate": 4.9114199374931655e-05,
"loss": 0.9264,
"step": 899
},
{
"epoch": 0.0029910194640591623,
"grad_norm": 2.6657955646514893,
"learning_rate": 4.891054099820406e-05,
"loss": 0.8,
"step": 900
},
{
"epoch": 0.002994342819019228,
"grad_norm": 2.9986019134521484,
"learning_rate": 4.870716896607851e-05,
"loss": 1.0711,
"step": 901
},
{
"epoch": 0.002997666173979294,
"grad_norm": 2.200094223022461,
"learning_rate": 4.8504084418408355e-05,
"loss": 0.8344,
"step": 902
},
{
"epoch": 0.0030009895289393597,
"grad_norm": 3.1483426094055176,
"learning_rate": 4.830128849343542e-05,
"loss": 0.9605,
"step": 903
},
{
"epoch": 0.0030043128838994253,
"grad_norm": 2.294978141784668,
"learning_rate": 4.809878232778406e-05,
"loss": 0.6189,
"step": 904
},
{
"epoch": 0.003007636238859491,
"grad_norm": 1.5292420387268066,
"learning_rate": 4.789656705645447e-05,
"loss": 0.872,
"step": 905
},
{
"epoch": 0.0030109595938195566,
"grad_norm": 2.268393039703369,
"learning_rate": 4.769464381281643e-05,
"loss": 0.9634,
"step": 906
},
{
"epoch": 0.0030142829487796226,
"grad_norm": 2.076624631881714,
"learning_rate": 4.7493013728603074e-05,
"loss": 1.0022,
"step": 907
},
{
"epoch": 0.0030176063037396883,
"grad_norm": 2.2546167373657227,
"learning_rate": 4.729167793390435e-05,
"loss": 0.9459,
"step": 908
},
{
"epoch": 0.003020929658699754,
"grad_norm": 1.8075573444366455,
"learning_rate": 4.709063755716083e-05,
"loss": 1.3215,
"step": 909
},
{
"epoch": 0.0030242530136598196,
"grad_norm": 1.9737260341644287,
"learning_rate": 4.6889893725157375e-05,
"loss": 0.8568,
"step": 910
},
{
"epoch": 0.0030275763686198856,
"grad_norm": 3.899272918701172,
"learning_rate": 4.668944756301666e-05,
"loss": 0.973,
"step": 911
},
{
"epoch": 0.0030308997235799513,
"grad_norm": 1.8497694730758667,
"learning_rate": 4.648930019419321e-05,
"loss": 1.0238,
"step": 912
},
{
"epoch": 0.003034223078540017,
"grad_norm": 1.7761645317077637,
"learning_rate": 4.628945274046659e-05,
"loss": 1.1853,
"step": 913
},
{
"epoch": 0.0030375464335000825,
"grad_norm": 2.692018985748291,
"learning_rate": 4.608990632193557e-05,
"loss": 0.8397,
"step": 914
},
{
"epoch": 0.0030408697884601486,
"grad_norm": 2.3758668899536133,
"learning_rate": 4.589066205701177e-05,
"loss": 1.1698,
"step": 915
},
{
"epoch": 0.0030441931434202142,
"grad_norm": 1.9619114398956299,
"learning_rate": 4.569172106241312e-05,
"loss": 0.8451,
"step": 916
},
{
"epoch": 0.00304751649838028,
"grad_norm": 1.6542222499847412,
"learning_rate": 4.5493084453157954e-05,
"loss": 0.6891,
"step": 917
},
{
"epoch": 0.0030508398533403455,
"grad_norm": 2.2412753105163574,
"learning_rate": 4.529475334255855e-05,
"loss": 0.7928,
"step": 918
},
{
"epoch": 0.003054163208300411,
"grad_norm": 1.8066473007202148,
"learning_rate": 4.5096728842214795e-05,
"loss": 0.9513,
"step": 919
},
{
"epoch": 0.003057486563260477,
"grad_norm": 1.6183894872665405,
"learning_rate": 4.489901206200832e-05,
"loss": 0.8679,
"step": 920
},
{
"epoch": 0.003060809918220543,
"grad_norm": 1.7738823890686035,
"learning_rate": 4.4701604110095916e-05,
"loss": 1.0069,
"step": 921
},
{
"epoch": 0.0030641332731806085,
"grad_norm": 1.9727370738983154,
"learning_rate": 4.450450609290347e-05,
"loss": 0.5319,
"step": 922
},
{
"epoch": 0.003067456628140674,
"grad_norm": 1.3664517402648926,
"learning_rate": 4.430771911511986e-05,
"loss": 0.2944,
"step": 923
},
{
"epoch": 0.00307077998310074,
"grad_norm": 2.3164656162261963,
"learning_rate": 4.4111244279690536e-05,
"loss": 1.3496,
"step": 924
},
{
"epoch": 0.003074103338060806,
"grad_norm": 1.6333527565002441,
"learning_rate": 4.3915082687811515e-05,
"loss": 0.7681,
"step": 925
},
{
"epoch": 0.0030774266930208715,
"grad_norm": 2.8815577030181885,
"learning_rate": 4.371923543892316e-05,
"loss": 1.0568,
"step": 926
},
{
"epoch": 0.003080750047980937,
"grad_norm": 1.6626960039138794,
"learning_rate": 4.352370363070396e-05,
"loss": 0.7345,
"step": 927
},
{
"epoch": 0.003084073402941003,
"grad_norm": 2.0292208194732666,
"learning_rate": 4.332848835906457e-05,
"loss": 0.5221,
"step": 928
},
{
"epoch": 0.003087396757901069,
"grad_norm": 2.0600383281707764,
"learning_rate": 4.313359071814137e-05,
"loss": 0.7379,
"step": 929
},
{
"epoch": 0.0030907201128611344,
"grad_norm": 2.089693069458008,
"learning_rate": 4.293901180029059e-05,
"loss": 0.7173,
"step": 930
},
{
"epoch": 0.0030940434678212,
"grad_norm": 2.7445740699768066,
"learning_rate": 4.274475269608208e-05,
"loss": 1.1091,
"step": 931
},
{
"epoch": 0.003097366822781266,
"grad_norm": 2.0086019039154053,
"learning_rate": 4.2550814494293114e-05,
"loss": 1.3061,
"step": 932
},
{
"epoch": 0.0031006901777413318,
"grad_norm": 3.048259735107422,
"learning_rate": 4.2357198281902556e-05,
"loss": 0.9055,
"step": 933
},
{
"epoch": 0.0031040135327013974,
"grad_norm": 2.275136947631836,
"learning_rate": 4.2163905144084456e-05,
"loss": 0.7963,
"step": 934
},
{
"epoch": 0.003107336887661463,
"grad_norm": 1.90250563621521,
"learning_rate": 4.197093616420212e-05,
"loss": 0.906,
"step": 935
},
{
"epoch": 0.0031106602426215287,
"grad_norm": 1.2324166297912598,
"learning_rate": 4.1778292423802165e-05,
"loss": 1.0246,
"step": 936
},
{
"epoch": 0.0031139835975815947,
"grad_norm": 1.7340452671051025,
"learning_rate": 4.158597500260804e-05,
"loss": 1.2704,
"step": 937
},
{
"epoch": 0.0031173069525416604,
"grad_norm": 1.7009319067001343,
"learning_rate": 4.139398497851453e-05,
"loss": 0.7377,
"step": 938
},
{
"epoch": 0.003120630307501726,
"grad_norm": 1.9392496347427368,
"learning_rate": 4.120232342758128e-05,
"loss": 0.8505,
"step": 939
},
{
"epoch": 0.0031239536624617916,
"grad_norm": 1.9137873649597168,
"learning_rate": 4.1010991424026936e-05,
"loss": 0.854,
"step": 940
},
{
"epoch": 0.0031272770174218577,
"grad_norm": 2.3392927646636963,
"learning_rate": 4.081999004022317e-05,
"loss": 0.7184,
"step": 941
},
{
"epoch": 0.0031306003723819233,
"grad_norm": 2.563201904296875,
"learning_rate": 4.062932034668856e-05,
"loss": 1.0486,
"step": 942
},
{
"epoch": 0.003133923727341989,
"grad_norm": 3.3593509197235107,
"learning_rate": 4.0438983412082624e-05,
"loss": 0.8378,
"step": 943
},
{
"epoch": 0.0031372470823020546,
"grad_norm": 2.170193672180176,
"learning_rate": 4.0248980303199854e-05,
"loss": 0.7698,
"step": 944
},
{
"epoch": 0.0031405704372621207,
"grad_norm": 1.4478988647460938,
"learning_rate": 4.005931208496373e-05,
"loss": 0.8382,
"step": 945
},
{
"epoch": 0.0031438937922221863,
"grad_norm": 1.5125523805618286,
"learning_rate": 3.986997982042079e-05,
"loss": 0.3141,
"step": 946
},
{
"epoch": 0.003147217147182252,
"grad_norm": 2.468640089035034,
"learning_rate": 3.968098457073456e-05,
"loss": 0.7174,
"step": 947
},
{
"epoch": 0.0031505405021423176,
"grad_norm": 1.7223023176193237,
"learning_rate": 3.9492327395179707e-05,
"loss": 0.9229,
"step": 948
},
{
"epoch": 0.0031538638571023832,
"grad_norm": 1.9831416606903076,
"learning_rate": 3.930400935113607e-05,
"loss": 0.6094,
"step": 949
},
{
"epoch": 0.0031571872120624493,
"grad_norm": 3.7181596755981445,
"learning_rate": 3.9116031494082715e-05,
"loss": 0.4995,
"step": 950
},
{
"epoch": 0.003160510567022515,
"grad_norm": 2.800658941268921,
"learning_rate": 3.89283948775921e-05,
"loss": 1.1526,
"step": 951
},
{
"epoch": 0.0031638339219825806,
"grad_norm": 2.679004430770874,
"learning_rate": 3.8741100553324036e-05,
"loss": 1.1475,
"step": 952
},
{
"epoch": 0.003167157276942646,
"grad_norm": 1.9589389562606812,
"learning_rate": 3.855414957101987e-05,
"loss": 0.8868,
"step": 953
},
{
"epoch": 0.0031704806319027123,
"grad_norm": 3.259084701538086,
"learning_rate": 3.8367542978496726e-05,
"loss": 0.7888,
"step": 954
},
{
"epoch": 0.003173803986862778,
"grad_norm": 1.8688689470291138,
"learning_rate": 3.8181281821641245e-05,
"loss": 0.868,
"step": 955
},
{
"epoch": 0.0031771273418228435,
"grad_norm": 1.5200146436691284,
"learning_rate": 3.799536714440426e-05,
"loss": 0.8957,
"step": 956
},
{
"epoch": 0.003180450696782909,
"grad_norm": 2.1697998046875,
"learning_rate": 3.780979998879448e-05,
"loss": 0.719,
"step": 957
},
{
"epoch": 0.0031837740517429752,
"grad_norm": 2.2164082527160645,
"learning_rate": 3.762458139487287e-05,
"loss": 0.9671,
"step": 958
},
{
"epoch": 0.003187097406703041,
"grad_norm": 1.7509944438934326,
"learning_rate": 3.743971240074689e-05,
"loss": 0.4619,
"step": 959
},
{
"epoch": 0.0031904207616631065,
"grad_norm": 2.366628646850586,
"learning_rate": 3.725519404256447e-05,
"loss": 1.1947,
"step": 960
},
{
"epoch": 0.003193744116623172,
"grad_norm": 1.7423028945922852,
"learning_rate": 3.707102735450831e-05,
"loss": 0.4799,
"step": 961
},
{
"epoch": 0.003197067471583238,
"grad_norm": 3.3096704483032227,
"learning_rate": 3.688721336879012e-05,
"loss": 0.9332,
"step": 962
},
{
"epoch": 0.003200390826543304,
"grad_norm": 2.0674052238464355,
"learning_rate": 3.6703753115644735e-05,
"loss": 0.7861,
"step": 963
},
{
"epoch": 0.0032037141815033695,
"grad_norm": 2.0808584690093994,
"learning_rate": 3.6520647623324525e-05,
"loss": 0.5346,
"step": 964
},
{
"epoch": 0.003207037536463435,
"grad_norm": 2.2569382190704346,
"learning_rate": 3.633789791809339e-05,
"loss": 0.7376,
"step": 965
},
{
"epoch": 0.0032103608914235008,
"grad_norm": 2.037261724472046,
"learning_rate": 3.615550502422115e-05,
"loss": 0.9662,
"step": 966
},
{
"epoch": 0.003213684246383567,
"grad_norm": 1.5911048650741577,
"learning_rate": 3.5973469963977805e-05,
"loss": 0.5487,
"step": 967
},
{
"epoch": 0.0032170076013436325,
"grad_norm": 1.9614652395248413,
"learning_rate": 3.579179375762773e-05,
"loss": 1.0662,
"step": 968
},
{
"epoch": 0.003220330956303698,
"grad_norm": 2.467716932296753,
"learning_rate": 3.5610477423424124e-05,
"loss": 0.6805,
"step": 969
},
{
"epoch": 0.0032236543112637637,
"grad_norm": 2.4678971767425537,
"learning_rate": 3.542952197760305e-05,
"loss": 0.6498,
"step": 970
},
{
"epoch": 0.00322697766622383,
"grad_norm": 1.838238000869751,
"learning_rate": 3.524892843437793e-05,
"loss": 0.8032,
"step": 971
},
{
"epoch": 0.0032303010211838954,
"grad_norm": 4.720175743103027,
"learning_rate": 3.506869780593387e-05,
"loss": 0.6131,
"step": 972
},
{
"epoch": 0.003233624376143961,
"grad_norm": 2.0438411235809326,
"learning_rate": 3.488883110242175e-05,
"loss": 1.0154,
"step": 973
},
{
"epoch": 0.0032369477311040267,
"grad_norm": 1.9405808448791504,
"learning_rate": 3.4709329331952946e-05,
"loss": 0.8456,
"step": 974
},
{
"epoch": 0.0032402710860640923,
"grad_norm": 1.3455164432525635,
"learning_rate": 3.453019350059333e-05,
"loss": 0.6341,
"step": 975
},
{
"epoch": 0.0032435944410241584,
"grad_norm": 2.457744836807251,
"learning_rate": 3.435142461235778e-05,
"loss": 0.6841,
"step": 976
},
{
"epoch": 0.003246917795984224,
"grad_norm": 2.1893134117126465,
"learning_rate": 3.417302366920465e-05,
"loss": 1.0017,
"step": 977
},
{
"epoch": 0.0032502411509442897,
"grad_norm": 2.831120491027832,
"learning_rate": 3.399499167102985e-05,
"loss": 0.842,
"step": 978
},
{
"epoch": 0.0032535645059043553,
"grad_norm": 1.5025831460952759,
"learning_rate": 3.381732961566166e-05,
"loss": 0.67,
"step": 979
},
{
"epoch": 0.0032568878608644214,
"grad_norm": 1.6793795824050903,
"learning_rate": 3.364003849885476e-05,
"loss": 0.8812,
"step": 980
},
{
"epoch": 0.003260211215824487,
"grad_norm": 1.9502850770950317,
"learning_rate": 3.346311931428485e-05,
"loss": 0.8764,
"step": 981
},
{
"epoch": 0.0032635345707845527,
"grad_norm": 1.9033828973770142,
"learning_rate": 3.32865730535431e-05,
"loss": 0.8055,
"step": 982
},
{
"epoch": 0.0032668579257446183,
"grad_norm": 2.4927141666412354,
"learning_rate": 3.3110400706130427e-05,
"loss": 0.5764,
"step": 983
},
{
"epoch": 0.0032701812807046844,
"grad_norm": 1.6894636154174805,
"learning_rate": 3.2934603259452104e-05,
"loss": 0.9428,
"step": 984
},
{
"epoch": 0.00327350463566475,
"grad_norm": 2.994685649871826,
"learning_rate": 3.275918169881216e-05,
"loss": 0.6822,
"step": 985
},
{
"epoch": 0.0032768279906248156,
"grad_norm": 2.1028683185577393,
"learning_rate": 3.258413700740783e-05,
"loss": 0.5847,
"step": 986
},
{
"epoch": 0.0032801513455848813,
"grad_norm": 2.150242328643799,
"learning_rate": 3.2409470166324216e-05,
"loss": 0.8227,
"step": 987
},
{
"epoch": 0.0032834747005449473,
"grad_norm": 1.834642767906189,
"learning_rate": 3.223518215452852e-05,
"loss": 0.5538,
"step": 988
},
{
"epoch": 0.003286798055505013,
"grad_norm": 2.7726798057556152,
"learning_rate": 3.2061273948864736e-05,
"loss": 1.0142,
"step": 989
},
{
"epoch": 0.0032901214104650786,
"grad_norm": 1.5941482782363892,
"learning_rate": 3.188774652404813e-05,
"loss": 1.2037,
"step": 990
},
{
"epoch": 0.0032934447654251442,
"grad_norm": 1.4523831605911255,
"learning_rate": 3.171460085265978e-05,
"loss": 0.8859,
"step": 991
},
{
"epoch": 0.00329676812038521,
"grad_norm": 2.7575342655181885,
"learning_rate": 3.154183790514117e-05,
"loss": 0.9737,
"step": 992
},
{
"epoch": 0.003300091475345276,
"grad_norm": 1.9906915426254272,
"learning_rate": 3.1369458649788644e-05,
"loss": 0.6769,
"step": 993
},
{
"epoch": 0.0033034148303053416,
"grad_norm": 2.669388771057129,
"learning_rate": 3.1197464052748024e-05,
"loss": 1.038,
"step": 994
},
{
"epoch": 0.0033067381852654072,
"grad_norm": 2.2965517044067383,
"learning_rate": 3.102585507800936e-05,
"loss": 0.5346,
"step": 995
},
{
"epoch": 0.003310061540225473,
"grad_norm": 1.4986422061920166,
"learning_rate": 3.0854632687401154e-05,
"loss": 1.1781,
"step": 996
},
{
"epoch": 0.003313384895185539,
"grad_norm": 2.397091865539551,
"learning_rate": 3.0683797840585317e-05,
"loss": 0.7228,
"step": 997
},
{
"epoch": 0.0033167082501456046,
"grad_norm": 1.880570888519287,
"learning_rate": 3.051335149505171e-05,
"loss": 0.965,
"step": 998
},
{
"epoch": 0.00332003160510567,
"grad_norm": 1.5805224180221558,
"learning_rate": 3.0343294606112628e-05,
"loss": 0.5373,
"step": 999
},
{
"epoch": 0.003323354960065736,
"grad_norm": 2.085787534713745,
"learning_rate": 3.0173628126897657e-05,
"loss": 1.0924,
"step": 1000
},
{
"epoch": 0.003326678315025802,
"grad_norm": 2.0339345932006836,
"learning_rate": 3.0004353008348186e-05,
"loss": 0.9347,
"step": 1001
},
{
"epoch": 0.0033300016699858675,
"grad_norm": 2.8665969371795654,
"learning_rate": 2.983547019921199e-05,
"loss": 0.6028,
"step": 1002
},
{
"epoch": 0.003333325024945933,
"grad_norm": 1.641309380531311,
"learning_rate": 2.9666980646038278e-05,
"loss": 0.9894,
"step": 1003
},
{
"epoch": 0.003336648379905999,
"grad_norm": 2.332895278930664,
"learning_rate": 2.9498885293171942e-05,
"loss": 1.0866,
"step": 1004
},
{
"epoch": 0.0033399717348660644,
"grad_norm": 2.141460418701172,
"learning_rate": 2.9331185082748634e-05,
"loss": 0.8964,
"step": 1005
},
{
"epoch": 0.0033399717348660644,
"eval_loss": 0.7770960330963135,
"eval_runtime": 3252.7632,
"eval_samples_per_second": 38.95,
"eval_steps_per_second": 19.475,
"step": 1005
},
{
"epoch": 0.0033432950898261305,
"grad_norm": 1.9887334108352661,
"learning_rate": 2.9163880954689228e-05,
"loss": 0.8261,
"step": 1006
},
{
"epoch": 0.003346618444786196,
"grad_norm": 2.993713140487671,
"learning_rate": 2.8996973846694642e-05,
"loss": 0.6206,
"step": 1007
},
{
"epoch": 0.0033499417997462618,
"grad_norm": 1.2360352277755737,
"learning_rate": 2.8830464694240634e-05,
"loss": 1.2049,
"step": 1008
},
{
"epoch": 0.0033532651547063274,
"grad_norm": 2.6683409214019775,
"learning_rate": 2.8664354430572492e-05,
"loss": 0.8874,
"step": 1009
},
{
"epoch": 0.0033565885096663935,
"grad_norm": 1.849593162536621,
"learning_rate": 2.8498643986699803e-05,
"loss": 1.0419,
"step": 1010
},
{
"epoch": 0.003359911864626459,
"grad_norm": 1.955989956855774,
"learning_rate": 2.8333334291391323e-05,
"loss": 1.3201,
"step": 1011
},
{
"epoch": 0.0033632352195865248,
"grad_norm": 1.4712536334991455,
"learning_rate": 2.8168426271169623e-05,
"loss": 0.8732,
"step": 1012
},
{
"epoch": 0.0033665585745465904,
"grad_norm": 1.8448526859283447,
"learning_rate": 2.8003920850306085e-05,
"loss": 1.313,
"step": 1013
},
{
"epoch": 0.0033698819295066565,
"grad_norm": 2.3865597248077393,
"learning_rate": 2.783981895081549e-05,
"loss": 1.0349,
"step": 1014
},
{
"epoch": 0.003373205284466722,
"grad_norm": 3.1011106967926025,
"learning_rate": 2.767612149245099e-05,
"loss": 1.1726,
"step": 1015
},
{
"epoch": 0.0033765286394267877,
"grad_norm": 1.5871694087982178,
"learning_rate": 2.751282939269908e-05,
"loss": 0.7144,
"step": 1016
},
{
"epoch": 0.0033798519943868534,
"grad_norm": 2.121737480163574,
"learning_rate": 2.734994356677416e-05,
"loss": 1.1891,
"step": 1017
},
{
"epoch": 0.003383175349346919,
"grad_norm": 1.8265715837478638,
"learning_rate": 2.7187464927613606e-05,
"loss": 0.6414,
"step": 1018
},
{
"epoch": 0.003386498704306985,
"grad_norm": 2.1911258697509766,
"learning_rate": 2.702539438587267e-05,
"loss": 0.9227,
"step": 1019
},
{
"epoch": 0.0033898220592670507,
"grad_norm": 2.096592903137207,
"learning_rate": 2.6863732849919164e-05,
"loss": 0.268,
"step": 1020
},
{
"epoch": 0.0033931454142271163,
"grad_norm": 1.7348281145095825,
"learning_rate": 2.670248122582869e-05,
"loss": 0.9092,
"step": 1021
},
{
"epoch": 0.003396468769187182,
"grad_norm": 2.108294725418091,
"learning_rate": 2.6541640417379276e-05,
"loss": 0.98,
"step": 1022
},
{
"epoch": 0.003399792124147248,
"grad_norm": 2.166422128677368,
"learning_rate": 2.6381211326046428e-05,
"loss": 1.0629,
"step": 1023
},
{
"epoch": 0.0034031154791073137,
"grad_norm": 1.3211616277694702,
"learning_rate": 2.6221194850998155e-05,
"loss": 1.1247,
"step": 1024
},
{
"epoch": 0.0034064388340673793,
"grad_norm": 1.4853568077087402,
"learning_rate": 2.6061591889089776e-05,
"loss": 0.3974,
"step": 1025
},
{
"epoch": 0.003409762189027445,
"grad_norm": 1.7520192861557007,
"learning_rate": 2.590240333485897e-05,
"loss": 0.8057,
"step": 1026
},
{
"epoch": 0.003413085543987511,
"grad_norm": 2.130795955657959,
"learning_rate": 2.5743630080520765e-05,
"loss": 0.8887,
"step": 1027
},
{
"epoch": 0.0034164088989475767,
"grad_norm": 2.033766269683838,
"learning_rate": 2.558527301596251e-05,
"loss": 0.6583,
"step": 1028
},
{
"epoch": 0.0034197322539076423,
"grad_norm": 1.8105857372283936,
"learning_rate": 2.5427333028738988e-05,
"loss": 0.4101,
"step": 1029
},
{
"epoch": 0.003423055608867708,
"grad_norm": 2.780566930770874,
"learning_rate": 2.5269811004067256e-05,
"loss": 0.8233,
"step": 1030
},
{
"epoch": 0.003426378963827774,
"grad_norm": 1.9028615951538086,
"learning_rate": 2.5112707824821846e-05,
"loss": 0.8525,
"step": 1031
},
{
"epoch": 0.0034297023187878396,
"grad_norm": 3.4555652141571045,
"learning_rate": 2.495602437152975e-05,
"loss": 0.8126,
"step": 1032
},
{
"epoch": 0.0034330256737479053,
"grad_norm": 1.3704328536987305,
"learning_rate": 2.4799761522365438e-05,
"loss": 0.7354,
"step": 1033
},
{
"epoch": 0.003436349028707971,
"grad_norm": 2.7514312267303467,
"learning_rate": 2.4643920153146116e-05,
"loss": 1.19,
"step": 1034
},
{
"epoch": 0.0034396723836680365,
"grad_norm": 2.007598638534546,
"learning_rate": 2.448850113732658e-05,
"loss": 1.0406,
"step": 1035
},
{
"epoch": 0.0034429957386281026,
"grad_norm": 1.9599459171295166,
"learning_rate": 2.4333505345994433e-05,
"loss": 1.0742,
"step": 1036
},
{
"epoch": 0.0034463190935881682,
"grad_norm": 1.525658130645752,
"learning_rate": 2.4178933647865344e-05,
"loss": 1.0423,
"step": 1037
},
{
"epoch": 0.003449642448548234,
"grad_norm": 2.253969430923462,
"learning_rate": 2.4024786909277806e-05,
"loss": 0.7478,
"step": 1038
},
{
"epoch": 0.0034529658035082995,
"grad_norm": 2.9593136310577393,
"learning_rate": 2.3871065994188723e-05,
"loss": 0.9221,
"step": 1039
},
{
"epoch": 0.0034562891584683656,
"grad_norm": 1.6113935708999634,
"learning_rate": 2.3717771764168262e-05,
"loss": 1.0702,
"step": 1040
},
{
"epoch": 0.003459612513428431,
"grad_norm": 1.477838397026062,
"learning_rate": 2.3564905078395073e-05,
"loss": 0.8423,
"step": 1041
},
{
"epoch": 0.003462935868388497,
"grad_norm": 1.8534512519836426,
"learning_rate": 2.3412466793651654e-05,
"loss": 0.6136,
"step": 1042
},
{
"epoch": 0.0034662592233485625,
"grad_norm": 1.2977598905563354,
"learning_rate": 2.32604577643193e-05,
"loss": 1.0533,
"step": 1043
},
{
"epoch": 0.0034695825783086286,
"grad_norm": 1.402443289756775,
"learning_rate": 2.310887884237346e-05,
"loss": 0.6865,
"step": 1044
},
{
"epoch": 0.003472905933268694,
"grad_norm": 1.5745853185653687,
"learning_rate": 2.2957730877378947e-05,
"loss": 0.8526,
"step": 1045
},
{
"epoch": 0.00347622928822876,
"grad_norm": 2.4579102993011475,
"learning_rate": 2.280701471648512e-05,
"loss": 0.5886,
"step": 1046
},
{
"epoch": 0.0034795526431888255,
"grad_norm": 2.511986017227173,
"learning_rate": 2.2656731204421255e-05,
"loss": 1.0093,
"step": 1047
},
{
"epoch": 0.003482875998148891,
"grad_norm": 1.5310604572296143,
"learning_rate": 2.2506881183491647e-05,
"loss": 0.5451,
"step": 1048
},
{
"epoch": 0.003486199353108957,
"grad_norm": 2.594639539718628,
"learning_rate": 2.2357465493571016e-05,
"loss": 1.3524,
"step": 1049
},
{
"epoch": 0.003489522708069023,
"grad_norm": 2.8036839962005615,
"learning_rate": 2.2208484972099743e-05,
"loss": 0.8427,
"step": 1050
},
{
"epoch": 0.0034928460630290884,
"grad_norm": 1.729481816291809,
"learning_rate": 2.2059940454079175e-05,
"loss": 0.5532,
"step": 1051
},
{
"epoch": 0.003496169417989154,
"grad_norm": 1.573091745376587,
"learning_rate": 2.191183277206703e-05,
"loss": 0.5843,
"step": 1052
},
{
"epoch": 0.00349949277294922,
"grad_norm": 2.2913410663604736,
"learning_rate": 2.176416275617259e-05,
"loss": 0.6285,
"step": 1053
},
{
"epoch": 0.0035028161279092858,
"grad_norm": 2.0742692947387695,
"learning_rate": 2.1616931234052108e-05,
"loss": 1.4509,
"step": 1054
},
{
"epoch": 0.0035061394828693514,
"grad_norm": 2.3040852546691895,
"learning_rate": 2.1470139030904312e-05,
"loss": 0.8464,
"step": 1055
},
{
"epoch": 0.003509462837829417,
"grad_norm": 2.0199368000030518,
"learning_rate": 2.132378696946542e-05,
"loss": 1.0534,
"step": 1056
},
{
"epoch": 0.003512786192789483,
"grad_norm": 2.384861469268799,
"learning_rate": 2.1177875870004993e-05,
"loss": 1.1861,
"step": 1057
},
{
"epoch": 0.0035161095477495487,
"grad_norm": 1.7507251501083374,
"learning_rate": 2.103240655032095e-05,
"loss": 0.6772,
"step": 1058
},
{
"epoch": 0.0035194329027096144,
"grad_norm": 1.9110218286514282,
"learning_rate": 2.0887379825735176e-05,
"loss": 0.4735,
"step": 1059
},
{
"epoch": 0.00352275625766968,
"grad_norm": 2.0105197429656982,
"learning_rate": 2.074279650908897e-05,
"loss": 0.6666,
"step": 1060
},
{
"epoch": 0.0035260796126297457,
"grad_norm": 2.125974416732788,
"learning_rate": 2.0598657410738343e-05,
"loss": 1.151,
"step": 1061
},
{
"epoch": 0.0035294029675898117,
"grad_norm": 1.5914584398269653,
"learning_rate": 2.0454963338549625e-05,
"loss": 0.7659,
"step": 1062
},
{
"epoch": 0.0035327263225498774,
"grad_norm": 2.65944504737854,
"learning_rate": 2.0311715097894855e-05,
"loss": 0.6658,
"step": 1063
},
{
"epoch": 0.003536049677509943,
"grad_norm": 1.8550827503204346,
"learning_rate": 2.0168913491647255e-05,
"loss": 1.1008,
"step": 1064
},
{
"epoch": 0.0035393730324700086,
"grad_norm": 1.9186201095581055,
"learning_rate": 2.0026559320176875e-05,
"loss": 1.0043,
"step": 1065
},
{
"epoch": 0.0035426963874300747,
"grad_norm": 2.0259082317352295,
"learning_rate": 1.9884653381345875e-05,
"loss": 1.2005,
"step": 1066
},
{
"epoch": 0.0035460197423901403,
"grad_norm": 2.626133680343628,
"learning_rate": 1.9743196470504234e-05,
"loss": 0.9398,
"step": 1067
},
{
"epoch": 0.003549343097350206,
"grad_norm": 2.803363561630249,
"learning_rate": 1.9602189380485214e-05,
"loss": 0.7764,
"step": 1068
},
{
"epoch": 0.0035526664523102716,
"grad_norm": 2.6616218090057373,
"learning_rate": 1.9461632901600935e-05,
"loss": 0.867,
"step": 1069
},
{
"epoch": 0.0035559898072703377,
"grad_norm": 1.805355191230774,
"learning_rate": 1.9321527821637987e-05,
"loss": 0.9621,
"step": 1070
},
{
"epoch": 0.0035593131622304033,
"grad_norm": 1.9301918745040894,
"learning_rate": 1.9181874925852926e-05,
"loss": 1.1156,
"step": 1071
},
{
"epoch": 0.003562636517190469,
"grad_norm": 1.2671960592269897,
"learning_rate": 1.904267499696791e-05,
"loss": 1.0542,
"step": 1072
},
{
"epoch": 0.0035659598721505346,
"grad_norm": 1.832224726676941,
"learning_rate": 1.8903928815166426e-05,
"loss": 0.9923,
"step": 1073
},
{
"epoch": 0.0035692832271106,
"grad_norm": 2.3667545318603516,
"learning_rate": 1.8765637158088623e-05,
"loss": 0.7825,
"step": 1074
},
{
"epoch": 0.0035726065820706663,
"grad_norm": 1.8500324487686157,
"learning_rate": 1.8627800800827332e-05,
"loss": 0.4897,
"step": 1075
},
{
"epoch": 0.003575929937030732,
"grad_norm": 2.8491454124450684,
"learning_rate": 1.8490420515923445e-05,
"loss": 1.0215,
"step": 1076
},
{
"epoch": 0.0035792532919907975,
"grad_norm": 2.6926255226135254,
"learning_rate": 1.8353497073361647e-05,
"loss": 0.8692,
"step": 1077
},
{
"epoch": 0.003582576646950863,
"grad_norm": 3.2963078022003174,
"learning_rate": 1.821703124056623e-05,
"loss": 1.2669,
"step": 1078
},
{
"epoch": 0.0035859000019109293,
"grad_norm": 1.8386890888214111,
"learning_rate": 1.808102378239659e-05,
"loss": 1.0086,
"step": 1079
},
{
"epoch": 0.003589223356870995,
"grad_norm": 2.384105920791626,
"learning_rate": 1.794547546114308e-05,
"loss": 1.6574,
"step": 1080
},
{
"epoch": 0.0035925467118310605,
"grad_norm": 2.209704875946045,
"learning_rate": 1.78103870365227e-05,
"loss": 1.2232,
"step": 1081
},
{
"epoch": 0.003595870066791126,
"grad_norm": 2.2206103801727295,
"learning_rate": 1.7675759265674797e-05,
"loss": 1.0329,
"step": 1082
},
{
"epoch": 0.0035991934217511922,
"grad_norm": 1.4707410335540771,
"learning_rate": 1.7541592903156933e-05,
"loss": 0.5543,
"step": 1083
},
{
"epoch": 0.003602516776711258,
"grad_norm": 1.7666308879852295,
"learning_rate": 1.7407888700940523e-05,
"loss": 0.6173,
"step": 1084
},
{
"epoch": 0.0036058401316713235,
"grad_norm": 3.515272378921509,
"learning_rate": 1.7274647408406698e-05,
"loss": 0.7079,
"step": 1085
},
{
"epoch": 0.003609163486631389,
"grad_norm": 1.8757985830307007,
"learning_rate": 1.7141869772342088e-05,
"loss": 0.6742,
"step": 1086
},
{
"epoch": 0.003612486841591455,
"grad_norm": 1.7394914627075195,
"learning_rate": 1.7009556536934602e-05,
"loss": 0.9227,
"step": 1087
},
{
"epoch": 0.003615810196551521,
"grad_norm": 1.7292208671569824,
"learning_rate": 1.6877708443769392e-05,
"loss": 0.6633,
"step": 1088
},
{
"epoch": 0.0036191335515115865,
"grad_norm": 2.7111852169036865,
"learning_rate": 1.6746326231824495e-05,
"loss": 1.4088,
"step": 1089
},
{
"epoch": 0.003622456906471652,
"grad_norm": 2.8773698806762695,
"learning_rate": 1.661541063746679e-05,
"loss": 0.9911,
"step": 1090
},
{
"epoch": 0.0036257802614317177,
"grad_norm": 2.6539015769958496,
"learning_rate": 1.648496239444799e-05,
"loss": 0.8291,
"step": 1091
},
{
"epoch": 0.003629103616391784,
"grad_norm": 1.9189116954803467,
"learning_rate": 1.635498223390022e-05,
"loss": 1.0237,
"step": 1092
},
{
"epoch": 0.0036324269713518494,
"grad_norm": 1.969369888305664,
"learning_rate": 1.6225470884332304e-05,
"loss": 1.0304,
"step": 1093
},
{
"epoch": 0.003635750326311915,
"grad_norm": 2.1338069438934326,
"learning_rate": 1.6096429071625374e-05,
"loss": 1.0386,
"step": 1094
},
{
"epoch": 0.0036390736812719807,
"grad_norm": 2.693995714187622,
"learning_rate": 1.5967857519028928e-05,
"loss": 0.6724,
"step": 1095
},
{
"epoch": 0.003642397036232047,
"grad_norm": 1.5616436004638672,
"learning_rate": 1.5839756947156846e-05,
"loss": 1.0765,
"step": 1096
},
{
"epoch": 0.0036457203911921124,
"grad_norm": 2.3733086585998535,
"learning_rate": 1.5712128073983146e-05,
"loss": 0.6867,
"step": 1097
},
{
"epoch": 0.003649043746152178,
"grad_norm": 1.7389389276504517,
"learning_rate": 1.5584971614838128e-05,
"loss": 0.9531,
"step": 1098
},
{
"epoch": 0.0036523671011122437,
"grad_norm": 2.9344327449798584,
"learning_rate": 1.5458288282404398e-05,
"loss": 0.6522,
"step": 1099
},
{
"epoch": 0.0036556904560723098,
"grad_norm": 2.1339237689971924,
"learning_rate": 1.533207878671269e-05,
"loss": 1.0749,
"step": 1100
},
{
"epoch": 0.0036590138110323754,
"grad_norm": 2.975815534591675,
"learning_rate": 1.5206343835138092e-05,
"loss": 0.4487,
"step": 1101
},
{
"epoch": 0.003662337165992441,
"grad_norm": 1.2044059038162231,
"learning_rate": 1.5081084132395907e-05,
"loss": 0.6741,
"step": 1102
},
{
"epoch": 0.0036656605209525067,
"grad_norm": 1.7217930555343628,
"learning_rate": 1.4956300380537747e-05,
"loss": 1.0937,
"step": 1103
},
{
"epoch": 0.0036689838759125723,
"grad_norm": 1.3124263286590576,
"learning_rate": 1.4831993278947742e-05,
"loss": 0.4858,
"step": 1104
},
{
"epoch": 0.0036723072308726384,
"grad_norm": 2.2098429203033447,
"learning_rate": 1.4708163524338436e-05,
"loss": 1.0121,
"step": 1105
},
{
"epoch": 0.003675630585832704,
"grad_norm": 1.8337208032608032,
"learning_rate": 1.4584811810746935e-05,
"loss": 1.2821,
"step": 1106
},
{
"epoch": 0.0036789539407927696,
"grad_norm": 1.871624231338501,
"learning_rate": 1.4461938829531107e-05,
"loss": 0.6309,
"step": 1107
},
{
"epoch": 0.0036822772957528353,
"grad_norm": 2.333550453186035,
"learning_rate": 1.4339545269365585e-05,
"loss": 0.9257,
"step": 1108
},
{
"epoch": 0.0036856006507129013,
"grad_norm": 1.6626560688018799,
"learning_rate": 1.4217631816237952e-05,
"loss": 0.5538,
"step": 1109
},
{
"epoch": 0.003688924005672967,
"grad_norm": 1.7753632068634033,
"learning_rate": 1.4096199153444934e-05,
"loss": 0.6881,
"step": 1110
},
{
"epoch": 0.0036922473606330326,
"grad_norm": 2.1906261444091797,
"learning_rate": 1.3975247961588478e-05,
"loss": 1.2149,
"step": 1111
},
{
"epoch": 0.0036955707155930983,
"grad_norm": 1.9990803003311157,
"learning_rate": 1.385477891857211e-05,
"loss": 0.9241,
"step": 1112
},
{
"epoch": 0.0036988940705531643,
"grad_norm": 1.9631175994873047,
"learning_rate": 1.3734792699596921e-05,
"loss": 0.9384,
"step": 1113
},
{
"epoch": 0.00370221742551323,
"grad_norm": 1.601910948753357,
"learning_rate": 1.361528997715792e-05,
"loss": 1.0611,
"step": 1114
},
{
"epoch": 0.0037055407804732956,
"grad_norm": 1.8392131328582764,
"learning_rate": 1.3496271421040219e-05,
"loss": 0.9894,
"step": 1115
},
{
"epoch": 0.0037088641354333612,
"grad_norm": 3.042781352996826,
"learning_rate": 1.3377737698315295e-05,
"loss": 0.9245,
"step": 1116
},
{
"epoch": 0.003712187490393427,
"grad_norm": 2.590090751647949,
"learning_rate": 1.3259689473337289e-05,
"loss": 1.0378,
"step": 1117
},
{
"epoch": 0.003715510845353493,
"grad_norm": 2.383646011352539,
"learning_rate": 1.3142127407739168e-05,
"loss": 0.5865,
"step": 1118
},
{
"epoch": 0.0037188342003135586,
"grad_norm": 1.675218939781189,
"learning_rate": 1.3025052160429118e-05,
"loss": 0.442,
"step": 1119
},
{
"epoch": 0.003722157555273624,
"grad_norm": 2.6803250312805176,
"learning_rate": 1.290846438758687e-05,
"loss": 1.1026,
"step": 1120
},
{
"epoch": 0.00372548091023369,
"grad_norm": 2.088174343109131,
"learning_rate": 1.2792364742659856e-05,
"loss": 1.0888,
"step": 1121
},
{
"epoch": 0.003728804265193756,
"grad_norm": 2.361299514770508,
"learning_rate": 1.2676753876359781e-05,
"loss": 0.9856,
"step": 1122
},
{
"epoch": 0.0037321276201538215,
"grad_norm": 1.9991151094436646,
"learning_rate": 1.2561632436658811e-05,
"loss": 0.7982,
"step": 1123
},
{
"epoch": 0.003735450975113887,
"grad_norm": 3.7411093711853027,
"learning_rate": 1.2447001068785969e-05,
"loss": 1.2467,
"step": 1124
},
{
"epoch": 0.003738774330073953,
"grad_norm": 2.4900856018066406,
"learning_rate": 1.233286041522358e-05,
"loss": 0.707,
"step": 1125
},
{
"epoch": 0.003742097685034019,
"grad_norm": 2.1829333305358887,
"learning_rate": 1.2219211115703621e-05,
"loss": 0.6891,
"step": 1126
},
{
"epoch": 0.0037454210399940845,
"grad_norm": 1.6782306432724,
"learning_rate": 1.210605380720411e-05,
"loss": 1.1077,
"step": 1127
},
{
"epoch": 0.00374874439495415,
"grad_norm": 2.4409914016723633,
"learning_rate": 1.1993389123945597e-05,
"loss": 0.8018,
"step": 1128
},
{
"epoch": 0.003752067749914216,
"grad_norm": 1.8187549114227295,
"learning_rate": 1.1881217697387547e-05,
"loss": 1.0921,
"step": 1129
},
{
"epoch": 0.003755391104874282,
"grad_norm": 1.4356580972671509,
"learning_rate": 1.1769540156224923e-05,
"loss": 0.7464,
"step": 1130
},
{
"epoch": 0.0037587144598343475,
"grad_norm": 2.073961019515991,
"learning_rate": 1.1658357126384479e-05,
"loss": 0.6749,
"step": 1131
},
{
"epoch": 0.003762037814794413,
"grad_norm": 1.284589409828186,
"learning_rate": 1.1547669231021395e-05,
"loss": 0.4601,
"step": 1132
},
{
"epoch": 0.0037653611697544788,
"grad_norm": 2.1579086780548096,
"learning_rate": 1.1437477090515713e-05,
"loss": 1.0621,
"step": 1133
},
{
"epoch": 0.0037686845247145444,
"grad_norm": 1.9478920698165894,
"learning_rate": 1.1327781322468889e-05,
"loss": 0.8049,
"step": 1134
},
{
"epoch": 0.0037720078796746105,
"grad_norm": 1.604015827178955,
"learning_rate": 1.1218582541700362e-05,
"loss": 1.1651,
"step": 1135
},
{
"epoch": 0.003775331234634676,
"grad_norm": 2.576106309890747,
"learning_rate": 1.1109881360244034e-05,
"loss": 1.2095,
"step": 1136
},
{
"epoch": 0.0037786545895947417,
"grad_norm": 1.4734787940979004,
"learning_rate": 1.100167838734486e-05,
"loss": 0.8627,
"step": 1137
},
{
"epoch": 0.0037819779445548074,
"grad_norm": 1.465071439743042,
"learning_rate": 1.0893974229455538e-05,
"loss": 1.0974,
"step": 1138
},
{
"epoch": 0.0037853012995148734,
"grad_norm": 1.63118577003479,
"learning_rate": 1.0786769490232895e-05,
"loss": 1.0472,
"step": 1139
},
{
"epoch": 0.003788624654474939,
"grad_norm": 1.9454184770584106,
"learning_rate": 1.0680064770534748e-05,
"loss": 1.0103,
"step": 1140
},
{
"epoch": 0.0037919480094350047,
"grad_norm": 1.2677627801895142,
"learning_rate": 1.0573860668416358e-05,
"loss": 0.7591,
"step": 1141
},
{
"epoch": 0.0037952713643950703,
"grad_norm": 1.8168962001800537,
"learning_rate": 1.046815777912713e-05,
"loss": 0.3897,
"step": 1142
},
{
"epoch": 0.0037985947193551364,
"grad_norm": 2.0347893238067627,
"learning_rate": 1.0362956695107361e-05,
"loss": 0.9761,
"step": 1143
},
{
"epoch": 0.003801918074315202,
"grad_norm": 1.2987099885940552,
"learning_rate": 1.0258258005984777e-05,
"loss": 0.7144,
"step": 1144
},
{
"epoch": 0.0038052414292752677,
"grad_norm": 2.079082489013672,
"learning_rate": 1.015406229857132e-05,
"loss": 0.8902,
"step": 1145
},
{
"epoch": 0.0038085647842353333,
"grad_norm": 1.4583004713058472,
"learning_rate": 1.0050370156859811e-05,
"loss": 0.8563,
"step": 1146
},
{
"epoch": 0.003811888139195399,
"grad_norm": 2.9463703632354736,
"learning_rate": 9.947182162020729e-06,
"loss": 0.5976,
"step": 1147
},
{
"epoch": 0.003815211494155465,
"grad_norm": 1.276196002960205,
"learning_rate": 9.844498892398946e-06,
"loss": 1.2324,
"step": 1148
},
{
"epoch": 0.0038185348491155307,
"grad_norm": 2.1213057041168213,
"learning_rate": 9.742320923510428e-06,
"loss": 1.2369,
"step": 1149
},
{
"epoch": 0.0038218582040755963,
"grad_norm": 2.704526424407959,
"learning_rate": 9.640648828039045e-06,
"loss": 0.9242,
"step": 1150
},
{
"epoch": 0.003825181559035662,
"grad_norm": 1.3100472688674927,
"learning_rate": 9.539483175833397e-06,
"loss": 0.6016,
"step": 1151
},
{
"epoch": 0.003828504913995728,
"grad_norm": 2.6044392585754395,
"learning_rate": 9.438824533903545e-06,
"loss": 0.6004,
"step": 1152
},
{
"epoch": 0.0038318282689557936,
"grad_norm": 1.9593836069107056,
"learning_rate": 9.338673466417924e-06,
"loss": 1.0782,
"step": 1153
},
{
"epoch": 0.0038351516239158593,
"grad_norm": 2.4404966831207275,
"learning_rate": 9.2390305347001e-06,
"loss": 0.9982,
"step": 1154
},
{
"epoch": 0.003838474978875925,
"grad_norm": 1.8588268756866455,
"learning_rate": 9.139896297225626e-06,
"loss": 0.65,
"step": 1155
},
{
"epoch": 0.003841798333835991,
"grad_norm": 1.6673916578292847,
"learning_rate": 9.041271309619048e-06,
"loss": 1.3098,
"step": 1156
},
{
"epoch": 0.0038451216887960566,
"grad_norm": 2.00077223777771,
"learning_rate": 8.943156124650531e-06,
"loss": 1.1602,
"step": 1157
},
{
"epoch": 0.0038484450437561222,
"grad_norm": 3.087786912918091,
"learning_rate": 8.845551292233045e-06,
"loss": 0.9981,
"step": 1158
},
{
"epoch": 0.003851768398716188,
"grad_norm": 1.7945916652679443,
"learning_rate": 8.748457359419093e-06,
"loss": 0.9974,
"step": 1159
},
{
"epoch": 0.0038550917536762535,
"grad_norm": 2.521268129348755,
"learning_rate": 8.651874870397692e-06,
"loss": 0.9116,
"step": 1160
},
{
"epoch": 0.0038584151086363196,
"grad_norm": 2.505439281463623,
"learning_rate": 8.555804366491405e-06,
"loss": 0.9118,
"step": 1161
},
{
"epoch": 0.003861738463596385,
"grad_norm": 2.009557008743286,
"learning_rate": 8.460246386153147e-06,
"loss": 0.6388,
"step": 1162
},
{
"epoch": 0.003865061818556451,
"grad_norm": 3.128729820251465,
"learning_rate": 8.365201464963302e-06,
"loss": 1.0031,
"step": 1163
},
{
"epoch": 0.0038683851735165165,
"grad_norm": 1.6357756853103638,
"learning_rate": 8.27067013562669e-06,
"loss": 0.8037,
"step": 1164
},
{
"epoch": 0.0038717085284765826,
"grad_norm": 2.719797134399414,
"learning_rate": 8.17665292796952e-06,
"loss": 1.0075,
"step": 1165
},
{
"epoch": 0.003875031883436648,
"grad_norm": 2.046483278274536,
"learning_rate": 8.083150368936532e-06,
"loss": 0.2509,
"step": 1166
},
{
"epoch": 0.003878355238396714,
"grad_norm": 1.5247260332107544,
"learning_rate": 7.990162982587924e-06,
"loss": 1.1599,
"step": 1167
},
{
"epoch": 0.0038816785933567795,
"grad_norm": 1.7768243551254272,
"learning_rate": 7.897691290096498e-06,
"loss": 0.6034,
"step": 1168
},
{
"epoch": 0.0038850019483168455,
"grad_norm": 3.438926935195923,
"learning_rate": 7.805735809744696e-06,
"loss": 0.6389,
"step": 1169
},
{
"epoch": 0.003888325303276911,
"grad_norm": 1.7144073247909546,
"learning_rate": 7.714297056921715e-06,
"loss": 1.159,
"step": 1170
},
{
"epoch": 0.003891648658236977,
"grad_norm": 1.8290139436721802,
"learning_rate": 7.623375544120648e-06,
"loss": 0.9152,
"step": 1171
},
{
"epoch": 0.0038949720131970424,
"grad_norm": 2.1877503395080566,
"learning_rate": 7.532971780935516e-06,
"loss": 1.0348,
"step": 1172
},
{
"epoch": 0.003898295368157108,
"grad_norm": 1.77268648147583,
"learning_rate": 7.443086274058497e-06,
"loss": 0.7937,
"step": 1173
},
{
"epoch": 0.003901618723117174,
"grad_norm": 1.3810285329818726,
"learning_rate": 7.353719527277125e-06,
"loss": 0.9105,
"step": 1174
},
{
"epoch": 0.0039049420780772398,
"grad_norm": 1.8284751176834106,
"learning_rate": 7.2648720414712716e-06,
"loss": 1.1995,
"step": 1175
},
{
"epoch": 0.003908265433037306,
"grad_norm": 1.7051944732666016,
"learning_rate": 7.1765443146106e-06,
"loss": 0.7849,
"step": 1176
},
{
"epoch": 0.003911588787997371,
"grad_norm": 1.6046769618988037,
"learning_rate": 7.088736841751575e-06,
"loss": 0.8096,
"step": 1177
},
{
"epoch": 0.003914912142957437,
"grad_norm": 2.0210366249084473,
"learning_rate": 7.001450115034758e-06,
"loss": 0.6858,
"step": 1178
},
{
"epoch": 0.003918235497917502,
"grad_norm": 1.8352065086364746,
"learning_rate": 6.914684623682099e-06,
"loss": 0.8634,
"step": 1179
},
{
"epoch": 0.003921558852877568,
"grad_norm": 1.6586220264434814,
"learning_rate": 6.828440853994089e-06,
"loss": 1.0516,
"step": 1180
},
{
"epoch": 0.0039248822078376345,
"grad_norm": 1.8991930484771729,
"learning_rate": 6.7427192893471105e-06,
"loss": 1.3651,
"step": 1181
},
{
"epoch": 0.0039282055627977,
"grad_norm": 2.0748491287231445,
"learning_rate": 6.657520410190721e-06,
"loss": 0.9304,
"step": 1182
},
{
"epoch": 0.003931528917757766,
"grad_norm": 2.1835572719573975,
"learning_rate": 6.572844694044911e-06,
"loss": 0.7364,
"step": 1183
},
{
"epoch": 0.003934852272717832,
"grad_norm": 1.8947280645370483,
"learning_rate": 6.488692615497516e-06,
"loss": 0.9096,
"step": 1184
},
{
"epoch": 0.003938175627677897,
"grad_norm": 1.584531307220459,
"learning_rate": 6.405064646201464e-06,
"loss": 0.8091,
"step": 1185
},
{
"epoch": 0.003941498982637963,
"grad_norm": 2.568542718887329,
"learning_rate": 6.321961254872166e-06,
"loss": 0.8995,
"step": 1186
},
{
"epoch": 0.003944822337598028,
"grad_norm": 1.460935354232788,
"learning_rate": 6.239382907284941e-06,
"loss": 0.6506,
"step": 1187
},
{
"epoch": 0.003948145692558094,
"grad_norm": 2.127983570098877,
"learning_rate": 6.157330066272282e-06,
"loss": 0.8965,
"step": 1188
},
{
"epoch": 0.00395146904751816,
"grad_norm": 1.676879644393921,
"learning_rate": 6.0758031917214296e-06,
"loss": 1.0078,
"step": 1189
},
{
"epoch": 0.003954792402478226,
"grad_norm": 1.7694756984710693,
"learning_rate": 5.994802740571659e-06,
"loss": 0.5985,
"step": 1190
},
{
"epoch": 0.003958115757438292,
"grad_norm": 2.5703816413879395,
"learning_rate": 5.914329166811727e-06,
"loss": 0.9092,
"step": 1191
},
{
"epoch": 0.003961439112398357,
"grad_norm": 1.5351899862289429,
"learning_rate": 5.834382921477466e-06,
"loss": 1.1089,
"step": 1192
},
{
"epoch": 0.003964762467358423,
"grad_norm": 2.5164711475372314,
"learning_rate": 5.754964452649037e-06,
"loss": 0.9535,
"step": 1193
},
{
"epoch": 0.003968085822318489,
"grad_norm": 2.1561896800994873,
"learning_rate": 5.67607420544859e-06,
"loss": 1.2628,
"step": 1194
},
{
"epoch": 0.003971409177278554,
"grad_norm": 2.350405693054199,
"learning_rate": 5.597712622037754e-06,
"loss": 0.826,
"step": 1195
},
{
"epoch": 0.00397473253223862,
"grad_norm": 2.584944009780884,
"learning_rate": 5.519880141615042e-06,
"loss": 0.9049,
"step": 1196
},
{
"epoch": 0.003978055887198686,
"grad_norm": 1.9516925811767578,
"learning_rate": 5.442577200413546e-06,
"loss": 1.0605,
"step": 1197
},
{
"epoch": 0.0039813792421587516,
"grad_norm": 3.750558853149414,
"learning_rate": 5.365804231698368e-06,
"loss": 0.9567,
"step": 1198
},
{
"epoch": 0.003984702597118818,
"grad_norm": 1.9897552728652954,
"learning_rate": 5.289561665764198e-06,
"loss": 0.6373,
"step": 1199
},
{
"epoch": 0.003988025952078883,
"grad_norm": 2.0267670154571533,
"learning_rate": 5.213849929933024e-06,
"loss": 0.7988,
"step": 1200
},
{
"epoch": 0.003991349307038949,
"grad_norm": 1.9218477010726929,
"learning_rate": 5.138669448551614e-06,
"loss": 0.8301,
"step": 1201
},
{
"epoch": 0.003994672661999015,
"grad_norm": 2.785154104232788,
"learning_rate": 5.0640206429891535e-06,
"loss": 0.8819,
"step": 1202
},
{
"epoch": 0.00399799601695908,
"grad_norm": 1.7020105123519897,
"learning_rate": 4.989903931634965e-06,
"loss": 0.9052,
"step": 1203
},
{
"epoch": 0.004001319371919146,
"grad_norm": 1.9669575691223145,
"learning_rate": 4.916319729896057e-06,
"loss": 0.8468,
"step": 1204
},
{
"epoch": 0.004004642726879211,
"grad_norm": 2.421363353729248,
"learning_rate": 4.843268450194871e-06,
"loss": 0.7444,
"step": 1205
},
{
"epoch": 0.0040079660818392775,
"grad_norm": 1.5673800706863403,
"learning_rate": 4.770750501966925e-06,
"loss": 0.9701,
"step": 1206
},
{
"epoch": 0.004011289436799344,
"grad_norm": 1.9752105474472046,
"learning_rate": 4.698766291658552e-06,
"loss": 1.0881,
"step": 1207
},
{
"epoch": 0.004014612791759409,
"grad_norm": 1.9096660614013672,
"learning_rate": 4.627316222724598e-06,
"loss": 0.9221,
"step": 1208
},
{
"epoch": 0.004017936146719475,
"grad_norm": 3.038297653198242,
"learning_rate": 4.556400695626173e-06,
"loss": 0.7838,
"step": 1209
},
{
"epoch": 0.004021259501679541,
"grad_norm": 2.406052350997925,
"learning_rate": 4.486020107828448e-06,
"loss": 1.0768,
"step": 1210
},
{
"epoch": 0.004024582856639606,
"grad_norm": 2.2107632160186768,
"learning_rate": 4.416174853798283e-06,
"loss": 0.9042,
"step": 1211
},
{
"epoch": 0.004027906211599672,
"grad_norm": 1.4963706731796265,
"learning_rate": 4.34686532500218e-06,
"loss": 0.7624,
"step": 1212
},
{
"epoch": 0.004031229566559737,
"grad_norm": 1.8679277896881104,
"learning_rate": 4.2780919099040585e-06,
"loss": 0.9459,
"step": 1213
},
{
"epoch": 0.0040345529215198035,
"grad_norm": 2.617995023727417,
"learning_rate": 4.2098549939629696e-06,
"loss": 1.009,
"step": 1214
},
{
"epoch": 0.0040378762764798695,
"grad_norm": 1.7749000787734985,
"learning_rate": 4.14215495963105e-06,
"loss": 0.8112,
"step": 1215
},
{
"epoch": 0.004041199631439935,
"grad_norm": 1.2091432809829712,
"learning_rate": 4.074992186351367e-06,
"loss": 0.6515,
"step": 1216
},
{
"epoch": 0.004044522986400001,
"grad_norm": 2.0824778079986572,
"learning_rate": 4.008367050555683e-06,
"loss": 1.0088,
"step": 1217
},
{
"epoch": 0.004047846341360066,
"grad_norm": 1.503562569618225,
"learning_rate": 3.942279925662506e-06,
"loss": 0.8859,
"step": 1218
},
{
"epoch": 0.004051169696320132,
"grad_norm": 2.2736172676086426,
"learning_rate": 3.876731182074888e-06,
"loss": 0.9602,
"step": 1219
},
{
"epoch": 0.004054493051280198,
"grad_norm": 1.9392001628875732,
"learning_rate": 3.811721187178352e-06,
"loss": 1.2851,
"step": 1220
},
{
"epoch": 0.004057816406240263,
"grad_norm": 1.8600444793701172,
"learning_rate": 3.747250305338934e-06,
"loss": 0.3464,
"step": 1221
},
{
"epoch": 0.004061139761200329,
"grad_norm": 3.741283655166626,
"learning_rate": 3.6833188979009447e-06,
"loss": 0.9047,
"step": 1222
},
{
"epoch": 0.0040644631161603955,
"grad_norm": 2.066908359527588,
"learning_rate": 3.6199273231852016e-06,
"loss": 0.7508,
"step": 1223
},
{
"epoch": 0.004067786471120461,
"grad_norm": 1.472381591796875,
"learning_rate": 3.5570759364867976e-06,
"loss": 1.1071,
"step": 1224
},
{
"epoch": 0.004071109826080527,
"grad_norm": 2.138390302658081,
"learning_rate": 3.494765090073193e-06,
"loss": 0.7563,
"step": 1225
},
{
"epoch": 0.004074433181040592,
"grad_norm": 1.5176939964294434,
"learning_rate": 3.432995133182315e-06,
"loss": 1.1034,
"step": 1226
},
{
"epoch": 0.004077756536000658,
"grad_norm": 1.9056531190872192,
"learning_rate": 3.3717664120204717e-06,
"loss": 0.7667,
"step": 1227
},
{
"epoch": 0.004081079890960724,
"grad_norm": 3.191318988800049,
"learning_rate": 3.3110792697604755e-06,
"loss": 0.8358,
"step": 1228
},
{
"epoch": 0.004084403245920789,
"grad_norm": 1.8594859838485718,
"learning_rate": 3.250934046539722e-06,
"loss": 0.2147,
"step": 1229
},
{
"epoch": 0.004087726600880855,
"grad_norm": 1.8989686965942383,
"learning_rate": 3.1913310794582817e-06,
"loss": 0.7407,
"step": 1230
},
{
"epoch": 0.0040910499558409206,
"grad_norm": 2.4154255390167236,
"learning_rate": 3.1322707025770114e-06,
"loss": 0.7876,
"step": 1231
},
{
"epoch": 0.004094373310800987,
"grad_norm": 2.14886736869812,
"learning_rate": 3.073753246915656e-06,
"loss": 0.8468,
"step": 1232
},
{
"epoch": 0.004097696665761053,
"grad_norm": 2.599923849105835,
"learning_rate": 3.015779040451017e-06,
"loss": 0.8232,
"step": 1233
},
{
"epoch": 0.004101020020721118,
"grad_norm": 1.8031104803085327,
"learning_rate": 2.958348408115108e-06,
"loss": 0.9074,
"step": 1234
},
{
"epoch": 0.004104343375681184,
"grad_norm": 1.2037352323532104,
"learning_rate": 2.9014616717933595e-06,
"loss": 0.494,
"step": 1235
},
{
"epoch": 0.00410766673064125,
"grad_norm": 1.8279937505722046,
"learning_rate": 2.8451191503227614e-06,
"loss": 0.7566,
"step": 1236
},
{
"epoch": 0.004110990085601315,
"grad_norm": 1.9116507768630981,
"learning_rate": 2.7893211594901215e-06,
"loss": 0.937,
"step": 1237
},
{
"epoch": 0.004114313440561381,
"grad_norm": 1.6439839601516724,
"learning_rate": 2.7340680120302554e-06,
"loss": 0.7953,
"step": 1238
},
{
"epoch": 0.0041176367955214465,
"grad_norm": 2.841966390609741,
"learning_rate": 2.6793600176243105e-06,
"loss": 0.7789,
"step": 1239
},
{
"epoch": 0.004120960150481513,
"grad_norm": 1.5409972667694092,
"learning_rate": 2.625197482897912e-06,
"loss": 0.4756,
"step": 1240
},
{
"epoch": 0.004124283505441579,
"grad_norm": 1.8044242858886719,
"learning_rate": 2.5715807114195525e-06,
"loss": 0.7198,
"step": 1241
},
{
"epoch": 0.004127606860401644,
"grad_norm": 2.6371781826019287,
"learning_rate": 2.5185100036988484e-06,
"loss": 0.6385,
"step": 1242
},
{
"epoch": 0.00413093021536171,
"grad_norm": 1.3817503452301025,
"learning_rate": 2.4659856571848105e-06,
"loss": 0.9401,
"step": 1243
},
{
"epoch": 0.004134253570321775,
"grad_norm": 1.8377162218093872,
"learning_rate": 2.4140079662642865e-06,
"loss": 0.8917,
"step": 1244
},
{
"epoch": 0.004137576925281841,
"grad_norm": 1.7400139570236206,
"learning_rate": 2.362577222260198e-06,
"loss": 1.108,
"step": 1245
},
{
"epoch": 0.004140900280241907,
"grad_norm": 1.966605544090271,
"learning_rate": 2.3116937134299745e-06,
"loss": 0.6442,
"step": 1246
},
{
"epoch": 0.0041442236352019724,
"grad_norm": 1.370110273361206,
"learning_rate": 2.2613577249639083e-06,
"loss": 0.777,
"step": 1247
},
{
"epoch": 0.0041475469901620385,
"grad_norm": 1.5771291255950928,
"learning_rate": 2.2115695389835712e-06,
"loss": 0.6103,
"step": 1248
},
{
"epoch": 0.004150870345122105,
"grad_norm": 2.013594388961792,
"learning_rate": 2.1623294345402447e-06,
"loss": 1.0112,
"step": 1249
},
{
"epoch": 0.00415419370008217,
"grad_norm": 2.0027196407318115,
"learning_rate": 2.1136376876133234e-06,
"loss": 0.8395,
"step": 1250
},
{
"epoch": 0.004157517055042236,
"grad_norm": 2.6950972080230713,
"learning_rate": 2.0654945711087834e-06,
"loss": 1.2351,
"step": 1251
},
{
"epoch": 0.004160840410002301,
"grad_norm": 2.4111666679382324,
"learning_rate": 2.0179003548576602e-06,
"loss": 1.0297,
"step": 1252
},
{
"epoch": 0.004164163764962367,
"grad_norm": 1.8260390758514404,
"learning_rate": 1.970855305614516e-06,
"loss": 1.159,
"step": 1253
},
{
"epoch": 0.004167487119922433,
"grad_norm": 2.396212577819824,
"learning_rate": 1.9243596870559988e-06,
"loss": 0.9679,
"step": 1254
},
{
"epoch": 0.004170810474882498,
"grad_norm": 2.532935380935669,
"learning_rate": 1.8784137597792738e-06,
"loss": 0.9187,
"step": 1255
},
{
"epoch": 0.0041741338298425645,
"grad_norm": 1.9746872186660767,
"learning_rate": 1.8330177813006388e-06,
"loss": 0.9043,
"step": 1256
},
{
"epoch": 0.0041774571848026305,
"grad_norm": 1.886202335357666,
"learning_rate": 1.7881720060540786e-06,
"loss": 0.8272,
"step": 1257
},
{
"epoch": 0.004180780539762696,
"grad_norm": 1.5669914484024048,
"learning_rate": 1.7438766853897558e-06,
"loss": 0.798,
"step": 1258
},
{
"epoch": 0.004184103894722762,
"grad_norm": 1.6605589389801025,
"learning_rate": 1.7001320675727239e-06,
"loss": 1.1453,
"step": 1259
},
{
"epoch": 0.004187427249682827,
"grad_norm": 2.4264414310455322,
"learning_rate": 1.656938397781449e-06,
"loss": 0.727,
"step": 1260
},
{
"epoch": 0.004190750604642893,
"grad_norm": 2.188891887664795,
"learning_rate": 1.6142959181064343e-06,
"loss": 0.9834,
"step": 1261
},
{
"epoch": 0.004194073959602959,
"grad_norm": 1.7915462255477905,
"learning_rate": 1.5722048675489541e-06,
"loss": 0.8014,
"step": 1262
},
{
"epoch": 0.004197397314563024,
"grad_norm": 2.2112960815429688,
"learning_rate": 1.5306654820195886e-06,
"loss": 0.669,
"step": 1263
},
{
"epoch": 0.00420072066952309,
"grad_norm": 1.741981029510498,
"learning_rate": 1.489677994336991e-06,
"loss": 0.9008,
"step": 1264
},
{
"epoch": 0.004204044024483156,
"grad_norm": 1.3700710535049438,
"learning_rate": 1.4492426342265552e-06,
"loss": 0.7386,
"step": 1265
},
{
"epoch": 0.004207367379443222,
"grad_norm": 2.1248631477355957,
"learning_rate": 1.4093596283191179e-06,
"loss": 0.977,
"step": 1266
},
{
"epoch": 0.004210690734403288,
"grad_norm": 2.1686811447143555,
"learning_rate": 1.3700292001497028e-06,
"loss": 0.8883,
"step": 1267
},
{
"epoch": 0.004214014089363353,
"grad_norm": 1.8819714784622192,
"learning_rate": 1.3312515701562667e-06,
"loss": 0.5279,
"step": 1268
},
{
"epoch": 0.004217337444323419,
"grad_norm": 1.7714366912841797,
"learning_rate": 1.2930269556784558e-06,
"loss": 1.1745,
"step": 1269
},
{
"epoch": 0.004220660799283485,
"grad_norm": 2.3303427696228027,
"learning_rate": 1.2553555709564068e-06,
"loss": 1.0176,
"step": 1270
},
{
"epoch": 0.00422398415424355,
"grad_norm": 1.934847354888916,
"learning_rate": 1.2182376271294815e-06,
"loss": 1.0643,
"step": 1271
},
{
"epoch": 0.004227307509203616,
"grad_norm": 1.9978758096694946,
"learning_rate": 1.1816733322352114e-06,
"loss": 1.093,
"step": 1272
},
{
"epoch": 0.0042306308641636816,
"grad_norm": 1.9631693363189697,
"learning_rate": 1.1456628912079992e-06,
"loss": 1.091,
"step": 1273
},
{
"epoch": 0.004233954219123748,
"grad_norm": 1.8431519269943237,
"learning_rate": 1.110206505878031e-06,
"loss": 0.8671,
"step": 1274
},
{
"epoch": 0.004237277574083814,
"grad_norm": 2.2053067684173584,
"learning_rate": 1.0753043749701652e-06,
"loss": 0.8372,
"step": 1275
},
{
"epoch": 0.004240600929043879,
"grad_norm": 2.2077174186706543,
"learning_rate": 1.040956694102746e-06,
"loss": 1.0352,
"step": 1276
},
{
"epoch": 0.004243924284003945,
"grad_norm": 1.8683050870895386,
"learning_rate": 1.0071636557866137e-06,
"loss": 0.9413,
"step": 1277
},
{
"epoch": 0.00424724763896401,
"grad_norm": 1.6869767904281616,
"learning_rate": 9.73925449423896e-07,
"loss": 0.8921,
"step": 1278
},
{
"epoch": 0.004250570993924076,
"grad_norm": 2.4182016849517822,
"learning_rate": 9.412422613070848e-07,
"loss": 0.9755,
"step": 1279
},
{
"epoch": 0.004253894348884142,
"grad_norm": 1.7648463249206543,
"learning_rate": 9.091142746178726e-07,
"loss": 1.1577,
"step": 1280
},
{
"epoch": 0.0042572177038442075,
"grad_norm": 1.8752923011779785,
"learning_rate": 8.775416694262073e-07,
"loss": 0.96,
"step": 1281
},
{
"epoch": 0.004260541058804274,
"grad_norm": 1.7734326124191284,
"learning_rate": 8.465246226892154e-07,
"loss": 0.6681,
"step": 1282
},
{
"epoch": 0.00426386441376434,
"grad_norm": 2.1282567977905273,
"learning_rate": 8.160633082502922e-07,
"loss": 0.8363,
"step": 1283
},
{
"epoch": 0.004267187768724405,
"grad_norm": 1.7843672037124634,
"learning_rate": 7.861578968380578e-07,
"loss": 1.0503,
"step": 1284
},
{
"epoch": 0.004270511123684471,
"grad_norm": 1.9426231384277344,
"learning_rate": 7.568085560654247e-07,
"loss": 1.0447,
"step": 1285
},
{
"epoch": 0.004273834478644536,
"grad_norm": 2.154215097427368,
"learning_rate": 7.28015450428654e-07,
"loss": 1.0945,
"step": 1286
},
{
"epoch": 0.004277157833604602,
"grad_norm": 1.881326675415039,
"learning_rate": 6.99778741306456e-07,
"loss": 0.8126,
"step": 1287
},
{
"epoch": 0.004280481188564668,
"grad_norm": 2.788193941116333,
"learning_rate": 6.720985869590357e-07,
"loss": 0.6972,
"step": 1288
},
{
"epoch": 0.0042838045435247335,
"grad_norm": 2.3980252742767334,
"learning_rate": 6.44975142527282e-07,
"loss": 1.0364,
"step": 1289
},
{
"epoch": 0.0042871278984847995,
"grad_norm": 2.5638821125030518,
"learning_rate": 6.184085600318024e-07,
"loss": 0.9393,
"step": 1290
},
{
"epoch": 0.004290451253444865,
"grad_norm": 3.0566511154174805,
"learning_rate": 5.92398988372167e-07,
"loss": 1.0036,
"step": 1291
},
{
"epoch": 0.004293774608404931,
"grad_norm": 2.413839101791382,
"learning_rate": 5.669465733259994e-07,
"loss": 1.3389,
"step": 1292
},
{
"epoch": 0.004297097963364997,
"grad_norm": 2.7977092266082764,
"learning_rate": 5.420514575482205e-07,
"loss": 1.091,
"step": 1293
},
{
"epoch": 0.004300421318325062,
"grad_norm": 3.4734342098236084,
"learning_rate": 5.177137805701615e-07,
"loss": 0.7818,
"step": 1294
},
{
"epoch": 0.004303744673285128,
"grad_norm": 2.988787889480591,
"learning_rate": 4.939336787988857e-07,
"loss": 0.7218,
"step": 1295
},
{
"epoch": 0.004307068028245194,
"grad_norm": 1.6737980842590332,
"learning_rate": 4.7071128551637867e-07,
"loss": 1.1564,
"step": 1296
},
{
"epoch": 0.004310391383205259,
"grad_norm": 2.376605749130249,
"learning_rate": 4.4804673087877057e-07,
"loss": 0.3682,
"step": 1297
},
{
"epoch": 0.0043137147381653255,
"grad_norm": 2.320552110671997,
"learning_rate": 4.2594014191565945e-07,
"loss": 0.9988,
"step": 1298
},
{
"epoch": 0.004317038093125391,
"grad_norm": 2.8528194427490234,
"learning_rate": 4.043916425293559e-07,
"loss": 1.3724,
"step": 1299
},
{
"epoch": 0.004320361448085457,
"grad_norm": 3.585374593734741,
"learning_rate": 3.83401353494206e-07,
"loss": 0.9507,
"step": 1300
},
{
"epoch": 0.004323684803045523,
"grad_norm": 2.1745338439941406,
"learning_rate": 3.6296939245593633e-07,
"loss": 0.6232,
"step": 1301
},
{
"epoch": 0.004327008158005588,
"grad_norm": 1.2730436325073242,
"learning_rate": 3.4309587393096534e-07,
"loss": 0.6022,
"step": 1302
},
{
"epoch": 0.004330331512965654,
"grad_norm": 2.007568120956421,
"learning_rate": 3.2378090930577087e-07,
"loss": 1.1188,
"step": 1303
},
{
"epoch": 0.004333654867925719,
"grad_norm": 3.0893633365631104,
"learning_rate": 3.0502460683624613e-07,
"loss": 0.5077,
"step": 1304
},
{
"epoch": 0.004336978222885785,
"grad_norm": 1.57036554813385,
"learning_rate": 2.868270716471444e-07,
"loss": 0.5061,
"step": 1305
},
{
"epoch": 0.004340301577845851,
"grad_norm": 1.7737019062042236,
"learning_rate": 2.6918840573144644e-07,
"loss": 1.0445,
"step": 1306
},
{
"epoch": 0.004343624932805917,
"grad_norm": 2.042353391647339,
"learning_rate": 2.521087079497719e-07,
"loss": 0.9395,
"step": 1307
},
{
"epoch": 0.004346948287765983,
"grad_norm": 2.071798086166382,
"learning_rate": 2.3558807402989103e-07,
"loss": 1.0835,
"step": 1308
},
{
"epoch": 0.004350271642726049,
"grad_norm": 1.6557015180587769,
"learning_rate": 2.1962659656614703e-07,
"loss": 1.0824,
"step": 1309
},
{
"epoch": 0.004353594997686114,
"grad_norm": 1.98556649684906,
"learning_rate": 2.042243650189124e-07,
"loss": 1.0463,
"step": 1310
},
{
"epoch": 0.00435691835264618,
"grad_norm": 1.9155919551849365,
"learning_rate": 1.8938146571413352e-07,
"loss": 0.8835,
"step": 1311
},
{
"epoch": 0.004360241707606245,
"grad_norm": 2.178790807723999,
"learning_rate": 1.750979818428422e-07,
"loss": 0.6642,
"step": 1312
},
{
"epoch": 0.004363565062566311,
"grad_norm": 1.4291096925735474,
"learning_rate": 1.6137399346064508e-07,
"loss": 0.7367,
"step": 1313
},
{
"epoch": 0.004366888417526377,
"grad_norm": 3.2246644496917725,
"learning_rate": 1.4820957748733488e-07,
"loss": 0.8849,
"step": 1314
},
{
"epoch": 0.004370211772486443,
"grad_norm": 2.7322123050689697,
"learning_rate": 1.3560480770642425e-07,
"loss": 1.2786,
"step": 1315
},
{
"epoch": 0.004373535127446509,
"grad_norm": 2.1076912879943848,
"learning_rate": 1.235597547647127e-07,
"loss": 1.0382,
"step": 1316
},
{
"epoch": 0.004376858482406574,
"grad_norm": 2.021298885345459,
"learning_rate": 1.120744861719536e-07,
"loss": 0.7905,
"step": 1317
},
{
"epoch": 0.00438018183736664,
"grad_norm": 2.3875844478607178,
"learning_rate": 1.0114906630040999e-07,
"loss": 0.7163,
"step": 1318
},
{
"epoch": 0.004383505192326706,
"grad_norm": 1.4649884700775146,
"learning_rate": 9.078355638453273e-08,
"loss": 1.1184,
"step": 1319
},
{
"epoch": 0.004386828547286771,
"grad_norm": 1.641730546951294,
"learning_rate": 8.097801452061626e-08,
"loss": 0.7022,
"step": 1320
},
{
"epoch": 0.004390151902246837,
"grad_norm": 2.3859140872955322,
"learning_rate": 7.173249566645446e-08,
"loss": 1.2353,
"step": 1321
},
{
"epoch": 0.004393475257206903,
"grad_norm": 1.9294756650924683,
"learning_rate": 6.304705164104086e-08,
"loss": 0.857,
"step": 1322
},
{
"epoch": 0.0043967986121669685,
"grad_norm": 2.262242078781128,
"learning_rate": 5.4921731124280007e-08,
"loss": 0.8556,
"step": 1323
},
{
"epoch": 0.004400121967127035,
"grad_norm": 1.5056575536727905,
"learning_rate": 4.735657965672102e-08,
"loss": 0.8407,
"step": 1324
},
{
"epoch": 0.0044034453220871,
"grad_norm": 2.3266441822052,
"learning_rate": 4.035163963926891e-08,
"loss": 0.6501,
"step": 1325
},
{
"epoch": 0.004406768677047166,
"grad_norm": 2.55777907371521,
"learning_rate": 3.390695033300695e-08,
"loss": 1.2284,
"step": 1326
},
{
"epoch": 0.004410092032007232,
"grad_norm": 2.0342135429382324,
"learning_rate": 2.8022547858930213e-08,
"loss": 0.7973,
"step": 1327
},
{
"epoch": 0.004413415386967297,
"grad_norm": 1.9018234014511108,
"learning_rate": 2.2698465197745766e-08,
"loss": 0.7389,
"step": 1328
},
{
"epoch": 0.004416738741927363,
"grad_norm": 1.6029261350631714,
"learning_rate": 1.7934732189706093e-08,
"loss": 0.796,
"step": 1329
},
{
"epoch": 0.004420062096887428,
"grad_norm": 1.9555233716964722,
"learning_rate": 1.3731375534442593e-08,
"loss": 0.5044,
"step": 1330
},
{
"epoch": 0.0044233854518474945,
"grad_norm": 1.6823757886886597,
"learning_rate": 1.0088418790787923e-08,
"loss": 0.5384,
"step": 1331
},
{
"epoch": 0.0044267088068075605,
"grad_norm": 1.7445098161697388,
"learning_rate": 7.005882376676098e-09,
"loss": 0.8662,
"step": 1332
},
{
"epoch": 0.004430032161767626,
"grad_norm": 1.3536102771759033,
"learning_rate": 4.483783568998145e-09,
"loss": 0.6267,
"step": 1333
},
{
"epoch": 0.004433355516727692,
"grad_norm": 2.434354543685913,
"learning_rate": 2.5221365035577037e-09,
"loss": 0.9421,
"step": 1334
},
{
"epoch": 0.004436678871687758,
"grad_norm": 3.5411949157714844,
"learning_rate": 1.1209521749155904e-09,
"loss": 0.6256,
"step": 1335
},
{
"epoch": 0.004440002226647823,
"grad_norm": 1.8648262023925781,
"learning_rate": 2.8023843640090364e-10,
"loss": 0.8315,
"step": 1336
},
{
"epoch": 0.004443325581607889,
"grad_norm": 3.0049996376037598,
"learning_rate": 0.0,
"loss": 0.7712,
"step": 1337
}
],
"logging_steps": 1,
"max_steps": 1337,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 335,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.018183511703552e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}