{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.9928673323823105,
"eval_steps": 500,
"global_step": 2625,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019020446980504042,
"grad_norm": 0.9932524561882019,
"learning_rate": 2e-05,
"loss": 1.3348,
"step": 1
},
{
"epoch": 0.0038040893961008085,
"grad_norm": 0.9241018295288086,
"learning_rate": 4e-05,
"loss": 1.3131,
"step": 2
},
{
"epoch": 0.005706134094151213,
"grad_norm": 1.1556137800216675,
"learning_rate": 6e-05,
"loss": 1.5644,
"step": 3
},
{
"epoch": 0.007608178792201617,
"grad_norm": 0.8612737059593201,
"learning_rate": 8e-05,
"loss": 1.2192,
"step": 4
},
{
"epoch": 0.009510223490252021,
"grad_norm": 0.8998388648033142,
"learning_rate": 0.0001,
"loss": 1.3651,
"step": 5
},
{
"epoch": 0.011412268188302425,
"grad_norm": 0.7211980819702148,
"learning_rate": 9.999364877738964e-05,
"loss": 1.2525,
"step": 6
},
{
"epoch": 0.01331431288635283,
"grad_norm": 0.44894707202911377,
"learning_rate": 9.998729755477931e-05,
"loss": 1.1999,
"step": 7
},
{
"epoch": 0.015216357584403234,
"grad_norm": 0.4338511824607849,
"learning_rate": 9.998094633216895e-05,
"loss": 1.0147,
"step": 8
},
{
"epoch": 0.017118402282453638,
"grad_norm": 0.5658989548683167,
"learning_rate": 9.99745951095586e-05,
"loss": 1.1997,
"step": 9
},
{
"epoch": 0.019020446980504042,
"grad_norm": 0.4467356503009796,
"learning_rate": 9.996824388694824e-05,
"loss": 1.0424,
"step": 10
},
{
"epoch": 0.020922491678554447,
"grad_norm": 0.3743385374546051,
"learning_rate": 9.996189266433789e-05,
"loss": 1.0902,
"step": 11
},
{
"epoch": 0.02282453637660485,
"grad_norm": 0.30667275190353394,
"learning_rate": 9.995554144172754e-05,
"loss": 0.8736,
"step": 12
},
{
"epoch": 0.024726581074655255,
"grad_norm": 0.48634254932403564,
"learning_rate": 9.994919021911718e-05,
"loss": 0.977,
"step": 13
},
{
"epoch": 0.02662862577270566,
"grad_norm": 0.4229658246040344,
"learning_rate": 9.994283899650683e-05,
"loss": 0.9673,
"step": 14
},
{
"epoch": 0.028530670470756064,
"grad_norm": 0.39269882440567017,
"learning_rate": 9.993648777389648e-05,
"loss": 1.0001,
"step": 15
},
{
"epoch": 0.030432715168806468,
"grad_norm": 0.38597363233566284,
"learning_rate": 9.993013655128612e-05,
"loss": 0.9705,
"step": 16
},
{
"epoch": 0.03233475986685687,
"grad_norm": 0.40809136629104614,
"learning_rate": 9.992378532867577e-05,
"loss": 0.9246,
"step": 17
},
{
"epoch": 0.034236804564907276,
"grad_norm": 0.4431133270263672,
"learning_rate": 9.991743410606542e-05,
"loss": 1.0409,
"step": 18
},
{
"epoch": 0.03613884926295768,
"grad_norm": 0.5659255981445312,
"learning_rate": 9.991108288345506e-05,
"loss": 1.1118,
"step": 19
},
{
"epoch": 0.038040893961008085,
"grad_norm": 0.4943106472492218,
"learning_rate": 9.990473166084471e-05,
"loss": 0.9213,
"step": 20
},
{
"epoch": 0.039942938659058486,
"grad_norm": 0.48820945620536804,
"learning_rate": 9.989838043823437e-05,
"loss": 0.9108,
"step": 21
},
{
"epoch": 0.04184498335710889,
"grad_norm": 0.4464576542377472,
"learning_rate": 9.989202921562402e-05,
"loss": 0.8959,
"step": 22
},
{
"epoch": 0.043747028055159294,
"grad_norm": 0.3870016038417816,
"learning_rate": 9.988567799301366e-05,
"loss": 0.8013,
"step": 23
},
{
"epoch": 0.0456490727532097,
"grad_norm": 0.42381179332733154,
"learning_rate": 9.987932677040331e-05,
"loss": 0.8584,
"step": 24
},
{
"epoch": 0.0475511174512601,
"grad_norm": 0.37170907855033875,
"learning_rate": 9.987297554779296e-05,
"loss": 0.7849,
"step": 25
},
{
"epoch": 0.04945316214931051,
"grad_norm": 0.4516700506210327,
"learning_rate": 9.98666243251826e-05,
"loss": 0.8902,
"step": 26
},
{
"epoch": 0.05135520684736091,
"grad_norm": 0.3525027334690094,
"learning_rate": 9.986027310257225e-05,
"loss": 0.6029,
"step": 27
},
{
"epoch": 0.05325725154541132,
"grad_norm": 0.437707781791687,
"learning_rate": 9.98539218799619e-05,
"loss": 0.7387,
"step": 28
},
{
"epoch": 0.05515929624346172,
"grad_norm": 0.45205071568489075,
"learning_rate": 9.984757065735154e-05,
"loss": 0.7468,
"step": 29
},
{
"epoch": 0.05706134094151213,
"grad_norm": 0.3709086775779724,
"learning_rate": 9.984121943474119e-05,
"loss": 0.7365,
"step": 30
},
{
"epoch": 0.05896338563956253,
"grad_norm": 0.4089844822883606,
"learning_rate": 9.983486821213084e-05,
"loss": 0.6563,
"step": 31
},
{
"epoch": 0.060865430337612936,
"grad_norm": 0.45955532789230347,
"learning_rate": 9.982851698952048e-05,
"loss": 0.8021,
"step": 32
},
{
"epoch": 0.06276747503566334,
"grad_norm": 0.5240988731384277,
"learning_rate": 9.982216576691013e-05,
"loss": 0.6933,
"step": 33
},
{
"epoch": 0.06466951973371374,
"grad_norm": 0.4703526496887207,
"learning_rate": 9.981581454429977e-05,
"loss": 0.7339,
"step": 34
},
{
"epoch": 0.06657156443176415,
"grad_norm": 0.5659805536270142,
"learning_rate": 9.980946332168944e-05,
"loss": 0.8139,
"step": 35
},
{
"epoch": 0.06847360912981455,
"grad_norm": 0.39259326457977295,
"learning_rate": 9.980311209907908e-05,
"loss": 0.5838,
"step": 36
},
{
"epoch": 0.07037565382786495,
"grad_norm": 0.4165003001689911,
"learning_rate": 9.979676087646871e-05,
"loss": 0.674,
"step": 37
},
{
"epoch": 0.07227769852591535,
"grad_norm": 0.4533802568912506,
"learning_rate": 9.979040965385838e-05,
"loss": 0.6974,
"step": 38
},
{
"epoch": 0.07417974322396577,
"grad_norm": 0.5213814973831177,
"learning_rate": 9.978405843124802e-05,
"loss": 0.7896,
"step": 39
},
{
"epoch": 0.07608178792201617,
"grad_norm": 0.3241259753704071,
"learning_rate": 9.977770720863767e-05,
"loss": 0.5895,
"step": 40
},
{
"epoch": 0.07798383262006657,
"grad_norm": 0.34446167945861816,
"learning_rate": 9.977135598602731e-05,
"loss": 0.6222,
"step": 41
},
{
"epoch": 0.07988587731811697,
"grad_norm": 0.49035167694091797,
"learning_rate": 9.976500476341696e-05,
"loss": 0.6978,
"step": 42
},
{
"epoch": 0.08178792201616739,
"grad_norm": 0.4795296788215637,
"learning_rate": 9.975865354080661e-05,
"loss": 0.7368,
"step": 43
},
{
"epoch": 0.08368996671421779,
"grad_norm": 0.44959381222724915,
"learning_rate": 9.975230231819625e-05,
"loss": 0.57,
"step": 44
},
{
"epoch": 0.08559201141226819,
"grad_norm": 0.4577605426311493,
"learning_rate": 9.974595109558592e-05,
"loss": 0.691,
"step": 45
},
{
"epoch": 0.08749405611031859,
"grad_norm": 0.41654840111732483,
"learning_rate": 9.973959987297555e-05,
"loss": 0.6346,
"step": 46
},
{
"epoch": 0.089396100808369,
"grad_norm": 0.6599829196929932,
"learning_rate": 9.973324865036519e-05,
"loss": 0.6358,
"step": 47
},
{
"epoch": 0.0912981455064194,
"grad_norm": 0.38539162278175354,
"learning_rate": 9.972689742775484e-05,
"loss": 0.5723,
"step": 48
},
{
"epoch": 0.0932001902044698,
"grad_norm": 0.4626316428184509,
"learning_rate": 9.97205462051445e-05,
"loss": 0.6845,
"step": 49
},
{
"epoch": 0.0951022349025202,
"grad_norm": 0.348387211561203,
"learning_rate": 9.971419498253413e-05,
"loss": 0.4857,
"step": 50
},
{
"epoch": 0.09700427960057062,
"grad_norm": 0.4964020252227783,
"learning_rate": 9.970784375992379e-05,
"loss": 0.7141,
"step": 51
},
{
"epoch": 0.09890632429862102,
"grad_norm": 0.4282241463661194,
"learning_rate": 9.970149253731344e-05,
"loss": 0.6619,
"step": 52
},
{
"epoch": 0.10080836899667142,
"grad_norm": 0.35991716384887695,
"learning_rate": 9.969514131470309e-05,
"loss": 0.4727,
"step": 53
},
{
"epoch": 0.10271041369472182,
"grad_norm": 0.3936012387275696,
"learning_rate": 9.968879009209273e-05,
"loss": 0.5644,
"step": 54
},
{
"epoch": 0.10461245839277224,
"grad_norm": 0.39267924427986145,
"learning_rate": 9.968243886948238e-05,
"loss": 0.5126,
"step": 55
},
{
"epoch": 0.10651450309082264,
"grad_norm": 0.4119136333465576,
"learning_rate": 9.967608764687203e-05,
"loss": 0.471,
"step": 56
},
{
"epoch": 0.10841654778887304,
"grad_norm": 0.5160384178161621,
"learning_rate": 9.966973642426167e-05,
"loss": 0.6555,
"step": 57
},
{
"epoch": 0.11031859248692344,
"grad_norm": 0.4742174744606018,
"learning_rate": 9.966338520165132e-05,
"loss": 0.6093,
"step": 58
},
{
"epoch": 0.11222063718497385,
"grad_norm": 0.3615169823169708,
"learning_rate": 9.965703397904097e-05,
"loss": 0.5527,
"step": 59
},
{
"epoch": 0.11412268188302425,
"grad_norm": 0.5700575113296509,
"learning_rate": 9.965068275643061e-05,
"loss": 0.5713,
"step": 60
},
{
"epoch": 0.11602472658107466,
"grad_norm": 0.4825727939605713,
"learning_rate": 9.964433153382026e-05,
"loss": 0.5142,
"step": 61
},
{
"epoch": 0.11792677127912506,
"grad_norm": 0.392088919878006,
"learning_rate": 9.963798031120992e-05,
"loss": 0.513,
"step": 62
},
{
"epoch": 0.11982881597717546,
"grad_norm": 0.35883110761642456,
"learning_rate": 9.963162908859957e-05,
"loss": 0.501,
"step": 63
},
{
"epoch": 0.12173086067522587,
"grad_norm": 0.39946749806404114,
"learning_rate": 9.96252778659892e-05,
"loss": 0.5532,
"step": 64
},
{
"epoch": 0.12363290537327627,
"grad_norm": 0.4191288352012634,
"learning_rate": 9.961892664337886e-05,
"loss": 0.5258,
"step": 65
},
{
"epoch": 0.12553495007132667,
"grad_norm": 0.3662487268447876,
"learning_rate": 9.961257542076851e-05,
"loss": 0.5121,
"step": 66
},
{
"epoch": 0.1274369947693771,
"grad_norm": 0.5582164525985718,
"learning_rate": 9.960622419815815e-05,
"loss": 0.6494,
"step": 67
},
{
"epoch": 0.12933903946742747,
"grad_norm": 0.485128790140152,
"learning_rate": 9.959987297554779e-05,
"loss": 0.6022,
"step": 68
},
{
"epoch": 0.1312410841654779,
"grad_norm": 0.3816944360733032,
"learning_rate": 9.959352175293745e-05,
"loss": 0.4851,
"step": 69
},
{
"epoch": 0.1331431288635283,
"grad_norm": 0.3637336194515228,
"learning_rate": 9.958717053032709e-05,
"loss": 0.4344,
"step": 70
},
{
"epoch": 0.1350451735615787,
"grad_norm": 0.4418705105781555,
"learning_rate": 9.958081930771674e-05,
"loss": 0.6008,
"step": 71
},
{
"epoch": 0.1369472182596291,
"grad_norm": 0.44138631224632263,
"learning_rate": 9.95744680851064e-05,
"loss": 0.5319,
"step": 72
},
{
"epoch": 0.1388492629576795,
"grad_norm": 0.37523001432418823,
"learning_rate": 9.956811686249603e-05,
"loss": 0.657,
"step": 73
},
{
"epoch": 0.1407513076557299,
"grad_norm": 0.4489665627479553,
"learning_rate": 9.956176563988568e-05,
"loss": 0.5526,
"step": 74
},
{
"epoch": 0.14265335235378032,
"grad_norm": 0.39318791031837463,
"learning_rate": 9.955541441727532e-05,
"loss": 0.6046,
"step": 75
},
{
"epoch": 0.1445553970518307,
"grad_norm": 0.4817538261413574,
"learning_rate": 9.954906319466499e-05,
"loss": 0.5149,
"step": 76
},
{
"epoch": 0.14645744174988112,
"grad_norm": 0.4451163411140442,
"learning_rate": 9.954271197205463e-05,
"loss": 0.4892,
"step": 77
},
{
"epoch": 0.14835948644793154,
"grad_norm": 0.29836660623550415,
"learning_rate": 9.953636074944426e-05,
"loss": 0.4005,
"step": 78
},
{
"epoch": 0.15026153114598192,
"grad_norm": 0.3185100555419922,
"learning_rate": 9.953000952683393e-05,
"loss": 0.4168,
"step": 79
},
{
"epoch": 0.15216357584403234,
"grad_norm": 0.26550424098968506,
"learning_rate": 9.952365830422357e-05,
"loss": 0.39,
"step": 80
},
{
"epoch": 0.15406562054208273,
"grad_norm": 0.4328240156173706,
"learning_rate": 9.951730708161322e-05,
"loss": 0.5041,
"step": 81
},
{
"epoch": 0.15596766524013314,
"grad_norm": 0.5178936123847961,
"learning_rate": 9.951095585900286e-05,
"loss": 0.6017,
"step": 82
},
{
"epoch": 0.15786970993818356,
"grad_norm": 0.45657551288604736,
"learning_rate": 9.950460463639251e-05,
"loss": 0.5734,
"step": 83
},
{
"epoch": 0.15977175463623394,
"grad_norm": 0.5482913851737976,
"learning_rate": 9.949825341378216e-05,
"loss": 0.6015,
"step": 84
},
{
"epoch": 0.16167379933428436,
"grad_norm": 0.39362308382987976,
"learning_rate": 9.94919021911718e-05,
"loss": 0.5712,
"step": 85
},
{
"epoch": 0.16357584403233477,
"grad_norm": 0.4381113350391388,
"learning_rate": 9.948555096856145e-05,
"loss": 0.5194,
"step": 86
},
{
"epoch": 0.16547788873038516,
"grad_norm": 0.5021312236785889,
"learning_rate": 9.94791997459511e-05,
"loss": 0.5279,
"step": 87
},
{
"epoch": 0.16737993342843557,
"grad_norm": 0.4364267587661743,
"learning_rate": 9.947284852334074e-05,
"loss": 0.5892,
"step": 88
},
{
"epoch": 0.16928197812648596,
"grad_norm": 0.37873050570487976,
"learning_rate": 9.94664973007304e-05,
"loss": 0.5328,
"step": 89
},
{
"epoch": 0.17118402282453637,
"grad_norm": 0.4768919050693512,
"learning_rate": 9.946014607812005e-05,
"loss": 0.4889,
"step": 90
},
{
"epoch": 0.1730860675225868,
"grad_norm": 0.3834541440010071,
"learning_rate": 9.945379485550968e-05,
"loss": 0.4642,
"step": 91
},
{
"epoch": 0.17498811222063718,
"grad_norm": 0.48581764101982117,
"learning_rate": 9.944744363289934e-05,
"loss": 0.4741,
"step": 92
},
{
"epoch": 0.1768901569186876,
"grad_norm": 0.39364808797836304,
"learning_rate": 9.944109241028899e-05,
"loss": 0.5684,
"step": 93
},
{
"epoch": 0.178792201616738,
"grad_norm": 0.4657204747200012,
"learning_rate": 9.943474118767864e-05,
"loss": 0.609,
"step": 94
},
{
"epoch": 0.1806942463147884,
"grad_norm": 0.40989887714385986,
"learning_rate": 9.942838996506828e-05,
"loss": 0.4319,
"step": 95
},
{
"epoch": 0.1825962910128388,
"grad_norm": 0.43797624111175537,
"learning_rate": 9.942203874245793e-05,
"loss": 0.4997,
"step": 96
},
{
"epoch": 0.1844983357108892,
"grad_norm": 0.3887675106525421,
"learning_rate": 9.941568751984758e-05,
"loss": 0.5548,
"step": 97
},
{
"epoch": 0.1864003804089396,
"grad_norm": 0.39017003774642944,
"learning_rate": 9.940933629723722e-05,
"loss": 0.5113,
"step": 98
},
{
"epoch": 0.18830242510699002,
"grad_norm": 0.41409194469451904,
"learning_rate": 9.940298507462687e-05,
"loss": 0.5496,
"step": 99
},
{
"epoch": 0.1902044698050404,
"grad_norm": 0.34578803181648254,
"learning_rate": 9.939663385201652e-05,
"loss": 0.4048,
"step": 100
},
{
"epoch": 0.19210651450309082,
"grad_norm": 0.32233092188835144,
"learning_rate": 9.939028262940616e-05,
"loss": 0.4442,
"step": 101
},
{
"epoch": 0.19400855920114124,
"grad_norm": 0.45841965079307556,
"learning_rate": 9.938393140679581e-05,
"loss": 0.5646,
"step": 102
},
{
"epoch": 0.19591060389919163,
"grad_norm": 0.3825596272945404,
"learning_rate": 9.937758018418547e-05,
"loss": 0.4583,
"step": 103
},
{
"epoch": 0.19781264859724204,
"grad_norm": 0.44690102338790894,
"learning_rate": 9.93712289615751e-05,
"loss": 0.5799,
"step": 104
},
{
"epoch": 0.19971469329529243,
"grad_norm": 0.4881773591041565,
"learning_rate": 9.936487773896476e-05,
"loss": 0.4094,
"step": 105
},
{
"epoch": 0.20161673799334284,
"grad_norm": 0.4745669960975647,
"learning_rate": 9.93585265163544e-05,
"loss": 0.6068,
"step": 106
},
{
"epoch": 0.20351878269139326,
"grad_norm": 0.5497081279754639,
"learning_rate": 9.935217529374406e-05,
"loss": 0.4654,
"step": 107
},
{
"epoch": 0.20542082738944364,
"grad_norm": 0.3564707636833191,
"learning_rate": 9.93458240711337e-05,
"loss": 0.5678,
"step": 108
},
{
"epoch": 0.20732287208749406,
"grad_norm": 0.446321964263916,
"learning_rate": 9.933947284852334e-05,
"loss": 0.4503,
"step": 109
},
{
"epoch": 0.20922491678554447,
"grad_norm": 0.4253140389919281,
"learning_rate": 9.9333121625913e-05,
"loss": 0.538,
"step": 110
},
{
"epoch": 0.21112696148359486,
"grad_norm": 0.4123047888278961,
"learning_rate": 9.932677040330264e-05,
"loss": 0.4359,
"step": 111
},
{
"epoch": 0.21302900618164528,
"grad_norm": 0.3887772262096405,
"learning_rate": 9.932041918069229e-05,
"loss": 0.5534,
"step": 112
},
{
"epoch": 0.21493105087969566,
"grad_norm": 0.38153669238090515,
"learning_rate": 9.931406795808193e-05,
"loss": 0.4296,
"step": 113
},
{
"epoch": 0.21683309557774608,
"grad_norm": 0.43017521500587463,
"learning_rate": 9.930771673547158e-05,
"loss": 0.5899,
"step": 114
},
{
"epoch": 0.2187351402757965,
"grad_norm": 0.40156394243240356,
"learning_rate": 9.930136551286123e-05,
"loss": 0.3917,
"step": 115
},
{
"epoch": 0.22063718497384688,
"grad_norm": 0.3576590120792389,
"learning_rate": 9.929501429025087e-05,
"loss": 0.3908,
"step": 116
},
{
"epoch": 0.2225392296718973,
"grad_norm": 0.33245769143104553,
"learning_rate": 9.928866306764054e-05,
"loss": 0.4043,
"step": 117
},
{
"epoch": 0.2244412743699477,
"grad_norm": 0.43169739842414856,
"learning_rate": 9.928231184503018e-05,
"loss": 0.5569,
"step": 118
},
{
"epoch": 0.2263433190679981,
"grad_norm": 0.4004412293434143,
"learning_rate": 9.927596062241981e-05,
"loss": 0.4931,
"step": 119
},
{
"epoch": 0.2282453637660485,
"grad_norm": 0.3550797998905182,
"learning_rate": 9.926960939980947e-05,
"loss": 0.4505,
"step": 120
},
{
"epoch": 0.2301474084640989,
"grad_norm": 0.3701287508010864,
"learning_rate": 9.926325817719912e-05,
"loss": 0.4967,
"step": 121
},
{
"epoch": 0.2320494531621493,
"grad_norm": 0.4120308756828308,
"learning_rate": 9.925690695458876e-05,
"loss": 0.4408,
"step": 122
},
{
"epoch": 0.23395149786019973,
"grad_norm": 0.4737403392791748,
"learning_rate": 9.925055573197841e-05,
"loss": 0.7221,
"step": 123
},
{
"epoch": 0.2358535425582501,
"grad_norm": 0.37103158235549927,
"learning_rate": 9.924420450936806e-05,
"loss": 0.4419,
"step": 124
},
{
"epoch": 0.23775558725630053,
"grad_norm": 0.48644623160362244,
"learning_rate": 9.923785328675771e-05,
"loss": 0.5006,
"step": 125
},
{
"epoch": 0.2396576319543509,
"grad_norm": 0.3381918966770172,
"learning_rate": 9.923150206414735e-05,
"loss": 0.4786,
"step": 126
},
{
"epoch": 0.24155967665240133,
"grad_norm": 0.4500490128993988,
"learning_rate": 9.9225150841537e-05,
"loss": 0.4984,
"step": 127
},
{
"epoch": 0.24346172135045174,
"grad_norm": 0.5506143569946289,
"learning_rate": 9.921879961892665e-05,
"loss": 0.4857,
"step": 128
},
{
"epoch": 0.24536376604850213,
"grad_norm": 0.4111080467700958,
"learning_rate": 9.921244839631629e-05,
"loss": 0.4464,
"step": 129
},
{
"epoch": 0.24726581074655254,
"grad_norm": 0.52936851978302,
"learning_rate": 9.920609717370594e-05,
"loss": 0.5664,
"step": 130
},
{
"epoch": 0.24916785544460296,
"grad_norm": 0.465009480714798,
"learning_rate": 9.91997459510956e-05,
"loss": 0.4318,
"step": 131
},
{
"epoch": 0.25106990014265335,
"grad_norm": 0.3044665455818176,
"learning_rate": 9.919339472848523e-05,
"loss": 0.4284,
"step": 132
},
{
"epoch": 0.25297194484070373,
"grad_norm": 0.4849638342857361,
"learning_rate": 9.918704350587488e-05,
"loss": 0.5956,
"step": 133
},
{
"epoch": 0.2548739895387542,
"grad_norm": 0.4701893925666809,
"learning_rate": 9.918069228326454e-05,
"loss": 0.4541,
"step": 134
},
{
"epoch": 0.25677603423680456,
"grad_norm": 0.42524924874305725,
"learning_rate": 9.917434106065419e-05,
"loss": 0.4991,
"step": 135
},
{
"epoch": 0.25867807893485495,
"grad_norm": 0.46284592151641846,
"learning_rate": 9.916798983804383e-05,
"loss": 0.453,
"step": 136
},
{
"epoch": 0.2605801236329054,
"grad_norm": 0.40281572937965393,
"learning_rate": 9.916163861543348e-05,
"loss": 0.4771,
"step": 137
},
{
"epoch": 0.2624821683309558,
"grad_norm": 0.425214558839798,
"learning_rate": 9.915528739282313e-05,
"loss": 0.4665,
"step": 138
},
{
"epoch": 0.26438421302900617,
"grad_norm": 0.4181045889854431,
"learning_rate": 9.914893617021277e-05,
"loss": 0.5014,
"step": 139
},
{
"epoch": 0.2662862577270566,
"grad_norm": 0.4024779498577118,
"learning_rate": 9.914258494760241e-05,
"loss": 0.5905,
"step": 140
},
{
"epoch": 0.268188302425107,
"grad_norm": 0.3768770694732666,
"learning_rate": 9.913623372499207e-05,
"loss": 0.408,
"step": 141
},
{
"epoch": 0.2700903471231574,
"grad_norm": 0.4033905267715454,
"learning_rate": 9.912988250238171e-05,
"loss": 0.4511,
"step": 142
},
{
"epoch": 0.2719923918212078,
"grad_norm": 0.32505708932876587,
"learning_rate": 9.912353127977136e-05,
"loss": 0.4395,
"step": 143
},
{
"epoch": 0.2738944365192582,
"grad_norm": 0.3487790822982788,
"learning_rate": 9.9117180057161e-05,
"loss": 0.3601,
"step": 144
},
{
"epoch": 0.2757964812173086,
"grad_norm": 0.30558326840400696,
"learning_rate": 9.911082883455065e-05,
"loss": 0.4607,
"step": 145
},
{
"epoch": 0.277698525915359,
"grad_norm": 0.3752080500125885,
"learning_rate": 9.91044776119403e-05,
"loss": 0.3957,
"step": 146
},
{
"epoch": 0.2796005706134094,
"grad_norm": 0.3506644368171692,
"learning_rate": 9.909812638932994e-05,
"loss": 0.366,
"step": 147
},
{
"epoch": 0.2815026153114598,
"grad_norm": 0.43430307507514954,
"learning_rate": 9.909177516671961e-05,
"loss": 0.4542,
"step": 148
},
{
"epoch": 0.2834046600095102,
"grad_norm": 0.41930171847343445,
"learning_rate": 9.908542394410925e-05,
"loss": 0.709,
"step": 149
},
{
"epoch": 0.28530670470756064,
"grad_norm": 0.3717108964920044,
"learning_rate": 9.907907272149888e-05,
"loss": 0.4701,
"step": 150
},
{
"epoch": 0.28720874940561103,
"grad_norm": 0.4177984595298767,
"learning_rate": 9.907272149888854e-05,
"loss": 0.6189,
"step": 151
},
{
"epoch": 0.2891107941036614,
"grad_norm": 0.37706881761550903,
"learning_rate": 9.906637027627819e-05,
"loss": 0.4546,
"step": 152
},
{
"epoch": 0.29101283880171186,
"grad_norm": 0.4210599660873413,
"learning_rate": 9.906001905366784e-05,
"loss": 0.4716,
"step": 153
},
{
"epoch": 0.29291488349976225,
"grad_norm": 0.3707990050315857,
"learning_rate": 9.905366783105748e-05,
"loss": 0.4644,
"step": 154
},
{
"epoch": 0.29481692819781263,
"grad_norm": 0.36913537979125977,
"learning_rate": 9.904731660844713e-05,
"loss": 0.4605,
"step": 155
},
{
"epoch": 0.2967189728958631,
"grad_norm": 0.41291072964668274,
"learning_rate": 9.904096538583678e-05,
"loss": 0.4294,
"step": 156
},
{
"epoch": 0.29862101759391346,
"grad_norm": 0.30809640884399414,
"learning_rate": 9.903461416322642e-05,
"loss": 0.4369,
"step": 157
},
{
"epoch": 0.30052306229196385,
"grad_norm": 0.4266267716884613,
"learning_rate": 9.902826294061607e-05,
"loss": 0.456,
"step": 158
},
{
"epoch": 0.3024251069900143,
"grad_norm": 0.37408629059791565,
"learning_rate": 9.902191171800572e-05,
"loss": 0.4359,
"step": 159
},
{
"epoch": 0.3043271516880647,
"grad_norm": 0.40199100971221924,
"learning_rate": 9.901556049539536e-05,
"loss": 0.4433,
"step": 160
},
{
"epoch": 0.30622919638611507,
"grad_norm": 0.3430602252483368,
"learning_rate": 9.900920927278501e-05,
"loss": 0.4317,
"step": 161
},
{
"epoch": 0.30813124108416545,
"grad_norm": 0.5091786980628967,
"learning_rate": 9.900285805017467e-05,
"loss": 0.5824,
"step": 162
},
{
"epoch": 0.3100332857822159,
"grad_norm": 0.34287527203559875,
"learning_rate": 9.89965068275643e-05,
"loss": 0.4025,
"step": 163
},
{
"epoch": 0.3119353304802663,
"grad_norm": 0.4919246733188629,
"learning_rate": 9.899015560495396e-05,
"loss": 0.5612,
"step": 164
},
{
"epoch": 0.31383737517831667,
"grad_norm": 0.35404297709465027,
"learning_rate": 9.898380438234361e-05,
"loss": 0.4731,
"step": 165
},
{
"epoch": 0.3157394198763671,
"grad_norm": 0.3590085506439209,
"learning_rate": 9.897745315973326e-05,
"loss": 0.4365,
"step": 166
},
{
"epoch": 0.3176414645744175,
"grad_norm": 0.4132196605205536,
"learning_rate": 9.89711019371229e-05,
"loss": 0.3485,
"step": 167
},
{
"epoch": 0.3195435092724679,
"grad_norm": 0.46459728479385376,
"learning_rate": 9.896475071451255e-05,
"loss": 0.4327,
"step": 168
},
{
"epoch": 0.3214455539705183,
"grad_norm": 0.435651957988739,
"learning_rate": 9.89583994919022e-05,
"loss": 0.4684,
"step": 169
},
{
"epoch": 0.3233475986685687,
"grad_norm": 0.38278958201408386,
"learning_rate": 9.895204826929184e-05,
"loss": 0.4265,
"step": 170
},
{
"epoch": 0.3252496433666191,
"grad_norm": 0.31499558687210083,
"learning_rate": 9.894569704668149e-05,
"loss": 0.4099,
"step": 171
},
{
"epoch": 0.32715168806466954,
"grad_norm": 0.40141284465789795,
"learning_rate": 9.893934582407114e-05,
"loss": 0.4461,
"step": 172
},
{
"epoch": 0.32905373276271993,
"grad_norm": 0.42945384979248047,
"learning_rate": 9.893299460146078e-05,
"loss": 0.4379,
"step": 173
},
{
"epoch": 0.3309557774607703,
"grad_norm": 0.5186269283294678,
"learning_rate": 9.892664337885043e-05,
"loss": 0.5134,
"step": 174
},
{
"epoch": 0.33285782215882076,
"grad_norm": 0.3771612048149109,
"learning_rate": 9.892029215624009e-05,
"loss": 0.4617,
"step": 175
},
{
"epoch": 0.33475986685687115,
"grad_norm": 0.48396849632263184,
"learning_rate": 9.891394093362972e-05,
"loss": 0.4944,
"step": 176
},
{
"epoch": 0.33666191155492153,
"grad_norm": 0.5303121209144592,
"learning_rate": 9.890758971101938e-05,
"loss": 0.4049,
"step": 177
},
{
"epoch": 0.3385639562529719,
"grad_norm": 0.33063024282455444,
"learning_rate": 9.890123848840901e-05,
"loss": 0.401,
"step": 178
},
{
"epoch": 0.34046600095102236,
"grad_norm": 0.3764759302139282,
"learning_rate": 9.889488726579868e-05,
"loss": 0.4222,
"step": 179
},
{
"epoch": 0.34236804564907275,
"grad_norm": 0.27206951379776,
"learning_rate": 9.888853604318832e-05,
"loss": 0.3206,
"step": 180
},
{
"epoch": 0.34427009034712314,
"grad_norm": 0.3893122971057892,
"learning_rate": 9.888218482057796e-05,
"loss": 0.3558,
"step": 181
},
{
"epoch": 0.3461721350451736,
"grad_norm": 0.42340540885925293,
"learning_rate": 9.887583359796762e-05,
"loss": 0.3948,
"step": 182
},
{
"epoch": 0.34807417974322397,
"grad_norm": 0.4103796184062958,
"learning_rate": 9.886948237535726e-05,
"loss": 0.4769,
"step": 183
},
{
"epoch": 0.34997622444127435,
"grad_norm": 0.39225244522094727,
"learning_rate": 9.886313115274691e-05,
"loss": 0.441,
"step": 184
},
{
"epoch": 0.3518782691393248,
"grad_norm": 0.3774043023586273,
"learning_rate": 9.885677993013655e-05,
"loss": 0.3018,
"step": 185
},
{
"epoch": 0.3537803138373752,
"grad_norm": 0.4012366235256195,
"learning_rate": 9.88504287075262e-05,
"loss": 0.4217,
"step": 186
},
{
"epoch": 0.35568235853542557,
"grad_norm": 0.37299972772598267,
"learning_rate": 9.884407748491585e-05,
"loss": 0.4518,
"step": 187
},
{
"epoch": 0.357584403233476,
"grad_norm": 0.34713125228881836,
"learning_rate": 9.883772626230549e-05,
"loss": 0.3882,
"step": 188
},
{
"epoch": 0.3594864479315264,
"grad_norm": 0.4148958623409271,
"learning_rate": 9.883137503969516e-05,
"loss": 0.4979,
"step": 189
},
{
"epoch": 0.3613884926295768,
"grad_norm": 0.3979155421257019,
"learning_rate": 9.88250238170848e-05,
"loss": 0.3854,
"step": 190
},
{
"epoch": 0.36329053732762717,
"grad_norm": 0.42723751068115234,
"learning_rate": 9.881867259447443e-05,
"loss": 0.4325,
"step": 191
},
{
"epoch": 0.3651925820256776,
"grad_norm": 0.4195951521396637,
"learning_rate": 9.881232137186409e-05,
"loss": 0.3917,
"step": 192
},
{
"epoch": 0.367094626723728,
"grad_norm": 0.43937554955482483,
"learning_rate": 9.880597014925374e-05,
"loss": 0.3907,
"step": 193
},
{
"epoch": 0.3689966714217784,
"grad_norm": 0.3176072835922241,
"learning_rate": 9.879961892664338e-05,
"loss": 0.3581,
"step": 194
},
{
"epoch": 0.37089871611982883,
"grad_norm": 0.39909854531288147,
"learning_rate": 9.879326770403303e-05,
"loss": 0.5881,
"step": 195
},
{
"epoch": 0.3728007608178792,
"grad_norm": 0.35058659315109253,
"learning_rate": 9.878691648142268e-05,
"loss": 0.4753,
"step": 196
},
{
"epoch": 0.3747028055159296,
"grad_norm": 0.3353765904903412,
"learning_rate": 9.878056525881233e-05,
"loss": 0.4014,
"step": 197
},
{
"epoch": 0.37660485021398005,
"grad_norm": 0.4102007746696472,
"learning_rate": 9.877421403620197e-05,
"loss": 0.4841,
"step": 198
},
{
"epoch": 0.37850689491203043,
"grad_norm": 0.45450812578201294,
"learning_rate": 9.876786281359162e-05,
"loss": 0.4655,
"step": 199
},
{
"epoch": 0.3804089396100808,
"grad_norm": 0.32525572180747986,
"learning_rate": 9.876151159098127e-05,
"loss": 0.3869,
"step": 200
},
{
"epoch": 0.38231098430813126,
"grad_norm": 0.4488207697868347,
"learning_rate": 9.875516036837091e-05,
"loss": 0.4743,
"step": 201
},
{
"epoch": 0.38421302900618165,
"grad_norm": 0.432962030172348,
"learning_rate": 9.874880914576056e-05,
"loss": 0.4171,
"step": 202
},
{
"epoch": 0.38611507370423204,
"grad_norm": 0.4264095723628998,
"learning_rate": 9.874245792315022e-05,
"loss": 0.4344,
"step": 203
},
{
"epoch": 0.3880171184022825,
"grad_norm": 0.43752139806747437,
"learning_rate": 9.873610670053985e-05,
"loss": 0.5248,
"step": 204
},
{
"epoch": 0.38991916310033287,
"grad_norm": 0.42547503113746643,
"learning_rate": 9.87297554779295e-05,
"loss": 0.4011,
"step": 205
},
{
"epoch": 0.39182120779838325,
"grad_norm": 0.34600159525871277,
"learning_rate": 9.872340425531916e-05,
"loss": 0.3444,
"step": 206
},
{
"epoch": 0.39372325249643364,
"grad_norm": 0.3614776134490967,
"learning_rate": 9.871705303270881e-05,
"loss": 0.4784,
"step": 207
},
{
"epoch": 0.3956252971944841,
"grad_norm": 0.47591882944107056,
"learning_rate": 9.871070181009845e-05,
"loss": 0.5159,
"step": 208
},
{
"epoch": 0.39752734189253447,
"grad_norm": 0.3321515917778015,
"learning_rate": 9.870435058748809e-05,
"loss": 0.4382,
"step": 209
},
{
"epoch": 0.39942938659058486,
"grad_norm": 0.45849499106407166,
"learning_rate": 9.869799936487775e-05,
"loss": 0.4269,
"step": 210
},
{
"epoch": 0.4013314312886353,
"grad_norm": 0.3666900098323822,
"learning_rate": 9.869164814226739e-05,
"loss": 0.4077,
"step": 211
},
{
"epoch": 0.4032334759866857,
"grad_norm": 0.3387741446495056,
"learning_rate": 9.868529691965703e-05,
"loss": 0.4485,
"step": 212
},
{
"epoch": 0.4051355206847361,
"grad_norm": 0.3360239267349243,
"learning_rate": 9.86789456970467e-05,
"loss": 0.4042,
"step": 213
},
{
"epoch": 0.4070375653827865,
"grad_norm": 0.40923500061035156,
"learning_rate": 9.867259447443633e-05,
"loss": 0.5001,
"step": 214
},
{
"epoch": 0.4089396100808369,
"grad_norm": 0.3974573314189911,
"learning_rate": 9.866624325182598e-05,
"loss": 0.4984,
"step": 215
},
{
"epoch": 0.4108416547788873,
"grad_norm": 0.4095960557460785,
"learning_rate": 9.865989202921562e-05,
"loss": 0.3837,
"step": 216
},
{
"epoch": 0.41274369947693773,
"grad_norm": 0.3334168493747711,
"learning_rate": 9.865354080660527e-05,
"loss": 0.3935,
"step": 217
},
{
"epoch": 0.4146457441749881,
"grad_norm": 0.5007266998291016,
"learning_rate": 9.864718958399493e-05,
"loss": 0.4443,
"step": 218
},
{
"epoch": 0.4165477888730385,
"grad_norm": 0.35881495475769043,
"learning_rate": 9.864083836138456e-05,
"loss": 0.3835,
"step": 219
},
{
"epoch": 0.41844983357108895,
"grad_norm": 0.3785092830657959,
"learning_rate": 9.863448713877423e-05,
"loss": 0.3884,
"step": 220
},
{
"epoch": 0.42035187826913933,
"grad_norm": 0.41435107588768005,
"learning_rate": 9.862813591616387e-05,
"loss": 0.4116,
"step": 221
},
{
"epoch": 0.4222539229671897,
"grad_norm": 0.41338756680488586,
"learning_rate": 9.86217846935535e-05,
"loss": 0.5235,
"step": 222
},
{
"epoch": 0.4241559676652401,
"grad_norm": 0.4335710406303406,
"learning_rate": 9.861543347094316e-05,
"loss": 0.516,
"step": 223
},
{
"epoch": 0.42605801236329055,
"grad_norm": 0.37374967336654663,
"learning_rate": 9.860908224833281e-05,
"loss": 0.4663,
"step": 224
},
{
"epoch": 0.42796005706134094,
"grad_norm": 0.3213825821876526,
"learning_rate": 9.860273102572246e-05,
"loss": 0.3636,
"step": 225
},
{
"epoch": 0.4298621017593913,
"grad_norm": 0.41535523533821106,
"learning_rate": 9.85963798031121e-05,
"loss": 0.3677,
"step": 226
},
{
"epoch": 0.43176414645744177,
"grad_norm": 0.3543884754180908,
"learning_rate": 9.859002858050175e-05,
"loss": 0.376,
"step": 227
},
{
"epoch": 0.43366619115549215,
"grad_norm": 0.4012312889099121,
"learning_rate": 9.85836773578914e-05,
"loss": 0.4886,
"step": 228
},
{
"epoch": 0.43556823585354254,
"grad_norm": 0.3928169310092926,
"learning_rate": 9.857732613528104e-05,
"loss": 0.3741,
"step": 229
},
{
"epoch": 0.437470280551593,
"grad_norm": 0.4982980191707611,
"learning_rate": 9.85709749126707e-05,
"loss": 0.5704,
"step": 230
},
{
"epoch": 0.43937232524964337,
"grad_norm": 0.356545090675354,
"learning_rate": 9.856462369006035e-05,
"loss": 0.3618,
"step": 231
},
{
"epoch": 0.44127436994769376,
"grad_norm": 0.5087487697601318,
"learning_rate": 9.855827246744998e-05,
"loss": 0.4733,
"step": 232
},
{
"epoch": 0.4431764146457442,
"grad_norm": 0.3566097021102905,
"learning_rate": 9.855192124483964e-05,
"loss": 0.3771,
"step": 233
},
{
"epoch": 0.4450784593437946,
"grad_norm": 0.3210541605949402,
"learning_rate": 9.854557002222929e-05,
"loss": 0.4341,
"step": 234
},
{
"epoch": 0.446980504041845,
"grad_norm": 0.25422924757003784,
"learning_rate": 9.853921879961893e-05,
"loss": 0.3987,
"step": 235
},
{
"epoch": 0.4488825487398954,
"grad_norm": 0.39164894819259644,
"learning_rate": 9.853286757700858e-05,
"loss": 0.4149,
"step": 236
},
{
"epoch": 0.4507845934379458,
"grad_norm": 0.37471455335617065,
"learning_rate": 9.852651635439823e-05,
"loss": 0.4471,
"step": 237
},
{
"epoch": 0.4526866381359962,
"grad_norm": 0.37678262591362,
"learning_rate": 9.852016513178788e-05,
"loss": 0.3943,
"step": 238
},
{
"epoch": 0.4545886828340466,
"grad_norm": 0.4653976857662201,
"learning_rate": 9.851381390917752e-05,
"loss": 0.4848,
"step": 239
},
{
"epoch": 0.456490727532097,
"grad_norm": 0.46764564514160156,
"learning_rate": 9.850746268656717e-05,
"loss": 0.4624,
"step": 240
},
{
"epoch": 0.4583927722301474,
"grad_norm": 0.3803463876247406,
"learning_rate": 9.850111146395682e-05,
"loss": 0.442,
"step": 241
},
{
"epoch": 0.4602948169281978,
"grad_norm": 0.33662229776382446,
"learning_rate": 9.849476024134646e-05,
"loss": 0.4564,
"step": 242
},
{
"epoch": 0.46219686162624823,
"grad_norm": 0.42181041836738586,
"learning_rate": 9.848840901873611e-05,
"loss": 0.4702,
"step": 243
},
{
"epoch": 0.4640989063242986,
"grad_norm": 0.40373390913009644,
"learning_rate": 9.848205779612576e-05,
"loss": 0.3745,
"step": 244
},
{
"epoch": 0.466000951022349,
"grad_norm": 0.36634379625320435,
"learning_rate": 9.84757065735154e-05,
"loss": 0.428,
"step": 245
},
{
"epoch": 0.46790299572039945,
"grad_norm": 0.35369235277175903,
"learning_rate": 9.846935535090506e-05,
"loss": 0.3986,
"step": 246
},
{
"epoch": 0.46980504041844984,
"grad_norm": 0.4154004454612732,
"learning_rate": 9.846300412829471e-05,
"loss": 0.3512,
"step": 247
},
{
"epoch": 0.4717070851165002,
"grad_norm": 0.3689868450164795,
"learning_rate": 9.845665290568435e-05,
"loss": 0.3708,
"step": 248
},
{
"epoch": 0.47360912981455067,
"grad_norm": 0.38414841890335083,
"learning_rate": 9.8450301683074e-05,
"loss": 0.3401,
"step": 249
},
{
"epoch": 0.47551117451260105,
"grad_norm": 0.39936143159866333,
"learning_rate": 9.844395046046364e-05,
"loss": 0.4328,
"step": 250
},
{
"epoch": 0.47741321921065144,
"grad_norm": 0.30578187108039856,
"learning_rate": 9.84375992378533e-05,
"loss": 0.3694,
"step": 251
},
{
"epoch": 0.4793152639087018,
"grad_norm": 0.39497658610343933,
"learning_rate": 9.843124801524294e-05,
"loss": 0.3945,
"step": 252
},
{
"epoch": 0.48121730860675227,
"grad_norm": 0.44466689229011536,
"learning_rate": 9.842489679263258e-05,
"loss": 0.4485,
"step": 253
},
{
"epoch": 0.48311935330480266,
"grad_norm": 0.3614617586135864,
"learning_rate": 9.841854557002223e-05,
"loss": 0.3701,
"step": 254
},
{
"epoch": 0.48502139800285304,
"grad_norm": 0.3102608621120453,
"learning_rate": 9.841219434741188e-05,
"loss": 0.3677,
"step": 255
},
{
"epoch": 0.4869234427009035,
"grad_norm": 0.36049678921699524,
"learning_rate": 9.840584312480153e-05,
"loss": 0.411,
"step": 256
},
{
"epoch": 0.4888254873989539,
"grad_norm": 0.4025668501853943,
"learning_rate": 9.839949190219117e-05,
"loss": 0.433,
"step": 257
},
{
"epoch": 0.49072753209700426,
"grad_norm": 0.4131562113761902,
"learning_rate": 9.839314067958082e-05,
"loss": 0.4818,
"step": 258
},
{
"epoch": 0.4926295767950547,
"grad_norm": 0.481468141078949,
"learning_rate": 9.838678945697047e-05,
"loss": 0.5226,
"step": 259
},
{
"epoch": 0.4945316214931051,
"grad_norm": 0.2845190167427063,
"learning_rate": 9.838043823436011e-05,
"loss": 0.3323,
"step": 260
},
{
"epoch": 0.4964336661911555,
"grad_norm": 0.40381497144699097,
"learning_rate": 9.837408701174976e-05,
"loss": 0.4025,
"step": 261
},
{
"epoch": 0.4983357108892059,
"grad_norm": 0.4109043478965759,
"learning_rate": 9.836773578913942e-05,
"loss": 0.4429,
"step": 262
},
{
"epoch": 0.5002377555872562,
"grad_norm": 0.4256783425807953,
"learning_rate": 9.836138456652906e-05,
"loss": 0.3994,
"step": 263
},
{
"epoch": 0.5021398002853067,
"grad_norm": 0.35044407844543457,
"learning_rate": 9.835503334391871e-05,
"loss": 0.4431,
"step": 264
},
{
"epoch": 0.5040418449833571,
"grad_norm": 0.4456939697265625,
"learning_rate": 9.834868212130836e-05,
"loss": 0.5424,
"step": 265
},
{
"epoch": 0.5059438896814075,
"grad_norm": 0.36340197920799255,
"learning_rate": 9.8342330898698e-05,
"loss": 0.4199,
"step": 266
},
{
"epoch": 0.5078459343794579,
"grad_norm": 0.4018803536891937,
"learning_rate": 9.833597967608765e-05,
"loss": 0.4132,
"step": 267
},
{
"epoch": 0.5097479790775084,
"grad_norm": 0.3372616469860077,
"learning_rate": 9.83296284534773e-05,
"loss": 0.3239,
"step": 268
},
{
"epoch": 0.5116500237755587,
"grad_norm": 0.4497722387313843,
"learning_rate": 9.832327723086695e-05,
"loss": 0.4019,
"step": 269
},
{
"epoch": 0.5135520684736091,
"grad_norm": 0.422269344329834,
"learning_rate": 9.831692600825659e-05,
"loss": 0.45,
"step": 270
},
{
"epoch": 0.5154541131716596,
"grad_norm": 0.4167305529117584,
"learning_rate": 9.831057478564624e-05,
"loss": 0.4172,
"step": 271
},
{
"epoch": 0.5173561578697099,
"grad_norm": 0.4340919554233551,
"learning_rate": 9.83042235630359e-05,
"loss": 0.5042,
"step": 272
},
{
"epoch": 0.5192582025677603,
"grad_norm": 0.4179072380065918,
"learning_rate": 9.829787234042553e-05,
"loss": 0.3499,
"step": 273
},
{
"epoch": 0.5211602472658108,
"grad_norm": 0.39216554164886475,
"learning_rate": 9.829152111781518e-05,
"loss": 0.4729,
"step": 274
},
{
"epoch": 0.5230622919638611,
"grad_norm": 0.4485825002193451,
"learning_rate": 9.828516989520484e-05,
"loss": 0.4449,
"step": 275
},
{
"epoch": 0.5249643366619116,
"grad_norm": 0.3843270242214203,
"learning_rate": 9.827881867259447e-05,
"loss": 0.5416,
"step": 276
},
{
"epoch": 0.526866381359962,
"grad_norm": 0.30829140543937683,
"learning_rate": 9.827246744998413e-05,
"loss": 0.4004,
"step": 277
},
{
"epoch": 0.5287684260580123,
"grad_norm": 0.2905525863170624,
"learning_rate": 9.826611622737378e-05,
"loss": 0.3574,
"step": 278
},
{
"epoch": 0.5306704707560628,
"grad_norm": 0.3848637342453003,
"learning_rate": 9.825976500476343e-05,
"loss": 0.4021,
"step": 279
},
{
"epoch": 0.5325725154541132,
"grad_norm": 0.32691988348960876,
"learning_rate": 9.825341378215307e-05,
"loss": 0.4317,
"step": 280
},
{
"epoch": 0.5344745601521635,
"grad_norm": 0.3506065011024475,
"learning_rate": 9.824706255954271e-05,
"loss": 0.329,
"step": 281
},
{
"epoch": 0.536376604850214,
"grad_norm": 0.3102387487888336,
"learning_rate": 9.824071133693237e-05,
"loss": 0.3695,
"step": 282
},
{
"epoch": 0.5382786495482644,
"grad_norm": 0.45750680565834045,
"learning_rate": 9.823436011432201e-05,
"loss": 0.4232,
"step": 283
},
{
"epoch": 0.5401806942463148,
"grad_norm": 0.297134131193161,
"learning_rate": 9.822800889171165e-05,
"loss": 0.4137,
"step": 284
},
{
"epoch": 0.5420827389443652,
"grad_norm": 0.3696708679199219,
"learning_rate": 9.822165766910131e-05,
"loss": 0.4598,
"step": 285
},
{
"epoch": 0.5439847836424156,
"grad_norm": 0.31236112117767334,
"learning_rate": 9.821530644649095e-05,
"loss": 0.314,
"step": 286
},
{
"epoch": 0.545886828340466,
"grad_norm": 0.3596087694168091,
"learning_rate": 9.82089552238806e-05,
"loss": 0.4164,
"step": 287
},
{
"epoch": 0.5477888730385164,
"grad_norm": 0.33347079157829285,
"learning_rate": 9.820260400127024e-05,
"loss": 0.3915,
"step": 288
},
{
"epoch": 0.5496909177365669,
"grad_norm": 0.37818920612335205,
"learning_rate": 9.81962527786599e-05,
"loss": 0.3994,
"step": 289
},
{
"epoch": 0.5515929624346172,
"grad_norm": 0.3968106806278229,
"learning_rate": 9.818990155604955e-05,
"loss": 0.3611,
"step": 290
},
{
"epoch": 0.5534950071326676,
"grad_norm": 0.34991270303726196,
"learning_rate": 9.818355033343918e-05,
"loss": 0.3703,
"step": 291
},
{
"epoch": 0.555397051830718,
"grad_norm": 0.4046263098716736,
"learning_rate": 9.817719911082885e-05,
"loss": 0.3302,
"step": 292
},
{
"epoch": 0.5572990965287684,
"grad_norm": 0.35804587602615356,
"learning_rate": 9.817084788821849e-05,
"loss": 0.373,
"step": 293
},
{
"epoch": 0.5592011412268189,
"grad_norm": 0.3538301885128021,
"learning_rate": 9.816449666560813e-05,
"loss": 0.3482,
"step": 294
},
{
"epoch": 0.5611031859248692,
"grad_norm": 0.36835455894470215,
"learning_rate": 9.815814544299778e-05,
"loss": 0.3393,
"step": 295
},
{
"epoch": 0.5630052306229196,
"grad_norm": 0.48919835686683655,
"learning_rate": 9.815179422038743e-05,
"loss": 0.4213,
"step": 296
},
{
"epoch": 0.5649072753209701,
"grad_norm": 0.3472330570220947,
"learning_rate": 9.814544299777708e-05,
"loss": 0.3996,
"step": 297
},
{
"epoch": 0.5668093200190204,
"grad_norm": 0.428611159324646,
"learning_rate": 9.813909177516672e-05,
"loss": 0.4524,
"step": 298
},
{
"epoch": 0.5687113647170708,
"grad_norm": 0.4176979959011078,
"learning_rate": 9.813274055255637e-05,
"loss": 0.3787,
"step": 299
},
{
"epoch": 0.5706134094151213,
"grad_norm": 0.41548797488212585,
"learning_rate": 9.812638932994602e-05,
"loss": 0.4758,
"step": 300
},
{
"epoch": 0.5725154541131716,
"grad_norm": 0.3926902413368225,
"learning_rate": 9.812003810733566e-05,
"loss": 0.434,
"step": 301
},
{
"epoch": 0.5744174988112221,
"grad_norm": 0.392846018075943,
"learning_rate": 9.811368688472531e-05,
"loss": 0.3928,
"step": 302
},
{
"epoch": 0.5763195435092725,
"grad_norm": 0.36347585916519165,
"learning_rate": 9.810733566211497e-05,
"loss": 0.4264,
"step": 303
},
{
"epoch": 0.5782215882073228,
"grad_norm": 0.4314410090446472,
"learning_rate": 9.81009844395046e-05,
"loss": 0.4199,
"step": 304
},
{
"epoch": 0.5801236329053733,
"grad_norm": 0.337494820356369,
"learning_rate": 9.809463321689426e-05,
"loss": 0.4181,
"step": 305
},
{
"epoch": 0.5820256776034237,
"grad_norm": 0.27786335349082947,
"learning_rate": 9.808828199428391e-05,
"loss": 0.3,
"step": 306
},
{
"epoch": 0.583927722301474,
"grad_norm": 0.37235599756240845,
"learning_rate": 9.808193077167355e-05,
"loss": 0.3927,
"step": 307
},
{
"epoch": 0.5858297669995245,
"grad_norm": 0.37353670597076416,
"learning_rate": 9.80755795490632e-05,
"loss": 0.4146,
"step": 308
},
{
"epoch": 0.5877318116975749,
"grad_norm": 0.3919946551322937,
"learning_rate": 9.806922832645285e-05,
"loss": 0.5055,
"step": 309
},
{
"epoch": 0.5896338563956253,
"grad_norm": 0.45411062240600586,
"learning_rate": 9.80628771038425e-05,
"loss": 0.5347,
"step": 310
},
{
"epoch": 0.5915359010936757,
"grad_norm": 0.4087005853652954,
"learning_rate": 9.805652588123214e-05,
"loss": 0.3732,
"step": 311
},
{
"epoch": 0.5934379457917262,
"grad_norm": 0.313297837972641,
"learning_rate": 9.805017465862178e-05,
"loss": 0.3093,
"step": 312
},
{
"epoch": 0.5953399904897765,
"grad_norm": 0.40149226784706116,
"learning_rate": 9.804382343601144e-05,
"loss": 0.4404,
"step": 313
},
{
"epoch": 0.5972420351878269,
"grad_norm": 0.34245574474334717,
"learning_rate": 9.803747221340108e-05,
"loss": 0.4036,
"step": 314
},
{
"epoch": 0.5991440798858774,
"grad_norm": 0.38059449195861816,
"learning_rate": 9.803112099079073e-05,
"loss": 0.3763,
"step": 315
},
{
"epoch": 0.6010461245839277,
"grad_norm": 0.4539381265640259,
"learning_rate": 9.802476976818039e-05,
"loss": 0.4551,
"step": 316
},
{
"epoch": 0.6029481692819781,
"grad_norm": 0.4077235460281372,
"learning_rate": 9.801841854557002e-05,
"loss": 0.4641,
"step": 317
},
{
"epoch": 0.6048502139800286,
"grad_norm": 0.3426643908023834,
"learning_rate": 9.801206732295968e-05,
"loss": 0.3684,
"step": 318
},
{
"epoch": 0.6067522586780789,
"grad_norm": 0.3042270839214325,
"learning_rate": 9.800571610034931e-05,
"loss": 0.373,
"step": 319
},
{
"epoch": 0.6086543033761294,
"grad_norm": 0.4373973309993744,
"learning_rate": 9.799936487773897e-05,
"loss": 0.5442,
"step": 320
},
{
"epoch": 0.6105563480741797,
"grad_norm": 0.385797917842865,
"learning_rate": 9.799301365512862e-05,
"loss": 0.4218,
"step": 321
},
{
"epoch": 0.6124583927722301,
"grad_norm": 0.33210891485214233,
"learning_rate": 9.798666243251826e-05,
"loss": 0.3062,
"step": 322
},
{
"epoch": 0.6143604374702806,
"grad_norm": 0.3997063636779785,
"learning_rate": 9.798031120990792e-05,
"loss": 0.4104,
"step": 323
},
{
"epoch": 0.6162624821683309,
"grad_norm": 0.4837460219860077,
"learning_rate": 9.797395998729756e-05,
"loss": 0.5271,
"step": 324
},
{
"epoch": 0.6181645268663813,
"grad_norm": 0.36420971155166626,
"learning_rate": 9.79676087646872e-05,
"loss": 0.4033,
"step": 325
},
{
"epoch": 0.6200665715644318,
"grad_norm": 0.33610865473747253,
"learning_rate": 9.796125754207685e-05,
"loss": 0.3992,
"step": 326
},
{
"epoch": 0.6219686162624821,
"grad_norm": 0.28999099135398865,
"learning_rate": 9.79549063194665e-05,
"loss": 0.3675,
"step": 327
},
{
"epoch": 0.6238706609605326,
"grad_norm": 0.359401673078537,
"learning_rate": 9.794855509685615e-05,
"loss": 0.4363,
"step": 328
},
{
"epoch": 0.625772705658583,
"grad_norm": 0.3948569595813751,
"learning_rate": 9.794220387424579e-05,
"loss": 0.3698,
"step": 329
},
{
"epoch": 0.6276747503566333,
"grad_norm": 0.3753513991832733,
"learning_rate": 9.793585265163544e-05,
"loss": 0.4397,
"step": 330
},
{
"epoch": 0.6295767950546838,
"grad_norm": 0.32612451910972595,
"learning_rate": 9.79295014290251e-05,
"loss": 0.3846,
"step": 331
},
{
"epoch": 0.6314788397527342,
"grad_norm": 0.40796539187431335,
"learning_rate": 9.792315020641473e-05,
"loss": 0.371,
"step": 332
},
{
"epoch": 0.6333808844507846,
"grad_norm": 0.4358294904232025,
"learning_rate": 9.791679898380439e-05,
"loss": 0.4052,
"step": 333
},
{
"epoch": 0.635282929148835,
"grad_norm": 0.39615437388420105,
"learning_rate": 9.791044776119404e-05,
"loss": 0.3686,
"step": 334
},
{
"epoch": 0.6371849738468854,
"grad_norm": 0.32977715134620667,
"learning_rate": 9.790409653858368e-05,
"loss": 0.4404,
"step": 335
},
{
"epoch": 0.6390870185449358,
"grad_norm": 0.38361093401908875,
"learning_rate": 9.789774531597333e-05,
"loss": 0.3709,
"step": 336
},
{
"epoch": 0.6409890632429862,
"grad_norm": 0.40280988812446594,
"learning_rate": 9.789139409336298e-05,
"loss": 0.3322,
"step": 337
},
{
"epoch": 0.6428911079410367,
"grad_norm": 0.3682766854763031,
"learning_rate": 9.788504287075262e-05,
"loss": 0.4144,
"step": 338
},
{
"epoch": 0.644793152639087,
"grad_norm": 0.39864271879196167,
"learning_rate": 9.787869164814227e-05,
"loss": 0.4404,
"step": 339
},
{
"epoch": 0.6466951973371374,
"grad_norm": 0.3244321048259735,
"learning_rate": 9.787234042553192e-05,
"loss": 0.3541,
"step": 340
},
{
"epoch": 0.6485972420351879,
"grad_norm": 0.323403924703598,
"learning_rate": 9.786598920292157e-05,
"loss": 0.3374,
"step": 341
},
{
"epoch": 0.6504992867332382,
"grad_norm": 0.3881044387817383,
"learning_rate": 9.785963798031121e-05,
"loss": 0.4415,
"step": 342
},
{
"epoch": 0.6524013314312886,
"grad_norm": 0.35189467668533325,
"learning_rate": 9.785328675770086e-05,
"loss": 0.401,
"step": 343
},
{
"epoch": 0.6543033761293391,
"grad_norm": 0.3553767800331116,
"learning_rate": 9.784693553509052e-05,
"loss": 0.456,
"step": 344
},
{
"epoch": 0.6562054208273894,
"grad_norm": 0.3302605152130127,
"learning_rate": 9.784058431248015e-05,
"loss": 0.472,
"step": 345
},
{
"epoch": 0.6581074655254399,
"grad_norm": 0.4526873826980591,
"learning_rate": 9.78342330898698e-05,
"loss": 0.3908,
"step": 346
},
{
"epoch": 0.6600095102234903,
"grad_norm": 0.3232348561286926,
"learning_rate": 9.782788186725946e-05,
"loss": 0.3421,
"step": 347
},
{
"epoch": 0.6619115549215406,
"grad_norm": 0.38508203625679016,
"learning_rate": 9.78215306446491e-05,
"loss": 0.4093,
"step": 348
},
{
"epoch": 0.6638135996195911,
"grad_norm": 0.3187748193740845,
"learning_rate": 9.781517942203875e-05,
"loss": 0.4319,
"step": 349
},
{
"epoch": 0.6657156443176415,
"grad_norm": 0.2614807188510895,
"learning_rate": 9.78088281994284e-05,
"loss": 0.314,
"step": 350
},
{
"epoch": 0.6676176890156919,
"grad_norm": 0.40218180418014526,
"learning_rate": 9.780247697681805e-05,
"loss": 0.4404,
"step": 351
},
{
"epoch": 0.6695197337137423,
"grad_norm": 0.4016517996788025,
"learning_rate": 9.779612575420769e-05,
"loss": 0.5063,
"step": 352
},
{
"epoch": 0.6714217784117926,
"grad_norm": 0.3333278000354767,
"learning_rate": 9.778977453159733e-05,
"loss": 0.2966,
"step": 353
},
{
"epoch": 0.6733238231098431,
"grad_norm": 0.4535547196865082,
"learning_rate": 9.778342330898699e-05,
"loss": 0.4077,
"step": 354
},
{
"epoch": 0.6752258678078935,
"grad_norm": 0.4180653393268585,
"learning_rate": 9.777707208637663e-05,
"loss": 0.4554,
"step": 355
},
{
"epoch": 0.6771279125059438,
"grad_norm": 0.43454670906066895,
"learning_rate": 9.777072086376627e-05,
"loss": 0.4403,
"step": 356
},
{
"epoch": 0.6790299572039943,
"grad_norm": 0.45290321111679077,
"learning_rate": 9.776436964115594e-05,
"loss": 0.4037,
"step": 357
},
{
"epoch": 0.6809320019020447,
"grad_norm": 0.34165212512016296,
"learning_rate": 9.775801841854557e-05,
"loss": 0.3044,
"step": 358
},
{
"epoch": 0.6828340466000951,
"grad_norm": 0.435138463973999,
"learning_rate": 9.775166719593523e-05,
"loss": 0.4293,
"step": 359
},
{
"epoch": 0.6847360912981455,
"grad_norm": 0.36061882972717285,
"learning_rate": 9.774531597332486e-05,
"loss": 0.4052,
"step": 360
},
{
"epoch": 0.6866381359961959,
"grad_norm": 0.4023354947566986,
"learning_rate": 9.773896475071452e-05,
"loss": 0.4232,
"step": 361
},
{
"epoch": 0.6885401806942463,
"grad_norm": 0.39200109243392944,
"learning_rate": 9.773261352810417e-05,
"loss": 0.3882,
"step": 362
},
{
"epoch": 0.6904422253922967,
"grad_norm": 0.34504035115242004,
"learning_rate": 9.77262623054938e-05,
"loss": 0.4063,
"step": 363
},
{
"epoch": 0.6923442700903472,
"grad_norm": 0.31081900000572205,
"learning_rate": 9.771991108288346e-05,
"loss": 0.251,
"step": 364
},
{
"epoch": 0.6942463147883975,
"grad_norm": 0.3800300061702728,
"learning_rate": 9.771355986027311e-05,
"loss": 0.3722,
"step": 365
},
{
"epoch": 0.6961483594864479,
"grad_norm": 0.3476494550704956,
"learning_rate": 9.770720863766275e-05,
"loss": 0.382,
"step": 366
},
{
"epoch": 0.6980504041844984,
"grad_norm": 0.38069918751716614,
"learning_rate": 9.77008574150524e-05,
"loss": 0.4329,
"step": 367
},
{
"epoch": 0.6999524488825487,
"grad_norm": 0.4034759998321533,
"learning_rate": 9.769450619244205e-05,
"loss": 0.4112,
"step": 368
},
{
"epoch": 0.7018544935805991,
"grad_norm": 0.4232093393802643,
"learning_rate": 9.76881549698317e-05,
"loss": 0.4524,
"step": 369
},
{
"epoch": 0.7037565382786496,
"grad_norm": 0.40627321600914,
"learning_rate": 9.768180374722134e-05,
"loss": 0.388,
"step": 370
},
{
"epoch": 0.7056585829766999,
"grad_norm": 0.41021519899368286,
"learning_rate": 9.767545252461099e-05,
"loss": 0.3741,
"step": 371
},
{
"epoch": 0.7075606276747504,
"grad_norm": 0.3615809679031372,
"learning_rate": 9.766910130200065e-05,
"loss": 0.4432,
"step": 372
},
{
"epoch": 0.7094626723728008,
"grad_norm": 0.3088645935058594,
"learning_rate": 9.766275007939028e-05,
"loss": 0.3343,
"step": 373
},
{
"epoch": 0.7113647170708511,
"grad_norm": 0.380659818649292,
"learning_rate": 9.765639885677994e-05,
"loss": 0.4092,
"step": 374
},
{
"epoch": 0.7132667617689016,
"grad_norm": 0.28462380170822144,
"learning_rate": 9.765004763416959e-05,
"loss": 0.31,
"step": 375
},
{
"epoch": 0.715168806466952,
"grad_norm": 0.3215513229370117,
"learning_rate": 9.764369641155923e-05,
"loss": 0.4115,
"step": 376
},
{
"epoch": 0.7170708511650024,
"grad_norm": 0.397651731967926,
"learning_rate": 9.763734518894888e-05,
"loss": 0.4369,
"step": 377
},
{
"epoch": 0.7189728958630528,
"grad_norm": 0.31436121463775635,
"learning_rate": 9.763099396633853e-05,
"loss": 0.4339,
"step": 378
},
{
"epoch": 0.7208749405611032,
"grad_norm": 0.4024806320667267,
"learning_rate": 9.762464274372817e-05,
"loss": 0.4252,
"step": 379
},
{
"epoch": 0.7227769852591536,
"grad_norm": 0.37994107604026794,
"learning_rate": 9.761829152111782e-05,
"loss": 0.3483,
"step": 380
},
{
"epoch": 0.724679029957204,
"grad_norm": 0.44616061449050903,
"learning_rate": 9.761194029850747e-05,
"loss": 0.3809,
"step": 381
},
{
"epoch": 0.7265810746552543,
"grad_norm": 0.3396744728088379,
"learning_rate": 9.760558907589712e-05,
"loss": 0.3382,
"step": 382
},
{
"epoch": 0.7284831193533048,
"grad_norm": 0.334839791059494,
"learning_rate": 9.759923785328676e-05,
"loss": 0.3465,
"step": 383
},
{
"epoch": 0.7303851640513552,
"grad_norm": 0.417478084564209,
"learning_rate": 9.75928866306764e-05,
"loss": 0.3191,
"step": 384
},
{
"epoch": 0.7322872087494056,
"grad_norm": 0.30790823698043823,
"learning_rate": 9.758653540806606e-05,
"loss": 0.3139,
"step": 385
},
{
"epoch": 0.734189253447456,
"grad_norm": 0.4008057415485382,
"learning_rate": 9.75801841854557e-05,
"loss": 0.419,
"step": 386
},
{
"epoch": 0.7360912981455064,
"grad_norm": 0.42966723442077637,
"learning_rate": 9.757383296284535e-05,
"loss": 0.3634,
"step": 387
},
{
"epoch": 0.7379933428435568,
"grad_norm": 0.33789002895355225,
"learning_rate": 9.7567481740235e-05,
"loss": 0.3966,
"step": 388
},
{
"epoch": 0.7398953875416072,
"grad_norm": 0.35244229435920715,
"learning_rate": 9.756113051762464e-05,
"loss": 0.3991,
"step": 389
},
{
"epoch": 0.7417974322396577,
"grad_norm": 0.3581864833831787,
"learning_rate": 9.75547792950143e-05,
"loss": 0.347,
"step": 390
},
{
"epoch": 0.743699476937708,
"grad_norm": 0.30788975954055786,
"learning_rate": 9.754842807240394e-05,
"loss": 0.3485,
"step": 391
},
{
"epoch": 0.7456015216357584,
"grad_norm": 0.5155593156814575,
"learning_rate": 9.754207684979359e-05,
"loss": 0.4793,
"step": 392
},
{
"epoch": 0.7475035663338089,
"grad_norm": 0.4183029532432556,
"learning_rate": 9.753572562718324e-05,
"loss": 0.4064,
"step": 393
},
{
"epoch": 0.7494056110318592,
"grad_norm": 0.36132046580314636,
"learning_rate": 9.752937440457288e-05,
"loss": 0.3539,
"step": 394
},
{
"epoch": 0.7513076557299097,
"grad_norm": 0.4269217252731323,
"learning_rate": 9.752302318196254e-05,
"loss": 0.4358,
"step": 395
},
{
"epoch": 0.7532097004279601,
"grad_norm": 0.38872459530830383,
"learning_rate": 9.751667195935218e-05,
"loss": 0.3238,
"step": 396
},
{
"epoch": 0.7551117451260104,
"grad_norm": 0.4668743312358856,
"learning_rate": 9.751032073674182e-05,
"loss": 0.4218,
"step": 397
},
{
"epoch": 0.7570137898240609,
"grad_norm": 0.3817143738269806,
"learning_rate": 9.750396951413147e-05,
"loss": 0.4332,
"step": 398
},
{
"epoch": 0.7589158345221113,
"grad_norm": 0.4089401960372925,
"learning_rate": 9.749761829152112e-05,
"loss": 0.319,
"step": 399
},
{
"epoch": 0.7608178792201616,
"grad_norm": 0.36516866087913513,
"learning_rate": 9.749126706891077e-05,
"loss": 0.3858,
"step": 400
},
{
"epoch": 0.7627199239182121,
"grad_norm": 0.3843027949333191,
"learning_rate": 9.748491584630041e-05,
"loss": 0.4682,
"step": 401
},
{
"epoch": 0.7646219686162625,
"grad_norm": 0.36987295746803284,
"learning_rate": 9.747856462369006e-05,
"loss": 0.3328,
"step": 402
},
{
"epoch": 0.7665240133143129,
"grad_norm": 0.4972301721572876,
"learning_rate": 9.747221340107972e-05,
"loss": 0.3939,
"step": 403
},
{
"epoch": 0.7684260580123633,
"grad_norm": 0.4319972097873688,
"learning_rate": 9.746586217846935e-05,
"loss": 0.3918,
"step": 404
},
{
"epoch": 0.7703281027104137,
"grad_norm": 0.364364892244339,
"learning_rate": 9.7459510955859e-05,
"loss": 0.3871,
"step": 405
},
{
"epoch": 0.7722301474084641,
"grad_norm": 0.43767908215522766,
"learning_rate": 9.745315973324866e-05,
"loss": 0.3973,
"step": 406
},
{
"epoch": 0.7741321921065145,
"grad_norm": 0.44734928011894226,
"learning_rate": 9.74468085106383e-05,
"loss": 0.3884,
"step": 407
},
{
"epoch": 0.776034236804565,
"grad_norm": 0.3817954957485199,
"learning_rate": 9.744045728802795e-05,
"loss": 0.3647,
"step": 408
},
{
"epoch": 0.7779362815026153,
"grad_norm": 0.3619462251663208,
"learning_rate": 9.74341060654176e-05,
"loss": 0.4994,
"step": 409
},
{
"epoch": 0.7798383262006657,
"grad_norm": 0.38225993514060974,
"learning_rate": 9.742775484280724e-05,
"loss": 0.4116,
"step": 410
},
{
"epoch": 0.7817403708987162,
"grad_norm": 0.39784252643585205,
"learning_rate": 9.742140362019689e-05,
"loss": 0.3729,
"step": 411
},
{
"epoch": 0.7836424155967665,
"grad_norm": 0.3188072443008423,
"learning_rate": 9.741505239758654e-05,
"loss": 0.3767,
"step": 412
},
{
"epoch": 0.785544460294817,
"grad_norm": 0.4509223401546478,
"learning_rate": 9.74087011749762e-05,
"loss": 0.4595,
"step": 413
},
{
"epoch": 0.7874465049928673,
"grad_norm": 0.40249937772750854,
"learning_rate": 9.740234995236583e-05,
"loss": 0.3761,
"step": 414
},
{
"epoch": 0.7893485496909177,
"grad_norm": 0.3387410044670105,
"learning_rate": 9.739599872975547e-05,
"loss": 0.401,
"step": 415
},
{
"epoch": 0.7912505943889682,
"grad_norm": 0.47670629620552063,
"learning_rate": 9.738964750714514e-05,
"loss": 0.3656,
"step": 416
},
{
"epoch": 0.7931526390870185,
"grad_norm": 0.37239211797714233,
"learning_rate": 9.738329628453477e-05,
"loss": 0.4885,
"step": 417
},
{
"epoch": 0.7950546837850689,
"grad_norm": 0.3347351849079132,
"learning_rate": 9.737694506192443e-05,
"loss": 0.291,
"step": 418
},
{
"epoch": 0.7969567284831194,
"grad_norm": 0.3727717399597168,
"learning_rate": 9.737059383931408e-05,
"loss": 0.3506,
"step": 419
},
{
"epoch": 0.7988587731811697,
"grad_norm": 0.3866841793060303,
"learning_rate": 9.736424261670372e-05,
"loss": 0.4355,
"step": 420
},
{
"epoch": 0.8007608178792202,
"grad_norm": 0.39670372009277344,
"learning_rate": 9.735789139409337e-05,
"loss": 0.4041,
"step": 421
},
{
"epoch": 0.8026628625772706,
"grad_norm": 0.35946765542030334,
"learning_rate": 9.7351540171483e-05,
"loss": 0.3378,
"step": 422
},
{
"epoch": 0.8045649072753209,
"grad_norm": 0.24180381000041962,
"learning_rate": 9.734518894887267e-05,
"loss": 0.3133,
"step": 423
},
{
"epoch": 0.8064669519733714,
"grad_norm": 0.4238085150718689,
"learning_rate": 9.733883772626231e-05,
"loss": 0.3968,
"step": 424
},
{
"epoch": 0.8083689966714218,
"grad_norm": 0.35451412200927734,
"learning_rate": 9.733248650365195e-05,
"loss": 0.3456,
"step": 425
},
{
"epoch": 0.8102710413694721,
"grad_norm": 0.49277418851852417,
"learning_rate": 9.732613528104161e-05,
"loss": 0.3916,
"step": 426
},
{
"epoch": 0.8121730860675226,
"grad_norm": 0.34536874294281006,
"learning_rate": 9.731978405843125e-05,
"loss": 0.537,
"step": 427
},
{
"epoch": 0.814075130765573,
"grad_norm": 0.3002311885356903,
"learning_rate": 9.731343283582089e-05,
"loss": 0.3842,
"step": 428
},
{
"epoch": 0.8159771754636234,
"grad_norm": 0.29766812920570374,
"learning_rate": 9.730708161321054e-05,
"loss": 0.2979,
"step": 429
},
{
"epoch": 0.8178792201616738,
"grad_norm": 0.34347230195999146,
"learning_rate": 9.73007303906002e-05,
"loss": 0.3996,
"step": 430
},
{
"epoch": 0.8197812648597242,
"grad_norm": 0.42430102825164795,
"learning_rate": 9.729437916798985e-05,
"loss": 0.4677,
"step": 431
},
{
"epoch": 0.8216833095577746,
"grad_norm": 0.3375668227672577,
"learning_rate": 9.728802794537948e-05,
"loss": 0.4257,
"step": 432
},
{
"epoch": 0.823585354255825,
"grad_norm": 0.3718586266040802,
"learning_rate": 9.728167672276914e-05,
"loss": 0.3555,
"step": 433
},
{
"epoch": 0.8254873989538755,
"grad_norm": 0.4310496151447296,
"learning_rate": 9.727532550015879e-05,
"loss": 0.4026,
"step": 434
},
{
"epoch": 0.8273894436519258,
"grad_norm": 0.43832001090049744,
"learning_rate": 9.726897427754843e-05,
"loss": 0.4421,
"step": 435
},
{
"epoch": 0.8292914883499762,
"grad_norm": 0.42209911346435547,
"learning_rate": 9.726262305493808e-05,
"loss": 0.397,
"step": 436
},
{
"epoch": 0.8311935330480267,
"grad_norm": 0.4297396242618561,
"learning_rate": 9.725627183232773e-05,
"loss": 0.4244,
"step": 437
},
{
"epoch": 0.833095577746077,
"grad_norm": 0.40587079524993896,
"learning_rate": 9.724992060971737e-05,
"loss": 0.3753,
"step": 438
},
{
"epoch": 0.8349976224441275,
"grad_norm": 0.4127040505409241,
"learning_rate": 9.724356938710702e-05,
"loss": 0.3926,
"step": 439
},
{
"epoch": 0.8368996671421779,
"grad_norm": 0.3734678030014038,
"learning_rate": 9.723721816449667e-05,
"loss": 0.3338,
"step": 440
},
{
"epoch": 0.8388017118402282,
"grad_norm": 0.38152286410331726,
"learning_rate": 9.723086694188632e-05,
"loss": 0.3893,
"step": 441
},
{
"epoch": 0.8407037565382787,
"grad_norm": 0.4234791398048401,
"learning_rate": 9.722451571927596e-05,
"loss": 0.3104,
"step": 442
},
{
"epoch": 0.842605801236329,
"grad_norm": 0.49204525351524353,
"learning_rate": 9.721816449666561e-05,
"loss": 0.3698,
"step": 443
},
{
"epoch": 0.8445078459343794,
"grad_norm": 0.40980932116508484,
"learning_rate": 9.721181327405527e-05,
"loss": 0.3901,
"step": 444
},
{
"epoch": 0.8464098906324299,
"grad_norm": 0.3330426514148712,
"learning_rate": 9.72054620514449e-05,
"loss": 0.3118,
"step": 445
},
{
"epoch": 0.8483119353304802,
"grad_norm": 0.3042624890804291,
"learning_rate": 9.719911082883456e-05,
"loss": 0.3003,
"step": 446
},
{
"epoch": 0.8502139800285307,
"grad_norm": 0.34576475620269775,
"learning_rate": 9.719275960622421e-05,
"loss": 0.3332,
"step": 447
},
{
"epoch": 0.8521160247265811,
"grad_norm": 0.2980082035064697,
"learning_rate": 9.718640838361385e-05,
"loss": 0.3285,
"step": 448
},
{
"epoch": 0.8540180694246314,
"grad_norm": 0.31439459323883057,
"learning_rate": 9.71800571610035e-05,
"loss": 0.3178,
"step": 449
},
{
"epoch": 0.8559201141226819,
"grad_norm": 0.37447845935821533,
"learning_rate": 9.717370593839315e-05,
"loss": 0.3861,
"step": 450
},
{
"epoch": 0.8578221588207323,
"grad_norm": 0.4261024594306946,
"learning_rate": 9.716735471578279e-05,
"loss": 0.4377,
"step": 451
},
{
"epoch": 0.8597242035187826,
"grad_norm": 0.3328630328178406,
"learning_rate": 9.716100349317244e-05,
"loss": 0.2791,
"step": 452
},
{
"epoch": 0.8616262482168331,
"grad_norm": 0.41943463683128357,
"learning_rate": 9.715465227056209e-05,
"loss": 0.4693,
"step": 453
},
{
"epoch": 0.8635282929148835,
"grad_norm": 0.4295640289783478,
"learning_rate": 9.714830104795174e-05,
"loss": 0.4105,
"step": 454
},
{
"epoch": 0.8654303376129339,
"grad_norm": 0.3548508882522583,
"learning_rate": 9.714194982534138e-05,
"loss": 0.3024,
"step": 455
},
{
"epoch": 0.8673323823109843,
"grad_norm": 0.5577777624130249,
"learning_rate": 9.713559860273102e-05,
"loss": 0.3961,
"step": 456
},
{
"epoch": 0.8692344270090347,
"grad_norm": 0.4119040071964264,
"learning_rate": 9.712924738012069e-05,
"loss": 0.3143,
"step": 457
},
{
"epoch": 0.8711364717070851,
"grad_norm": 0.40272560715675354,
"learning_rate": 9.712289615751032e-05,
"loss": 0.3452,
"step": 458
},
{
"epoch": 0.8730385164051355,
"grad_norm": 0.456386536359787,
"learning_rate": 9.711654493489998e-05,
"loss": 0.403,
"step": 459
},
{
"epoch": 0.874940561103186,
"grad_norm": 0.3982544541358948,
"learning_rate": 9.711019371228963e-05,
"loss": 0.4498,
"step": 460
},
{
"epoch": 0.8768426058012363,
"grad_norm": 0.29361623525619507,
"learning_rate": 9.710384248967927e-05,
"loss": 0.3724,
"step": 461
},
{
"epoch": 0.8787446504992867,
"grad_norm": 0.3854773938655853,
"learning_rate": 9.709749126706892e-05,
"loss": 0.4162,
"step": 462
},
{
"epoch": 0.8806466951973372,
"grad_norm": 0.3760225474834442,
"learning_rate": 9.709114004445856e-05,
"loss": 0.4335,
"step": 463
},
{
"epoch": 0.8825487398953875,
"grad_norm": 0.4936290383338928,
"learning_rate": 9.708478882184821e-05,
"loss": 0.3522,
"step": 464
},
{
"epoch": 0.884450784593438,
"grad_norm": 0.3584468364715576,
"learning_rate": 9.707843759923786e-05,
"loss": 0.552,
"step": 465
},
{
"epoch": 0.8863528292914884,
"grad_norm": 0.3523949086666107,
"learning_rate": 9.70720863766275e-05,
"loss": 0.3498,
"step": 466
},
{
"epoch": 0.8882548739895387,
"grad_norm": 0.42082804441452026,
"learning_rate": 9.706573515401716e-05,
"loss": 0.4863,
"step": 467
},
{
"epoch": 0.8901569186875892,
"grad_norm": 0.4284763038158417,
"learning_rate": 9.70593839314068e-05,
"loss": 0.4737,
"step": 468
},
{
"epoch": 0.8920589633856396,
"grad_norm": 0.3609261214733124,
"learning_rate": 9.705303270879644e-05,
"loss": 0.3208,
"step": 469
},
{
"epoch": 0.89396100808369,
"grad_norm": 0.31832849979400635,
"learning_rate": 9.704668148618609e-05,
"loss": 0.2545,
"step": 470
},
{
"epoch": 0.8958630527817404,
"grad_norm": 0.38202738761901855,
"learning_rate": 9.704033026357574e-05,
"loss": 0.3952,
"step": 471
},
{
"epoch": 0.8977650974797908,
"grad_norm": 0.347649484872818,
"learning_rate": 9.70339790409654e-05,
"loss": 0.3776,
"step": 472
},
{
"epoch": 0.8996671421778412,
"grad_norm": 0.41626760363578796,
"learning_rate": 9.702762781835503e-05,
"loss": 0.4152,
"step": 473
},
{
"epoch": 0.9015691868758916,
"grad_norm": 0.4042579233646393,
"learning_rate": 9.702127659574469e-05,
"loss": 0.3813,
"step": 474
},
{
"epoch": 0.9034712315739419,
"grad_norm": 0.38196825981140137,
"learning_rate": 9.701492537313434e-05,
"loss": 0.4398,
"step": 475
},
{
"epoch": 0.9053732762719924,
"grad_norm": 0.3867753744125366,
"learning_rate": 9.700857415052398e-05,
"loss": 0.4995,
"step": 476
},
{
"epoch": 0.9072753209700428,
"grad_norm": 0.34228166937828064,
"learning_rate": 9.700222292791363e-05,
"loss": 0.284,
"step": 477
},
{
"epoch": 0.9091773656680932,
"grad_norm": 0.3962937593460083,
"learning_rate": 9.699587170530328e-05,
"loss": 0.3501,
"step": 478
},
{
"epoch": 0.9110794103661436,
"grad_norm": 0.3665268123149872,
"learning_rate": 9.698952048269292e-05,
"loss": 0.2737,
"step": 479
},
{
"epoch": 0.912981455064194,
"grad_norm": 0.3775653839111328,
"learning_rate": 9.698316926008257e-05,
"loss": 0.3173,
"step": 480
},
{
"epoch": 0.9148834997622444,
"grad_norm": 0.3584369421005249,
"learning_rate": 9.697681803747222e-05,
"loss": 0.3055,
"step": 481
},
{
"epoch": 0.9167855444602948,
"grad_norm": 0.3510100245475769,
"learning_rate": 9.697046681486186e-05,
"loss": 0.3278,
"step": 482
},
{
"epoch": 0.9186875891583453,
"grad_norm": 0.33394765853881836,
"learning_rate": 9.696411559225151e-05,
"loss": 0.2954,
"step": 483
},
{
"epoch": 0.9205896338563956,
"grad_norm": 0.437014102935791,
"learning_rate": 9.695776436964116e-05,
"loss": 0.3797,
"step": 484
},
{
"epoch": 0.922491678554446,
"grad_norm": 0.37421244382858276,
"learning_rate": 9.695141314703082e-05,
"loss": 0.3521,
"step": 485
},
{
"epoch": 0.9243937232524965,
"grad_norm": 0.37696099281311035,
"learning_rate": 9.694506192442045e-05,
"loss": 0.3455,
"step": 486
},
{
"epoch": 0.9262957679505468,
"grad_norm": 0.5452500581741333,
"learning_rate": 9.693871070181009e-05,
"loss": 0.3624,
"step": 487
},
{
"epoch": 0.9281978126485972,
"grad_norm": 0.4049624502658844,
"learning_rate": 9.693235947919976e-05,
"loss": 0.4017,
"step": 488
},
{
"epoch": 0.9300998573466477,
"grad_norm": 0.32757866382598877,
"learning_rate": 9.69260082565894e-05,
"loss": 0.3536,
"step": 489
},
{
"epoch": 0.932001902044698,
"grad_norm": 0.298367977142334,
"learning_rate": 9.691965703397905e-05,
"loss": 0.3374,
"step": 490
},
{
"epoch": 0.9339039467427485,
"grad_norm": 0.22035005688667297,
"learning_rate": 9.69133058113687e-05,
"loss": 0.2855,
"step": 491
},
{
"epoch": 0.9358059914407989,
"grad_norm": 0.43000441789627075,
"learning_rate": 9.690695458875834e-05,
"loss": 0.4544,
"step": 492
},
{
"epoch": 0.9377080361388492,
"grad_norm": 0.28024253249168396,
"learning_rate": 9.690060336614799e-05,
"loss": 0.308,
"step": 493
},
{
"epoch": 0.9396100808368997,
"grad_norm": 0.53145432472229,
"learning_rate": 9.689425214353763e-05,
"loss": 0.4569,
"step": 494
},
{
"epoch": 0.9415121255349501,
"grad_norm": 0.4006127715110779,
"learning_rate": 9.688790092092729e-05,
"loss": 0.419,
"step": 495
},
{
"epoch": 0.9434141702330004,
"grad_norm": 0.4057261645793915,
"learning_rate": 9.688154969831693e-05,
"loss": 0.3553,
"step": 496
},
{
"epoch": 0.9453162149310509,
"grad_norm": 0.40803465247154236,
"learning_rate": 9.687519847570657e-05,
"loss": 0.3735,
"step": 497
},
{
"epoch": 0.9472182596291013,
"grad_norm": 0.34222155809402466,
"learning_rate": 9.686884725309623e-05,
"loss": 0.367,
"step": 498
},
{
"epoch": 0.9491203043271517,
"grad_norm": 0.40403544902801514,
"learning_rate": 9.686249603048587e-05,
"loss": 0.416,
"step": 499
},
{
"epoch": 0.9510223490252021,
"grad_norm": 0.33636951446533203,
"learning_rate": 9.685614480787551e-05,
"loss": 0.3423,
"step": 500
},
{
"epoch": 0.9529243937232525,
"grad_norm": 0.3394258916378021,
"learning_rate": 9.684979358526516e-05,
"loss": 0.3282,
"step": 501
},
{
"epoch": 0.9548264384213029,
"grad_norm": 0.3682473599910736,
"learning_rate": 9.684344236265482e-05,
"loss": 0.406,
"step": 502
},
{
"epoch": 0.9567284831193533,
"grad_norm": 0.35073623061180115,
"learning_rate": 9.683709114004447e-05,
"loss": 0.376,
"step": 503
},
{
"epoch": 0.9586305278174037,
"grad_norm": 0.36000022292137146,
"learning_rate": 9.68307399174341e-05,
"loss": 0.3969,
"step": 504
},
{
"epoch": 0.9605325725154541,
"grad_norm": 0.361158162355423,
"learning_rate": 9.682438869482376e-05,
"loss": 0.347,
"step": 505
},
{
"epoch": 0.9624346172135045,
"grad_norm": 0.3075178265571594,
"learning_rate": 9.681803747221341e-05,
"loss": 0.4362,
"step": 506
},
{
"epoch": 0.9643366619115549,
"grad_norm": 0.30084747076034546,
"learning_rate": 9.681168624960305e-05,
"loss": 0.3563,
"step": 507
},
{
"epoch": 0.9662387066096053,
"grad_norm": 0.3221014440059662,
"learning_rate": 9.68053350269927e-05,
"loss": 0.3366,
"step": 508
},
{
"epoch": 0.9681407513076558,
"grad_norm": 0.36464688181877136,
"learning_rate": 9.679898380438235e-05,
"loss": 0.3992,
"step": 509
},
{
"epoch": 0.9700427960057061,
"grad_norm": 0.32443803548812866,
"learning_rate": 9.679263258177199e-05,
"loss": 0.3293,
"step": 510
},
{
"epoch": 0.9719448407037565,
"grad_norm": 0.3689454197883606,
"learning_rate": 9.678628135916164e-05,
"loss": 0.3546,
"step": 511
},
{
"epoch": 0.973846885401807,
"grad_norm": 0.3754975199699402,
"learning_rate": 9.677993013655129e-05,
"loss": 0.3856,
"step": 512
},
{
"epoch": 0.9757489300998573,
"grad_norm": 0.3642953634262085,
"learning_rate": 9.677357891394094e-05,
"loss": 0.4326,
"step": 513
},
{
"epoch": 0.9776509747979077,
"grad_norm": 0.43278223276138306,
"learning_rate": 9.676722769133058e-05,
"loss": 0.3964,
"step": 514
},
{
"epoch": 0.9795530194959582,
"grad_norm": 0.43771886825561523,
"learning_rate": 9.676087646872023e-05,
"loss": 0.3861,
"step": 515
},
{
"epoch": 0.9814550641940085,
"grad_norm": 0.34908977150917053,
"learning_rate": 9.675452524610989e-05,
"loss": 0.3981,
"step": 516
},
{
"epoch": 0.983357108892059,
"grad_norm": 0.35733312368392944,
"learning_rate": 9.674817402349953e-05,
"loss": 0.3636,
"step": 517
},
{
"epoch": 0.9852591535901094,
"grad_norm": 0.3636298179626465,
"learning_rate": 9.674182280088918e-05,
"loss": 0.4336,
"step": 518
},
{
"epoch": 0.9871611982881597,
"grad_norm": 0.32771605253219604,
"learning_rate": 9.673547157827883e-05,
"loss": 0.3481,
"step": 519
},
{
"epoch": 0.9890632429862102,
"grad_norm": 0.40213117003440857,
"learning_rate": 9.672912035566847e-05,
"loss": 0.3707,
"step": 520
},
{
"epoch": 0.9909652876842606,
"grad_norm": 0.3386654257774353,
"learning_rate": 9.672276913305812e-05,
"loss": 0.3384,
"step": 521
},
{
"epoch": 0.992867332382311,
"grad_norm": 0.3965696096420288,
"learning_rate": 9.671641791044777e-05,
"loss": 0.3595,
"step": 522
},
{
"epoch": 0.9947693770803614,
"grad_norm": 0.38238459825515747,
"learning_rate": 9.671006668783741e-05,
"loss": 0.3714,
"step": 523
},
{
"epoch": 0.9966714217784118,
"grad_norm": 0.3248405456542969,
"learning_rate": 9.670371546522706e-05,
"loss": 0.394,
"step": 524
},
{
"epoch": 0.9985734664764622,
"grad_norm": 0.3902266323566437,
"learning_rate": 9.66973642426167e-05,
"loss": 0.4115,
"step": 525
},
{
"epoch": 1.0004755111745125,
"grad_norm": 0.4164808392524719,
"learning_rate": 9.669101302000636e-05,
"loss": 0.2972,
"step": 526
},
{
"epoch": 1.002377555872563,
"grad_norm": 0.33123117685317993,
"learning_rate": 9.6684661797396e-05,
"loss": 0.3211,
"step": 527
},
{
"epoch": 1.0042796005706134,
"grad_norm": 0.322803258895874,
"learning_rate": 9.667831057478564e-05,
"loss": 0.3424,
"step": 528
},
{
"epoch": 1.0061816452686638,
"grad_norm": 0.29135918617248535,
"learning_rate": 9.66719593521753e-05,
"loss": 0.2882,
"step": 529
},
{
"epoch": 1.0080836899667143,
"grad_norm": 0.3367983400821686,
"learning_rate": 9.666560812956494e-05,
"loss": 0.2776,
"step": 530
},
{
"epoch": 1.0099857346647647,
"grad_norm": 0.304070383310318,
"learning_rate": 9.66592569069546e-05,
"loss": 0.249,
"step": 531
},
{
"epoch": 1.011887779362815,
"grad_norm": 0.3832727372646332,
"learning_rate": 9.665290568434423e-05,
"loss": 0.3118,
"step": 532
},
{
"epoch": 1.0137898240608654,
"grad_norm": 0.3365418612957001,
"learning_rate": 9.664655446173389e-05,
"loss": 0.197,
"step": 533
},
{
"epoch": 1.0156918687589158,
"grad_norm": 0.4367881119251251,
"learning_rate": 9.664020323912354e-05,
"loss": 0.3121,
"step": 534
},
{
"epoch": 1.0175939134569663,
"grad_norm": 0.43158653378486633,
"learning_rate": 9.663385201651318e-05,
"loss": 0.3543,
"step": 535
},
{
"epoch": 1.0194959581550167,
"grad_norm": 0.43556904792785645,
"learning_rate": 9.662750079390283e-05,
"loss": 0.3121,
"step": 536
},
{
"epoch": 1.0213980028530671,
"grad_norm": 0.31828534603118896,
"learning_rate": 9.662114957129248e-05,
"loss": 0.24,
"step": 537
},
{
"epoch": 1.0233000475511174,
"grad_norm": 0.3935330808162689,
"learning_rate": 9.661479834868212e-05,
"loss": 0.2548,
"step": 538
},
{
"epoch": 1.0252020922491678,
"grad_norm": 0.3288602828979492,
"learning_rate": 9.660844712607177e-05,
"loss": 0.2219,
"step": 539
},
{
"epoch": 1.0271041369472182,
"grad_norm": 0.36314669251441956,
"learning_rate": 9.660209590346142e-05,
"loss": 0.2817,
"step": 540
},
{
"epoch": 1.0290061816452687,
"grad_norm": 0.3528159558773041,
"learning_rate": 9.659574468085106e-05,
"loss": 0.2989,
"step": 541
},
{
"epoch": 1.0309082263433191,
"grad_norm": 0.3235621750354767,
"learning_rate": 9.658939345824071e-05,
"loss": 0.2443,
"step": 542
},
{
"epoch": 1.0328102710413696,
"grad_norm": 0.3819037675857544,
"learning_rate": 9.658304223563036e-05,
"loss": 0.3494,
"step": 543
},
{
"epoch": 1.0347123157394198,
"grad_norm": 0.3885079324245453,
"learning_rate": 9.657669101302002e-05,
"loss": 0.3033,
"step": 544
},
{
"epoch": 1.0366143604374702,
"grad_norm": 0.3339099884033203,
"learning_rate": 9.657033979040965e-05,
"loss": 0.2673,
"step": 545
},
{
"epoch": 1.0385164051355207,
"grad_norm": 0.37009695172309875,
"learning_rate": 9.65639885677993e-05,
"loss": 0.3715,
"step": 546
},
{
"epoch": 1.0404184498335711,
"grad_norm": 0.3462003171443939,
"learning_rate": 9.655763734518896e-05,
"loss": 0.2664,
"step": 547
},
{
"epoch": 1.0423204945316216,
"grad_norm": 0.3916226327419281,
"learning_rate": 9.65512861225786e-05,
"loss": 0.3804,
"step": 548
},
{
"epoch": 1.044222539229672,
"grad_norm": 0.3801763951778412,
"learning_rate": 9.654493489996825e-05,
"loss": 0.2672,
"step": 549
},
{
"epoch": 1.0461245839277222,
"grad_norm": 0.37406545877456665,
"learning_rate": 9.65385836773579e-05,
"loss": 0.6203,
"step": 550
},
{
"epoch": 1.0480266286257727,
"grad_norm": 0.43677276372909546,
"learning_rate": 9.653223245474754e-05,
"loss": 0.3866,
"step": 551
},
{
"epoch": 1.0499286733238231,
"grad_norm": 0.26939406991004944,
"learning_rate": 9.652588123213719e-05,
"loss": 0.2169,
"step": 552
},
{
"epoch": 1.0518307180218736,
"grad_norm": 0.41554608941078186,
"learning_rate": 9.651953000952684e-05,
"loss": 0.3705,
"step": 553
},
{
"epoch": 1.053732762719924,
"grad_norm": 0.3090009391307831,
"learning_rate": 9.651317878691648e-05,
"loss": 0.2471,
"step": 554
},
{
"epoch": 1.0556348074179742,
"grad_norm": 0.36705514788627625,
"learning_rate": 9.650682756430613e-05,
"loss": 0.2764,
"step": 555
},
{
"epoch": 1.0575368521160247,
"grad_norm": 0.39900127053260803,
"learning_rate": 9.650047634169578e-05,
"loss": 0.2836,
"step": 556
},
{
"epoch": 1.059438896814075,
"grad_norm": 0.31405431032180786,
"learning_rate": 9.649412511908544e-05,
"loss": 0.2464,
"step": 557
},
{
"epoch": 1.0613409415121255,
"grad_norm": 0.39795488119125366,
"learning_rate": 9.648777389647507e-05,
"loss": 0.283,
"step": 558
},
{
"epoch": 1.063242986210176,
"grad_norm": 0.36270254850387573,
"learning_rate": 9.648142267386471e-05,
"loss": 0.26,
"step": 559
},
{
"epoch": 1.0651450309082264,
"grad_norm": 0.42650437355041504,
"learning_rate": 9.647507145125438e-05,
"loss": 0.2693,
"step": 560
},
{
"epoch": 1.0670470756062767,
"grad_norm": 0.3075532019138336,
"learning_rate": 9.646872022864402e-05,
"loss": 0.2941,
"step": 561
},
{
"epoch": 1.068949120304327,
"grad_norm": 0.4509059190750122,
"learning_rate": 9.646236900603367e-05,
"loss": 0.3525,
"step": 562
},
{
"epoch": 1.0708511650023775,
"grad_norm": 0.3420471251010895,
"learning_rate": 9.645601778342332e-05,
"loss": 0.2601,
"step": 563
},
{
"epoch": 1.072753209700428,
"grad_norm": 0.422493577003479,
"learning_rate": 9.644966656081296e-05,
"loss": 0.3441,
"step": 564
},
{
"epoch": 1.0746552543984784,
"grad_norm": 0.3960445821285248,
"learning_rate": 9.644331533820261e-05,
"loss": 0.3049,
"step": 565
},
{
"epoch": 1.0765572990965289,
"grad_norm": 0.32367074489593506,
"learning_rate": 9.643696411559225e-05,
"loss": 0.2694,
"step": 566
},
{
"epoch": 1.078459343794579,
"grad_norm": 0.3480624258518219,
"learning_rate": 9.643061289298191e-05,
"loss": 0.2667,
"step": 567
},
{
"epoch": 1.0803613884926295,
"grad_norm": 0.37603023648262024,
"learning_rate": 9.642426167037155e-05,
"loss": 0.2875,
"step": 568
},
{
"epoch": 1.08226343319068,
"grad_norm": 0.391438752412796,
"learning_rate": 9.641791044776119e-05,
"loss": 0.2844,
"step": 569
},
{
"epoch": 1.0841654778887304,
"grad_norm": 0.42726075649261475,
"learning_rate": 9.641155922515086e-05,
"loss": 0.3092,
"step": 570
},
{
"epoch": 1.0860675225867809,
"grad_norm": 0.4007676839828491,
"learning_rate": 9.64052080025405e-05,
"loss": 0.2405,
"step": 571
},
{
"epoch": 1.0879695672848313,
"grad_norm": 0.401592493057251,
"learning_rate": 9.639885677993013e-05,
"loss": 0.297,
"step": 572
},
{
"epoch": 1.0898716119828815,
"grad_norm": 0.3883298635482788,
"learning_rate": 9.639250555731978e-05,
"loss": 0.3201,
"step": 573
},
{
"epoch": 1.091773656680932,
"grad_norm": 0.41852253675460815,
"learning_rate": 9.638615433470944e-05,
"loss": 0.259,
"step": 574
},
{
"epoch": 1.0936757013789824,
"grad_norm": 0.4559331238269806,
"learning_rate": 9.637980311209909e-05,
"loss": 0.3204,
"step": 575
},
{
"epoch": 1.0955777460770328,
"grad_norm": 0.4163438379764557,
"learning_rate": 9.637345188948873e-05,
"loss": 0.267,
"step": 576
},
{
"epoch": 1.0974797907750833,
"grad_norm": 0.38813936710357666,
"learning_rate": 9.636710066687838e-05,
"loss": 0.2653,
"step": 577
},
{
"epoch": 1.0993818354731335,
"grad_norm": 0.373047798871994,
"learning_rate": 9.636074944426803e-05,
"loss": 0.2995,
"step": 578
},
{
"epoch": 1.101283880171184,
"grad_norm": 0.39488789439201355,
"learning_rate": 9.635439822165767e-05,
"loss": 0.2972,
"step": 579
},
{
"epoch": 1.1031859248692344,
"grad_norm": 0.37775856256484985,
"learning_rate": 9.634804699904732e-05,
"loss": 0.2833,
"step": 580
},
{
"epoch": 1.1050879695672848,
"grad_norm": 0.3843298554420471,
"learning_rate": 9.634169577643697e-05,
"loss": 0.3413,
"step": 581
},
{
"epoch": 1.1069900142653353,
"grad_norm": 0.3834189176559448,
"learning_rate": 9.633534455382661e-05,
"loss": 0.2792,
"step": 582
},
{
"epoch": 1.1088920589633857,
"grad_norm": 0.37232789397239685,
"learning_rate": 9.632899333121626e-05,
"loss": 0.2724,
"step": 583
},
{
"epoch": 1.1107941036614362,
"grad_norm": 0.2608899772167206,
"learning_rate": 9.632264210860591e-05,
"loss": 0.1966,
"step": 584
},
{
"epoch": 1.1126961483594864,
"grad_norm": 0.2676723301410675,
"learning_rate": 9.631629088599557e-05,
"loss": 0.2149,
"step": 585
},
{
"epoch": 1.1145981930575368,
"grad_norm": 0.40126022696495056,
"learning_rate": 9.63099396633852e-05,
"loss": 0.2937,
"step": 586
},
{
"epoch": 1.1165002377555873,
"grad_norm": 0.3493163287639618,
"learning_rate": 9.630358844077486e-05,
"loss": 0.2461,
"step": 587
},
{
"epoch": 1.1184022824536377,
"grad_norm": 0.39294591546058655,
"learning_rate": 9.629723721816451e-05,
"loss": 0.2922,
"step": 588
},
{
"epoch": 1.1203043271516882,
"grad_norm": 0.3855053186416626,
"learning_rate": 9.629088599555415e-05,
"loss": 0.2541,
"step": 589
},
{
"epoch": 1.1222063718497384,
"grad_norm": 0.3388477861881256,
"learning_rate": 9.628453477294378e-05,
"loss": 0.2234,
"step": 590
},
{
"epoch": 1.1241084165477888,
"grad_norm": 0.3856431841850281,
"learning_rate": 9.627818355033345e-05,
"loss": 0.2836,
"step": 591
},
{
"epoch": 1.1260104612458393,
"grad_norm": 0.39824768900871277,
"learning_rate": 9.627183232772309e-05,
"loss": 0.2562,
"step": 592
},
{
"epoch": 1.1279125059438897,
"grad_norm": 0.44484448432922363,
"learning_rate": 9.626548110511274e-05,
"loss": 0.2685,
"step": 593
},
{
"epoch": 1.1298145506419401,
"grad_norm": 0.4581182599067688,
"learning_rate": 9.625912988250239e-05,
"loss": 0.3208,
"step": 594
},
{
"epoch": 1.1317165953399906,
"grad_norm": 0.3560565412044525,
"learning_rate": 9.625277865989203e-05,
"loss": 0.2834,
"step": 595
},
{
"epoch": 1.1336186400380408,
"grad_norm": 0.4423635005950928,
"learning_rate": 9.624642743728168e-05,
"loss": 0.3154,
"step": 596
},
{
"epoch": 1.1355206847360912,
"grad_norm": 0.3797377943992615,
"learning_rate": 9.624007621467132e-05,
"loss": 0.28,
"step": 597
},
{
"epoch": 1.1374227294341417,
"grad_norm": 0.29780030250549316,
"learning_rate": 9.623372499206099e-05,
"loss": 0.2209,
"step": 598
},
{
"epoch": 1.1393247741321921,
"grad_norm": 0.3372732996940613,
"learning_rate": 9.622737376945062e-05,
"loss": 0.2502,
"step": 599
},
{
"epoch": 1.1412268188302426,
"grad_norm": 0.36365967988967896,
"learning_rate": 9.622102254684026e-05,
"loss": 0.2804,
"step": 600
},
{
"epoch": 1.1431288635282928,
"grad_norm": 0.40790894627571106,
"learning_rate": 9.621467132422993e-05,
"loss": 0.3633,
"step": 601
},
{
"epoch": 1.1450309082263432,
"grad_norm": 0.35693496465682983,
"learning_rate": 9.620832010161957e-05,
"loss": 0.3193,
"step": 602
},
{
"epoch": 1.1469329529243937,
"grad_norm": 0.3701719045639038,
"learning_rate": 9.620196887900922e-05,
"loss": 0.2937,
"step": 603
},
{
"epoch": 1.1488349976224441,
"grad_norm": 0.4299123287200928,
"learning_rate": 9.619561765639886e-05,
"loss": 0.2732,
"step": 604
},
{
"epoch": 1.1507370423204946,
"grad_norm": 0.4082129895687103,
"learning_rate": 9.618926643378851e-05,
"loss": 0.2867,
"step": 605
},
{
"epoch": 1.152639087018545,
"grad_norm": 0.49353981018066406,
"learning_rate": 9.618291521117816e-05,
"loss": 0.266,
"step": 606
},
{
"epoch": 1.1545411317165954,
"grad_norm": 0.3889831006526947,
"learning_rate": 9.61765639885678e-05,
"loss": 0.2732,
"step": 607
},
{
"epoch": 1.1564431764146457,
"grad_norm": 0.3464524745941162,
"learning_rate": 9.617021276595745e-05,
"loss": 0.2616,
"step": 608
},
{
"epoch": 1.158345221112696,
"grad_norm": 0.3498656153678894,
"learning_rate": 9.61638615433471e-05,
"loss": 0.2538,
"step": 609
},
{
"epoch": 1.1602472658107466,
"grad_norm": 0.31552717089653015,
"learning_rate": 9.615751032073674e-05,
"loss": 0.2283,
"step": 610
},
{
"epoch": 1.162149310508797,
"grad_norm": 0.3225223422050476,
"learning_rate": 9.615115909812639e-05,
"loss": 0.2428,
"step": 611
},
{
"epoch": 1.1640513552068474,
"grad_norm": 0.3108568489551544,
"learning_rate": 9.614480787551604e-05,
"loss": 0.2207,
"step": 612
},
{
"epoch": 1.1659533999048977,
"grad_norm": 0.42909371852874756,
"learning_rate": 9.613845665290568e-05,
"loss": 0.3285,
"step": 613
},
{
"epoch": 1.167855444602948,
"grad_norm": 0.3831368088722229,
"learning_rate": 9.613210543029533e-05,
"loss": 0.2425,
"step": 614
},
{
"epoch": 1.1697574893009985,
"grad_norm": 0.3891592025756836,
"learning_rate": 9.612575420768499e-05,
"loss": 0.2849,
"step": 615
},
{
"epoch": 1.171659533999049,
"grad_norm": 0.5383257865905762,
"learning_rate": 9.611940298507464e-05,
"loss": 0.3444,
"step": 616
},
{
"epoch": 1.1735615786970994,
"grad_norm": 0.4203440845012665,
"learning_rate": 9.611305176246428e-05,
"loss": 0.3198,
"step": 617
},
{
"epoch": 1.1754636233951499,
"grad_norm": 0.42422881722450256,
"learning_rate": 9.610670053985393e-05,
"loss": 0.3873,
"step": 618
},
{
"epoch": 1.1773656680932003,
"grad_norm": 0.34799742698669434,
"learning_rate": 9.610034931724358e-05,
"loss": 0.2645,
"step": 619
},
{
"epoch": 1.1792677127912505,
"grad_norm": 0.37579119205474854,
"learning_rate": 9.609399809463322e-05,
"loss": 0.3379,
"step": 620
},
{
"epoch": 1.181169757489301,
"grad_norm": 0.3958894610404968,
"learning_rate": 9.608764687202287e-05,
"loss": 0.2792,
"step": 621
},
{
"epoch": 1.1830718021873514,
"grad_norm": 0.30366870760917664,
"learning_rate": 9.608129564941252e-05,
"loss": 0.1871,
"step": 622
},
{
"epoch": 1.1849738468854019,
"grad_norm": 0.39878007769584656,
"learning_rate": 9.607494442680216e-05,
"loss": 0.2675,
"step": 623
},
{
"epoch": 1.1868758915834523,
"grad_norm": 0.35332080721855164,
"learning_rate": 9.606859320419181e-05,
"loss": 0.2856,
"step": 624
},
{
"epoch": 1.1887779362815025,
"grad_norm": 0.3391731381416321,
"learning_rate": 9.606224198158146e-05,
"loss": 0.254,
"step": 625
},
{
"epoch": 1.190679980979553,
"grad_norm": 0.39363861083984375,
"learning_rate": 9.60558907589711e-05,
"loss": 0.2447,
"step": 626
},
{
"epoch": 1.1925820256776034,
"grad_norm": 0.4773564040660858,
"learning_rate": 9.604953953636075e-05,
"loss": 0.3447,
"step": 627
},
{
"epoch": 1.1944840703756539,
"grad_norm": 0.34327152371406555,
"learning_rate": 9.60431883137504e-05,
"loss": 0.2353,
"step": 628
},
{
"epoch": 1.1963861150737043,
"grad_norm": 0.37386631965637207,
"learning_rate": 9.603683709114006e-05,
"loss": 0.2792,
"step": 629
},
{
"epoch": 1.1982881597717547,
"grad_norm": 0.4061308801174164,
"learning_rate": 9.60304858685297e-05,
"loss": 0.3216,
"step": 630
},
{
"epoch": 1.200190204469805,
"grad_norm": 0.3440467417240143,
"learning_rate": 9.602413464591933e-05,
"loss": 0.2653,
"step": 631
},
{
"epoch": 1.2020922491678554,
"grad_norm": 0.36648881435394287,
"learning_rate": 9.6017783423309e-05,
"loss": 0.2471,
"step": 632
},
{
"epoch": 1.2039942938659058,
"grad_norm": 0.3737157881259918,
"learning_rate": 9.601143220069864e-05,
"loss": 0.3255,
"step": 633
},
{
"epoch": 1.2058963385639563,
"grad_norm": 0.3840744197368622,
"learning_rate": 9.600508097808829e-05,
"loss": 0.2457,
"step": 634
},
{
"epoch": 1.2077983832620067,
"grad_norm": 0.34374961256980896,
"learning_rate": 9.599872975547793e-05,
"loss": 0.2705,
"step": 635
},
{
"epoch": 1.209700427960057,
"grad_norm": 0.3460882306098938,
"learning_rate": 9.599237853286758e-05,
"loss": 0.2308,
"step": 636
},
{
"epoch": 1.2116024726581074,
"grad_norm": 0.33316507935523987,
"learning_rate": 9.598602731025723e-05,
"loss": 0.2562,
"step": 637
},
{
"epoch": 1.2135045173561578,
"grad_norm": 0.3132528066635132,
"learning_rate": 9.597967608764687e-05,
"loss": 0.2331,
"step": 638
},
{
"epoch": 1.2154065620542083,
"grad_norm": 0.3329333961009979,
"learning_rate": 9.597332486503653e-05,
"loss": 0.2224,
"step": 639
},
{
"epoch": 1.2173086067522587,
"grad_norm": 0.35949432849884033,
"learning_rate": 9.596697364242617e-05,
"loss": 0.2337,
"step": 640
},
{
"epoch": 1.2192106514503092,
"grad_norm": 0.33591121435165405,
"learning_rate": 9.596062241981581e-05,
"loss": 0.2441,
"step": 641
},
{
"epoch": 1.2211126961483596,
"grad_norm": 0.38212794065475464,
"learning_rate": 9.595427119720546e-05,
"loss": 0.2569,
"step": 642
},
{
"epoch": 1.2230147408464098,
"grad_norm": 0.4124354124069214,
"learning_rate": 9.594791997459512e-05,
"loss": 0.3143,
"step": 643
},
{
"epoch": 1.2249167855444603,
"grad_norm": 0.4712159037590027,
"learning_rate": 9.594156875198475e-05,
"loss": 0.3153,
"step": 644
},
{
"epoch": 1.2268188302425107,
"grad_norm": 0.3652181923389435,
"learning_rate": 9.59352175293744e-05,
"loss": 0.2448,
"step": 645
},
{
"epoch": 1.2287208749405611,
"grad_norm": 0.40058213472366333,
"learning_rate": 9.592886630676406e-05,
"loss": 0.304,
"step": 646
},
{
"epoch": 1.2306229196386116,
"grad_norm": 0.4105280041694641,
"learning_rate": 9.592251508415371e-05,
"loss": 0.251,
"step": 647
},
{
"epoch": 1.2325249643366618,
"grad_norm": 0.3609527349472046,
"learning_rate": 9.591616386154335e-05,
"loss": 0.2311,
"step": 648
},
{
"epoch": 1.2344270090347123,
"grad_norm": 0.3686671257019043,
"learning_rate": 9.5909812638933e-05,
"loss": 0.2214,
"step": 649
},
{
"epoch": 1.2363290537327627,
"grad_norm": 0.27986517548561096,
"learning_rate": 9.590346141632265e-05,
"loss": 0.2531,
"step": 650
},
{
"epoch": 1.2382310984308131,
"grad_norm": 0.4477519690990448,
"learning_rate": 9.589711019371229e-05,
"loss": 0.3039,
"step": 651
},
{
"epoch": 1.2401331431288636,
"grad_norm": 0.33017873764038086,
"learning_rate": 9.589075897110194e-05,
"loss": 0.205,
"step": 652
},
{
"epoch": 1.242035187826914,
"grad_norm": 0.31245800852775574,
"learning_rate": 9.588440774849159e-05,
"loss": 0.2493,
"step": 653
},
{
"epoch": 1.2439372325249642,
"grad_norm": 0.33620285987854004,
"learning_rate": 9.587805652588123e-05,
"loss": 0.2629,
"step": 654
},
{
"epoch": 1.2458392772230147,
"grad_norm": 0.34820401668548584,
"learning_rate": 9.587170530327088e-05,
"loss": 0.2446,
"step": 655
},
{
"epoch": 1.2477413219210651,
"grad_norm": 0.4110179543495178,
"learning_rate": 9.586535408066053e-05,
"loss": 0.3345,
"step": 656
},
{
"epoch": 1.2496433666191156,
"grad_norm": 0.3637439012527466,
"learning_rate": 9.585900285805019e-05,
"loss": 0.2052,
"step": 657
},
{
"epoch": 1.251545411317166,
"grad_norm": 0.39023682475090027,
"learning_rate": 9.585265163543982e-05,
"loss": 0.2841,
"step": 658
},
{
"epoch": 1.2534474560152162,
"grad_norm": 0.3623685836791992,
"learning_rate": 9.584630041282948e-05,
"loss": 0.2286,
"step": 659
},
{
"epoch": 1.2553495007132667,
"grad_norm": 0.38151344656944275,
"learning_rate": 9.583994919021913e-05,
"loss": 0.2357,
"step": 660
},
{
"epoch": 1.2572515454113171,
"grad_norm": 0.38236725330352783,
"learning_rate": 9.583359796760877e-05,
"loss": 0.2966,
"step": 661
},
{
"epoch": 1.2591535901093676,
"grad_norm": 0.38568076491355896,
"learning_rate": 9.58272467449984e-05,
"loss": 0.3018,
"step": 662
},
{
"epoch": 1.261055634807418,
"grad_norm": 0.3488738238811493,
"learning_rate": 9.582089552238807e-05,
"loss": 0.354,
"step": 663
},
{
"epoch": 1.2629576795054684,
"grad_norm": 0.352860689163208,
"learning_rate": 9.581454429977771e-05,
"loss": 0.2143,
"step": 664
},
{
"epoch": 1.2648597242035189,
"grad_norm": 0.3734944760799408,
"learning_rate": 9.580819307716736e-05,
"loss": 0.3486,
"step": 665
},
{
"epoch": 1.266761768901569,
"grad_norm": 0.4024759531021118,
"learning_rate": 9.580184185455701e-05,
"loss": 0.2922,
"step": 666
},
{
"epoch": 1.2686638135996195,
"grad_norm": 0.37389662861824036,
"learning_rate": 9.579549063194665e-05,
"loss": 0.2545,
"step": 667
},
{
"epoch": 1.27056585829767,
"grad_norm": 0.42338186502456665,
"learning_rate": 9.57891394093363e-05,
"loss": 0.2961,
"step": 668
},
{
"epoch": 1.2724679029957204,
"grad_norm": 0.3795355260372162,
"learning_rate": 9.578278818672594e-05,
"loss": 0.2777,
"step": 669
},
{
"epoch": 1.2743699476937709,
"grad_norm": 0.3439030945301056,
"learning_rate": 9.57764369641156e-05,
"loss": 0.2179,
"step": 670
},
{
"epoch": 1.276271992391821,
"grad_norm": 0.39637741446495056,
"learning_rate": 9.577008574150524e-05,
"loss": 0.2701,
"step": 671
},
{
"epoch": 1.2781740370898715,
"grad_norm": 0.3348701298236847,
"learning_rate": 9.576373451889488e-05,
"loss": 0.2632,
"step": 672
},
{
"epoch": 1.280076081787922,
"grad_norm": 0.3696272671222687,
"learning_rate": 9.575738329628455e-05,
"loss": 0.2228,
"step": 673
},
{
"epoch": 1.2819781264859724,
"grad_norm": 0.3261694610118866,
"learning_rate": 9.575103207367419e-05,
"loss": 0.2589,
"step": 674
},
{
"epoch": 1.2838801711840229,
"grad_norm": 0.39266085624694824,
"learning_rate": 9.574468085106384e-05,
"loss": 0.2893,
"step": 675
},
{
"epoch": 1.2857822158820733,
"grad_norm": 0.4356357157230377,
"learning_rate": 9.573832962845348e-05,
"loss": 0.3249,
"step": 676
},
{
"epoch": 1.2876842605801238,
"grad_norm": 0.38992395997047424,
"learning_rate": 9.573197840584313e-05,
"loss": 0.2697,
"step": 677
},
{
"epoch": 1.289586305278174,
"grad_norm": 0.35415610671043396,
"learning_rate": 9.572562718323278e-05,
"loss": 0.2538,
"step": 678
},
{
"epoch": 1.2914883499762244,
"grad_norm": 0.38410142064094543,
"learning_rate": 9.571927596062242e-05,
"loss": 0.2325,
"step": 679
},
{
"epoch": 1.2933903946742749,
"grad_norm": 0.36036771535873413,
"learning_rate": 9.571292473801207e-05,
"loss": 0.242,
"step": 680
},
{
"epoch": 1.2952924393723253,
"grad_norm": 0.3901429772377014,
"learning_rate": 9.570657351540172e-05,
"loss": 0.3141,
"step": 681
},
{
"epoch": 1.2971944840703755,
"grad_norm": 0.3684573769569397,
"learning_rate": 9.570022229279136e-05,
"loss": 0.2725,
"step": 682
},
{
"epoch": 1.299096528768426,
"grad_norm": 0.44199153780937195,
"learning_rate": 9.569387107018101e-05,
"loss": 0.2938,
"step": 683
},
{
"epoch": 1.3009985734664764,
"grad_norm": 0.4435335695743561,
"learning_rate": 9.568751984757066e-05,
"loss": 0.3454,
"step": 684
},
{
"epoch": 1.3029006181645268,
"grad_norm": 0.3713487386703491,
"learning_rate": 9.56811686249603e-05,
"loss": 0.25,
"step": 685
},
{
"epoch": 1.3048026628625773,
"grad_norm": 0.394452840089798,
"learning_rate": 9.567481740234995e-05,
"loss": 0.3062,
"step": 686
},
{
"epoch": 1.3067047075606277,
"grad_norm": 0.47593292593955994,
"learning_rate": 9.56684661797396e-05,
"loss": 0.3131,
"step": 687
},
{
"epoch": 1.3086067522586782,
"grad_norm": 0.39060479402542114,
"learning_rate": 9.566211495712926e-05,
"loss": 0.3267,
"step": 688
},
{
"epoch": 1.3105087969567286,
"grad_norm": 0.40931451320648193,
"learning_rate": 9.56557637345189e-05,
"loss": 0.2979,
"step": 689
},
{
"epoch": 1.3124108416547788,
"grad_norm": 0.3557567000389099,
"learning_rate": 9.564941251190855e-05,
"loss": 0.213,
"step": 690
},
{
"epoch": 1.3143128863528293,
"grad_norm": 0.43843701481819153,
"learning_rate": 9.56430612892982e-05,
"loss": 0.2835,
"step": 691
},
{
"epoch": 1.3162149310508797,
"grad_norm": 0.33530867099761963,
"learning_rate": 9.563671006668784e-05,
"loss": 0.2392,
"step": 692
},
{
"epoch": 1.3181169757489302,
"grad_norm": 0.35071656107902527,
"learning_rate": 9.563035884407749e-05,
"loss": 0.1916,
"step": 693
},
{
"epoch": 1.3200190204469804,
"grad_norm": 0.3808371126651764,
"learning_rate": 9.562400762146714e-05,
"loss": 0.2426,
"step": 694
},
{
"epoch": 1.3219210651450308,
"grad_norm": 0.46641990542411804,
"learning_rate": 9.561765639885678e-05,
"loss": 0.3399,
"step": 695
},
{
"epoch": 1.3238231098430813,
"grad_norm": 0.4153888523578644,
"learning_rate": 9.561130517624643e-05,
"loss": 0.4152,
"step": 696
},
{
"epoch": 1.3257251545411317,
"grad_norm": 0.4004898965358734,
"learning_rate": 9.560495395363608e-05,
"loss": 0.3637,
"step": 697
},
{
"epoch": 1.3276271992391822,
"grad_norm": 0.421058714389801,
"learning_rate": 9.559860273102572e-05,
"loss": 0.2625,
"step": 698
},
{
"epoch": 1.3295292439372326,
"grad_norm": 0.39722004532814026,
"learning_rate": 9.559225150841537e-05,
"loss": 0.3563,
"step": 699
},
{
"epoch": 1.331431288635283,
"grad_norm": 0.3793489634990692,
"learning_rate": 9.558590028580501e-05,
"loss": 0.2306,
"step": 700
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.43592244386672974,
"learning_rate": 9.557954906319468e-05,
"loss": 0.4354,
"step": 701
},
{
"epoch": 1.3352353780313837,
"grad_norm": 0.30159738659858704,
"learning_rate": 9.557319784058432e-05,
"loss": 0.2062,
"step": 702
},
{
"epoch": 1.3371374227294341,
"grad_norm": 0.34011465311050415,
"learning_rate": 9.556684661797395e-05,
"loss": 0.2363,
"step": 703
},
{
"epoch": 1.3390394674274846,
"grad_norm": 0.41224443912506104,
"learning_rate": 9.556049539536362e-05,
"loss": 0.2913,
"step": 704
},
{
"epoch": 1.340941512125535,
"grad_norm": 0.4105536937713623,
"learning_rate": 9.555414417275326e-05,
"loss": 0.2459,
"step": 705
},
{
"epoch": 1.3428435568235852,
"grad_norm": 0.3158798813819885,
"learning_rate": 9.554779295014291e-05,
"loss": 0.1921,
"step": 706
},
{
"epoch": 1.3447456015216357,
"grad_norm": 0.4023972451686859,
"learning_rate": 9.554144172753255e-05,
"loss": 0.2406,
"step": 707
},
{
"epoch": 1.3466476462196861,
"grad_norm": 0.4204084277153015,
"learning_rate": 9.55350905049222e-05,
"loss": 0.2977,
"step": 708
},
{
"epoch": 1.3485496909177366,
"grad_norm": 0.4853519797325134,
"learning_rate": 9.552873928231185e-05,
"loss": 0.3871,
"step": 709
},
{
"epoch": 1.350451735615787,
"grad_norm": 0.3755006194114685,
"learning_rate": 9.552238805970149e-05,
"loss": 0.2399,
"step": 710
},
{
"epoch": 1.3523537803138375,
"grad_norm": 0.37587347626686096,
"learning_rate": 9.551603683709116e-05,
"loss": 0.3029,
"step": 711
},
{
"epoch": 1.354255825011888,
"grad_norm": 0.4257625937461853,
"learning_rate": 9.55096856144808e-05,
"loss": 0.2541,
"step": 712
},
{
"epoch": 1.3561578697099381,
"grad_norm": 0.29570913314819336,
"learning_rate": 9.550333439187043e-05,
"loss": 0.1668,
"step": 713
},
{
"epoch": 1.3580599144079886,
"grad_norm": 0.5089273452758789,
"learning_rate": 9.549698316926008e-05,
"loss": 0.4006,
"step": 714
},
{
"epoch": 1.359961959106039,
"grad_norm": 0.43584999442100525,
"learning_rate": 9.549063194664974e-05,
"loss": 0.2996,
"step": 715
},
{
"epoch": 1.3618640038040895,
"grad_norm": 0.4071057140827179,
"learning_rate": 9.548428072403937e-05,
"loss": 0.308,
"step": 716
},
{
"epoch": 1.3637660485021397,
"grad_norm": 0.37772196531295776,
"learning_rate": 9.547792950142903e-05,
"loss": 0.2235,
"step": 717
},
{
"epoch": 1.3656680932001901,
"grad_norm": 0.44488438963890076,
"learning_rate": 9.547157827881868e-05,
"loss": 0.2748,
"step": 718
},
{
"epoch": 1.3675701378982406,
"grad_norm": 0.3227798640727997,
"learning_rate": 9.546522705620833e-05,
"loss": 0.2609,
"step": 719
},
{
"epoch": 1.369472182596291,
"grad_norm": 0.3742448389530182,
"learning_rate": 9.545887583359797e-05,
"loss": 0.2417,
"step": 720
},
{
"epoch": 1.3713742272943414,
"grad_norm": 0.3582020699977875,
"learning_rate": 9.545252461098762e-05,
"loss": 0.2688,
"step": 721
},
{
"epoch": 1.3732762719923919,
"grad_norm": 0.3762567341327667,
"learning_rate": 9.544617338837727e-05,
"loss": 0.2939,
"step": 722
},
{
"epoch": 1.3751783166904423,
"grad_norm": 0.38103973865509033,
"learning_rate": 9.543982216576691e-05,
"loss": 0.3335,
"step": 723
},
{
"epoch": 1.3770803613884925,
"grad_norm": 0.3109844923019409,
"learning_rate": 9.543347094315656e-05,
"loss": 0.2094,
"step": 724
},
{
"epoch": 1.378982406086543,
"grad_norm": 0.3642789125442505,
"learning_rate": 9.542711972054621e-05,
"loss": 0.2879,
"step": 725
},
{
"epoch": 1.3808844507845934,
"grad_norm": 0.3879150152206421,
"learning_rate": 9.542076849793585e-05,
"loss": 0.2567,
"step": 726
},
{
"epoch": 1.3827864954826439,
"grad_norm": 0.3364320993423462,
"learning_rate": 9.54144172753255e-05,
"loss": 0.2773,
"step": 727
},
{
"epoch": 1.3846885401806943,
"grad_norm": 0.5071269273757935,
"learning_rate": 9.540806605271516e-05,
"loss": 0.2916,
"step": 728
},
{
"epoch": 1.3865905848787445,
"grad_norm": 0.425793319940567,
"learning_rate": 9.540171483010481e-05,
"loss": 0.2948,
"step": 729
},
{
"epoch": 1.388492629576795,
"grad_norm": 0.38478776812553406,
"learning_rate": 9.539536360749445e-05,
"loss": 0.2493,
"step": 730
},
{
"epoch": 1.3903946742748454,
"grad_norm": 0.4016847014427185,
"learning_rate": 9.53890123848841e-05,
"loss": 0.3038,
"step": 731
},
{
"epoch": 1.3922967189728959,
"grad_norm": 0.2799355983734131,
"learning_rate": 9.538266116227375e-05,
"loss": 0.2964,
"step": 732
},
{
"epoch": 1.3941987636709463,
"grad_norm": 0.3720659613609314,
"learning_rate": 9.537630993966339e-05,
"loss": 0.2528,
"step": 733
},
{
"epoch": 1.3961008083689967,
"grad_norm": 0.2954385578632355,
"learning_rate": 9.536995871705303e-05,
"loss": 0.2119,
"step": 734
},
{
"epoch": 1.3980028530670472,
"grad_norm": 0.35636264085769653,
"learning_rate": 9.536360749444269e-05,
"loss": 0.3042,
"step": 735
},
{
"epoch": 1.3999048977650974,
"grad_norm": 0.3219160735607147,
"learning_rate": 9.535725627183233e-05,
"loss": 0.2977,
"step": 736
},
{
"epoch": 1.4018069424631479,
"grad_norm": 0.32340940833091736,
"learning_rate": 9.535090504922198e-05,
"loss": 0.2295,
"step": 737
},
{
"epoch": 1.4037089871611983,
"grad_norm": 0.3884155750274658,
"learning_rate": 9.534455382661163e-05,
"loss": 0.2367,
"step": 738
},
{
"epoch": 1.4056110318592487,
"grad_norm": 0.3708769381046295,
"learning_rate": 9.533820260400127e-05,
"loss": 0.2807,
"step": 739
},
{
"epoch": 1.407513076557299,
"grad_norm": 0.3377797603607178,
"learning_rate": 9.533185138139092e-05,
"loss": 0.2459,
"step": 740
},
{
"epoch": 1.4094151212553494,
"grad_norm": 0.542662501335144,
"learning_rate": 9.532550015878056e-05,
"loss": 0.3883,
"step": 741
},
{
"epoch": 1.4113171659533998,
"grad_norm": 0.36908188462257385,
"learning_rate": 9.531914893617023e-05,
"loss": 0.2239,
"step": 742
},
{
"epoch": 1.4132192106514503,
"grad_norm": 0.2898438572883606,
"learning_rate": 9.531279771355987e-05,
"loss": 0.1929,
"step": 743
},
{
"epoch": 1.4151212553495007,
"grad_norm": 0.361965537071228,
"learning_rate": 9.53064464909495e-05,
"loss": 0.2758,
"step": 744
},
{
"epoch": 1.4170233000475512,
"grad_norm": 0.42736831307411194,
"learning_rate": 9.530009526833916e-05,
"loss": 0.3103,
"step": 745
},
{
"epoch": 1.4189253447456016,
"grad_norm": 0.3411954641342163,
"learning_rate": 9.529374404572881e-05,
"loss": 0.2498,
"step": 746
},
{
"epoch": 1.420827389443652,
"grad_norm": 0.3671089708805084,
"learning_rate": 9.528739282311846e-05,
"loss": 0.2961,
"step": 747
},
{
"epoch": 1.4227294341417023,
"grad_norm": 0.35021135210990906,
"learning_rate": 9.52810416005081e-05,
"loss": 0.2422,
"step": 748
},
{
"epoch": 1.4246314788397527,
"grad_norm": 0.3203287422657013,
"learning_rate": 9.527469037789775e-05,
"loss": 0.2377,
"step": 749
},
{
"epoch": 1.4265335235378032,
"grad_norm": 0.32512807846069336,
"learning_rate": 9.52683391552874e-05,
"loss": 0.2533,
"step": 750
},
{
"epoch": 1.4284355682358536,
"grad_norm": 0.39963454008102417,
"learning_rate": 9.526198793267704e-05,
"loss": 0.3191,
"step": 751
},
{
"epoch": 1.4303376129339038,
"grad_norm": 0.3722153306007385,
"learning_rate": 9.525563671006669e-05,
"loss": 0.2134,
"step": 752
},
{
"epoch": 1.4322396576319543,
"grad_norm": 0.3429708182811737,
"learning_rate": 9.524928548745634e-05,
"loss": 0.2221,
"step": 753
},
{
"epoch": 1.4341417023300047,
"grad_norm": 0.4014436602592468,
"learning_rate": 9.524293426484598e-05,
"loss": 0.2638,
"step": 754
},
{
"epoch": 1.4360437470280552,
"grad_norm": 0.38329729437828064,
"learning_rate": 9.523658304223563e-05,
"loss": 0.25,
"step": 755
},
{
"epoch": 1.4379457917261056,
"grad_norm": 0.37710002064704895,
"learning_rate": 9.523023181962529e-05,
"loss": 0.2623,
"step": 756
},
{
"epoch": 1.439847836424156,
"grad_norm": 0.4223197102546692,
"learning_rate": 9.522388059701492e-05,
"loss": 0.408,
"step": 757
},
{
"epoch": 1.4417498811222065,
"grad_norm": 0.45707425475120544,
"learning_rate": 9.521752937440458e-05,
"loss": 0.3491,
"step": 758
},
{
"epoch": 1.4436519258202567,
"grad_norm": 0.39775991439819336,
"learning_rate": 9.521117815179423e-05,
"loss": 0.2498,
"step": 759
},
{
"epoch": 1.4455539705183071,
"grad_norm": 0.3113288879394531,
"learning_rate": 9.520482692918388e-05,
"loss": 0.2191,
"step": 760
},
{
"epoch": 1.4474560152163576,
"grad_norm": 0.35126394033432007,
"learning_rate": 9.519847570657352e-05,
"loss": 0.2689,
"step": 761
},
{
"epoch": 1.449358059914408,
"grad_norm": 0.42121708393096924,
"learning_rate": 9.519212448396317e-05,
"loss": 0.2859,
"step": 762
},
{
"epoch": 1.4512601046124585,
"grad_norm": 0.37913796305656433,
"learning_rate": 9.518577326135282e-05,
"loss": 0.2676,
"step": 763
},
{
"epoch": 1.4531621493105087,
"grad_norm": 0.3767364025115967,
"learning_rate": 9.517942203874246e-05,
"loss": 0.2298,
"step": 764
},
{
"epoch": 1.4550641940085591,
"grad_norm": 0.3317908048629761,
"learning_rate": 9.517307081613211e-05,
"loss": 0.2439,
"step": 765
},
{
"epoch": 1.4569662387066096,
"grad_norm": 0.28014522790908813,
"learning_rate": 9.516671959352176e-05,
"loss": 0.207,
"step": 766
},
{
"epoch": 1.45886828340466,
"grad_norm": 0.4119054675102234,
"learning_rate": 9.51603683709114e-05,
"loss": 0.2969,
"step": 767
},
{
"epoch": 1.4607703281027105,
"grad_norm": 0.3351030647754669,
"learning_rate": 9.515401714830105e-05,
"loss": 0.2925,
"step": 768
},
{
"epoch": 1.462672372800761,
"grad_norm": 0.5204692482948303,
"learning_rate": 9.51476659256907e-05,
"loss": 0.3546,
"step": 769
},
{
"epoch": 1.4645744174988113,
"grad_norm": 0.42994043231010437,
"learning_rate": 9.514131470308034e-05,
"loss": 0.3284,
"step": 770
},
{
"epoch": 1.4664764621968616,
"grad_norm": 0.3580436408519745,
"learning_rate": 9.513496348047e-05,
"loss": 0.2639,
"step": 771
},
{
"epoch": 1.468378506894912,
"grad_norm": 0.37151291966438293,
"learning_rate": 9.512861225785963e-05,
"loss": 0.2556,
"step": 772
},
{
"epoch": 1.4702805515929624,
"grad_norm": 0.33122384548187256,
"learning_rate": 9.51222610352493e-05,
"loss": 0.2565,
"step": 773
},
{
"epoch": 1.472182596291013,
"grad_norm": 0.3718935251235962,
"learning_rate": 9.511590981263894e-05,
"loss": 0.2348,
"step": 774
},
{
"epoch": 1.474084640989063,
"grad_norm": 0.3752667009830475,
"learning_rate": 9.510955859002858e-05,
"loss": 0.2933,
"step": 775
},
{
"epoch": 1.4759866856871136,
"grad_norm": 0.44539371132850647,
"learning_rate": 9.510320736741824e-05,
"loss": 0.2699,
"step": 776
},
{
"epoch": 1.477888730385164,
"grad_norm": 0.5468220114707947,
"learning_rate": 9.509685614480788e-05,
"loss": 0.4141,
"step": 777
},
{
"epoch": 1.4797907750832144,
"grad_norm": 0.5036222338676453,
"learning_rate": 9.509050492219753e-05,
"loss": 0.3463,
"step": 778
},
{
"epoch": 1.4816928197812649,
"grad_norm": 0.3742172420024872,
"learning_rate": 9.508415369958717e-05,
"loss": 0.3104,
"step": 779
},
{
"epoch": 1.4835948644793153,
"grad_norm": 0.38696351647377014,
"learning_rate": 9.507780247697682e-05,
"loss": 0.2406,
"step": 780
},
{
"epoch": 1.4854969091773658,
"grad_norm": 0.43431171774864197,
"learning_rate": 9.507145125436647e-05,
"loss": 0.307,
"step": 781
},
{
"epoch": 1.4873989538754162,
"grad_norm": 0.3814404606819153,
"learning_rate": 9.506510003175611e-05,
"loss": 0.2681,
"step": 782
},
{
"epoch": 1.4893009985734664,
"grad_norm": 0.350359708070755,
"learning_rate": 9.505874880914578e-05,
"loss": 0.2408,
"step": 783
},
{
"epoch": 1.4912030432715169,
"grad_norm": 0.4443821609020233,
"learning_rate": 9.505239758653541e-05,
"loss": 0.3358,
"step": 784
},
{
"epoch": 1.4931050879695673,
"grad_norm": 0.2963017225265503,
"learning_rate": 9.504604636392505e-05,
"loss": 0.2085,
"step": 785
},
{
"epoch": 1.4950071326676178,
"grad_norm": 0.4765385389328003,
"learning_rate": 9.50396951413147e-05,
"loss": 0.396,
"step": 786
},
{
"epoch": 1.496909177365668,
"grad_norm": 0.3389003574848175,
"learning_rate": 9.503334391870436e-05,
"loss": 0.327,
"step": 787
},
{
"epoch": 1.4988112220637184,
"grad_norm": 0.42218640446662903,
"learning_rate": 9.5026992696094e-05,
"loss": 0.3078,
"step": 788
},
{
"epoch": 1.5007132667617689,
"grad_norm": 0.4693278670310974,
"learning_rate": 9.502064147348365e-05,
"loss": 0.2853,
"step": 789
},
{
"epoch": 1.5026153114598193,
"grad_norm": 0.3891851305961609,
"learning_rate": 9.50142902508733e-05,
"loss": 0.2493,
"step": 790
},
{
"epoch": 1.5045173561578697,
"grad_norm": 0.3862535357475281,
"learning_rate": 9.500793902826295e-05,
"loss": 0.2673,
"step": 791
},
{
"epoch": 1.5064194008559202,
"grad_norm": 0.34803205728530884,
"learning_rate": 9.500158780565259e-05,
"loss": 0.2814,
"step": 792
},
{
"epoch": 1.5083214455539706,
"grad_norm": 0.3963899314403534,
"learning_rate": 9.499523658304224e-05,
"loss": 0.3018,
"step": 793
},
{
"epoch": 1.510223490252021,
"grad_norm": 0.4004577398300171,
"learning_rate": 9.498888536043189e-05,
"loss": 0.313,
"step": 794
},
{
"epoch": 1.5121255349500713,
"grad_norm": 0.32212579250335693,
"learning_rate": 9.498253413782153e-05,
"loss": 0.2081,
"step": 795
},
{
"epoch": 1.5140275796481217,
"grad_norm": 0.32745805382728577,
"learning_rate": 9.497618291521118e-05,
"loss": 0.231,
"step": 796
},
{
"epoch": 1.5159296243461722,
"grad_norm": 0.40773364901542664,
"learning_rate": 9.496983169260083e-05,
"loss": 0.2804,
"step": 797
},
{
"epoch": 1.5178316690442224,
"grad_norm": 0.3848927319049835,
"learning_rate": 9.496348046999047e-05,
"loss": 0.288,
"step": 798
},
{
"epoch": 1.5197337137422728,
"grad_norm": 0.317124605178833,
"learning_rate": 9.495712924738012e-05,
"loss": 0.2202,
"step": 799
},
{
"epoch": 1.5216357584403233,
"grad_norm": 0.3564606010913849,
"learning_rate": 9.495077802476978e-05,
"loss": 0.2594,
"step": 800
},
{
"epoch": 1.5235378031383737,
"grad_norm": 0.3151964545249939,
"learning_rate": 9.494442680215943e-05,
"loss": 0.2138,
"step": 801
},
{
"epoch": 1.5254398478364242,
"grad_norm": 0.4009242057800293,
"learning_rate": 9.493807557954907e-05,
"loss": 0.3157,
"step": 802
},
{
"epoch": 1.5273418925344746,
"grad_norm": 0.36916011571884155,
"learning_rate": 9.49317243569387e-05,
"loss": 0.2478,
"step": 803
},
{
"epoch": 1.529243937232525,
"grad_norm": 0.372277170419693,
"learning_rate": 9.492537313432837e-05,
"loss": 0.2912,
"step": 804
},
{
"epoch": 1.5311459819305755,
"grad_norm": 0.42100057005882263,
"learning_rate": 9.491902191171801e-05,
"loss": 0.2938,
"step": 805
},
{
"epoch": 1.533048026628626,
"grad_norm": 0.3528178334236145,
"learning_rate": 9.491267068910765e-05,
"loss": 0.2519,
"step": 806
},
{
"epoch": 1.5349500713266762,
"grad_norm": 0.3655840754508972,
"learning_rate": 9.490631946649731e-05,
"loss": 0.2685,
"step": 807
},
{
"epoch": 1.5368521160247266,
"grad_norm": 0.34080174565315247,
"learning_rate": 9.489996824388695e-05,
"loss": 0.2339,
"step": 808
},
{
"epoch": 1.5387541607227768,
"grad_norm": 0.3532484173774719,
"learning_rate": 9.48936170212766e-05,
"loss": 0.2448,
"step": 809
},
{
"epoch": 1.5406562054208273,
"grad_norm": 0.33115965127944946,
"learning_rate": 9.488726579866624e-05,
"loss": 0.2549,
"step": 810
},
{
"epoch": 1.5425582501188777,
"grad_norm": 0.40624433755874634,
"learning_rate": 9.488091457605589e-05,
"loss": 0.2847,
"step": 811
},
{
"epoch": 1.5444602948169281,
"grad_norm": 0.35374221205711365,
"learning_rate": 9.487456335344554e-05,
"loss": 0.2704,
"step": 812
},
{
"epoch": 1.5463623395149786,
"grad_norm": 0.3859337568283081,
"learning_rate": 9.486821213083518e-05,
"loss": 0.2969,
"step": 813
},
{
"epoch": 1.548264384213029,
"grad_norm": 0.37984946370124817,
"learning_rate": 9.486186090822485e-05,
"loss": 0.2908,
"step": 814
},
{
"epoch": 1.5501664289110795,
"grad_norm": 0.34984755516052246,
"learning_rate": 9.485550968561449e-05,
"loss": 0.2247,
"step": 815
},
{
"epoch": 1.55206847360913,
"grad_norm": 0.32592761516571045,
"learning_rate": 9.484915846300412e-05,
"loss": 0.1985,
"step": 816
},
{
"epoch": 1.5539705183071804,
"grad_norm": 0.4273107945919037,
"learning_rate": 9.484280724039378e-05,
"loss": 0.2875,
"step": 817
},
{
"epoch": 1.5558725630052306,
"grad_norm": 0.35476601123809814,
"learning_rate": 9.483645601778343e-05,
"loss": 0.2721,
"step": 818
},
{
"epoch": 1.557774607703281,
"grad_norm": 0.30542057752609253,
"learning_rate": 9.483010479517308e-05,
"loss": 0.1966,
"step": 819
},
{
"epoch": 1.5596766524013315,
"grad_norm": 0.44310665130615234,
"learning_rate": 9.482375357256272e-05,
"loss": 0.2533,
"step": 820
},
{
"epoch": 1.5615786970993817,
"grad_norm": 0.39837488532066345,
"learning_rate": 9.481740234995237e-05,
"loss": 0.3045,
"step": 821
},
{
"epoch": 1.5634807417974321,
"grad_norm": 0.33650925755500793,
"learning_rate": 9.481105112734202e-05,
"loss": 0.3626,
"step": 822
},
{
"epoch": 1.5653827864954826,
"grad_norm": 0.39762622117996216,
"learning_rate": 9.480469990473166e-05,
"loss": 0.2862,
"step": 823
},
{
"epoch": 1.567284831193533,
"grad_norm": 0.36138975620269775,
"learning_rate": 9.479834868212131e-05,
"loss": 0.2434,
"step": 824
},
{
"epoch": 1.5691868758915835,
"grad_norm": 0.37878358364105225,
"learning_rate": 9.479199745951096e-05,
"loss": 0.2421,
"step": 825
},
{
"epoch": 1.571088920589634,
"grad_norm": 0.4009093642234802,
"learning_rate": 9.47856462369006e-05,
"loss": 0.2561,
"step": 826
},
{
"epoch": 1.5729909652876843,
"grad_norm": 0.3085389733314514,
"learning_rate": 9.477929501429025e-05,
"loss": 0.2293,
"step": 827
},
{
"epoch": 1.5748930099857348,
"grad_norm": 0.48082223534584045,
"learning_rate": 9.47729437916799e-05,
"loss": 0.3193,
"step": 828
},
{
"epoch": 1.5767950546837852,
"grad_norm": 0.42938464879989624,
"learning_rate": 9.476659256906954e-05,
"loss": 0.3319,
"step": 829
},
{
"epoch": 1.5786970993818354,
"grad_norm": 0.32788941264152527,
"learning_rate": 9.47602413464592e-05,
"loss": 0.2432,
"step": 830
},
{
"epoch": 1.5805991440798859,
"grad_norm": 0.38157737255096436,
"learning_rate": 9.475389012384885e-05,
"loss": 0.3165,
"step": 831
},
{
"epoch": 1.5825011887779363,
"grad_norm": 0.38666632771492004,
"learning_rate": 9.47475389012385e-05,
"loss": 0.2554,
"step": 832
},
{
"epoch": 1.5844032334759865,
"grad_norm": 0.3475115895271301,
"learning_rate": 9.474118767862814e-05,
"loss": 0.2679,
"step": 833
},
{
"epoch": 1.586305278174037,
"grad_norm": 0.35684680938720703,
"learning_rate": 9.473483645601779e-05,
"loss": 0.2574,
"step": 834
},
{
"epoch": 1.5882073228720874,
"grad_norm": 0.5205959677696228,
"learning_rate": 9.472848523340744e-05,
"loss": 0.3646,
"step": 835
},
{
"epoch": 1.5901093675701379,
"grad_norm": 0.37549740076065063,
"learning_rate": 9.472213401079708e-05,
"loss": 0.2741,
"step": 836
},
{
"epoch": 1.5920114122681883,
"grad_norm": 0.5251928567886353,
"learning_rate": 9.471578278818673e-05,
"loss": 0.3799,
"step": 837
},
{
"epoch": 1.5939134569662388,
"grad_norm": 0.42622271180152893,
"learning_rate": 9.470943156557638e-05,
"loss": 0.2991,
"step": 838
},
{
"epoch": 1.5958155016642892,
"grad_norm": 0.3737063407897949,
"learning_rate": 9.470308034296602e-05,
"loss": 0.288,
"step": 839
},
{
"epoch": 1.5977175463623396,
"grad_norm": 0.4851538836956024,
"learning_rate": 9.469672912035567e-05,
"loss": 0.3293,
"step": 840
},
{
"epoch": 1.5996195910603899,
"grad_norm": 0.3662918508052826,
"learning_rate": 9.469037789774533e-05,
"loss": 0.2338,
"step": 841
},
{
"epoch": 1.6015216357584403,
"grad_norm": 0.3263486325740814,
"learning_rate": 9.468402667513496e-05,
"loss": 0.2228,
"step": 842
},
{
"epoch": 1.6034236804564908,
"grad_norm": 0.4000779092311859,
"learning_rate": 9.467767545252462e-05,
"loss": 0.2635,
"step": 843
},
{
"epoch": 1.605325725154541,
"grad_norm": 0.4274492859840393,
"learning_rate": 9.467132422991425e-05,
"loss": 0.3063,
"step": 844
},
{
"epoch": 1.6072277698525914,
"grad_norm": 0.4486158490180969,
"learning_rate": 9.466497300730392e-05,
"loss": 0.3039,
"step": 845
},
{
"epoch": 1.6091298145506419,
"grad_norm": 0.48109135031700134,
"learning_rate": 9.465862178469356e-05,
"loss": 0.3471,
"step": 846
},
{
"epoch": 1.6110318592486923,
"grad_norm": 0.41299277544021606,
"learning_rate": 9.46522705620832e-05,
"loss": 0.2896,
"step": 847
},
{
"epoch": 1.6129339039467427,
"grad_norm": 0.4177182614803314,
"learning_rate": 9.464591933947286e-05,
"loss": 0.2519,
"step": 848
},
{
"epoch": 1.6148359486447932,
"grad_norm": 0.36468592286109924,
"learning_rate": 9.46395681168625e-05,
"loss": 0.275,
"step": 849
},
{
"epoch": 1.6167379933428436,
"grad_norm": 0.33025646209716797,
"learning_rate": 9.463321689425215e-05,
"loss": 0.234,
"step": 850
},
{
"epoch": 1.618640038040894,
"grad_norm": 0.4377218186855316,
"learning_rate": 9.462686567164179e-05,
"loss": 0.2939,
"step": 851
},
{
"epoch": 1.6205420827389445,
"grad_norm": 0.34059834480285645,
"learning_rate": 9.462051444903144e-05,
"loss": 0.2559,
"step": 852
},
{
"epoch": 1.6224441274369947,
"grad_norm": 0.36525094509124756,
"learning_rate": 9.46141632264211e-05,
"loss": 0.2638,
"step": 853
},
{
"epoch": 1.6243461721350452,
"grad_norm": 0.344927042722702,
"learning_rate": 9.460781200381073e-05,
"loss": 0.1906,
"step": 854
},
{
"epoch": 1.6262482168330956,
"grad_norm": 0.4097568988800049,
"learning_rate": 9.460146078120038e-05,
"loss": 0.3143,
"step": 855
},
{
"epoch": 1.6281502615311458,
"grad_norm": 0.32290300726890564,
"learning_rate": 9.459510955859004e-05,
"loss": 0.2734,
"step": 856
},
{
"epoch": 1.6300523062291963,
"grad_norm": 0.3865107595920563,
"learning_rate": 9.458875833597967e-05,
"loss": 0.3012,
"step": 857
},
{
"epoch": 1.6319543509272467,
"grad_norm": 0.3034641444683075,
"learning_rate": 9.458240711336933e-05,
"loss": 0.2164,
"step": 858
},
{
"epoch": 1.6338563956252972,
"grad_norm": 0.3896719217300415,
"learning_rate": 9.457605589075898e-05,
"loss": 0.2577,
"step": 859
},
{
"epoch": 1.6357584403233476,
"grad_norm": 0.35619622468948364,
"learning_rate": 9.456970466814862e-05,
"loss": 0.3076,
"step": 860
},
{
"epoch": 1.637660485021398,
"grad_norm": 0.39600345492362976,
"learning_rate": 9.456335344553827e-05,
"loss": 0.4003,
"step": 861
},
{
"epoch": 1.6395625297194485,
"grad_norm": 0.3511577248573303,
"learning_rate": 9.455700222292792e-05,
"loss": 0.2603,
"step": 862
},
{
"epoch": 1.641464574417499,
"grad_norm": 0.44329899549484253,
"learning_rate": 9.455065100031757e-05,
"loss": 0.2921,
"step": 863
},
{
"epoch": 1.6433666191155494,
"grad_norm": 0.3798992931842804,
"learning_rate": 9.454429977770721e-05,
"loss": 0.2897,
"step": 864
},
{
"epoch": 1.6452686638135996,
"grad_norm": 0.38711193203926086,
"learning_rate": 9.453794855509686e-05,
"loss": 0.2791,
"step": 865
},
{
"epoch": 1.64717070851165,
"grad_norm": 0.3537624478340149,
"learning_rate": 9.453159733248651e-05,
"loss": 0.2207,
"step": 866
},
{
"epoch": 1.6490727532097005,
"grad_norm": 0.350455641746521,
"learning_rate": 9.452524610987615e-05,
"loss": 0.2595,
"step": 867
},
{
"epoch": 1.6509747979077507,
"grad_norm": 0.35781386494636536,
"learning_rate": 9.45188948872658e-05,
"loss": 0.2618,
"step": 868
},
{
"epoch": 1.6528768426058011,
"grad_norm": 0.4823295772075653,
"learning_rate": 9.451254366465546e-05,
"loss": 0.3174,
"step": 869
},
{
"epoch": 1.6547788873038516,
"grad_norm": 0.31698495149612427,
"learning_rate": 9.45061924420451e-05,
"loss": 0.2165,
"step": 870
},
{
"epoch": 1.656680932001902,
"grad_norm": 0.4576948583126068,
"learning_rate": 9.449984121943475e-05,
"loss": 0.2937,
"step": 871
},
{
"epoch": 1.6585829766999525,
"grad_norm": 0.4196888506412506,
"learning_rate": 9.44934899968244e-05,
"loss": 0.2876,
"step": 872
},
{
"epoch": 1.660485021398003,
"grad_norm": 0.48588597774505615,
"learning_rate": 9.448713877421405e-05,
"loss": 0.3433,
"step": 873
},
{
"epoch": 1.6623870660960534,
"grad_norm": 0.427946537733078,
"learning_rate": 9.448078755160369e-05,
"loss": 0.3184,
"step": 874
},
{
"epoch": 1.6642891107941038,
"grad_norm": 0.4138951897621155,
"learning_rate": 9.447443632899333e-05,
"loss": 0.2738,
"step": 875
},
{
"epoch": 1.666191155492154,
"grad_norm": 0.36560842394828796,
"learning_rate": 9.446808510638299e-05,
"loss": 0.3029,
"step": 876
},
{
"epoch": 1.6680932001902045,
"grad_norm": 0.42942315340042114,
"learning_rate": 9.446173388377263e-05,
"loss": 0.2888,
"step": 877
},
{
"epoch": 1.669995244888255,
"grad_norm": 0.21167854964733124,
"learning_rate": 9.445538266116227e-05,
"loss": 0.1919,
"step": 878
},
{
"epoch": 1.6718972895863051,
"grad_norm": 0.41339564323425293,
"learning_rate": 9.444903143855193e-05,
"loss": 0.2482,
"step": 879
},
{
"epoch": 1.6737993342843556,
"grad_norm": 0.47189727425575256,
"learning_rate": 9.444268021594157e-05,
"loss": 0.328,
"step": 880
},
{
"epoch": 1.675701378982406,
"grad_norm": 0.32868659496307373,
"learning_rate": 9.443632899333122e-05,
"loss": 0.1985,
"step": 881
},
{
"epoch": 1.6776034236804565,
"grad_norm": 0.3501724898815155,
"learning_rate": 9.442997777072086e-05,
"loss": 0.2733,
"step": 882
},
{
"epoch": 1.679505468378507,
"grad_norm": 0.37144583463668823,
"learning_rate": 9.442362654811051e-05,
"loss": 0.2293,
"step": 883
},
{
"epoch": 1.6814075130765573,
"grad_norm": 0.36318424344062805,
"learning_rate": 9.441727532550017e-05,
"loss": 0.3521,
"step": 884
},
{
"epoch": 1.6833095577746078,
"grad_norm": 0.4295286238193512,
"learning_rate": 9.44109241028898e-05,
"loss": 0.3113,
"step": 885
},
{
"epoch": 1.6852116024726582,
"grad_norm": 0.3312181830406189,
"learning_rate": 9.440457288027947e-05,
"loss": 0.2818,
"step": 886
},
{
"epoch": 1.6871136471707087,
"grad_norm": 0.3743634819984436,
"learning_rate": 9.439822165766911e-05,
"loss": 0.245,
"step": 887
},
{
"epoch": 1.6890156918687589,
"grad_norm": 0.5934861898422241,
"learning_rate": 9.439187043505875e-05,
"loss": 0.3654,
"step": 888
},
{
"epoch": 1.6909177365668093,
"grad_norm": 0.4149317741394043,
"learning_rate": 9.43855192124484e-05,
"loss": 0.2584,
"step": 889
},
{
"epoch": 1.6928197812648598,
"grad_norm": 0.40615764260292053,
"learning_rate": 9.437916798983805e-05,
"loss": 0.2986,
"step": 890
},
{
"epoch": 1.69472182596291,
"grad_norm": 0.37536385655403137,
"learning_rate": 9.43728167672277e-05,
"loss": 0.2813,
"step": 891
},
{
"epoch": 1.6966238706609604,
"grad_norm": 0.41415923833847046,
"learning_rate": 9.436646554461734e-05,
"loss": 0.3333,
"step": 892
},
{
"epoch": 1.6985259153590109,
"grad_norm": 0.30747082829475403,
"learning_rate": 9.436011432200699e-05,
"loss": 0.2143,
"step": 893
},
{
"epoch": 1.7004279600570613,
"grad_norm": 0.44593873620033264,
"learning_rate": 9.435376309939664e-05,
"loss": 0.2834,
"step": 894
},
{
"epoch": 1.7023300047551118,
"grad_norm": 0.3417704403400421,
"learning_rate": 9.434741187678628e-05,
"loss": 0.2265,
"step": 895
},
{
"epoch": 1.7042320494531622,
"grad_norm": 0.3436511754989624,
"learning_rate": 9.434106065417593e-05,
"loss": 0.249,
"step": 896
},
{
"epoch": 1.7061340941512126,
"grad_norm": 0.4569544494152069,
"learning_rate": 9.433470943156559e-05,
"loss": 0.3271,
"step": 897
},
{
"epoch": 1.708036138849263,
"grad_norm": 0.3883751630783081,
"learning_rate": 9.432835820895522e-05,
"loss": 0.2673,
"step": 898
},
{
"epoch": 1.7099381835473135,
"grad_norm": 0.3915776014328003,
"learning_rate": 9.432200698634488e-05,
"loss": 0.2313,
"step": 899
},
{
"epoch": 1.7118402282453637,
"grad_norm": 0.3450072407722473,
"learning_rate": 9.431565576373453e-05,
"loss": 0.2726,
"step": 900
},
{
"epoch": 1.7137422729434142,
"grad_norm": 0.3894912004470825,
"learning_rate": 9.430930454112417e-05,
"loss": 0.2607,
"step": 901
},
{
"epoch": 1.7156443176414644,
"grad_norm": 0.3509180545806885,
"learning_rate": 9.430295331851382e-05,
"loss": 0.2781,
"step": 902
},
{
"epoch": 1.7175463623395149,
"grad_norm": 0.5164948105812073,
"learning_rate": 9.429660209590347e-05,
"loss": 0.3619,
"step": 903
},
{
"epoch": 1.7194484070375653,
"grad_norm": 0.4074023962020874,
"learning_rate": 9.429025087329312e-05,
"loss": 0.3116,
"step": 904
},
{
"epoch": 1.7213504517356157,
"grad_norm": 0.4034394323825836,
"learning_rate": 9.428389965068276e-05,
"loss": 0.3155,
"step": 905
},
{
"epoch": 1.7232524964336662,
"grad_norm": 0.32292982935905457,
"learning_rate": 9.427754842807241e-05,
"loss": 0.2171,
"step": 906
},
{
"epoch": 1.7251545411317166,
"grad_norm": 0.368856817483902,
"learning_rate": 9.427119720546206e-05,
"loss": 0.3021,
"step": 907
},
{
"epoch": 1.727056585829767,
"grad_norm": 0.34953123331069946,
"learning_rate": 9.42648459828517e-05,
"loss": 0.2701,
"step": 908
},
{
"epoch": 1.7289586305278175,
"grad_norm": 0.37510743737220764,
"learning_rate": 9.425849476024135e-05,
"loss": 0.3216,
"step": 909
},
{
"epoch": 1.730860675225868,
"grad_norm": 0.31331393122673035,
"learning_rate": 9.4252143537631e-05,
"loss": 0.2855,
"step": 910
},
{
"epoch": 1.7327627199239182,
"grad_norm": 0.3806105852127075,
"learning_rate": 9.424579231502064e-05,
"loss": 0.3216,
"step": 911
},
{
"epoch": 1.7346647646219686,
"grad_norm": 0.3693408668041229,
"learning_rate": 9.42394410924103e-05,
"loss": 0.2473,
"step": 912
},
{
"epoch": 1.736566809320019,
"grad_norm": 0.2931939959526062,
"learning_rate": 9.423308986979993e-05,
"loss": 0.1873,
"step": 913
},
{
"epoch": 1.7384688540180693,
"grad_norm": 0.4330272972583771,
"learning_rate": 9.422673864718959e-05,
"loss": 0.3078,
"step": 914
},
{
"epoch": 1.7403708987161197,
"grad_norm": 0.4881534278392792,
"learning_rate": 9.422038742457924e-05,
"loss": 0.3771,
"step": 915
},
{
"epoch": 1.7422729434141702,
"grad_norm": 0.3158344328403473,
"learning_rate": 9.421403620196888e-05,
"loss": 0.2813,
"step": 916
},
{
"epoch": 1.7441749881122206,
"grad_norm": 0.4482041299343109,
"learning_rate": 9.420768497935854e-05,
"loss": 0.3872,
"step": 917
},
{
"epoch": 1.746077032810271,
"grad_norm": 0.3493407070636749,
"learning_rate": 9.420133375674818e-05,
"loss": 0.2284,
"step": 918
},
{
"epoch": 1.7479790775083215,
"grad_norm": 0.3753608763217926,
"learning_rate": 9.419498253413782e-05,
"loss": 0.254,
"step": 919
},
{
"epoch": 1.749881122206372,
"grad_norm": 0.4550943374633789,
"learning_rate": 9.418863131152747e-05,
"loss": 0.3073,
"step": 920
},
{
"epoch": 1.7517831669044224,
"grad_norm": 0.3239607810974121,
"learning_rate": 9.418228008891712e-05,
"loss": 0.2087,
"step": 921
},
{
"epoch": 1.7536852116024728,
"grad_norm": 0.4610382616519928,
"learning_rate": 9.417592886630677e-05,
"loss": 0.3104,
"step": 922
},
{
"epoch": 1.755587256300523,
"grad_norm": 0.4382965862751007,
"learning_rate": 9.416957764369641e-05,
"loss": 0.2583,
"step": 923
},
{
"epoch": 1.7574893009985735,
"grad_norm": 0.31299924850463867,
"learning_rate": 9.416322642108606e-05,
"loss": 0.2033,
"step": 924
},
{
"epoch": 1.759391345696624,
"grad_norm": 0.33872106671333313,
"learning_rate": 9.415687519847571e-05,
"loss": 0.2366,
"step": 925
},
{
"epoch": 1.7612933903946741,
"grad_norm": 0.33771976828575134,
"learning_rate": 9.415052397586535e-05,
"loss": 0.3062,
"step": 926
},
{
"epoch": 1.7631954350927246,
"grad_norm": 0.32810178399086,
"learning_rate": 9.4144172753255e-05,
"loss": 0.2264,
"step": 927
},
{
"epoch": 1.765097479790775,
"grad_norm": 0.41518697142601013,
"learning_rate": 9.413782153064466e-05,
"loss": 0.2747,
"step": 928
},
{
"epoch": 1.7669995244888255,
"grad_norm": 0.43647775053977966,
"learning_rate": 9.41314703080343e-05,
"loss": 0.3439,
"step": 929
},
{
"epoch": 1.768901569186876,
"grad_norm": 0.2905902564525604,
"learning_rate": 9.412511908542395e-05,
"loss": 0.2327,
"step": 930
},
{
"epoch": 1.7708036138849264,
"grad_norm": 0.38527336716651917,
"learning_rate": 9.41187678628136e-05,
"loss": 0.264,
"step": 931
},
{
"epoch": 1.7727056585829768,
"grad_norm": 0.4135185182094574,
"learning_rate": 9.411241664020324e-05,
"loss": 0.3075,
"step": 932
},
{
"epoch": 1.7746077032810272,
"grad_norm": 0.30278775095939636,
"learning_rate": 9.410606541759289e-05,
"loss": 0.1831,
"step": 933
},
{
"epoch": 1.7765097479790775,
"grad_norm": 0.3687085509300232,
"learning_rate": 9.409971419498254e-05,
"loss": 0.2862,
"step": 934
},
{
"epoch": 1.778411792677128,
"grad_norm": 0.3217594623565674,
"learning_rate": 9.409336297237219e-05,
"loss": 0.1975,
"step": 935
},
{
"epoch": 1.7803138373751783,
"grad_norm": 0.3583223223686218,
"learning_rate": 9.408701174976183e-05,
"loss": 0.2345,
"step": 936
},
{
"epoch": 1.7822158820732286,
"grad_norm": 0.4119435250759125,
"learning_rate": 9.408066052715148e-05,
"loss": 0.2916,
"step": 937
},
{
"epoch": 1.784117926771279,
"grad_norm": 0.400728315114975,
"learning_rate": 9.407430930454113e-05,
"loss": 0.4505,
"step": 938
},
{
"epoch": 1.7860199714693294,
"grad_norm": 0.3988611698150635,
"learning_rate": 9.406795808193077e-05,
"loss": 0.286,
"step": 939
},
{
"epoch": 1.78792201616738,
"grad_norm": 0.4544796347618103,
"learning_rate": 9.406160685932042e-05,
"loss": 0.3268,
"step": 940
},
{
"epoch": 1.7898240608654303,
"grad_norm": 0.3785744905471802,
"learning_rate": 9.405525563671008e-05,
"loss": 0.2532,
"step": 941
},
{
"epoch": 1.7917261055634808,
"grad_norm": 0.4459128975868225,
"learning_rate": 9.404890441409971e-05,
"loss": 0.3348,
"step": 942
},
{
"epoch": 1.7936281502615312,
"grad_norm": 0.3253449499607086,
"learning_rate": 9.404255319148937e-05,
"loss": 0.1945,
"step": 943
},
{
"epoch": 1.7955301949595817,
"grad_norm": 0.4977390468120575,
"learning_rate": 9.403620196887902e-05,
"loss": 0.3,
"step": 944
},
{
"epoch": 1.797432239657632,
"grad_norm": 0.46191859245300293,
"learning_rate": 9.402985074626867e-05,
"loss": 0.3638,
"step": 945
},
{
"epoch": 1.7993342843556823,
"grad_norm": 0.38492342829704285,
"learning_rate": 9.402349952365831e-05,
"loss": 0.2566,
"step": 946
},
{
"epoch": 1.8012363290537328,
"grad_norm": 0.34863540530204773,
"learning_rate": 9.401714830104795e-05,
"loss": 0.2321,
"step": 947
},
{
"epoch": 1.8031383737517832,
"grad_norm": 0.3839346766471863,
"learning_rate": 9.401079707843761e-05,
"loss": 0.2751,
"step": 948
},
{
"epoch": 1.8050404184498334,
"grad_norm": 0.36121171712875366,
"learning_rate": 9.400444585582725e-05,
"loss": 0.2492,
"step": 949
},
{
"epoch": 1.8069424631478839,
"grad_norm": 0.3479311466217041,
"learning_rate": 9.399809463321689e-05,
"loss": 0.2436,
"step": 950
},
{
"epoch": 1.8088445078459343,
"grad_norm": 0.35279884934425354,
"learning_rate": 9.399174341060655e-05,
"loss": 0.2718,
"step": 951
},
{
"epoch": 1.8107465525439848,
"grad_norm": 0.43152448534965515,
"learning_rate": 9.398539218799619e-05,
"loss": 0.2739,
"step": 952
},
{
"epoch": 1.8126485972420352,
"grad_norm": 0.3631283938884735,
"learning_rate": 9.397904096538584e-05,
"loss": 0.2239,
"step": 953
},
{
"epoch": 1.8145506419400856,
"grad_norm": 0.4698762595653534,
"learning_rate": 9.397268974277548e-05,
"loss": 0.3247,
"step": 954
},
{
"epoch": 1.816452686638136,
"grad_norm": 0.36629432439804077,
"learning_rate": 9.396633852016513e-05,
"loss": 0.2778,
"step": 955
},
{
"epoch": 1.8183547313361865,
"grad_norm": 0.34220409393310547,
"learning_rate": 9.395998729755479e-05,
"loss": 0.2466,
"step": 956
},
{
"epoch": 1.820256776034237,
"grad_norm": 0.3768969178199768,
"learning_rate": 9.395363607494442e-05,
"loss": 0.334,
"step": 957
},
{
"epoch": 1.8221588207322872,
"grad_norm": 0.2891027629375458,
"learning_rate": 9.394728485233409e-05,
"loss": 0.206,
"step": 958
},
{
"epoch": 1.8240608654303376,
"grad_norm": 0.2802363634109497,
"learning_rate": 9.394093362972373e-05,
"loss": 0.2566,
"step": 959
},
{
"epoch": 1.825962910128388,
"grad_norm": 0.38722601532936096,
"learning_rate": 9.393458240711337e-05,
"loss": 0.2615,
"step": 960
},
{
"epoch": 1.8278649548264383,
"grad_norm": 0.45663881301879883,
"learning_rate": 9.392823118450302e-05,
"loss": 0.3521,
"step": 961
},
{
"epoch": 1.8297669995244887,
"grad_norm": 0.36096152663230896,
"learning_rate": 9.392187996189267e-05,
"loss": 0.2429,
"step": 962
},
{
"epoch": 1.8316690442225392,
"grad_norm": 0.3237638473510742,
"learning_rate": 9.391552873928232e-05,
"loss": 0.2874,
"step": 963
},
{
"epoch": 1.8335710889205896,
"grad_norm": 0.379863440990448,
"learning_rate": 9.390917751667196e-05,
"loss": 0.2504,
"step": 964
},
{
"epoch": 1.83547313361864,
"grad_norm": 0.40816691517829895,
"learning_rate": 9.390282629406161e-05,
"loss": 0.2614,
"step": 965
},
{
"epoch": 1.8373751783166905,
"grad_norm": 0.38382720947265625,
"learning_rate": 9.389647507145126e-05,
"loss": 0.2282,
"step": 966
},
{
"epoch": 1.839277223014741,
"grad_norm": 0.328861266374588,
"learning_rate": 9.38901238488409e-05,
"loss": 0.1763,
"step": 967
},
{
"epoch": 1.8411792677127914,
"grad_norm": 0.3471934497356415,
"learning_rate": 9.388377262623055e-05,
"loss": 0.2348,
"step": 968
},
{
"epoch": 1.8430813124108416,
"grad_norm": 0.44112637639045715,
"learning_rate": 9.38774214036202e-05,
"loss": 0.3496,
"step": 969
},
{
"epoch": 1.844983357108892,
"grad_norm": 0.4357364773750305,
"learning_rate": 9.387107018100984e-05,
"loss": 0.2832,
"step": 970
},
{
"epoch": 1.8468854018069425,
"grad_norm": 0.4502738118171692,
"learning_rate": 9.38647189583995e-05,
"loss": 0.2862,
"step": 971
},
{
"epoch": 1.8487874465049927,
"grad_norm": 0.3577602505683899,
"learning_rate": 9.385836773578915e-05,
"loss": 0.2019,
"step": 972
},
{
"epoch": 1.8506894912030432,
"grad_norm": 0.36250707507133484,
"learning_rate": 9.385201651317879e-05,
"loss": 0.2936,
"step": 973
},
{
"epoch": 1.8525915359010936,
"grad_norm": 0.44027233123779297,
"learning_rate": 9.384566529056844e-05,
"loss": 0.3004,
"step": 974
},
{
"epoch": 1.854493580599144,
"grad_norm": 0.4500497877597809,
"learning_rate": 9.383931406795809e-05,
"loss": 0.3,
"step": 975
},
{
"epoch": 1.8563956252971945,
"grad_norm": 0.3777524530887604,
"learning_rate": 9.383296284534774e-05,
"loss": 0.2535,
"step": 976
},
{
"epoch": 1.858297669995245,
"grad_norm": 0.3377416431903839,
"learning_rate": 9.382661162273738e-05,
"loss": 0.2767,
"step": 977
},
{
"epoch": 1.8601997146932954,
"grad_norm": 0.34563374519348145,
"learning_rate": 9.382026040012702e-05,
"loss": 0.1923,
"step": 978
},
{
"epoch": 1.8621017593913458,
"grad_norm": 0.3025479018688202,
"learning_rate": 9.381390917751668e-05,
"loss": 0.2214,
"step": 979
},
{
"epoch": 1.8640038040893963,
"grad_norm": 0.3614577054977417,
"learning_rate": 9.380755795490632e-05,
"loss": 0.299,
"step": 980
},
{
"epoch": 1.8659058487874465,
"grad_norm": 0.34508028626441956,
"learning_rate": 9.380120673229597e-05,
"loss": 0.2201,
"step": 981
},
{
"epoch": 1.867807893485497,
"grad_norm": 0.33169567584991455,
"learning_rate": 9.379485550968563e-05,
"loss": 0.2298,
"step": 982
},
{
"epoch": 1.8697099381835474,
"grad_norm": 0.4361656904220581,
"learning_rate": 9.378850428707526e-05,
"loss": 0.3109,
"step": 983
},
{
"epoch": 1.8716119828815976,
"grad_norm": 0.3832654654979706,
"learning_rate": 9.378215306446492e-05,
"loss": 0.2877,
"step": 984
},
{
"epoch": 1.873514027579648,
"grad_norm": 0.3991541862487793,
"learning_rate": 9.377580184185455e-05,
"loss": 0.2755,
"step": 985
},
{
"epoch": 1.8754160722776985,
"grad_norm": 0.6057716012001038,
"learning_rate": 9.37694506192442e-05,
"loss": 0.3665,
"step": 986
},
{
"epoch": 1.877318116975749,
"grad_norm": 0.2887308895587921,
"learning_rate": 9.376309939663386e-05,
"loss": 0.2414,
"step": 987
},
{
"epoch": 1.8792201616737993,
"grad_norm": 0.28379005193710327,
"learning_rate": 9.37567481740235e-05,
"loss": 0.1895,
"step": 988
},
{
"epoch": 1.8811222063718498,
"grad_norm": 0.36071258783340454,
"learning_rate": 9.375039695141316e-05,
"loss": 0.2855,
"step": 989
},
{
"epoch": 1.8830242510699002,
"grad_norm": 0.3872823119163513,
"learning_rate": 9.37440457288028e-05,
"loss": 0.3112,
"step": 990
},
{
"epoch": 1.8849262957679507,
"grad_norm": 0.3761101961135864,
"learning_rate": 9.373769450619244e-05,
"loss": 0.2291,
"step": 991
},
{
"epoch": 1.886828340466001,
"grad_norm": 0.404000461101532,
"learning_rate": 9.373134328358209e-05,
"loss": 0.2349,
"step": 992
},
{
"epoch": 1.8887303851640513,
"grad_norm": 0.4787864089012146,
"learning_rate": 9.372499206097174e-05,
"loss": 0.3447,
"step": 993
},
{
"epoch": 1.8906324298621018,
"grad_norm": 0.4898964762687683,
"learning_rate": 9.37186408383614e-05,
"loss": 0.3306,
"step": 994
},
{
"epoch": 1.892534474560152,
"grad_norm": 0.3915330767631531,
"learning_rate": 9.371228961575103e-05,
"loss": 0.2896,
"step": 995
},
{
"epoch": 1.8944365192582024,
"grad_norm": 0.4643494486808777,
"learning_rate": 9.370593839314068e-05,
"loss": 0.3131,
"step": 996
},
{
"epoch": 1.8963385639562529,
"grad_norm": 0.39880135655403137,
"learning_rate": 9.369958717053034e-05,
"loss": 0.2598,
"step": 997
},
{
"epoch": 1.8982406086543033,
"grad_norm": 0.3153114318847656,
"learning_rate": 9.369323594791997e-05,
"loss": 0.2429,
"step": 998
},
{
"epoch": 1.9001426533523538,
"grad_norm": 0.4997500479221344,
"learning_rate": 9.368688472530963e-05,
"loss": 0.4179,
"step": 999
},
{
"epoch": 1.9020446980504042,
"grad_norm": 0.3919009566307068,
"learning_rate": 9.368053350269928e-05,
"loss": 0.2468,
"step": 1000
},
{
"epoch": 1.9039467427484547,
"grad_norm": 0.48444265127182007,
"learning_rate": 9.367418228008892e-05,
"loss": 0.3191,
"step": 1001
},
{
"epoch": 1.905848787446505,
"grad_norm": 0.38168856501579285,
"learning_rate": 9.366783105747857e-05,
"loss": 0.2658,
"step": 1002
},
{
"epoch": 1.9077508321445555,
"grad_norm": 0.47058162093162537,
"learning_rate": 9.366147983486822e-05,
"loss": 0.3392,
"step": 1003
},
{
"epoch": 1.9096528768426058,
"grad_norm": 0.40145471692085266,
"learning_rate": 9.365512861225786e-05,
"loss": 0.2619,
"step": 1004
},
{
"epoch": 1.9115549215406562,
"grad_norm": 0.6980530619621277,
"learning_rate": 9.364877738964751e-05,
"loss": 0.3111,
"step": 1005
},
{
"epoch": 1.9134569662387066,
"grad_norm": 0.35878410935401917,
"learning_rate": 9.364242616703716e-05,
"loss": 0.3026,
"step": 1006
},
{
"epoch": 1.9153590109367569,
"grad_norm": 0.3291071653366089,
"learning_rate": 9.363607494442681e-05,
"loss": 0.2813,
"step": 1007
},
{
"epoch": 1.9172610556348073,
"grad_norm": 0.4286592900753021,
"learning_rate": 9.362972372181645e-05,
"loss": 0.2921,
"step": 1008
},
{
"epoch": 1.9191631003328578,
"grad_norm": 0.2965177893638611,
"learning_rate": 9.36233724992061e-05,
"loss": 0.2373,
"step": 1009
},
{
"epoch": 1.9210651450309082,
"grad_norm": 0.3153838515281677,
"learning_rate": 9.361702127659576e-05,
"loss": 0.2195,
"step": 1010
},
{
"epoch": 1.9229671897289586,
"grad_norm": 0.4827108085155487,
"learning_rate": 9.36106700539854e-05,
"loss": 0.3127,
"step": 1011
},
{
"epoch": 1.924869234427009,
"grad_norm": 0.43089860677719116,
"learning_rate": 9.360431883137505e-05,
"loss": 0.2687,
"step": 1012
},
{
"epoch": 1.9267712791250595,
"grad_norm": 0.43147915601730347,
"learning_rate": 9.35979676087647e-05,
"loss": 0.3953,
"step": 1013
},
{
"epoch": 1.92867332382311,
"grad_norm": 0.37924453616142273,
"learning_rate": 9.359161638615434e-05,
"loss": 0.2522,
"step": 1014
},
{
"epoch": 1.9305753685211604,
"grad_norm": 0.34664931893348694,
"learning_rate": 9.358526516354399e-05,
"loss": 0.2048,
"step": 1015
},
{
"epoch": 1.9324774132192106,
"grad_norm": 0.2877664566040039,
"learning_rate": 9.357891394093364e-05,
"loss": 0.1794,
"step": 1016
},
{
"epoch": 1.934379457917261,
"grad_norm": 0.4924784302711487,
"learning_rate": 9.357256271832329e-05,
"loss": 0.2737,
"step": 1017
},
{
"epoch": 1.9362815026153115,
"grad_norm": 0.36828553676605225,
"learning_rate": 9.356621149571293e-05,
"loss": 0.2761,
"step": 1018
},
{
"epoch": 1.9381835473133617,
"grad_norm": 0.355372816324234,
"learning_rate": 9.355986027310257e-05,
"loss": 0.2647,
"step": 1019
},
{
"epoch": 1.9400855920114122,
"grad_norm": 0.37469297647476196,
"learning_rate": 9.355350905049223e-05,
"loss": 0.2347,
"step": 1020
},
{
"epoch": 1.9419876367094626,
"grad_norm": 0.44890064001083374,
"learning_rate": 9.354715782788187e-05,
"loss": 0.2581,
"step": 1021
},
{
"epoch": 1.943889681407513,
"grad_norm": 0.355234295129776,
"learning_rate": 9.354080660527151e-05,
"loss": 0.2467,
"step": 1022
},
{
"epoch": 1.9457917261055635,
"grad_norm": 0.463871568441391,
"learning_rate": 9.353445538266116e-05,
"loss": 0.2338,
"step": 1023
},
{
"epoch": 1.947693770803614,
"grad_norm": 0.38206830620765686,
"learning_rate": 9.352810416005081e-05,
"loss": 0.2353,
"step": 1024
},
{
"epoch": 1.9495958155016644,
"grad_norm": 0.37627413868904114,
"learning_rate": 9.352175293744047e-05,
"loss": 0.2375,
"step": 1025
},
{
"epoch": 1.9514978601997148,
"grad_norm": 0.4191925823688507,
"learning_rate": 9.35154017148301e-05,
"loss": 0.2444,
"step": 1026
},
{
"epoch": 1.953399904897765,
"grad_norm": 0.41149812936782837,
"learning_rate": 9.350905049221976e-05,
"loss": 0.2905,
"step": 1027
},
{
"epoch": 1.9553019495958155,
"grad_norm": 0.329313725233078,
"learning_rate": 9.350269926960941e-05,
"loss": 0.2293,
"step": 1028
},
{
"epoch": 1.957203994293866,
"grad_norm": 0.4160427749156952,
"learning_rate": 9.349634804699905e-05,
"loss": 0.2512,
"step": 1029
},
{
"epoch": 1.9591060389919162,
"grad_norm": 0.4005848467350006,
"learning_rate": 9.34899968243887e-05,
"loss": 0.2446,
"step": 1030
},
{
"epoch": 1.9610080836899666,
"grad_norm": 0.4497627019882202,
"learning_rate": 9.348364560177835e-05,
"loss": 0.3265,
"step": 1031
},
{
"epoch": 1.962910128388017,
"grad_norm": 0.4275449216365814,
"learning_rate": 9.347729437916799e-05,
"loss": 0.302,
"step": 1032
},
{
"epoch": 1.9648121730860675,
"grad_norm": 0.33947649598121643,
"learning_rate": 9.347094315655764e-05,
"loss": 0.1903,
"step": 1033
},
{
"epoch": 1.966714217784118,
"grad_norm": 0.38422051072120667,
"learning_rate": 9.346459193394729e-05,
"loss": 0.2595,
"step": 1034
},
{
"epoch": 1.9686162624821684,
"grad_norm": 0.35371389985084534,
"learning_rate": 9.345824071133694e-05,
"loss": 0.2284,
"step": 1035
},
{
"epoch": 1.9705183071802188,
"grad_norm": 0.38803884387016296,
"learning_rate": 9.345188948872658e-05,
"loss": 0.3021,
"step": 1036
},
{
"epoch": 1.9724203518782693,
"grad_norm": 0.38203269243240356,
"learning_rate": 9.344553826611623e-05,
"loss": 0.2863,
"step": 1037
},
{
"epoch": 1.9743223965763197,
"grad_norm": 0.3267860412597656,
"learning_rate": 9.343918704350588e-05,
"loss": 0.226,
"step": 1038
},
{
"epoch": 1.97622444127437,
"grad_norm": 0.39556884765625,
"learning_rate": 9.343283582089552e-05,
"loss": 0.2727,
"step": 1039
},
{
"epoch": 1.9781264859724204,
"grad_norm": 0.4278768301010132,
"learning_rate": 9.342648459828517e-05,
"loss": 0.2723,
"step": 1040
},
{
"epoch": 1.9800285306704708,
"grad_norm": 0.37279701232910156,
"learning_rate": 9.342013337567483e-05,
"loss": 0.2685,
"step": 1041
},
{
"epoch": 1.981930575368521,
"grad_norm": 0.4421425759792328,
"learning_rate": 9.341378215306447e-05,
"loss": 0.2793,
"step": 1042
},
{
"epoch": 1.9838326200665715,
"grad_norm": 0.4341887831687927,
"learning_rate": 9.340743093045412e-05,
"loss": 0.2752,
"step": 1043
},
{
"epoch": 1.985734664764622,
"grad_norm": 0.42935600876808167,
"learning_rate": 9.340107970784377e-05,
"loss": 0.3127,
"step": 1044
},
{
"epoch": 1.9876367094626723,
"grad_norm": 0.29476839303970337,
"learning_rate": 9.339472848523341e-05,
"loss": 0.1855,
"step": 1045
},
{
"epoch": 1.9895387541607228,
"grad_norm": 0.43286338448524475,
"learning_rate": 9.338837726262306e-05,
"loss": 0.3109,
"step": 1046
},
{
"epoch": 1.9914407988587732,
"grad_norm": 0.35097062587738037,
"learning_rate": 9.338202604001271e-05,
"loss": 0.2178,
"step": 1047
},
{
"epoch": 1.9933428435568237,
"grad_norm": 0.3497145175933838,
"learning_rate": 9.337567481740236e-05,
"loss": 0.2372,
"step": 1048
},
{
"epoch": 1.9952448882548741,
"grad_norm": 0.4399060904979706,
"learning_rate": 9.3369323594792e-05,
"loss": 0.3065,
"step": 1049
},
{
"epoch": 1.9971469329529246,
"grad_norm": 0.43642693758010864,
"learning_rate": 9.336297237218164e-05,
"loss": 0.3099,
"step": 1050
},
{
"epoch": 1.9990489776509748,
"grad_norm": 0.42969372868537903,
"learning_rate": 9.33566211495713e-05,
"loss": 0.2899,
"step": 1051
},
{
"epoch": 2.000951022349025,
"grad_norm": 0.324709951877594,
"learning_rate": 9.335026992696094e-05,
"loss": 0.1977,
"step": 1052
},
{
"epoch": 2.0028530670470754,
"grad_norm": 0.2254759967327118,
"learning_rate": 9.33439187043506e-05,
"loss": 0.1513,
"step": 1053
},
{
"epoch": 2.004755111745126,
"grad_norm": 0.29324305057525635,
"learning_rate": 9.333756748174025e-05,
"loss": 0.1739,
"step": 1054
},
{
"epoch": 2.0066571564431763,
"grad_norm": 0.2934301495552063,
"learning_rate": 9.333121625912988e-05,
"loss": 0.1788,
"step": 1055
},
{
"epoch": 2.0085592011412268,
"grad_norm": 0.3355758786201477,
"learning_rate": 9.332486503651954e-05,
"loss": 0.1829,
"step": 1056
},
{
"epoch": 2.010461245839277,
"grad_norm": 0.4047424793243408,
"learning_rate": 9.331851381390917e-05,
"loss": 0.2256,
"step": 1057
},
{
"epoch": 2.0123632905373277,
"grad_norm": 0.38155117630958557,
"learning_rate": 9.331216259129883e-05,
"loss": 0.1992,
"step": 1058
},
{
"epoch": 2.014265335235378,
"grad_norm": 0.4122423827648163,
"learning_rate": 9.330581136868848e-05,
"loss": 0.2222,
"step": 1059
},
{
"epoch": 2.0161673799334285,
"grad_norm": 0.4098420739173889,
"learning_rate": 9.329946014607812e-05,
"loss": 0.1495,
"step": 1060
},
{
"epoch": 2.018069424631479,
"grad_norm": 0.37494683265686035,
"learning_rate": 9.329310892346778e-05,
"loss": 0.1955,
"step": 1061
},
{
"epoch": 2.0199714693295294,
"grad_norm": 0.4210919439792633,
"learning_rate": 9.328675770085742e-05,
"loss": 0.1851,
"step": 1062
},
{
"epoch": 2.02187351402758,
"grad_norm": 0.415770560503006,
"learning_rate": 9.328040647824706e-05,
"loss": 0.209,
"step": 1063
},
{
"epoch": 2.02377555872563,
"grad_norm": 0.38957807421684265,
"learning_rate": 9.327405525563671e-05,
"loss": 0.1597,
"step": 1064
},
{
"epoch": 2.0256776034236803,
"grad_norm": 0.3568849563598633,
"learning_rate": 9.326770403302636e-05,
"loss": 0.1564,
"step": 1065
},
{
"epoch": 2.0275796481217307,
"grad_norm": 0.4151419699192047,
"learning_rate": 9.326135281041601e-05,
"loss": 0.2213,
"step": 1066
},
{
"epoch": 2.029481692819781,
"grad_norm": 0.437418669462204,
"learning_rate": 9.325500158780565e-05,
"loss": 0.2091,
"step": 1067
},
{
"epoch": 2.0313837375178316,
"grad_norm": 0.45977523922920227,
"learning_rate": 9.32486503651953e-05,
"loss": 0.2044,
"step": 1068
},
{
"epoch": 2.033285782215882,
"grad_norm": 0.3634967803955078,
"learning_rate": 9.324229914258496e-05,
"loss": 0.1575,
"step": 1069
},
{
"epoch": 2.0351878269139325,
"grad_norm": 0.4348776638507843,
"learning_rate": 9.32359479199746e-05,
"loss": 0.1892,
"step": 1070
},
{
"epoch": 2.037089871611983,
"grad_norm": 0.39220520853996277,
"learning_rate": 9.322959669736425e-05,
"loss": 0.1962,
"step": 1071
},
{
"epoch": 2.0389919163100334,
"grad_norm": 0.4379669725894928,
"learning_rate": 9.32232454747539e-05,
"loss": 0.2201,
"step": 1072
},
{
"epoch": 2.040893961008084,
"grad_norm": 0.31880828738212585,
"learning_rate": 9.321689425214354e-05,
"loss": 0.1471,
"step": 1073
},
{
"epoch": 2.0427960057061343,
"grad_norm": 0.31966346502304077,
"learning_rate": 9.321054302953319e-05,
"loss": 0.1688,
"step": 1074
},
{
"epoch": 2.0446980504041843,
"grad_norm": 0.38291382789611816,
"learning_rate": 9.320419180692284e-05,
"loss": 0.1797,
"step": 1075
},
{
"epoch": 2.0466000951022347,
"grad_norm": 0.3871828615665436,
"learning_rate": 9.319784058431248e-05,
"loss": 0.2201,
"step": 1076
},
{
"epoch": 2.048502139800285,
"grad_norm": 0.35201162099838257,
"learning_rate": 9.319148936170213e-05,
"loss": 0.1759,
"step": 1077
},
{
"epoch": 2.0504041844983356,
"grad_norm": 0.32999902963638306,
"learning_rate": 9.318513813909178e-05,
"loss": 0.1676,
"step": 1078
},
{
"epoch": 2.052306229196386,
"grad_norm": 0.38137802481651306,
"learning_rate": 9.317878691648143e-05,
"loss": 0.181,
"step": 1079
},
{
"epoch": 2.0542082738944365,
"grad_norm": 0.28507858514785767,
"learning_rate": 9.317243569387107e-05,
"loss": 0.1333,
"step": 1080
},
{
"epoch": 2.056110318592487,
"grad_norm": 0.511489987373352,
"learning_rate": 9.316608447126071e-05,
"loss": 0.271,
"step": 1081
},
{
"epoch": 2.0580123632905374,
"grad_norm": 0.37042170763015747,
"learning_rate": 9.315973324865038e-05,
"loss": 0.2733,
"step": 1082
},
{
"epoch": 2.059914407988588,
"grad_norm": 0.3986508548259735,
"learning_rate": 9.315338202604001e-05,
"loss": 0.1964,
"step": 1083
},
{
"epoch": 2.0618164526866383,
"grad_norm": 0.37804266810417175,
"learning_rate": 9.314703080342967e-05,
"loss": 0.1601,
"step": 1084
},
{
"epoch": 2.0637184973846887,
"grad_norm": 0.32077136635780334,
"learning_rate": 9.314067958081932e-05,
"loss": 0.1462,
"step": 1085
},
{
"epoch": 2.065620542082739,
"grad_norm": 0.2813294231891632,
"learning_rate": 9.313432835820896e-05,
"loss": 0.1321,
"step": 1086
},
{
"epoch": 2.067522586780789,
"grad_norm": 0.40840163826942444,
"learning_rate": 9.312797713559861e-05,
"loss": 0.1892,
"step": 1087
},
{
"epoch": 2.0694246314788396,
"grad_norm": 0.3264133334159851,
"learning_rate": 9.312162591298825e-05,
"loss": 0.1415,
"step": 1088
},
{
"epoch": 2.07132667617689,
"grad_norm": 0.4274674952030182,
"learning_rate": 9.311527469037791e-05,
"loss": 0.1813,
"step": 1089
},
{
"epoch": 2.0732287208749405,
"grad_norm": 0.37283292412757874,
"learning_rate": 9.310892346776755e-05,
"loss": 0.1753,
"step": 1090
},
{
"epoch": 2.075130765572991,
"grad_norm": 0.32638901472091675,
"learning_rate": 9.310257224515719e-05,
"loss": 0.1731,
"step": 1091
},
{
"epoch": 2.0770328102710414,
"grad_norm": 0.3295043408870697,
"learning_rate": 9.309622102254685e-05,
"loss": 0.1934,
"step": 1092
},
{
"epoch": 2.078934854969092,
"grad_norm": 0.34605681896209717,
"learning_rate": 9.308986979993649e-05,
"loss": 0.2556,
"step": 1093
},
{
"epoch": 2.0808368996671422,
"grad_norm": 0.35646018385887146,
"learning_rate": 9.308351857732613e-05,
"loss": 0.1508,
"step": 1094
},
{
"epoch": 2.0827389443651927,
"grad_norm": 0.3224691152572632,
"learning_rate": 9.307716735471578e-05,
"loss": 0.1592,
"step": 1095
},
{
"epoch": 2.084640989063243,
"grad_norm": 0.3692566156387329,
"learning_rate": 9.307081613210543e-05,
"loss": 0.1555,
"step": 1096
},
{
"epoch": 2.0865430337612936,
"grad_norm": 0.46436119079589844,
"learning_rate": 9.306446490949509e-05,
"loss": 0.2176,
"step": 1097
},
{
"epoch": 2.088445078459344,
"grad_norm": 0.3176686465740204,
"learning_rate": 9.305811368688472e-05,
"loss": 0.1763,
"step": 1098
},
{
"epoch": 2.090347123157394,
"grad_norm": 0.29192522168159485,
"learning_rate": 9.305176246427438e-05,
"loss": 0.1485,
"step": 1099
},
{
"epoch": 2.0922491678554445,
"grad_norm": 0.34905532002449036,
"learning_rate": 9.304541124166403e-05,
"loss": 0.1657,
"step": 1100
},
{
"epoch": 2.094151212553495,
"grad_norm": 0.4198562800884247,
"learning_rate": 9.303906001905367e-05,
"loss": 0.2077,
"step": 1101
},
{
"epoch": 2.0960532572515453,
"grad_norm": 0.35974305868148804,
"learning_rate": 9.303270879644332e-05,
"loss": 0.1776,
"step": 1102
},
{
"epoch": 2.097955301949596,
"grad_norm": 0.35371047258377075,
"learning_rate": 9.302635757383297e-05,
"loss": 0.1887,
"step": 1103
},
{
"epoch": 2.0998573466476462,
"grad_norm": 0.30068957805633545,
"learning_rate": 9.302000635122261e-05,
"loss": 0.14,
"step": 1104
},
{
"epoch": 2.1017593913456967,
"grad_norm": 0.31092819571495056,
"learning_rate": 9.301365512861226e-05,
"loss": 0.1603,
"step": 1105
},
{
"epoch": 2.103661436043747,
"grad_norm": 0.3615265190601349,
"learning_rate": 9.300730390600191e-05,
"loss": 0.1791,
"step": 1106
},
{
"epoch": 2.1055634807417976,
"grad_norm": 0.2767830491065979,
"learning_rate": 9.300095268339156e-05,
"loss": 0.1243,
"step": 1107
},
{
"epoch": 2.107465525439848,
"grad_norm": 0.36988285183906555,
"learning_rate": 9.29946014607812e-05,
"loss": 0.1619,
"step": 1108
},
{
"epoch": 2.1093675701378984,
"grad_norm": 0.6014404892921448,
"learning_rate": 9.298825023817085e-05,
"loss": 0.2635,
"step": 1109
},
{
"epoch": 2.1112696148359484,
"grad_norm": 0.3621249794960022,
"learning_rate": 9.29818990155605e-05,
"loss": 0.1749,
"step": 1110
},
{
"epoch": 2.113171659533999,
"grad_norm": 0.2977392077445984,
"learning_rate": 9.297554779295014e-05,
"loss": 0.1582,
"step": 1111
},
{
"epoch": 2.1150737042320493,
"grad_norm": 0.3253994286060333,
"learning_rate": 9.29691965703398e-05,
"loss": 0.1787,
"step": 1112
},
{
"epoch": 2.1169757489300998,
"grad_norm": 0.34662213921546936,
"learning_rate": 9.296284534772945e-05,
"loss": 0.1923,
"step": 1113
},
{
"epoch": 2.11887779362815,
"grad_norm": 0.416458398103714,
"learning_rate": 9.295649412511909e-05,
"loss": 0.1941,
"step": 1114
},
{
"epoch": 2.1207798383262007,
"grad_norm": 0.36649563908576965,
"learning_rate": 9.295014290250874e-05,
"loss": 0.2233,
"step": 1115
},
{
"epoch": 2.122681883024251,
"grad_norm": 0.3445313274860382,
"learning_rate": 9.294379167989839e-05,
"loss": 0.1701,
"step": 1116
},
{
"epoch": 2.1245839277223015,
"grad_norm": 0.38747549057006836,
"learning_rate": 9.293744045728803e-05,
"loss": 0.1707,
"step": 1117
},
{
"epoch": 2.126485972420352,
"grad_norm": 0.4027896225452423,
"learning_rate": 9.293108923467768e-05,
"loss": 0.2086,
"step": 1118
},
{
"epoch": 2.1283880171184024,
"grad_norm": 0.3629845976829529,
"learning_rate": 9.292473801206733e-05,
"loss": 0.1743,
"step": 1119
},
{
"epoch": 2.130290061816453,
"grad_norm": 0.39419326186180115,
"learning_rate": 9.291838678945698e-05,
"loss": 0.1907,
"step": 1120
},
{
"epoch": 2.132192106514503,
"grad_norm": 0.36944523453712463,
"learning_rate": 9.291203556684662e-05,
"loss": 0.1631,
"step": 1121
},
{
"epoch": 2.1340941512125533,
"grad_norm": 0.4214774966239929,
"learning_rate": 9.290568434423626e-05,
"loss": 0.2397,
"step": 1122
},
{
"epoch": 2.1359961959106037,
"grad_norm": 0.3092084228992462,
"learning_rate": 9.289933312162593e-05,
"loss": 0.1396,
"step": 1123
},
{
"epoch": 2.137898240608654,
"grad_norm": 0.3649998605251312,
"learning_rate": 9.289298189901556e-05,
"loss": 0.1677,
"step": 1124
},
{
"epoch": 2.1398002853067046,
"grad_norm": 0.4131282567977905,
"learning_rate": 9.288663067640522e-05,
"loss": 0.2049,
"step": 1125
},
{
"epoch": 2.141702330004755,
"grad_norm": 0.4324544668197632,
"learning_rate": 9.288027945379485e-05,
"loss": 0.1757,
"step": 1126
},
{
"epoch": 2.1436043747028055,
"grad_norm": 0.4258798658847809,
"learning_rate": 9.28739282311845e-05,
"loss": 0.199,
"step": 1127
},
{
"epoch": 2.145506419400856,
"grad_norm": 0.4244062602519989,
"learning_rate": 9.286757700857416e-05,
"loss": 0.2006,
"step": 1128
},
{
"epoch": 2.1474084640989064,
"grad_norm": 0.4003104865550995,
"learning_rate": 9.28612257859638e-05,
"loss": 0.2098,
"step": 1129
},
{
"epoch": 2.149310508796957,
"grad_norm": 0.36191633343696594,
"learning_rate": 9.285487456335345e-05,
"loss": 0.1821,
"step": 1130
},
{
"epoch": 2.1512125534950073,
"grad_norm": 0.47675448656082153,
"learning_rate": 9.28485233407431e-05,
"loss": 0.2083,
"step": 1131
},
{
"epoch": 2.1531145981930577,
"grad_norm": 0.4418546259403229,
"learning_rate": 9.284217211813274e-05,
"loss": 0.2228,
"step": 1132
},
{
"epoch": 2.155016642891108,
"grad_norm": 0.31201982498168945,
"learning_rate": 9.283582089552239e-05,
"loss": 0.1326,
"step": 1133
},
{
"epoch": 2.156918687589158,
"grad_norm": 0.30012449622154236,
"learning_rate": 9.282946967291204e-05,
"loss": 0.1376,
"step": 1134
},
{
"epoch": 2.1588207322872086,
"grad_norm": 0.3705848455429077,
"learning_rate": 9.282311845030168e-05,
"loss": 0.1719,
"step": 1135
},
{
"epoch": 2.160722776985259,
"grad_norm": 0.4028238356113434,
"learning_rate": 9.281676722769133e-05,
"loss": 0.178,
"step": 1136
},
{
"epoch": 2.1626248216833095,
"grad_norm": 0.38973838090896606,
"learning_rate": 9.281041600508098e-05,
"loss": 0.1875,
"step": 1137
},
{
"epoch": 2.16452686638136,
"grad_norm": 0.3756285309791565,
"learning_rate": 9.280406478247064e-05,
"loss": 0.1883,
"step": 1138
},
{
"epoch": 2.1664289110794104,
"grad_norm": 0.2721819579601288,
"learning_rate": 9.279771355986027e-05,
"loss": 0.1468,
"step": 1139
},
{
"epoch": 2.168330955777461,
"grad_norm": 0.34547916054725647,
"learning_rate": 9.279136233724993e-05,
"loss": 0.2043,
"step": 1140
},
{
"epoch": 2.1702330004755113,
"grad_norm": 0.44819575548171997,
"learning_rate": 9.278501111463958e-05,
"loss": 0.2029,
"step": 1141
},
{
"epoch": 2.1721350451735617,
"grad_norm": 0.36632853746414185,
"learning_rate": 9.277865989202922e-05,
"loss": 0.1884,
"step": 1142
},
{
"epoch": 2.174037089871612,
"grad_norm": 0.37020185589790344,
"learning_rate": 9.277230866941887e-05,
"loss": 0.1819,
"step": 1143
},
{
"epoch": 2.1759391345696626,
"grad_norm": 0.4174460470676422,
"learning_rate": 9.276595744680852e-05,
"loss": 0.1918,
"step": 1144
},
{
"epoch": 2.1778411792677126,
"grad_norm": 0.4120714068412781,
"learning_rate": 9.275960622419816e-05,
"loss": 0.2496,
"step": 1145
},
{
"epoch": 2.179743223965763,
"grad_norm": 0.4350152909755707,
"learning_rate": 9.275325500158781e-05,
"loss": 0.1981,
"step": 1146
},
{
"epoch": 2.1816452686638135,
"grad_norm": 0.35637348890304565,
"learning_rate": 9.274690377897746e-05,
"loss": 0.1639,
"step": 1147
},
{
"epoch": 2.183547313361864,
"grad_norm": 0.34323298931121826,
"learning_rate": 9.27405525563671e-05,
"loss": 0.1761,
"step": 1148
},
{
"epoch": 2.1854493580599144,
"grad_norm": 0.30730780959129333,
"learning_rate": 9.273420133375675e-05,
"loss": 0.1623,
"step": 1149
},
{
"epoch": 2.187351402757965,
"grad_norm": 0.32239773869514465,
"learning_rate": 9.27278501111464e-05,
"loss": 0.1238,
"step": 1150
},
{
"epoch": 2.1892534474560152,
"grad_norm": 0.35441848635673523,
"learning_rate": 9.272149888853606e-05,
"loss": 0.1578,
"step": 1151
},
{
"epoch": 2.1911554921540657,
"grad_norm": 0.33287835121154785,
"learning_rate": 9.27151476659257e-05,
"loss": 0.1726,
"step": 1152
},
{
"epoch": 2.193057536852116,
"grad_norm": 0.3281983435153961,
"learning_rate": 9.270879644331533e-05,
"loss": 0.1435,
"step": 1153
},
{
"epoch": 2.1949595815501666,
"grad_norm": 0.31831398606300354,
"learning_rate": 9.2702445220705e-05,
"loss": 0.1585,
"step": 1154
},
{
"epoch": 2.196861626248217,
"grad_norm": 0.43460169434547424,
"learning_rate": 9.269609399809464e-05,
"loss": 0.2121,
"step": 1155
},
{
"epoch": 2.198763670946267,
"grad_norm": 0.3470516502857208,
"learning_rate": 9.268974277548429e-05,
"loss": 0.157,
"step": 1156
},
{
"epoch": 2.2006657156443175,
"grad_norm": 0.3971126079559326,
"learning_rate": 9.268339155287394e-05,
"loss": 0.1738,
"step": 1157
},
{
"epoch": 2.202567760342368,
"grad_norm": 0.39526277780532837,
"learning_rate": 9.267704033026358e-05,
"loss": 0.2117,
"step": 1158
},
{
"epoch": 2.2044698050404183,
"grad_norm": 0.31649425625801086,
"learning_rate": 9.267068910765323e-05,
"loss": 0.1966,
"step": 1159
},
{
"epoch": 2.206371849738469,
"grad_norm": 0.4104944169521332,
"learning_rate": 9.266433788504287e-05,
"loss": 0.2178,
"step": 1160
},
{
"epoch": 2.2082738944365192,
"grad_norm": 0.3751467168331146,
"learning_rate": 9.265798666243253e-05,
"loss": 0.1921,
"step": 1161
},
{
"epoch": 2.2101759391345697,
"grad_norm": 0.3348170816898346,
"learning_rate": 9.265163543982217e-05,
"loss": 0.1533,
"step": 1162
},
{
"epoch": 2.21207798383262,
"grad_norm": 0.39907872676849365,
"learning_rate": 9.264528421721181e-05,
"loss": 0.1733,
"step": 1163
},
{
"epoch": 2.2139800285306706,
"grad_norm": 0.45442381501197815,
"learning_rate": 9.263893299460147e-05,
"loss": 0.2065,
"step": 1164
},
{
"epoch": 2.215882073228721,
"grad_norm": 0.37475696206092834,
"learning_rate": 9.263258177199111e-05,
"loss": 0.1914,
"step": 1165
},
{
"epoch": 2.2177841179267714,
"grad_norm": 0.3757840394973755,
"learning_rate": 9.262623054938075e-05,
"loss": 0.1781,
"step": 1166
},
{
"epoch": 2.219686162624822,
"grad_norm": 0.3655502200126648,
"learning_rate": 9.26198793267704e-05,
"loss": 0.1814,
"step": 1167
},
{
"epoch": 2.2215882073228723,
"grad_norm": 0.4219561219215393,
"learning_rate": 9.261352810416006e-05,
"loss": 0.213,
"step": 1168
},
{
"epoch": 2.2234902520209223,
"grad_norm": 0.3741750419139862,
"learning_rate": 9.260717688154971e-05,
"loss": 0.1782,
"step": 1169
},
{
"epoch": 2.2253922967189728,
"grad_norm": 0.37189987301826477,
"learning_rate": 9.260082565893935e-05,
"loss": 0.1783,
"step": 1170
},
{
"epoch": 2.227294341417023,
"grad_norm": 0.2988317608833313,
"learning_rate": 9.2594474436329e-05,
"loss": 0.1481,
"step": 1171
},
{
"epoch": 2.2291963861150736,
"grad_norm": 0.38000479340553284,
"learning_rate": 9.258812321371865e-05,
"loss": 0.1843,
"step": 1172
},
{
"epoch": 2.231098430813124,
"grad_norm": 0.30989545583724976,
"learning_rate": 9.258177199110829e-05,
"loss": 0.1487,
"step": 1173
},
{
"epoch": 2.2330004755111745,
"grad_norm": 0.27984580397605896,
"learning_rate": 9.257542076849794e-05,
"loss": 0.1445,
"step": 1174
},
{
"epoch": 2.234902520209225,
"grad_norm": 0.3828918933868408,
"learning_rate": 9.256906954588759e-05,
"loss": 0.1709,
"step": 1175
},
{
"epoch": 2.2368045649072754,
"grad_norm": 0.33677807450294495,
"learning_rate": 9.256271832327723e-05,
"loss": 0.1656,
"step": 1176
},
{
"epoch": 2.238706609605326,
"grad_norm": 0.37769967317581177,
"learning_rate": 9.255636710066688e-05,
"loss": 0.2101,
"step": 1177
},
{
"epoch": 2.2406086543033763,
"grad_norm": 0.3978733420372009,
"learning_rate": 9.255001587805653e-05,
"loss": 0.215,
"step": 1178
},
{
"epoch": 2.2425106990014267,
"grad_norm": 0.3774537146091461,
"learning_rate": 9.254366465544618e-05,
"loss": 0.1778,
"step": 1179
},
{
"epoch": 2.2444127436994767,
"grad_norm": 0.4117525815963745,
"learning_rate": 9.253731343283582e-05,
"loss": 0.1801,
"step": 1180
},
{
"epoch": 2.246314788397527,
"grad_norm": 0.41460955142974854,
"learning_rate": 9.253096221022547e-05,
"loss": 0.1939,
"step": 1181
},
{
"epoch": 2.2482168330955776,
"grad_norm": 0.41124284267425537,
"learning_rate": 9.252461098761513e-05,
"loss": 0.1944,
"step": 1182
},
{
"epoch": 2.250118877793628,
"grad_norm": 0.39252787828445435,
"learning_rate": 9.251825976500476e-05,
"loss": 0.2037,
"step": 1183
},
{
"epoch": 2.2520209224916785,
"grad_norm": 0.4118300676345825,
"learning_rate": 9.25119085423944e-05,
"loss": 0.2067,
"step": 1184
},
{
"epoch": 2.253922967189729,
"grad_norm": 0.43823009729385376,
"learning_rate": 9.250555731978407e-05,
"loss": 0.2093,
"step": 1185
},
{
"epoch": 2.2558250118877794,
"grad_norm": 0.41397175192832947,
"learning_rate": 9.249920609717371e-05,
"loss": 0.195,
"step": 1186
},
{
"epoch": 2.25772705658583,
"grad_norm": 0.4286901652812958,
"learning_rate": 9.249285487456336e-05,
"loss": 0.1777,
"step": 1187
},
{
"epoch": 2.2596291012838803,
"grad_norm": 0.373329758644104,
"learning_rate": 9.248650365195301e-05,
"loss": 0.1759,
"step": 1188
},
{
"epoch": 2.2615311459819307,
"grad_norm": 0.4786781072616577,
"learning_rate": 9.248015242934265e-05,
"loss": 0.2509,
"step": 1189
},
{
"epoch": 2.263433190679981,
"grad_norm": 0.41533464193344116,
"learning_rate": 9.24738012067323e-05,
"loss": 0.1595,
"step": 1190
},
{
"epoch": 2.265335235378031,
"grad_norm": 0.37687090039253235,
"learning_rate": 9.246744998412194e-05,
"loss": 0.19,
"step": 1191
},
{
"epoch": 2.2672372800760816,
"grad_norm": 0.3623497188091278,
"learning_rate": 9.24610987615116e-05,
"loss": 0.1723,
"step": 1192
},
{
"epoch": 2.269139324774132,
"grad_norm": 0.378251850605011,
"learning_rate": 9.245474753890124e-05,
"loss": 0.1773,
"step": 1193
},
{
"epoch": 2.2710413694721825,
"grad_norm": 0.3755147457122803,
"learning_rate": 9.244839631629088e-05,
"loss": 0.1685,
"step": 1194
},
{
"epoch": 2.272943414170233,
"grad_norm": 0.5196719765663147,
"learning_rate": 9.244204509368055e-05,
"loss": 0.2665,
"step": 1195
},
{
"epoch": 2.2748454588682834,
"grad_norm": 0.4404764473438263,
"learning_rate": 9.243569387107018e-05,
"loss": 0.1956,
"step": 1196
},
{
"epoch": 2.276747503566334,
"grad_norm": 0.47750818729400635,
"learning_rate": 9.242934264845984e-05,
"loss": 0.2164,
"step": 1197
},
{
"epoch": 2.2786495482643843,
"grad_norm": 0.3968189060688019,
"learning_rate": 9.242299142584947e-05,
"loss": 0.2299,
"step": 1198
},
{
"epoch": 2.2805515929624347,
"grad_norm": 0.4168682396411896,
"learning_rate": 9.241664020323913e-05,
"loss": 0.1924,
"step": 1199
},
{
"epoch": 2.282453637660485,
"grad_norm": 0.3767165541648865,
"learning_rate": 9.241028898062878e-05,
"loss": 0.1868,
"step": 1200
},
{
"epoch": 2.2843556823585356,
"grad_norm": 0.37699073553085327,
"learning_rate": 9.240393775801842e-05,
"loss": 0.1968,
"step": 1201
},
{
"epoch": 2.2862577270565856,
"grad_norm": 0.4355759620666504,
"learning_rate": 9.239758653540807e-05,
"loss": 0.1988,
"step": 1202
},
{
"epoch": 2.2881597717546365,
"grad_norm": 0.42668578028678894,
"learning_rate": 9.239123531279772e-05,
"loss": 0.1988,
"step": 1203
},
{
"epoch": 2.2900618164526865,
"grad_norm": 0.44233736395835876,
"learning_rate": 9.238488409018736e-05,
"loss": 0.2128,
"step": 1204
},
{
"epoch": 2.291963861150737,
"grad_norm": 0.31429731845855713,
"learning_rate": 9.237853286757701e-05,
"loss": 0.1527,
"step": 1205
},
{
"epoch": 2.2938659058487874,
"grad_norm": 0.38366618752479553,
"learning_rate": 9.237218164496666e-05,
"loss": 0.1747,
"step": 1206
},
{
"epoch": 2.295767950546838,
"grad_norm": 0.3685773015022278,
"learning_rate": 9.23658304223563e-05,
"loss": 0.183,
"step": 1207
},
{
"epoch": 2.2976699952448882,
"grad_norm": 0.349924772977829,
"learning_rate": 9.235947919974595e-05,
"loss": 0.1641,
"step": 1208
},
{
"epoch": 2.2995720399429387,
"grad_norm": 0.3128054738044739,
"learning_rate": 9.23531279771356e-05,
"loss": 0.1682,
"step": 1209
},
{
"epoch": 2.301474084640989,
"grad_norm": 0.4457269608974457,
"learning_rate": 9.234677675452526e-05,
"loss": 0.1888,
"step": 1210
},
{
"epoch": 2.3033761293390396,
"grad_norm": 0.37438902258872986,
"learning_rate": 9.23404255319149e-05,
"loss": 0.1612,
"step": 1211
},
{
"epoch": 2.30527817403709,
"grad_norm": 0.3830793499946594,
"learning_rate": 9.233407430930455e-05,
"loss": 0.1825,
"step": 1212
},
{
"epoch": 2.3071802187351405,
"grad_norm": 0.4047216773033142,
"learning_rate": 9.23277230866942e-05,
"loss": 0.1874,
"step": 1213
},
{
"epoch": 2.309082263433191,
"grad_norm": 0.400716096162796,
"learning_rate": 9.232137186408384e-05,
"loss": 0.165,
"step": 1214
},
{
"epoch": 2.310984308131241,
"grad_norm": 0.35491228103637695,
"learning_rate": 9.231502064147349e-05,
"loss": 0.1428,
"step": 1215
},
{
"epoch": 2.3128863528292913,
"grad_norm": 0.3040875494480133,
"learning_rate": 9.230866941886314e-05,
"loss": 0.1315,
"step": 1216
},
{
"epoch": 2.314788397527342,
"grad_norm": 0.40058350563049316,
"learning_rate": 9.230231819625278e-05,
"loss": 0.2016,
"step": 1217
},
{
"epoch": 2.316690442225392,
"grad_norm": 0.33165568113327026,
"learning_rate": 9.229596697364243e-05,
"loss": 0.1668,
"step": 1218
},
{
"epoch": 2.3185924869234427,
"grad_norm": 0.29281625151634216,
"learning_rate": 9.228961575103208e-05,
"loss": 0.1577,
"step": 1219
},
{
"epoch": 2.320494531621493,
"grad_norm": 0.4083446264266968,
"learning_rate": 9.228326452842172e-05,
"loss": 0.174,
"step": 1220
},
{
"epoch": 2.3223965763195435,
"grad_norm": 0.3308553695678711,
"learning_rate": 9.227691330581137e-05,
"loss": 0.21,
"step": 1221
},
{
"epoch": 2.324298621017594,
"grad_norm": 0.4102175831794739,
"learning_rate": 9.227056208320102e-05,
"loss": 0.205,
"step": 1222
},
{
"epoch": 2.3262006657156444,
"grad_norm": 0.48705750703811646,
"learning_rate": 9.226421086059068e-05,
"loss": 0.2544,
"step": 1223
},
{
"epoch": 2.328102710413695,
"grad_norm": 0.3305780291557312,
"learning_rate": 9.225785963798031e-05,
"loss": 0.1786,
"step": 1224
},
{
"epoch": 2.3300047551117453,
"grad_norm": 0.3046979308128357,
"learning_rate": 9.225150841536995e-05,
"loss": 0.1325,
"step": 1225
},
{
"epoch": 2.3319067998097953,
"grad_norm": 0.4403087794780731,
"learning_rate": 9.224515719275962e-05,
"loss": 0.2288,
"step": 1226
},
{
"epoch": 2.3338088445078458,
"grad_norm": 0.3797864317893982,
"learning_rate": 9.223880597014926e-05,
"loss": 0.2068,
"step": 1227
},
{
"epoch": 2.335710889205896,
"grad_norm": 0.34793582558631897,
"learning_rate": 9.223245474753891e-05,
"loss": 0.182,
"step": 1228
},
{
"epoch": 2.3376129339039466,
"grad_norm": 0.30754920840263367,
"learning_rate": 9.222610352492856e-05,
"loss": 0.144,
"step": 1229
},
{
"epoch": 2.339514978601997,
"grad_norm": 0.4364961087703705,
"learning_rate": 9.22197523023182e-05,
"loss": 0.1824,
"step": 1230
},
{
"epoch": 2.3414170233000475,
"grad_norm": 0.3395443260669708,
"learning_rate": 9.221340107970785e-05,
"loss": 0.1691,
"step": 1231
},
{
"epoch": 2.343319067998098,
"grad_norm": 0.34626251459121704,
"learning_rate": 9.220704985709749e-05,
"loss": 0.2285,
"step": 1232
},
{
"epoch": 2.3452211126961484,
"grad_norm": 0.316518098115921,
"learning_rate": 9.220069863448715e-05,
"loss": 0.1469,
"step": 1233
},
{
"epoch": 2.347123157394199,
"grad_norm": 0.38813212513923645,
"learning_rate": 9.219434741187679e-05,
"loss": 0.1907,
"step": 1234
},
{
"epoch": 2.3490252020922493,
"grad_norm": 0.3442121744155884,
"learning_rate": 9.218799618926643e-05,
"loss": 0.1398,
"step": 1235
},
{
"epoch": 2.3509272467902997,
"grad_norm": 0.3373865783214569,
"learning_rate": 9.218164496665608e-05,
"loss": 0.1477,
"step": 1236
},
{
"epoch": 2.3528292914883497,
"grad_norm": 0.39781641960144043,
"learning_rate": 9.217529374404573e-05,
"loss": 0.1766,
"step": 1237
},
{
"epoch": 2.3547313361864006,
"grad_norm": 0.25478801131248474,
"learning_rate": 9.216894252143537e-05,
"loss": 0.1301,
"step": 1238
},
{
"epoch": 2.3566333808844506,
"grad_norm": 0.350087970495224,
"learning_rate": 9.216259129882502e-05,
"loss": 0.161,
"step": 1239
},
{
"epoch": 2.358535425582501,
"grad_norm": 0.4105963408946991,
"learning_rate": 9.215624007621468e-05,
"loss": 0.1887,
"step": 1240
},
{
"epoch": 2.3604374702805515,
"grad_norm": 0.4141649007797241,
"learning_rate": 9.214988885360433e-05,
"loss": 0.333,
"step": 1241
},
{
"epoch": 2.362339514978602,
"grad_norm": 0.4416482448577881,
"learning_rate": 9.214353763099397e-05,
"loss": 0.2329,
"step": 1242
},
{
"epoch": 2.3642415596766524,
"grad_norm": 0.4285755753517151,
"learning_rate": 9.213718640838362e-05,
"loss": 0.2194,
"step": 1243
},
{
"epoch": 2.366143604374703,
"grad_norm": 0.33636924624443054,
"learning_rate": 9.213083518577327e-05,
"loss": 0.1853,
"step": 1244
},
{
"epoch": 2.3680456490727533,
"grad_norm": 0.40267783403396606,
"learning_rate": 9.212448396316291e-05,
"loss": 0.1837,
"step": 1245
},
{
"epoch": 2.3699476937708037,
"grad_norm": 0.3251781463623047,
"learning_rate": 9.211813274055256e-05,
"loss": 0.1853,
"step": 1246
},
{
"epoch": 2.371849738468854,
"grad_norm": 0.3559510111808777,
"learning_rate": 9.211178151794221e-05,
"loss": 0.1735,
"step": 1247
},
{
"epoch": 2.3737517831669046,
"grad_norm": 0.3483911454677582,
"learning_rate": 9.210543029533185e-05,
"loss": 0.156,
"step": 1248
},
{
"epoch": 2.375653827864955,
"grad_norm": 0.4093637764453888,
"learning_rate": 9.20990790727215e-05,
"loss": 0.2013,
"step": 1249
},
{
"epoch": 2.377555872563005,
"grad_norm": 0.38886240124702454,
"learning_rate": 9.209272785011115e-05,
"loss": 0.1723,
"step": 1250
},
{
"epoch": 2.3794579172610555,
"grad_norm": 0.3627004325389862,
"learning_rate": 9.20863766275008e-05,
"loss": 0.1639,
"step": 1251
},
{
"epoch": 2.381359961959106,
"grad_norm": 0.33721840381622314,
"learning_rate": 9.208002540489044e-05,
"loss": 0.1613,
"step": 1252
},
{
"epoch": 2.3832620066571564,
"grad_norm": 0.4337291121482849,
"learning_rate": 9.20736741822801e-05,
"loss": 0.2036,
"step": 1253
},
{
"epoch": 2.385164051355207,
"grad_norm": 0.43212467432022095,
"learning_rate": 9.206732295966975e-05,
"loss": 0.1925,
"step": 1254
},
{
"epoch": 2.3870660960532573,
"grad_norm": 0.3450334966182709,
"learning_rate": 9.206097173705939e-05,
"loss": 0.1489,
"step": 1255
},
{
"epoch": 2.3889681407513077,
"grad_norm": 0.36295151710510254,
"learning_rate": 9.205462051444902e-05,
"loss": 0.1801,
"step": 1256
},
{
"epoch": 2.390870185449358,
"grad_norm": 0.469532310962677,
"learning_rate": 9.204826929183869e-05,
"loss": 0.2163,
"step": 1257
},
{
"epoch": 2.3927722301474086,
"grad_norm": 0.4618028402328491,
"learning_rate": 9.204191806922833e-05,
"loss": 0.2175,
"step": 1258
},
{
"epoch": 2.394674274845459,
"grad_norm": 0.3891139030456543,
"learning_rate": 9.203556684661798e-05,
"loss": 0.1585,
"step": 1259
},
{
"epoch": 2.3965763195435095,
"grad_norm": 0.4574741721153259,
"learning_rate": 9.202921562400763e-05,
"loss": 0.2545,
"step": 1260
},
{
"epoch": 2.3984783642415595,
"grad_norm": 0.49759337306022644,
"learning_rate": 9.202286440139727e-05,
"loss": 0.2208,
"step": 1261
},
{
"epoch": 2.40038040893961,
"grad_norm": 0.3180585503578186,
"learning_rate": 9.201651317878692e-05,
"loss": 0.157,
"step": 1262
},
{
"epoch": 2.4022824536376604,
"grad_norm": 0.3678848147392273,
"learning_rate": 9.201016195617656e-05,
"loss": 0.1891,
"step": 1263
},
{
"epoch": 2.404184498335711,
"grad_norm": 0.3016449809074402,
"learning_rate": 9.200381073356623e-05,
"loss": 0.1295,
"step": 1264
},
{
"epoch": 2.4060865430337612,
"grad_norm": 0.522779643535614,
"learning_rate": 9.199745951095586e-05,
"loss": 0.2814,
"step": 1265
},
{
"epoch": 2.4079885877318117,
"grad_norm": 0.45210519433021545,
"learning_rate": 9.19911082883455e-05,
"loss": 0.234,
"step": 1266
},
{
"epoch": 2.409890632429862,
"grad_norm": 0.3812367022037506,
"learning_rate": 9.198475706573517e-05,
"loss": 0.2104,
"step": 1267
},
{
"epoch": 2.4117926771279126,
"grad_norm": 0.3120013177394867,
"learning_rate": 9.19784058431248e-05,
"loss": 0.1511,
"step": 1268
},
{
"epoch": 2.413694721825963,
"grad_norm": 0.34164851903915405,
"learning_rate": 9.197205462051446e-05,
"loss": 0.1607,
"step": 1269
},
{
"epoch": 2.4155967665240135,
"grad_norm": 0.3127415180206299,
"learning_rate": 9.19657033979041e-05,
"loss": 0.143,
"step": 1270
},
{
"epoch": 2.417498811222064,
"grad_norm": 0.4628545641899109,
"learning_rate": 9.195935217529375e-05,
"loss": 0.2187,
"step": 1271
},
{
"epoch": 2.419400855920114,
"grad_norm": 0.3645714223384857,
"learning_rate": 9.19530009526834e-05,
"loss": 0.1648,
"step": 1272
},
{
"epoch": 2.4213029006181643,
"grad_norm": 0.41127142310142517,
"learning_rate": 9.194664973007304e-05,
"loss": 0.1712,
"step": 1273
},
{
"epoch": 2.4232049453162148,
"grad_norm": 0.48663556575775146,
"learning_rate": 9.194029850746269e-05,
"loss": 0.2713,
"step": 1274
},
{
"epoch": 2.425106990014265,
"grad_norm": 0.3965604305267334,
"learning_rate": 9.193394728485234e-05,
"loss": 0.1766,
"step": 1275
},
{
"epoch": 2.4270090347123157,
"grad_norm": 0.4565601646900177,
"learning_rate": 9.192759606224198e-05,
"loss": 0.1827,
"step": 1276
},
{
"epoch": 2.428911079410366,
"grad_norm": 0.4272227883338928,
"learning_rate": 9.192124483963163e-05,
"loss": 0.1874,
"step": 1277
},
{
"epoch": 2.4308131241084165,
"grad_norm": 0.42560452222824097,
"learning_rate": 9.191489361702128e-05,
"loss": 0.1829,
"step": 1278
},
{
"epoch": 2.432715168806467,
"grad_norm": 0.30827009677886963,
"learning_rate": 9.190854239441092e-05,
"loss": 0.1747,
"step": 1279
},
{
"epoch": 2.4346172135045174,
"grad_norm": 0.3780437707901001,
"learning_rate": 9.190219117180057e-05,
"loss": 0.1955,
"step": 1280
},
{
"epoch": 2.436519258202568,
"grad_norm": 0.32639580965042114,
"learning_rate": 9.189583994919023e-05,
"loss": 0.1568,
"step": 1281
},
{
"epoch": 2.4384213029006183,
"grad_norm": 0.37228289246559143,
"learning_rate": 9.188948872657988e-05,
"loss": 0.1871,
"step": 1282
},
{
"epoch": 2.4403233475986688,
"grad_norm": 0.4045466482639313,
"learning_rate": 9.188313750396952e-05,
"loss": 0.2237,
"step": 1283
},
{
"epoch": 2.442225392296719,
"grad_norm": 0.40609246492385864,
"learning_rate": 9.187678628135917e-05,
"loss": 0.2313,
"step": 1284
},
{
"epoch": 2.444127436994769,
"grad_norm": 0.36473485827445984,
"learning_rate": 9.187043505874882e-05,
"loss": 0.2528,
"step": 1285
},
{
"epoch": 2.4460294816928196,
"grad_norm": 0.4154009222984314,
"learning_rate": 9.186408383613846e-05,
"loss": 0.215,
"step": 1286
},
{
"epoch": 2.44793152639087,
"grad_norm": 0.33488062024116516,
"learning_rate": 9.185773261352811e-05,
"loss": 0.1666,
"step": 1287
},
{
"epoch": 2.4498335710889205,
"grad_norm": 0.392004132270813,
"learning_rate": 9.185138139091776e-05,
"loss": 0.2127,
"step": 1288
},
{
"epoch": 2.451735615786971,
"grad_norm": 0.32925739884376526,
"learning_rate": 9.18450301683074e-05,
"loss": 0.1459,
"step": 1289
},
{
"epoch": 2.4536376604850214,
"grad_norm": 0.3380909264087677,
"learning_rate": 9.183867894569705e-05,
"loss": 0.1482,
"step": 1290
},
{
"epoch": 2.455539705183072,
"grad_norm": 0.47436705231666565,
"learning_rate": 9.18323277230867e-05,
"loss": 0.2652,
"step": 1291
},
{
"epoch": 2.4574417498811223,
"grad_norm": 0.39543116092681885,
"learning_rate": 9.182597650047634e-05,
"loss": 0.1762,
"step": 1292
},
{
"epoch": 2.4593437945791727,
"grad_norm": 0.4776802659034729,
"learning_rate": 9.181962527786599e-05,
"loss": 0.1967,
"step": 1293
},
{
"epoch": 2.461245839277223,
"grad_norm": 0.37519994378089905,
"learning_rate": 9.181327405525563e-05,
"loss": 0.1909,
"step": 1294
},
{
"epoch": 2.4631478839752736,
"grad_norm": 0.37666913866996765,
"learning_rate": 9.18069228326453e-05,
"loss": 0.1477,
"step": 1295
},
{
"epoch": 2.4650499286733236,
"grad_norm": 0.3830261528491974,
"learning_rate": 9.180057161003494e-05,
"loss": 0.1825,
"step": 1296
},
{
"epoch": 2.466951973371374,
"grad_norm": 0.4064732789993286,
"learning_rate": 9.179422038742457e-05,
"loss": 0.2,
"step": 1297
},
{
"epoch": 2.4688540180694245,
"grad_norm": 0.318314790725708,
"learning_rate": 9.178786916481424e-05,
"loss": 0.1543,
"step": 1298
},
{
"epoch": 2.470756062767475,
"grad_norm": 0.3804973065853119,
"learning_rate": 9.178151794220388e-05,
"loss": 0.2248,
"step": 1299
},
{
"epoch": 2.4726581074655254,
"grad_norm": 0.4222256541252136,
"learning_rate": 9.177516671959353e-05,
"loss": 0.2037,
"step": 1300
},
{
"epoch": 2.474560152163576,
"grad_norm": 0.4317629337310791,
"learning_rate": 9.176881549698317e-05,
"loss": 0.1914,
"step": 1301
},
{
"epoch": 2.4764621968616263,
"grad_norm": 0.4674796760082245,
"learning_rate": 9.176246427437282e-05,
"loss": 0.212,
"step": 1302
},
{
"epoch": 2.4783642415596767,
"grad_norm": 0.40157684683799744,
"learning_rate": 9.175611305176247e-05,
"loss": 0.1948,
"step": 1303
},
{
"epoch": 2.480266286257727,
"grad_norm": 0.37824416160583496,
"learning_rate": 9.174976182915211e-05,
"loss": 0.1849,
"step": 1304
},
{
"epoch": 2.4821683309557776,
"grad_norm": 0.5870863199234009,
"learning_rate": 9.174341060654177e-05,
"loss": 0.1586,
"step": 1305
},
{
"epoch": 2.484070375653828,
"grad_norm": 0.3794877529144287,
"learning_rate": 9.173705938393141e-05,
"loss": 0.2162,
"step": 1306
},
{
"epoch": 2.485972420351878,
"grad_norm": 0.40509578585624695,
"learning_rate": 9.173070816132105e-05,
"loss": 0.1895,
"step": 1307
},
{
"epoch": 2.4878744650499285,
"grad_norm": 0.37314295768737793,
"learning_rate": 9.17243569387107e-05,
"loss": 0.1926,
"step": 1308
},
{
"epoch": 2.489776509747979,
"grad_norm": 0.32264095544815063,
"learning_rate": 9.171800571610035e-05,
"loss": 0.1385,
"step": 1309
},
{
"epoch": 2.4916785544460294,
"grad_norm": 0.43269702792167664,
"learning_rate": 9.171165449348999e-05,
"loss": 0.2189,
"step": 1310
},
{
"epoch": 2.49358059914408,
"grad_norm": 0.330098956823349,
"learning_rate": 9.170530327087964e-05,
"loss": 0.168,
"step": 1311
},
{
"epoch": 2.4954826438421303,
"grad_norm": 0.2726501524448395,
"learning_rate": 9.16989520482693e-05,
"loss": 0.1306,
"step": 1312
},
{
"epoch": 2.4973846885401807,
"grad_norm": 0.27615344524383545,
"learning_rate": 9.169260082565895e-05,
"loss": 0.1361,
"step": 1313
},
{
"epoch": 2.499286733238231,
"grad_norm": 0.3685866594314575,
"learning_rate": 9.168624960304859e-05,
"loss": 0.1901,
"step": 1314
},
{
"epoch": 2.5011887779362816,
"grad_norm": 0.323897123336792,
"learning_rate": 9.167989838043824e-05,
"loss": 0.2608,
"step": 1315
},
{
"epoch": 2.503090822634332,
"grad_norm": 0.6715079545974731,
"learning_rate": 9.167354715782789e-05,
"loss": 0.199,
"step": 1316
},
{
"epoch": 2.5049928673323825,
"grad_norm": 0.32039186358451843,
"learning_rate": 9.166719593521753e-05,
"loss": 0.1723,
"step": 1317
},
{
"epoch": 2.5068949120304325,
"grad_norm": 0.3974270224571228,
"learning_rate": 9.166084471260718e-05,
"loss": 0.1659,
"step": 1318
},
{
"epoch": 2.5087969567284834,
"grad_norm": 0.3953278362751007,
"learning_rate": 9.165449348999683e-05,
"loss": 0.1879,
"step": 1319
},
{
"epoch": 2.5106990014265333,
"grad_norm": 0.4061002731323242,
"learning_rate": 9.164814226738647e-05,
"loss": 0.1858,
"step": 1320
},
{
"epoch": 2.512601046124584,
"grad_norm": 0.3816406726837158,
"learning_rate": 9.164179104477612e-05,
"loss": 0.1899,
"step": 1321
},
{
"epoch": 2.5145030908226342,
"grad_norm": 0.3856441378593445,
"learning_rate": 9.163543982216577e-05,
"loss": 0.1727,
"step": 1322
},
{
"epoch": 2.5164051355206847,
"grad_norm": 0.47267359495162964,
"learning_rate": 9.162908859955543e-05,
"loss": 0.2137,
"step": 1323
},
{
"epoch": 2.518307180218735,
"grad_norm": 0.41764524579048157,
"learning_rate": 9.162273737694506e-05,
"loss": 0.2138,
"step": 1324
},
{
"epoch": 2.5202092249167856,
"grad_norm": 0.42864158749580383,
"learning_rate": 9.161638615433472e-05,
"loss": 0.1919,
"step": 1325
},
{
"epoch": 2.522111269614836,
"grad_norm": 0.5067504048347473,
"learning_rate": 9.161003493172437e-05,
"loss": 0.2068,
"step": 1326
},
{
"epoch": 2.5240133143128864,
"grad_norm": 0.430951863527298,
"learning_rate": 9.1603683709114e-05,
"loss": 0.2195,
"step": 1327
},
{
"epoch": 2.525915359010937,
"grad_norm": 0.37973999977111816,
"learning_rate": 9.159733248650364e-05,
"loss": 0.1799,
"step": 1328
},
{
"epoch": 2.527817403708987,
"grad_norm": 0.362768292427063,
"learning_rate": 9.159098126389331e-05,
"loss": 0.1555,
"step": 1329
},
{
"epoch": 2.5297194484070378,
"grad_norm": 0.41433513164520264,
"learning_rate": 9.158463004128295e-05,
"loss": 0.1958,
"step": 1330
},
{
"epoch": 2.5316214931050878,
"grad_norm": 0.3091717064380646,
"learning_rate": 9.15782788186726e-05,
"loss": 0.1622,
"step": 1331
},
{
"epoch": 2.533523537803138,
"grad_norm": 0.35242778062820435,
"learning_rate": 9.157192759606225e-05,
"loss": 0.1627,
"step": 1332
},
{
"epoch": 2.5354255825011887,
"grad_norm": 0.38102760910987854,
"learning_rate": 9.156557637345189e-05,
"loss": 0.1663,
"step": 1333
},
{
"epoch": 2.537327627199239,
"grad_norm": 0.4313855469226837,
"learning_rate": 9.155922515084154e-05,
"loss": 0.208,
"step": 1334
},
{
"epoch": 2.5392296718972895,
"grad_norm": 0.33921730518341064,
"learning_rate": 9.155287392823118e-05,
"loss": 0.1572,
"step": 1335
},
{
"epoch": 2.54113171659534,
"grad_norm": 0.3824930489063263,
"learning_rate": 9.154652270562085e-05,
"loss": 0.1986,
"step": 1336
},
{
"epoch": 2.5430337612933904,
"grad_norm": 0.33059945702552795,
"learning_rate": 9.154017148301048e-05,
"loss": 0.156,
"step": 1337
},
{
"epoch": 2.544935805991441,
"grad_norm": 0.4880346357822418,
"learning_rate": 9.153382026040012e-05,
"loss": 0.2319,
"step": 1338
},
{
"epoch": 2.5468378506894913,
"grad_norm": 0.27151229977607727,
"learning_rate": 9.152746903778979e-05,
"loss": 0.128,
"step": 1339
},
{
"epoch": 2.5487398953875418,
"grad_norm": 0.35515275597572327,
"learning_rate": 9.152111781517943e-05,
"loss": 0.1685,
"step": 1340
},
{
"epoch": 2.550641940085592,
"grad_norm": 0.41455206274986267,
"learning_rate": 9.151476659256908e-05,
"loss": 0.2354,
"step": 1341
},
{
"epoch": 2.552543984783642,
"grad_norm": 0.3215075731277466,
"learning_rate": 9.150841536995872e-05,
"loss": 0.1653,
"step": 1342
},
{
"epoch": 2.554446029481693,
"grad_norm": 0.34158623218536377,
"learning_rate": 9.150206414734837e-05,
"loss": 0.1598,
"step": 1343
},
{
"epoch": 2.556348074179743,
"grad_norm": 0.4195705056190491,
"learning_rate": 9.149571292473802e-05,
"loss": 0.228,
"step": 1344
},
{
"epoch": 2.5582501188777935,
"grad_norm": 0.34753212332725525,
"learning_rate": 9.148936170212766e-05,
"loss": 0.1948,
"step": 1345
},
{
"epoch": 2.560152163575844,
"grad_norm": 0.43792131543159485,
"learning_rate": 9.148301047951731e-05,
"loss": 0.2191,
"step": 1346
},
{
"epoch": 2.5620542082738944,
"grad_norm": 0.35464513301849365,
"learning_rate": 9.147665925690696e-05,
"loss": 0.1555,
"step": 1347
},
{
"epoch": 2.563956252971945,
"grad_norm": 0.50618976354599,
"learning_rate": 9.14703080342966e-05,
"loss": 0.2262,
"step": 1348
},
{
"epoch": 2.5658582976699953,
"grad_norm": 0.3603616952896118,
"learning_rate": 9.146395681168625e-05,
"loss": 0.1647,
"step": 1349
},
{
"epoch": 2.5677603423680457,
"grad_norm": 0.486316978931427,
"learning_rate": 9.14576055890759e-05,
"loss": 0.2052,
"step": 1350
},
{
"epoch": 2.569662387066096,
"grad_norm": 0.45915400981903076,
"learning_rate": 9.145125436646554e-05,
"loss": 0.218,
"step": 1351
},
{
"epoch": 2.5715644317641466,
"grad_norm": 0.3178432583808899,
"learning_rate": 9.14449031438552e-05,
"loss": 0.1453,
"step": 1352
},
{
"epoch": 2.5734664764621966,
"grad_norm": 0.3939111828804016,
"learning_rate": 9.143855192124485e-05,
"loss": 0.1784,
"step": 1353
},
{
"epoch": 2.5753685211602475,
"grad_norm": 0.3399297595024109,
"learning_rate": 9.14322006986345e-05,
"loss": 0.1644,
"step": 1354
},
{
"epoch": 2.5772705658582975,
"grad_norm": 0.39880868792533875,
"learning_rate": 9.142584947602414e-05,
"loss": 0.2139,
"step": 1355
},
{
"epoch": 2.579172610556348,
"grad_norm": 0.40534335374832153,
"learning_rate": 9.141949825341379e-05,
"loss": 0.1872,
"step": 1356
},
{
"epoch": 2.5810746552543984,
"grad_norm": 0.3201380968093872,
"learning_rate": 9.141314703080344e-05,
"loss": 0.1557,
"step": 1357
},
{
"epoch": 2.582976699952449,
"grad_norm": 0.31011682748794556,
"learning_rate": 9.140679580819308e-05,
"loss": 0.1301,
"step": 1358
},
{
"epoch": 2.5848787446504993,
"grad_norm": 0.3697820007801056,
"learning_rate": 9.140044458558273e-05,
"loss": 0.1856,
"step": 1359
},
{
"epoch": 2.5867807893485497,
"grad_norm": 0.291369765996933,
"learning_rate": 9.139409336297238e-05,
"loss": 0.1323,
"step": 1360
},
{
"epoch": 2.5886828340466,
"grad_norm": 0.4111400842666626,
"learning_rate": 9.138774214036202e-05,
"loss": 0.2271,
"step": 1361
},
{
"epoch": 2.5905848787446506,
"grad_norm": 0.4169454872608185,
"learning_rate": 9.138139091775167e-05,
"loss": 0.199,
"step": 1362
},
{
"epoch": 2.592486923442701,
"grad_norm": 0.4209660589694977,
"learning_rate": 9.137503969514132e-05,
"loss": 0.2296,
"step": 1363
},
{
"epoch": 2.594388968140751,
"grad_norm": 0.3968026041984558,
"learning_rate": 9.136868847253096e-05,
"loss": 0.2174,
"step": 1364
},
{
"epoch": 2.596291012838802,
"grad_norm": 0.3477707803249359,
"learning_rate": 9.136233724992061e-05,
"loss": 0.1818,
"step": 1365
},
{
"epoch": 2.598193057536852,
"grad_norm": 0.3979746699333191,
"learning_rate": 9.135598602731025e-05,
"loss": 0.2373,
"step": 1366
},
{
"epoch": 2.6000951022349024,
"grad_norm": 0.32050615549087524,
"learning_rate": 9.134963480469992e-05,
"loss": 0.1562,
"step": 1367
},
{
"epoch": 2.601997146932953,
"grad_norm": 0.4675930142402649,
"learning_rate": 9.134328358208956e-05,
"loss": 0.2942,
"step": 1368
},
{
"epoch": 2.6038991916310033,
"grad_norm": 0.32259052991867065,
"learning_rate": 9.13369323594792e-05,
"loss": 0.1411,
"step": 1369
},
{
"epoch": 2.6058012363290537,
"grad_norm": 0.3838285803794861,
"learning_rate": 9.133058113686886e-05,
"loss": 0.2098,
"step": 1370
},
{
"epoch": 2.607703281027104,
"grad_norm": 0.4749825596809387,
"learning_rate": 9.13242299142585e-05,
"loss": 0.2621,
"step": 1371
},
{
"epoch": 2.6096053257251546,
"grad_norm": 0.3093271255493164,
"learning_rate": 9.131787869164815e-05,
"loss": 0.1389,
"step": 1372
},
{
"epoch": 2.611507370423205,
"grad_norm": 0.4896688461303711,
"learning_rate": 9.131152746903779e-05,
"loss": 0.2347,
"step": 1373
},
{
"epoch": 2.6134094151212555,
"grad_norm": 0.39409998059272766,
"learning_rate": 9.130517624642744e-05,
"loss": 0.2224,
"step": 1374
},
{
"epoch": 2.615311459819306,
"grad_norm": 0.39578184485435486,
"learning_rate": 9.129882502381709e-05,
"loss": 0.1963,
"step": 1375
},
{
"epoch": 2.6172135045173563,
"grad_norm": 0.34999507665634155,
"learning_rate": 9.129247380120673e-05,
"loss": 0.1612,
"step": 1376
},
{
"epoch": 2.6191155492154063,
"grad_norm": 0.33919695019721985,
"learning_rate": 9.12861225785964e-05,
"loss": 0.1813,
"step": 1377
},
{
"epoch": 2.6210175939134572,
"grad_norm": 0.3273175060749054,
"learning_rate": 9.127977135598603e-05,
"loss": 0.1436,
"step": 1378
},
{
"epoch": 2.6229196386115072,
"grad_norm": 0.4175270199775696,
"learning_rate": 9.127342013337567e-05,
"loss": 0.1832,
"step": 1379
},
{
"epoch": 2.6248216833095577,
"grad_norm": 0.3580436408519745,
"learning_rate": 9.126706891076532e-05,
"loss": 0.1569,
"step": 1380
},
{
"epoch": 2.626723728007608,
"grad_norm": 0.3683449625968933,
"learning_rate": 9.126071768815498e-05,
"loss": 0.1955,
"step": 1381
},
{
"epoch": 2.6286257727056586,
"grad_norm": 0.3830251395702362,
"learning_rate": 9.125436646554461e-05,
"loss": 0.1626,
"step": 1382
},
{
"epoch": 2.630527817403709,
"grad_norm": 0.3428569734096527,
"learning_rate": 9.124801524293427e-05,
"loss": 0.1477,
"step": 1383
},
{
"epoch": 2.6324298621017594,
"grad_norm": 0.4621574878692627,
"learning_rate": 9.124166402032392e-05,
"loss": 0.1675,
"step": 1384
},
{
"epoch": 2.63433190679981,
"grad_norm": 0.40000998973846436,
"learning_rate": 9.123531279771357e-05,
"loss": 0.1751,
"step": 1385
},
{
"epoch": 2.6362339514978603,
"grad_norm": 0.4612349271774292,
"learning_rate": 9.122896157510321e-05,
"loss": 0.2165,
"step": 1386
},
{
"epoch": 2.6381359961959108,
"grad_norm": 0.47919005155563354,
"learning_rate": 9.122261035249286e-05,
"loss": 0.2,
"step": 1387
},
{
"epoch": 2.6400380408939608,
"grad_norm": 0.5020009875297546,
"learning_rate": 9.121625912988251e-05,
"loss": 0.1997,
"step": 1388
},
{
"epoch": 2.6419400855920117,
"grad_norm": 0.4959258437156677,
"learning_rate": 9.120990790727215e-05,
"loss": 0.1903,
"step": 1389
},
{
"epoch": 2.6438421302900617,
"grad_norm": 0.4882603585720062,
"learning_rate": 9.12035566846618e-05,
"loss": 0.2082,
"step": 1390
},
{
"epoch": 2.645744174988112,
"grad_norm": 0.37479934096336365,
"learning_rate": 9.119720546205145e-05,
"loss": 0.179,
"step": 1391
},
{
"epoch": 2.6476462196861625,
"grad_norm": 0.5104106068611145,
"learning_rate": 9.119085423944109e-05,
"loss": 0.2281,
"step": 1392
},
{
"epoch": 2.649548264384213,
"grad_norm": 0.3893817663192749,
"learning_rate": 9.118450301683074e-05,
"loss": 0.2324,
"step": 1393
},
{
"epoch": 2.6514503090822634,
"grad_norm": 0.35762450098991394,
"learning_rate": 9.11781517942204e-05,
"loss": 0.1933,
"step": 1394
},
{
"epoch": 2.653352353780314,
"grad_norm": 0.37635737657546997,
"learning_rate": 9.117180057161005e-05,
"loss": 0.1869,
"step": 1395
},
{
"epoch": 2.6552543984783643,
"grad_norm": 0.3230188488960266,
"learning_rate": 9.116544934899969e-05,
"loss": 0.1576,
"step": 1396
},
{
"epoch": 2.6571564431764148,
"grad_norm": 0.3708724081516266,
"learning_rate": 9.115909812638934e-05,
"loss": 0.168,
"step": 1397
},
{
"epoch": 2.659058487874465,
"grad_norm": 0.34403741359710693,
"learning_rate": 9.115274690377899e-05,
"loss": 0.2721,
"step": 1398
},
{
"epoch": 2.660960532572515,
"grad_norm": 0.2812383770942688,
"learning_rate": 9.114639568116863e-05,
"loss": 0.1605,
"step": 1399
},
{
"epoch": 2.662862577270566,
"grad_norm": 0.39116060733795166,
"learning_rate": 9.114004445855827e-05,
"loss": 0.1843,
"step": 1400
},
{
"epoch": 2.664764621968616,
"grad_norm": 0.3641309440135956,
"learning_rate": 9.113369323594793e-05,
"loss": 0.1818,
"step": 1401
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.4198780953884125,
"learning_rate": 9.112734201333757e-05,
"loss": 0.2044,
"step": 1402
},
{
"epoch": 2.668568711364717,
"grad_norm": 0.3912922739982605,
"learning_rate": 9.112099079072722e-05,
"loss": 0.1881,
"step": 1403
},
{
"epoch": 2.6704707560627674,
"grad_norm": 0.4235263764858246,
"learning_rate": 9.111463956811686e-05,
"loss": 0.2034,
"step": 1404
},
{
"epoch": 2.672372800760818,
"grad_norm": 0.3731124699115753,
"learning_rate": 9.110828834550651e-05,
"loss": 0.1803,
"step": 1405
},
{
"epoch": 2.6742748454588683,
"grad_norm": 0.3907954692840576,
"learning_rate": 9.110193712289616e-05,
"loss": 0.2074,
"step": 1406
},
{
"epoch": 2.6761768901569187,
"grad_norm": 0.3954913914203644,
"learning_rate": 9.10955859002858e-05,
"loss": 0.1797,
"step": 1407
},
{
"epoch": 2.678078934854969,
"grad_norm": 0.5066515207290649,
"learning_rate": 9.108923467767547e-05,
"loss": 0.2096,
"step": 1408
},
{
"epoch": 2.6799809795530196,
"grad_norm": 0.4380313456058502,
"learning_rate": 9.10828834550651e-05,
"loss": 0.2064,
"step": 1409
},
{
"epoch": 2.68188302425107,
"grad_norm": 0.3758976459503174,
"learning_rate": 9.107653223245474e-05,
"loss": 0.2076,
"step": 1410
},
{
"epoch": 2.6837850689491205,
"grad_norm": 0.38098809123039246,
"learning_rate": 9.10701810098444e-05,
"loss": 0.1727,
"step": 1411
},
{
"epoch": 2.6856871136471705,
"grad_norm": 0.4027041792869568,
"learning_rate": 9.106382978723405e-05,
"loss": 0.154,
"step": 1412
},
{
"epoch": 2.6875891583452214,
"grad_norm": 0.307954877614975,
"learning_rate": 9.10574785646237e-05,
"loss": 0.1766,
"step": 1413
},
{
"epoch": 2.6894912030432714,
"grad_norm": 0.4232465326786041,
"learning_rate": 9.105112734201334e-05,
"loss": 0.1866,
"step": 1414
},
{
"epoch": 2.691393247741322,
"grad_norm": 0.4296838641166687,
"learning_rate": 9.104477611940299e-05,
"loss": 0.1813,
"step": 1415
},
{
"epoch": 2.6932952924393723,
"grad_norm": 0.3334490954875946,
"learning_rate": 9.103842489679264e-05,
"loss": 0.1576,
"step": 1416
},
{
"epoch": 2.6951973371374227,
"grad_norm": 0.42984020709991455,
"learning_rate": 9.103207367418228e-05,
"loss": 0.1945,
"step": 1417
},
{
"epoch": 2.697099381835473,
"grad_norm": 0.4306494891643524,
"learning_rate": 9.102572245157193e-05,
"loss": 0.179,
"step": 1418
},
{
"epoch": 2.6990014265335236,
"grad_norm": 0.38049131631851196,
"learning_rate": 9.101937122896158e-05,
"loss": 0.1951,
"step": 1419
},
{
"epoch": 2.700903471231574,
"grad_norm": 0.3691817820072174,
"learning_rate": 9.101302000635122e-05,
"loss": 0.1725,
"step": 1420
},
{
"epoch": 2.7028055159296245,
"grad_norm": 0.32240816950798035,
"learning_rate": 9.100666878374087e-05,
"loss": 0.1852,
"step": 1421
},
{
"epoch": 2.704707560627675,
"grad_norm": 0.3735920488834381,
"learning_rate": 9.100031756113053e-05,
"loss": 0.1857,
"step": 1422
},
{
"epoch": 2.706609605325725,
"grad_norm": 0.3693629801273346,
"learning_rate": 9.099396633852016e-05,
"loss": 0.1698,
"step": 1423
},
{
"epoch": 2.708511650023776,
"grad_norm": 0.40846189856529236,
"learning_rate": 9.098761511590982e-05,
"loss": 0.2531,
"step": 1424
},
{
"epoch": 2.710413694721826,
"grad_norm": 0.3387136161327362,
"learning_rate": 9.098126389329947e-05,
"loss": 0.152,
"step": 1425
},
{
"epoch": 2.7123157394198762,
"grad_norm": 0.43113890290260315,
"learning_rate": 9.097491267068912e-05,
"loss": 0.1939,
"step": 1426
},
{
"epoch": 2.7142177841179267,
"grad_norm": 0.45811060070991516,
"learning_rate": 9.096856144807876e-05,
"loss": 0.2217,
"step": 1427
},
{
"epoch": 2.716119828815977,
"grad_norm": 0.3742765486240387,
"learning_rate": 9.096221022546841e-05,
"loss": 0.183,
"step": 1428
},
{
"epoch": 2.7180218735140276,
"grad_norm": 0.39835286140441895,
"learning_rate": 9.095585900285806e-05,
"loss": 0.248,
"step": 1429
},
{
"epoch": 2.719923918212078,
"grad_norm": 0.38528379797935486,
"learning_rate": 9.09495077802477e-05,
"loss": 0.1642,
"step": 1430
},
{
"epoch": 2.7218259629101285,
"grad_norm": 0.4142857789993286,
"learning_rate": 9.094315655763735e-05,
"loss": 0.1817,
"step": 1431
},
{
"epoch": 2.723728007608179,
"grad_norm": 0.4072723388671875,
"learning_rate": 9.0936805335027e-05,
"loss": 0.2017,
"step": 1432
},
{
"epoch": 2.7256300523062293,
"grad_norm": 0.37081822752952576,
"learning_rate": 9.093045411241664e-05,
"loss": 0.2262,
"step": 1433
},
{
"epoch": 2.7275320970042793,
"grad_norm": 0.3628768026828766,
"learning_rate": 9.092410288980629e-05,
"loss": 0.1714,
"step": 1434
},
{
"epoch": 2.7294341417023302,
"grad_norm": 0.46637046337127686,
"learning_rate": 9.091775166719594e-05,
"loss": 0.3189,
"step": 1435
},
{
"epoch": 2.7313361864003802,
"grad_norm": 0.2643025517463684,
"learning_rate": 9.091140044458558e-05,
"loss": 0.234,
"step": 1436
},
{
"epoch": 2.7332382310984307,
"grad_norm": 0.36125344038009644,
"learning_rate": 9.090504922197523e-05,
"loss": 0.1981,
"step": 1437
},
{
"epoch": 2.735140275796481,
"grad_norm": 0.3064311742782593,
"learning_rate": 9.089869799936487e-05,
"loss": 0.1644,
"step": 1438
},
{
"epoch": 2.7370423204945316,
"grad_norm": 0.372164249420166,
"learning_rate": 9.089234677675454e-05,
"loss": 0.2023,
"step": 1439
},
{
"epoch": 2.738944365192582,
"grad_norm": 0.346431165933609,
"learning_rate": 9.088599555414418e-05,
"loss": 0.1913,
"step": 1440
},
{
"epoch": 2.7408464098906324,
"grad_norm": 0.3421470522880554,
"learning_rate": 9.087964433153382e-05,
"loss": 0.1599,
"step": 1441
},
{
"epoch": 2.742748454588683,
"grad_norm": 0.33351296186447144,
"learning_rate": 9.087329310892348e-05,
"loss": 0.1775,
"step": 1442
},
{
"epoch": 2.7446504992867333,
"grad_norm": 0.3450356721878052,
"learning_rate": 9.086694188631312e-05,
"loss": 0.199,
"step": 1443
},
{
"epoch": 2.7465525439847838,
"grad_norm": 0.34339770674705505,
"learning_rate": 9.086059066370277e-05,
"loss": 0.1608,
"step": 1444
},
{
"epoch": 2.748454588682834,
"grad_norm": 0.35941675305366516,
"learning_rate": 9.085423944109241e-05,
"loss": 0.1566,
"step": 1445
},
{
"epoch": 2.7503566333808847,
"grad_norm": 0.396847665309906,
"learning_rate": 9.084788821848206e-05,
"loss": 0.1829,
"step": 1446
},
{
"epoch": 2.7522586780789347,
"grad_norm": 0.3818894028663635,
"learning_rate": 9.084153699587171e-05,
"loss": 0.2017,
"step": 1447
},
{
"epoch": 2.754160722776985,
"grad_norm": 0.46124422550201416,
"learning_rate": 9.083518577326135e-05,
"loss": 0.185,
"step": 1448
},
{
"epoch": 2.7560627674750355,
"grad_norm": 0.4047834575176239,
"learning_rate": 9.082883455065102e-05,
"loss": 0.1848,
"step": 1449
},
{
"epoch": 2.757964812173086,
"grad_norm": 0.5650888085365295,
"learning_rate": 9.082248332804065e-05,
"loss": 0.2274,
"step": 1450
},
{
"epoch": 2.7598668568711364,
"grad_norm": 0.35878250002861023,
"learning_rate": 9.081613210543029e-05,
"loss": 0.1414,
"step": 1451
},
{
"epoch": 2.761768901569187,
"grad_norm": 0.37223199009895325,
"learning_rate": 9.080978088281994e-05,
"loss": 0.1718,
"step": 1452
},
{
"epoch": 2.7636709462672373,
"grad_norm": 0.34717050194740295,
"learning_rate": 9.08034296602096e-05,
"loss": 0.1719,
"step": 1453
},
{
"epoch": 2.7655729909652877,
"grad_norm": 0.4706629812717438,
"learning_rate": 9.079707843759923e-05,
"loss": 0.1953,
"step": 1454
},
{
"epoch": 2.767475035663338,
"grad_norm": 0.40658390522003174,
"learning_rate": 9.079072721498889e-05,
"loss": 0.1723,
"step": 1455
},
{
"epoch": 2.7693770803613886,
"grad_norm": 0.5025349855422974,
"learning_rate": 9.078437599237854e-05,
"loss": 0.2122,
"step": 1456
},
{
"epoch": 2.771279125059439,
"grad_norm": 0.4134734272956848,
"learning_rate": 9.077802476976819e-05,
"loss": 0.1872,
"step": 1457
},
{
"epoch": 2.773181169757489,
"grad_norm": 0.4301147162914276,
"learning_rate": 9.077167354715783e-05,
"loss": 0.2102,
"step": 1458
},
{
"epoch": 2.77508321445554,
"grad_norm": 0.4295254051685333,
"learning_rate": 9.076532232454748e-05,
"loss": 0.2132,
"step": 1459
},
{
"epoch": 2.77698525915359,
"grad_norm": 0.40130358934402466,
"learning_rate": 9.075897110193713e-05,
"loss": 0.1891,
"step": 1460
},
{
"epoch": 2.7788873038516404,
"grad_norm": 0.4124513268470764,
"learning_rate": 9.075261987932677e-05,
"loss": 0.204,
"step": 1461
},
{
"epoch": 2.780789348549691,
"grad_norm": 0.3976169526576996,
"learning_rate": 9.074626865671642e-05,
"loss": 0.2016,
"step": 1462
},
{
"epoch": 2.7826913932477413,
"grad_norm": 0.3949052095413208,
"learning_rate": 9.073991743410607e-05,
"loss": 0.1924,
"step": 1463
},
{
"epoch": 2.7845934379457917,
"grad_norm": 0.4033919870853424,
"learning_rate": 9.073356621149571e-05,
"loss": 0.1967,
"step": 1464
},
{
"epoch": 2.786495482643842,
"grad_norm": 0.32922443747520447,
"learning_rate": 9.072721498888536e-05,
"loss": 0.1639,
"step": 1465
},
{
"epoch": 2.7883975273418926,
"grad_norm": 0.372179239988327,
"learning_rate": 9.072086376627502e-05,
"loss": 0.1783,
"step": 1466
},
{
"epoch": 2.790299572039943,
"grad_norm": 0.45123547315597534,
"learning_rate": 9.071451254366467e-05,
"loss": 0.2573,
"step": 1467
},
{
"epoch": 2.7922016167379935,
"grad_norm": 0.33130937814712524,
"learning_rate": 9.07081613210543e-05,
"loss": 0.1427,
"step": 1468
},
{
"epoch": 2.7941036614360435,
"grad_norm": 0.4377565085887909,
"learning_rate": 9.070181009844394e-05,
"loss": 0.1915,
"step": 1469
},
{
"epoch": 2.7960057061340944,
"grad_norm": 0.555698037147522,
"learning_rate": 9.069545887583361e-05,
"loss": 0.2495,
"step": 1470
},
{
"epoch": 2.7979077508321444,
"grad_norm": 0.4749322831630707,
"learning_rate": 9.068910765322325e-05,
"loss": 0.1944,
"step": 1471
},
{
"epoch": 2.799809795530195,
"grad_norm": 0.3543435335159302,
"learning_rate": 9.068275643061289e-05,
"loss": 0.1669,
"step": 1472
},
{
"epoch": 2.8017118402282453,
"grad_norm": 0.33086055517196655,
"learning_rate": 9.067640520800255e-05,
"loss": 0.1792,
"step": 1473
},
{
"epoch": 2.8036138849262957,
"grad_norm": 0.3898443281650543,
"learning_rate": 9.067005398539219e-05,
"loss": 0.171,
"step": 1474
},
{
"epoch": 2.805515929624346,
"grad_norm": 0.4169894754886627,
"learning_rate": 9.066370276278184e-05,
"loss": 0.2057,
"step": 1475
},
{
"epoch": 2.8074179743223966,
"grad_norm": 0.37259283661842346,
"learning_rate": 9.065735154017148e-05,
"loss": 0.1799,
"step": 1476
},
{
"epoch": 2.809320019020447,
"grad_norm": 0.3892917037010193,
"learning_rate": 9.065100031756113e-05,
"loss": 0.1847,
"step": 1477
},
{
"epoch": 2.8112220637184975,
"grad_norm": 0.5309971570968628,
"learning_rate": 9.064464909495078e-05,
"loss": 0.2462,
"step": 1478
},
{
"epoch": 2.813124108416548,
"grad_norm": 0.3646765351295471,
"learning_rate": 9.063829787234042e-05,
"loss": 0.168,
"step": 1479
},
{
"epoch": 2.815026153114598,
"grad_norm": 0.3424735963344574,
"learning_rate": 9.063194664973009e-05,
"loss": 0.1547,
"step": 1480
},
{
"epoch": 2.816928197812649,
"grad_norm": 0.38415202498435974,
"learning_rate": 9.062559542711973e-05,
"loss": 0.2186,
"step": 1481
},
{
"epoch": 2.818830242510699,
"grad_norm": 0.4032725393772125,
"learning_rate": 9.061924420450936e-05,
"loss": 0.1802,
"step": 1482
},
{
"epoch": 2.8207322872087492,
"grad_norm": 0.35286685824394226,
"learning_rate": 9.061289298189902e-05,
"loss": 0.139,
"step": 1483
},
{
"epoch": 2.8226343319067997,
"grad_norm": 0.35866954922676086,
"learning_rate": 9.060654175928867e-05,
"loss": 0.2022,
"step": 1484
},
{
"epoch": 2.82453637660485,
"grad_norm": 0.36488500237464905,
"learning_rate": 9.060019053667832e-05,
"loss": 0.1816,
"step": 1485
},
{
"epoch": 2.8264384213029006,
"grad_norm": 0.4557202160358429,
"learning_rate": 9.059383931406796e-05,
"loss": 0.1975,
"step": 1486
},
{
"epoch": 2.828340466000951,
"grad_norm": 0.32717350125312805,
"learning_rate": 9.058748809145761e-05,
"loss": 0.1639,
"step": 1487
},
{
"epoch": 2.8302425106990015,
"grad_norm": 0.41179734468460083,
"learning_rate": 9.058113686884726e-05,
"loss": 0.1841,
"step": 1488
},
{
"epoch": 2.832144555397052,
"grad_norm": 0.3747973144054413,
"learning_rate": 9.05747856462369e-05,
"loss": 0.1678,
"step": 1489
},
{
"epoch": 2.8340466000951023,
"grad_norm": 0.41899365186691284,
"learning_rate": 9.056843442362655e-05,
"loss": 0.2753,
"step": 1490
},
{
"epoch": 2.835948644793153,
"grad_norm": 0.397416889667511,
"learning_rate": 9.05620832010162e-05,
"loss": 0.1658,
"step": 1491
},
{
"epoch": 2.8378506894912032,
"grad_norm": 0.3874271810054779,
"learning_rate": 9.055573197840584e-05,
"loss": 0.1808,
"step": 1492
},
{
"epoch": 2.8397527341892532,
"grad_norm": 0.3698302209377289,
"learning_rate": 9.05493807557955e-05,
"loss": 0.1869,
"step": 1493
},
{
"epoch": 2.841654778887304,
"grad_norm": 0.3908369541168213,
"learning_rate": 9.054302953318515e-05,
"loss": 0.1866,
"step": 1494
},
{
"epoch": 2.843556823585354,
"grad_norm": 0.5696883201599121,
"learning_rate": 9.053667831057478e-05,
"loss": 0.2083,
"step": 1495
},
{
"epoch": 2.8454588682834046,
"grad_norm": 0.3560580611228943,
"learning_rate": 9.053032708796444e-05,
"loss": 0.1829,
"step": 1496
},
{
"epoch": 2.847360912981455,
"grad_norm": 0.4369358718395233,
"learning_rate": 9.052397586535409e-05,
"loss": 0.2302,
"step": 1497
},
{
"epoch": 2.8492629576795054,
"grad_norm": 0.4240768551826477,
"learning_rate": 9.051762464274374e-05,
"loss": 0.2204,
"step": 1498
},
{
"epoch": 2.851165002377556,
"grad_norm": 0.4078483581542969,
"learning_rate": 9.051127342013338e-05,
"loss": 0.181,
"step": 1499
},
{
"epoch": 2.8530670470756063,
"grad_norm": 0.4196905195713043,
"learning_rate": 9.050492219752303e-05,
"loss": 0.2147,
"step": 1500
},
{
"epoch": 2.8549690917736568,
"grad_norm": 0.3858025372028351,
"learning_rate": 9.049857097491268e-05,
"loss": 0.1719,
"step": 1501
},
{
"epoch": 2.856871136471707,
"grad_norm": 0.3923434019088745,
"learning_rate": 9.049221975230232e-05,
"loss": 0.1966,
"step": 1502
},
{
"epoch": 2.8587731811697576,
"grad_norm": 0.38231122493743896,
"learning_rate": 9.048586852969197e-05,
"loss": 0.186,
"step": 1503
},
{
"epoch": 2.8606752258678076,
"grad_norm": 0.3579331040382385,
"learning_rate": 9.047951730708162e-05,
"loss": 0.1777,
"step": 1504
},
{
"epoch": 2.8625772705658585,
"grad_norm": 0.2968972623348236,
"learning_rate": 9.047316608447126e-05,
"loss": 0.1456,
"step": 1505
},
{
"epoch": 2.8644793152639085,
"grad_norm": 0.3534374535083771,
"learning_rate": 9.046681486186091e-05,
"loss": 0.178,
"step": 1506
},
{
"epoch": 2.866381359961959,
"grad_norm": 0.4368778169155121,
"learning_rate": 9.046046363925057e-05,
"loss": 0.2349,
"step": 1507
},
{
"epoch": 2.8682834046600094,
"grad_norm": 0.43825942277908325,
"learning_rate": 9.04541124166402e-05,
"loss": 0.1857,
"step": 1508
},
{
"epoch": 2.87018544935806,
"grad_norm": 0.35765841603279114,
"learning_rate": 9.044776119402986e-05,
"loss": 0.1787,
"step": 1509
},
{
"epoch": 2.8720874940561103,
"grad_norm": 0.35496601462364197,
"learning_rate": 9.04414099714195e-05,
"loss": 0.1776,
"step": 1510
},
{
"epoch": 2.8739895387541607,
"grad_norm": 0.39673030376434326,
"learning_rate": 9.043505874880916e-05,
"loss": 0.1916,
"step": 1511
},
{
"epoch": 2.875891583452211,
"grad_norm": 0.3670983612537384,
"learning_rate": 9.04287075261988e-05,
"loss": 0.1726,
"step": 1512
},
{
"epoch": 2.8777936281502616,
"grad_norm": 0.4254002273082733,
"learning_rate": 9.042235630358844e-05,
"loss": 0.224,
"step": 1513
},
{
"epoch": 2.879695672848312,
"grad_norm": 0.37891489267349243,
"learning_rate": 9.041600508097809e-05,
"loss": 0.1805,
"step": 1514
},
{
"epoch": 2.881597717546362,
"grad_norm": 0.33309099078178406,
"learning_rate": 9.040965385836774e-05,
"loss": 0.1442,
"step": 1515
},
{
"epoch": 2.883499762244413,
"grad_norm": 0.4709990918636322,
"learning_rate": 9.040330263575739e-05,
"loss": 0.2342,
"step": 1516
},
{
"epoch": 2.885401806942463,
"grad_norm": 0.41639766097068787,
"learning_rate": 9.039695141314703e-05,
"loss": 0.1772,
"step": 1517
},
{
"epoch": 2.8873038516405134,
"grad_norm": 0.37914562225341797,
"learning_rate": 9.039060019053668e-05,
"loss": 0.1632,
"step": 1518
},
{
"epoch": 2.889205896338564,
"grad_norm": 0.4000544250011444,
"learning_rate": 9.038424896792633e-05,
"loss": 0.1927,
"step": 1519
},
{
"epoch": 2.8911079410366143,
"grad_norm": 0.42467859387397766,
"learning_rate": 9.037789774531597e-05,
"loss": 0.1789,
"step": 1520
},
{
"epoch": 2.8930099857346647,
"grad_norm": 0.46945691108703613,
"learning_rate": 9.037154652270562e-05,
"loss": 0.197,
"step": 1521
},
{
"epoch": 2.894912030432715,
"grad_norm": 0.43455827236175537,
"learning_rate": 9.036519530009528e-05,
"loss": 0.1877,
"step": 1522
},
{
"epoch": 2.8968140751307656,
"grad_norm": 0.5169146656990051,
"learning_rate": 9.035884407748491e-05,
"loss": 0.2039,
"step": 1523
},
{
"epoch": 2.898716119828816,
"grad_norm": 0.42767763137817383,
"learning_rate": 9.035249285487457e-05,
"loss": 0.2123,
"step": 1524
},
{
"epoch": 2.9006181645268665,
"grad_norm": 0.40808382630348206,
"learning_rate": 9.034614163226422e-05,
"loss": 0.2617,
"step": 1525
},
{
"epoch": 2.902520209224917,
"grad_norm": 0.3179365396499634,
"learning_rate": 9.033979040965386e-05,
"loss": 0.1548,
"step": 1526
},
{
"epoch": 2.9044222539229674,
"grad_norm": 0.259781152009964,
"learning_rate": 9.033343918704351e-05,
"loss": 0.1299,
"step": 1527
},
{
"epoch": 2.9063242986210174,
"grad_norm": 0.40235599875450134,
"learning_rate": 9.032708796443316e-05,
"loss": 0.1957,
"step": 1528
},
{
"epoch": 2.9082263433190683,
"grad_norm": 0.3170933127403259,
"learning_rate": 9.032073674182281e-05,
"loss": 0.1594,
"step": 1529
},
{
"epoch": 2.9101283880171183,
"grad_norm": 0.31572115421295166,
"learning_rate": 9.031438551921245e-05,
"loss": 0.1922,
"step": 1530
},
{
"epoch": 2.9120304327151687,
"grad_norm": 0.4456964433193207,
"learning_rate": 9.03080342966021e-05,
"loss": 0.2459,
"step": 1531
},
{
"epoch": 2.913932477413219,
"grad_norm": 0.3345606327056885,
"learning_rate": 9.030168307399175e-05,
"loss": 0.1708,
"step": 1532
},
{
"epoch": 2.9158345221112696,
"grad_norm": 0.4247712790966034,
"learning_rate": 9.029533185138139e-05,
"loss": 0.227,
"step": 1533
},
{
"epoch": 2.91773656680932,
"grad_norm": 0.3642347455024719,
"learning_rate": 9.028898062877104e-05,
"loss": 0.1971,
"step": 1534
},
{
"epoch": 2.9196386115073705,
"grad_norm": 0.40530455112457275,
"learning_rate": 9.02826294061607e-05,
"loss": 0.1574,
"step": 1535
},
{
"epoch": 2.921540656205421,
"grad_norm": 0.5143640637397766,
"learning_rate": 9.027627818355033e-05,
"loss": 0.207,
"step": 1536
},
{
"epoch": 2.9234427009034714,
"grad_norm": 0.4270274043083191,
"learning_rate": 9.026992696093999e-05,
"loss": 0.1971,
"step": 1537
},
{
"epoch": 2.925344745601522,
"grad_norm": 0.5170589685440063,
"learning_rate": 9.026357573832964e-05,
"loss": 0.2768,
"step": 1538
},
{
"epoch": 2.927246790299572,
"grad_norm": 0.41313278675079346,
"learning_rate": 9.025722451571929e-05,
"loss": 0.1765,
"step": 1539
},
{
"epoch": 2.9291488349976227,
"grad_norm": 0.4040130078792572,
"learning_rate": 9.025087329310893e-05,
"loss": 0.2002,
"step": 1540
},
{
"epoch": 2.9310508796956727,
"grad_norm": 0.37281498312950134,
"learning_rate": 9.024452207049857e-05,
"loss": 0.1542,
"step": 1541
},
{
"epoch": 2.932952924393723,
"grad_norm": 0.5352873802185059,
"learning_rate": 9.023817084788823e-05,
"loss": 0.2437,
"step": 1542
},
{
"epoch": 2.9348549690917736,
"grad_norm": 0.4044128358364105,
"learning_rate": 9.023181962527787e-05,
"loss": 0.1721,
"step": 1543
},
{
"epoch": 2.936757013789824,
"grad_norm": 0.35553574562072754,
"learning_rate": 9.022546840266751e-05,
"loss": 0.1838,
"step": 1544
},
{
"epoch": 2.9386590584878745,
"grad_norm": 0.42568060755729675,
"learning_rate": 9.021911718005717e-05,
"loss": 0.2022,
"step": 1545
},
{
"epoch": 2.940561103185925,
"grad_norm": 0.453700453042984,
"learning_rate": 9.021276595744681e-05,
"loss": 0.1866,
"step": 1546
},
{
"epoch": 2.9424631478839753,
"grad_norm": 0.3909238576889038,
"learning_rate": 9.020641473483646e-05,
"loss": 0.1628,
"step": 1547
},
{
"epoch": 2.944365192582026,
"grad_norm": 0.39725926518440247,
"learning_rate": 9.02000635122261e-05,
"loss": 0.217,
"step": 1548
},
{
"epoch": 2.9462672372800762,
"grad_norm": 0.34860628843307495,
"learning_rate": 9.019371228961575e-05,
"loss": 0.1724,
"step": 1549
},
{
"epoch": 2.948169281978126,
"grad_norm": 0.38813674449920654,
"learning_rate": 9.01873610670054e-05,
"loss": 0.2047,
"step": 1550
},
{
"epoch": 2.950071326676177,
"grad_norm": 0.37160560488700867,
"learning_rate": 9.018100984439504e-05,
"loss": 0.2119,
"step": 1551
},
{
"epoch": 2.951973371374227,
"grad_norm": 0.4166210889816284,
"learning_rate": 9.017465862178471e-05,
"loss": 0.2215,
"step": 1552
},
{
"epoch": 2.9538754160722775,
"grad_norm": 0.3657042980194092,
"learning_rate": 9.016830739917435e-05,
"loss": 0.1924,
"step": 1553
},
{
"epoch": 2.955777460770328,
"grad_norm": 0.37292999029159546,
"learning_rate": 9.016195617656399e-05,
"loss": 0.2329,
"step": 1554
},
{
"epoch": 2.9576795054683784,
"grad_norm": 0.3373647928237915,
"learning_rate": 9.015560495395364e-05,
"loss": 0.2034,
"step": 1555
},
{
"epoch": 2.959581550166429,
"grad_norm": 0.31643402576446533,
"learning_rate": 9.014925373134329e-05,
"loss": 0.1713,
"step": 1556
},
{
"epoch": 2.9614835948644793,
"grad_norm": 0.3107222318649292,
"learning_rate": 9.014290250873294e-05,
"loss": 0.1511,
"step": 1557
},
{
"epoch": 2.9633856395625298,
"grad_norm": 0.32063353061676025,
"learning_rate": 9.013655128612258e-05,
"loss": 0.1581,
"step": 1558
},
{
"epoch": 2.96528768426058,
"grad_norm": 0.4035079777240753,
"learning_rate": 9.013020006351223e-05,
"loss": 0.2036,
"step": 1559
},
{
"epoch": 2.9671897289586306,
"grad_norm": 0.28573077917099,
"learning_rate": 9.012384884090188e-05,
"loss": 0.1388,
"step": 1560
},
{
"epoch": 2.969091773656681,
"grad_norm": 0.38853904604911804,
"learning_rate": 9.011749761829152e-05,
"loss": 0.1981,
"step": 1561
},
{
"epoch": 2.9709938183547315,
"grad_norm": 0.39904823899269104,
"learning_rate": 9.011114639568117e-05,
"loss": 0.2249,
"step": 1562
},
{
"epoch": 2.9728958630527815,
"grad_norm": 0.3704228103160858,
"learning_rate": 9.010479517307082e-05,
"loss": 0.2176,
"step": 1563
},
{
"epoch": 2.9747979077508324,
"grad_norm": 0.3712176978588104,
"learning_rate": 9.009844395046046e-05,
"loss": 0.1685,
"step": 1564
},
{
"epoch": 2.9766999524488824,
"grad_norm": 0.47927892208099365,
"learning_rate": 9.009209272785011e-05,
"loss": 0.2027,
"step": 1565
},
{
"epoch": 2.978601997146933,
"grad_norm": 0.4230005443096161,
"learning_rate": 9.008574150523977e-05,
"loss": 0.212,
"step": 1566
},
{
"epoch": 2.9805040418449833,
"grad_norm": 0.32152169942855835,
"learning_rate": 9.00793902826294e-05,
"loss": 0.1639,
"step": 1567
},
{
"epoch": 2.9824060865430337,
"grad_norm": 0.42794153094291687,
"learning_rate": 9.007303906001906e-05,
"loss": 0.2143,
"step": 1568
},
{
"epoch": 2.984308131241084,
"grad_norm": 0.37590306997299194,
"learning_rate": 9.006668783740871e-05,
"loss": 0.189,
"step": 1569
},
{
"epoch": 2.9862101759391346,
"grad_norm": 0.3247901201248169,
"learning_rate": 9.006033661479836e-05,
"loss": 0.1616,
"step": 1570
},
{
"epoch": 2.988112220637185,
"grad_norm": 0.36269792914390564,
"learning_rate": 9.0053985392188e-05,
"loss": 0.2037,
"step": 1571
},
{
"epoch": 2.9900142653352355,
"grad_norm": 0.4436742067337036,
"learning_rate": 9.004763416957764e-05,
"loss": 0.202,
"step": 1572
},
{
"epoch": 2.991916310033286,
"grad_norm": 0.45660001039505005,
"learning_rate": 9.00412829469673e-05,
"loss": 0.2298,
"step": 1573
},
{
"epoch": 2.993818354731336,
"grad_norm": 0.3276821970939636,
"learning_rate": 9.003493172435694e-05,
"loss": 0.158,
"step": 1574
},
{
"epoch": 2.995720399429387,
"grad_norm": 0.3427131175994873,
"learning_rate": 9.002858050174659e-05,
"loss": 0.1781,
"step": 1575
},
{
"epoch": 2.997622444127437,
"grad_norm": 0.38842669129371643,
"learning_rate": 9.002222927913624e-05,
"loss": 0.1905,
"step": 1576
},
{
"epoch": 2.9995244888254873,
"grad_norm": 0.4034234285354614,
"learning_rate": 9.001587805652588e-05,
"loss": 0.1989,
"step": 1577
},
{
"epoch": 3.0014265335235377,
"grad_norm": 0.23682546615600586,
"learning_rate": 9.000952683391553e-05,
"loss": 0.0968,
"step": 1578
},
{
"epoch": 3.003328578221588,
"grad_norm": 0.23321636021137238,
"learning_rate": 9.000317561130517e-05,
"loss": 0.1278,
"step": 1579
},
{
"epoch": 3.0052306229196386,
"grad_norm": 0.2891576290130615,
"learning_rate": 8.999682438869482e-05,
"loss": 0.1297,
"step": 1580
},
{
"epoch": 3.007132667617689,
"grad_norm": 0.30067315697669983,
"learning_rate": 8.999047316608448e-05,
"loss": 0.1216,
"step": 1581
},
{
"epoch": 3.0090347123157395,
"grad_norm": 0.25676554441452026,
"learning_rate": 8.998412194347411e-05,
"loss": 0.1167,
"step": 1582
},
{
"epoch": 3.01093675701379,
"grad_norm": 0.30124133825302124,
"learning_rate": 8.997777072086378e-05,
"loss": 0.1243,
"step": 1583
},
{
"epoch": 3.0128388017118404,
"grad_norm": 0.30313733220100403,
"learning_rate": 8.997141949825342e-05,
"loss": 0.127,
"step": 1584
},
{
"epoch": 3.014740846409891,
"grad_norm": 0.36067837476730347,
"learning_rate": 8.996506827564306e-05,
"loss": 0.1331,
"step": 1585
},
{
"epoch": 3.0166428911079413,
"grad_norm": 0.3327738642692566,
"learning_rate": 8.995871705303271e-05,
"loss": 0.1304,
"step": 1586
},
{
"epoch": 3.0185449358059913,
"grad_norm": 0.2918979525566101,
"learning_rate": 8.995236583042236e-05,
"loss": 0.1127,
"step": 1587
},
{
"epoch": 3.0204469805040417,
"grad_norm": 0.40982192754745483,
"learning_rate": 8.994601460781201e-05,
"loss": 0.1283,
"step": 1588
},
{
"epoch": 3.022349025202092,
"grad_norm": 0.37201565504074097,
"learning_rate": 8.993966338520165e-05,
"loss": 0.1198,
"step": 1589
},
{
"epoch": 3.0242510699001426,
"grad_norm": 0.4271756708621979,
"learning_rate": 8.99333121625913e-05,
"loss": 0.1218,
"step": 1590
},
{
"epoch": 3.026153114598193,
"grad_norm": 0.3430047035217285,
"learning_rate": 8.992696093998095e-05,
"loss": 0.1213,
"step": 1591
},
{
"epoch": 3.0280551592962435,
"grad_norm": 0.3253467381000519,
"learning_rate": 8.992060971737059e-05,
"loss": 0.1124,
"step": 1592
},
{
"epoch": 3.029957203994294,
"grad_norm": 0.38685157895088196,
"learning_rate": 8.991425849476024e-05,
"loss": 0.112,
"step": 1593
},
{
"epoch": 3.0318592486923444,
"grad_norm": 0.36162498593330383,
"learning_rate": 8.99079072721499e-05,
"loss": 0.1061,
"step": 1594
},
{
"epoch": 3.033761293390395,
"grad_norm": 0.32084980607032776,
"learning_rate": 8.990155604953953e-05,
"loss": 0.0965,
"step": 1595
},
{
"epoch": 3.0356633380884452,
"grad_norm": 0.4037097096443176,
"learning_rate": 8.989520482692919e-05,
"loss": 0.1237,
"step": 1596
},
{
"epoch": 3.0375653827864957,
"grad_norm": 0.23668204247951508,
"learning_rate": 8.988885360431884e-05,
"loss": 0.1778,
"step": 1597
},
{
"epoch": 3.0394674274845457,
"grad_norm": 0.3448043167591095,
"learning_rate": 8.988250238170848e-05,
"loss": 0.1349,
"step": 1598
},
{
"epoch": 3.041369472182596,
"grad_norm": 0.39455583691596985,
"learning_rate": 8.987615115909813e-05,
"loss": 0.1175,
"step": 1599
},
{
"epoch": 3.0432715168806466,
"grad_norm": 0.39552587270736694,
"learning_rate": 8.986979993648778e-05,
"loss": 0.1296,
"step": 1600
},
{
"epoch": 3.045173561578697,
"grad_norm": 0.36603817343711853,
"learning_rate": 8.986344871387743e-05,
"loss": 0.1392,
"step": 1601
},
{
"epoch": 3.0470756062767475,
"grad_norm": 0.34084847569465637,
"learning_rate": 8.985709749126707e-05,
"loss": 0.1155,
"step": 1602
},
{
"epoch": 3.048977650974798,
"grad_norm": 0.36548131704330444,
"learning_rate": 8.985074626865672e-05,
"loss": 0.1381,
"step": 1603
},
{
"epoch": 3.0508796956728483,
"grad_norm": 0.30957910418510437,
"learning_rate": 8.984439504604637e-05,
"loss": 0.1123,
"step": 1604
},
{
"epoch": 3.0527817403708988,
"grad_norm": 0.38922393321990967,
"learning_rate": 8.983804382343601e-05,
"loss": 0.1588,
"step": 1605
},
{
"epoch": 3.054683785068949,
"grad_norm": 0.3416849672794342,
"learning_rate": 8.983169260082566e-05,
"loss": 0.1236,
"step": 1606
},
{
"epoch": 3.0565858297669997,
"grad_norm": 0.31353771686553955,
"learning_rate": 8.982534137821532e-05,
"loss": 0.1025,
"step": 1607
},
{
"epoch": 3.05848787446505,
"grad_norm": 0.36878702044487,
"learning_rate": 8.981899015560495e-05,
"loss": 0.1421,
"step": 1608
},
{
"epoch": 3.0603899191631005,
"grad_norm": 0.38487425446510315,
"learning_rate": 8.98126389329946e-05,
"loss": 0.1223,
"step": 1609
},
{
"epoch": 3.0622919638611505,
"grad_norm": 0.3435547649860382,
"learning_rate": 8.980628771038426e-05,
"loss": 0.1105,
"step": 1610
},
{
"epoch": 3.064194008559201,
"grad_norm": 0.422198086977005,
"learning_rate": 8.979993648777391e-05,
"loss": 0.1368,
"step": 1611
},
{
"epoch": 3.0660960532572514,
"grad_norm": 0.43352290987968445,
"learning_rate": 8.979358526516355e-05,
"loss": 0.1743,
"step": 1612
},
{
"epoch": 3.067998097955302,
"grad_norm": 0.3885476887226105,
"learning_rate": 8.978723404255319e-05,
"loss": 0.1979,
"step": 1613
},
{
"epoch": 3.0699001426533523,
"grad_norm": 0.3135451376438141,
"learning_rate": 8.978088281994285e-05,
"loss": 0.1105,
"step": 1614
},
{
"epoch": 3.0718021873514028,
"grad_norm": 0.4184531271457672,
"learning_rate": 8.977453159733249e-05,
"loss": 0.1335,
"step": 1615
},
{
"epoch": 3.073704232049453,
"grad_norm": 0.35463500022888184,
"learning_rate": 8.976818037472213e-05,
"loss": 0.1384,
"step": 1616
},
{
"epoch": 3.0756062767475036,
"grad_norm": 0.33959662914276123,
"learning_rate": 8.97618291521118e-05,
"loss": 0.118,
"step": 1617
},
{
"epoch": 3.077508321445554,
"grad_norm": 0.3295678198337555,
"learning_rate": 8.975547792950143e-05,
"loss": 0.1073,
"step": 1618
},
{
"epoch": 3.0794103661436045,
"grad_norm": 0.32906121015548706,
"learning_rate": 8.974912670689108e-05,
"loss": 0.0992,
"step": 1619
},
{
"epoch": 3.081312410841655,
"grad_norm": 0.2967415750026703,
"learning_rate": 8.974277548428072e-05,
"loss": 0.0901,
"step": 1620
},
{
"epoch": 3.0832144555397054,
"grad_norm": 0.3415001928806305,
"learning_rate": 8.973642426167037e-05,
"loss": 0.1248,
"step": 1621
},
{
"epoch": 3.0851165002377554,
"grad_norm": 0.2587614357471466,
"learning_rate": 8.973007303906003e-05,
"loss": 0.0872,
"step": 1622
},
{
"epoch": 3.087018544935806,
"grad_norm": 0.3469274640083313,
"learning_rate": 8.972372181644966e-05,
"loss": 0.1147,
"step": 1623
},
{
"epoch": 3.0889205896338563,
"grad_norm": 0.28534063696861267,
"learning_rate": 8.971737059383932e-05,
"loss": 0.1377,
"step": 1624
},
{
"epoch": 3.0908226343319067,
"grad_norm": 0.3836195170879364,
"learning_rate": 8.971101937122897e-05,
"loss": 0.1242,
"step": 1625
},
{
"epoch": 3.092724679029957,
"grad_norm": 0.40428081154823303,
"learning_rate": 8.97046681486186e-05,
"loss": 0.1017,
"step": 1626
},
{
"epoch": 3.0946267237280076,
"grad_norm": 0.37237152457237244,
"learning_rate": 8.969831692600826e-05,
"loss": 0.1318,
"step": 1627
},
{
"epoch": 3.096528768426058,
"grad_norm": 0.3669044077396393,
"learning_rate": 8.969196570339791e-05,
"loss": 0.1191,
"step": 1628
},
{
"epoch": 3.0984308131241085,
"grad_norm": 0.36814671754837036,
"learning_rate": 8.968561448078756e-05,
"loss": 0.1227,
"step": 1629
},
{
"epoch": 3.100332857822159,
"grad_norm": 0.3883667290210724,
"learning_rate": 8.96792632581772e-05,
"loss": 0.1556,
"step": 1630
},
{
"epoch": 3.1022349025202094,
"grad_norm": 0.44517648220062256,
"learning_rate": 8.967291203556685e-05,
"loss": 0.1439,
"step": 1631
},
{
"epoch": 3.10413694721826,
"grad_norm": 0.3230499029159546,
"learning_rate": 8.96665608129565e-05,
"loss": 0.117,
"step": 1632
},
{
"epoch": 3.10603899191631,
"grad_norm": 0.2505279779434204,
"learning_rate": 8.966020959034614e-05,
"loss": 0.0945,
"step": 1633
},
{
"epoch": 3.1079410366143603,
"grad_norm": 0.31753817200660706,
"learning_rate": 8.96538583677358e-05,
"loss": 0.1119,
"step": 1634
},
{
"epoch": 3.1098430813124107,
"grad_norm": 0.34199607372283936,
"learning_rate": 8.964750714512545e-05,
"loss": 0.1508,
"step": 1635
},
{
"epoch": 3.111745126010461,
"grad_norm": 0.39167290925979614,
"learning_rate": 8.964115592251508e-05,
"loss": 0.1422,
"step": 1636
},
{
"epoch": 3.1136471707085116,
"grad_norm": 0.28108343482017517,
"learning_rate": 8.963480469990474e-05,
"loss": 0.0981,
"step": 1637
},
{
"epoch": 3.115549215406562,
"grad_norm": 0.2806454598903656,
"learning_rate": 8.962845347729439e-05,
"loss": 0.1227,
"step": 1638
},
{
"epoch": 3.1174512601046125,
"grad_norm": 0.3393970727920532,
"learning_rate": 8.962210225468403e-05,
"loss": 0.1419,
"step": 1639
},
{
"epoch": 3.119353304802663,
"grad_norm": 0.3800428509712219,
"learning_rate": 8.961575103207368e-05,
"loss": 0.1323,
"step": 1640
},
{
"epoch": 3.1212553495007134,
"grad_norm": 0.3849729299545288,
"learning_rate": 8.960939980946333e-05,
"loss": 0.1505,
"step": 1641
},
{
"epoch": 3.123157394198764,
"grad_norm": 0.38189247250556946,
"learning_rate": 8.960304858685298e-05,
"loss": 0.1303,
"step": 1642
},
{
"epoch": 3.1250594388968143,
"grad_norm": 0.3030915856361389,
"learning_rate": 8.959669736424262e-05,
"loss": 0.1141,
"step": 1643
},
{
"epoch": 3.1269614835948643,
"grad_norm": 0.3842359185218811,
"learning_rate": 8.959034614163226e-05,
"loss": 0.1124,
"step": 1644
},
{
"epoch": 3.1288635282929147,
"grad_norm": 0.3637976348400116,
"learning_rate": 8.958399491902192e-05,
"loss": 0.1275,
"step": 1645
},
{
"epoch": 3.130765572990965,
"grad_norm": 0.2884964346885681,
"learning_rate": 8.957764369641156e-05,
"loss": 0.1065,
"step": 1646
},
{
"epoch": 3.1326676176890156,
"grad_norm": 0.3866124749183655,
"learning_rate": 8.957129247380121e-05,
"loss": 0.1389,
"step": 1647
},
{
"epoch": 3.134569662387066,
"grad_norm": 0.418950617313385,
"learning_rate": 8.956494125119087e-05,
"loss": 0.1406,
"step": 1648
},
{
"epoch": 3.1364717070851165,
"grad_norm": 0.37514927983283997,
"learning_rate": 8.95585900285805e-05,
"loss": 0.1239,
"step": 1649
},
{
"epoch": 3.138373751783167,
"grad_norm": 0.29558438062667847,
"learning_rate": 8.955223880597016e-05,
"loss": 0.1077,
"step": 1650
},
{
"epoch": 3.1402757964812174,
"grad_norm": 0.3241124749183655,
"learning_rate": 8.95458875833598e-05,
"loss": 0.1254,
"step": 1651
},
{
"epoch": 3.142177841179268,
"grad_norm": 0.40942251682281494,
"learning_rate": 8.953953636074945e-05,
"loss": 0.1388,
"step": 1652
},
{
"epoch": 3.1440798858773182,
"grad_norm": 0.3899609446525574,
"learning_rate": 8.95331851381391e-05,
"loss": 0.1279,
"step": 1653
},
{
"epoch": 3.1459819305753687,
"grad_norm": 0.37820303440093994,
"learning_rate": 8.952683391552874e-05,
"loss": 0.1146,
"step": 1654
},
{
"epoch": 3.147883975273419,
"grad_norm": 0.3521963059902191,
"learning_rate": 8.95204826929184e-05,
"loss": 0.1337,
"step": 1655
},
{
"epoch": 3.1497860199714696,
"grad_norm": 0.3292877674102783,
"learning_rate": 8.951413147030804e-05,
"loss": 0.1225,
"step": 1656
},
{
"epoch": 3.1516880646695196,
"grad_norm": 0.28479406237602234,
"learning_rate": 8.950778024769768e-05,
"loss": 0.1006,
"step": 1657
},
{
"epoch": 3.15359010936757,
"grad_norm": 0.2883979380130768,
"learning_rate": 8.950142902508733e-05,
"loss": 0.1114,
"step": 1658
},
{
"epoch": 3.1554921540656204,
"grad_norm": 0.33744558691978455,
"learning_rate": 8.949507780247698e-05,
"loss": 0.1263,
"step": 1659
},
{
"epoch": 3.157394198763671,
"grad_norm": 0.2845192551612854,
"learning_rate": 8.948872657986663e-05,
"loss": 0.1047,
"step": 1660
},
{
"epoch": 3.1592962434617213,
"grad_norm": 0.3539939224720001,
"learning_rate": 8.948237535725627e-05,
"loss": 0.1183,
"step": 1661
},
{
"epoch": 3.1611982881597718,
"grad_norm": 0.24927809834480286,
"learning_rate": 8.947602413464592e-05,
"loss": 0.0825,
"step": 1662
},
{
"epoch": 3.163100332857822,
"grad_norm": 0.4059623181819916,
"learning_rate": 8.946967291203558e-05,
"loss": 0.1457,
"step": 1663
},
{
"epoch": 3.1650023775558727,
"grad_norm": 0.3298782706260681,
"learning_rate": 8.946332168942521e-05,
"loss": 0.1226,
"step": 1664
},
{
"epoch": 3.166904422253923,
"grad_norm": 0.3750251829624176,
"learning_rate": 8.945697046681487e-05,
"loss": 0.144,
"step": 1665
},
{
"epoch": 3.1688064669519735,
"grad_norm": 0.40858665108680725,
"learning_rate": 8.945061924420452e-05,
"loss": 0.1426,
"step": 1666
},
{
"epoch": 3.170708511650024,
"grad_norm": 0.38032254576683044,
"learning_rate": 8.944426802159416e-05,
"loss": 0.1479,
"step": 1667
},
{
"epoch": 3.172610556348074,
"grad_norm": 0.3702940046787262,
"learning_rate": 8.943791679898381e-05,
"loss": 0.1262,
"step": 1668
},
{
"epoch": 3.1745126010461244,
"grad_norm": 0.43061700463294983,
"learning_rate": 8.943156557637346e-05,
"loss": 0.1463,
"step": 1669
},
{
"epoch": 3.176414645744175,
"grad_norm": 0.2968880832195282,
"learning_rate": 8.94252143537631e-05,
"loss": 0.1135,
"step": 1670
},
{
"epoch": 3.1783166904422253,
"grad_norm": 0.28398388624191284,
"learning_rate": 8.941886313115275e-05,
"loss": 0.1137,
"step": 1671
},
{
"epoch": 3.1802187351402758,
"grad_norm": 0.2764633595943451,
"learning_rate": 8.94125119085424e-05,
"loss": 0.0974,
"step": 1672
},
{
"epoch": 3.182120779838326,
"grad_norm": 0.39509302377700806,
"learning_rate": 8.940616068593205e-05,
"loss": 0.1491,
"step": 1673
},
{
"epoch": 3.1840228245363766,
"grad_norm": 0.2926827669143677,
"learning_rate": 8.939980946332169e-05,
"loss": 0.1207,
"step": 1674
},
{
"epoch": 3.185924869234427,
"grad_norm": 0.35445713996887207,
"learning_rate": 8.939345824071133e-05,
"loss": 0.1252,
"step": 1675
},
{
"epoch": 3.1878269139324775,
"grad_norm": 0.3183155059814453,
"learning_rate": 8.9387107018101e-05,
"loss": 0.1178,
"step": 1676
},
{
"epoch": 3.189728958630528,
"grad_norm": 0.40158188343048096,
"learning_rate": 8.938075579549063e-05,
"loss": 0.1266,
"step": 1677
},
{
"epoch": 3.1916310033285784,
"grad_norm": 0.33932897448539734,
"learning_rate": 8.937440457288029e-05,
"loss": 0.1321,
"step": 1678
},
{
"epoch": 3.1935330480266284,
"grad_norm": 0.3436925411224365,
"learning_rate": 8.936805335026994e-05,
"loss": 0.1204,
"step": 1679
},
{
"epoch": 3.195435092724679,
"grad_norm": 0.32970649003982544,
"learning_rate": 8.936170212765958e-05,
"loss": 0.1023,
"step": 1680
},
{
"epoch": 3.1973371374227293,
"grad_norm": 0.3206690549850464,
"learning_rate": 8.935535090504923e-05,
"loss": 0.1011,
"step": 1681
},
{
"epoch": 3.1992391821207797,
"grad_norm": 0.39323487877845764,
"learning_rate": 8.934899968243887e-05,
"loss": 0.1263,
"step": 1682
},
{
"epoch": 3.20114122681883,
"grad_norm": 0.3755662143230438,
"learning_rate": 8.934264845982853e-05,
"loss": 0.1345,
"step": 1683
},
{
"epoch": 3.2030432715168806,
"grad_norm": 0.3337384760379791,
"learning_rate": 8.933629723721817e-05,
"loss": 0.1094,
"step": 1684
},
{
"epoch": 3.204945316214931,
"grad_norm": 0.35307517647743225,
"learning_rate": 8.932994601460781e-05,
"loss": 0.1244,
"step": 1685
},
{
"epoch": 3.2068473609129815,
"grad_norm": 0.2809374928474426,
"learning_rate": 8.932359479199747e-05,
"loss": 0.0961,
"step": 1686
},
{
"epoch": 3.208749405611032,
"grad_norm": 0.35939821600914,
"learning_rate": 8.931724356938711e-05,
"loss": 0.1294,
"step": 1687
},
{
"epoch": 3.2106514503090824,
"grad_norm": 0.36626148223876953,
"learning_rate": 8.931089234677675e-05,
"loss": 0.141,
"step": 1688
},
{
"epoch": 3.212553495007133,
"grad_norm": 0.31976842880249023,
"learning_rate": 8.93045411241664e-05,
"loss": 0.1058,
"step": 1689
},
{
"epoch": 3.2144555397051833,
"grad_norm": 0.40340307354927063,
"learning_rate": 8.929818990155605e-05,
"loss": 0.142,
"step": 1690
},
{
"epoch": 3.2163575844032333,
"grad_norm": 0.3481243848800659,
"learning_rate": 8.92918386789457e-05,
"loss": 0.1301,
"step": 1691
},
{
"epoch": 3.2182596291012837,
"grad_norm": 0.41779786348342896,
"learning_rate": 8.928548745633534e-05,
"loss": 0.1531,
"step": 1692
},
{
"epoch": 3.220161673799334,
"grad_norm": 0.33376792073249817,
"learning_rate": 8.9279136233725e-05,
"loss": 0.1397,
"step": 1693
},
{
"epoch": 3.2220637184973846,
"grad_norm": 0.42083820700645447,
"learning_rate": 8.927278501111465e-05,
"loss": 0.1456,
"step": 1694
},
{
"epoch": 3.223965763195435,
"grad_norm": 0.23268885910511017,
"learning_rate": 8.926643378850429e-05,
"loss": 0.1261,
"step": 1695
},
{
"epoch": 3.2258678078934855,
"grad_norm": 0.3965808153152466,
"learning_rate": 8.926008256589394e-05,
"loss": 0.1454,
"step": 1696
},
{
"epoch": 3.227769852591536,
"grad_norm": 0.40782594680786133,
"learning_rate": 8.925373134328359e-05,
"loss": 0.137,
"step": 1697
},
{
"epoch": 3.2296718972895864,
"grad_norm": 0.37247705459594727,
"learning_rate": 8.924738012067323e-05,
"loss": 0.1227,
"step": 1698
},
{
"epoch": 3.231573941987637,
"grad_norm": 0.5225626230239868,
"learning_rate": 8.924102889806288e-05,
"loss": 0.1596,
"step": 1699
},
{
"epoch": 3.2334759866856873,
"grad_norm": 0.35236862301826477,
"learning_rate": 8.923467767545253e-05,
"loss": 0.1576,
"step": 1700
},
{
"epoch": 3.2353780313837377,
"grad_norm": 0.3305290639400482,
"learning_rate": 8.922832645284218e-05,
"loss": 0.1114,
"step": 1701
},
{
"epoch": 3.237280076081788,
"grad_norm": 0.37631455063819885,
"learning_rate": 8.922197523023182e-05,
"loss": 0.1278,
"step": 1702
},
{
"epoch": 3.239182120779838,
"grad_norm": 0.3439154624938965,
"learning_rate": 8.921562400762147e-05,
"loss": 0.1658,
"step": 1703
},
{
"epoch": 3.2410841654778886,
"grad_norm": 0.4184103310108185,
"learning_rate": 8.920927278501112e-05,
"loss": 0.1754,
"step": 1704
},
{
"epoch": 3.242986210175939,
"grad_norm": 0.3708958029747009,
"learning_rate": 8.920292156240076e-05,
"loss": 0.148,
"step": 1705
},
{
"epoch": 3.2448882548739895,
"grad_norm": 0.36626115441322327,
"learning_rate": 8.919657033979041e-05,
"loss": 0.152,
"step": 1706
},
{
"epoch": 3.24679029957204,
"grad_norm": 0.3738412857055664,
"learning_rate": 8.919021911718007e-05,
"loss": 0.1432,
"step": 1707
},
{
"epoch": 3.2486923442700903,
"grad_norm": 0.4470990002155304,
"learning_rate": 8.91838678945697e-05,
"loss": 0.1639,
"step": 1708
},
{
"epoch": 3.250594388968141,
"grad_norm": 0.3332229554653168,
"learning_rate": 8.917751667195936e-05,
"loss": 0.1257,
"step": 1709
},
{
"epoch": 3.2524964336661912,
"grad_norm": 0.3853921890258789,
"learning_rate": 8.917116544934901e-05,
"loss": 0.1262,
"step": 1710
},
{
"epoch": 3.2543984783642417,
"grad_norm": 0.32993221282958984,
"learning_rate": 8.916481422673865e-05,
"loss": 0.1231,
"step": 1711
},
{
"epoch": 3.256300523062292,
"grad_norm": 0.3631759285926819,
"learning_rate": 8.91584630041283e-05,
"loss": 0.148,
"step": 1712
},
{
"epoch": 3.2582025677603426,
"grad_norm": 0.40394118428230286,
"learning_rate": 8.915211178151795e-05,
"loss": 0.1542,
"step": 1713
},
{
"epoch": 3.2601046124583926,
"grad_norm": 0.3267883360385895,
"learning_rate": 8.91457605589076e-05,
"loss": 0.1411,
"step": 1714
},
{
"epoch": 3.262006657156443,
"grad_norm": 0.3076201379299164,
"learning_rate": 8.913940933629724e-05,
"loss": 0.1189,
"step": 1715
},
{
"epoch": 3.2639087018544934,
"grad_norm": 0.43854421377182007,
"learning_rate": 8.913305811368688e-05,
"loss": 0.1806,
"step": 1716
},
{
"epoch": 3.265810746552544,
"grad_norm": 0.2679373621940613,
"learning_rate": 8.912670689107654e-05,
"loss": 0.1251,
"step": 1717
},
{
"epoch": 3.2677127912505943,
"grad_norm": 0.35840150713920593,
"learning_rate": 8.912035566846618e-05,
"loss": 0.1276,
"step": 1718
},
{
"epoch": 3.2696148359486448,
"grad_norm": 0.368457168340683,
"learning_rate": 8.911400444585583e-05,
"loss": 0.1312,
"step": 1719
},
{
"epoch": 3.271516880646695,
"grad_norm": 0.3617841303348541,
"learning_rate": 8.910765322324549e-05,
"loss": 0.1165,
"step": 1720
},
{
"epoch": 3.2734189253447457,
"grad_norm": 0.34482330083847046,
"learning_rate": 8.910130200063512e-05,
"loss": 0.1246,
"step": 1721
},
{
"epoch": 3.275320970042796,
"grad_norm": 0.27358710765838623,
"learning_rate": 8.909495077802478e-05,
"loss": 0.1093,
"step": 1722
},
{
"epoch": 3.2772230147408465,
"grad_norm": 0.40264174342155457,
"learning_rate": 8.908859955541441e-05,
"loss": 0.146,
"step": 1723
},
{
"epoch": 3.279125059438897,
"grad_norm": 0.45845937728881836,
"learning_rate": 8.908224833280407e-05,
"loss": 0.1457,
"step": 1724
},
{
"epoch": 3.281027104136947,
"grad_norm": 0.34490594267845154,
"learning_rate": 8.907589711019372e-05,
"loss": 0.1247,
"step": 1725
},
{
"epoch": 3.282929148834998,
"grad_norm": 0.4256596267223358,
"learning_rate": 8.906954588758336e-05,
"loss": 0.1563,
"step": 1726
},
{
"epoch": 3.284831193533048,
"grad_norm": 0.3607080280780792,
"learning_rate": 8.906319466497302e-05,
"loss": 0.1279,
"step": 1727
},
{
"epoch": 3.2867332382310983,
"grad_norm": 0.30969080328941345,
"learning_rate": 8.905684344236266e-05,
"loss": 0.1238,
"step": 1728
},
{
"epoch": 3.2886352829291488,
"grad_norm": 0.34044647216796875,
"learning_rate": 8.90504922197523e-05,
"loss": 0.1237,
"step": 1729
},
{
"epoch": 3.290537327627199,
"grad_norm": 0.40037238597869873,
"learning_rate": 8.904414099714195e-05,
"loss": 0.1509,
"step": 1730
},
{
"epoch": 3.2924393723252496,
"grad_norm": 0.3565572500228882,
"learning_rate": 8.90377897745316e-05,
"loss": 0.1251,
"step": 1731
},
{
"epoch": 3.2943414170233,
"grad_norm": 0.33730757236480713,
"learning_rate": 8.903143855192125e-05,
"loss": 0.1527,
"step": 1732
},
{
"epoch": 3.2962434617213505,
"grad_norm": 0.4168394207954407,
"learning_rate": 8.902508732931089e-05,
"loss": 0.1429,
"step": 1733
},
{
"epoch": 3.298145506419401,
"grad_norm": 0.40814298391342163,
"learning_rate": 8.901873610670054e-05,
"loss": 0.1588,
"step": 1734
},
{
"epoch": 3.3000475511174514,
"grad_norm": 0.42030104994773865,
"learning_rate": 8.90123848840902e-05,
"loss": 0.1495,
"step": 1735
},
{
"epoch": 3.301949595815502,
"grad_norm": 0.3305467367172241,
"learning_rate": 8.900603366147983e-05,
"loss": 0.1239,
"step": 1736
},
{
"epoch": 3.3038516405135523,
"grad_norm": 0.31360068917274475,
"learning_rate": 8.899968243886949e-05,
"loss": 0.108,
"step": 1737
},
{
"epoch": 3.3057536852116023,
"grad_norm": 0.42463186383247375,
"learning_rate": 8.899333121625914e-05,
"loss": 0.1451,
"step": 1738
},
{
"epoch": 3.3076557299096527,
"grad_norm": 0.3854060471057892,
"learning_rate": 8.898697999364878e-05,
"loss": 0.1638,
"step": 1739
},
{
"epoch": 3.309557774607703,
"grad_norm": 0.46821728348731995,
"learning_rate": 8.898062877103843e-05,
"loss": 0.1718,
"step": 1740
},
{
"epoch": 3.3114598193057536,
"grad_norm": 0.33078089356422424,
"learning_rate": 8.897427754842808e-05,
"loss": 0.1153,
"step": 1741
},
{
"epoch": 3.313361864003804,
"grad_norm": 0.3746374249458313,
"learning_rate": 8.896792632581772e-05,
"loss": 0.1387,
"step": 1742
},
{
"epoch": 3.3152639087018545,
"grad_norm": 0.33252257108688354,
"learning_rate": 8.896157510320737e-05,
"loss": 0.1218,
"step": 1743
},
{
"epoch": 3.317165953399905,
"grad_norm": 0.3421841561794281,
"learning_rate": 8.895522388059702e-05,
"loss": 0.1376,
"step": 1744
},
{
"epoch": 3.3190679980979554,
"grad_norm": 0.3410481810569763,
"learning_rate": 8.894887265798667e-05,
"loss": 0.1174,
"step": 1745
},
{
"epoch": 3.320970042796006,
"grad_norm": 0.3556031882762909,
"learning_rate": 8.894252143537631e-05,
"loss": 0.1612,
"step": 1746
},
{
"epoch": 3.3228720874940563,
"grad_norm": 0.35139304399490356,
"learning_rate": 8.893617021276595e-05,
"loss": 0.1371,
"step": 1747
},
{
"epoch": 3.3247741321921067,
"grad_norm": 0.38646724820137024,
"learning_rate": 8.892981899015562e-05,
"loss": 0.1472,
"step": 1748
},
{
"epoch": 3.3266761768901567,
"grad_norm": 0.40337100625038147,
"learning_rate": 8.892346776754525e-05,
"loss": 0.1938,
"step": 1749
},
{
"epoch": 3.328578221588207,
"grad_norm": 0.2508182227611542,
"learning_rate": 8.89171165449349e-05,
"loss": 0.0987,
"step": 1750
},
{
"epoch": 3.3304802662862576,
"grad_norm": 0.392284631729126,
"learning_rate": 8.891076532232456e-05,
"loss": 0.1448,
"step": 1751
},
{
"epoch": 3.332382310984308,
"grad_norm": 0.25311291217803955,
"learning_rate": 8.89044140997142e-05,
"loss": 0.1227,
"step": 1752
},
{
"epoch": 3.3342843556823585,
"grad_norm": 0.38591787219047546,
"learning_rate": 8.889806287710385e-05,
"loss": 0.1251,
"step": 1753
},
{
"epoch": 3.336186400380409,
"grad_norm": 0.3149789869785309,
"learning_rate": 8.889171165449349e-05,
"loss": 0.1282,
"step": 1754
},
{
"epoch": 3.3380884450784594,
"grad_norm": 0.4134093225002289,
"learning_rate": 8.888536043188315e-05,
"loss": 0.1509,
"step": 1755
},
{
"epoch": 3.33999048977651,
"grad_norm": 0.3769814074039459,
"learning_rate": 8.887900920927279e-05,
"loss": 0.1283,
"step": 1756
},
{
"epoch": 3.3418925344745603,
"grad_norm": 0.42259126901626587,
"learning_rate": 8.887265798666243e-05,
"loss": 0.1319,
"step": 1757
},
{
"epoch": 3.3437945791726107,
"grad_norm": 0.4603644609451294,
"learning_rate": 8.88663067640521e-05,
"loss": 0.1427,
"step": 1758
},
{
"epoch": 3.345696623870661,
"grad_norm": 0.3804812431335449,
"learning_rate": 8.885995554144173e-05,
"loss": 0.1479,
"step": 1759
},
{
"epoch": 3.347598668568711,
"grad_norm": 0.42290598154067993,
"learning_rate": 8.885360431883137e-05,
"loss": 0.17,
"step": 1760
},
{
"epoch": 3.3495007132667616,
"grad_norm": 0.3739291727542877,
"learning_rate": 8.884725309622102e-05,
"loss": 0.1297,
"step": 1761
},
{
"epoch": 3.351402757964812,
"grad_norm": 0.36516469717025757,
"learning_rate": 8.884090187361067e-05,
"loss": 0.1294,
"step": 1762
},
{
"epoch": 3.3533048026628625,
"grad_norm": 0.32364609837532043,
"learning_rate": 8.883455065100033e-05,
"loss": 0.1211,
"step": 1763
},
{
"epoch": 3.355206847360913,
"grad_norm": 0.3903793394565582,
"learning_rate": 8.882819942838996e-05,
"loss": 0.1339,
"step": 1764
},
{
"epoch": 3.3571088920589633,
"grad_norm": 0.3321349322795868,
"learning_rate": 8.882184820577962e-05,
"loss": 0.1229,
"step": 1765
},
{
"epoch": 3.359010936757014,
"grad_norm": 0.3843282163143158,
"learning_rate": 8.881549698316927e-05,
"loss": 0.1425,
"step": 1766
},
{
"epoch": 3.3609129814550642,
"grad_norm": 0.34259116649627686,
"learning_rate": 8.88091457605589e-05,
"loss": 0.1275,
"step": 1767
},
{
"epoch": 3.3628150261531147,
"grad_norm": 0.335219144821167,
"learning_rate": 8.880279453794856e-05,
"loss": 0.1273,
"step": 1768
},
{
"epoch": 3.364717070851165,
"grad_norm": 0.3495425879955292,
"learning_rate": 8.879644331533821e-05,
"loss": 0.1112,
"step": 1769
},
{
"epoch": 3.3666191155492156,
"grad_norm": 0.430451899766922,
"learning_rate": 8.879009209272785e-05,
"loss": 0.1404,
"step": 1770
},
{
"epoch": 3.368521160247266,
"grad_norm": 0.24980789422988892,
"learning_rate": 8.87837408701175e-05,
"loss": 0.1034,
"step": 1771
},
{
"epoch": 3.3704232049453164,
"grad_norm": 0.4349839687347412,
"learning_rate": 8.877738964750715e-05,
"loss": 0.1371,
"step": 1772
},
{
"epoch": 3.3723252496433664,
"grad_norm": 0.3427116572856903,
"learning_rate": 8.87710384248968e-05,
"loss": 0.1224,
"step": 1773
},
{
"epoch": 3.374227294341417,
"grad_norm": 0.3835298418998718,
"learning_rate": 8.876468720228644e-05,
"loss": 0.1576,
"step": 1774
},
{
"epoch": 3.3761293390394673,
"grad_norm": 0.3284079432487488,
"learning_rate": 8.87583359796761e-05,
"loss": 0.1039,
"step": 1775
},
{
"epoch": 3.3780313837375178,
"grad_norm": 0.32109662890434265,
"learning_rate": 8.875198475706575e-05,
"loss": 0.1079,
"step": 1776
},
{
"epoch": 3.379933428435568,
"grad_norm": 0.27259504795074463,
"learning_rate": 8.874563353445538e-05,
"loss": 0.0983,
"step": 1777
},
{
"epoch": 3.3818354731336187,
"grad_norm": 0.3639247417449951,
"learning_rate": 8.873928231184504e-05,
"loss": 0.1297,
"step": 1778
},
{
"epoch": 3.383737517831669,
"grad_norm": 0.3729754388332367,
"learning_rate": 8.873293108923469e-05,
"loss": 0.1419,
"step": 1779
},
{
"epoch": 3.3856395625297195,
"grad_norm": 0.44657668471336365,
"learning_rate": 8.872657986662433e-05,
"loss": 0.1299,
"step": 1780
},
{
"epoch": 3.38754160722777,
"grad_norm": 0.2924906611442566,
"learning_rate": 8.872022864401398e-05,
"loss": 0.109,
"step": 1781
},
{
"epoch": 3.3894436519258204,
"grad_norm": 0.3643059730529785,
"learning_rate": 8.871387742140363e-05,
"loss": 0.1217,
"step": 1782
},
{
"epoch": 3.391345696623871,
"grad_norm": 0.31588301062583923,
"learning_rate": 8.870752619879327e-05,
"loss": 0.1309,
"step": 1783
},
{
"epoch": 3.393247741321921,
"grad_norm": 0.5099390149116516,
"learning_rate": 8.870117497618292e-05,
"loss": 0.3371,
"step": 1784
},
{
"epoch": 3.3951497860199713,
"grad_norm": 0.3374120891094208,
"learning_rate": 8.869482375357256e-05,
"loss": 0.1341,
"step": 1785
},
{
"epoch": 3.3970518307180217,
"grad_norm": 0.36739760637283325,
"learning_rate": 8.868847253096222e-05,
"loss": 0.135,
"step": 1786
},
{
"epoch": 3.398953875416072,
"grad_norm": 0.36785241961479187,
"learning_rate": 8.868212130835186e-05,
"loss": 0.1402,
"step": 1787
},
{
"epoch": 3.4008559201141226,
"grad_norm": 0.3834420442581177,
"learning_rate": 8.86757700857415e-05,
"loss": 0.132,
"step": 1788
},
{
"epoch": 3.402757964812173,
"grad_norm": 0.40532076358795166,
"learning_rate": 8.866941886313117e-05,
"loss": 0.1491,
"step": 1789
},
{
"epoch": 3.4046600095102235,
"grad_norm": 0.3840698003768921,
"learning_rate": 8.86630676405208e-05,
"loss": 0.1238,
"step": 1790
},
{
"epoch": 3.406562054208274,
"grad_norm": 0.3948921859264374,
"learning_rate": 8.865671641791046e-05,
"loss": 0.1452,
"step": 1791
},
{
"epoch": 3.4084640989063244,
"grad_norm": 0.30841973423957825,
"learning_rate": 8.86503651953001e-05,
"loss": 0.1152,
"step": 1792
},
{
"epoch": 3.410366143604375,
"grad_norm": 0.3028883635997772,
"learning_rate": 8.864401397268975e-05,
"loss": 0.103,
"step": 1793
},
{
"epoch": 3.4122681883024253,
"grad_norm": 0.3348149359226227,
"learning_rate": 8.86376627500794e-05,
"loss": 0.124,
"step": 1794
},
{
"epoch": 3.4141702330004753,
"grad_norm": 0.397709459066391,
"learning_rate": 8.863131152746904e-05,
"loss": 0.1489,
"step": 1795
},
{
"epoch": 3.4160722776985257,
"grad_norm": 0.33986514806747437,
"learning_rate": 8.862496030485869e-05,
"loss": 0.1243,
"step": 1796
},
{
"epoch": 3.417974322396576,
"grad_norm": 0.3443019688129425,
"learning_rate": 8.861860908224834e-05,
"loss": 0.1206,
"step": 1797
},
{
"epoch": 3.4198763670946266,
"grad_norm": 0.2696784734725952,
"learning_rate": 8.861225785963798e-05,
"loss": 0.0978,
"step": 1798
},
{
"epoch": 3.421778411792677,
"grad_norm": 0.3711314797401428,
"learning_rate": 8.860590663702763e-05,
"loss": 0.1416,
"step": 1799
},
{
"epoch": 3.4236804564907275,
"grad_norm": 0.4727902114391327,
"learning_rate": 8.859955541441728e-05,
"loss": 0.1749,
"step": 1800
},
{
"epoch": 3.425582501188778,
"grad_norm": 0.39370161294937134,
"learning_rate": 8.859320419180692e-05,
"loss": 0.1516,
"step": 1801
},
{
"epoch": 3.4274845458868284,
"grad_norm": 0.36975982785224915,
"learning_rate": 8.858685296919657e-05,
"loss": 0.1185,
"step": 1802
},
{
"epoch": 3.429386590584879,
"grad_norm": 0.30827558040618896,
"learning_rate": 8.858050174658622e-05,
"loss": 0.1292,
"step": 1803
},
{
"epoch": 3.4312886352829293,
"grad_norm": 0.3955543339252472,
"learning_rate": 8.857415052397588e-05,
"loss": 0.1484,
"step": 1804
},
{
"epoch": 3.4331906799809797,
"grad_norm": 0.35280320048332214,
"learning_rate": 8.856779930136551e-05,
"loss": 0.1241,
"step": 1805
},
{
"epoch": 3.4350927246790297,
"grad_norm": 0.4241807460784912,
"learning_rate": 8.856144807875517e-05,
"loss": 0.1663,
"step": 1806
},
{
"epoch": 3.4369947693770806,
"grad_norm": 0.41491755843162537,
"learning_rate": 8.855509685614482e-05,
"loss": 0.1465,
"step": 1807
},
{
"epoch": 3.4388968140751306,
"grad_norm": 0.3022492229938507,
"learning_rate": 8.854874563353446e-05,
"loss": 0.1132,
"step": 1808
},
{
"epoch": 3.440798858773181,
"grad_norm": 0.3701956570148468,
"learning_rate": 8.854239441092411e-05,
"loss": 0.1525,
"step": 1809
},
{
"epoch": 3.4427009034712315,
"grad_norm": 0.3692464232444763,
"learning_rate": 8.853604318831376e-05,
"loss": 0.1364,
"step": 1810
},
{
"epoch": 3.444602948169282,
"grad_norm": 0.2783905267715454,
"learning_rate": 8.85296919657034e-05,
"loss": 0.1112,
"step": 1811
},
{
"epoch": 3.4465049928673324,
"grad_norm": 0.26422539353370667,
"learning_rate": 8.852334074309305e-05,
"loss": 0.0871,
"step": 1812
},
{
"epoch": 3.448407037565383,
"grad_norm": 0.3428441882133484,
"learning_rate": 8.85169895204827e-05,
"loss": 0.1397,
"step": 1813
},
{
"epoch": 3.4503090822634332,
"grad_norm": 0.43042463064193726,
"learning_rate": 8.851063829787234e-05,
"loss": 0.1524,
"step": 1814
},
{
"epoch": 3.4522111269614837,
"grad_norm": 0.4124317765235901,
"learning_rate": 8.850428707526199e-05,
"loss": 0.165,
"step": 1815
},
{
"epoch": 3.454113171659534,
"grad_norm": 0.38967373967170715,
"learning_rate": 8.849793585265164e-05,
"loss": 0.129,
"step": 1816
},
{
"epoch": 3.4560152163575846,
"grad_norm": 0.3426058292388916,
"learning_rate": 8.84915846300413e-05,
"loss": 0.1229,
"step": 1817
},
{
"epoch": 3.457917261055635,
"grad_norm": 0.4571113884449005,
"learning_rate": 8.848523340743093e-05,
"loss": 0.1428,
"step": 1818
},
{
"epoch": 3.459819305753685,
"grad_norm": 0.43344834446907043,
"learning_rate": 8.847888218482057e-05,
"loss": 0.1561,
"step": 1819
},
{
"epoch": 3.4617213504517355,
"grad_norm": 0.36749354004859924,
"learning_rate": 8.847253096221024e-05,
"loss": 0.1313,
"step": 1820
},
{
"epoch": 3.463623395149786,
"grad_norm": 0.36647292971611023,
"learning_rate": 8.846617973959988e-05,
"loss": 0.1278,
"step": 1821
},
{
"epoch": 3.4655254398478363,
"grad_norm": 0.3204960525035858,
"learning_rate": 8.845982851698953e-05,
"loss": 0.11,
"step": 1822
},
{
"epoch": 3.467427484545887,
"grad_norm": 0.366187185049057,
"learning_rate": 8.845347729437918e-05,
"loss": 0.1443,
"step": 1823
},
{
"epoch": 3.4693295292439372,
"grad_norm": 0.4711836874485016,
"learning_rate": 8.844712607176882e-05,
"loss": 0.151,
"step": 1824
},
{
"epoch": 3.4712315739419877,
"grad_norm": 0.35596373677253723,
"learning_rate": 8.844077484915847e-05,
"loss": 0.1246,
"step": 1825
},
{
"epoch": 3.473133618640038,
"grad_norm": 0.41798681020736694,
"learning_rate": 8.843442362654811e-05,
"loss": 0.1575,
"step": 1826
},
{
"epoch": 3.4750356633380886,
"grad_norm": 0.3631289303302765,
"learning_rate": 8.842807240393777e-05,
"loss": 0.1105,
"step": 1827
},
{
"epoch": 3.476937708036139,
"grad_norm": 0.36891433596611023,
"learning_rate": 8.842172118132741e-05,
"loss": 0.146,
"step": 1828
},
{
"epoch": 3.4788397527341894,
"grad_norm": 0.33271533250808716,
"learning_rate": 8.841536995871705e-05,
"loss": 0.1246,
"step": 1829
},
{
"epoch": 3.4807417974322394,
"grad_norm": 0.2956920266151428,
"learning_rate": 8.840901873610671e-05,
"loss": 0.1181,
"step": 1830
},
{
"epoch": 3.48264384213029,
"grad_norm": 0.3685608506202698,
"learning_rate": 8.840266751349635e-05,
"loss": 0.1338,
"step": 1831
},
{
"epoch": 3.4845458868283403,
"grad_norm": 0.35031598806381226,
"learning_rate": 8.839631629088599e-05,
"loss": 0.1166,
"step": 1832
},
{
"epoch": 3.4864479315263908,
"grad_norm": 0.5173628330230713,
"learning_rate": 8.838996506827564e-05,
"loss": 0.157,
"step": 1833
},
{
"epoch": 3.488349976224441,
"grad_norm": 0.4643428921699524,
"learning_rate": 8.83836138456653e-05,
"loss": 0.1842,
"step": 1834
},
{
"epoch": 3.4902520209224916,
"grad_norm": 0.3688521981239319,
"learning_rate": 8.837726262305495e-05,
"loss": 0.1375,
"step": 1835
},
{
"epoch": 3.492154065620542,
"grad_norm": 0.3947365880012512,
"learning_rate": 8.837091140044458e-05,
"loss": 0.149,
"step": 1836
},
{
"epoch": 3.4940561103185925,
"grad_norm": 0.35394486784935,
"learning_rate": 8.836456017783424e-05,
"loss": 0.1252,
"step": 1837
},
{
"epoch": 3.495958155016643,
"grad_norm": 0.37168943881988525,
"learning_rate": 8.835820895522389e-05,
"loss": 0.1318,
"step": 1838
},
{
"epoch": 3.4978601997146934,
"grad_norm": 0.37239521741867065,
"learning_rate": 8.835185773261353e-05,
"loss": 0.1214,
"step": 1839
},
{
"epoch": 3.499762244412744,
"grad_norm": 0.36515411734580994,
"learning_rate": 8.834550651000318e-05,
"loss": 0.1412,
"step": 1840
},
{
"epoch": 3.501664289110794,
"grad_norm": 0.38534054160118103,
"learning_rate": 8.833915528739283e-05,
"loss": 0.1334,
"step": 1841
},
{
"epoch": 3.5035663338088447,
"grad_norm": 0.36949092149734497,
"learning_rate": 8.833280406478247e-05,
"loss": 0.1283,
"step": 1842
},
{
"epoch": 3.5054683785068947,
"grad_norm": 0.39546898007392883,
"learning_rate": 8.832645284217212e-05,
"loss": 0.1471,
"step": 1843
},
{
"epoch": 3.507370423204945,
"grad_norm": 0.34906435012817383,
"learning_rate": 8.832010161956177e-05,
"loss": 0.1386,
"step": 1844
},
{
"epoch": 3.5092724679029956,
"grad_norm": 0.44590094685554504,
"learning_rate": 8.831375039695142e-05,
"loss": 0.157,
"step": 1845
},
{
"epoch": 3.511174512601046,
"grad_norm": 0.3336107134819031,
"learning_rate": 8.830739917434106e-05,
"loss": 0.1435,
"step": 1846
},
{
"epoch": 3.5130765572990965,
"grad_norm": 0.4013485610485077,
"learning_rate": 8.830104795173071e-05,
"loss": 0.1209,
"step": 1847
},
{
"epoch": 3.514978601997147,
"grad_norm": 0.30285441875457764,
"learning_rate": 8.829469672912037e-05,
"loss": 0.108,
"step": 1848
},
{
"epoch": 3.5168806466951974,
"grad_norm": 0.440489798784256,
"learning_rate": 8.828834550651e-05,
"loss": 0.1514,
"step": 1849
},
{
"epoch": 3.518782691393248,
"grad_norm": 0.26309430599212646,
"learning_rate": 8.828199428389964e-05,
"loss": 0.0953,
"step": 1850
},
{
"epoch": 3.5206847360912983,
"grad_norm": 0.548433244228363,
"learning_rate": 8.827564306128931e-05,
"loss": 0.1977,
"step": 1851
},
{
"epoch": 3.5225867807893483,
"grad_norm": 0.4941021203994751,
"learning_rate": 8.826929183867895e-05,
"loss": 0.1268,
"step": 1852
},
{
"epoch": 3.524488825487399,
"grad_norm": 0.3945002555847168,
"learning_rate": 8.82629406160686e-05,
"loss": 0.1304,
"step": 1853
},
{
"epoch": 3.526390870185449,
"grad_norm": 0.3647942841053009,
"learning_rate": 8.825658939345825e-05,
"loss": 0.1454,
"step": 1854
},
{
"epoch": 3.5282929148834996,
"grad_norm": 0.3890063762664795,
"learning_rate": 8.825023817084789e-05,
"loss": 0.1384,
"step": 1855
},
{
"epoch": 3.53019495958155,
"grad_norm": 0.4001372456550598,
"learning_rate": 8.824388694823754e-05,
"loss": 0.1429,
"step": 1856
},
{
"epoch": 3.5320970042796005,
"grad_norm": 0.407721608877182,
"learning_rate": 8.823753572562718e-05,
"loss": 0.1374,
"step": 1857
},
{
"epoch": 3.533999048977651,
"grad_norm": 0.37832140922546387,
"learning_rate": 8.823118450301684e-05,
"loss": 0.1236,
"step": 1858
},
{
"epoch": 3.5359010936757014,
"grad_norm": 0.35406047105789185,
"learning_rate": 8.822483328040648e-05,
"loss": 0.1306,
"step": 1859
},
{
"epoch": 3.537803138373752,
"grad_norm": 0.2923578917980194,
"learning_rate": 8.821848205779612e-05,
"loss": 0.0986,
"step": 1860
},
{
"epoch": 3.5397051830718023,
"grad_norm": 0.3824620544910431,
"learning_rate": 8.821213083518579e-05,
"loss": 0.1492,
"step": 1861
},
{
"epoch": 3.5416072277698527,
"grad_norm": 0.38851413130760193,
"learning_rate": 8.820577961257542e-05,
"loss": 0.1612,
"step": 1862
},
{
"epoch": 3.543509272467903,
"grad_norm": 0.3961692154407501,
"learning_rate": 8.819942838996508e-05,
"loss": 0.1525,
"step": 1863
},
{
"epoch": 3.5454113171659536,
"grad_norm": 0.423235684633255,
"learning_rate": 8.819307716735471e-05,
"loss": 0.1514,
"step": 1864
},
{
"epoch": 3.5473133618640036,
"grad_norm": 0.3355453610420227,
"learning_rate": 8.818672594474437e-05,
"loss": 0.1183,
"step": 1865
},
{
"epoch": 3.5492154065620545,
"grad_norm": 0.44291865825653076,
"learning_rate": 8.818037472213402e-05,
"loss": 0.1457,
"step": 1866
},
{
"epoch": 3.5511174512601045,
"grad_norm": 0.39356529712677,
"learning_rate": 8.817402349952366e-05,
"loss": 0.146,
"step": 1867
},
{
"epoch": 3.553019495958155,
"grad_norm": 0.28863412141799927,
"learning_rate": 8.816767227691331e-05,
"loss": 0.1113,
"step": 1868
},
{
"epoch": 3.5549215406562054,
"grad_norm": 0.3859669268131256,
"learning_rate": 8.816132105430296e-05,
"loss": 0.1234,
"step": 1869
},
{
"epoch": 3.556823585354256,
"grad_norm": 0.3483799993991852,
"learning_rate": 8.81549698316926e-05,
"loss": 0.1324,
"step": 1870
},
{
"epoch": 3.5587256300523062,
"grad_norm": 0.3053433299064636,
"learning_rate": 8.814861860908225e-05,
"loss": 0.1252,
"step": 1871
},
{
"epoch": 3.5606276747503567,
"grad_norm": 0.44125038385391235,
"learning_rate": 8.81422673864719e-05,
"loss": 0.1627,
"step": 1872
},
{
"epoch": 3.562529719448407,
"grad_norm": 0.35409316420555115,
"learning_rate": 8.813591616386154e-05,
"loss": 0.1312,
"step": 1873
},
{
"epoch": 3.5644317641464576,
"grad_norm": 0.4219510853290558,
"learning_rate": 8.812956494125119e-05,
"loss": 0.1522,
"step": 1874
},
{
"epoch": 3.566333808844508,
"grad_norm": 0.4153057932853699,
"learning_rate": 8.812321371864084e-05,
"loss": 0.1272,
"step": 1875
},
{
"epoch": 3.568235853542558,
"grad_norm": 0.3225264549255371,
"learning_rate": 8.81168624960305e-05,
"loss": 0.1461,
"step": 1876
},
{
"epoch": 3.570137898240609,
"grad_norm": 0.41065141558647156,
"learning_rate": 8.811051127342013e-05,
"loss": 0.1466,
"step": 1877
},
{
"epoch": 3.572039942938659,
"grad_norm": 0.33854374289512634,
"learning_rate": 8.810416005080979e-05,
"loss": 0.2636,
"step": 1878
},
{
"epoch": 3.5739419876367093,
"grad_norm": 0.4266054034233093,
"learning_rate": 8.809780882819944e-05,
"loss": 0.1546,
"step": 1879
},
{
"epoch": 3.57584403233476,
"grad_norm": 0.32462188601493835,
"learning_rate": 8.809145760558908e-05,
"loss": 0.0992,
"step": 1880
},
{
"epoch": 3.5777460770328102,
"grad_norm": 0.3243044912815094,
"learning_rate": 8.808510638297873e-05,
"loss": 0.127,
"step": 1881
},
{
"epoch": 3.5796481217308607,
"grad_norm": 0.36742255091667175,
"learning_rate": 8.807875516036838e-05,
"loss": 0.1648,
"step": 1882
},
{
"epoch": 3.581550166428911,
"grad_norm": 0.47478726506233215,
"learning_rate": 8.807240393775802e-05,
"loss": 0.1402,
"step": 1883
},
{
"epoch": 3.5834522111269616,
"grad_norm": 0.29675087332725525,
"learning_rate": 8.806605271514767e-05,
"loss": 0.1102,
"step": 1884
},
{
"epoch": 3.585354255825012,
"grad_norm": 0.26269370317459106,
"learning_rate": 8.805970149253732e-05,
"loss": 0.0926,
"step": 1885
},
{
"epoch": 3.5872563005230624,
"grad_norm": 0.42690059542655945,
"learning_rate": 8.805335026992696e-05,
"loss": 0.1663,
"step": 1886
},
{
"epoch": 3.5891583452211124,
"grad_norm": 0.4843170940876007,
"learning_rate": 8.804699904731661e-05,
"loss": 0.156,
"step": 1887
},
{
"epoch": 3.5910603899191633,
"grad_norm": 0.4166446030139923,
"learning_rate": 8.804064782470626e-05,
"loss": 0.1556,
"step": 1888
},
{
"epoch": 3.5929624346172133,
"grad_norm": 0.3265363872051239,
"learning_rate": 8.803429660209592e-05,
"loss": 0.122,
"step": 1889
},
{
"epoch": 3.5948644793152638,
"grad_norm": 0.4674152433872223,
"learning_rate": 8.802794537948555e-05,
"loss": 0.1706,
"step": 1890
},
{
"epoch": 3.596766524013314,
"grad_norm": 0.4072030782699585,
"learning_rate": 8.802159415687519e-05,
"loss": 0.1465,
"step": 1891
},
{
"epoch": 3.5986685687113646,
"grad_norm": 0.4924727976322174,
"learning_rate": 8.801524293426486e-05,
"loss": 0.153,
"step": 1892
},
{
"epoch": 3.600570613409415,
"grad_norm": 0.34262821078300476,
"learning_rate": 8.80088917116545e-05,
"loss": 0.1221,
"step": 1893
},
{
"epoch": 3.6024726581074655,
"grad_norm": 0.3641190528869629,
"learning_rate": 8.800254048904415e-05,
"loss": 0.1146,
"step": 1894
},
{
"epoch": 3.604374702805516,
"grad_norm": 0.3594358265399933,
"learning_rate": 8.799618926643379e-05,
"loss": 0.1198,
"step": 1895
},
{
"epoch": 3.6062767475035664,
"grad_norm": 0.40045297145843506,
"learning_rate": 8.798983804382344e-05,
"loss": 0.2122,
"step": 1896
},
{
"epoch": 3.608178792201617,
"grad_norm": 0.40417537093162537,
"learning_rate": 8.798348682121309e-05,
"loss": 0.1523,
"step": 1897
},
{
"epoch": 3.6100808368996673,
"grad_norm": 0.3493559658527374,
"learning_rate": 8.797713559860273e-05,
"loss": 0.1105,
"step": 1898
},
{
"epoch": 3.6119828815977177,
"grad_norm": 0.3540056645870209,
"learning_rate": 8.79707843759924e-05,
"loss": 0.1205,
"step": 1899
},
{
"epoch": 3.6138849262957677,
"grad_norm": 0.4836410582065582,
"learning_rate": 8.796443315338203e-05,
"loss": 0.184,
"step": 1900
},
{
"epoch": 3.6157869709938186,
"grad_norm": 0.34036317467689514,
"learning_rate": 8.795808193077167e-05,
"loss": 0.1313,
"step": 1901
},
{
"epoch": 3.6176890156918686,
"grad_norm": 0.34924453496932983,
"learning_rate": 8.795173070816132e-05,
"loss": 0.1018,
"step": 1902
},
{
"epoch": 3.619591060389919,
"grad_norm": 0.4308503270149231,
"learning_rate": 8.794537948555097e-05,
"loss": 0.1396,
"step": 1903
},
{
"epoch": 3.6214931050879695,
"grad_norm": 0.44268596172332764,
"learning_rate": 8.793902826294061e-05,
"loss": 0.1377,
"step": 1904
},
{
"epoch": 3.62339514978602,
"grad_norm": 0.36984702944755554,
"learning_rate": 8.793267704033026e-05,
"loss": 0.1343,
"step": 1905
},
{
"epoch": 3.6252971944840704,
"grad_norm": 0.3913877606391907,
"learning_rate": 8.792632581771992e-05,
"loss": 0.1443,
"step": 1906
},
{
"epoch": 3.627199239182121,
"grad_norm": 0.4213595986366272,
"learning_rate": 8.791997459510957e-05,
"loss": 0.1537,
"step": 1907
},
{
"epoch": 3.6291012838801713,
"grad_norm": 0.4095703959465027,
"learning_rate": 8.79136233724992e-05,
"loss": 0.151,
"step": 1908
},
{
"epoch": 3.6310033285782217,
"grad_norm": 0.366328626871109,
"learning_rate": 8.790727214988886e-05,
"loss": 0.1198,
"step": 1909
},
{
"epoch": 3.632905373276272,
"grad_norm": 0.4124557375907898,
"learning_rate": 8.790092092727851e-05,
"loss": 0.1408,
"step": 1910
},
{
"epoch": 3.634807417974322,
"grad_norm": 0.36249884963035583,
"learning_rate": 8.789456970466815e-05,
"loss": 0.2058,
"step": 1911
},
{
"epoch": 3.636709462672373,
"grad_norm": 0.40580618381500244,
"learning_rate": 8.78882184820578e-05,
"loss": 0.1247,
"step": 1912
},
{
"epoch": 3.638611507370423,
"grad_norm": 0.30640462040901184,
"learning_rate": 8.788186725944745e-05,
"loss": 0.1078,
"step": 1913
},
{
"epoch": 3.6405135520684735,
"grad_norm": 0.4200808107852936,
"learning_rate": 8.787551603683709e-05,
"loss": 0.1572,
"step": 1914
},
{
"epoch": 3.642415596766524,
"grad_norm": 0.43338900804519653,
"learning_rate": 8.786916481422674e-05,
"loss": 0.1606,
"step": 1915
},
{
"epoch": 3.6443176414645744,
"grad_norm": 0.4340536296367645,
"learning_rate": 8.78628135916164e-05,
"loss": 0.1711,
"step": 1916
},
{
"epoch": 3.646219686162625,
"grad_norm": 0.3239591419696808,
"learning_rate": 8.785646236900605e-05,
"loss": 0.1166,
"step": 1917
},
{
"epoch": 3.6481217308606753,
"grad_norm": 0.3957262933254242,
"learning_rate": 8.785011114639568e-05,
"loss": 0.1605,
"step": 1918
},
{
"epoch": 3.6500237755587257,
"grad_norm": 0.4386723041534424,
"learning_rate": 8.784375992378534e-05,
"loss": 0.1595,
"step": 1919
},
{
"epoch": 3.651925820256776,
"grad_norm": 0.376113623380661,
"learning_rate": 8.783740870117499e-05,
"loss": 0.1708,
"step": 1920
},
{
"epoch": 3.6538278649548266,
"grad_norm": 0.2861535847187042,
"learning_rate": 8.783105747856463e-05,
"loss": 0.1134,
"step": 1921
},
{
"epoch": 3.6557299096528766,
"grad_norm": 0.3381497263908386,
"learning_rate": 8.782470625595426e-05,
"loss": 0.1522,
"step": 1922
},
{
"epoch": 3.6576319543509275,
"grad_norm": 0.2682400047779083,
"learning_rate": 8.781835503334393e-05,
"loss": 0.1007,
"step": 1923
},
{
"epoch": 3.6595339990489775,
"grad_norm": 0.4277699887752533,
"learning_rate": 8.781200381073357e-05,
"loss": 0.1757,
"step": 1924
},
{
"epoch": 3.661436043747028,
"grad_norm": 0.3176470696926117,
"learning_rate": 8.780565258812322e-05,
"loss": 0.1186,
"step": 1925
},
{
"epoch": 3.6633380884450784,
"grad_norm": 0.32315725088119507,
"learning_rate": 8.779930136551287e-05,
"loss": 0.1353,
"step": 1926
},
{
"epoch": 3.665240133143129,
"grad_norm": 0.44492077827453613,
"learning_rate": 8.779295014290251e-05,
"loss": 0.1689,
"step": 1927
},
{
"epoch": 3.6671421778411792,
"grad_norm": 0.33450883626937866,
"learning_rate": 8.778659892029216e-05,
"loss": 0.1171,
"step": 1928
},
{
"epoch": 3.6690442225392297,
"grad_norm": 0.45678386092185974,
"learning_rate": 8.77802476976818e-05,
"loss": 0.1547,
"step": 1929
},
{
"epoch": 3.67094626723728,
"grad_norm": 0.3756123185157776,
"learning_rate": 8.777389647507147e-05,
"loss": 0.1441,
"step": 1930
},
{
"epoch": 3.6728483119353306,
"grad_norm": 0.30440792441368103,
"learning_rate": 8.77675452524611e-05,
"loss": 0.1034,
"step": 1931
},
{
"epoch": 3.674750356633381,
"grad_norm": 0.38540956377983093,
"learning_rate": 8.776119402985074e-05,
"loss": 0.1456,
"step": 1932
},
{
"epoch": 3.6766524013314315,
"grad_norm": 0.42409566044807434,
"learning_rate": 8.775484280724041e-05,
"loss": 0.1445,
"step": 1933
},
{
"epoch": 3.678554446029482,
"grad_norm": 0.3903610408306122,
"learning_rate": 8.774849158463005e-05,
"loss": 0.1428,
"step": 1934
},
{
"epoch": 3.680456490727532,
"grad_norm": 0.4002249836921692,
"learning_rate": 8.77421403620197e-05,
"loss": 0.1328,
"step": 1935
},
{
"epoch": 3.6823585354255823,
"grad_norm": 0.37625521421432495,
"learning_rate": 8.773578913940934e-05,
"loss": 0.1271,
"step": 1936
},
{
"epoch": 3.6842605801236328,
"grad_norm": 0.333882600069046,
"learning_rate": 8.772943791679899e-05,
"loss": 0.1209,
"step": 1937
},
{
"epoch": 3.686162624821683,
"grad_norm": 0.3934018313884735,
"learning_rate": 8.772308669418864e-05,
"loss": 0.1383,
"step": 1938
},
{
"epoch": 3.6880646695197337,
"grad_norm": 0.3329316973686218,
"learning_rate": 8.771673547157828e-05,
"loss": 0.1334,
"step": 1939
},
{
"epoch": 3.689966714217784,
"grad_norm": 0.3686552047729492,
"learning_rate": 8.771038424896793e-05,
"loss": 0.1163,
"step": 1940
},
{
"epoch": 3.6918687589158345,
"grad_norm": 0.35531577467918396,
"learning_rate": 8.770403302635758e-05,
"loss": 0.114,
"step": 1941
},
{
"epoch": 3.693770803613885,
"grad_norm": 0.4164102375507355,
"learning_rate": 8.769768180374722e-05,
"loss": 0.1271,
"step": 1942
},
{
"epoch": 3.6956728483119354,
"grad_norm": 0.4182850420475006,
"learning_rate": 8.769133058113687e-05,
"loss": 0.1343,
"step": 1943
},
{
"epoch": 3.697574893009986,
"grad_norm": 0.3373199701309204,
"learning_rate": 8.768497935852652e-05,
"loss": 0.1424,
"step": 1944
},
{
"epoch": 3.6994769377080363,
"grad_norm": 0.44398215413093567,
"learning_rate": 8.767862813591616e-05,
"loss": 0.1626,
"step": 1945
},
{
"epoch": 3.7013789824060863,
"grad_norm": 0.2877051830291748,
"learning_rate": 8.767227691330581e-05,
"loss": 0.0941,
"step": 1946
},
{
"epoch": 3.703281027104137,
"grad_norm": 0.30384746193885803,
"learning_rate": 8.766592569069547e-05,
"loss": 0.1239,
"step": 1947
},
{
"epoch": 3.705183071802187,
"grad_norm": 0.41360363364219666,
"learning_rate": 8.765957446808512e-05,
"loss": 0.1567,
"step": 1948
},
{
"epoch": 3.7070851165002376,
"grad_norm": 0.28865674138069153,
"learning_rate": 8.765322324547476e-05,
"loss": 0.1165,
"step": 1949
},
{
"epoch": 3.708987161198288,
"grad_norm": 0.341654509305954,
"learning_rate": 8.764687202286441e-05,
"loss": 0.1199,
"step": 1950
},
{
"epoch": 3.7108892058963385,
"grad_norm": 0.33211663365364075,
"learning_rate": 8.764052080025406e-05,
"loss": 0.1386,
"step": 1951
},
{
"epoch": 3.712791250594389,
"grad_norm": 0.37999534606933594,
"learning_rate": 8.76341695776437e-05,
"loss": 0.1411,
"step": 1952
},
{
"epoch": 3.7146932952924394,
"grad_norm": 0.3158533573150635,
"learning_rate": 8.762781835503335e-05,
"loss": 0.1082,
"step": 1953
},
{
"epoch": 3.71659533999049,
"grad_norm": 0.42071765661239624,
"learning_rate": 8.7621467132423e-05,
"loss": 0.2395,
"step": 1954
},
{
"epoch": 3.7184973846885403,
"grad_norm": 0.3723015785217285,
"learning_rate": 8.761511590981264e-05,
"loss": 0.1427,
"step": 1955
},
{
"epoch": 3.7203994293865907,
"grad_norm": 0.31827929615974426,
"learning_rate": 8.760876468720229e-05,
"loss": 0.0983,
"step": 1956
},
{
"epoch": 3.7223014740846407,
"grad_norm": 0.45022010803222656,
"learning_rate": 8.760241346459194e-05,
"loss": 0.1658,
"step": 1957
},
{
"epoch": 3.7242035187826916,
"grad_norm": 0.4069976508617401,
"learning_rate": 8.759606224198158e-05,
"loss": 0.1277,
"step": 1958
},
{
"epoch": 3.7261055634807416,
"grad_norm": 0.3239624500274658,
"learning_rate": 8.758971101937123e-05,
"loss": 0.1204,
"step": 1959
},
{
"epoch": 3.728007608178792,
"grad_norm": 0.38038089871406555,
"learning_rate": 8.758335979676087e-05,
"loss": 0.1305,
"step": 1960
},
{
"epoch": 3.7299096528768425,
"grad_norm": 0.44531160593032837,
"learning_rate": 8.757700857415054e-05,
"loss": 0.1504,
"step": 1961
},
{
"epoch": 3.731811697574893,
"grad_norm": 0.380256712436676,
"learning_rate": 8.757065735154017e-05,
"loss": 0.1213,
"step": 1962
},
{
"epoch": 3.7337137422729434,
"grad_norm": 0.39982911944389343,
"learning_rate": 8.756430612892981e-05,
"loss": 0.1255,
"step": 1963
},
{
"epoch": 3.735615786970994,
"grad_norm": 0.39186495542526245,
"learning_rate": 8.755795490631948e-05,
"loss": 0.1459,
"step": 1964
},
{
"epoch": 3.7375178316690443,
"grad_norm": 0.4191820025444031,
"learning_rate": 8.755160368370912e-05,
"loss": 0.1269,
"step": 1965
},
{
"epoch": 3.7394198763670947,
"grad_norm": 0.3438499867916107,
"learning_rate": 8.754525246109877e-05,
"loss": 0.124,
"step": 1966
},
{
"epoch": 3.741321921065145,
"grad_norm": 0.3626823127269745,
"learning_rate": 8.753890123848841e-05,
"loss": 0.1326,
"step": 1967
},
{
"epoch": 3.743223965763195,
"grad_norm": 0.3823707103729248,
"learning_rate": 8.753255001587806e-05,
"loss": 0.1351,
"step": 1968
},
{
"epoch": 3.745126010461246,
"grad_norm": 0.3537774980068207,
"learning_rate": 8.752619879326771e-05,
"loss": 0.1079,
"step": 1969
},
{
"epoch": 3.747028055159296,
"grad_norm": 0.4008922576904297,
"learning_rate": 8.751984757065735e-05,
"loss": 0.1752,
"step": 1970
},
{
"epoch": 3.7489300998573465,
"grad_norm": 0.3501138687133789,
"learning_rate": 8.751349634804701e-05,
"loss": 0.1296,
"step": 1971
},
{
"epoch": 3.750832144555397,
"grad_norm": 0.3441070318222046,
"learning_rate": 8.750714512543665e-05,
"loss": 0.1161,
"step": 1972
},
{
"epoch": 3.7527341892534474,
"grad_norm": 0.42847099900245667,
"learning_rate": 8.750079390282629e-05,
"loss": 0.1483,
"step": 1973
},
{
"epoch": 3.754636233951498,
"grad_norm": 0.4879817068576813,
"learning_rate": 8.749444268021594e-05,
"loss": 0.1725,
"step": 1974
},
{
"epoch": 3.7565382786495483,
"grad_norm": 0.32576873898506165,
"learning_rate": 8.74880914576056e-05,
"loss": 0.1211,
"step": 1975
},
{
"epoch": 3.7584403233475987,
"grad_norm": 0.4470548927783966,
"learning_rate": 8.748174023499523e-05,
"loss": 0.155,
"step": 1976
},
{
"epoch": 3.760342368045649,
"grad_norm": 0.506020724773407,
"learning_rate": 8.747538901238488e-05,
"loss": 0.1924,
"step": 1977
},
{
"epoch": 3.7622444127436996,
"grad_norm": 0.3949258625507355,
"learning_rate": 8.746903778977454e-05,
"loss": 0.1365,
"step": 1978
},
{
"epoch": 3.76414645744175,
"grad_norm": 0.381511390209198,
"learning_rate": 8.746268656716419e-05,
"loss": 0.1706,
"step": 1979
},
{
"epoch": 3.7660485021398005,
"grad_norm": 0.32848381996154785,
"learning_rate": 8.745633534455383e-05,
"loss": 0.1302,
"step": 1980
},
{
"epoch": 3.7679505468378505,
"grad_norm": 0.39011678099632263,
"learning_rate": 8.744998412194348e-05,
"loss": 0.1501,
"step": 1981
},
{
"epoch": 3.7698525915359014,
"grad_norm": 0.35527095198631287,
"learning_rate": 8.744363289933313e-05,
"loss": 0.1218,
"step": 1982
},
{
"epoch": 3.7717546362339514,
"grad_norm": 0.4448065459728241,
"learning_rate": 8.743728167672277e-05,
"loss": 0.1527,
"step": 1983
},
{
"epoch": 3.773656680932002,
"grad_norm": 0.45173025131225586,
"learning_rate": 8.743093045411242e-05,
"loss": 0.1546,
"step": 1984
},
{
"epoch": 3.7755587256300522,
"grad_norm": 0.3051410913467407,
"learning_rate": 8.742457923150207e-05,
"loss": 0.1176,
"step": 1985
},
{
"epoch": 3.7774607703281027,
"grad_norm": 0.4559077322483063,
"learning_rate": 8.741822800889171e-05,
"loss": 0.1466,
"step": 1986
},
{
"epoch": 3.779362815026153,
"grad_norm": 0.33901482820510864,
"learning_rate": 8.741187678628136e-05,
"loss": 0.1263,
"step": 1987
},
{
"epoch": 3.7812648597242036,
"grad_norm": 0.3377963900566101,
"learning_rate": 8.740552556367101e-05,
"loss": 0.1029,
"step": 1988
},
{
"epoch": 3.783166904422254,
"grad_norm": 0.3285292088985443,
"learning_rate": 8.739917434106067e-05,
"loss": 0.1256,
"step": 1989
},
{
"epoch": 3.7850689491203044,
"grad_norm": 0.4042280614376068,
"learning_rate": 8.73928231184503e-05,
"loss": 0.1554,
"step": 1990
},
{
"epoch": 3.786970993818355,
"grad_norm": 0.374153733253479,
"learning_rate": 8.738647189583996e-05,
"loss": 0.1109,
"step": 1991
},
{
"epoch": 3.788873038516405,
"grad_norm": 0.3667593002319336,
"learning_rate": 8.738012067322961e-05,
"loss": 0.1014,
"step": 1992
},
{
"epoch": 3.7907750832144558,
"grad_norm": 0.40893805027008057,
"learning_rate": 8.737376945061925e-05,
"loss": 0.137,
"step": 1993
},
{
"epoch": 3.7926771279125058,
"grad_norm": 0.4428877830505371,
"learning_rate": 8.736741822800888e-05,
"loss": 0.1516,
"step": 1994
},
{
"epoch": 3.794579172610556,
"grad_norm": 0.4404061734676361,
"learning_rate": 8.736106700539855e-05,
"loss": 0.155,
"step": 1995
},
{
"epoch": 3.7964812173086067,
"grad_norm": 0.3298742473125458,
"learning_rate": 8.735471578278819e-05,
"loss": 0.1244,
"step": 1996
},
{
"epoch": 3.798383262006657,
"grad_norm": 0.36190545558929443,
"learning_rate": 8.734836456017784e-05,
"loss": 0.148,
"step": 1997
},
{
"epoch": 3.8002853067047075,
"grad_norm": 0.34386786818504333,
"learning_rate": 8.734201333756749e-05,
"loss": 0.1479,
"step": 1998
},
{
"epoch": 3.802187351402758,
"grad_norm": 0.434257835149765,
"learning_rate": 8.733566211495713e-05,
"loss": 0.1624,
"step": 1999
},
{
"epoch": 3.8040893961008084,
"grad_norm": 0.369232177734375,
"learning_rate": 8.732931089234678e-05,
"loss": 0.1297,
"step": 2000
},
{
"epoch": 3.805991440798859,
"grad_norm": 0.31438469886779785,
"learning_rate": 8.732295966973642e-05,
"loss": 0.1074,
"step": 2001
},
{
"epoch": 3.8078934854969093,
"grad_norm": 0.4128814935684204,
"learning_rate": 8.731660844712609e-05,
"loss": 0.1489,
"step": 2002
},
{
"epoch": 3.8097955301949593,
"grad_norm": 0.2960624694824219,
"learning_rate": 8.731025722451572e-05,
"loss": 0.1063,
"step": 2003
},
{
"epoch": 3.81169757489301,
"grad_norm": 0.35740041732788086,
"learning_rate": 8.730390600190536e-05,
"loss": 0.1438,
"step": 2004
},
{
"epoch": 3.81359961959106,
"grad_norm": 0.3402657210826874,
"learning_rate": 8.729755477929501e-05,
"loss": 0.151,
"step": 2005
},
{
"epoch": 3.8155016642891106,
"grad_norm": 0.3280869722366333,
"learning_rate": 8.729120355668467e-05,
"loss": 0.112,
"step": 2006
},
{
"epoch": 3.817403708987161,
"grad_norm": 0.3747129440307617,
"learning_rate": 8.728485233407432e-05,
"loss": 0.1191,
"step": 2007
},
{
"epoch": 3.8193057536852115,
"grad_norm": 0.3609796464443207,
"learning_rate": 8.727850111146396e-05,
"loss": 0.1373,
"step": 2008
},
{
"epoch": 3.821207798383262,
"grad_norm": 0.38992708921432495,
"learning_rate": 8.727214988885361e-05,
"loss": 0.1474,
"step": 2009
},
{
"epoch": 3.8231098430813124,
"grad_norm": 0.3531118929386139,
"learning_rate": 8.726579866624326e-05,
"loss": 0.1188,
"step": 2010
},
{
"epoch": 3.825011887779363,
"grad_norm": 0.30585137009620667,
"learning_rate": 8.72594474436329e-05,
"loss": 0.1072,
"step": 2011
},
{
"epoch": 3.8269139324774133,
"grad_norm": 0.40438538789749146,
"learning_rate": 8.725309622102255e-05,
"loss": 0.1527,
"step": 2012
},
{
"epoch": 3.8288159771754637,
"grad_norm": 0.31290772557258606,
"learning_rate": 8.72467449984122e-05,
"loss": 0.1251,
"step": 2013
},
{
"epoch": 3.830718021873514,
"grad_norm": 0.389160692691803,
"learning_rate": 8.724039377580184e-05,
"loss": 0.1387,
"step": 2014
},
{
"epoch": 3.8326200665715646,
"grad_norm": 0.34139397740364075,
"learning_rate": 8.723404255319149e-05,
"loss": 0.1205,
"step": 2015
},
{
"epoch": 3.8345221112696146,
"grad_norm": 0.4144088923931122,
"learning_rate": 8.722769133058114e-05,
"loss": 0.1493,
"step": 2016
},
{
"epoch": 3.8364241559676655,
"grad_norm": 0.3793914318084717,
"learning_rate": 8.722134010797078e-05,
"loss": 0.1379,
"step": 2017
},
{
"epoch": 3.8383262006657155,
"grad_norm": 0.3809344470500946,
"learning_rate": 8.721498888536043e-05,
"loss": 0.196,
"step": 2018
},
{
"epoch": 3.840228245363766,
"grad_norm": 0.3764810860157013,
"learning_rate": 8.720863766275009e-05,
"loss": 0.1096,
"step": 2019
},
{
"epoch": 3.8421302900618164,
"grad_norm": 0.47973567247390747,
"learning_rate": 8.720228644013974e-05,
"loss": 0.1195,
"step": 2020
},
{
"epoch": 3.844032334759867,
"grad_norm": 0.4527863562107086,
"learning_rate": 8.719593521752938e-05,
"loss": 0.2112,
"step": 2021
},
{
"epoch": 3.8459343794579173,
"grad_norm": 0.39066699147224426,
"learning_rate": 8.718958399491903e-05,
"loss": 0.1281,
"step": 2022
},
{
"epoch": 3.8478364241559677,
"grad_norm": 0.37056446075439453,
"learning_rate": 8.718323277230868e-05,
"loss": 0.1519,
"step": 2023
},
{
"epoch": 3.849738468854018,
"grad_norm": 0.516057550907135,
"learning_rate": 8.717688154969832e-05,
"loss": 0.1657,
"step": 2024
},
{
"epoch": 3.8516405135520686,
"grad_norm": 0.3468872010707855,
"learning_rate": 8.717053032708797e-05,
"loss": 0.1408,
"step": 2025
},
{
"epoch": 3.853542558250119,
"grad_norm": 0.5452744364738464,
"learning_rate": 8.716417910447762e-05,
"loss": 0.3173,
"step": 2026
},
{
"epoch": 3.855444602948169,
"grad_norm": 0.4378301501274109,
"learning_rate": 8.715782788186726e-05,
"loss": 0.136,
"step": 2027
},
{
"epoch": 3.85734664764622,
"grad_norm": 0.49818679690361023,
"learning_rate": 8.715147665925691e-05,
"loss": 0.233,
"step": 2028
},
{
"epoch": 3.85924869234427,
"grad_norm": 0.4228188693523407,
"learning_rate": 8.714512543664656e-05,
"loss": 0.1485,
"step": 2029
},
{
"epoch": 3.8611507370423204,
"grad_norm": 0.34110891819000244,
"learning_rate": 8.71387742140362e-05,
"loss": 0.1455,
"step": 2030
},
{
"epoch": 3.863052781740371,
"grad_norm": 0.38667479157447815,
"learning_rate": 8.713242299142585e-05,
"loss": 0.1302,
"step": 2031
},
{
"epoch": 3.8649548264384213,
"grad_norm": 0.3971845805644989,
"learning_rate": 8.712607176881549e-05,
"loss": 0.1562,
"step": 2032
},
{
"epoch": 3.8668568711364717,
"grad_norm": 0.32637760043144226,
"learning_rate": 8.711972054620516e-05,
"loss": 0.1213,
"step": 2033
},
{
"epoch": 3.868758915834522,
"grad_norm": 0.3475836217403412,
"learning_rate": 8.71133693235948e-05,
"loss": 0.1514,
"step": 2034
},
{
"epoch": 3.8706609605325726,
"grad_norm": 0.37775367498397827,
"learning_rate": 8.710701810098443e-05,
"loss": 0.1672,
"step": 2035
},
{
"epoch": 3.872563005230623,
"grad_norm": 0.4611580967903137,
"learning_rate": 8.71006668783741e-05,
"loss": 0.1977,
"step": 2036
},
{
"epoch": 3.8744650499286735,
"grad_norm": 0.34681427478790283,
"learning_rate": 8.709431565576374e-05,
"loss": 0.127,
"step": 2037
},
{
"epoch": 3.8763670946267235,
"grad_norm": 0.3547581732273102,
"learning_rate": 8.708796443315339e-05,
"loss": 0.1432,
"step": 2038
},
{
"epoch": 3.8782691393247744,
"grad_norm": 0.3560992479324341,
"learning_rate": 8.708161321054303e-05,
"loss": 0.1269,
"step": 2039
},
{
"epoch": 3.8801711840228243,
"grad_norm": 0.48965948820114136,
"learning_rate": 8.707526198793268e-05,
"loss": 0.1694,
"step": 2040
},
{
"epoch": 3.882073228720875,
"grad_norm": 0.4042951464653015,
"learning_rate": 8.706891076532233e-05,
"loss": 0.1432,
"step": 2041
},
{
"epoch": 3.8839752734189252,
"grad_norm": 0.40321534872055054,
"learning_rate": 8.706255954271197e-05,
"loss": 0.1206,
"step": 2042
},
{
"epoch": 3.8858773181169757,
"grad_norm": 0.5154759883880615,
"learning_rate": 8.705620832010164e-05,
"loss": 0.2034,
"step": 2043
},
{
"epoch": 3.887779362815026,
"grad_norm": 0.3707939684391022,
"learning_rate": 8.704985709749127e-05,
"loss": 0.1408,
"step": 2044
},
{
"epoch": 3.8896814075130766,
"grad_norm": 0.46117648482322693,
"learning_rate": 8.704350587488091e-05,
"loss": 0.1921,
"step": 2045
},
{
"epoch": 3.891583452211127,
"grad_norm": 0.4917357265949249,
"learning_rate": 8.703715465227056e-05,
"loss": 0.1684,
"step": 2046
},
{
"epoch": 3.8934854969091774,
"grad_norm": 0.36523228883743286,
"learning_rate": 8.703080342966022e-05,
"loss": 0.1977,
"step": 2047
},
{
"epoch": 3.895387541607228,
"grad_norm": 0.3557770550251007,
"learning_rate": 8.702445220704985e-05,
"loss": 0.1326,
"step": 2048
},
{
"epoch": 3.8972895863052783,
"grad_norm": 0.2716139853000641,
"learning_rate": 8.70181009844395e-05,
"loss": 0.1119,
"step": 2049
},
{
"epoch": 3.8991916310033288,
"grad_norm": 0.3266098201274872,
"learning_rate": 8.701174976182916e-05,
"loss": 0.1355,
"step": 2050
},
{
"epoch": 3.9010936757013788,
"grad_norm": 0.4549683928489685,
"learning_rate": 8.700539853921881e-05,
"loss": 0.174,
"step": 2051
},
{
"epoch": 3.9029957203994297,
"grad_norm": 0.3865867555141449,
"learning_rate": 8.699904731660845e-05,
"loss": 0.131,
"step": 2052
},
{
"epoch": 3.9048977650974797,
"grad_norm": 0.4354785084724426,
"learning_rate": 8.69926960939981e-05,
"loss": 0.1497,
"step": 2053
},
{
"epoch": 3.90679980979553,
"grad_norm": 0.38822686672210693,
"learning_rate": 8.698634487138775e-05,
"loss": 0.1272,
"step": 2054
},
{
"epoch": 3.9087018544935805,
"grad_norm": 0.4395056366920471,
"learning_rate": 8.697999364877739e-05,
"loss": 0.1801,
"step": 2055
},
{
"epoch": 3.910603899191631,
"grad_norm": 0.4310166835784912,
"learning_rate": 8.697364242616704e-05,
"loss": 0.1457,
"step": 2056
},
{
"epoch": 3.9125059438896814,
"grad_norm": 0.42527538537979126,
"learning_rate": 8.69672912035567e-05,
"loss": 0.1827,
"step": 2057
},
{
"epoch": 3.914407988587732,
"grad_norm": 0.41284388303756714,
"learning_rate": 8.696093998094633e-05,
"loss": 0.1588,
"step": 2058
},
{
"epoch": 3.9163100332857823,
"grad_norm": 0.3561374247074127,
"learning_rate": 8.695458875833598e-05,
"loss": 0.138,
"step": 2059
},
{
"epoch": 3.9182120779838328,
"grad_norm": 0.4057970941066742,
"learning_rate": 8.694823753572564e-05,
"loss": 0.1504,
"step": 2060
},
{
"epoch": 3.920114122681883,
"grad_norm": 0.47292712330818176,
"learning_rate": 8.694188631311529e-05,
"loss": 0.1417,
"step": 2061
},
{
"epoch": 3.922016167379933,
"grad_norm": 0.4207940995693207,
"learning_rate": 8.693553509050493e-05,
"loss": 0.1372,
"step": 2062
},
{
"epoch": 3.923918212077984,
"grad_norm": 0.5482998490333557,
"learning_rate": 8.692918386789456e-05,
"loss": 0.1917,
"step": 2063
},
{
"epoch": 3.925820256776034,
"grad_norm": 0.41113635897636414,
"learning_rate": 8.692283264528423e-05,
"loss": 0.1479,
"step": 2064
},
{
"epoch": 3.9277223014740845,
"grad_norm": 0.3470059037208557,
"learning_rate": 8.691648142267387e-05,
"loss": 0.1235,
"step": 2065
},
{
"epoch": 3.929624346172135,
"grad_norm": 0.4131185710430145,
"learning_rate": 8.69101302000635e-05,
"loss": 0.1476,
"step": 2066
},
{
"epoch": 3.9315263908701854,
"grad_norm": 0.3750738501548767,
"learning_rate": 8.690377897745317e-05,
"loss": 0.1517,
"step": 2067
},
{
"epoch": 3.933428435568236,
"grad_norm": 0.37411704659461975,
"learning_rate": 8.689742775484281e-05,
"loss": 0.1493,
"step": 2068
},
{
"epoch": 3.9353304802662863,
"grad_norm": 0.4208986759185791,
"learning_rate": 8.689107653223246e-05,
"loss": 0.1558,
"step": 2069
},
{
"epoch": 3.9372325249643367,
"grad_norm": 0.36959660053253174,
"learning_rate": 8.68847253096221e-05,
"loss": 0.1247,
"step": 2070
},
{
"epoch": 3.939134569662387,
"grad_norm": 0.3977148234844208,
"learning_rate": 8.687837408701175e-05,
"loss": 0.1428,
"step": 2071
},
{
"epoch": 3.9410366143604376,
"grad_norm": 0.40076392889022827,
"learning_rate": 8.68720228644014e-05,
"loss": 0.1652,
"step": 2072
},
{
"epoch": 3.9429386590584876,
"grad_norm": 0.3828325569629669,
"learning_rate": 8.686567164179104e-05,
"loss": 0.1518,
"step": 2073
},
{
"epoch": 3.9448407037565385,
"grad_norm": 0.35112518072128296,
"learning_rate": 8.685932041918071e-05,
"loss": 0.1303,
"step": 2074
},
{
"epoch": 3.9467427484545885,
"grad_norm": 0.31564921140670776,
"learning_rate": 8.685296919657035e-05,
"loss": 0.1325,
"step": 2075
},
{
"epoch": 3.948644793152639,
"grad_norm": 0.3110829293727875,
"learning_rate": 8.684661797395998e-05,
"loss": 0.0958,
"step": 2076
},
{
"epoch": 3.9505468378506894,
"grad_norm": 0.41574040055274963,
"learning_rate": 8.684026675134964e-05,
"loss": 0.142,
"step": 2077
},
{
"epoch": 3.95244888254874,
"grad_norm": 0.4371127188205719,
"learning_rate": 8.683391552873929e-05,
"loss": 0.1699,
"step": 2078
},
{
"epoch": 3.9543509272467903,
"grad_norm": 0.41888341307640076,
"learning_rate": 8.682756430612894e-05,
"loss": 0.1467,
"step": 2079
},
{
"epoch": 3.9562529719448407,
"grad_norm": 0.4013144373893738,
"learning_rate": 8.682121308351858e-05,
"loss": 0.1541,
"step": 2080
},
{
"epoch": 3.958155016642891,
"grad_norm": 0.3627847135066986,
"learning_rate": 8.681486186090823e-05,
"loss": 0.1412,
"step": 2081
},
{
"epoch": 3.9600570613409416,
"grad_norm": 0.34517934918403625,
"learning_rate": 8.680851063829788e-05,
"loss": 0.1302,
"step": 2082
},
{
"epoch": 3.961959106038992,
"grad_norm": 0.409612238407135,
"learning_rate": 8.680215941568752e-05,
"loss": 0.1806,
"step": 2083
},
{
"epoch": 3.9638611507370425,
"grad_norm": 0.37562572956085205,
"learning_rate": 8.679580819307717e-05,
"loss": 0.1305,
"step": 2084
},
{
"epoch": 3.965763195435093,
"grad_norm": 0.30839917063713074,
"learning_rate": 8.678945697046682e-05,
"loss": 0.1179,
"step": 2085
},
{
"epoch": 3.967665240133143,
"grad_norm": 0.4009683430194855,
"learning_rate": 8.678310574785646e-05,
"loss": 0.1392,
"step": 2086
},
{
"epoch": 3.969567284831194,
"grad_norm": 0.5373052358627319,
"learning_rate": 8.677675452524611e-05,
"loss": 0.2366,
"step": 2087
},
{
"epoch": 3.971469329529244,
"grad_norm": 0.44061073660850525,
"learning_rate": 8.677040330263576e-05,
"loss": 0.1541,
"step": 2088
},
{
"epoch": 3.9733713742272943,
"grad_norm": 0.6880194544792175,
"learning_rate": 8.67640520800254e-05,
"loss": 0.1822,
"step": 2089
},
{
"epoch": 3.9752734189253447,
"grad_norm": 0.4342186450958252,
"learning_rate": 8.675770085741505e-05,
"loss": 0.1398,
"step": 2090
},
{
"epoch": 3.977175463623395,
"grad_norm": 0.3437482714653015,
"learning_rate": 8.675134963480471e-05,
"loss": 0.1407,
"step": 2091
},
{
"epoch": 3.9790775083214456,
"grad_norm": 0.43729832768440247,
"learning_rate": 8.674499841219436e-05,
"loss": 0.1604,
"step": 2092
},
{
"epoch": 3.980979553019496,
"grad_norm": 0.36654895544052124,
"learning_rate": 8.6738647189584e-05,
"loss": 0.1261,
"step": 2093
},
{
"epoch": 3.9828815977175465,
"grad_norm": 0.40422323346138,
"learning_rate": 8.673229596697365e-05,
"loss": 0.1463,
"step": 2094
},
{
"epoch": 3.984783642415597,
"grad_norm": 0.37436428666114807,
"learning_rate": 8.67259447443633e-05,
"loss": 0.1283,
"step": 2095
},
{
"epoch": 3.9866856871136473,
"grad_norm": 0.4568138122558594,
"learning_rate": 8.671959352175294e-05,
"loss": 0.1735,
"step": 2096
},
{
"epoch": 3.9885877318116973,
"grad_norm": 0.3864310681819916,
"learning_rate": 8.671324229914259e-05,
"loss": 0.1458,
"step": 2097
},
{
"epoch": 3.9904897765097482,
"grad_norm": 0.3622378408908844,
"learning_rate": 8.670689107653224e-05,
"loss": 0.1333,
"step": 2098
},
{
"epoch": 3.9923918212077982,
"grad_norm": 0.5126944780349731,
"learning_rate": 8.670053985392188e-05,
"loss": 0.1897,
"step": 2099
},
{
"epoch": 3.9942938659058487,
"grad_norm": 0.3905584216117859,
"learning_rate": 8.669418863131153e-05,
"loss": 0.1743,
"step": 2100
},
{
"epoch": 3.996195910603899,
"grad_norm": 0.4149746298789978,
"learning_rate": 8.668783740870118e-05,
"loss": 0.1686,
"step": 2101
},
{
"epoch": 3.9980979553019496,
"grad_norm": 0.30447009205818176,
"learning_rate": 8.668148618609082e-05,
"loss": 0.1079,
"step": 2102
},
{
"epoch": 4.0,
"grad_norm": 0.533173143863678,
"learning_rate": 8.667513496348047e-05,
"loss": 0.1652,
"step": 2103
},
{
"epoch": 4.00190204469805,
"grad_norm": 0.26669684052467346,
"learning_rate": 8.666878374087011e-05,
"loss": 0.1105,
"step": 2104
},
{
"epoch": 4.003804089396101,
"grad_norm": 0.2511195242404938,
"learning_rate": 8.666243251825978e-05,
"loss": 0.1018,
"step": 2105
},
{
"epoch": 4.005706134094151,
"grad_norm": 0.2838079035282135,
"learning_rate": 8.665608129564942e-05,
"loss": 0.0979,
"step": 2106
},
{
"epoch": 4.007608178792202,
"grad_norm": 0.3789231479167938,
"learning_rate": 8.664973007303905e-05,
"loss": 0.1216,
"step": 2107
},
{
"epoch": 4.009510223490252,
"grad_norm": 0.36412686109542847,
"learning_rate": 8.664337885042872e-05,
"loss": 0.0924,
"step": 2108
},
{
"epoch": 4.011412268188303,
"grad_norm": 0.3399736285209656,
"learning_rate": 8.663702762781836e-05,
"loss": 0.1007,
"step": 2109
},
{
"epoch": 4.013314312886353,
"grad_norm": 0.3104216456413269,
"learning_rate": 8.663067640520801e-05,
"loss": 0.1146,
"step": 2110
},
{
"epoch": 4.0152163575844035,
"grad_norm": 0.33002039790153503,
"learning_rate": 8.662432518259765e-05,
"loss": 0.1112,
"step": 2111
},
{
"epoch": 4.0171184022824535,
"grad_norm": 0.3158220946788788,
"learning_rate": 8.66179739599873e-05,
"loss": 0.0983,
"step": 2112
},
{
"epoch": 4.019020446980504,
"grad_norm": 0.3281852900981903,
"learning_rate": 8.661162273737695e-05,
"loss": 0.1002,
"step": 2113
},
{
"epoch": 4.020922491678554,
"grad_norm": 0.42810752987861633,
"learning_rate": 8.660527151476659e-05,
"loss": 0.145,
"step": 2114
},
{
"epoch": 4.022824536376604,
"grad_norm": 0.343757301568985,
"learning_rate": 8.659892029215624e-05,
"loss": 0.1046,
"step": 2115
},
{
"epoch": 4.024726581074655,
"grad_norm": 0.3978208601474762,
"learning_rate": 8.65925690695459e-05,
"loss": 0.1232,
"step": 2116
},
{
"epoch": 4.026628625772705,
"grad_norm": 0.3716939687728882,
"learning_rate": 8.658621784693553e-05,
"loss": 0.1073,
"step": 2117
},
{
"epoch": 4.028530670470756,
"grad_norm": 0.3938986659049988,
"learning_rate": 8.657986662432518e-05,
"loss": 0.1162,
"step": 2118
},
{
"epoch": 4.030432715168806,
"grad_norm": 0.26515620946884155,
"learning_rate": 8.657351540171484e-05,
"loss": 0.0927,
"step": 2119
},
{
"epoch": 4.032334759866857,
"grad_norm": 0.4481755197048187,
"learning_rate": 8.656716417910447e-05,
"loss": 0.1192,
"step": 2120
},
{
"epoch": 4.034236804564907,
"grad_norm": 0.2902253568172455,
"learning_rate": 8.656081295649413e-05,
"loss": 0.0972,
"step": 2121
},
{
"epoch": 4.036138849262958,
"grad_norm": 0.3764674961566925,
"learning_rate": 8.655446173388378e-05,
"loss": 0.1242,
"step": 2122
},
{
"epoch": 4.038040893961008,
"grad_norm": 0.4040977954864502,
"learning_rate": 8.654811051127343e-05,
"loss": 0.1053,
"step": 2123
},
{
"epoch": 4.039942938659059,
"grad_norm": 0.3967365026473999,
"learning_rate": 8.654175928866307e-05,
"loss": 0.1132,
"step": 2124
},
{
"epoch": 4.041844983357109,
"grad_norm": 0.4135635197162628,
"learning_rate": 8.653540806605272e-05,
"loss": 0.1171,
"step": 2125
},
{
"epoch": 4.04374702805516,
"grad_norm": 0.43473535776138306,
"learning_rate": 8.652905684344237e-05,
"loss": 0.1227,
"step": 2126
},
{
"epoch": 4.04564907275321,
"grad_norm": 0.30436238646507263,
"learning_rate": 8.652270562083201e-05,
"loss": 0.0853,
"step": 2127
},
{
"epoch": 4.04755111745126,
"grad_norm": 0.3265203535556793,
"learning_rate": 8.651635439822166e-05,
"loss": 0.1007,
"step": 2128
},
{
"epoch": 4.049453162149311,
"grad_norm": 0.3733639121055603,
"learning_rate": 8.651000317561131e-05,
"loss": 0.1164,
"step": 2129
},
{
"epoch": 4.051355206847361,
"grad_norm": 0.3707481324672699,
"learning_rate": 8.650365195300095e-05,
"loss": 0.1225,
"step": 2130
},
{
"epoch": 4.0532572515454115,
"grad_norm": 0.39869242906570435,
"learning_rate": 8.64973007303906e-05,
"loss": 0.1127,
"step": 2131
},
{
"epoch": 4.0551592962434615,
"grad_norm": 0.31656894087791443,
"learning_rate": 8.649094950778026e-05,
"loss": 0.0936,
"step": 2132
},
{
"epoch": 4.057061340941512,
"grad_norm": 0.32848450541496277,
"learning_rate": 8.648459828516991e-05,
"loss": 0.1192,
"step": 2133
},
{
"epoch": 4.058963385639562,
"grad_norm": 0.41309690475463867,
"learning_rate": 8.647824706255955e-05,
"loss": 0.1224,
"step": 2134
},
{
"epoch": 4.060865430337613,
"grad_norm": 0.30171439051628113,
"learning_rate": 8.647189583994918e-05,
"loss": 0.1108,
"step": 2135
},
{
"epoch": 4.062767475035663,
"grad_norm": 0.31793013215065,
"learning_rate": 8.646554461733885e-05,
"loss": 0.0958,
"step": 2136
},
{
"epoch": 4.064669519733714,
"grad_norm": 0.3515986502170563,
"learning_rate": 8.645919339472849e-05,
"loss": 0.098,
"step": 2137
},
{
"epoch": 4.066571564431764,
"grad_norm": 0.2572970390319824,
"learning_rate": 8.645284217211813e-05,
"loss": 0.0782,
"step": 2138
},
{
"epoch": 4.068473609129814,
"grad_norm": 0.40460988879203796,
"learning_rate": 8.644649094950779e-05,
"loss": 0.111,
"step": 2139
},
{
"epoch": 4.070375653827865,
"grad_norm": 0.25654932856559753,
"learning_rate": 8.644013972689743e-05,
"loss": 0.078,
"step": 2140
},
{
"epoch": 4.072277698525915,
"grad_norm": 0.3793332278728485,
"learning_rate": 8.643378850428708e-05,
"loss": 0.1113,
"step": 2141
},
{
"epoch": 4.074179743223966,
"grad_norm": 0.3457014560699463,
"learning_rate": 8.642743728167672e-05,
"loss": 0.1016,
"step": 2142
},
{
"epoch": 4.076081787922016,
"grad_norm": 0.41619420051574707,
"learning_rate": 8.642108605906637e-05,
"loss": 0.1379,
"step": 2143
},
{
"epoch": 4.077983832620067,
"grad_norm": 0.3582102656364441,
"learning_rate": 8.641473483645602e-05,
"loss": 0.1068,
"step": 2144
},
{
"epoch": 4.079885877318117,
"grad_norm": 0.4142124652862549,
"learning_rate": 8.640838361384566e-05,
"loss": 0.1155,
"step": 2145
},
{
"epoch": 4.081787922016168,
"grad_norm": 0.3544979393482208,
"learning_rate": 8.640203239123533e-05,
"loss": 0.0969,
"step": 2146
},
{
"epoch": 4.083689966714218,
"grad_norm": 0.37561002373695374,
"learning_rate": 8.639568116862497e-05,
"loss": 0.1218,
"step": 2147
},
{
"epoch": 4.085592011412269,
"grad_norm": 0.3568158447742462,
"learning_rate": 8.63893299460146e-05,
"loss": 0.1225,
"step": 2148
},
{
"epoch": 4.087494056110319,
"grad_norm": 0.3126932382583618,
"learning_rate": 8.638297872340426e-05,
"loss": 0.084,
"step": 2149
},
{
"epoch": 4.089396100808369,
"grad_norm": 0.4232020378112793,
"learning_rate": 8.637662750079391e-05,
"loss": 0.1155,
"step": 2150
},
{
"epoch": 4.0912981455064195,
"grad_norm": 0.4121897518634796,
"learning_rate": 8.637027627818356e-05,
"loss": 0.1352,
"step": 2151
},
{
"epoch": 4.0932001902044695,
"grad_norm": 0.3292025923728943,
"learning_rate": 8.63639250555732e-05,
"loss": 0.115,
"step": 2152
},
{
"epoch": 4.09510223490252,
"grad_norm": 0.3273860514163971,
"learning_rate": 8.635757383296285e-05,
"loss": 0.1087,
"step": 2153
},
{
"epoch": 4.09700427960057,
"grad_norm": 0.36760157346725464,
"learning_rate": 8.63512226103525e-05,
"loss": 0.1206,
"step": 2154
},
{
"epoch": 4.098906324298621,
"grad_norm": 0.3717329502105713,
"learning_rate": 8.634487138774214e-05,
"loss": 0.1244,
"step": 2155
},
{
"epoch": 4.100808368996671,
"grad_norm": 0.379068523645401,
"learning_rate": 8.633852016513179e-05,
"loss": 0.1048,
"step": 2156
},
{
"epoch": 4.102710413694722,
"grad_norm": 0.30912551283836365,
"learning_rate": 8.633216894252144e-05,
"loss": 0.0838,
"step": 2157
},
{
"epoch": 4.104612458392772,
"grad_norm": 0.3093559741973877,
"learning_rate": 8.632581771991108e-05,
"loss": 0.0948,
"step": 2158
},
{
"epoch": 4.106514503090823,
"grad_norm": 0.2924623489379883,
"learning_rate": 8.631946649730073e-05,
"loss": 0.085,
"step": 2159
},
{
"epoch": 4.108416547788873,
"grad_norm": 0.335437536239624,
"learning_rate": 8.631311527469039e-05,
"loss": 0.102,
"step": 2160
},
{
"epoch": 4.110318592486923,
"grad_norm": 0.37450480461120605,
"learning_rate": 8.630676405208002e-05,
"loss": 0.1102,
"step": 2161
},
{
"epoch": 4.112220637184974,
"grad_norm": 0.40548086166381836,
"learning_rate": 8.630041282946968e-05,
"loss": 0.1122,
"step": 2162
},
{
"epoch": 4.114122681883024,
"grad_norm": 0.2255704551935196,
"learning_rate": 8.629406160685933e-05,
"loss": 0.0875,
"step": 2163
},
{
"epoch": 4.116024726581075,
"grad_norm": 0.3774515390396118,
"learning_rate": 8.628771038424898e-05,
"loss": 0.1007,
"step": 2164
},
{
"epoch": 4.117926771279125,
"grad_norm": 0.4410356879234314,
"learning_rate": 8.628135916163862e-05,
"loss": 0.1238,
"step": 2165
},
{
"epoch": 4.119828815977176,
"grad_norm": 0.3007069230079651,
"learning_rate": 8.627500793902826e-05,
"loss": 0.0849,
"step": 2166
},
{
"epoch": 4.121730860675226,
"grad_norm": 0.3165019750595093,
"learning_rate": 8.626865671641792e-05,
"loss": 0.0959,
"step": 2167
},
{
"epoch": 4.1236329053732765,
"grad_norm": 0.3213941752910614,
"learning_rate": 8.626230549380756e-05,
"loss": 0.1011,
"step": 2168
},
{
"epoch": 4.1255349500713265,
"grad_norm": 0.2742742598056793,
"learning_rate": 8.625595427119721e-05,
"loss": 0.0855,
"step": 2169
},
{
"epoch": 4.127436994769377,
"grad_norm": 0.35063308477401733,
"learning_rate": 8.624960304858686e-05,
"loss": 0.1115,
"step": 2170
},
{
"epoch": 4.129339039467427,
"grad_norm": 0.4272489845752716,
"learning_rate": 8.62432518259765e-05,
"loss": 0.1162,
"step": 2171
},
{
"epoch": 4.131241084165478,
"grad_norm": 0.27256911993026733,
"learning_rate": 8.623690060336615e-05,
"loss": 0.1066,
"step": 2172
},
{
"epoch": 4.133143128863528,
"grad_norm": 0.275309294462204,
"learning_rate": 8.623054938075579e-05,
"loss": 0.1029,
"step": 2173
},
{
"epoch": 4.135045173561578,
"grad_norm": 0.2678431570529938,
"learning_rate": 8.622419815814544e-05,
"loss": 0.0836,
"step": 2174
},
{
"epoch": 4.136947218259629,
"grad_norm": 0.3313474953174591,
"learning_rate": 8.62178469355351e-05,
"loss": 0.0925,
"step": 2175
},
{
"epoch": 4.138849262957679,
"grad_norm": 0.2514117658138275,
"learning_rate": 8.621149571292473e-05,
"loss": 0.0905,
"step": 2176
},
{
"epoch": 4.14075130765573,
"grad_norm": 0.2868940532207489,
"learning_rate": 8.62051444903144e-05,
"loss": 0.1057,
"step": 2177
},
{
"epoch": 4.14265335235378,
"grad_norm": 0.3867243826389313,
"learning_rate": 8.619879326770404e-05,
"loss": 0.1151,
"step": 2178
},
{
"epoch": 4.144555397051831,
"grad_norm": 0.3011827766895294,
"learning_rate": 8.619244204509368e-05,
"loss": 0.1152,
"step": 2179
},
{
"epoch": 4.146457441749881,
"grad_norm": 0.33059659600257874,
"learning_rate": 8.618609082248333e-05,
"loss": 0.1121,
"step": 2180
},
{
"epoch": 4.148359486447932,
"grad_norm": 0.45777612924575806,
"learning_rate": 8.617973959987298e-05,
"loss": 0.133,
"step": 2181
},
{
"epoch": 4.150261531145982,
"grad_norm": 0.39224299788475037,
"learning_rate": 8.617338837726263e-05,
"loss": 0.1381,
"step": 2182
},
{
"epoch": 4.152163575844033,
"grad_norm": 0.2813168168067932,
"learning_rate": 8.616703715465227e-05,
"loss": 0.0939,
"step": 2183
},
{
"epoch": 4.154065620542083,
"grad_norm": 0.30850479006767273,
"learning_rate": 8.616068593204192e-05,
"loss": 0.1016,
"step": 2184
},
{
"epoch": 4.155967665240133,
"grad_norm": 0.2755066156387329,
"learning_rate": 8.615433470943157e-05,
"loss": 0.1253,
"step": 2185
},
{
"epoch": 4.157869709938184,
"grad_norm": 0.25375935435295105,
"learning_rate": 8.614798348682121e-05,
"loss": 0.088,
"step": 2186
},
{
"epoch": 4.159771754636234,
"grad_norm": 0.27644097805023193,
"learning_rate": 8.614163226421086e-05,
"loss": 0.1053,
"step": 2187
},
{
"epoch": 4.1616737993342845,
"grad_norm": 0.30916059017181396,
"learning_rate": 8.613528104160052e-05,
"loss": 0.1075,
"step": 2188
},
{
"epoch": 4.1635758440323345,
"grad_norm": 0.3316441476345062,
"learning_rate": 8.612892981899015e-05,
"loss": 0.1087,
"step": 2189
},
{
"epoch": 4.165477888730385,
"grad_norm": 0.27464917302131653,
"learning_rate": 8.61225785963798e-05,
"loss": 0.079,
"step": 2190
},
{
"epoch": 4.167379933428435,
"grad_norm": 0.3684466779232025,
"learning_rate": 8.611622737376946e-05,
"loss": 0.1312,
"step": 2191
},
{
"epoch": 4.169281978126486,
"grad_norm": 0.33914482593536377,
"learning_rate": 8.61098761511591e-05,
"loss": 0.0991,
"step": 2192
},
{
"epoch": 4.171184022824536,
"grad_norm": 0.3610948324203491,
"learning_rate": 8.610352492854875e-05,
"loss": 0.1068,
"step": 2193
},
{
"epoch": 4.173086067522587,
"grad_norm": 0.2824098765850067,
"learning_rate": 8.60971737059384e-05,
"loss": 0.0913,
"step": 2194
},
{
"epoch": 4.174988112220637,
"grad_norm": 0.28685760498046875,
"learning_rate": 8.609082248332805e-05,
"loss": 0.098,
"step": 2195
},
{
"epoch": 4.176890156918688,
"grad_norm": 0.44503989815711975,
"learning_rate": 8.608447126071769e-05,
"loss": 0.1441,
"step": 2196
},
{
"epoch": 4.178792201616738,
"grad_norm": 0.4228593409061432,
"learning_rate": 8.607812003810734e-05,
"loss": 0.1228,
"step": 2197
},
{
"epoch": 4.180694246314788,
"grad_norm": 0.34366467595100403,
"learning_rate": 8.607176881549699e-05,
"loss": 0.0969,
"step": 2198
},
{
"epoch": 4.182596291012839,
"grad_norm": 0.3302469849586487,
"learning_rate": 8.606541759288663e-05,
"loss": 0.1093,
"step": 2199
},
{
"epoch": 4.184498335710889,
"grad_norm": 0.316914826631546,
"learning_rate": 8.605906637027628e-05,
"loss": 0.096,
"step": 2200
},
{
"epoch": 4.18640038040894,
"grad_norm": 0.3100655972957611,
"learning_rate": 8.605271514766594e-05,
"loss": 0.0902,
"step": 2201
},
{
"epoch": 4.18830242510699,
"grad_norm": 0.2934771776199341,
"learning_rate": 8.604636392505557e-05,
"loss": 0.1011,
"step": 2202
},
{
"epoch": 4.190204469805041,
"grad_norm": 0.32837802171707153,
"learning_rate": 8.604001270244523e-05,
"loss": 0.1284,
"step": 2203
},
{
"epoch": 4.192106514503091,
"grad_norm": 0.3842618465423584,
"learning_rate": 8.603366147983488e-05,
"loss": 0.1072,
"step": 2204
},
{
"epoch": 4.194008559201142,
"grad_norm": 0.29006102681159973,
"learning_rate": 8.602731025722453e-05,
"loss": 0.0919,
"step": 2205
},
{
"epoch": 4.195910603899192,
"grad_norm": 0.31507110595703125,
"learning_rate": 8.602095903461417e-05,
"loss": 0.1103,
"step": 2206
},
{
"epoch": 4.1978126485972425,
"grad_norm": 0.35961470007896423,
"learning_rate": 8.60146078120038e-05,
"loss": 0.1738,
"step": 2207
},
{
"epoch": 4.1997146932952925,
"grad_norm": 0.34587833285331726,
"learning_rate": 8.600825658939347e-05,
"loss": 0.1096,
"step": 2208
},
{
"epoch": 4.2016167379933425,
"grad_norm": 0.37271326780319214,
"learning_rate": 8.600190536678311e-05,
"loss": 0.1186,
"step": 2209
},
{
"epoch": 4.203518782691393,
"grad_norm": 0.31880611181259155,
"learning_rate": 8.599555414417275e-05,
"loss": 0.1046,
"step": 2210
},
{
"epoch": 4.205420827389443,
"grad_norm": 0.28906506299972534,
"learning_rate": 8.598920292156241e-05,
"loss": 0.0988,
"step": 2211
},
{
"epoch": 4.207322872087494,
"grad_norm": 0.33470967411994934,
"learning_rate": 8.598285169895205e-05,
"loss": 0.1056,
"step": 2212
},
{
"epoch": 4.209224916785544,
"grad_norm": 0.3186233341693878,
"learning_rate": 8.59765004763417e-05,
"loss": 0.1203,
"step": 2213
},
{
"epoch": 4.211126961483595,
"grad_norm": 0.3465280532836914,
"learning_rate": 8.597014925373134e-05,
"loss": 0.1073,
"step": 2214
},
{
"epoch": 4.213029006181645,
"grad_norm": 0.27451473474502563,
"learning_rate": 8.596379803112099e-05,
"loss": 0.0965,
"step": 2215
},
{
"epoch": 4.214931050879696,
"grad_norm": 0.35004234313964844,
"learning_rate": 8.595744680851064e-05,
"loss": 0.1003,
"step": 2216
},
{
"epoch": 4.216833095577746,
"grad_norm": 0.36494818329811096,
"learning_rate": 8.595109558590028e-05,
"loss": 0.1143,
"step": 2217
},
{
"epoch": 4.218735140275797,
"grad_norm": 0.4278135597705841,
"learning_rate": 8.594474436328995e-05,
"loss": 0.1234,
"step": 2218
},
{
"epoch": 4.220637184973847,
"grad_norm": 0.5124382972717285,
"learning_rate": 8.593839314067959e-05,
"loss": 0.1158,
"step": 2219
},
{
"epoch": 4.222539229671897,
"grad_norm": 0.39850741624832153,
"learning_rate": 8.593204191806923e-05,
"loss": 0.1295,
"step": 2220
},
{
"epoch": 4.224441274369948,
"grad_norm": 0.4141925573348999,
"learning_rate": 8.592569069545888e-05,
"loss": 0.1103,
"step": 2221
},
{
"epoch": 4.226343319067998,
"grad_norm": 0.274980366230011,
"learning_rate": 8.591933947284853e-05,
"loss": 0.0927,
"step": 2222
},
{
"epoch": 4.228245363766049,
"grad_norm": 0.4274260103702545,
"learning_rate": 8.591298825023818e-05,
"loss": 0.1248,
"step": 2223
},
{
"epoch": 4.230147408464099,
"grad_norm": 0.39051416516304016,
"learning_rate": 8.590663702762782e-05,
"loss": 0.1068,
"step": 2224
},
{
"epoch": 4.2320494531621495,
"grad_norm": 0.3913654685020447,
"learning_rate": 8.590028580501747e-05,
"loss": 0.1212,
"step": 2225
},
{
"epoch": 4.2339514978601995,
"grad_norm": 0.33034393191337585,
"learning_rate": 8.589393458240712e-05,
"loss": 0.0875,
"step": 2226
},
{
"epoch": 4.23585354255825,
"grad_norm": 0.405618280172348,
"learning_rate": 8.588758335979676e-05,
"loss": 0.1228,
"step": 2227
},
{
"epoch": 4.2377555872563,
"grad_norm": 0.3220268189907074,
"learning_rate": 8.588123213718641e-05,
"loss": 0.1046,
"step": 2228
},
{
"epoch": 4.239657631954351,
"grad_norm": 0.32537737488746643,
"learning_rate": 8.587488091457606e-05,
"loss": 0.0901,
"step": 2229
},
{
"epoch": 4.241559676652401,
"grad_norm": 0.3968732953071594,
"learning_rate": 8.58685296919657e-05,
"loss": 0.1753,
"step": 2230
},
{
"epoch": 4.243461721350451,
"grad_norm": 0.3441084325313568,
"learning_rate": 8.586217846935535e-05,
"loss": 0.1181,
"step": 2231
},
{
"epoch": 4.245363766048502,
"grad_norm": 0.4014514684677124,
"learning_rate": 8.5855827246745e-05,
"loss": 0.1067,
"step": 2232
},
{
"epoch": 4.247265810746552,
"grad_norm": 0.40167930722236633,
"learning_rate": 8.584947602413464e-05,
"loss": 0.1142,
"step": 2233
},
{
"epoch": 4.249167855444603,
"grad_norm": 0.3604772984981537,
"learning_rate": 8.58431248015243e-05,
"loss": 0.108,
"step": 2234
},
{
"epoch": 4.251069900142653,
"grad_norm": 0.4210832118988037,
"learning_rate": 8.583677357891395e-05,
"loss": 0.1161,
"step": 2235
},
{
"epoch": 4.252971944840704,
"grad_norm": 0.34467047452926636,
"learning_rate": 8.58304223563036e-05,
"loss": 0.1187,
"step": 2236
},
{
"epoch": 4.254873989538754,
"grad_norm": 0.8141130805015564,
"learning_rate": 8.582407113369324e-05,
"loss": 0.1766,
"step": 2237
},
{
"epoch": 4.256776034236805,
"grad_norm": 0.28791263699531555,
"learning_rate": 8.581771991108288e-05,
"loss": 0.0953,
"step": 2238
},
{
"epoch": 4.258678078934855,
"grad_norm": 0.2527415454387665,
"learning_rate": 8.581136868847254e-05,
"loss": 0.0847,
"step": 2239
},
{
"epoch": 4.260580123632906,
"grad_norm": 0.2793647050857544,
"learning_rate": 8.580501746586218e-05,
"loss": 0.116,
"step": 2240
},
{
"epoch": 4.262482168330956,
"grad_norm": 0.5324682593345642,
"learning_rate": 8.579866624325183e-05,
"loss": 0.1357,
"step": 2241
},
{
"epoch": 4.264384213029006,
"grad_norm": 0.31979575753211975,
"learning_rate": 8.579231502064148e-05,
"loss": 0.1004,
"step": 2242
},
{
"epoch": 4.266286257727057,
"grad_norm": 0.453645795583725,
"learning_rate": 8.578596379803112e-05,
"loss": 0.121,
"step": 2243
},
{
"epoch": 4.268188302425107,
"grad_norm": 0.2688881754875183,
"learning_rate": 8.577961257542077e-05,
"loss": 0.0935,
"step": 2244
},
{
"epoch": 4.2700903471231575,
"grad_norm": 0.30262473225593567,
"learning_rate": 8.577326135281041e-05,
"loss": 0.086,
"step": 2245
},
{
"epoch": 4.2719923918212075,
"grad_norm": 0.4076935648918152,
"learning_rate": 8.576691013020006e-05,
"loss": 0.1075,
"step": 2246
},
{
"epoch": 4.273894436519258,
"grad_norm": 0.5229641199111938,
"learning_rate": 8.576055890758972e-05,
"loss": 0.1585,
"step": 2247
},
{
"epoch": 4.275796481217308,
"grad_norm": 0.3732607960700989,
"learning_rate": 8.575420768497935e-05,
"loss": 0.1065,
"step": 2248
},
{
"epoch": 4.277698525915359,
"grad_norm": 0.39624014496803284,
"learning_rate": 8.574785646236902e-05,
"loss": 0.1229,
"step": 2249
},
{
"epoch": 4.279600570613409,
"grad_norm": 0.47354966402053833,
"learning_rate": 8.574150523975866e-05,
"loss": 0.1574,
"step": 2250
},
{
"epoch": 4.28150261531146,
"grad_norm": 0.35089337825775146,
"learning_rate": 8.57351540171483e-05,
"loss": 0.1098,
"step": 2251
},
{
"epoch": 4.28340466000951,
"grad_norm": 0.3599602282047272,
"learning_rate": 8.572880279453795e-05,
"loss": 0.1136,
"step": 2252
},
{
"epoch": 4.285306704707561,
"grad_norm": 0.4661259949207306,
"learning_rate": 8.57224515719276e-05,
"loss": 0.1297,
"step": 2253
},
{
"epoch": 4.287208749405611,
"grad_norm": 0.27821779251098633,
"learning_rate": 8.571610034931725e-05,
"loss": 0.0974,
"step": 2254
},
{
"epoch": 4.289110794103661,
"grad_norm": 0.3892570436000824,
"learning_rate": 8.570974912670689e-05,
"loss": 0.1362,
"step": 2255
},
{
"epoch": 4.291012838801712,
"grad_norm": 0.3612288534641266,
"learning_rate": 8.570339790409654e-05,
"loss": 0.121,
"step": 2256
},
{
"epoch": 4.292914883499762,
"grad_norm": 0.3542415499687195,
"learning_rate": 8.56970466814862e-05,
"loss": 0.1004,
"step": 2257
},
{
"epoch": 4.294816928197813,
"grad_norm": 0.3457956910133362,
"learning_rate": 8.569069545887583e-05,
"loss": 0.1035,
"step": 2258
},
{
"epoch": 4.296718972895863,
"grad_norm": 0.42984023690223694,
"learning_rate": 8.568434423626548e-05,
"loss": 0.1236,
"step": 2259
},
{
"epoch": 4.298621017593914,
"grad_norm": 0.3002376854419708,
"learning_rate": 8.567799301365514e-05,
"loss": 0.0867,
"step": 2260
},
{
"epoch": 4.300523062291964,
"grad_norm": 0.3134646415710449,
"learning_rate": 8.567164179104477e-05,
"loss": 0.0928,
"step": 2261
},
{
"epoch": 4.302425106990015,
"grad_norm": 0.35177892446517944,
"learning_rate": 8.566529056843443e-05,
"loss": 0.1072,
"step": 2262
},
{
"epoch": 4.304327151688065,
"grad_norm": 0.40704670548439026,
"learning_rate": 8.565893934582408e-05,
"loss": 0.1216,
"step": 2263
},
{
"epoch": 4.3062291963861155,
"grad_norm": 0.40002110600471497,
"learning_rate": 8.565258812321372e-05,
"loss": 0.1153,
"step": 2264
},
{
"epoch": 4.3081312410841655,
"grad_norm": 0.28185611963272095,
"learning_rate": 8.564623690060337e-05,
"loss": 0.0815,
"step": 2265
},
{
"epoch": 4.310033285782216,
"grad_norm": 0.45204728841781616,
"learning_rate": 8.563988567799302e-05,
"loss": 0.1285,
"step": 2266
},
{
"epoch": 4.311935330480266,
"grad_norm": 0.39130833745002747,
"learning_rate": 8.563353445538267e-05,
"loss": 0.1235,
"step": 2267
},
{
"epoch": 4.313837375178316,
"grad_norm": 0.29855722188949585,
"learning_rate": 8.562718323277231e-05,
"loss": 0.0943,
"step": 2268
},
{
"epoch": 4.315739419876367,
"grad_norm": 0.2964162826538086,
"learning_rate": 8.562083201016196e-05,
"loss": 0.1056,
"step": 2269
},
{
"epoch": 4.317641464574417,
"grad_norm": 0.3408963978290558,
"learning_rate": 8.561448078755161e-05,
"loss": 0.1096,
"step": 2270
},
{
"epoch": 4.319543509272468,
"grad_norm": 0.26335135102272034,
"learning_rate": 8.560812956494125e-05,
"loss": 0.1258,
"step": 2271
},
{
"epoch": 4.321445553970518,
"grad_norm": 0.45781078934669495,
"learning_rate": 8.56017783423309e-05,
"loss": 0.1441,
"step": 2272
},
{
"epoch": 4.323347598668569,
"grad_norm": 0.30225613713264465,
"learning_rate": 8.559542711972056e-05,
"loss": 0.0886,
"step": 2273
},
{
"epoch": 4.325249643366619,
"grad_norm": 0.39499637484550476,
"learning_rate": 8.55890758971102e-05,
"loss": 0.108,
"step": 2274
},
{
"epoch": 4.32715168806467,
"grad_norm": 0.25995761156082153,
"learning_rate": 8.558272467449985e-05,
"loss": 0.0832,
"step": 2275
},
{
"epoch": 4.32905373276272,
"grad_norm": 0.4667019248008728,
"learning_rate": 8.557637345188948e-05,
"loss": 0.1376,
"step": 2276
},
{
"epoch": 4.330955777460771,
"grad_norm": 0.6616588830947876,
"learning_rate": 8.557002222927915e-05,
"loss": 0.1402,
"step": 2277
},
{
"epoch": 4.332857822158821,
"grad_norm": 0.362642765045166,
"learning_rate": 8.556367100666879e-05,
"loss": 0.1036,
"step": 2278
},
{
"epoch": 4.334759866856871,
"grad_norm": 0.34205347299575806,
"learning_rate": 8.555731978405843e-05,
"loss": 0.0901,
"step": 2279
},
{
"epoch": 4.336661911554922,
"grad_norm": 0.428653746843338,
"learning_rate": 8.555096856144809e-05,
"loss": 0.1291,
"step": 2280
},
{
"epoch": 4.338563956252972,
"grad_norm": 0.31291234493255615,
"learning_rate": 8.554461733883773e-05,
"loss": 0.091,
"step": 2281
},
{
"epoch": 4.3404660009510225,
"grad_norm": 0.33913081884384155,
"learning_rate": 8.553826611622737e-05,
"loss": 0.0844,
"step": 2282
},
{
"epoch": 4.3423680456490725,
"grad_norm": 0.3302326500415802,
"learning_rate": 8.553191489361702e-05,
"loss": 0.0894,
"step": 2283
},
{
"epoch": 4.344270090347123,
"grad_norm": 0.39421653747558594,
"learning_rate": 8.552556367100667e-05,
"loss": 0.1173,
"step": 2284
},
{
"epoch": 4.346172135045173,
"grad_norm": 0.35651376843452454,
"learning_rate": 8.551921244839632e-05,
"loss": 0.0945,
"step": 2285
},
{
"epoch": 4.348074179743224,
"grad_norm": 0.37059125304222107,
"learning_rate": 8.551286122578596e-05,
"loss": 0.1223,
"step": 2286
},
{
"epoch": 4.349976224441274,
"grad_norm": 0.31241846084594727,
"learning_rate": 8.550651000317561e-05,
"loss": 0.1057,
"step": 2287
},
{
"epoch": 4.351878269139325,
"grad_norm": 0.29532214999198914,
"learning_rate": 8.550015878056527e-05,
"loss": 0.1008,
"step": 2288
},
{
"epoch": 4.353780313837375,
"grad_norm": 0.435973584651947,
"learning_rate": 8.54938075579549e-05,
"loss": 0.1258,
"step": 2289
},
{
"epoch": 4.355682358535425,
"grad_norm": 0.3240755498409271,
"learning_rate": 8.548745633534456e-05,
"loss": 0.1383,
"step": 2290
},
{
"epoch": 4.357584403233476,
"grad_norm": 0.3592849373817444,
"learning_rate": 8.548110511273421e-05,
"loss": 0.118,
"step": 2291
},
{
"epoch": 4.359486447931526,
"grad_norm": 0.3495205342769623,
"learning_rate": 8.547475389012385e-05,
"loss": 0.1182,
"step": 2292
},
{
"epoch": 4.361388492629577,
"grad_norm": 0.35103073716163635,
"learning_rate": 8.54684026675135e-05,
"loss": 0.1075,
"step": 2293
},
{
"epoch": 4.363290537327627,
"grad_norm": 0.4233345091342926,
"learning_rate": 8.546205144490315e-05,
"loss": 0.1111,
"step": 2294
},
{
"epoch": 4.365192582025678,
"grad_norm": 0.3999617099761963,
"learning_rate": 8.54557002222928e-05,
"loss": 0.1172,
"step": 2295
},
{
"epoch": 4.367094626723728,
"grad_norm": 0.3122519254684448,
"learning_rate": 8.544934899968244e-05,
"loss": 0.0973,
"step": 2296
},
{
"epoch": 4.368996671421779,
"grad_norm": 0.2844139039516449,
"learning_rate": 8.544299777707209e-05,
"loss": 0.0972,
"step": 2297
},
{
"epoch": 4.370898716119829,
"grad_norm": 0.3841843008995056,
"learning_rate": 8.543664655446174e-05,
"loss": 0.1145,
"step": 2298
},
{
"epoch": 4.37280076081788,
"grad_norm": 0.35272732377052307,
"learning_rate": 8.543029533185138e-05,
"loss": 0.1,
"step": 2299
},
{
"epoch": 4.37470280551593,
"grad_norm": 0.3861033618450165,
"learning_rate": 8.542394410924103e-05,
"loss": 0.12,
"step": 2300
},
{
"epoch": 4.37660485021398,
"grad_norm": 0.2895589768886566,
"learning_rate": 8.541759288663069e-05,
"loss": 0.0857,
"step": 2301
},
{
"epoch": 4.3785068949120305,
"grad_norm": 0.4067385792732239,
"learning_rate": 8.541124166402032e-05,
"loss": 0.114,
"step": 2302
},
{
"epoch": 4.3804089396100805,
"grad_norm": 0.3439483642578125,
"learning_rate": 8.540489044140998e-05,
"loss": 0.1218,
"step": 2303
},
{
"epoch": 4.382310984308131,
"grad_norm": 0.273703396320343,
"learning_rate": 8.539853921879963e-05,
"loss": 0.0919,
"step": 2304
},
{
"epoch": 4.384213029006181,
"grad_norm": 0.2975528836250305,
"learning_rate": 8.539218799618927e-05,
"loss": 0.0786,
"step": 2305
},
{
"epoch": 4.386115073704232,
"grad_norm": 0.3109762370586395,
"learning_rate": 8.538583677357892e-05,
"loss": 0.1043,
"step": 2306
},
{
"epoch": 4.388017118402282,
"grad_norm": 0.30896326899528503,
"learning_rate": 8.537948555096857e-05,
"loss": 0.0986,
"step": 2307
},
{
"epoch": 4.389919163100333,
"grad_norm": 0.24300821125507355,
"learning_rate": 8.537313432835822e-05,
"loss": 0.0821,
"step": 2308
},
{
"epoch": 4.391821207798383,
"grad_norm": 0.2907545566558838,
"learning_rate": 8.536678310574786e-05,
"loss": 0.0943,
"step": 2309
},
{
"epoch": 4.393723252496434,
"grad_norm": 0.4220617115497589,
"learning_rate": 8.53604318831375e-05,
"loss": 0.1359,
"step": 2310
},
{
"epoch": 4.395625297194484,
"grad_norm": 0.3436138331890106,
"learning_rate": 8.535408066052716e-05,
"loss": 0.1106,
"step": 2311
},
{
"epoch": 4.397527341892534,
"grad_norm": 0.36533981561660767,
"learning_rate": 8.53477294379168e-05,
"loss": 0.1194,
"step": 2312
},
{
"epoch": 4.399429386590585,
"grad_norm": 0.3554334044456482,
"learning_rate": 8.534137821530645e-05,
"loss": 0.1571,
"step": 2313
},
{
"epoch": 4.401331431288635,
"grad_norm": 0.3670365512371063,
"learning_rate": 8.53350269926961e-05,
"loss": 0.1299,
"step": 2314
},
{
"epoch": 4.403233475986686,
"grad_norm": 0.4539790451526642,
"learning_rate": 8.532867577008574e-05,
"loss": 0.1348,
"step": 2315
},
{
"epoch": 4.405135520684736,
"grad_norm": 0.29808804392814636,
"learning_rate": 8.53223245474754e-05,
"loss": 0.1046,
"step": 2316
},
{
"epoch": 4.407037565382787,
"grad_norm": 0.3486464321613312,
"learning_rate": 8.531597332486503e-05,
"loss": 0.1047,
"step": 2317
},
{
"epoch": 4.408939610080837,
"grad_norm": 0.2947161793708801,
"learning_rate": 8.530962210225469e-05,
"loss": 0.0814,
"step": 2318
},
{
"epoch": 4.410841654778888,
"grad_norm": 0.3321152627468109,
"learning_rate": 8.530327087964434e-05,
"loss": 0.1068,
"step": 2319
},
{
"epoch": 4.412743699476938,
"grad_norm": 0.2441323846578598,
"learning_rate": 8.529691965703398e-05,
"loss": 0.0813,
"step": 2320
},
{
"epoch": 4.4146457441749885,
"grad_norm": 0.37151622772216797,
"learning_rate": 8.529056843442364e-05,
"loss": 0.0995,
"step": 2321
},
{
"epoch": 4.4165477888730384,
"grad_norm": 0.330240398645401,
"learning_rate": 8.528421721181328e-05,
"loss": 0.0999,
"step": 2322
},
{
"epoch": 4.418449833571089,
"grad_norm": 0.38048794865608215,
"learning_rate": 8.527786598920292e-05,
"loss": 0.1065,
"step": 2323
},
{
"epoch": 4.420351878269139,
"grad_norm": 0.3825136423110962,
"learning_rate": 8.527151476659257e-05,
"loss": 0.1021,
"step": 2324
},
{
"epoch": 4.422253922967189,
"grad_norm": 0.3410681486129761,
"learning_rate": 8.526516354398222e-05,
"loss": 0.0899,
"step": 2325
},
{
"epoch": 4.42415596766524,
"grad_norm": 0.33466002345085144,
"learning_rate": 8.525881232137187e-05,
"loss": 0.1051,
"step": 2326
},
{
"epoch": 4.42605801236329,
"grad_norm": 0.3932620584964752,
"learning_rate": 8.525246109876151e-05,
"loss": 0.1156,
"step": 2327
},
{
"epoch": 4.427960057061341,
"grad_norm": 0.31098031997680664,
"learning_rate": 8.524610987615116e-05,
"loss": 0.1026,
"step": 2328
},
{
"epoch": 4.429862101759391,
"grad_norm": 0.3773583471775055,
"learning_rate": 8.523975865354082e-05,
"loss": 0.1113,
"step": 2329
},
{
"epoch": 4.431764146457442,
"grad_norm": 0.33763033151626587,
"learning_rate": 8.523340743093045e-05,
"loss": 0.0941,
"step": 2330
},
{
"epoch": 4.433666191155492,
"grad_norm": 0.23584803938865662,
"learning_rate": 8.52270562083201e-05,
"loss": 0.0777,
"step": 2331
},
{
"epoch": 4.435568235853543,
"grad_norm": 0.3598161041736603,
"learning_rate": 8.522070498570976e-05,
"loss": 0.1173,
"step": 2332
},
{
"epoch": 4.437470280551593,
"grad_norm": 0.3960074484348297,
"learning_rate": 8.52143537630994e-05,
"loss": 0.119,
"step": 2333
},
{
"epoch": 4.439372325249644,
"grad_norm": 0.3260672092437744,
"learning_rate": 8.520800254048905e-05,
"loss": 0.1107,
"step": 2334
},
{
"epoch": 4.441274369947694,
"grad_norm": 0.3651185929775238,
"learning_rate": 8.52016513178787e-05,
"loss": 0.0993,
"step": 2335
},
{
"epoch": 4.443176414645745,
"grad_norm": 0.39154887199401855,
"learning_rate": 8.519530009526834e-05,
"loss": 0.1168,
"step": 2336
},
{
"epoch": 4.445078459343795,
"grad_norm": 0.3429001569747925,
"learning_rate": 8.518894887265799e-05,
"loss": 0.1111,
"step": 2337
},
{
"epoch": 4.446980504041845,
"grad_norm": 0.3407055735588074,
"learning_rate": 8.518259765004764e-05,
"loss": 0.1032,
"step": 2338
},
{
"epoch": 4.4488825487398955,
"grad_norm": 0.3813023567199707,
"learning_rate": 8.517624642743729e-05,
"loss": 0.1077,
"step": 2339
},
{
"epoch": 4.4507845934379455,
"grad_norm": 0.2836807370185852,
"learning_rate": 8.516989520482693e-05,
"loss": 0.0833,
"step": 2340
},
{
"epoch": 4.452686638135996,
"grad_norm": 0.4083840250968933,
"learning_rate": 8.516354398221657e-05,
"loss": 0.1254,
"step": 2341
},
{
"epoch": 4.454588682834046,
"grad_norm": 0.29835161566734314,
"learning_rate": 8.515719275960623e-05,
"loss": 0.1207,
"step": 2342
},
{
"epoch": 4.456490727532097,
"grad_norm": 0.30677247047424316,
"learning_rate": 8.515084153699587e-05,
"loss": 0.0807,
"step": 2343
},
{
"epoch": 4.458392772230147,
"grad_norm": 0.312853068113327,
"learning_rate": 8.514449031438552e-05,
"loss": 0.1174,
"step": 2344
},
{
"epoch": 4.460294816928198,
"grad_norm": 0.431356281042099,
"learning_rate": 8.513813909177518e-05,
"loss": 0.1324,
"step": 2345
},
{
"epoch": 4.462196861626248,
"grad_norm": 0.2785525918006897,
"learning_rate": 8.513178786916482e-05,
"loss": 0.1025,
"step": 2346
},
{
"epoch": 4.464098906324299,
"grad_norm": 0.2919105291366577,
"learning_rate": 8.512543664655447e-05,
"loss": 0.1154,
"step": 2347
},
{
"epoch": 4.466000951022349,
"grad_norm": 0.4356403350830078,
"learning_rate": 8.51190854239441e-05,
"loss": 0.1161,
"step": 2348
},
{
"epoch": 4.467902995720399,
"grad_norm": 0.3411230146884918,
"learning_rate": 8.511273420133377e-05,
"loss": 0.1032,
"step": 2349
},
{
"epoch": 4.46980504041845,
"grad_norm": 0.3335597515106201,
"learning_rate": 8.510638297872341e-05,
"loss": 0.1427,
"step": 2350
},
{
"epoch": 4.4717070851165,
"grad_norm": 0.3813069760799408,
"learning_rate": 8.510003175611305e-05,
"loss": 0.1214,
"step": 2351
},
{
"epoch": 4.473609129814551,
"grad_norm": 0.2616579830646515,
"learning_rate": 8.509368053350271e-05,
"loss": 0.0914,
"step": 2352
},
{
"epoch": 4.475511174512601,
"grad_norm": 0.24161195755004883,
"learning_rate": 8.508732931089235e-05,
"loss": 0.0806,
"step": 2353
},
{
"epoch": 4.477413219210652,
"grad_norm": 0.41089168190956116,
"learning_rate": 8.508097808828199e-05,
"loss": 0.1095,
"step": 2354
},
{
"epoch": 4.479315263908702,
"grad_norm": 0.2930002510547638,
"learning_rate": 8.507462686567164e-05,
"loss": 0.0851,
"step": 2355
},
{
"epoch": 4.481217308606753,
"grad_norm": 0.38217440247535706,
"learning_rate": 8.506827564306129e-05,
"loss": 0.106,
"step": 2356
},
{
"epoch": 4.483119353304803,
"grad_norm": 0.4617588520050049,
"learning_rate": 8.506192442045094e-05,
"loss": 0.1269,
"step": 2357
},
{
"epoch": 4.4850213980028535,
"grad_norm": 0.33491015434265137,
"learning_rate": 8.505557319784058e-05,
"loss": 0.1086,
"step": 2358
},
{
"epoch": 4.4869234427009035,
"grad_norm": 0.31024834513664246,
"learning_rate": 8.504922197523023e-05,
"loss": 0.1039,
"step": 2359
},
{
"epoch": 4.4888254873989535,
"grad_norm": 0.36780717968940735,
"learning_rate": 8.504287075261989e-05,
"loss": 0.1102,
"step": 2360
},
{
"epoch": 4.490727532097004,
"grad_norm": 0.40606439113616943,
"learning_rate": 8.503651953000952e-05,
"loss": 0.13,
"step": 2361
},
{
"epoch": 4.492629576795054,
"grad_norm": 0.4511033296585083,
"learning_rate": 8.503016830739918e-05,
"loss": 0.1182,
"step": 2362
},
{
"epoch": 4.494531621493105,
"grad_norm": 0.36328256130218506,
"learning_rate": 8.502381708478883e-05,
"loss": 0.1024,
"step": 2363
},
{
"epoch": 4.496433666191155,
"grad_norm": 0.3860591650009155,
"learning_rate": 8.501746586217847e-05,
"loss": 0.1019,
"step": 2364
},
{
"epoch": 4.498335710889206,
"grad_norm": 0.46222564578056335,
"learning_rate": 8.501111463956812e-05,
"loss": 0.1132,
"step": 2365
},
{
"epoch": 4.500237755587256,
"grad_norm": 0.3612005412578583,
"learning_rate": 8.500476341695777e-05,
"loss": 0.0963,
"step": 2366
},
{
"epoch": 4.502139800285307,
"grad_norm": 0.43513086438179016,
"learning_rate": 8.499841219434742e-05,
"loss": 0.1109,
"step": 2367
},
{
"epoch": 4.504041844983357,
"grad_norm": 0.2950316071510315,
"learning_rate": 8.499206097173706e-05,
"loss": 0.1124,
"step": 2368
},
{
"epoch": 4.505943889681408,
"grad_norm": 0.36488962173461914,
"learning_rate": 8.498570974912671e-05,
"loss": 0.1,
"step": 2369
},
{
"epoch": 4.507845934379458,
"grad_norm": 0.3592323064804077,
"learning_rate": 8.497935852651636e-05,
"loss": 0.0995,
"step": 2370
},
{
"epoch": 4.509747979077508,
"grad_norm": 0.34753555059432983,
"learning_rate": 8.4973007303906e-05,
"loss": 0.1026,
"step": 2371
},
{
"epoch": 4.511650023775559,
"grad_norm": 0.39495691657066345,
"learning_rate": 8.496665608129565e-05,
"loss": 0.1272,
"step": 2372
},
{
"epoch": 4.513552068473609,
"grad_norm": 0.3553752601146698,
"learning_rate": 8.49603048586853e-05,
"loss": 0.1136,
"step": 2373
},
{
"epoch": 4.51545411317166,
"grad_norm": 0.37848785519599915,
"learning_rate": 8.495395363607494e-05,
"loss": 0.1069,
"step": 2374
},
{
"epoch": 4.51735615786971,
"grad_norm": 0.33565762639045715,
"learning_rate": 8.49476024134646e-05,
"loss": 0.1075,
"step": 2375
},
{
"epoch": 4.519258202567761,
"grad_norm": 0.3359149694442749,
"learning_rate": 8.494125119085425e-05,
"loss": 0.098,
"step": 2376
},
{
"epoch": 4.521160247265811,
"grad_norm": 0.3218232989311218,
"learning_rate": 8.493489996824389e-05,
"loss": 0.096,
"step": 2377
},
{
"epoch": 4.5230622919638614,
"grad_norm": 0.3153054714202881,
"learning_rate": 8.492854874563354e-05,
"loss": 0.1015,
"step": 2378
},
{
"epoch": 4.5249643366619114,
"grad_norm": 0.37637823820114136,
"learning_rate": 8.492219752302319e-05,
"loss": 0.1164,
"step": 2379
},
{
"epoch": 4.526866381359962,
"grad_norm": 0.3270327150821686,
"learning_rate": 8.491584630041284e-05,
"loss": 0.1084,
"step": 2380
},
{
"epoch": 4.528768426058012,
"grad_norm": 0.23998558521270752,
"learning_rate": 8.490949507780248e-05,
"loss": 0.0777,
"step": 2381
},
{
"epoch": 4.530670470756062,
"grad_norm": 0.31294015049934387,
"learning_rate": 8.490314385519212e-05,
"loss": 0.0807,
"step": 2382
},
{
"epoch": 4.532572515454113,
"grad_norm": 0.3305555582046509,
"learning_rate": 8.489679263258178e-05,
"loss": 0.1011,
"step": 2383
},
{
"epoch": 4.534474560152163,
"grad_norm": 0.35641244053840637,
"learning_rate": 8.489044140997142e-05,
"loss": 0.11,
"step": 2384
},
{
"epoch": 4.536376604850214,
"grad_norm": 0.3511948883533478,
"learning_rate": 8.488409018736107e-05,
"loss": 0.1009,
"step": 2385
},
{
"epoch": 4.538278649548264,
"grad_norm": 0.3899917006492615,
"learning_rate": 8.487773896475071e-05,
"loss": 0.1285,
"step": 2386
},
{
"epoch": 4.540180694246315,
"grad_norm": 0.4415057897567749,
"learning_rate": 8.487138774214036e-05,
"loss": 0.1434,
"step": 2387
},
{
"epoch": 4.542082738944365,
"grad_norm": 0.42669907212257385,
"learning_rate": 8.486503651953002e-05,
"loss": 0.1201,
"step": 2388
},
{
"epoch": 4.543984783642416,
"grad_norm": 0.27351129055023193,
"learning_rate": 8.485868529691965e-05,
"loss": 0.0761,
"step": 2389
},
{
"epoch": 4.545886828340466,
"grad_norm": 0.31243595480918884,
"learning_rate": 8.48523340743093e-05,
"loss": 0.0909,
"step": 2390
},
{
"epoch": 4.547788873038517,
"grad_norm": 0.36273542046546936,
"learning_rate": 8.484598285169896e-05,
"loss": 0.1156,
"step": 2391
},
{
"epoch": 4.549690917736567,
"grad_norm": 0.3167242109775543,
"learning_rate": 8.48396316290886e-05,
"loss": 0.2065,
"step": 2392
},
{
"epoch": 4.551592962434617,
"grad_norm": 0.3072797358036041,
"learning_rate": 8.483328040647825e-05,
"loss": 0.0939,
"step": 2393
},
{
"epoch": 4.553495007132668,
"grad_norm": 0.32601553201675415,
"learning_rate": 8.48269291838679e-05,
"loss": 0.1052,
"step": 2394
},
{
"epoch": 4.555397051830718,
"grad_norm": 0.41232773661613464,
"learning_rate": 8.482057796125754e-05,
"loss": 0.1207,
"step": 2395
},
{
"epoch": 4.5572990965287685,
"grad_norm": 0.46499213576316833,
"learning_rate": 8.481422673864719e-05,
"loss": 0.1251,
"step": 2396
},
{
"epoch": 4.5592011412268185,
"grad_norm": 0.3984009325504303,
"learning_rate": 8.480787551603684e-05,
"loss": 0.1317,
"step": 2397
},
{
"epoch": 4.561103185924869,
"grad_norm": 0.3825131356716156,
"learning_rate": 8.48015242934265e-05,
"loss": 0.1273,
"step": 2398
},
{
"epoch": 4.563005230622919,
"grad_norm": 0.39657148718833923,
"learning_rate": 8.479517307081613e-05,
"loss": 0.145,
"step": 2399
},
{
"epoch": 4.56490727532097,
"grad_norm": 0.3764631748199463,
"learning_rate": 8.478882184820578e-05,
"loss": 0.1133,
"step": 2400
},
{
"epoch": 4.56680932001902,
"grad_norm": 0.2968275249004364,
"learning_rate": 8.478247062559544e-05,
"loss": 0.0885,
"step": 2401
},
{
"epoch": 4.568711364717071,
"grad_norm": 0.326856791973114,
"learning_rate": 8.477611940298507e-05,
"loss": 0.0923,
"step": 2402
},
{
"epoch": 4.570613409415121,
"grad_norm": 0.38287606835365295,
"learning_rate": 8.476976818037473e-05,
"loss": 0.141,
"step": 2403
},
{
"epoch": 4.572515454113171,
"grad_norm": 0.47493815422058105,
"learning_rate": 8.476341695776438e-05,
"loss": 0.1146,
"step": 2404
},
{
"epoch": 4.574417498811222,
"grad_norm": 0.35078614950180054,
"learning_rate": 8.475706573515402e-05,
"loss": 0.1153,
"step": 2405
},
{
"epoch": 4.576319543509273,
"grad_norm": 0.3837313950061798,
"learning_rate": 8.475071451254367e-05,
"loss": 0.1408,
"step": 2406
},
{
"epoch": 4.578221588207323,
"grad_norm": 0.3800102472305298,
"learning_rate": 8.474436328993332e-05,
"loss": 0.1224,
"step": 2407
},
{
"epoch": 4.580123632905373,
"grad_norm": 0.40831804275512695,
"learning_rate": 8.473801206732296e-05,
"loss": 0.1283,
"step": 2408
},
{
"epoch": 4.582025677603424,
"grad_norm": 0.34854429960250854,
"learning_rate": 8.473166084471261e-05,
"loss": 0.101,
"step": 2409
},
{
"epoch": 4.583927722301474,
"grad_norm": 0.3317374885082245,
"learning_rate": 8.472530962210226e-05,
"loss": 0.0986,
"step": 2410
},
{
"epoch": 4.585829766999525,
"grad_norm": 0.3316230773925781,
"learning_rate": 8.471895839949191e-05,
"loss": 0.0955,
"step": 2411
},
{
"epoch": 4.587731811697575,
"grad_norm": 0.3458825945854187,
"learning_rate": 8.471260717688155e-05,
"loss": 0.1246,
"step": 2412
},
{
"epoch": 4.589633856395626,
"grad_norm": 0.2985215187072754,
"learning_rate": 8.470625595427119e-05,
"loss": 0.0904,
"step": 2413
},
{
"epoch": 4.591535901093676,
"grad_norm": 0.5128130912780762,
"learning_rate": 8.469990473166086e-05,
"loss": 0.1119,
"step": 2414
},
{
"epoch": 4.5934379457917265,
"grad_norm": 0.3538981080055237,
"learning_rate": 8.46935535090505e-05,
"loss": 0.1276,
"step": 2415
},
{
"epoch": 4.5953399904897765,
"grad_norm": 0.24112893640995026,
"learning_rate": 8.468720228644015e-05,
"loss": 0.0813,
"step": 2416
},
{
"epoch": 4.597242035187827,
"grad_norm": 0.34151947498321533,
"learning_rate": 8.46808510638298e-05,
"loss": 0.1214,
"step": 2417
},
{
"epoch": 4.599144079885877,
"grad_norm": 0.3011094629764557,
"learning_rate": 8.467449984121944e-05,
"loss": 0.0955,
"step": 2418
},
{
"epoch": 4.601046124583927,
"grad_norm": 0.45026248693466187,
"learning_rate": 8.466814861860909e-05,
"loss": 0.1309,
"step": 2419
},
{
"epoch": 4.602948169281978,
"grad_norm": 0.38199952244758606,
"learning_rate": 8.466179739599873e-05,
"loss": 0.1229,
"step": 2420
},
{
"epoch": 4.604850213980028,
"grad_norm": 0.44846484065055847,
"learning_rate": 8.465544617338839e-05,
"loss": 0.1254,
"step": 2421
},
{
"epoch": 4.606752258678079,
"grad_norm": 0.29512494802474976,
"learning_rate": 8.464909495077803e-05,
"loss": 0.0874,
"step": 2422
},
{
"epoch": 4.608654303376129,
"grad_norm": 0.34601306915283203,
"learning_rate": 8.464274372816767e-05,
"loss": 0.0928,
"step": 2423
},
{
"epoch": 4.61055634807418,
"grad_norm": 0.4081529378890991,
"learning_rate": 8.463639250555733e-05,
"loss": 0.1161,
"step": 2424
},
{
"epoch": 4.61245839277223,
"grad_norm": 0.39208075404167175,
"learning_rate": 8.463004128294697e-05,
"loss": 0.1124,
"step": 2425
},
{
"epoch": 4.614360437470281,
"grad_norm": 0.2740732431411743,
"learning_rate": 8.462369006033661e-05,
"loss": 0.0698,
"step": 2426
},
{
"epoch": 4.616262482168331,
"grad_norm": 0.37493231892585754,
"learning_rate": 8.461733883772626e-05,
"loss": 0.089,
"step": 2427
},
{
"epoch": 4.618164526866382,
"grad_norm": 0.4912300407886505,
"learning_rate": 8.461098761511591e-05,
"loss": 0.1374,
"step": 2428
},
{
"epoch": 4.620066571564432,
"grad_norm": 0.44587963819503784,
"learning_rate": 8.460463639250557e-05,
"loss": 0.1207,
"step": 2429
},
{
"epoch": 4.621968616262482,
"grad_norm": 0.4140859544277191,
"learning_rate": 8.45982851698952e-05,
"loss": 0.1333,
"step": 2430
},
{
"epoch": 4.623870660960533,
"grad_norm": 0.3500138223171234,
"learning_rate": 8.459193394728486e-05,
"loss": 0.1032,
"step": 2431
},
{
"epoch": 4.625772705658583,
"grad_norm": 0.3875083327293396,
"learning_rate": 8.458558272467451e-05,
"loss": 0.1018,
"step": 2432
},
{
"epoch": 4.627674750356634,
"grad_norm": 0.5065046548843384,
"learning_rate": 8.457923150206415e-05,
"loss": 0.125,
"step": 2433
},
{
"epoch": 4.629576795054684,
"grad_norm": 0.2707502841949463,
"learning_rate": 8.45728802794538e-05,
"loss": 0.1002,
"step": 2434
},
{
"epoch": 4.6314788397527344,
"grad_norm": 0.38502418994903564,
"learning_rate": 8.456652905684345e-05,
"loss": 0.1264,
"step": 2435
},
{
"epoch": 4.633380884450784,
"grad_norm": 0.34822702407836914,
"learning_rate": 8.456017783423309e-05,
"loss": 0.1184,
"step": 2436
},
{
"epoch": 4.635282929148835,
"grad_norm": 0.33620592951774597,
"learning_rate": 8.455382661162274e-05,
"loss": 0.1264,
"step": 2437
},
{
"epoch": 4.637184973846885,
"grad_norm": 0.3064115345478058,
"learning_rate": 8.454747538901239e-05,
"loss": 0.1122,
"step": 2438
},
{
"epoch": 4.639087018544936,
"grad_norm": 0.34428808093070984,
"learning_rate": 8.454112416640204e-05,
"loss": 0.1083,
"step": 2439
},
{
"epoch": 4.640989063242986,
"grad_norm": 0.3312735855579376,
"learning_rate": 8.453477294379168e-05,
"loss": 0.1046,
"step": 2440
},
{
"epoch": 4.642891107941036,
"grad_norm": 0.42405757308006287,
"learning_rate": 8.452842172118133e-05,
"loss": 0.1364,
"step": 2441
},
{
"epoch": 4.644793152639087,
"grad_norm": 0.39682331681251526,
"learning_rate": 8.452207049857099e-05,
"loss": 0.1262,
"step": 2442
},
{
"epoch": 4.646695197337137,
"grad_norm": 0.3447044789791107,
"learning_rate": 8.451571927596062e-05,
"loss": 0.1158,
"step": 2443
},
{
"epoch": 4.648597242035188,
"grad_norm": 0.40121355652809143,
"learning_rate": 8.450936805335026e-05,
"loss": 0.1246,
"step": 2444
},
{
"epoch": 4.650499286733238,
"grad_norm": 0.3898472785949707,
"learning_rate": 8.450301683073993e-05,
"loss": 0.1244,
"step": 2445
},
{
"epoch": 4.652401331431289,
"grad_norm": 0.2964152991771698,
"learning_rate": 8.449666560812957e-05,
"loss": 0.0925,
"step": 2446
},
{
"epoch": 4.654303376129339,
"grad_norm": 0.2836705446243286,
"learning_rate": 8.449031438551922e-05,
"loss": 0.101,
"step": 2447
},
{
"epoch": 4.65620542082739,
"grad_norm": 0.3003692030906677,
"learning_rate": 8.448396316290887e-05,
"loss": 0.0922,
"step": 2448
},
{
"epoch": 4.65810746552544,
"grad_norm": 0.5348609089851379,
"learning_rate": 8.447761194029851e-05,
"loss": 0.1735,
"step": 2449
},
{
"epoch": 4.660009510223491,
"grad_norm": 0.3387379050254822,
"learning_rate": 8.447126071768816e-05,
"loss": 0.1126,
"step": 2450
},
{
"epoch": 4.661911554921541,
"grad_norm": 0.30646830797195435,
"learning_rate": 8.44649094950778e-05,
"loss": 0.085,
"step": 2451
},
{
"epoch": 4.663813599619591,
"grad_norm": 0.34434470534324646,
"learning_rate": 8.445855827246746e-05,
"loss": 0.1113,
"step": 2452
},
{
"epoch": 4.6657156443176415,
"grad_norm": 0.38273414969444275,
"learning_rate": 8.44522070498571e-05,
"loss": 0.1135,
"step": 2453
},
{
"epoch": 4.6676176890156915,
"grad_norm": 0.44843336939811707,
"learning_rate": 8.444585582724674e-05,
"loss": 0.1497,
"step": 2454
},
{
"epoch": 4.669519733713742,
"grad_norm": 0.4575416147708893,
"learning_rate": 8.44395046046364e-05,
"loss": 0.1082,
"step": 2455
},
{
"epoch": 4.671421778411792,
"grad_norm": 0.38473185896873474,
"learning_rate": 8.443315338202604e-05,
"loss": 0.1255,
"step": 2456
},
{
"epoch": 4.673323823109843,
"grad_norm": 0.3839578926563263,
"learning_rate": 8.44268021594157e-05,
"loss": 0.1106,
"step": 2457
},
{
"epoch": 4.675225867807893,
"grad_norm": 0.35472893714904785,
"learning_rate": 8.442045093680533e-05,
"loss": 0.1122,
"step": 2458
},
{
"epoch": 4.677127912505944,
"grad_norm": 0.34224382042884827,
"learning_rate": 8.441409971419499e-05,
"loss": 0.0963,
"step": 2459
},
{
"epoch": 4.679029957203994,
"grad_norm": 0.3992440104484558,
"learning_rate": 8.440774849158464e-05,
"loss": 0.1234,
"step": 2460
},
{
"epoch": 4.680932001902045,
"grad_norm": 0.39441943168640137,
"learning_rate": 8.440139726897428e-05,
"loss": 0.11,
"step": 2461
},
{
"epoch": 4.682834046600095,
"grad_norm": 0.43852171301841736,
"learning_rate": 8.439504604636393e-05,
"loss": 0.1361,
"step": 2462
},
{
"epoch": 4.684736091298145,
"grad_norm": 0.35047483444213867,
"learning_rate": 8.438869482375358e-05,
"loss": 0.0981,
"step": 2463
},
{
"epoch": 4.686638135996196,
"grad_norm": 0.3970755934715271,
"learning_rate": 8.438234360114322e-05,
"loss": 0.1196,
"step": 2464
},
{
"epoch": 4.688540180694246,
"grad_norm": 0.2760510742664337,
"learning_rate": 8.437599237853287e-05,
"loss": 0.1035,
"step": 2465
},
{
"epoch": 4.690442225392297,
"grad_norm": 0.26530909538269043,
"learning_rate": 8.436964115592252e-05,
"loss": 0.1589,
"step": 2466
},
{
"epoch": 4.692344270090347,
"grad_norm": 0.2989928126335144,
"learning_rate": 8.436328993331216e-05,
"loss": 0.0945,
"step": 2467
},
{
"epoch": 4.694246314788398,
"grad_norm": 0.42447128891944885,
"learning_rate": 8.435693871070181e-05,
"loss": 0.1433,
"step": 2468
},
{
"epoch": 4.696148359486448,
"grad_norm": 0.4014334976673126,
"learning_rate": 8.435058748809146e-05,
"loss": 0.1242,
"step": 2469
},
{
"epoch": 4.698050404184499,
"grad_norm": 0.3872852921485901,
"learning_rate": 8.434423626548111e-05,
"loss": 0.1195,
"step": 2470
},
{
"epoch": 4.699952448882549,
"grad_norm": 0.3857705891132355,
"learning_rate": 8.433788504287075e-05,
"loss": 0.108,
"step": 2471
},
{
"epoch": 4.7018544935805995,
"grad_norm": 0.3534420430660248,
"learning_rate": 8.43315338202604e-05,
"loss": 0.1218,
"step": 2472
},
{
"epoch": 4.7037565382786495,
"grad_norm": 0.32009604573249817,
"learning_rate": 8.432518259765006e-05,
"loss": 0.1053,
"step": 2473
},
{
"epoch": 4.7056585829766995,
"grad_norm": 0.2501387894153595,
"learning_rate": 8.43188313750397e-05,
"loss": 0.0668,
"step": 2474
},
{
"epoch": 4.70756062767475,
"grad_norm": 0.3360025882720947,
"learning_rate": 8.431248015242935e-05,
"loss": 0.1119,
"step": 2475
},
{
"epoch": 4.709462672372801,
"grad_norm": 0.31509891152381897,
"learning_rate": 8.4306128929819e-05,
"loss": 0.0955,
"step": 2476
},
{
"epoch": 4.711364717070851,
"grad_norm": 0.42007285356521606,
"learning_rate": 8.429977770720864e-05,
"loss": 0.1441,
"step": 2477
},
{
"epoch": 4.713266761768901,
"grad_norm": 0.39764338731765747,
"learning_rate": 8.429342648459829e-05,
"loss": 0.1175,
"step": 2478
},
{
"epoch": 4.715168806466952,
"grad_norm": 0.33381861448287964,
"learning_rate": 8.428707526198794e-05,
"loss": 0.1199,
"step": 2479
},
{
"epoch": 4.717070851165002,
"grad_norm": 0.2918257415294647,
"learning_rate": 8.428072403937758e-05,
"loss": 0.0796,
"step": 2480
},
{
"epoch": 4.718972895863053,
"grad_norm": 0.42560750246047974,
"learning_rate": 8.427437281676723e-05,
"loss": 0.114,
"step": 2481
},
{
"epoch": 4.720874940561103,
"grad_norm": 0.3700113594532013,
"learning_rate": 8.426802159415688e-05,
"loss": 0.1145,
"step": 2482
},
{
"epoch": 4.722776985259154,
"grad_norm": 0.39171457290649414,
"learning_rate": 8.426167037154653e-05,
"loss": 0.128,
"step": 2483
},
{
"epoch": 4.724679029957204,
"grad_norm": 0.3000270426273346,
"learning_rate": 8.425531914893617e-05,
"loss": 0.0932,
"step": 2484
},
{
"epoch": 4.726581074655254,
"grad_norm": 0.2848623991012573,
"learning_rate": 8.424896792632581e-05,
"loss": 0.086,
"step": 2485
},
{
"epoch": 4.728483119353305,
"grad_norm": 0.3404539227485657,
"learning_rate": 8.424261670371548e-05,
"loss": 0.0934,
"step": 2486
},
{
"epoch": 4.730385164051356,
"grad_norm": 0.31609418988227844,
"learning_rate": 8.423626548110511e-05,
"loss": 0.0985,
"step": 2487
},
{
"epoch": 4.732287208749406,
"grad_norm": 0.34037312865257263,
"learning_rate": 8.422991425849477e-05,
"loss": 0.1193,
"step": 2488
},
{
"epoch": 4.734189253447456,
"grad_norm": 0.31899651885032654,
"learning_rate": 8.422356303588442e-05,
"loss": 0.1137,
"step": 2489
},
{
"epoch": 4.736091298145507,
"grad_norm": 0.39307737350463867,
"learning_rate": 8.421721181327406e-05,
"loss": 0.1452,
"step": 2490
},
{
"epoch": 4.7379933428435566,
"grad_norm": 0.26885175704956055,
"learning_rate": 8.421086059066371e-05,
"loss": 0.1025,
"step": 2491
},
{
"epoch": 4.739895387541607,
"grad_norm": 0.23492799699306488,
"learning_rate": 8.420450936805335e-05,
"loss": 0.0821,
"step": 2492
},
{
"epoch": 4.741797432239657,
"grad_norm": 0.30144715309143066,
"learning_rate": 8.419815814544301e-05,
"loss": 0.0924,
"step": 2493
},
{
"epoch": 4.743699476937708,
"grad_norm": 0.3370392322540283,
"learning_rate": 8.419180692283265e-05,
"loss": 0.1281,
"step": 2494
},
{
"epoch": 4.745601521635758,
"grad_norm": 0.3939819633960724,
"learning_rate": 8.418545570022229e-05,
"loss": 0.1115,
"step": 2495
},
{
"epoch": 4.747503566333809,
"grad_norm": 0.7242825627326965,
"learning_rate": 8.417910447761194e-05,
"loss": 0.1038,
"step": 2496
},
{
"epoch": 4.749405611031859,
"grad_norm": 0.3430320620536804,
"learning_rate": 8.417275325500159e-05,
"loss": 0.107,
"step": 2497
},
{
"epoch": 4.75130765572991,
"grad_norm": 0.37956321239471436,
"learning_rate": 8.416640203239123e-05,
"loss": 0.1203,
"step": 2498
},
{
"epoch": 4.75320970042796,
"grad_norm": 0.3118121027946472,
"learning_rate": 8.416005080978088e-05,
"loss": 0.0961,
"step": 2499
},
{
"epoch": 4.75511174512601,
"grad_norm": 0.3842122554779053,
"learning_rate": 8.415369958717053e-05,
"loss": 0.1095,
"step": 2500
},
{
"epoch": 4.757013789824061,
"grad_norm": 0.36103618144989014,
"learning_rate": 8.414734836456019e-05,
"loss": 0.107,
"step": 2501
},
{
"epoch": 4.758915834522111,
"grad_norm": 0.4404369592666626,
"learning_rate": 8.414099714194982e-05,
"loss": 0.0972,
"step": 2502
},
{
"epoch": 4.760817879220162,
"grad_norm": 0.45303696393966675,
"learning_rate": 8.413464591933948e-05,
"loss": 0.1286,
"step": 2503
},
{
"epoch": 4.762719923918212,
"grad_norm": 0.36196044087409973,
"learning_rate": 8.412829469672913e-05,
"loss": 0.1095,
"step": 2504
},
{
"epoch": 4.764621968616263,
"grad_norm": 0.49001795053482056,
"learning_rate": 8.412194347411877e-05,
"loss": 0.1578,
"step": 2505
},
{
"epoch": 4.766524013314313,
"grad_norm": 0.32446369528770447,
"learning_rate": 8.411559225150842e-05,
"loss": 0.0991,
"step": 2506
},
{
"epoch": 4.768426058012364,
"grad_norm": 0.3021388053894043,
"learning_rate": 8.410924102889807e-05,
"loss": 0.0902,
"step": 2507
},
{
"epoch": 4.770328102710414,
"grad_norm": 0.28912147879600525,
"learning_rate": 8.410288980628771e-05,
"loss": 0.106,
"step": 2508
},
{
"epoch": 4.7722301474084645,
"grad_norm": 0.40766748785972595,
"learning_rate": 8.409653858367736e-05,
"loss": 0.1155,
"step": 2509
},
{
"epoch": 4.7741321921065145,
"grad_norm": 0.5005617737770081,
"learning_rate": 8.409018736106701e-05,
"loss": 0.1674,
"step": 2510
},
{
"epoch": 4.7760342368045645,
"grad_norm": 0.4575154781341553,
"learning_rate": 8.408383613845666e-05,
"loss": 0.1639,
"step": 2511
},
{
"epoch": 4.777936281502615,
"grad_norm": 0.4962354302406311,
"learning_rate": 8.40774849158463e-05,
"loss": 0.1336,
"step": 2512
},
{
"epoch": 4.779838326200665,
"grad_norm": 0.4569809138774872,
"learning_rate": 8.407113369323595e-05,
"loss": 0.1323,
"step": 2513
},
{
"epoch": 4.781740370898716,
"grad_norm": 0.34369999170303345,
"learning_rate": 8.40647824706256e-05,
"loss": 0.1171,
"step": 2514
},
{
"epoch": 4.783642415596766,
"grad_norm": 0.3565669655799866,
"learning_rate": 8.405843124801524e-05,
"loss": 0.1159,
"step": 2515
},
{
"epoch": 4.785544460294817,
"grad_norm": 0.24039465188980103,
"learning_rate": 8.405208002540488e-05,
"loss": 0.0976,
"step": 2516
},
{
"epoch": 4.787446504992867,
"grad_norm": 0.37532779574394226,
"learning_rate": 8.404572880279455e-05,
"loss": 0.1129,
"step": 2517
},
{
"epoch": 4.789348549690918,
"grad_norm": 0.334505170583725,
"learning_rate": 8.403937758018419e-05,
"loss": 0.1016,
"step": 2518
},
{
"epoch": 4.791250594388968,
"grad_norm": 0.43082761764526367,
"learning_rate": 8.403302635757384e-05,
"loss": 0.1307,
"step": 2519
},
{
"epoch": 4.793152639087019,
"grad_norm": 0.4381292760372162,
"learning_rate": 8.402667513496349e-05,
"loss": 0.1137,
"step": 2520
},
{
"epoch": 4.795054683785069,
"grad_norm": 0.4337981045246124,
"learning_rate": 8.402032391235313e-05,
"loss": 0.1281,
"step": 2521
},
{
"epoch": 4.796956728483119,
"grad_norm": 0.4429587721824646,
"learning_rate": 8.401397268974278e-05,
"loss": 0.1191,
"step": 2522
},
{
"epoch": 4.79885877318117,
"grad_norm": 0.4298746883869171,
"learning_rate": 8.400762146713242e-05,
"loss": 0.1367,
"step": 2523
},
{
"epoch": 4.80076081787922,
"grad_norm": 0.42826715111732483,
"learning_rate": 8.400127024452208e-05,
"loss": 0.1222,
"step": 2524
},
{
"epoch": 4.802662862577271,
"grad_norm": 0.37338751554489136,
"learning_rate": 8.399491902191172e-05,
"loss": 0.1048,
"step": 2525
},
{
"epoch": 4.804564907275321,
"grad_norm": 0.38671061396598816,
"learning_rate": 8.398856779930136e-05,
"loss": 0.1154,
"step": 2526
},
{
"epoch": 4.806466951973372,
"grad_norm": 0.3544102907180786,
"learning_rate": 8.398221657669103e-05,
"loss": 0.1055,
"step": 2527
},
{
"epoch": 4.808368996671422,
"grad_norm": 0.38023364543914795,
"learning_rate": 8.397586535408066e-05,
"loss": 0.1117,
"step": 2528
},
{
"epoch": 4.8102710413694725,
"grad_norm": 0.3622092008590698,
"learning_rate": 8.396951413147032e-05,
"loss": 0.1099,
"step": 2529
},
{
"epoch": 4.8121730860675225,
"grad_norm": 0.692039966583252,
"learning_rate": 8.396316290885995e-05,
"loss": 0.1335,
"step": 2530
},
{
"epoch": 4.814075130765573,
"grad_norm": 0.35321712493896484,
"learning_rate": 8.39568116862496e-05,
"loss": 0.1175,
"step": 2531
},
{
"epoch": 4.815977175463623,
"grad_norm": 0.37036386132240295,
"learning_rate": 8.395046046363926e-05,
"loss": 0.1253,
"step": 2532
},
{
"epoch": 4.817879220161673,
"grad_norm": 0.42249128222465515,
"learning_rate": 8.39441092410289e-05,
"loss": 0.1163,
"step": 2533
},
{
"epoch": 4.819781264859724,
"grad_norm": 0.3563583195209503,
"learning_rate": 8.393775801841855e-05,
"loss": 0.1597,
"step": 2534
},
{
"epoch": 4.821683309557774,
"grad_norm": 0.39946305751800537,
"learning_rate": 8.39314067958082e-05,
"loss": 0.1156,
"step": 2535
},
{
"epoch": 4.823585354255825,
"grad_norm": 0.31761807203292847,
"learning_rate": 8.392505557319784e-05,
"loss": 0.0946,
"step": 2536
},
{
"epoch": 4.825487398953875,
"grad_norm": 0.4180295765399933,
"learning_rate": 8.391870435058749e-05,
"loss": 0.1271,
"step": 2537
},
{
"epoch": 4.827389443651926,
"grad_norm": 0.36158043146133423,
"learning_rate": 8.391235312797714e-05,
"loss": 0.106,
"step": 2538
},
{
"epoch": 4.829291488349976,
"grad_norm": 0.4044169783592224,
"learning_rate": 8.390600190536678e-05,
"loss": 0.1094,
"step": 2539
},
{
"epoch": 4.831193533048027,
"grad_norm": 0.3362937569618225,
"learning_rate": 8.389965068275643e-05,
"loss": 0.078,
"step": 2540
},
{
"epoch": 4.833095577746077,
"grad_norm": 0.3558341860771179,
"learning_rate": 8.389329946014608e-05,
"loss": 0.1125,
"step": 2541
},
{
"epoch": 4.834997622444128,
"grad_norm": 0.44893354177474976,
"learning_rate": 8.388694823753574e-05,
"loss": 0.1393,
"step": 2542
},
{
"epoch": 4.836899667142178,
"grad_norm": 0.3790888488292694,
"learning_rate": 8.388059701492537e-05,
"loss": 0.1312,
"step": 2543
},
{
"epoch": 4.838801711840228,
"grad_norm": 0.24070213735103607,
"learning_rate": 8.387424579231503e-05,
"loss": 0.0772,
"step": 2544
},
{
"epoch": 4.840703756538279,
"grad_norm": 0.4367123246192932,
"learning_rate": 8.386789456970468e-05,
"loss": 0.1227,
"step": 2545
},
{
"epoch": 4.842605801236329,
"grad_norm": 0.3168450891971588,
"learning_rate": 8.386154334709432e-05,
"loss": 0.0928,
"step": 2546
},
{
"epoch": 4.8445078459343796,
"grad_norm": 0.36236846446990967,
"learning_rate": 8.385519212448397e-05,
"loss": 0.0997,
"step": 2547
},
{
"epoch": 4.8464098906324296,
"grad_norm": 0.31763169169425964,
"learning_rate": 8.384884090187362e-05,
"loss": 0.1093,
"step": 2548
},
{
"epoch": 4.84831193533048,
"grad_norm": 0.3502260148525238,
"learning_rate": 8.384248967926326e-05,
"loss": 0.1299,
"step": 2549
},
{
"epoch": 4.85021398002853,
"grad_norm": 0.3593395948410034,
"learning_rate": 8.383613845665291e-05,
"loss": 0.1066,
"step": 2550
},
{
"epoch": 4.852116024726581,
"grad_norm": 0.39665883779525757,
"learning_rate": 8.382978723404256e-05,
"loss": 0.1267,
"step": 2551
},
{
"epoch": 4.854018069424631,
"grad_norm": 0.4395765960216522,
"learning_rate": 8.38234360114322e-05,
"loss": 0.174,
"step": 2552
},
{
"epoch": 4.855920114122682,
"grad_norm": 0.3507075607776642,
"learning_rate": 8.381708478882185e-05,
"loss": 0.0953,
"step": 2553
},
{
"epoch": 4.857822158820732,
"grad_norm": 0.3769589364528656,
"learning_rate": 8.381073356621149e-05,
"loss": 0.1395,
"step": 2554
},
{
"epoch": 4.859724203518782,
"grad_norm": 0.30503159761428833,
"learning_rate": 8.380438234360116e-05,
"loss": 0.0937,
"step": 2555
},
{
"epoch": 4.861626248216833,
"grad_norm": 0.39943060278892517,
"learning_rate": 8.37980311209908e-05,
"loss": 0.1103,
"step": 2556
},
{
"epoch": 4.863528292914884,
"grad_norm": 0.36200422048568726,
"learning_rate": 8.379167989838043e-05,
"loss": 0.1135,
"step": 2557
},
{
"epoch": 4.865430337612934,
"grad_norm": 0.3811735510826111,
"learning_rate": 8.37853286757701e-05,
"loss": 0.1265,
"step": 2558
},
{
"epoch": 4.867332382310984,
"grad_norm": 0.42090871930122375,
"learning_rate": 8.377897745315974e-05,
"loss": 0.1339,
"step": 2559
},
{
"epoch": 4.869234427009035,
"grad_norm": 0.41796380281448364,
"learning_rate": 8.377262623054939e-05,
"loss": 0.1136,
"step": 2560
},
{
"epoch": 4.871136471707085,
"grad_norm": 0.33189094066619873,
"learning_rate": 8.376627500793903e-05,
"loss": 0.0923,
"step": 2561
},
{
"epoch": 4.873038516405136,
"grad_norm": 0.46369072794914246,
"learning_rate": 8.375992378532868e-05,
"loss": 0.1236,
"step": 2562
},
{
"epoch": 4.874940561103186,
"grad_norm": 0.27973759174346924,
"learning_rate": 8.375357256271833e-05,
"loss": 0.0933,
"step": 2563
},
{
"epoch": 4.876842605801237,
"grad_norm": 0.39309409260749817,
"learning_rate": 8.374722134010797e-05,
"loss": 0.1135,
"step": 2564
},
{
"epoch": 4.878744650499287,
"grad_norm": 0.43652641773223877,
"learning_rate": 8.374087011749763e-05,
"loss": 0.136,
"step": 2565
},
{
"epoch": 4.8806466951973375,
"grad_norm": 0.30485180020332336,
"learning_rate": 8.373451889488727e-05,
"loss": 0.0894,
"step": 2566
},
{
"epoch": 4.8825487398953875,
"grad_norm": 0.40164196491241455,
"learning_rate": 8.372816767227691e-05,
"loss": 0.1235,
"step": 2567
},
{
"epoch": 4.884450784593438,
"grad_norm": 0.3442533314228058,
"learning_rate": 8.372181644966656e-05,
"loss": 0.1222,
"step": 2568
},
{
"epoch": 4.886352829291488,
"grad_norm": 0.38092851638793945,
"learning_rate": 8.371546522705621e-05,
"loss": 0.1135,
"step": 2569
},
{
"epoch": 4.888254873989538,
"grad_norm": 0.37114188075065613,
"learning_rate": 8.370911400444585e-05,
"loss": 0.1181,
"step": 2570
},
{
"epoch": 4.890156918687589,
"grad_norm": 0.35971492528915405,
"learning_rate": 8.37027627818355e-05,
"loss": 0.1247,
"step": 2571
},
{
"epoch": 4.892058963385639,
"grad_norm": 0.25756967067718506,
"learning_rate": 8.369641155922516e-05,
"loss": 0.0929,
"step": 2572
},
{
"epoch": 4.89396100808369,
"grad_norm": 0.4541129171848297,
"learning_rate": 8.369006033661481e-05,
"loss": 0.142,
"step": 2573
},
{
"epoch": 4.89586305278174,
"grad_norm": 0.48526903986930847,
"learning_rate": 8.368370911400445e-05,
"loss": 0.1612,
"step": 2574
},
{
"epoch": 4.897765097479791,
"grad_norm": 0.31703343987464905,
"learning_rate": 8.36773578913941e-05,
"loss": 0.1135,
"step": 2575
},
{
"epoch": 4.899667142177841,
"grad_norm": 0.2969724237918854,
"learning_rate": 8.367100666878375e-05,
"loss": 0.1148,
"step": 2576
},
{
"epoch": 4.901569186875892,
"grad_norm": 0.37165188789367676,
"learning_rate": 8.366465544617339e-05,
"loss": 0.1066,
"step": 2577
},
{
"epoch": 4.903471231573942,
"grad_norm": 0.2899304926395416,
"learning_rate": 8.365830422356304e-05,
"loss": 0.0896,
"step": 2578
},
{
"epoch": 4.905373276271993,
"grad_norm": 0.3420521914958954,
"learning_rate": 8.365195300095269e-05,
"loss": 0.0929,
"step": 2579
},
{
"epoch": 4.907275320970043,
"grad_norm": 0.48174387216567993,
"learning_rate": 8.364560177834233e-05,
"loss": 0.1422,
"step": 2580
},
{
"epoch": 4.909177365668093,
"grad_norm": 0.3492242693901062,
"learning_rate": 8.363925055573198e-05,
"loss": 0.1116,
"step": 2581
},
{
"epoch": 4.911079410366144,
"grad_norm": 0.367914080619812,
"learning_rate": 8.363289933312163e-05,
"loss": 0.1139,
"step": 2582
},
{
"epoch": 4.912981455064194,
"grad_norm": 0.32939612865448,
"learning_rate": 8.362654811051129e-05,
"loss": 0.1175,
"step": 2583
},
{
"epoch": 4.914883499762245,
"grad_norm": 0.3939587473869324,
"learning_rate": 8.362019688790092e-05,
"loss": 0.1263,
"step": 2584
},
{
"epoch": 4.916785544460295,
"grad_norm": 0.36641520261764526,
"learning_rate": 8.361384566529058e-05,
"loss": 0.1219,
"step": 2585
},
{
"epoch": 4.9186875891583455,
"grad_norm": 0.2804834544658661,
"learning_rate": 8.360749444268023e-05,
"loss": 0.0839,
"step": 2586
},
{
"epoch": 4.9205896338563955,
"grad_norm": 0.310461163520813,
"learning_rate": 8.360114322006987e-05,
"loss": 0.0949,
"step": 2587
},
{
"epoch": 4.922491678554446,
"grad_norm": 0.34361201524734497,
"learning_rate": 8.35947919974595e-05,
"loss": 0.1167,
"step": 2588
},
{
"epoch": 4.924393723252496,
"grad_norm": 0.3348811864852905,
"learning_rate": 8.358844077484917e-05,
"loss": 0.1035,
"step": 2589
},
{
"epoch": 4.926295767950547,
"grad_norm": 0.24014593660831451,
"learning_rate": 8.358208955223881e-05,
"loss": 0.1413,
"step": 2590
},
{
"epoch": 4.928197812648597,
"grad_norm": 0.4338441491127014,
"learning_rate": 8.357573832962846e-05,
"loss": 0.1186,
"step": 2591
},
{
"epoch": 4.930099857346647,
"grad_norm": 0.3601210415363312,
"learning_rate": 8.356938710701811e-05,
"loss": 0.1014,
"step": 2592
},
{
"epoch": 4.932001902044698,
"grad_norm": 0.2996499538421631,
"learning_rate": 8.356303588440775e-05,
"loss": 0.0906,
"step": 2593
},
{
"epoch": 4.933903946742748,
"grad_norm": 0.30851230025291443,
"learning_rate": 8.35566846617974e-05,
"loss": 0.0806,
"step": 2594
},
{
"epoch": 4.935805991440799,
"grad_norm": 0.22290165722370148,
"learning_rate": 8.355033343918704e-05,
"loss": 0.0728,
"step": 2595
},
{
"epoch": 4.937708036138849,
"grad_norm": 0.28518247604370117,
"learning_rate": 8.35439822165767e-05,
"loss": 0.0894,
"step": 2596
},
{
"epoch": 4.9396100808369,
"grad_norm": 0.424231618642807,
"learning_rate": 8.353763099396634e-05,
"loss": 0.1157,
"step": 2597
},
{
"epoch": 4.94151212553495,
"grad_norm": 0.5748564600944519,
"learning_rate": 8.353127977135598e-05,
"loss": 0.1777,
"step": 2598
},
{
"epoch": 4.943414170233001,
"grad_norm": 0.39010798931121826,
"learning_rate": 8.352492854874565e-05,
"loss": 0.104,
"step": 2599
},
{
"epoch": 4.945316214931051,
"grad_norm": 0.40491625666618347,
"learning_rate": 8.351857732613529e-05,
"loss": 0.115,
"step": 2600
},
{
"epoch": 4.947218259629102,
"grad_norm": 0.3881874084472656,
"learning_rate": 8.351222610352494e-05,
"loss": 0.1125,
"step": 2601
},
{
"epoch": 4.949120304327152,
"grad_norm": 0.4075947403907776,
"learning_rate": 8.350587488091458e-05,
"loss": 0.1376,
"step": 2602
},
{
"epoch": 4.951022349025202,
"grad_norm": 0.4263762831687927,
"learning_rate": 8.349952365830423e-05,
"loss": 0.1214,
"step": 2603
},
{
"epoch": 4.9529243937232525,
"grad_norm": 0.4403824806213379,
"learning_rate": 8.349317243569388e-05,
"loss": 0.1212,
"step": 2604
},
{
"epoch": 4.9548264384213025,
"grad_norm": 0.41958004236221313,
"learning_rate": 8.348682121308352e-05,
"loss": 0.1197,
"step": 2605
},
{
"epoch": 4.956728483119353,
"grad_norm": 0.3664645850658417,
"learning_rate": 8.348046999047317e-05,
"loss": 0.1208,
"step": 2606
},
{
"epoch": 4.958630527817403,
"grad_norm": 0.3618158996105194,
"learning_rate": 8.347411876786282e-05,
"loss": 0.1241,
"step": 2607
},
{
"epoch": 4.960532572515454,
"grad_norm": 0.3135223686695099,
"learning_rate": 8.346776754525246e-05,
"loss": 0.0807,
"step": 2608
},
{
"epoch": 4.962434617213504,
"grad_norm": 0.3673211932182312,
"learning_rate": 8.346141632264211e-05,
"loss": 0.1188,
"step": 2609
},
{
"epoch": 4.964336661911555,
"grad_norm": 0.34168919920921326,
"learning_rate": 8.345506510003176e-05,
"loss": 0.1113,
"step": 2610
},
{
"epoch": 4.966238706609605,
"grad_norm": 0.3807981312274933,
"learning_rate": 8.34487138774214e-05,
"loss": 0.1243,
"step": 2611
},
{
"epoch": 4.968140751307656,
"grad_norm": 0.35833629965782166,
"learning_rate": 8.344236265481105e-05,
"loss": 0.1175,
"step": 2612
},
{
"epoch": 4.970042796005706,
"grad_norm": 0.4410795569419861,
"learning_rate": 8.34360114322007e-05,
"loss": 0.1174,
"step": 2613
},
{
"epoch": 4.971944840703756,
"grad_norm": 0.27122291922569275,
"learning_rate": 8.342966020959036e-05,
"loss": 0.1062,
"step": 2614
},
{
"epoch": 4.973846885401807,
"grad_norm": 0.3411978483200073,
"learning_rate": 8.342330898698e-05,
"loss": 0.1274,
"step": 2615
},
{
"epoch": 4.975748930099857,
"grad_norm": 0.36536306142807007,
"learning_rate": 8.341695776436965e-05,
"loss": 0.1182,
"step": 2616
},
{
"epoch": 4.977650974797908,
"grad_norm": 0.3873109221458435,
"learning_rate": 8.34106065417593e-05,
"loss": 0.1043,
"step": 2617
},
{
"epoch": 4.979553019495958,
"grad_norm": 0.30192115902900696,
"learning_rate": 8.340425531914894e-05,
"loss": 0.0984,
"step": 2618
},
{
"epoch": 4.981455064194009,
"grad_norm": 0.37886565923690796,
"learning_rate": 8.339790409653859e-05,
"loss": 0.1161,
"step": 2619
},
{
"epoch": 4.983357108892059,
"grad_norm": 0.34957846999168396,
"learning_rate": 8.339155287392824e-05,
"loss": 0.1083,
"step": 2620
},
{
"epoch": 4.98525915359011,
"grad_norm": 0.3169527053833008,
"learning_rate": 8.338520165131788e-05,
"loss": 0.088,
"step": 2621
},
{
"epoch": 4.98716119828816,
"grad_norm": 0.41983914375305176,
"learning_rate": 8.337885042870753e-05,
"loss": 0.1158,
"step": 2622
},
{
"epoch": 4.9890632429862105,
"grad_norm": 0.3467552661895752,
"learning_rate": 8.337249920609718e-05,
"loss": 0.0958,
"step": 2623
},
{
"epoch": 4.9909652876842605,
"grad_norm": 0.3872130513191223,
"learning_rate": 8.336614798348682e-05,
"loss": 0.1012,
"step": 2624
},
{
"epoch": 4.9928673323823105,
"grad_norm": 0.2966238856315613,
"learning_rate": 8.335979676087647e-05,
"loss": 0.0913,
"step": 2625
}
],
"logging_steps": 1,
"max_steps": 15750,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 525,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.721151480093e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}