{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9985734664764622,
"eval_steps": 500,
"global_step": 525,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019020446980504042,
"grad_norm": 0.9932524561882019,
"learning_rate": 2e-05,
"loss": 1.3348,
"step": 1
},
{
"epoch": 0.0038040893961008085,
"grad_norm": 0.9241018295288086,
"learning_rate": 4e-05,
"loss": 1.3131,
"step": 2
},
{
"epoch": 0.005706134094151213,
"grad_norm": 1.1556137800216675,
"learning_rate": 6e-05,
"loss": 1.5644,
"step": 3
},
{
"epoch": 0.007608178792201617,
"grad_norm": 0.8612737059593201,
"learning_rate": 8e-05,
"loss": 1.2192,
"step": 4
},
{
"epoch": 0.009510223490252021,
"grad_norm": 0.8998388648033142,
"learning_rate": 0.0001,
"loss": 1.3651,
"step": 5
},
{
"epoch": 0.011412268188302425,
"grad_norm": 0.7211980819702148,
"learning_rate": 9.999364877738964e-05,
"loss": 1.2525,
"step": 6
},
{
"epoch": 0.01331431288635283,
"grad_norm": 0.44894707202911377,
"learning_rate": 9.998729755477931e-05,
"loss": 1.1999,
"step": 7
},
{
"epoch": 0.015216357584403234,
"grad_norm": 0.4338511824607849,
"learning_rate": 9.998094633216895e-05,
"loss": 1.0147,
"step": 8
},
{
"epoch": 0.017118402282453638,
"grad_norm": 0.5658989548683167,
"learning_rate": 9.99745951095586e-05,
"loss": 1.1997,
"step": 9
},
{
"epoch": 0.019020446980504042,
"grad_norm": 0.4467356503009796,
"learning_rate": 9.996824388694824e-05,
"loss": 1.0424,
"step": 10
},
{
"epoch": 0.020922491678554447,
"grad_norm": 0.3743385374546051,
"learning_rate": 9.996189266433789e-05,
"loss": 1.0902,
"step": 11
},
{
"epoch": 0.02282453637660485,
"grad_norm": 0.30667275190353394,
"learning_rate": 9.995554144172754e-05,
"loss": 0.8736,
"step": 12
},
{
"epoch": 0.024726581074655255,
"grad_norm": 0.48634254932403564,
"learning_rate": 9.994919021911718e-05,
"loss": 0.977,
"step": 13
},
{
"epoch": 0.02662862577270566,
"grad_norm": 0.4229658246040344,
"learning_rate": 9.994283899650683e-05,
"loss": 0.9673,
"step": 14
},
{
"epoch": 0.028530670470756064,
"grad_norm": 0.39269882440567017,
"learning_rate": 9.993648777389648e-05,
"loss": 1.0001,
"step": 15
},
{
"epoch": 0.030432715168806468,
"grad_norm": 0.38597363233566284,
"learning_rate": 9.993013655128612e-05,
"loss": 0.9705,
"step": 16
},
{
"epoch": 0.03233475986685687,
"grad_norm": 0.40809136629104614,
"learning_rate": 9.992378532867577e-05,
"loss": 0.9246,
"step": 17
},
{
"epoch": 0.034236804564907276,
"grad_norm": 0.4431133270263672,
"learning_rate": 9.991743410606542e-05,
"loss": 1.0409,
"step": 18
},
{
"epoch": 0.03613884926295768,
"grad_norm": 0.5659255981445312,
"learning_rate": 9.991108288345506e-05,
"loss": 1.1118,
"step": 19
},
{
"epoch": 0.038040893961008085,
"grad_norm": 0.4943106472492218,
"learning_rate": 9.990473166084471e-05,
"loss": 0.9213,
"step": 20
},
{
"epoch": 0.039942938659058486,
"grad_norm": 0.48820945620536804,
"learning_rate": 9.989838043823437e-05,
"loss": 0.9108,
"step": 21
},
{
"epoch": 0.04184498335710889,
"grad_norm": 0.4464576542377472,
"learning_rate": 9.989202921562402e-05,
"loss": 0.8959,
"step": 22
},
{
"epoch": 0.043747028055159294,
"grad_norm": 0.3870016038417816,
"learning_rate": 9.988567799301366e-05,
"loss": 0.8013,
"step": 23
},
{
"epoch": 0.0456490727532097,
"grad_norm": 0.42381179332733154,
"learning_rate": 9.987932677040331e-05,
"loss": 0.8584,
"step": 24
},
{
"epoch": 0.0475511174512601,
"grad_norm": 0.37170907855033875,
"learning_rate": 9.987297554779296e-05,
"loss": 0.7849,
"step": 25
},
{
"epoch": 0.04945316214931051,
"grad_norm": 0.4516700506210327,
"learning_rate": 9.98666243251826e-05,
"loss": 0.8902,
"step": 26
},
{
"epoch": 0.05135520684736091,
"grad_norm": 0.3525027334690094,
"learning_rate": 9.986027310257225e-05,
"loss": 0.6029,
"step": 27
},
{
"epoch": 0.05325725154541132,
"grad_norm": 0.437707781791687,
"learning_rate": 9.98539218799619e-05,
"loss": 0.7387,
"step": 28
},
{
"epoch": 0.05515929624346172,
"grad_norm": 0.45205071568489075,
"learning_rate": 9.984757065735154e-05,
"loss": 0.7468,
"step": 29
},
{
"epoch": 0.05706134094151213,
"grad_norm": 0.3709086775779724,
"learning_rate": 9.984121943474119e-05,
"loss": 0.7365,
"step": 30
},
{
"epoch": 0.05896338563956253,
"grad_norm": 0.4089844822883606,
"learning_rate": 9.983486821213084e-05,
"loss": 0.6563,
"step": 31
},
{
"epoch": 0.060865430337612936,
"grad_norm": 0.45955532789230347,
"learning_rate": 9.982851698952048e-05,
"loss": 0.8021,
"step": 32
},
{
"epoch": 0.06276747503566334,
"grad_norm": 0.5240988731384277,
"learning_rate": 9.982216576691013e-05,
"loss": 0.6933,
"step": 33
},
{
"epoch": 0.06466951973371374,
"grad_norm": 0.4703526496887207,
"learning_rate": 9.981581454429977e-05,
"loss": 0.7339,
"step": 34
},
{
"epoch": 0.06657156443176415,
"grad_norm": 0.5659805536270142,
"learning_rate": 9.980946332168944e-05,
"loss": 0.8139,
"step": 35
},
{
"epoch": 0.06847360912981455,
"grad_norm": 0.39259326457977295,
"learning_rate": 9.980311209907908e-05,
"loss": 0.5838,
"step": 36
},
{
"epoch": 0.07037565382786495,
"grad_norm": 0.4165003001689911,
"learning_rate": 9.979676087646871e-05,
"loss": 0.674,
"step": 37
},
{
"epoch": 0.07227769852591535,
"grad_norm": 0.4533802568912506,
"learning_rate": 9.979040965385838e-05,
"loss": 0.6974,
"step": 38
},
{
"epoch": 0.07417974322396577,
"grad_norm": 0.5213814973831177,
"learning_rate": 9.978405843124802e-05,
"loss": 0.7896,
"step": 39
},
{
"epoch": 0.07608178792201617,
"grad_norm": 0.3241259753704071,
"learning_rate": 9.977770720863767e-05,
"loss": 0.5895,
"step": 40
},
{
"epoch": 0.07798383262006657,
"grad_norm": 0.34446167945861816,
"learning_rate": 9.977135598602731e-05,
"loss": 0.6222,
"step": 41
},
{
"epoch": 0.07988587731811697,
"grad_norm": 0.49035167694091797,
"learning_rate": 9.976500476341696e-05,
"loss": 0.6978,
"step": 42
},
{
"epoch": 0.08178792201616739,
"grad_norm": 0.4795296788215637,
"learning_rate": 9.975865354080661e-05,
"loss": 0.7368,
"step": 43
},
{
"epoch": 0.08368996671421779,
"grad_norm": 0.44959381222724915,
"learning_rate": 9.975230231819625e-05,
"loss": 0.57,
"step": 44
},
{
"epoch": 0.08559201141226819,
"grad_norm": 0.4577605426311493,
"learning_rate": 9.974595109558592e-05,
"loss": 0.691,
"step": 45
},
{
"epoch": 0.08749405611031859,
"grad_norm": 0.41654840111732483,
"learning_rate": 9.973959987297555e-05,
"loss": 0.6346,
"step": 46
},
{
"epoch": 0.089396100808369,
"grad_norm": 0.6599829196929932,
"learning_rate": 9.973324865036519e-05,
"loss": 0.6358,
"step": 47
},
{
"epoch": 0.0912981455064194,
"grad_norm": 0.38539162278175354,
"learning_rate": 9.972689742775484e-05,
"loss": 0.5723,
"step": 48
},
{
"epoch": 0.0932001902044698,
"grad_norm": 0.4626316428184509,
"learning_rate": 9.97205462051445e-05,
"loss": 0.6845,
"step": 49
},
{
"epoch": 0.0951022349025202,
"grad_norm": 0.348387211561203,
"learning_rate": 9.971419498253413e-05,
"loss": 0.4857,
"step": 50
},
{
"epoch": 0.09700427960057062,
"grad_norm": 0.4964020252227783,
"learning_rate": 9.970784375992379e-05,
"loss": 0.7141,
"step": 51
},
{
"epoch": 0.09890632429862102,
"grad_norm": 0.4282241463661194,
"learning_rate": 9.970149253731344e-05,
"loss": 0.6619,
"step": 52
},
{
"epoch": 0.10080836899667142,
"grad_norm": 0.35991716384887695,
"learning_rate": 9.969514131470309e-05,
"loss": 0.4727,
"step": 53
},
{
"epoch": 0.10271041369472182,
"grad_norm": 0.3936012387275696,
"learning_rate": 9.968879009209273e-05,
"loss": 0.5644,
"step": 54
},
{
"epoch": 0.10461245839277224,
"grad_norm": 0.39267924427986145,
"learning_rate": 9.968243886948238e-05,
"loss": 0.5126,
"step": 55
},
{
"epoch": 0.10651450309082264,
"grad_norm": 0.4119136333465576,
"learning_rate": 9.967608764687203e-05,
"loss": 0.471,
"step": 56
},
{
"epoch": 0.10841654778887304,
"grad_norm": 0.5160384178161621,
"learning_rate": 9.966973642426167e-05,
"loss": 0.6555,
"step": 57
},
{
"epoch": 0.11031859248692344,
"grad_norm": 0.4742174744606018,
"learning_rate": 9.966338520165132e-05,
"loss": 0.6093,
"step": 58
},
{
"epoch": 0.11222063718497385,
"grad_norm": 0.3615169823169708,
"learning_rate": 9.965703397904097e-05,
"loss": 0.5527,
"step": 59
},
{
"epoch": 0.11412268188302425,
"grad_norm": 0.5700575113296509,
"learning_rate": 9.965068275643061e-05,
"loss": 0.5713,
"step": 60
},
{
"epoch": 0.11602472658107466,
"grad_norm": 0.4825727939605713,
"learning_rate": 9.964433153382026e-05,
"loss": 0.5142,
"step": 61
},
{
"epoch": 0.11792677127912506,
"grad_norm": 0.392088919878006,
"learning_rate": 9.963798031120992e-05,
"loss": 0.513,
"step": 62
},
{
"epoch": 0.11982881597717546,
"grad_norm": 0.35883110761642456,
"learning_rate": 9.963162908859957e-05,
"loss": 0.501,
"step": 63
},
{
"epoch": 0.12173086067522587,
"grad_norm": 0.39946749806404114,
"learning_rate": 9.96252778659892e-05,
"loss": 0.5532,
"step": 64
},
{
"epoch": 0.12363290537327627,
"grad_norm": 0.4191288352012634,
"learning_rate": 9.961892664337886e-05,
"loss": 0.5258,
"step": 65
},
{
"epoch": 0.12553495007132667,
"grad_norm": 0.3662487268447876,
"learning_rate": 9.961257542076851e-05,
"loss": 0.5121,
"step": 66
},
{
"epoch": 0.1274369947693771,
"grad_norm": 0.5582164525985718,
"learning_rate": 9.960622419815815e-05,
"loss": 0.6494,
"step": 67
},
{
"epoch": 0.12933903946742747,
"grad_norm": 0.485128790140152,
"learning_rate": 9.959987297554779e-05,
"loss": 0.6022,
"step": 68
},
{
"epoch": 0.1312410841654779,
"grad_norm": 0.3816944360733032,
"learning_rate": 9.959352175293745e-05,
"loss": 0.4851,
"step": 69
},
{
"epoch": 0.1331431288635283,
"grad_norm": 0.3637336194515228,
"learning_rate": 9.958717053032709e-05,
"loss": 0.4344,
"step": 70
},
{
"epoch": 0.1350451735615787,
"grad_norm": 0.4418705105781555,
"learning_rate": 9.958081930771674e-05,
"loss": 0.6008,
"step": 71
},
{
"epoch": 0.1369472182596291,
"grad_norm": 0.44138631224632263,
"learning_rate": 9.95744680851064e-05,
"loss": 0.5319,
"step": 72
},
{
"epoch": 0.1388492629576795,
"grad_norm": 0.37523001432418823,
"learning_rate": 9.956811686249603e-05,
"loss": 0.657,
"step": 73
},
{
"epoch": 0.1407513076557299,
"grad_norm": 0.4489665627479553,
"learning_rate": 9.956176563988568e-05,
"loss": 0.5526,
"step": 74
},
{
"epoch": 0.14265335235378032,
"grad_norm": 0.39318791031837463,
"learning_rate": 9.955541441727532e-05,
"loss": 0.6046,
"step": 75
},
{
"epoch": 0.1445553970518307,
"grad_norm": 0.4817538261413574,
"learning_rate": 9.954906319466499e-05,
"loss": 0.5149,
"step": 76
},
{
"epoch": 0.14645744174988112,
"grad_norm": 0.4451163411140442,
"learning_rate": 9.954271197205463e-05,
"loss": 0.4892,
"step": 77
},
{
"epoch": 0.14835948644793154,
"grad_norm": 0.29836660623550415,
"learning_rate": 9.953636074944426e-05,
"loss": 0.4005,
"step": 78
},
{
"epoch": 0.15026153114598192,
"grad_norm": 0.3185100555419922,
"learning_rate": 9.953000952683393e-05,
"loss": 0.4168,
"step": 79
},
{
"epoch": 0.15216357584403234,
"grad_norm": 0.26550424098968506,
"learning_rate": 9.952365830422357e-05,
"loss": 0.39,
"step": 80
},
{
"epoch": 0.15406562054208273,
"grad_norm": 0.4328240156173706,
"learning_rate": 9.951730708161322e-05,
"loss": 0.5041,
"step": 81
},
{
"epoch": 0.15596766524013314,
"grad_norm": 0.5178936123847961,
"learning_rate": 9.951095585900286e-05,
"loss": 0.6017,
"step": 82
},
{
"epoch": 0.15786970993818356,
"grad_norm": 0.45657551288604736,
"learning_rate": 9.950460463639251e-05,
"loss": 0.5734,
"step": 83
},
{
"epoch": 0.15977175463623394,
"grad_norm": 0.5482913851737976,
"learning_rate": 9.949825341378216e-05,
"loss": 0.6015,
"step": 84
},
{
"epoch": 0.16167379933428436,
"grad_norm": 0.39362308382987976,
"learning_rate": 9.94919021911718e-05,
"loss": 0.5712,
"step": 85
},
{
"epoch": 0.16357584403233477,
"grad_norm": 0.4381113350391388,
"learning_rate": 9.948555096856145e-05,
"loss": 0.5194,
"step": 86
},
{
"epoch": 0.16547788873038516,
"grad_norm": 0.5021312236785889,
"learning_rate": 9.94791997459511e-05,
"loss": 0.5279,
"step": 87
},
{
"epoch": 0.16737993342843557,
"grad_norm": 0.4364267587661743,
"learning_rate": 9.947284852334074e-05,
"loss": 0.5892,
"step": 88
},
{
"epoch": 0.16928197812648596,
"grad_norm": 0.37873050570487976,
"learning_rate": 9.94664973007304e-05,
"loss": 0.5328,
"step": 89
},
{
"epoch": 0.17118402282453637,
"grad_norm": 0.4768919050693512,
"learning_rate": 9.946014607812005e-05,
"loss": 0.4889,
"step": 90
},
{
"epoch": 0.1730860675225868,
"grad_norm": 0.3834541440010071,
"learning_rate": 9.945379485550968e-05,
"loss": 0.4642,
"step": 91
},
{
"epoch": 0.17498811222063718,
"grad_norm": 0.48581764101982117,
"learning_rate": 9.944744363289934e-05,
"loss": 0.4741,
"step": 92
},
{
"epoch": 0.1768901569186876,
"grad_norm": 0.39364808797836304,
"learning_rate": 9.944109241028899e-05,
"loss": 0.5684,
"step": 93
},
{
"epoch": 0.178792201616738,
"grad_norm": 0.4657204747200012,
"learning_rate": 9.943474118767864e-05,
"loss": 0.609,
"step": 94
},
{
"epoch": 0.1806942463147884,
"grad_norm": 0.40989887714385986,
"learning_rate": 9.942838996506828e-05,
"loss": 0.4319,
"step": 95
},
{
"epoch": 0.1825962910128388,
"grad_norm": 0.43797624111175537,
"learning_rate": 9.942203874245793e-05,
"loss": 0.4997,
"step": 96
},
{
"epoch": 0.1844983357108892,
"grad_norm": 0.3887675106525421,
"learning_rate": 9.941568751984758e-05,
"loss": 0.5548,
"step": 97
},
{
"epoch": 0.1864003804089396,
"grad_norm": 0.39017003774642944,
"learning_rate": 9.940933629723722e-05,
"loss": 0.5113,
"step": 98
},
{
"epoch": 0.18830242510699002,
"grad_norm": 0.41409194469451904,
"learning_rate": 9.940298507462687e-05,
"loss": 0.5496,
"step": 99
},
{
"epoch": 0.1902044698050404,
"grad_norm": 0.34578803181648254,
"learning_rate": 9.939663385201652e-05,
"loss": 0.4048,
"step": 100
},
{
"epoch": 0.19210651450309082,
"grad_norm": 0.32233092188835144,
"learning_rate": 9.939028262940616e-05,
"loss": 0.4442,
"step": 101
},
{
"epoch": 0.19400855920114124,
"grad_norm": 0.45841965079307556,
"learning_rate": 9.938393140679581e-05,
"loss": 0.5646,
"step": 102
},
{
"epoch": 0.19591060389919163,
"grad_norm": 0.3825596272945404,
"learning_rate": 9.937758018418547e-05,
"loss": 0.4583,
"step": 103
},
{
"epoch": 0.19781264859724204,
"grad_norm": 0.44690102338790894,
"learning_rate": 9.93712289615751e-05,
"loss": 0.5799,
"step": 104
},
{
"epoch": 0.19971469329529243,
"grad_norm": 0.4881773591041565,
"learning_rate": 9.936487773896476e-05,
"loss": 0.4094,
"step": 105
},
{
"epoch": 0.20161673799334284,
"grad_norm": 0.4745669960975647,
"learning_rate": 9.93585265163544e-05,
"loss": 0.6068,
"step": 106
},
{
"epoch": 0.20351878269139326,
"grad_norm": 0.5497081279754639,
"learning_rate": 9.935217529374406e-05,
"loss": 0.4654,
"step": 107
},
{
"epoch": 0.20542082738944364,
"grad_norm": 0.3564707636833191,
"learning_rate": 9.93458240711337e-05,
"loss": 0.5678,
"step": 108
},
{
"epoch": 0.20732287208749406,
"grad_norm": 0.446321964263916,
"learning_rate": 9.933947284852334e-05,
"loss": 0.4503,
"step": 109
},
{
"epoch": 0.20922491678554447,
"grad_norm": 0.4253140389919281,
"learning_rate": 9.9333121625913e-05,
"loss": 0.538,
"step": 110
},
{
"epoch": 0.21112696148359486,
"grad_norm": 0.4123047888278961,
"learning_rate": 9.932677040330264e-05,
"loss": 0.4359,
"step": 111
},
{
"epoch": 0.21302900618164528,
"grad_norm": 0.3887772262096405,
"learning_rate": 9.932041918069229e-05,
"loss": 0.5534,
"step": 112
},
{
"epoch": 0.21493105087969566,
"grad_norm": 0.38153669238090515,
"learning_rate": 9.931406795808193e-05,
"loss": 0.4296,
"step": 113
},
{
"epoch": 0.21683309557774608,
"grad_norm": 0.43017521500587463,
"learning_rate": 9.930771673547158e-05,
"loss": 0.5899,
"step": 114
},
{
"epoch": 0.2187351402757965,
"grad_norm": 0.40156394243240356,
"learning_rate": 9.930136551286123e-05,
"loss": 0.3917,
"step": 115
},
{
"epoch": 0.22063718497384688,
"grad_norm": 0.3576590120792389,
"learning_rate": 9.929501429025087e-05,
"loss": 0.3908,
"step": 116
},
{
"epoch": 0.2225392296718973,
"grad_norm": 0.33245769143104553,
"learning_rate": 9.928866306764054e-05,
"loss": 0.4043,
"step": 117
},
{
"epoch": 0.2244412743699477,
"grad_norm": 0.43169739842414856,
"learning_rate": 9.928231184503018e-05,
"loss": 0.5569,
"step": 118
},
{
"epoch": 0.2263433190679981,
"grad_norm": 0.4004412293434143,
"learning_rate": 9.927596062241981e-05,
"loss": 0.4931,
"step": 119
},
{
"epoch": 0.2282453637660485,
"grad_norm": 0.3550797998905182,
"learning_rate": 9.926960939980947e-05,
"loss": 0.4505,
"step": 120
},
{
"epoch": 0.2301474084640989,
"grad_norm": 0.3701287508010864,
"learning_rate": 9.926325817719912e-05,
"loss": 0.4967,
"step": 121
},
{
"epoch": 0.2320494531621493,
"grad_norm": 0.4120308756828308,
"learning_rate": 9.925690695458876e-05,
"loss": 0.4408,
"step": 122
},
{
"epoch": 0.23395149786019973,
"grad_norm": 0.4737403392791748,
"learning_rate": 9.925055573197841e-05,
"loss": 0.7221,
"step": 123
},
{
"epoch": 0.2358535425582501,
"grad_norm": 0.37103158235549927,
"learning_rate": 9.924420450936806e-05,
"loss": 0.4419,
"step": 124
},
{
"epoch": 0.23775558725630053,
"grad_norm": 0.48644623160362244,
"learning_rate": 9.923785328675771e-05,
"loss": 0.5006,
"step": 125
},
{
"epoch": 0.2396576319543509,
"grad_norm": 0.3381918966770172,
"learning_rate": 9.923150206414735e-05,
"loss": 0.4786,
"step": 126
},
{
"epoch": 0.24155967665240133,
"grad_norm": 0.4500490128993988,
"learning_rate": 9.9225150841537e-05,
"loss": 0.4984,
"step": 127
},
{
"epoch": 0.24346172135045174,
"grad_norm": 0.5506143569946289,
"learning_rate": 9.921879961892665e-05,
"loss": 0.4857,
"step": 128
},
{
"epoch": 0.24536376604850213,
"grad_norm": 0.4111080467700958,
"learning_rate": 9.921244839631629e-05,
"loss": 0.4464,
"step": 129
},
{
"epoch": 0.24726581074655254,
"grad_norm": 0.52936851978302,
"learning_rate": 9.920609717370594e-05,
"loss": 0.5664,
"step": 130
},
{
"epoch": 0.24916785544460296,
"grad_norm": 0.465009480714798,
"learning_rate": 9.91997459510956e-05,
"loss": 0.4318,
"step": 131
},
{
"epoch": 0.25106990014265335,
"grad_norm": 0.3044665455818176,
"learning_rate": 9.919339472848523e-05,
"loss": 0.4284,
"step": 132
},
{
"epoch": 0.25297194484070373,
"grad_norm": 0.4849638342857361,
"learning_rate": 9.918704350587488e-05,
"loss": 0.5956,
"step": 133
},
{
"epoch": 0.2548739895387542,
"grad_norm": 0.4701893925666809,
"learning_rate": 9.918069228326454e-05,
"loss": 0.4541,
"step": 134
},
{
"epoch": 0.25677603423680456,
"grad_norm": 0.42524924874305725,
"learning_rate": 9.917434106065419e-05,
"loss": 0.4991,
"step": 135
},
{
"epoch": 0.25867807893485495,
"grad_norm": 0.46284592151641846,
"learning_rate": 9.916798983804383e-05,
"loss": 0.453,
"step": 136
},
{
"epoch": 0.2605801236329054,
"grad_norm": 0.40281572937965393,
"learning_rate": 9.916163861543348e-05,
"loss": 0.4771,
"step": 137
},
{
"epoch": 0.2624821683309558,
"grad_norm": 0.425214558839798,
"learning_rate": 9.915528739282313e-05,
"loss": 0.4665,
"step": 138
},
{
"epoch": 0.26438421302900617,
"grad_norm": 0.4181045889854431,
"learning_rate": 9.914893617021277e-05,
"loss": 0.5014,
"step": 139
},
{
"epoch": 0.2662862577270566,
"grad_norm": 0.4024779498577118,
"learning_rate": 9.914258494760241e-05,
"loss": 0.5905,
"step": 140
},
{
"epoch": 0.268188302425107,
"grad_norm": 0.3768770694732666,
"learning_rate": 9.913623372499207e-05,
"loss": 0.408,
"step": 141
},
{
"epoch": 0.2700903471231574,
"grad_norm": 0.4033905267715454,
"learning_rate": 9.912988250238171e-05,
"loss": 0.4511,
"step": 142
},
{
"epoch": 0.2719923918212078,
"grad_norm": 0.32505708932876587,
"learning_rate": 9.912353127977136e-05,
"loss": 0.4395,
"step": 143
},
{
"epoch": 0.2738944365192582,
"grad_norm": 0.3487790822982788,
"learning_rate": 9.9117180057161e-05,
"loss": 0.3601,
"step": 144
},
{
"epoch": 0.2757964812173086,
"grad_norm": 0.30558326840400696,
"learning_rate": 9.911082883455065e-05,
"loss": 0.4607,
"step": 145
},
{
"epoch": 0.277698525915359,
"grad_norm": 0.3752080500125885,
"learning_rate": 9.91044776119403e-05,
"loss": 0.3957,
"step": 146
},
{
"epoch": 0.2796005706134094,
"grad_norm": 0.3506644368171692,
"learning_rate": 9.909812638932994e-05,
"loss": 0.366,
"step": 147
},
{
"epoch": 0.2815026153114598,
"grad_norm": 0.43430307507514954,
"learning_rate": 9.909177516671961e-05,
"loss": 0.4542,
"step": 148
},
{
"epoch": 0.2834046600095102,
"grad_norm": 0.41930171847343445,
"learning_rate": 9.908542394410925e-05,
"loss": 0.709,
"step": 149
},
{
"epoch": 0.28530670470756064,
"grad_norm": 0.3717108964920044,
"learning_rate": 9.907907272149888e-05,
"loss": 0.4701,
"step": 150
},
{
"epoch": 0.28720874940561103,
"grad_norm": 0.4177984595298767,
"learning_rate": 9.907272149888854e-05,
"loss": 0.6189,
"step": 151
},
{
"epoch": 0.2891107941036614,
"grad_norm": 0.37706881761550903,
"learning_rate": 9.906637027627819e-05,
"loss": 0.4546,
"step": 152
},
{
"epoch": 0.29101283880171186,
"grad_norm": 0.4210599660873413,
"learning_rate": 9.906001905366784e-05,
"loss": 0.4716,
"step": 153
},
{
"epoch": 0.29291488349976225,
"grad_norm": 0.3707990050315857,
"learning_rate": 9.905366783105748e-05,
"loss": 0.4644,
"step": 154
},
{
"epoch": 0.29481692819781263,
"grad_norm": 0.36913537979125977,
"learning_rate": 9.904731660844713e-05,
"loss": 0.4605,
"step": 155
},
{
"epoch": 0.2967189728958631,
"grad_norm": 0.41291072964668274,
"learning_rate": 9.904096538583678e-05,
"loss": 0.4294,
"step": 156
},
{
"epoch": 0.29862101759391346,
"grad_norm": 0.30809640884399414,
"learning_rate": 9.903461416322642e-05,
"loss": 0.4369,
"step": 157
},
{
"epoch": 0.30052306229196385,
"grad_norm": 0.4266267716884613,
"learning_rate": 9.902826294061607e-05,
"loss": 0.456,
"step": 158
},
{
"epoch": 0.3024251069900143,
"grad_norm": 0.37408629059791565,
"learning_rate": 9.902191171800572e-05,
"loss": 0.4359,
"step": 159
},
{
"epoch": 0.3043271516880647,
"grad_norm": 0.40199100971221924,
"learning_rate": 9.901556049539536e-05,
"loss": 0.4433,
"step": 160
},
{
"epoch": 0.30622919638611507,
"grad_norm": 0.3430602252483368,
"learning_rate": 9.900920927278501e-05,
"loss": 0.4317,
"step": 161
},
{
"epoch": 0.30813124108416545,
"grad_norm": 0.5091786980628967,
"learning_rate": 9.900285805017467e-05,
"loss": 0.5824,
"step": 162
},
{
"epoch": 0.3100332857822159,
"grad_norm": 0.34287527203559875,
"learning_rate": 9.89965068275643e-05,
"loss": 0.4025,
"step": 163
},
{
"epoch": 0.3119353304802663,
"grad_norm": 0.4919246733188629,
"learning_rate": 9.899015560495396e-05,
"loss": 0.5612,
"step": 164
},
{
"epoch": 0.31383737517831667,
"grad_norm": 0.35404297709465027,
"learning_rate": 9.898380438234361e-05,
"loss": 0.4731,
"step": 165
},
{
"epoch": 0.3157394198763671,
"grad_norm": 0.3590085506439209,
"learning_rate": 9.897745315973326e-05,
"loss": 0.4365,
"step": 166
},
{
"epoch": 0.3176414645744175,
"grad_norm": 0.4132196605205536,
"learning_rate": 9.89711019371229e-05,
"loss": 0.3485,
"step": 167
},
{
"epoch": 0.3195435092724679,
"grad_norm": 0.46459728479385376,
"learning_rate": 9.896475071451255e-05,
"loss": 0.4327,
"step": 168
},
{
"epoch": 0.3214455539705183,
"grad_norm": 0.435651957988739,
"learning_rate": 9.89583994919022e-05,
"loss": 0.4684,
"step": 169
},
{
"epoch": 0.3233475986685687,
"grad_norm": 0.38278958201408386,
"learning_rate": 9.895204826929184e-05,
"loss": 0.4265,
"step": 170
},
{
"epoch": 0.3252496433666191,
"grad_norm": 0.31499558687210083,
"learning_rate": 9.894569704668149e-05,
"loss": 0.4099,
"step": 171
},
{
"epoch": 0.32715168806466954,
"grad_norm": 0.40141284465789795,
"learning_rate": 9.893934582407114e-05,
"loss": 0.4461,
"step": 172
},
{
"epoch": 0.32905373276271993,
"grad_norm": 0.42945384979248047,
"learning_rate": 9.893299460146078e-05,
"loss": 0.4379,
"step": 173
},
{
"epoch": 0.3309557774607703,
"grad_norm": 0.5186269283294678,
"learning_rate": 9.892664337885043e-05,
"loss": 0.5134,
"step": 174
},
{
"epoch": 0.33285782215882076,
"grad_norm": 0.3771612048149109,
"learning_rate": 9.892029215624009e-05,
"loss": 0.4617,
"step": 175
},
{
"epoch": 0.33475986685687115,
"grad_norm": 0.48396849632263184,
"learning_rate": 9.891394093362972e-05,
"loss": 0.4944,
"step": 176
},
{
"epoch": 0.33666191155492153,
"grad_norm": 0.5303121209144592,
"learning_rate": 9.890758971101938e-05,
"loss": 0.4049,
"step": 177
},
{
"epoch": 0.3385639562529719,
"grad_norm": 0.33063024282455444,
"learning_rate": 9.890123848840901e-05,
"loss": 0.401,
"step": 178
},
{
"epoch": 0.34046600095102236,
"grad_norm": 0.3764759302139282,
"learning_rate": 9.889488726579868e-05,
"loss": 0.4222,
"step": 179
},
{
"epoch": 0.34236804564907275,
"grad_norm": 0.27206951379776,
"learning_rate": 9.888853604318832e-05,
"loss": 0.3206,
"step": 180
},
{
"epoch": 0.34427009034712314,
"grad_norm": 0.3893122971057892,
"learning_rate": 9.888218482057796e-05,
"loss": 0.3558,
"step": 181
},
{
"epoch": 0.3461721350451736,
"grad_norm": 0.42340540885925293,
"learning_rate": 9.887583359796762e-05,
"loss": 0.3948,
"step": 182
},
{
"epoch": 0.34807417974322397,
"grad_norm": 0.4103796184062958,
"learning_rate": 9.886948237535726e-05,
"loss": 0.4769,
"step": 183
},
{
"epoch": 0.34997622444127435,
"grad_norm": 0.39225244522094727,
"learning_rate": 9.886313115274691e-05,
"loss": 0.441,
"step": 184
},
{
"epoch": 0.3518782691393248,
"grad_norm": 0.3774043023586273,
"learning_rate": 9.885677993013655e-05,
"loss": 0.3018,
"step": 185
},
{
"epoch": 0.3537803138373752,
"grad_norm": 0.4012366235256195,
"learning_rate": 9.88504287075262e-05,
"loss": 0.4217,
"step": 186
},
{
"epoch": 0.35568235853542557,
"grad_norm": 0.37299972772598267,
"learning_rate": 9.884407748491585e-05,
"loss": 0.4518,
"step": 187
},
{
"epoch": 0.357584403233476,
"grad_norm": 0.34713125228881836,
"learning_rate": 9.883772626230549e-05,
"loss": 0.3882,
"step": 188
},
{
"epoch": 0.3594864479315264,
"grad_norm": 0.4148958623409271,
"learning_rate": 9.883137503969516e-05,
"loss": 0.4979,
"step": 189
},
{
"epoch": 0.3613884926295768,
"grad_norm": 0.3979155421257019,
"learning_rate": 9.88250238170848e-05,
"loss": 0.3854,
"step": 190
},
{
"epoch": 0.36329053732762717,
"grad_norm": 0.42723751068115234,
"learning_rate": 9.881867259447443e-05,
"loss": 0.4325,
"step": 191
},
{
"epoch": 0.3651925820256776,
"grad_norm": 0.4195951521396637,
"learning_rate": 9.881232137186409e-05,
"loss": 0.3917,
"step": 192
},
{
"epoch": 0.367094626723728,
"grad_norm": 0.43937554955482483,
"learning_rate": 9.880597014925374e-05,
"loss": 0.3907,
"step": 193
},
{
"epoch": 0.3689966714217784,
"grad_norm": 0.3176072835922241,
"learning_rate": 9.879961892664338e-05,
"loss": 0.3581,
"step": 194
},
{
"epoch": 0.37089871611982883,
"grad_norm": 0.39909854531288147,
"learning_rate": 9.879326770403303e-05,
"loss": 0.5881,
"step": 195
},
{
"epoch": 0.3728007608178792,
"grad_norm": 0.35058659315109253,
"learning_rate": 9.878691648142268e-05,
"loss": 0.4753,
"step": 196
},
{
"epoch": 0.3747028055159296,
"grad_norm": 0.3353765904903412,
"learning_rate": 9.878056525881233e-05,
"loss": 0.4014,
"step": 197
},
{
"epoch": 0.37660485021398005,
"grad_norm": 0.4102007746696472,
"learning_rate": 9.877421403620197e-05,
"loss": 0.4841,
"step": 198
},
{
"epoch": 0.37850689491203043,
"grad_norm": 0.45450812578201294,
"learning_rate": 9.876786281359162e-05,
"loss": 0.4655,
"step": 199
},
{
"epoch": 0.3804089396100808,
"grad_norm": 0.32525572180747986,
"learning_rate": 9.876151159098127e-05,
"loss": 0.3869,
"step": 200
},
{
"epoch": 0.38231098430813126,
"grad_norm": 0.4488207697868347,
"learning_rate": 9.875516036837091e-05,
"loss": 0.4743,
"step": 201
},
{
"epoch": 0.38421302900618165,
"grad_norm": 0.432962030172348,
"learning_rate": 9.874880914576056e-05,
"loss": 0.4171,
"step": 202
},
{
"epoch": 0.38611507370423204,
"grad_norm": 0.4264095723628998,
"learning_rate": 9.874245792315022e-05,
"loss": 0.4344,
"step": 203
},
{
"epoch": 0.3880171184022825,
"grad_norm": 0.43752139806747437,
"learning_rate": 9.873610670053985e-05,
"loss": 0.5248,
"step": 204
},
{
"epoch": 0.38991916310033287,
"grad_norm": 0.42547503113746643,
"learning_rate": 9.87297554779295e-05,
"loss": 0.4011,
"step": 205
},
{
"epoch": 0.39182120779838325,
"grad_norm": 0.34600159525871277,
"learning_rate": 9.872340425531916e-05,
"loss": 0.3444,
"step": 206
},
{
"epoch": 0.39372325249643364,
"grad_norm": 0.3614776134490967,
"learning_rate": 9.871705303270881e-05,
"loss": 0.4784,
"step": 207
},
{
"epoch": 0.3956252971944841,
"grad_norm": 0.47591882944107056,
"learning_rate": 9.871070181009845e-05,
"loss": 0.5159,
"step": 208
},
{
"epoch": 0.39752734189253447,
"grad_norm": 0.3321515917778015,
"learning_rate": 9.870435058748809e-05,
"loss": 0.4382,
"step": 209
},
{
"epoch": 0.39942938659058486,
"grad_norm": 0.45849499106407166,
"learning_rate": 9.869799936487775e-05,
"loss": 0.4269,
"step": 210
},
{
"epoch": 0.4013314312886353,
"grad_norm": 0.3666900098323822,
"learning_rate": 9.869164814226739e-05,
"loss": 0.4077,
"step": 211
},
{
"epoch": 0.4032334759866857,
"grad_norm": 0.3387741446495056,
"learning_rate": 9.868529691965703e-05,
"loss": 0.4485,
"step": 212
},
{
"epoch": 0.4051355206847361,
"grad_norm": 0.3360239267349243,
"learning_rate": 9.86789456970467e-05,
"loss": 0.4042,
"step": 213
},
{
"epoch": 0.4070375653827865,
"grad_norm": 0.40923500061035156,
"learning_rate": 9.867259447443633e-05,
"loss": 0.5001,
"step": 214
},
{
"epoch": 0.4089396100808369,
"grad_norm": 0.3974573314189911,
"learning_rate": 9.866624325182598e-05,
"loss": 0.4984,
"step": 215
},
{
"epoch": 0.4108416547788873,
"grad_norm": 0.4095960557460785,
"learning_rate": 9.865989202921562e-05,
"loss": 0.3837,
"step": 216
},
{
"epoch": 0.41274369947693773,
"grad_norm": 0.3334168493747711,
"learning_rate": 9.865354080660527e-05,
"loss": 0.3935,
"step": 217
},
{
"epoch": 0.4146457441749881,
"grad_norm": 0.5007266998291016,
"learning_rate": 9.864718958399493e-05,
"loss": 0.4443,
"step": 218
},
{
"epoch": 0.4165477888730385,
"grad_norm": 0.35881495475769043,
"learning_rate": 9.864083836138456e-05,
"loss": 0.3835,
"step": 219
},
{
"epoch": 0.41844983357108895,
"grad_norm": 0.3785092830657959,
"learning_rate": 9.863448713877423e-05,
"loss": 0.3884,
"step": 220
},
{
"epoch": 0.42035187826913933,
"grad_norm": 0.41435107588768005,
"learning_rate": 9.862813591616387e-05,
"loss": 0.4116,
"step": 221
},
{
"epoch": 0.4222539229671897,
"grad_norm": 0.41338756680488586,
"learning_rate": 9.86217846935535e-05,
"loss": 0.5235,
"step": 222
},
{
"epoch": 0.4241559676652401,
"grad_norm": 0.4335710406303406,
"learning_rate": 9.861543347094316e-05,
"loss": 0.516,
"step": 223
},
{
"epoch": 0.42605801236329055,
"grad_norm": 0.37374967336654663,
"learning_rate": 9.860908224833281e-05,
"loss": 0.4663,
"step": 224
},
{
"epoch": 0.42796005706134094,
"grad_norm": 0.3213825821876526,
"learning_rate": 9.860273102572246e-05,
"loss": 0.3636,
"step": 225
},
{
"epoch": 0.4298621017593913,
"grad_norm": 0.41535523533821106,
"learning_rate": 9.85963798031121e-05,
"loss": 0.3677,
"step": 226
},
{
"epoch": 0.43176414645744177,
"grad_norm": 0.3543884754180908,
"learning_rate": 9.859002858050175e-05,
"loss": 0.376,
"step": 227
},
{
"epoch": 0.43366619115549215,
"grad_norm": 0.4012312889099121,
"learning_rate": 9.85836773578914e-05,
"loss": 0.4886,
"step": 228
},
{
"epoch": 0.43556823585354254,
"grad_norm": 0.3928169310092926,
"learning_rate": 9.857732613528104e-05,
"loss": 0.3741,
"step": 229
},
{
"epoch": 0.437470280551593,
"grad_norm": 0.4982980191707611,
"learning_rate": 9.85709749126707e-05,
"loss": 0.5704,
"step": 230
},
{
"epoch": 0.43937232524964337,
"grad_norm": 0.356545090675354,
"learning_rate": 9.856462369006035e-05,
"loss": 0.3618,
"step": 231
},
{
"epoch": 0.44127436994769376,
"grad_norm": 0.5087487697601318,
"learning_rate": 9.855827246744998e-05,
"loss": 0.4733,
"step": 232
},
{
"epoch": 0.4431764146457442,
"grad_norm": 0.3566097021102905,
"learning_rate": 9.855192124483964e-05,
"loss": 0.3771,
"step": 233
},
{
"epoch": 0.4450784593437946,
"grad_norm": 0.3210541605949402,
"learning_rate": 9.854557002222929e-05,
"loss": 0.4341,
"step": 234
},
{
"epoch": 0.446980504041845,
"grad_norm": 0.25422924757003784,
"learning_rate": 9.853921879961893e-05,
"loss": 0.3987,
"step": 235
},
{
"epoch": 0.4488825487398954,
"grad_norm": 0.39164894819259644,
"learning_rate": 9.853286757700858e-05,
"loss": 0.4149,
"step": 236
},
{
"epoch": 0.4507845934379458,
"grad_norm": 0.37471455335617065,
"learning_rate": 9.852651635439823e-05,
"loss": 0.4471,
"step": 237
},
{
"epoch": 0.4526866381359962,
"grad_norm": 0.37678262591362,
"learning_rate": 9.852016513178788e-05,
"loss": 0.3943,
"step": 238
},
{
"epoch": 0.4545886828340466,
"grad_norm": 0.4653976857662201,
"learning_rate": 9.851381390917752e-05,
"loss": 0.4848,
"step": 239
},
{
"epoch": 0.456490727532097,
"grad_norm": 0.46764564514160156,
"learning_rate": 9.850746268656717e-05,
"loss": 0.4624,
"step": 240
},
{
"epoch": 0.4583927722301474,
"grad_norm": 0.3803463876247406,
"learning_rate": 9.850111146395682e-05,
"loss": 0.442,
"step": 241
},
{
"epoch": 0.4602948169281978,
"grad_norm": 0.33662229776382446,
"learning_rate": 9.849476024134646e-05,
"loss": 0.4564,
"step": 242
},
{
"epoch": 0.46219686162624823,
"grad_norm": 0.42181041836738586,
"learning_rate": 9.848840901873611e-05,
"loss": 0.4702,
"step": 243
},
{
"epoch": 0.4640989063242986,
"grad_norm": 0.40373390913009644,
"learning_rate": 9.848205779612576e-05,
"loss": 0.3745,
"step": 244
},
{
"epoch": 0.466000951022349,
"grad_norm": 0.36634379625320435,
"learning_rate": 9.84757065735154e-05,
"loss": 0.428,
"step": 245
},
{
"epoch": 0.46790299572039945,
"grad_norm": 0.35369235277175903,
"learning_rate": 9.846935535090506e-05,
"loss": 0.3986,
"step": 246
},
{
"epoch": 0.46980504041844984,
"grad_norm": 0.4154004454612732,
"learning_rate": 9.846300412829471e-05,
"loss": 0.3512,
"step": 247
},
{
"epoch": 0.4717070851165002,
"grad_norm": 0.3689868450164795,
"learning_rate": 9.845665290568435e-05,
"loss": 0.3708,
"step": 248
},
{
"epoch": 0.47360912981455067,
"grad_norm": 0.38414841890335083,
"learning_rate": 9.8450301683074e-05,
"loss": 0.3401,
"step": 249
},
{
"epoch": 0.47551117451260105,
"grad_norm": 0.39936143159866333,
"learning_rate": 9.844395046046364e-05,
"loss": 0.4328,
"step": 250
},
{
"epoch": 0.47741321921065144,
"grad_norm": 0.30578187108039856,
"learning_rate": 9.84375992378533e-05,
"loss": 0.3694,
"step": 251
},
{
"epoch": 0.4793152639087018,
"grad_norm": 0.39497658610343933,
"learning_rate": 9.843124801524294e-05,
"loss": 0.3945,
"step": 252
},
{
"epoch": 0.48121730860675227,
"grad_norm": 0.44466689229011536,
"learning_rate": 9.842489679263258e-05,
"loss": 0.4485,
"step": 253
},
{
"epoch": 0.48311935330480266,
"grad_norm": 0.3614617586135864,
"learning_rate": 9.841854557002223e-05,
"loss": 0.3701,
"step": 254
},
{
"epoch": 0.48502139800285304,
"grad_norm": 0.3102608621120453,
"learning_rate": 9.841219434741188e-05,
"loss": 0.3677,
"step": 255
},
{
"epoch": 0.4869234427009035,
"grad_norm": 0.36049678921699524,
"learning_rate": 9.840584312480153e-05,
"loss": 0.411,
"step": 256
},
{
"epoch": 0.4888254873989539,
"grad_norm": 0.4025668501853943,
"learning_rate": 9.839949190219117e-05,
"loss": 0.433,
"step": 257
},
{
"epoch": 0.49072753209700426,
"grad_norm": 0.4131562113761902,
"learning_rate": 9.839314067958082e-05,
"loss": 0.4818,
"step": 258
},
{
"epoch": 0.4926295767950547,
"grad_norm": 0.481468141078949,
"learning_rate": 9.838678945697047e-05,
"loss": 0.5226,
"step": 259
},
{
"epoch": 0.4945316214931051,
"grad_norm": 0.2845190167427063,
"learning_rate": 9.838043823436011e-05,
"loss": 0.3323,
"step": 260
},
{
"epoch": 0.4964336661911555,
"grad_norm": 0.40381497144699097,
"learning_rate": 9.837408701174976e-05,
"loss": 0.4025,
"step": 261
},
{
"epoch": 0.4983357108892059,
"grad_norm": 0.4109043478965759,
"learning_rate": 9.836773578913942e-05,
"loss": 0.4429,
"step": 262
},
{
"epoch": 0.5002377555872562,
"grad_norm": 0.4256783425807953,
"learning_rate": 9.836138456652906e-05,
"loss": 0.3994,
"step": 263
},
{
"epoch": 0.5021398002853067,
"grad_norm": 0.35044407844543457,
"learning_rate": 9.835503334391871e-05,
"loss": 0.4431,
"step": 264
},
{
"epoch": 0.5040418449833571,
"grad_norm": 0.4456939697265625,
"learning_rate": 9.834868212130836e-05,
"loss": 0.5424,
"step": 265
},
{
"epoch": 0.5059438896814075,
"grad_norm": 0.36340197920799255,
"learning_rate": 9.8342330898698e-05,
"loss": 0.4199,
"step": 266
},
{
"epoch": 0.5078459343794579,
"grad_norm": 0.4018803536891937,
"learning_rate": 9.833597967608765e-05,
"loss": 0.4132,
"step": 267
},
{
"epoch": 0.5097479790775084,
"grad_norm": 0.3372616469860077,
"learning_rate": 9.83296284534773e-05,
"loss": 0.3239,
"step": 268
},
{
"epoch": 0.5116500237755587,
"grad_norm": 0.4497722387313843,
"learning_rate": 9.832327723086695e-05,
"loss": 0.4019,
"step": 269
},
{
"epoch": 0.5135520684736091,
"grad_norm": 0.422269344329834,
"learning_rate": 9.831692600825659e-05,
"loss": 0.45,
"step": 270
},
{
"epoch": 0.5154541131716596,
"grad_norm": 0.4167305529117584,
"learning_rate": 9.831057478564624e-05,
"loss": 0.4172,
"step": 271
},
{
"epoch": 0.5173561578697099,
"grad_norm": 0.4340919554233551,
"learning_rate": 9.83042235630359e-05,
"loss": 0.5042,
"step": 272
},
{
"epoch": 0.5192582025677603,
"grad_norm": 0.4179072380065918,
"learning_rate": 9.829787234042553e-05,
"loss": 0.3499,
"step": 273
},
{
"epoch": 0.5211602472658108,
"grad_norm": 0.39216554164886475,
"learning_rate": 9.829152111781518e-05,
"loss": 0.4729,
"step": 274
},
{
"epoch": 0.5230622919638611,
"grad_norm": 0.4485825002193451,
"learning_rate": 9.828516989520484e-05,
"loss": 0.4449,
"step": 275
},
{
"epoch": 0.5249643366619116,
"grad_norm": 0.3843270242214203,
"learning_rate": 9.827881867259447e-05,
"loss": 0.5416,
"step": 276
},
{
"epoch": 0.526866381359962,
"grad_norm": 0.30829140543937683,
"learning_rate": 9.827246744998413e-05,
"loss": 0.4004,
"step": 277
},
{
"epoch": 0.5287684260580123,
"grad_norm": 0.2905525863170624,
"learning_rate": 9.826611622737378e-05,
"loss": 0.3574,
"step": 278
},
{
"epoch": 0.5306704707560628,
"grad_norm": 0.3848637342453003,
"learning_rate": 9.825976500476343e-05,
"loss": 0.4021,
"step": 279
},
{
"epoch": 0.5325725154541132,
"grad_norm": 0.32691988348960876,
"learning_rate": 9.825341378215307e-05,
"loss": 0.4317,
"step": 280
},
{
"epoch": 0.5344745601521635,
"grad_norm": 0.3506065011024475,
"learning_rate": 9.824706255954271e-05,
"loss": 0.329,
"step": 281
},
{
"epoch": 0.536376604850214,
"grad_norm": 0.3102387487888336,
"learning_rate": 9.824071133693237e-05,
"loss": 0.3695,
"step": 282
},
{
"epoch": 0.5382786495482644,
"grad_norm": 0.45750680565834045,
"learning_rate": 9.823436011432201e-05,
"loss": 0.4232,
"step": 283
},
{
"epoch": 0.5401806942463148,
"grad_norm": 0.297134131193161,
"learning_rate": 9.822800889171165e-05,
"loss": 0.4137,
"step": 284
},
{
"epoch": 0.5420827389443652,
"grad_norm": 0.3696708679199219,
"learning_rate": 9.822165766910131e-05,
"loss": 0.4598,
"step": 285
},
{
"epoch": 0.5439847836424156,
"grad_norm": 0.31236112117767334,
"learning_rate": 9.821530644649095e-05,
"loss": 0.314,
"step": 286
},
{
"epoch": 0.545886828340466,
"grad_norm": 0.3596087694168091,
"learning_rate": 9.82089552238806e-05,
"loss": 0.4164,
"step": 287
},
{
"epoch": 0.5477888730385164,
"grad_norm": 0.33347079157829285,
"learning_rate": 9.820260400127024e-05,
"loss": 0.3915,
"step": 288
},
{
"epoch": 0.5496909177365669,
"grad_norm": 0.37818920612335205,
"learning_rate": 9.81962527786599e-05,
"loss": 0.3994,
"step": 289
},
{
"epoch": 0.5515929624346172,
"grad_norm": 0.3968106806278229,
"learning_rate": 9.818990155604955e-05,
"loss": 0.3611,
"step": 290
},
{
"epoch": 0.5534950071326676,
"grad_norm": 0.34991270303726196,
"learning_rate": 9.818355033343918e-05,
"loss": 0.3703,
"step": 291
},
{
"epoch": 0.555397051830718,
"grad_norm": 0.4046263098716736,
"learning_rate": 9.817719911082885e-05,
"loss": 0.3302,
"step": 292
},
{
"epoch": 0.5572990965287684,
"grad_norm": 0.35804587602615356,
"learning_rate": 9.817084788821849e-05,
"loss": 0.373,
"step": 293
},
{
"epoch": 0.5592011412268189,
"grad_norm": 0.3538301885128021,
"learning_rate": 9.816449666560813e-05,
"loss": 0.3482,
"step": 294
},
{
"epoch": 0.5611031859248692,
"grad_norm": 0.36835455894470215,
"learning_rate": 9.815814544299778e-05,
"loss": 0.3393,
"step": 295
},
{
"epoch": 0.5630052306229196,
"grad_norm": 0.48919835686683655,
"learning_rate": 9.815179422038743e-05,
"loss": 0.4213,
"step": 296
},
{
"epoch": 0.5649072753209701,
"grad_norm": 0.3472330570220947,
"learning_rate": 9.814544299777708e-05,
"loss": 0.3996,
"step": 297
},
{
"epoch": 0.5668093200190204,
"grad_norm": 0.428611159324646,
"learning_rate": 9.813909177516672e-05,
"loss": 0.4524,
"step": 298
},
{
"epoch": 0.5687113647170708,
"grad_norm": 0.4176979959011078,
"learning_rate": 9.813274055255637e-05,
"loss": 0.3787,
"step": 299
},
{
"epoch": 0.5706134094151213,
"grad_norm": 0.41548797488212585,
"learning_rate": 9.812638932994602e-05,
"loss": 0.4758,
"step": 300
},
{
"epoch": 0.5725154541131716,
"grad_norm": 0.3926902413368225,
"learning_rate": 9.812003810733566e-05,
"loss": 0.434,
"step": 301
},
{
"epoch": 0.5744174988112221,
"grad_norm": 0.392846018075943,
"learning_rate": 9.811368688472531e-05,
"loss": 0.3928,
"step": 302
},
{
"epoch": 0.5763195435092725,
"grad_norm": 0.36347585916519165,
"learning_rate": 9.810733566211497e-05,
"loss": 0.4264,
"step": 303
},
{
"epoch": 0.5782215882073228,
"grad_norm": 0.4314410090446472,
"learning_rate": 9.81009844395046e-05,
"loss": 0.4199,
"step": 304
},
{
"epoch": 0.5801236329053733,
"grad_norm": 0.337494820356369,
"learning_rate": 9.809463321689426e-05,
"loss": 0.4181,
"step": 305
},
{
"epoch": 0.5820256776034237,
"grad_norm": 0.27786335349082947,
"learning_rate": 9.808828199428391e-05,
"loss": 0.3,
"step": 306
},
{
"epoch": 0.583927722301474,
"grad_norm": 0.37235599756240845,
"learning_rate": 9.808193077167355e-05,
"loss": 0.3927,
"step": 307
},
{
"epoch": 0.5858297669995245,
"grad_norm": 0.37353670597076416,
"learning_rate": 9.80755795490632e-05,
"loss": 0.4146,
"step": 308
},
{
"epoch": 0.5877318116975749,
"grad_norm": 0.3919946551322937,
"learning_rate": 9.806922832645285e-05,
"loss": 0.5055,
"step": 309
},
{
"epoch": 0.5896338563956253,
"grad_norm": 0.45411062240600586,
"learning_rate": 9.80628771038425e-05,
"loss": 0.5347,
"step": 310
},
{
"epoch": 0.5915359010936757,
"grad_norm": 0.4087005853652954,
"learning_rate": 9.805652588123214e-05,
"loss": 0.3732,
"step": 311
},
{
"epoch": 0.5934379457917262,
"grad_norm": 0.313297837972641,
"learning_rate": 9.805017465862178e-05,
"loss": 0.3093,
"step": 312
},
{
"epoch": 0.5953399904897765,
"grad_norm": 0.40149226784706116,
"learning_rate": 9.804382343601144e-05,
"loss": 0.4404,
"step": 313
},
{
"epoch": 0.5972420351878269,
"grad_norm": 0.34245574474334717,
"learning_rate": 9.803747221340108e-05,
"loss": 0.4036,
"step": 314
},
{
"epoch": 0.5991440798858774,
"grad_norm": 0.38059449195861816,
"learning_rate": 9.803112099079073e-05,
"loss": 0.3763,
"step": 315
},
{
"epoch": 0.6010461245839277,
"grad_norm": 0.4539381265640259,
"learning_rate": 9.802476976818039e-05,
"loss": 0.4551,
"step": 316
},
{
"epoch": 0.6029481692819781,
"grad_norm": 0.4077235460281372,
"learning_rate": 9.801841854557002e-05,
"loss": 0.4641,
"step": 317
},
{
"epoch": 0.6048502139800286,
"grad_norm": 0.3426643908023834,
"learning_rate": 9.801206732295968e-05,
"loss": 0.3684,
"step": 318
},
{
"epoch": 0.6067522586780789,
"grad_norm": 0.3042270839214325,
"learning_rate": 9.800571610034931e-05,
"loss": 0.373,
"step": 319
},
{
"epoch": 0.6086543033761294,
"grad_norm": 0.4373973309993744,
"learning_rate": 9.799936487773897e-05,
"loss": 0.5442,
"step": 320
},
{
"epoch": 0.6105563480741797,
"grad_norm": 0.385797917842865,
"learning_rate": 9.799301365512862e-05,
"loss": 0.4218,
"step": 321
},
{
"epoch": 0.6124583927722301,
"grad_norm": 0.33210891485214233,
"learning_rate": 9.798666243251826e-05,
"loss": 0.3062,
"step": 322
},
{
"epoch": 0.6143604374702806,
"grad_norm": 0.3997063636779785,
"learning_rate": 9.798031120990792e-05,
"loss": 0.4104,
"step": 323
},
{
"epoch": 0.6162624821683309,
"grad_norm": 0.4837460219860077,
"learning_rate": 9.797395998729756e-05,
"loss": 0.5271,
"step": 324
},
{
"epoch": 0.6181645268663813,
"grad_norm": 0.36420971155166626,
"learning_rate": 9.79676087646872e-05,
"loss": 0.4033,
"step": 325
},
{
"epoch": 0.6200665715644318,
"grad_norm": 0.33610865473747253,
"learning_rate": 9.796125754207685e-05,
"loss": 0.3992,
"step": 326
},
{
"epoch": 0.6219686162624821,
"grad_norm": 0.28999099135398865,
"learning_rate": 9.79549063194665e-05,
"loss": 0.3675,
"step": 327
},
{
"epoch": 0.6238706609605326,
"grad_norm": 0.359401673078537,
"learning_rate": 9.794855509685615e-05,
"loss": 0.4363,
"step": 328
},
{
"epoch": 0.625772705658583,
"grad_norm": 0.3948569595813751,
"learning_rate": 9.794220387424579e-05,
"loss": 0.3698,
"step": 329
},
{
"epoch": 0.6276747503566333,
"grad_norm": 0.3753513991832733,
"learning_rate": 9.793585265163544e-05,
"loss": 0.4397,
"step": 330
},
{
"epoch": 0.6295767950546838,
"grad_norm": 0.32612451910972595,
"learning_rate": 9.79295014290251e-05,
"loss": 0.3846,
"step": 331
},
{
"epoch": 0.6314788397527342,
"grad_norm": 0.40796539187431335,
"learning_rate": 9.792315020641473e-05,
"loss": 0.371,
"step": 332
},
{
"epoch": 0.6333808844507846,
"grad_norm": 0.4358294904232025,
"learning_rate": 9.791679898380439e-05,
"loss": 0.4052,
"step": 333
},
{
"epoch": 0.635282929148835,
"grad_norm": 0.39615437388420105,
"learning_rate": 9.791044776119404e-05,
"loss": 0.3686,
"step": 334
},
{
"epoch": 0.6371849738468854,
"grad_norm": 0.32977715134620667,
"learning_rate": 9.790409653858368e-05,
"loss": 0.4404,
"step": 335
},
{
"epoch": 0.6390870185449358,
"grad_norm": 0.38361093401908875,
"learning_rate": 9.789774531597333e-05,
"loss": 0.3709,
"step": 336
},
{
"epoch": 0.6409890632429862,
"grad_norm": 0.40280988812446594,
"learning_rate": 9.789139409336298e-05,
"loss": 0.3322,
"step": 337
},
{
"epoch": 0.6428911079410367,
"grad_norm": 0.3682766854763031,
"learning_rate": 9.788504287075262e-05,
"loss": 0.4144,
"step": 338
},
{
"epoch": 0.644793152639087,
"grad_norm": 0.39864271879196167,
"learning_rate": 9.787869164814227e-05,
"loss": 0.4404,
"step": 339
},
{
"epoch": 0.6466951973371374,
"grad_norm": 0.3244321048259735,
"learning_rate": 9.787234042553192e-05,
"loss": 0.3541,
"step": 340
},
{
"epoch": 0.6485972420351879,
"grad_norm": 0.323403924703598,
"learning_rate": 9.786598920292157e-05,
"loss": 0.3374,
"step": 341
},
{
"epoch": 0.6504992867332382,
"grad_norm": 0.3881044387817383,
"learning_rate": 9.785963798031121e-05,
"loss": 0.4415,
"step": 342
},
{
"epoch": 0.6524013314312886,
"grad_norm": 0.35189467668533325,
"learning_rate": 9.785328675770086e-05,
"loss": 0.401,
"step": 343
},
{
"epoch": 0.6543033761293391,
"grad_norm": 0.3553767800331116,
"learning_rate": 9.784693553509052e-05,
"loss": 0.456,
"step": 344
},
{
"epoch": 0.6562054208273894,
"grad_norm": 0.3302605152130127,
"learning_rate": 9.784058431248015e-05,
"loss": 0.472,
"step": 345
},
{
"epoch": 0.6581074655254399,
"grad_norm": 0.4526873826980591,
"learning_rate": 9.78342330898698e-05,
"loss": 0.3908,
"step": 346
},
{
"epoch": 0.6600095102234903,
"grad_norm": 0.3232348561286926,
"learning_rate": 9.782788186725946e-05,
"loss": 0.3421,
"step": 347
},
{
"epoch": 0.6619115549215406,
"grad_norm": 0.38508203625679016,
"learning_rate": 9.78215306446491e-05,
"loss": 0.4093,
"step": 348
},
{
"epoch": 0.6638135996195911,
"grad_norm": 0.3187748193740845,
"learning_rate": 9.781517942203875e-05,
"loss": 0.4319,
"step": 349
},
{
"epoch": 0.6657156443176415,
"grad_norm": 0.2614807188510895,
"learning_rate": 9.78088281994284e-05,
"loss": 0.314,
"step": 350
},
{
"epoch": 0.6676176890156919,
"grad_norm": 0.40218180418014526,
"learning_rate": 9.780247697681805e-05,
"loss": 0.4404,
"step": 351
},
{
"epoch": 0.6695197337137423,
"grad_norm": 0.4016517996788025,
"learning_rate": 9.779612575420769e-05,
"loss": 0.5063,
"step": 352
},
{
"epoch": 0.6714217784117926,
"grad_norm": 0.3333278000354767,
"learning_rate": 9.778977453159733e-05,
"loss": 0.2966,
"step": 353
},
{
"epoch": 0.6733238231098431,
"grad_norm": 0.4535547196865082,
"learning_rate": 9.778342330898699e-05,
"loss": 0.4077,
"step": 354
},
{
"epoch": 0.6752258678078935,
"grad_norm": 0.4180653393268585,
"learning_rate": 9.777707208637663e-05,
"loss": 0.4554,
"step": 355
},
{
"epoch": 0.6771279125059438,
"grad_norm": 0.43454670906066895,
"learning_rate": 9.777072086376627e-05,
"loss": 0.4403,
"step": 356
},
{
"epoch": 0.6790299572039943,
"grad_norm": 0.45290321111679077,
"learning_rate": 9.776436964115594e-05,
"loss": 0.4037,
"step": 357
},
{
"epoch": 0.6809320019020447,
"grad_norm": 0.34165212512016296,
"learning_rate": 9.775801841854557e-05,
"loss": 0.3044,
"step": 358
},
{
"epoch": 0.6828340466000951,
"grad_norm": 0.435138463973999,
"learning_rate": 9.775166719593523e-05,
"loss": 0.4293,
"step": 359
},
{
"epoch": 0.6847360912981455,
"grad_norm": 0.36061882972717285,
"learning_rate": 9.774531597332486e-05,
"loss": 0.4052,
"step": 360
},
{
"epoch": 0.6866381359961959,
"grad_norm": 0.4023354947566986,
"learning_rate": 9.773896475071452e-05,
"loss": 0.4232,
"step": 361
},
{
"epoch": 0.6885401806942463,
"grad_norm": 0.39200109243392944,
"learning_rate": 9.773261352810417e-05,
"loss": 0.3882,
"step": 362
},
{
"epoch": 0.6904422253922967,
"grad_norm": 0.34504035115242004,
"learning_rate": 9.77262623054938e-05,
"loss": 0.4063,
"step": 363
},
{
"epoch": 0.6923442700903472,
"grad_norm": 0.31081900000572205,
"learning_rate": 9.771991108288346e-05,
"loss": 0.251,
"step": 364
},
{
"epoch": 0.6942463147883975,
"grad_norm": 0.3800300061702728,
"learning_rate": 9.771355986027311e-05,
"loss": 0.3722,
"step": 365
},
{
"epoch": 0.6961483594864479,
"grad_norm": 0.3476494550704956,
"learning_rate": 9.770720863766275e-05,
"loss": 0.382,
"step": 366
},
{
"epoch": 0.6980504041844984,
"grad_norm": 0.38069918751716614,
"learning_rate": 9.77008574150524e-05,
"loss": 0.4329,
"step": 367
},
{
"epoch": 0.6999524488825487,
"grad_norm": 0.4034759998321533,
"learning_rate": 9.769450619244205e-05,
"loss": 0.4112,
"step": 368
},
{
"epoch": 0.7018544935805991,
"grad_norm": 0.4232093393802643,
"learning_rate": 9.76881549698317e-05,
"loss": 0.4524,
"step": 369
},
{
"epoch": 0.7037565382786496,
"grad_norm": 0.40627321600914,
"learning_rate": 9.768180374722134e-05,
"loss": 0.388,
"step": 370
},
{
"epoch": 0.7056585829766999,
"grad_norm": 0.41021519899368286,
"learning_rate": 9.767545252461099e-05,
"loss": 0.3741,
"step": 371
},
{
"epoch": 0.7075606276747504,
"grad_norm": 0.3615809679031372,
"learning_rate": 9.766910130200065e-05,
"loss": 0.4432,
"step": 372
},
{
"epoch": 0.7094626723728008,
"grad_norm": 0.3088645935058594,
"learning_rate": 9.766275007939028e-05,
"loss": 0.3343,
"step": 373
},
{
"epoch": 0.7113647170708511,
"grad_norm": 0.380659818649292,
"learning_rate": 9.765639885677994e-05,
"loss": 0.4092,
"step": 374
},
{
"epoch": 0.7132667617689016,
"grad_norm": 0.28462380170822144,
"learning_rate": 9.765004763416959e-05,
"loss": 0.31,
"step": 375
},
{
"epoch": 0.715168806466952,
"grad_norm": 0.3215513229370117,
"learning_rate": 9.764369641155923e-05,
"loss": 0.4115,
"step": 376
},
{
"epoch": 0.7170708511650024,
"grad_norm": 0.397651731967926,
"learning_rate": 9.763734518894888e-05,
"loss": 0.4369,
"step": 377
},
{
"epoch": 0.7189728958630528,
"grad_norm": 0.31436121463775635,
"learning_rate": 9.763099396633853e-05,
"loss": 0.4339,
"step": 378
},
{
"epoch": 0.7208749405611032,
"grad_norm": 0.4024806320667267,
"learning_rate": 9.762464274372817e-05,
"loss": 0.4252,
"step": 379
},
{
"epoch": 0.7227769852591536,
"grad_norm": 0.37994107604026794,
"learning_rate": 9.761829152111782e-05,
"loss": 0.3483,
"step": 380
},
{
"epoch": 0.724679029957204,
"grad_norm": 0.44616061449050903,
"learning_rate": 9.761194029850747e-05,
"loss": 0.3809,
"step": 381
},
{
"epoch": 0.7265810746552543,
"grad_norm": 0.3396744728088379,
"learning_rate": 9.760558907589712e-05,
"loss": 0.3382,
"step": 382
},
{
"epoch": 0.7284831193533048,
"grad_norm": 0.334839791059494,
"learning_rate": 9.759923785328676e-05,
"loss": 0.3465,
"step": 383
},
{
"epoch": 0.7303851640513552,
"grad_norm": 0.417478084564209,
"learning_rate": 9.75928866306764e-05,
"loss": 0.3191,
"step": 384
},
{
"epoch": 0.7322872087494056,
"grad_norm": 0.30790823698043823,
"learning_rate": 9.758653540806606e-05,
"loss": 0.3139,
"step": 385
},
{
"epoch": 0.734189253447456,
"grad_norm": 0.4008057415485382,
"learning_rate": 9.75801841854557e-05,
"loss": 0.419,
"step": 386
},
{
"epoch": 0.7360912981455064,
"grad_norm": 0.42966723442077637,
"learning_rate": 9.757383296284535e-05,
"loss": 0.3634,
"step": 387
},
{
"epoch": 0.7379933428435568,
"grad_norm": 0.33789002895355225,
"learning_rate": 9.7567481740235e-05,
"loss": 0.3966,
"step": 388
},
{
"epoch": 0.7398953875416072,
"grad_norm": 0.35244229435920715,
"learning_rate": 9.756113051762464e-05,
"loss": 0.3991,
"step": 389
},
{
"epoch": 0.7417974322396577,
"grad_norm": 0.3581864833831787,
"learning_rate": 9.75547792950143e-05,
"loss": 0.347,
"step": 390
},
{
"epoch": 0.743699476937708,
"grad_norm": 0.30788975954055786,
"learning_rate": 9.754842807240394e-05,
"loss": 0.3485,
"step": 391
},
{
"epoch": 0.7456015216357584,
"grad_norm": 0.5155593156814575,
"learning_rate": 9.754207684979359e-05,
"loss": 0.4793,
"step": 392
},
{
"epoch": 0.7475035663338089,
"grad_norm": 0.4183029532432556,
"learning_rate": 9.753572562718324e-05,
"loss": 0.4064,
"step": 393
},
{
"epoch": 0.7494056110318592,
"grad_norm": 0.36132046580314636,
"learning_rate": 9.752937440457288e-05,
"loss": 0.3539,
"step": 394
},
{
"epoch": 0.7513076557299097,
"grad_norm": 0.4269217252731323,
"learning_rate": 9.752302318196254e-05,
"loss": 0.4358,
"step": 395
},
{
"epoch": 0.7532097004279601,
"grad_norm": 0.38872459530830383,
"learning_rate": 9.751667195935218e-05,
"loss": 0.3238,
"step": 396
},
{
"epoch": 0.7551117451260104,
"grad_norm": 0.4668743312358856,
"learning_rate": 9.751032073674182e-05,
"loss": 0.4218,
"step": 397
},
{
"epoch": 0.7570137898240609,
"grad_norm": 0.3817143738269806,
"learning_rate": 9.750396951413147e-05,
"loss": 0.4332,
"step": 398
},
{
"epoch": 0.7589158345221113,
"grad_norm": 0.4089401960372925,
"learning_rate": 9.749761829152112e-05,
"loss": 0.319,
"step": 399
},
{
"epoch": 0.7608178792201616,
"grad_norm": 0.36516866087913513,
"learning_rate": 9.749126706891077e-05,
"loss": 0.3858,
"step": 400
},
{
"epoch": 0.7627199239182121,
"grad_norm": 0.3843027949333191,
"learning_rate": 9.748491584630041e-05,
"loss": 0.4682,
"step": 401
},
{
"epoch": 0.7646219686162625,
"grad_norm": 0.36987295746803284,
"learning_rate": 9.747856462369006e-05,
"loss": 0.3328,
"step": 402
},
{
"epoch": 0.7665240133143129,
"grad_norm": 0.4972301721572876,
"learning_rate": 9.747221340107972e-05,
"loss": 0.3939,
"step": 403
},
{
"epoch": 0.7684260580123633,
"grad_norm": 0.4319972097873688,
"learning_rate": 9.746586217846935e-05,
"loss": 0.3918,
"step": 404
},
{
"epoch": 0.7703281027104137,
"grad_norm": 0.364364892244339,
"learning_rate": 9.7459510955859e-05,
"loss": 0.3871,
"step": 405
},
{
"epoch": 0.7722301474084641,
"grad_norm": 0.43767908215522766,
"learning_rate": 9.745315973324866e-05,
"loss": 0.3973,
"step": 406
},
{
"epoch": 0.7741321921065145,
"grad_norm": 0.44734928011894226,
"learning_rate": 9.74468085106383e-05,
"loss": 0.3884,
"step": 407
},
{
"epoch": 0.776034236804565,
"grad_norm": 0.3817954957485199,
"learning_rate": 9.744045728802795e-05,
"loss": 0.3647,
"step": 408
},
{
"epoch": 0.7779362815026153,
"grad_norm": 0.3619462251663208,
"learning_rate": 9.74341060654176e-05,
"loss": 0.4994,
"step": 409
},
{
"epoch": 0.7798383262006657,
"grad_norm": 0.38225993514060974,
"learning_rate": 9.742775484280724e-05,
"loss": 0.4116,
"step": 410
},
{
"epoch": 0.7817403708987162,
"grad_norm": 0.39784252643585205,
"learning_rate": 9.742140362019689e-05,
"loss": 0.3729,
"step": 411
},
{
"epoch": 0.7836424155967665,
"grad_norm": 0.3188072443008423,
"learning_rate": 9.741505239758654e-05,
"loss": 0.3767,
"step": 412
},
{
"epoch": 0.785544460294817,
"grad_norm": 0.4509223401546478,
"learning_rate": 9.74087011749762e-05,
"loss": 0.4595,
"step": 413
},
{
"epoch": 0.7874465049928673,
"grad_norm": 0.40249937772750854,
"learning_rate": 9.740234995236583e-05,
"loss": 0.3761,
"step": 414
},
{
"epoch": 0.7893485496909177,
"grad_norm": 0.3387410044670105,
"learning_rate": 9.739599872975547e-05,
"loss": 0.401,
"step": 415
},
{
"epoch": 0.7912505943889682,
"grad_norm": 0.47670629620552063,
"learning_rate": 9.738964750714514e-05,
"loss": 0.3656,
"step": 416
},
{
"epoch": 0.7931526390870185,
"grad_norm": 0.37239211797714233,
"learning_rate": 9.738329628453477e-05,
"loss": 0.4885,
"step": 417
},
{
"epoch": 0.7950546837850689,
"grad_norm": 0.3347351849079132,
"learning_rate": 9.737694506192443e-05,
"loss": 0.291,
"step": 418
},
{
"epoch": 0.7969567284831194,
"grad_norm": 0.3727717399597168,
"learning_rate": 9.737059383931408e-05,
"loss": 0.3506,
"step": 419
},
{
"epoch": 0.7988587731811697,
"grad_norm": 0.3866841793060303,
"learning_rate": 9.736424261670372e-05,
"loss": 0.4355,
"step": 420
},
{
"epoch": 0.8007608178792202,
"grad_norm": 0.39670372009277344,
"learning_rate": 9.735789139409337e-05,
"loss": 0.4041,
"step": 421
},
{
"epoch": 0.8026628625772706,
"grad_norm": 0.35946765542030334,
"learning_rate": 9.7351540171483e-05,
"loss": 0.3378,
"step": 422
},
{
"epoch": 0.8045649072753209,
"grad_norm": 0.24180381000041962,
"learning_rate": 9.734518894887267e-05,
"loss": 0.3133,
"step": 423
},
{
"epoch": 0.8064669519733714,
"grad_norm": 0.4238085150718689,
"learning_rate": 9.733883772626231e-05,
"loss": 0.3968,
"step": 424
},
{
"epoch": 0.8083689966714218,
"grad_norm": 0.35451412200927734,
"learning_rate": 9.733248650365195e-05,
"loss": 0.3456,
"step": 425
},
{
"epoch": 0.8102710413694721,
"grad_norm": 0.49277418851852417,
"learning_rate": 9.732613528104161e-05,
"loss": 0.3916,
"step": 426
},
{
"epoch": 0.8121730860675226,
"grad_norm": 0.34536874294281006,
"learning_rate": 9.731978405843125e-05,
"loss": 0.537,
"step": 427
},
{
"epoch": 0.814075130765573,
"grad_norm": 0.3002311885356903,
"learning_rate": 9.731343283582089e-05,
"loss": 0.3842,
"step": 428
},
{
"epoch": 0.8159771754636234,
"grad_norm": 0.29766812920570374,
"learning_rate": 9.730708161321054e-05,
"loss": 0.2979,
"step": 429
},
{
"epoch": 0.8178792201616738,
"grad_norm": 0.34347230195999146,
"learning_rate": 9.73007303906002e-05,
"loss": 0.3996,
"step": 430
},
{
"epoch": 0.8197812648597242,
"grad_norm": 0.42430102825164795,
"learning_rate": 9.729437916798985e-05,
"loss": 0.4677,
"step": 431
},
{
"epoch": 0.8216833095577746,
"grad_norm": 0.3375668227672577,
"learning_rate": 9.728802794537948e-05,
"loss": 0.4257,
"step": 432
},
{
"epoch": 0.823585354255825,
"grad_norm": 0.3718586266040802,
"learning_rate": 9.728167672276914e-05,
"loss": 0.3555,
"step": 433
},
{
"epoch": 0.8254873989538755,
"grad_norm": 0.4310496151447296,
"learning_rate": 9.727532550015879e-05,
"loss": 0.4026,
"step": 434
},
{
"epoch": 0.8273894436519258,
"grad_norm": 0.43832001090049744,
"learning_rate": 9.726897427754843e-05,
"loss": 0.4421,
"step": 435
},
{
"epoch": 0.8292914883499762,
"grad_norm": 0.42209911346435547,
"learning_rate": 9.726262305493808e-05,
"loss": 0.397,
"step": 436
},
{
"epoch": 0.8311935330480267,
"grad_norm": 0.4297396242618561,
"learning_rate": 9.725627183232773e-05,
"loss": 0.4244,
"step": 437
},
{
"epoch": 0.833095577746077,
"grad_norm": 0.40587079524993896,
"learning_rate": 9.724992060971737e-05,
"loss": 0.3753,
"step": 438
},
{
"epoch": 0.8349976224441275,
"grad_norm": 0.4127040505409241,
"learning_rate": 9.724356938710702e-05,
"loss": 0.3926,
"step": 439
},
{
"epoch": 0.8368996671421779,
"grad_norm": 0.3734678030014038,
"learning_rate": 9.723721816449667e-05,
"loss": 0.3338,
"step": 440
},
{
"epoch": 0.8388017118402282,
"grad_norm": 0.38152286410331726,
"learning_rate": 9.723086694188632e-05,
"loss": 0.3893,
"step": 441
},
{
"epoch": 0.8407037565382787,
"grad_norm": 0.4234791398048401,
"learning_rate": 9.722451571927596e-05,
"loss": 0.3104,
"step": 442
},
{
"epoch": 0.842605801236329,
"grad_norm": 0.49204525351524353,
"learning_rate": 9.721816449666561e-05,
"loss": 0.3698,
"step": 443
},
{
"epoch": 0.8445078459343794,
"grad_norm": 0.40980932116508484,
"learning_rate": 9.721181327405527e-05,
"loss": 0.3901,
"step": 444
},
{
"epoch": 0.8464098906324299,
"grad_norm": 0.3330426514148712,
"learning_rate": 9.72054620514449e-05,
"loss": 0.3118,
"step": 445
},
{
"epoch": 0.8483119353304802,
"grad_norm": 0.3042624890804291,
"learning_rate": 9.719911082883456e-05,
"loss": 0.3003,
"step": 446
},
{
"epoch": 0.8502139800285307,
"grad_norm": 0.34576475620269775,
"learning_rate": 9.719275960622421e-05,
"loss": 0.3332,
"step": 447
},
{
"epoch": 0.8521160247265811,
"grad_norm": 0.2980082035064697,
"learning_rate": 9.718640838361385e-05,
"loss": 0.3285,
"step": 448
},
{
"epoch": 0.8540180694246314,
"grad_norm": 0.31439459323883057,
"learning_rate": 9.71800571610035e-05,
"loss": 0.3178,
"step": 449
},
{
"epoch": 0.8559201141226819,
"grad_norm": 0.37447845935821533,
"learning_rate": 9.717370593839315e-05,
"loss": 0.3861,
"step": 450
},
{
"epoch": 0.8578221588207323,
"grad_norm": 0.4261024594306946,
"learning_rate": 9.716735471578279e-05,
"loss": 0.4377,
"step": 451
},
{
"epoch": 0.8597242035187826,
"grad_norm": 0.3328630328178406,
"learning_rate": 9.716100349317244e-05,
"loss": 0.2791,
"step": 452
},
{
"epoch": 0.8616262482168331,
"grad_norm": 0.41943463683128357,
"learning_rate": 9.715465227056209e-05,
"loss": 0.4693,
"step": 453
},
{
"epoch": 0.8635282929148835,
"grad_norm": 0.4295640289783478,
"learning_rate": 9.714830104795174e-05,
"loss": 0.4105,
"step": 454
},
{
"epoch": 0.8654303376129339,
"grad_norm": 0.3548508882522583,
"learning_rate": 9.714194982534138e-05,
"loss": 0.3024,
"step": 455
},
{
"epoch": 0.8673323823109843,
"grad_norm": 0.5577777624130249,
"learning_rate": 9.713559860273102e-05,
"loss": 0.3961,
"step": 456
},
{
"epoch": 0.8692344270090347,
"grad_norm": 0.4119040071964264,
"learning_rate": 9.712924738012069e-05,
"loss": 0.3143,
"step": 457
},
{
"epoch": 0.8711364717070851,
"grad_norm": 0.40272560715675354,
"learning_rate": 9.712289615751032e-05,
"loss": 0.3452,
"step": 458
},
{
"epoch": 0.8730385164051355,
"grad_norm": 0.456386536359787,
"learning_rate": 9.711654493489998e-05,
"loss": 0.403,
"step": 459
},
{
"epoch": 0.874940561103186,
"grad_norm": 0.3982544541358948,
"learning_rate": 9.711019371228963e-05,
"loss": 0.4498,
"step": 460
},
{
"epoch": 0.8768426058012363,
"grad_norm": 0.29361623525619507,
"learning_rate": 9.710384248967927e-05,
"loss": 0.3724,
"step": 461
},
{
"epoch": 0.8787446504992867,
"grad_norm": 0.3854773938655853,
"learning_rate": 9.709749126706892e-05,
"loss": 0.4162,
"step": 462
},
{
"epoch": 0.8806466951973372,
"grad_norm": 0.3760225474834442,
"learning_rate": 9.709114004445856e-05,
"loss": 0.4335,
"step": 463
},
{
"epoch": 0.8825487398953875,
"grad_norm": 0.4936290383338928,
"learning_rate": 9.708478882184821e-05,
"loss": 0.3522,
"step": 464
},
{
"epoch": 0.884450784593438,
"grad_norm": 0.3584468364715576,
"learning_rate": 9.707843759923786e-05,
"loss": 0.552,
"step": 465
},
{
"epoch": 0.8863528292914884,
"grad_norm": 0.3523949086666107,
"learning_rate": 9.70720863766275e-05,
"loss": 0.3498,
"step": 466
},
{
"epoch": 0.8882548739895387,
"grad_norm": 0.42082804441452026,
"learning_rate": 9.706573515401716e-05,
"loss": 0.4863,
"step": 467
},
{
"epoch": 0.8901569186875892,
"grad_norm": 0.4284763038158417,
"learning_rate": 9.70593839314068e-05,
"loss": 0.4737,
"step": 468
},
{
"epoch": 0.8920589633856396,
"grad_norm": 0.3609261214733124,
"learning_rate": 9.705303270879644e-05,
"loss": 0.3208,
"step": 469
},
{
"epoch": 0.89396100808369,
"grad_norm": 0.31832849979400635,
"learning_rate": 9.704668148618609e-05,
"loss": 0.2545,
"step": 470
},
{
"epoch": 0.8958630527817404,
"grad_norm": 0.38202738761901855,
"learning_rate": 9.704033026357574e-05,
"loss": 0.3952,
"step": 471
},
{
"epoch": 0.8977650974797908,
"grad_norm": 0.347649484872818,
"learning_rate": 9.70339790409654e-05,
"loss": 0.3776,
"step": 472
},
{
"epoch": 0.8996671421778412,
"grad_norm": 0.41626760363578796,
"learning_rate": 9.702762781835503e-05,
"loss": 0.4152,
"step": 473
},
{
"epoch": 0.9015691868758916,
"grad_norm": 0.4042579233646393,
"learning_rate": 9.702127659574469e-05,
"loss": 0.3813,
"step": 474
},
{
"epoch": 0.9034712315739419,
"grad_norm": 0.38196825981140137,
"learning_rate": 9.701492537313434e-05,
"loss": 0.4398,
"step": 475
},
{
"epoch": 0.9053732762719924,
"grad_norm": 0.3867753744125366,
"learning_rate": 9.700857415052398e-05,
"loss": 0.4995,
"step": 476
},
{
"epoch": 0.9072753209700428,
"grad_norm": 0.34228166937828064,
"learning_rate": 9.700222292791363e-05,
"loss": 0.284,
"step": 477
},
{
"epoch": 0.9091773656680932,
"grad_norm": 0.3962937593460083,
"learning_rate": 9.699587170530328e-05,
"loss": 0.3501,
"step": 478
},
{
"epoch": 0.9110794103661436,
"grad_norm": 0.3665268123149872,
"learning_rate": 9.698952048269292e-05,
"loss": 0.2737,
"step": 479
},
{
"epoch": 0.912981455064194,
"grad_norm": 0.3775653839111328,
"learning_rate": 9.698316926008257e-05,
"loss": 0.3173,
"step": 480
},
{
"epoch": 0.9148834997622444,
"grad_norm": 0.3584369421005249,
"learning_rate": 9.697681803747222e-05,
"loss": 0.3055,
"step": 481
},
{
"epoch": 0.9167855444602948,
"grad_norm": 0.3510100245475769,
"learning_rate": 9.697046681486186e-05,
"loss": 0.3278,
"step": 482
},
{
"epoch": 0.9186875891583453,
"grad_norm": 0.33394765853881836,
"learning_rate": 9.696411559225151e-05,
"loss": 0.2954,
"step": 483
},
{
"epoch": 0.9205896338563956,
"grad_norm": 0.437014102935791,
"learning_rate": 9.695776436964116e-05,
"loss": 0.3797,
"step": 484
},
{
"epoch": 0.922491678554446,
"grad_norm": 0.37421244382858276,
"learning_rate": 9.695141314703082e-05,
"loss": 0.3521,
"step": 485
},
{
"epoch": 0.9243937232524965,
"grad_norm": 0.37696099281311035,
"learning_rate": 9.694506192442045e-05,
"loss": 0.3455,
"step": 486
},
{
"epoch": 0.9262957679505468,
"grad_norm": 0.5452500581741333,
"learning_rate": 9.693871070181009e-05,
"loss": 0.3624,
"step": 487
},
{
"epoch": 0.9281978126485972,
"grad_norm": 0.4049624502658844,
"learning_rate": 9.693235947919976e-05,
"loss": 0.4017,
"step": 488
},
{
"epoch": 0.9300998573466477,
"grad_norm": 0.32757866382598877,
"learning_rate": 9.69260082565894e-05,
"loss": 0.3536,
"step": 489
},
{
"epoch": 0.932001902044698,
"grad_norm": 0.298367977142334,
"learning_rate": 9.691965703397905e-05,
"loss": 0.3374,
"step": 490
},
{
"epoch": 0.9339039467427485,
"grad_norm": 0.22035005688667297,
"learning_rate": 9.69133058113687e-05,
"loss": 0.2855,
"step": 491
},
{
"epoch": 0.9358059914407989,
"grad_norm": 0.43000441789627075,
"learning_rate": 9.690695458875834e-05,
"loss": 0.4544,
"step": 492
},
{
"epoch": 0.9377080361388492,
"grad_norm": 0.28024253249168396,
"learning_rate": 9.690060336614799e-05,
"loss": 0.308,
"step": 493
},
{
"epoch": 0.9396100808368997,
"grad_norm": 0.53145432472229,
"learning_rate": 9.689425214353763e-05,
"loss": 0.4569,
"step": 494
},
{
"epoch": 0.9415121255349501,
"grad_norm": 0.4006127715110779,
"learning_rate": 9.688790092092729e-05,
"loss": 0.419,
"step": 495
},
{
"epoch": 0.9434141702330004,
"grad_norm": 0.4057261645793915,
"learning_rate": 9.688154969831693e-05,
"loss": 0.3553,
"step": 496
},
{
"epoch": 0.9453162149310509,
"grad_norm": 0.40803465247154236,
"learning_rate": 9.687519847570657e-05,
"loss": 0.3735,
"step": 497
},
{
"epoch": 0.9472182596291013,
"grad_norm": 0.34222155809402466,
"learning_rate": 9.686884725309623e-05,
"loss": 0.367,
"step": 498
},
{
"epoch": 0.9491203043271517,
"grad_norm": 0.40403544902801514,
"learning_rate": 9.686249603048587e-05,
"loss": 0.416,
"step": 499
},
{
"epoch": 0.9510223490252021,
"grad_norm": 0.33636951446533203,
"learning_rate": 9.685614480787551e-05,
"loss": 0.3423,
"step": 500
},
{
"epoch": 0.9529243937232525,
"grad_norm": 0.3394258916378021,
"learning_rate": 9.684979358526516e-05,
"loss": 0.3282,
"step": 501
},
{
"epoch": 0.9548264384213029,
"grad_norm": 0.3682473599910736,
"learning_rate": 9.684344236265482e-05,
"loss": 0.406,
"step": 502
},
{
"epoch": 0.9567284831193533,
"grad_norm": 0.35073623061180115,
"learning_rate": 9.683709114004447e-05,
"loss": 0.376,
"step": 503
},
{
"epoch": 0.9586305278174037,
"grad_norm": 0.36000022292137146,
"learning_rate": 9.68307399174341e-05,
"loss": 0.3969,
"step": 504
},
{
"epoch": 0.9605325725154541,
"grad_norm": 0.361158162355423,
"learning_rate": 9.682438869482376e-05,
"loss": 0.347,
"step": 505
},
{
"epoch": 0.9624346172135045,
"grad_norm": 0.3075178265571594,
"learning_rate": 9.681803747221341e-05,
"loss": 0.4362,
"step": 506
},
{
"epoch": 0.9643366619115549,
"grad_norm": 0.30084747076034546,
"learning_rate": 9.681168624960305e-05,
"loss": 0.3563,
"step": 507
},
{
"epoch": 0.9662387066096053,
"grad_norm": 0.3221014440059662,
"learning_rate": 9.68053350269927e-05,
"loss": 0.3366,
"step": 508
},
{
"epoch": 0.9681407513076558,
"grad_norm": 0.36464688181877136,
"learning_rate": 9.679898380438235e-05,
"loss": 0.3992,
"step": 509
},
{
"epoch": 0.9700427960057061,
"grad_norm": 0.32443803548812866,
"learning_rate": 9.679263258177199e-05,
"loss": 0.3293,
"step": 510
},
{
"epoch": 0.9719448407037565,
"grad_norm": 0.3689454197883606,
"learning_rate": 9.678628135916164e-05,
"loss": 0.3546,
"step": 511
},
{
"epoch": 0.973846885401807,
"grad_norm": 0.3754975199699402,
"learning_rate": 9.677993013655129e-05,
"loss": 0.3856,
"step": 512
},
{
"epoch": 0.9757489300998573,
"grad_norm": 0.3642953634262085,
"learning_rate": 9.677357891394094e-05,
"loss": 0.4326,
"step": 513
},
{
"epoch": 0.9776509747979077,
"grad_norm": 0.43278223276138306,
"learning_rate": 9.676722769133058e-05,
"loss": 0.3964,
"step": 514
},
{
"epoch": 0.9795530194959582,
"grad_norm": 0.43771886825561523,
"learning_rate": 9.676087646872023e-05,
"loss": 0.3861,
"step": 515
},
{
"epoch": 0.9814550641940085,
"grad_norm": 0.34908977150917053,
"learning_rate": 9.675452524610989e-05,
"loss": 0.3981,
"step": 516
},
{
"epoch": 0.983357108892059,
"grad_norm": 0.35733312368392944,
"learning_rate": 9.674817402349953e-05,
"loss": 0.3636,
"step": 517
},
{
"epoch": 0.9852591535901094,
"grad_norm": 0.3636298179626465,
"learning_rate": 9.674182280088918e-05,
"loss": 0.4336,
"step": 518
},
{
"epoch": 0.9871611982881597,
"grad_norm": 0.32771605253219604,
"learning_rate": 9.673547157827883e-05,
"loss": 0.3481,
"step": 519
},
{
"epoch": 0.9890632429862102,
"grad_norm": 0.40213117003440857,
"learning_rate": 9.672912035566847e-05,
"loss": 0.3707,
"step": 520
},
{
"epoch": 0.9909652876842606,
"grad_norm": 0.3386654257774353,
"learning_rate": 9.672276913305812e-05,
"loss": 0.3384,
"step": 521
},
{
"epoch": 0.992867332382311,
"grad_norm": 0.3965696096420288,
"learning_rate": 9.671641791044777e-05,
"loss": 0.3595,
"step": 522
},
{
"epoch": 0.9947693770803614,
"grad_norm": 0.38238459825515747,
"learning_rate": 9.671006668783741e-05,
"loss": 0.3714,
"step": 523
},
{
"epoch": 0.9966714217784118,
"grad_norm": 0.3248405456542969,
"learning_rate": 9.670371546522706e-05,
"loss": 0.394,
"step": 524
},
{
"epoch": 0.9985734664764622,
"grad_norm": 0.3902266323566437,
"learning_rate": 9.66973642426167e-05,
"loss": 0.4115,
"step": 525
}
],
"logging_steps": 1,
"max_steps": 15750,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 525,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.347361345425408e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}