{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 100,
"global_step": 1684,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0023788284269997025,
"grad_norm": 1.1563122272491455,
"learning_rate": 0.0,
"loss": 2.0206,
"step": 1
},
{
"epoch": 0.004757656853999405,
"grad_norm": 1.4585579633712769,
"learning_rate": 1e-05,
"loss": 2.2693,
"step": 2
},
{
"epoch": 0.007136485280999108,
"grad_norm": 1.5164328813552856,
"learning_rate": 2e-05,
"loss": 2.3356,
"step": 3
},
{
"epoch": 0.00951531370799881,
"grad_norm": 0.9573532938957214,
"learning_rate": 3e-05,
"loss": 1.6731,
"step": 4
},
{
"epoch": 0.011894142134998514,
"grad_norm": 1.3015975952148438,
"learning_rate": 4e-05,
"loss": 2.0563,
"step": 5
},
{
"epoch": 0.014272970561998216,
"grad_norm": 1.1276603937149048,
"learning_rate": 5e-05,
"loss": 1.7849,
"step": 6
},
{
"epoch": 0.016651798988997917,
"grad_norm": 1.1689510345458984,
"learning_rate": 6e-05,
"loss": 1.8166,
"step": 7
},
{
"epoch": 0.01903062741599762,
"grad_norm": 0.9140409827232361,
"learning_rate": 7e-05,
"loss": 1.5923,
"step": 8
},
{
"epoch": 0.021409455842997322,
"grad_norm": 0.8196120262145996,
"learning_rate": 8e-05,
"loss": 1.4652,
"step": 9
},
{
"epoch": 0.023788284269997028,
"grad_norm": 0.5995307564735413,
"learning_rate": 9e-05,
"loss": 1.2367,
"step": 10
},
{
"epoch": 0.02616711269699673,
"grad_norm": 0.7658697366714478,
"learning_rate": 0.0001,
"loss": 1.3004,
"step": 11
},
{
"epoch": 0.028545941123996433,
"grad_norm": 0.6701432466506958,
"learning_rate": 9.994026284348866e-05,
"loss": 1.1174,
"step": 12
},
{
"epoch": 0.030924769550996135,
"grad_norm": 0.7277427315711975,
"learning_rate": 9.98805256869773e-05,
"loss": 1.1932,
"step": 13
},
{
"epoch": 0.033303597977995834,
"grad_norm": 0.6935728788375854,
"learning_rate": 9.982078853046596e-05,
"loss": 1.1501,
"step": 14
},
{
"epoch": 0.03568242640499554,
"grad_norm": 0.7421849370002747,
"learning_rate": 9.97610513739546e-05,
"loss": 1.0886,
"step": 15
},
{
"epoch": 0.03806125483199524,
"grad_norm": 0.7694389820098877,
"learning_rate": 9.970131421744326e-05,
"loss": 1.068,
"step": 16
},
{
"epoch": 0.040440083258994945,
"grad_norm": 0.7087035775184631,
"learning_rate": 9.96415770609319e-05,
"loss": 0.9678,
"step": 17
},
{
"epoch": 0.042818911685994644,
"grad_norm": 0.660852313041687,
"learning_rate": 9.958183990442056e-05,
"loss": 0.9252,
"step": 18
},
{
"epoch": 0.04519774011299435,
"grad_norm": 0.6802922487258911,
"learning_rate": 9.952210274790921e-05,
"loss": 0.9095,
"step": 19
},
{
"epoch": 0.047576568539994056,
"grad_norm": 0.5777844786643982,
"learning_rate": 9.946236559139786e-05,
"loss": 0.9658,
"step": 20
},
{
"epoch": 0.049955396966993755,
"grad_norm": 0.4977063238620758,
"learning_rate": 9.940262843488651e-05,
"loss": 0.8399,
"step": 21
},
{
"epoch": 0.05233422539399346,
"grad_norm": 0.601166307926178,
"learning_rate": 9.934289127837514e-05,
"loss": 0.767,
"step": 22
},
{
"epoch": 0.05471305382099316,
"grad_norm": 0.5210549235343933,
"learning_rate": 9.928315412186381e-05,
"loss": 0.8259,
"step": 23
},
{
"epoch": 0.057091882247992866,
"grad_norm": 0.5965569615364075,
"learning_rate": 9.922341696535246e-05,
"loss": 0.9172,
"step": 24
},
{
"epoch": 0.059470710674992565,
"grad_norm": 0.5033414959907532,
"learning_rate": 9.916367980884111e-05,
"loss": 0.8809,
"step": 25
},
{
"epoch": 0.06184953910199227,
"grad_norm": 0.5232973098754883,
"learning_rate": 9.910394265232975e-05,
"loss": 0.7822,
"step": 26
},
{
"epoch": 0.06422836752899197,
"grad_norm": 0.47480395436286926,
"learning_rate": 9.90442054958184e-05,
"loss": 0.6984,
"step": 27
},
{
"epoch": 0.06660719595599167,
"grad_norm": 0.5224344730377197,
"learning_rate": 9.898446833930706e-05,
"loss": 0.6915,
"step": 28
},
{
"epoch": 0.06898602438299138,
"grad_norm": 0.4992648661136627,
"learning_rate": 9.892473118279571e-05,
"loss": 0.685,
"step": 29
},
{
"epoch": 0.07136485280999108,
"grad_norm": 0.4528586268424988,
"learning_rate": 9.886499402628435e-05,
"loss": 0.6188,
"step": 30
},
{
"epoch": 0.07374368123699078,
"grad_norm": 0.49073758721351624,
"learning_rate": 9.8805256869773e-05,
"loss": 0.581,
"step": 31
},
{
"epoch": 0.07612250966399048,
"grad_norm": 0.5179185271263123,
"learning_rate": 9.874551971326166e-05,
"loss": 0.6578,
"step": 32
},
{
"epoch": 0.07850133809099019,
"grad_norm": 0.49443480372428894,
"learning_rate": 9.868578255675031e-05,
"loss": 0.6404,
"step": 33
},
{
"epoch": 0.08088016651798989,
"grad_norm": 0.4802263081073761,
"learning_rate": 9.862604540023895e-05,
"loss": 0.5601,
"step": 34
},
{
"epoch": 0.08325899494498959,
"grad_norm": 0.43969476222991943,
"learning_rate": 9.85663082437276e-05,
"loss": 0.5594,
"step": 35
},
{
"epoch": 0.08563782337198929,
"grad_norm": 0.40470626950263977,
"learning_rate": 9.850657108721625e-05,
"loss": 0.5224,
"step": 36
},
{
"epoch": 0.088016651798989,
"grad_norm": 0.48150962591171265,
"learning_rate": 9.844683393070491e-05,
"loss": 0.6268,
"step": 37
},
{
"epoch": 0.0903954802259887,
"grad_norm": 0.49091798067092896,
"learning_rate": 9.838709677419355e-05,
"loss": 0.6127,
"step": 38
},
{
"epoch": 0.0927743086529884,
"grad_norm": 0.40109407901763916,
"learning_rate": 9.83273596176822e-05,
"loss": 0.5474,
"step": 39
},
{
"epoch": 0.09515313707998811,
"grad_norm": 0.4158681333065033,
"learning_rate": 9.826762246117085e-05,
"loss": 0.5739,
"step": 40
},
{
"epoch": 0.09753196550698781,
"grad_norm": 0.4261043667793274,
"learning_rate": 9.820788530465951e-05,
"loss": 0.5421,
"step": 41
},
{
"epoch": 0.09991079393398751,
"grad_norm": 0.3990945816040039,
"learning_rate": 9.814814814814815e-05,
"loss": 0.5748,
"step": 42
},
{
"epoch": 0.10228962236098721,
"grad_norm": 0.45243942737579346,
"learning_rate": 9.80884109916368e-05,
"loss": 0.6495,
"step": 43
},
{
"epoch": 0.10466845078798692,
"grad_norm": 0.40185514092445374,
"learning_rate": 9.802867383512545e-05,
"loss": 0.5058,
"step": 44
},
{
"epoch": 0.10704727921498662,
"grad_norm": 0.5162461996078491,
"learning_rate": 9.79689366786141e-05,
"loss": 0.6102,
"step": 45
},
{
"epoch": 0.10942610764198632,
"grad_norm": 0.4457720220088959,
"learning_rate": 9.790919952210275e-05,
"loss": 0.5633,
"step": 46
},
{
"epoch": 0.11180493606898602,
"grad_norm": 0.4560127556324005,
"learning_rate": 9.78494623655914e-05,
"loss": 0.5784,
"step": 47
},
{
"epoch": 0.11418376449598573,
"grad_norm": 0.38972005248069763,
"learning_rate": 9.778972520908005e-05,
"loss": 0.5795,
"step": 48
},
{
"epoch": 0.11656259292298543,
"grad_norm": 0.43415090441703796,
"learning_rate": 9.77299880525687e-05,
"loss": 0.4822,
"step": 49
},
{
"epoch": 0.11894142134998513,
"grad_norm": 0.44866281747817993,
"learning_rate": 9.767025089605735e-05,
"loss": 0.5978,
"step": 50
},
{
"epoch": 0.12132024977698483,
"grad_norm": 0.42778265476226807,
"learning_rate": 9.7610513739546e-05,
"loss": 0.5713,
"step": 51
},
{
"epoch": 0.12369907820398454,
"grad_norm": 0.37236231565475464,
"learning_rate": 9.755077658303465e-05,
"loss": 0.5136,
"step": 52
},
{
"epoch": 0.12607790663098423,
"grad_norm": 0.4558245837688446,
"learning_rate": 9.74910394265233e-05,
"loss": 0.5411,
"step": 53
},
{
"epoch": 0.12845673505798394,
"grad_norm": 0.410610169172287,
"learning_rate": 9.743130227001195e-05,
"loss": 0.5581,
"step": 54
},
{
"epoch": 0.13083556348498365,
"grad_norm": 0.4050186276435852,
"learning_rate": 9.73715651135006e-05,
"loss": 0.4935,
"step": 55
},
{
"epoch": 0.13321439191198334,
"grad_norm": 0.45527154207229614,
"learning_rate": 9.731182795698925e-05,
"loss": 0.5917,
"step": 56
},
{
"epoch": 0.13559322033898305,
"grad_norm": 0.4652462899684906,
"learning_rate": 9.72520908004779e-05,
"loss": 0.5731,
"step": 57
},
{
"epoch": 0.13797204876598276,
"grad_norm": 0.4191625118255615,
"learning_rate": 9.719235364396656e-05,
"loss": 0.5287,
"step": 58
},
{
"epoch": 0.14035087719298245,
"grad_norm": 0.5013799071311951,
"learning_rate": 9.713261648745519e-05,
"loss": 0.508,
"step": 59
},
{
"epoch": 0.14272970561998216,
"grad_norm": 0.4258359372615814,
"learning_rate": 9.707287933094386e-05,
"loss": 0.5213,
"step": 60
},
{
"epoch": 0.14510853404698187,
"grad_norm": 0.42738595604896545,
"learning_rate": 9.70131421744325e-05,
"loss": 0.5043,
"step": 61
},
{
"epoch": 0.14748736247398156,
"grad_norm": 0.38936150074005127,
"learning_rate": 9.695340501792116e-05,
"loss": 0.5113,
"step": 62
},
{
"epoch": 0.14986619090098127,
"grad_norm": 0.46111366152763367,
"learning_rate": 9.68936678614098e-05,
"loss": 0.5738,
"step": 63
},
{
"epoch": 0.15224501932798096,
"grad_norm": 0.38841429352760315,
"learning_rate": 9.683393070489846e-05,
"loss": 0.4687,
"step": 64
},
{
"epoch": 0.15462384775498067,
"grad_norm": 0.47296616435050964,
"learning_rate": 9.677419354838711e-05,
"loss": 0.5165,
"step": 65
},
{
"epoch": 0.15700267618198038,
"grad_norm": 0.4431455433368683,
"learning_rate": 9.671445639187576e-05,
"loss": 0.4682,
"step": 66
},
{
"epoch": 0.15938150460898007,
"grad_norm": 0.43318790197372437,
"learning_rate": 9.66547192353644e-05,
"loss": 0.5467,
"step": 67
},
{
"epoch": 0.16176033303597978,
"grad_norm": 0.4447453022003174,
"learning_rate": 9.659498207885304e-05,
"loss": 0.4962,
"step": 68
},
{
"epoch": 0.1641391614629795,
"grad_norm": 0.4104563891887665,
"learning_rate": 9.653524492234171e-05,
"loss": 0.4536,
"step": 69
},
{
"epoch": 0.16651798988997918,
"grad_norm": 0.4398542046546936,
"learning_rate": 9.647550776583036e-05,
"loss": 0.5473,
"step": 70
},
{
"epoch": 0.1688968183169789,
"grad_norm": 0.43461278080940247,
"learning_rate": 9.6415770609319e-05,
"loss": 0.474,
"step": 71
},
{
"epoch": 0.17127564674397858,
"grad_norm": 0.4528830051422119,
"learning_rate": 9.635603345280765e-05,
"loss": 0.5247,
"step": 72
},
{
"epoch": 0.1736544751709783,
"grad_norm": 0.45138227939605713,
"learning_rate": 9.62962962962963e-05,
"loss": 0.542,
"step": 73
},
{
"epoch": 0.176033303597978,
"grad_norm": 0.43729302287101746,
"learning_rate": 9.623655913978496e-05,
"loss": 0.5053,
"step": 74
},
{
"epoch": 0.1784121320249777,
"grad_norm": 0.4513843357563019,
"learning_rate": 9.61768219832736e-05,
"loss": 0.4978,
"step": 75
},
{
"epoch": 0.1807909604519774,
"grad_norm": 0.39364778995513916,
"learning_rate": 9.611708482676225e-05,
"loss": 0.4489,
"step": 76
},
{
"epoch": 0.1831697888789771,
"grad_norm": 0.5211411714553833,
"learning_rate": 9.60573476702509e-05,
"loss": 0.5445,
"step": 77
},
{
"epoch": 0.1855486173059768,
"grad_norm": 0.49370935559272766,
"learning_rate": 9.599761051373956e-05,
"loss": 0.5108,
"step": 78
},
{
"epoch": 0.1879274457329765,
"grad_norm": 0.4482332766056061,
"learning_rate": 9.59378733572282e-05,
"loss": 0.5219,
"step": 79
},
{
"epoch": 0.19030627415997622,
"grad_norm": 0.45065587759017944,
"learning_rate": 9.587813620071685e-05,
"loss": 0.5503,
"step": 80
},
{
"epoch": 0.1926851025869759,
"grad_norm": 0.4308435320854187,
"learning_rate": 9.58183990442055e-05,
"loss": 0.5377,
"step": 81
},
{
"epoch": 0.19506393101397562,
"grad_norm": 0.45612701773643494,
"learning_rate": 9.575866188769415e-05,
"loss": 0.4786,
"step": 82
},
{
"epoch": 0.1974427594409753,
"grad_norm": 0.4834578335285187,
"learning_rate": 9.56989247311828e-05,
"loss": 0.5582,
"step": 83
},
{
"epoch": 0.19982158786797502,
"grad_norm": 0.45820561051368713,
"learning_rate": 9.563918757467145e-05,
"loss": 0.5158,
"step": 84
},
{
"epoch": 0.20220041629497473,
"grad_norm": 0.42592278122901917,
"learning_rate": 9.55794504181601e-05,
"loss": 0.5587,
"step": 85
},
{
"epoch": 0.20457924472197442,
"grad_norm": 0.43931952118873596,
"learning_rate": 9.551971326164875e-05,
"loss": 0.4604,
"step": 86
},
{
"epoch": 0.20695807314897413,
"grad_norm": 0.504015326499939,
"learning_rate": 9.54599761051374e-05,
"loss": 0.5234,
"step": 87
},
{
"epoch": 0.20933690157597384,
"grad_norm": 0.41193896532058716,
"learning_rate": 9.540023894862605e-05,
"loss": 0.4782,
"step": 88
},
{
"epoch": 0.21171573000297353,
"grad_norm": 0.47255855798721313,
"learning_rate": 9.53405017921147e-05,
"loss": 0.4457,
"step": 89
},
{
"epoch": 0.21409455842997324,
"grad_norm": 0.41772446036338806,
"learning_rate": 9.528076463560335e-05,
"loss": 0.4297,
"step": 90
},
{
"epoch": 0.21647338685697295,
"grad_norm": 0.47142553329467773,
"learning_rate": 9.5221027479092e-05,
"loss": 0.5292,
"step": 91
},
{
"epoch": 0.21885221528397264,
"grad_norm": 0.40953025221824646,
"learning_rate": 9.516129032258065e-05,
"loss": 0.4847,
"step": 92
},
{
"epoch": 0.22123104371097235,
"grad_norm": 0.4205390512943268,
"learning_rate": 9.51015531660693e-05,
"loss": 0.4924,
"step": 93
},
{
"epoch": 0.22360987213797204,
"grad_norm": 0.4367603063583374,
"learning_rate": 9.504181600955795e-05,
"loss": 0.5321,
"step": 94
},
{
"epoch": 0.22598870056497175,
"grad_norm": 0.40274521708488464,
"learning_rate": 9.49820788530466e-05,
"loss": 0.467,
"step": 95
},
{
"epoch": 0.22836752899197146,
"grad_norm": 0.49425846338272095,
"learning_rate": 9.492234169653524e-05,
"loss": 0.5386,
"step": 96
},
{
"epoch": 0.23074635741897115,
"grad_norm": 0.4841647744178772,
"learning_rate": 9.48626045400239e-05,
"loss": 0.4556,
"step": 97
},
{
"epoch": 0.23312518584597086,
"grad_norm": 0.41210493445396423,
"learning_rate": 9.480286738351255e-05,
"loss": 0.4885,
"step": 98
},
{
"epoch": 0.23550401427297057,
"grad_norm": 0.4671674072742462,
"learning_rate": 9.47431302270012e-05,
"loss": 0.5642,
"step": 99
},
{
"epoch": 0.23788284269997026,
"grad_norm": 0.4276902973651886,
"learning_rate": 9.468339307048984e-05,
"loss": 0.4897,
"step": 100
},
{
"epoch": 0.23788284269997026,
"eval_loss": 0.4852786660194397,
"eval_runtime": 27.1696,
"eval_samples_per_second": 27.531,
"eval_steps_per_second": 13.765,
"step": 100
},
{
"epoch": 0.24026167112696997,
"grad_norm": 0.48646774888038635,
"learning_rate": 9.46236559139785e-05,
"loss": 0.4827,
"step": 101
},
{
"epoch": 0.24264049955396966,
"grad_norm": 0.4662117063999176,
"learning_rate": 9.456391875746716e-05,
"loss": 0.4601,
"step": 102
},
{
"epoch": 0.24501932798096937,
"grad_norm": 0.39756080508232117,
"learning_rate": 9.45041816009558e-05,
"loss": 0.4574,
"step": 103
},
{
"epoch": 0.24739815640796908,
"grad_norm": 0.4817146062850952,
"learning_rate": 9.444444444444444e-05,
"loss": 0.5051,
"step": 104
},
{
"epoch": 0.24977698483496877,
"grad_norm": 0.5022764205932617,
"learning_rate": 9.438470728793309e-05,
"loss": 0.4912,
"step": 105
},
{
"epoch": 0.25215581326196845,
"grad_norm": 0.4829760193824768,
"learning_rate": 9.432497013142176e-05,
"loss": 0.4778,
"step": 106
},
{
"epoch": 0.25453464168896817,
"grad_norm": 0.4534595310688019,
"learning_rate": 9.42652329749104e-05,
"loss": 0.5318,
"step": 107
},
{
"epoch": 0.2569134701159679,
"grad_norm": 0.48831233382225037,
"learning_rate": 9.420549581839904e-05,
"loss": 0.4606,
"step": 108
},
{
"epoch": 0.2592922985429676,
"grad_norm": 0.4774167835712433,
"learning_rate": 9.41457586618877e-05,
"loss": 0.5262,
"step": 109
},
{
"epoch": 0.2616711269699673,
"grad_norm": 0.46727365255355835,
"learning_rate": 9.408602150537636e-05,
"loss": 0.4316,
"step": 110
},
{
"epoch": 0.264049955396967,
"grad_norm": 0.547978401184082,
"learning_rate": 9.402628434886501e-05,
"loss": 0.5375,
"step": 111
},
{
"epoch": 0.2664287838239667,
"grad_norm": 0.5053251385688782,
"learning_rate": 9.396654719235364e-05,
"loss": 0.4526,
"step": 112
},
{
"epoch": 0.2688076122509664,
"grad_norm": 0.5039639472961426,
"learning_rate": 9.39068100358423e-05,
"loss": 0.5364,
"step": 113
},
{
"epoch": 0.2711864406779661,
"grad_norm": 0.5594891905784607,
"learning_rate": 9.384707287933095e-05,
"loss": 0.5086,
"step": 114
},
{
"epoch": 0.2735652691049658,
"grad_norm": 0.45772382616996765,
"learning_rate": 9.378733572281961e-05,
"loss": 0.5296,
"step": 115
},
{
"epoch": 0.2759440975319655,
"grad_norm": 0.5358996987342834,
"learning_rate": 9.372759856630825e-05,
"loss": 0.5925,
"step": 116
},
{
"epoch": 0.2783229259589652,
"grad_norm": 0.4769960343837738,
"learning_rate": 9.36678614097969e-05,
"loss": 0.5323,
"step": 117
},
{
"epoch": 0.2807017543859649,
"grad_norm": 0.4564250409603119,
"learning_rate": 9.360812425328555e-05,
"loss": 0.4207,
"step": 118
},
{
"epoch": 0.2830805828129646,
"grad_norm": 0.4692419469356537,
"learning_rate": 9.35483870967742e-05,
"loss": 0.4771,
"step": 119
},
{
"epoch": 0.2854594112399643,
"grad_norm": 0.42338937520980835,
"learning_rate": 9.348864994026285e-05,
"loss": 0.4622,
"step": 120
},
{
"epoch": 0.28783823966696404,
"grad_norm": 0.46972939372062683,
"learning_rate": 9.34289127837515e-05,
"loss": 0.5536,
"step": 121
},
{
"epoch": 0.29021706809396375,
"grad_norm": 0.4813671112060547,
"learning_rate": 9.336917562724015e-05,
"loss": 0.4483,
"step": 122
},
{
"epoch": 0.2925958965209634,
"grad_norm": 0.4687402546405792,
"learning_rate": 9.33094384707288e-05,
"loss": 0.5196,
"step": 123
},
{
"epoch": 0.2949747249479631,
"grad_norm": 0.48412230610847473,
"learning_rate": 9.324970131421745e-05,
"loss": 0.583,
"step": 124
},
{
"epoch": 0.29735355337496283,
"grad_norm": 0.4591488838195801,
"learning_rate": 9.31899641577061e-05,
"loss": 0.5234,
"step": 125
},
{
"epoch": 0.29973238180196254,
"grad_norm": 0.41953685879707336,
"learning_rate": 9.313022700119475e-05,
"loss": 0.4022,
"step": 126
},
{
"epoch": 0.30211121022896226,
"grad_norm": 0.4601673185825348,
"learning_rate": 9.30704898446834e-05,
"loss": 0.4862,
"step": 127
},
{
"epoch": 0.3044900386559619,
"grad_norm": 0.5358420014381409,
"learning_rate": 9.301075268817204e-05,
"loss": 0.5561,
"step": 128
},
{
"epoch": 0.3068688670829616,
"grad_norm": 0.4296037256717682,
"learning_rate": 9.29510155316607e-05,
"loss": 0.3769,
"step": 129
},
{
"epoch": 0.30924769550996134,
"grad_norm": 0.461069792509079,
"learning_rate": 9.289127837514935e-05,
"loss": 0.4367,
"step": 130
},
{
"epoch": 0.31162652393696105,
"grad_norm": 0.45425426959991455,
"learning_rate": 9.2831541218638e-05,
"loss": 0.453,
"step": 131
},
{
"epoch": 0.31400535236396077,
"grad_norm": 0.4828135371208191,
"learning_rate": 9.277180406212664e-05,
"loss": 0.366,
"step": 132
},
{
"epoch": 0.3163841807909605,
"grad_norm": 0.5032231211662292,
"learning_rate": 9.27120669056153e-05,
"loss": 0.5314,
"step": 133
},
{
"epoch": 0.31876300921796014,
"grad_norm": 0.42812997102737427,
"learning_rate": 9.265232974910395e-05,
"loss": 0.4845,
"step": 134
},
{
"epoch": 0.32114183764495985,
"grad_norm": 0.4545380473136902,
"learning_rate": 9.25925925925926e-05,
"loss": 0.474,
"step": 135
},
{
"epoch": 0.32352066607195956,
"grad_norm": 0.4786789119243622,
"learning_rate": 9.253285543608124e-05,
"loss": 0.5126,
"step": 136
},
{
"epoch": 0.3258994944989593,
"grad_norm": 0.48447903990745544,
"learning_rate": 9.247311827956989e-05,
"loss": 0.5048,
"step": 137
},
{
"epoch": 0.328278322925959,
"grad_norm": 0.4667385518550873,
"learning_rate": 9.241338112305855e-05,
"loss": 0.4826,
"step": 138
},
{
"epoch": 0.33065715135295864,
"grad_norm": 0.5055387616157532,
"learning_rate": 9.23536439665472e-05,
"loss": 0.4674,
"step": 139
},
{
"epoch": 0.33303597977995836,
"grad_norm": 0.5160491466522217,
"learning_rate": 9.229390681003584e-05,
"loss": 0.4607,
"step": 140
},
{
"epoch": 0.33541480820695807,
"grad_norm": 0.3776059150695801,
"learning_rate": 9.223416965352449e-05,
"loss": 0.3956,
"step": 141
},
{
"epoch": 0.3377936366339578,
"grad_norm": 0.4551769196987152,
"learning_rate": 9.217443249701314e-05,
"loss": 0.462,
"step": 142
},
{
"epoch": 0.3401724650609575,
"grad_norm": 0.4852535128593445,
"learning_rate": 9.21146953405018e-05,
"loss": 0.4803,
"step": 143
},
{
"epoch": 0.34255129348795715,
"grad_norm": 0.5050077438354492,
"learning_rate": 9.205495818399044e-05,
"loss": 0.5,
"step": 144
},
{
"epoch": 0.34493012191495687,
"grad_norm": 0.46231594681739807,
"learning_rate": 9.199522102747909e-05,
"loss": 0.4652,
"step": 145
},
{
"epoch": 0.3473089503419566,
"grad_norm": 0.434261679649353,
"learning_rate": 9.193548387096774e-05,
"loss": 0.4542,
"step": 146
},
{
"epoch": 0.3496877787689563,
"grad_norm": 0.44038939476013184,
"learning_rate": 9.18757467144564e-05,
"loss": 0.4922,
"step": 147
},
{
"epoch": 0.352066607195956,
"grad_norm": 0.48634183406829834,
"learning_rate": 9.181600955794504e-05,
"loss": 0.4709,
"step": 148
},
{
"epoch": 0.3544454356229557,
"grad_norm": 0.47852379083633423,
"learning_rate": 9.175627240143369e-05,
"loss": 0.5384,
"step": 149
},
{
"epoch": 0.3568242640499554,
"grad_norm": 0.48153775930404663,
"learning_rate": 9.169653524492234e-05,
"loss": 0.4515,
"step": 150
},
{
"epoch": 0.3592030924769551,
"grad_norm": 0.4894790053367615,
"learning_rate": 9.163679808841099e-05,
"loss": 0.54,
"step": 151
},
{
"epoch": 0.3615819209039548,
"grad_norm": 0.5082180500030518,
"learning_rate": 9.157706093189964e-05,
"loss": 0.4483,
"step": 152
},
{
"epoch": 0.3639607493309545,
"grad_norm": 0.4721032381057739,
"learning_rate": 9.15173237753883e-05,
"loss": 0.5123,
"step": 153
},
{
"epoch": 0.3663395777579542,
"grad_norm": 0.4319252371788025,
"learning_rate": 9.145758661887694e-05,
"loss": 0.4481,
"step": 154
},
{
"epoch": 0.3687184061849539,
"grad_norm": 0.48786380887031555,
"learning_rate": 9.13978494623656e-05,
"loss": 0.4753,
"step": 155
},
{
"epoch": 0.3710972346119536,
"grad_norm": 0.5081771016120911,
"learning_rate": 9.133811230585424e-05,
"loss": 0.5056,
"step": 156
},
{
"epoch": 0.3734760630389533,
"grad_norm": 0.4977443218231201,
"learning_rate": 9.12783751493429e-05,
"loss": 0.4685,
"step": 157
},
{
"epoch": 0.375854891465953,
"grad_norm": 0.4570608139038086,
"learning_rate": 9.121863799283154e-05,
"loss": 0.3681,
"step": 158
},
{
"epoch": 0.37823371989295274,
"grad_norm": 0.42695313692092896,
"learning_rate": 9.11589008363202e-05,
"loss": 0.4141,
"step": 159
},
{
"epoch": 0.38061254831995245,
"grad_norm": 0.43205609917640686,
"learning_rate": 9.109916367980885e-05,
"loss": 0.4596,
"step": 160
},
{
"epoch": 0.3829913767469521,
"grad_norm": 0.4466572701931,
"learning_rate": 9.10394265232975e-05,
"loss": 0.4247,
"step": 161
},
{
"epoch": 0.3853702051739518,
"grad_norm": 0.4255600571632385,
"learning_rate": 9.097968936678615e-05,
"loss": 0.4557,
"step": 162
},
{
"epoch": 0.38774903360095153,
"grad_norm": 0.4489794671535492,
"learning_rate": 9.09199522102748e-05,
"loss": 0.4372,
"step": 163
},
{
"epoch": 0.39012786202795124,
"grad_norm": 0.4137515425682068,
"learning_rate": 9.086021505376345e-05,
"loss": 0.429,
"step": 164
},
{
"epoch": 0.39250669045495096,
"grad_norm": 0.5630534887313843,
"learning_rate": 9.080047789725208e-05,
"loss": 0.4875,
"step": 165
},
{
"epoch": 0.3948855188819506,
"grad_norm": 0.45907342433929443,
"learning_rate": 9.074074074074075e-05,
"loss": 0.4258,
"step": 166
},
{
"epoch": 0.3972643473089503,
"grad_norm": 0.5133002996444702,
"learning_rate": 9.06810035842294e-05,
"loss": 0.4657,
"step": 167
},
{
"epoch": 0.39964317573595004,
"grad_norm": 0.49119633436203003,
"learning_rate": 9.062126642771805e-05,
"loss": 0.5184,
"step": 168
},
{
"epoch": 0.40202200416294975,
"grad_norm": 0.5076906681060791,
"learning_rate": 9.056152927120668e-05,
"loss": 0.4548,
"step": 169
},
{
"epoch": 0.40440083258994947,
"grad_norm": 0.49528005719184875,
"learning_rate": 9.050179211469535e-05,
"loss": 0.4214,
"step": 170
},
{
"epoch": 0.4067796610169492,
"grad_norm": 0.48812952637672424,
"learning_rate": 9.0442054958184e-05,
"loss": 0.4825,
"step": 171
},
{
"epoch": 0.40915848944394884,
"grad_norm": 0.44896411895751953,
"learning_rate": 9.038231780167265e-05,
"loss": 0.4465,
"step": 172
},
{
"epoch": 0.41153731787094855,
"grad_norm": 0.42637521028518677,
"learning_rate": 9.032258064516129e-05,
"loss": 0.4469,
"step": 173
},
{
"epoch": 0.41391614629794826,
"grad_norm": 0.47798776626586914,
"learning_rate": 9.026284348864994e-05,
"loss": 0.4689,
"step": 174
},
{
"epoch": 0.416294974724948,
"grad_norm": 0.4603348970413208,
"learning_rate": 9.02031063321386e-05,
"loss": 0.4494,
"step": 175
},
{
"epoch": 0.4186738031519477,
"grad_norm": 0.44408583641052246,
"learning_rate": 9.014336917562725e-05,
"loss": 0.4727,
"step": 176
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.4634898602962494,
"learning_rate": 9.008363201911589e-05,
"loss": 0.455,
"step": 177
},
{
"epoch": 0.42343146000594706,
"grad_norm": 0.42439034581184387,
"learning_rate": 9.002389486260454e-05,
"loss": 0.4771,
"step": 178
},
{
"epoch": 0.42581028843294677,
"grad_norm": 0.4173770248889923,
"learning_rate": 8.99641577060932e-05,
"loss": 0.4437,
"step": 179
},
{
"epoch": 0.4281891168599465,
"grad_norm": 0.5092681050300598,
"learning_rate": 8.990442054958185e-05,
"loss": 0.4728,
"step": 180
},
{
"epoch": 0.4305679452869462,
"grad_norm": 0.41918063163757324,
"learning_rate": 8.984468339307049e-05,
"loss": 0.4711,
"step": 181
},
{
"epoch": 0.4329467737139459,
"grad_norm": 0.482857346534729,
"learning_rate": 8.978494623655914e-05,
"loss": 0.4661,
"step": 182
},
{
"epoch": 0.43532560214094557,
"grad_norm": 0.44419020414352417,
"learning_rate": 8.972520908004779e-05,
"loss": 0.4264,
"step": 183
},
{
"epoch": 0.4377044305679453,
"grad_norm": 0.4146061837673187,
"learning_rate": 8.966547192353645e-05,
"loss": 0.4652,
"step": 184
},
{
"epoch": 0.440083258994945,
"grad_norm": 0.5836295485496521,
"learning_rate": 8.960573476702509e-05,
"loss": 0.4986,
"step": 185
},
{
"epoch": 0.4424620874219447,
"grad_norm": 0.5084491968154907,
"learning_rate": 8.954599761051374e-05,
"loss": 0.4474,
"step": 186
},
{
"epoch": 0.4448409158489444,
"grad_norm": 0.4597759544849396,
"learning_rate": 8.948626045400239e-05,
"loss": 0.4368,
"step": 187
},
{
"epoch": 0.4472197442759441,
"grad_norm": 0.5420514345169067,
"learning_rate": 8.942652329749104e-05,
"loss": 0.5872,
"step": 188
},
{
"epoch": 0.4495985727029438,
"grad_norm": 0.43812426924705505,
"learning_rate": 8.936678614097969e-05,
"loss": 0.4105,
"step": 189
},
{
"epoch": 0.4519774011299435,
"grad_norm": 0.5247114300727844,
"learning_rate": 8.930704898446834e-05,
"loss": 0.4847,
"step": 190
},
{
"epoch": 0.4543562295569432,
"grad_norm": 0.470450222492218,
"learning_rate": 8.924731182795699e-05,
"loss": 0.4386,
"step": 191
},
{
"epoch": 0.4567350579839429,
"grad_norm": 0.47008588910102844,
"learning_rate": 8.918757467144564e-05,
"loss": 0.4321,
"step": 192
},
{
"epoch": 0.4591138864109426,
"grad_norm": 0.4504205584526062,
"learning_rate": 8.912783751493429e-05,
"loss": 0.4462,
"step": 193
},
{
"epoch": 0.4614927148379423,
"grad_norm": 0.5049585103988647,
"learning_rate": 8.906810035842294e-05,
"loss": 0.4664,
"step": 194
},
{
"epoch": 0.463871543264942,
"grad_norm": 0.48764532804489136,
"learning_rate": 8.900836320191159e-05,
"loss": 0.439,
"step": 195
},
{
"epoch": 0.4662503716919417,
"grad_norm": 0.40391966700553894,
"learning_rate": 8.894862604540024e-05,
"loss": 0.3806,
"step": 196
},
{
"epoch": 0.46862920011894144,
"grad_norm": 0.46843641996383667,
"learning_rate": 8.888888888888889e-05,
"loss": 0.4332,
"step": 197
},
{
"epoch": 0.47100802854594115,
"grad_norm": 0.5837143063545227,
"learning_rate": 8.882915173237754e-05,
"loss": 0.5652,
"step": 198
},
{
"epoch": 0.4733868569729408,
"grad_norm": 0.49761733412742615,
"learning_rate": 8.87694145758662e-05,
"loss": 0.4827,
"step": 199
},
{
"epoch": 0.4757656853999405,
"grad_norm": 0.43618375062942505,
"learning_rate": 8.870967741935484e-05,
"loss": 0.4084,
"step": 200
},
{
"epoch": 0.4757656853999405,
"eval_loss": 0.4430273473262787,
"eval_runtime": 24.6904,
"eval_samples_per_second": 30.295,
"eval_steps_per_second": 15.148,
"step": 200
},
{
"epoch": 0.47814451382694023,
"grad_norm": 0.5256602168083191,
"learning_rate": 8.86499402628435e-05,
"loss": 0.5669,
"step": 201
},
{
"epoch": 0.48052334225393994,
"grad_norm": 0.46762749552726746,
"learning_rate": 8.859020310633214e-05,
"loss": 0.4767,
"step": 202
},
{
"epoch": 0.48290217068093966,
"grad_norm": 0.44799891114234924,
"learning_rate": 8.85304659498208e-05,
"loss": 0.4338,
"step": 203
},
{
"epoch": 0.4852809991079393,
"grad_norm": 0.5003095269203186,
"learning_rate": 8.847072879330945e-05,
"loss": 0.409,
"step": 204
},
{
"epoch": 0.487659827534939,
"grad_norm": 0.4540422856807709,
"learning_rate": 8.84109916367981e-05,
"loss": 0.415,
"step": 205
},
{
"epoch": 0.49003865596193874,
"grad_norm": 0.5260767936706543,
"learning_rate": 8.835125448028673e-05,
"loss": 0.4591,
"step": 206
},
{
"epoch": 0.49241748438893845,
"grad_norm": 0.5634750723838806,
"learning_rate": 8.82915173237754e-05,
"loss": 0.4961,
"step": 207
},
{
"epoch": 0.49479631281593817,
"grad_norm": 0.5199817419052124,
"learning_rate": 8.823178016726405e-05,
"loss": 0.4258,
"step": 208
},
{
"epoch": 0.4971751412429379,
"grad_norm": 0.5310624241828918,
"learning_rate": 8.81720430107527e-05,
"loss": 0.5003,
"step": 209
},
{
"epoch": 0.49955396966993754,
"grad_norm": 0.4997275769710541,
"learning_rate": 8.811230585424133e-05,
"loss": 0.5096,
"step": 210
},
{
"epoch": 0.5019327980969372,
"grad_norm": 0.47169461846351624,
"learning_rate": 8.805256869772998e-05,
"loss": 0.3788,
"step": 211
},
{
"epoch": 0.5043116265239369,
"grad_norm": 0.49595358967781067,
"learning_rate": 8.799283154121865e-05,
"loss": 0.5021,
"step": 212
},
{
"epoch": 0.5066904549509367,
"grad_norm": 0.46028879284858704,
"learning_rate": 8.79330943847073e-05,
"loss": 0.4533,
"step": 213
},
{
"epoch": 0.5090692833779363,
"grad_norm": 0.4750324785709381,
"learning_rate": 8.787335722819593e-05,
"loss": 0.4634,
"step": 214
},
{
"epoch": 0.5114481118049361,
"grad_norm": 0.4960343539714813,
"learning_rate": 8.781362007168459e-05,
"loss": 0.4516,
"step": 215
},
{
"epoch": 0.5138269402319358,
"grad_norm": 0.4688979983329773,
"learning_rate": 8.775388291517325e-05,
"loss": 0.4208,
"step": 216
},
{
"epoch": 0.5162057686589355,
"grad_norm": 0.5113106966018677,
"learning_rate": 8.76941457586619e-05,
"loss": 0.4894,
"step": 217
},
{
"epoch": 0.5185845970859352,
"grad_norm": 0.39522895216941833,
"learning_rate": 8.763440860215054e-05,
"loss": 0.4661,
"step": 218
},
{
"epoch": 0.5209634255129348,
"grad_norm": 0.5012136101722717,
"learning_rate": 8.757467144563919e-05,
"loss": 0.4127,
"step": 219
},
{
"epoch": 0.5233422539399346,
"grad_norm": 0.4807124435901642,
"learning_rate": 8.751493428912784e-05,
"loss": 0.3925,
"step": 220
},
{
"epoch": 0.5257210823669343,
"grad_norm": 0.5423474311828613,
"learning_rate": 8.74551971326165e-05,
"loss": 0.4961,
"step": 221
},
{
"epoch": 0.528099910793934,
"grad_norm": 0.48710131645202637,
"learning_rate": 8.739545997610514e-05,
"loss": 0.425,
"step": 222
},
{
"epoch": 0.5304787392209337,
"grad_norm": 0.47647836804389954,
"learning_rate": 8.733572281959379e-05,
"loss": 0.4065,
"step": 223
},
{
"epoch": 0.5328575676479334,
"grad_norm": 0.5683121085166931,
"learning_rate": 8.727598566308244e-05,
"loss": 0.5268,
"step": 224
},
{
"epoch": 0.5352363960749331,
"grad_norm": 0.47146156430244446,
"learning_rate": 8.72162485065711e-05,
"loss": 0.4059,
"step": 225
},
{
"epoch": 0.5376152245019328,
"grad_norm": 0.460894376039505,
"learning_rate": 8.715651135005974e-05,
"loss": 0.4245,
"step": 226
},
{
"epoch": 0.5399940529289325,
"grad_norm": 0.5514392256736755,
"learning_rate": 8.709677419354839e-05,
"loss": 0.506,
"step": 227
},
{
"epoch": 0.5423728813559322,
"grad_norm": 0.5026625394821167,
"learning_rate": 8.703703703703704e-05,
"loss": 0.4574,
"step": 228
},
{
"epoch": 0.5447517097829319,
"grad_norm": 0.45298993587493896,
"learning_rate": 8.697729988052569e-05,
"loss": 0.3697,
"step": 229
},
{
"epoch": 0.5471305382099316,
"grad_norm": 0.5072413682937622,
"learning_rate": 8.691756272401434e-05,
"loss": 0.4165,
"step": 230
},
{
"epoch": 0.5495093666369313,
"grad_norm": 0.44066810607910156,
"learning_rate": 8.685782556750299e-05,
"loss": 0.4554,
"step": 231
},
{
"epoch": 0.551888195063931,
"grad_norm": 0.513289749622345,
"learning_rate": 8.679808841099164e-05,
"loss": 0.4498,
"step": 232
},
{
"epoch": 0.5542670234909307,
"grad_norm": 0.555833637714386,
"learning_rate": 8.673835125448029e-05,
"loss": 0.5903,
"step": 233
},
{
"epoch": 0.5566458519179304,
"grad_norm": 0.44822368025779724,
"learning_rate": 8.667861409796894e-05,
"loss": 0.3928,
"step": 234
},
{
"epoch": 0.5590246803449301,
"grad_norm": 0.44668492674827576,
"learning_rate": 8.661887694145759e-05,
"loss": 0.3957,
"step": 235
},
{
"epoch": 0.5614035087719298,
"grad_norm": 0.44891655445098877,
"learning_rate": 8.655913978494624e-05,
"loss": 0.4083,
"step": 236
},
{
"epoch": 0.5637823371989296,
"grad_norm": 0.4478503465652466,
"learning_rate": 8.649940262843489e-05,
"loss": 0.366,
"step": 237
},
{
"epoch": 0.5661611656259292,
"grad_norm": 0.5855295658111572,
"learning_rate": 8.643966547192354e-05,
"loss": 0.4751,
"step": 238
},
{
"epoch": 0.568539994052929,
"grad_norm": 0.536170244216919,
"learning_rate": 8.637992831541219e-05,
"loss": 0.4637,
"step": 239
},
{
"epoch": 0.5709188224799286,
"grad_norm": 0.6756762862205505,
"learning_rate": 8.632019115890084e-05,
"loss": 0.5934,
"step": 240
},
{
"epoch": 0.5732976509069283,
"grad_norm": 0.41015246510505676,
"learning_rate": 8.626045400238949e-05,
"loss": 0.3901,
"step": 241
},
{
"epoch": 0.5756764793339281,
"grad_norm": 0.4543169438838959,
"learning_rate": 8.620071684587814e-05,
"loss": 0.4198,
"step": 242
},
{
"epoch": 0.5780553077609277,
"grad_norm": 0.44006866216659546,
"learning_rate": 8.614097968936678e-05,
"loss": 0.3953,
"step": 243
},
{
"epoch": 0.5804341361879275,
"grad_norm": 0.4517867863178253,
"learning_rate": 8.608124253285544e-05,
"loss": 0.3867,
"step": 244
},
{
"epoch": 0.5828129646149272,
"grad_norm": 0.49809014797210693,
"learning_rate": 8.60215053763441e-05,
"loss": 0.5027,
"step": 245
},
{
"epoch": 0.5851917930419268,
"grad_norm": 0.5044611692428589,
"learning_rate": 8.596176821983274e-05,
"loss": 0.4684,
"step": 246
},
{
"epoch": 0.5875706214689266,
"grad_norm": 0.4469461441040039,
"learning_rate": 8.590203106332138e-05,
"loss": 0.3945,
"step": 247
},
{
"epoch": 0.5899494498959262,
"grad_norm": 0.4750162959098816,
"learning_rate": 8.584229390681004e-05,
"loss": 0.4048,
"step": 248
},
{
"epoch": 0.592328278322926,
"grad_norm": 0.4861357808113098,
"learning_rate": 8.57825567502987e-05,
"loss": 0.4435,
"step": 249
},
{
"epoch": 0.5947071067499257,
"grad_norm": 0.49261415004730225,
"learning_rate": 8.572281959378735e-05,
"loss": 0.4717,
"step": 250
},
{
"epoch": 0.5970859351769253,
"grad_norm": 0.5840692520141602,
"learning_rate": 8.566308243727598e-05,
"loss": 0.5115,
"step": 251
},
{
"epoch": 0.5994647636039251,
"grad_norm": 0.4797023832798004,
"learning_rate": 8.560334528076463e-05,
"loss": 0.4224,
"step": 252
},
{
"epoch": 0.6018435920309247,
"grad_norm": 0.5371966361999512,
"learning_rate": 8.55436081242533e-05,
"loss": 0.5191,
"step": 253
},
{
"epoch": 0.6042224204579245,
"grad_norm": 0.450199693441391,
"learning_rate": 8.548387096774195e-05,
"loss": 0.4073,
"step": 254
},
{
"epoch": 0.6066012488849242,
"grad_norm": 0.5445570945739746,
"learning_rate": 8.542413381123058e-05,
"loss": 0.4863,
"step": 255
},
{
"epoch": 0.6089800773119238,
"grad_norm": 0.4752117395401001,
"learning_rate": 8.536439665471923e-05,
"loss": 0.4522,
"step": 256
},
{
"epoch": 0.6113589057389236,
"grad_norm": 0.5417598485946655,
"learning_rate": 8.530465949820788e-05,
"loss": 0.4937,
"step": 257
},
{
"epoch": 0.6137377341659233,
"grad_norm": 0.4615798592567444,
"learning_rate": 8.524492234169655e-05,
"loss": 0.4858,
"step": 258
},
{
"epoch": 0.616116562592923,
"grad_norm": 0.46850234270095825,
"learning_rate": 8.518518518518518e-05,
"loss": 0.4354,
"step": 259
},
{
"epoch": 0.6184953910199227,
"grad_norm": 0.6526990532875061,
"learning_rate": 8.512544802867384e-05,
"loss": 0.5441,
"step": 260
},
{
"epoch": 0.6208742194469223,
"grad_norm": 0.42547082901000977,
"learning_rate": 8.506571087216249e-05,
"loss": 0.3834,
"step": 261
},
{
"epoch": 0.6232530478739221,
"grad_norm": 0.46142813563346863,
"learning_rate": 8.500597371565115e-05,
"loss": 0.4562,
"step": 262
},
{
"epoch": 0.6256318763009218,
"grad_norm": 0.42186999320983887,
"learning_rate": 8.494623655913979e-05,
"loss": 0.4341,
"step": 263
},
{
"epoch": 0.6280107047279215,
"grad_norm": 0.5033466815948486,
"learning_rate": 8.488649940262844e-05,
"loss": 0.4606,
"step": 264
},
{
"epoch": 0.6303895331549212,
"grad_norm": 0.4589903950691223,
"learning_rate": 8.482676224611709e-05,
"loss": 0.4232,
"step": 265
},
{
"epoch": 0.632768361581921,
"grad_norm": 0.43397510051727295,
"learning_rate": 8.476702508960574e-05,
"loss": 0.4604,
"step": 266
},
{
"epoch": 0.6351471900089206,
"grad_norm": 0.4586094319820404,
"learning_rate": 8.470728793309439e-05,
"loss": 0.4637,
"step": 267
},
{
"epoch": 0.6375260184359203,
"grad_norm": 0.4164815843105316,
"learning_rate": 8.464755077658304e-05,
"loss": 0.3969,
"step": 268
},
{
"epoch": 0.63990484686292,
"grad_norm": 0.5223293304443359,
"learning_rate": 8.458781362007169e-05,
"loss": 0.4753,
"step": 269
},
{
"epoch": 0.6422836752899197,
"grad_norm": 0.4807034134864807,
"learning_rate": 8.452807646356034e-05,
"loss": 0.4249,
"step": 270
},
{
"epoch": 0.6446625037169195,
"grad_norm": 0.39427343010902405,
"learning_rate": 8.446833930704899e-05,
"loss": 0.4014,
"step": 271
},
{
"epoch": 0.6470413321439191,
"grad_norm": 0.5013017654418945,
"learning_rate": 8.440860215053764e-05,
"loss": 0.505,
"step": 272
},
{
"epoch": 0.6494201605709188,
"grad_norm": 0.4361181855201721,
"learning_rate": 8.434886499402629e-05,
"loss": 0.442,
"step": 273
},
{
"epoch": 0.6517989889979185,
"grad_norm": 0.4051946699619293,
"learning_rate": 8.428912783751494e-05,
"loss": 0.3891,
"step": 274
},
{
"epoch": 0.6541778174249182,
"grad_norm": 0.48686203360557556,
"learning_rate": 8.422939068100359e-05,
"loss": 0.5328,
"step": 275
},
{
"epoch": 0.656556645851918,
"grad_norm": 0.4902230203151703,
"learning_rate": 8.416965352449224e-05,
"loss": 0.5243,
"step": 276
},
{
"epoch": 0.6589354742789176,
"grad_norm": 0.45863983035087585,
"learning_rate": 8.410991636798089e-05,
"loss": 0.3835,
"step": 277
},
{
"epoch": 0.6613143027059173,
"grad_norm": 0.5191627144813538,
"learning_rate": 8.405017921146954e-05,
"loss": 0.5338,
"step": 278
},
{
"epoch": 0.6636931311329171,
"grad_norm": 0.47656649351119995,
"learning_rate": 8.399044205495819e-05,
"loss": 0.4346,
"step": 279
},
{
"epoch": 0.6660719595599167,
"grad_norm": 0.5256597399711609,
"learning_rate": 8.393070489844683e-05,
"loss": 0.4918,
"step": 280
},
{
"epoch": 0.6684507879869165,
"grad_norm": 0.5173395872116089,
"learning_rate": 8.387096774193549e-05,
"loss": 0.4374,
"step": 281
},
{
"epoch": 0.6708296164139161,
"grad_norm": 0.4496103525161743,
"learning_rate": 8.381123058542414e-05,
"loss": 0.4269,
"step": 282
},
{
"epoch": 0.6732084448409158,
"grad_norm": 0.46055564284324646,
"learning_rate": 8.375149342891279e-05,
"loss": 0.407,
"step": 283
},
{
"epoch": 0.6755872732679156,
"grad_norm": 0.5227301120758057,
"learning_rate": 8.369175627240143e-05,
"loss": 0.5523,
"step": 284
},
{
"epoch": 0.6779661016949152,
"grad_norm": 0.44740229845046997,
"learning_rate": 8.363201911589009e-05,
"loss": 0.4504,
"step": 285
},
{
"epoch": 0.680344930121915,
"grad_norm": 0.4591153562068939,
"learning_rate": 8.357228195937874e-05,
"loss": 0.3691,
"step": 286
},
{
"epoch": 0.6827237585489146,
"grad_norm": 0.5262241959571838,
"learning_rate": 8.351254480286739e-05,
"loss": 0.4774,
"step": 287
},
{
"epoch": 0.6851025869759143,
"grad_norm": 0.4027566909790039,
"learning_rate": 8.345280764635603e-05,
"loss": 0.4508,
"step": 288
},
{
"epoch": 0.6874814154029141,
"grad_norm": 0.4727931618690491,
"learning_rate": 8.339307048984468e-05,
"loss": 0.4829,
"step": 289
},
{
"epoch": 0.6898602438299137,
"grad_norm": 0.5389280915260315,
"learning_rate": 8.333333333333334e-05,
"loss": 0.4643,
"step": 290
},
{
"epoch": 0.6922390722569135,
"grad_norm": 0.46183347702026367,
"learning_rate": 8.3273596176822e-05,
"loss": 0.348,
"step": 291
},
{
"epoch": 0.6946179006839132,
"grad_norm": 0.4557804465293884,
"learning_rate": 8.321385902031063e-05,
"loss": 0.3903,
"step": 292
},
{
"epoch": 0.6969967291109129,
"grad_norm": 0.49101144075393677,
"learning_rate": 8.315412186379928e-05,
"loss": 0.4898,
"step": 293
},
{
"epoch": 0.6993755575379126,
"grad_norm": 0.4704493284225464,
"learning_rate": 8.309438470728795e-05,
"loss": 0.4569,
"step": 294
},
{
"epoch": 0.7017543859649122,
"grad_norm": 0.42415690422058105,
"learning_rate": 8.30346475507766e-05,
"loss": 0.3992,
"step": 295
},
{
"epoch": 0.704133214391912,
"grad_norm": 0.48090270161628723,
"learning_rate": 8.297491039426523e-05,
"loss": 0.473,
"step": 296
},
{
"epoch": 0.7065120428189117,
"grad_norm": 0.4778149724006653,
"learning_rate": 8.291517323775388e-05,
"loss": 0.4849,
"step": 297
},
{
"epoch": 0.7088908712459114,
"grad_norm": 0.5467464923858643,
"learning_rate": 8.285543608124253e-05,
"loss": 0.502,
"step": 298
},
{
"epoch": 0.7112696996729111,
"grad_norm": 0.43591874837875366,
"learning_rate": 8.27956989247312e-05,
"loss": 0.4206,
"step": 299
},
{
"epoch": 0.7136485280999108,
"grad_norm": 0.45697838068008423,
"learning_rate": 8.273596176821983e-05,
"loss": 0.4414,
"step": 300
},
{
"epoch": 0.7136485280999108,
"eval_loss": 0.42019587755203247,
"eval_runtime": 24.6142,
"eval_samples_per_second": 30.389,
"eval_steps_per_second": 15.195,
"step": 300
},
{
"epoch": 0.7160273565269105,
"grad_norm": 0.5003380179405212,
"learning_rate": 8.267622461170848e-05,
"loss": 0.4433,
"step": 301
},
{
"epoch": 0.7184061849539102,
"grad_norm": 0.48988282680511475,
"learning_rate": 8.261648745519713e-05,
"loss": 0.4097,
"step": 302
},
{
"epoch": 0.72078501338091,
"grad_norm": 0.48710161447525024,
"learning_rate": 8.255675029868578e-05,
"loss": 0.5047,
"step": 303
},
{
"epoch": 0.7231638418079096,
"grad_norm": 0.49213966727256775,
"learning_rate": 8.249701314217443e-05,
"loss": 0.3946,
"step": 304
},
{
"epoch": 0.7255426702349093,
"grad_norm": 0.48221632838249207,
"learning_rate": 8.243727598566309e-05,
"loss": 0.4533,
"step": 305
},
{
"epoch": 0.727921498661909,
"grad_norm": 0.4673924446105957,
"learning_rate": 8.237753882915174e-05,
"loss": 0.4467,
"step": 306
},
{
"epoch": 0.7303003270889087,
"grad_norm": 0.4333641827106476,
"learning_rate": 8.231780167264039e-05,
"loss": 0.3813,
"step": 307
},
{
"epoch": 0.7326791555159085,
"grad_norm": 0.48312193155288696,
"learning_rate": 8.225806451612904e-05,
"loss": 0.4388,
"step": 308
},
{
"epoch": 0.7350579839429081,
"grad_norm": 0.4759059250354767,
"learning_rate": 8.219832735961769e-05,
"loss": 0.3819,
"step": 309
},
{
"epoch": 0.7374368123699078,
"grad_norm": 0.47491195797920227,
"learning_rate": 8.213859020310634e-05,
"loss": 0.4421,
"step": 310
},
{
"epoch": 0.7398156407969075,
"grad_norm": 0.4345269799232483,
"learning_rate": 8.207885304659499e-05,
"loss": 0.4134,
"step": 311
},
{
"epoch": 0.7421944692239072,
"grad_norm": 0.48786696791648865,
"learning_rate": 8.201911589008364e-05,
"loss": 0.3868,
"step": 312
},
{
"epoch": 0.744573297650907,
"grad_norm": 0.5398519039154053,
"learning_rate": 8.195937873357229e-05,
"loss": 0.4272,
"step": 313
},
{
"epoch": 0.7469521260779066,
"grad_norm": 0.4489103853702545,
"learning_rate": 8.189964157706094e-05,
"loss": 0.4376,
"step": 314
},
{
"epoch": 0.7493309545049064,
"grad_norm": 0.4804931581020355,
"learning_rate": 8.183990442054959e-05,
"loss": 0.4128,
"step": 315
},
{
"epoch": 0.751709782931906,
"grad_norm": 0.5021992921829224,
"learning_rate": 8.178016726403824e-05,
"loss": 0.4401,
"step": 316
},
{
"epoch": 0.7540886113589057,
"grad_norm": 0.6202664375305176,
"learning_rate": 8.172043010752689e-05,
"loss": 0.5967,
"step": 317
},
{
"epoch": 0.7564674397859055,
"grad_norm": 0.4981567859649658,
"learning_rate": 8.166069295101554e-05,
"loss": 0.5046,
"step": 318
},
{
"epoch": 0.7588462682129051,
"grad_norm": 0.4543679654598236,
"learning_rate": 8.160095579450419e-05,
"loss": 0.4256,
"step": 319
},
{
"epoch": 0.7612250966399049,
"grad_norm": 0.48019784688949585,
"learning_rate": 8.154121863799284e-05,
"loss": 0.4365,
"step": 320
},
{
"epoch": 0.7636039250669046,
"grad_norm": 0.6639605164527893,
"learning_rate": 8.148148148148148e-05,
"loss": 0.5101,
"step": 321
},
{
"epoch": 0.7659827534939042,
"grad_norm": 0.4873949885368347,
"learning_rate": 8.142174432497014e-05,
"loss": 0.4971,
"step": 322
},
{
"epoch": 0.768361581920904,
"grad_norm": 0.4552108943462372,
"learning_rate": 8.136200716845879e-05,
"loss": 0.3995,
"step": 323
},
{
"epoch": 0.7707404103479036,
"grad_norm": 0.42462024092674255,
"learning_rate": 8.130227001194744e-05,
"loss": 0.4306,
"step": 324
},
{
"epoch": 0.7731192387749034,
"grad_norm": 0.47372832894325256,
"learning_rate": 8.124253285543608e-05,
"loss": 0.3956,
"step": 325
},
{
"epoch": 0.7754980672019031,
"grad_norm": 0.41488945484161377,
"learning_rate": 8.118279569892473e-05,
"loss": 0.4634,
"step": 326
},
{
"epoch": 0.7778768956289027,
"grad_norm": 0.41355305910110474,
"learning_rate": 8.112305854241339e-05,
"loss": 0.3903,
"step": 327
},
{
"epoch": 0.7802557240559025,
"grad_norm": 0.4431304335594177,
"learning_rate": 8.106332138590204e-05,
"loss": 0.3842,
"step": 328
},
{
"epoch": 0.7826345524829021,
"grad_norm": 0.41997721791267395,
"learning_rate": 8.100358422939068e-05,
"loss": 0.4125,
"step": 329
},
{
"epoch": 0.7850133809099019,
"grad_norm": 0.5141931772232056,
"learning_rate": 8.094384707287933e-05,
"loss": 0.4582,
"step": 330
},
{
"epoch": 0.7873922093369016,
"grad_norm": 0.48614996671676636,
"learning_rate": 8.088410991636799e-05,
"loss": 0.417,
"step": 331
},
{
"epoch": 0.7897710377639012,
"grad_norm": 0.5671504139900208,
"learning_rate": 8.082437275985664e-05,
"loss": 0.5316,
"step": 332
},
{
"epoch": 0.792149866190901,
"grad_norm": 0.483743280172348,
"learning_rate": 8.076463560334528e-05,
"loss": 0.4018,
"step": 333
},
{
"epoch": 0.7945286946179007,
"grad_norm": 0.5026981234550476,
"learning_rate": 8.070489844683393e-05,
"loss": 0.415,
"step": 334
},
{
"epoch": 0.7969075230449004,
"grad_norm": 0.4384533762931824,
"learning_rate": 8.064516129032258e-05,
"loss": 0.4116,
"step": 335
},
{
"epoch": 0.7992863514719001,
"grad_norm": 0.5437987446784973,
"learning_rate": 8.058542413381124e-05,
"loss": 0.3674,
"step": 336
},
{
"epoch": 0.8016651798988997,
"grad_norm": 0.4940624237060547,
"learning_rate": 8.052568697729988e-05,
"loss": 0.4128,
"step": 337
},
{
"epoch": 0.8040440083258995,
"grad_norm": 0.46025657653808594,
"learning_rate": 8.046594982078853e-05,
"loss": 0.4514,
"step": 338
},
{
"epoch": 0.8064228367528992,
"grad_norm": 0.48849228024482727,
"learning_rate": 8.040621266427718e-05,
"loss": 0.4903,
"step": 339
},
{
"epoch": 0.8088016651798989,
"grad_norm": 0.49576497077941895,
"learning_rate": 8.034647550776585e-05,
"loss": 0.3421,
"step": 340
},
{
"epoch": 0.8111804936068986,
"grad_norm": 0.44594496488571167,
"learning_rate": 8.028673835125448e-05,
"loss": 0.4878,
"step": 341
},
{
"epoch": 0.8135593220338984,
"grad_norm": 0.535062849521637,
"learning_rate": 8.022700119474313e-05,
"loss": 0.4713,
"step": 342
},
{
"epoch": 0.815938150460898,
"grad_norm": 0.4524611532688141,
"learning_rate": 8.016726403823178e-05,
"loss": 0.4416,
"step": 343
},
{
"epoch": 0.8183169788878977,
"grad_norm": 0.5021877288818359,
"learning_rate": 8.010752688172043e-05,
"loss": 0.4265,
"step": 344
},
{
"epoch": 0.8206958073148974,
"grad_norm": 0.5109665989875793,
"learning_rate": 8.004778972520908e-05,
"loss": 0.4438,
"step": 345
},
{
"epoch": 0.8230746357418971,
"grad_norm": 0.46363264322280884,
"learning_rate": 7.998805256869773e-05,
"loss": 0.4209,
"step": 346
},
{
"epoch": 0.8254534641688969,
"grad_norm": 0.47233110666275024,
"learning_rate": 7.992831541218638e-05,
"loss": 0.4135,
"step": 347
},
{
"epoch": 0.8278322925958965,
"grad_norm": 0.42109566926956177,
"learning_rate": 7.986857825567503e-05,
"loss": 0.4168,
"step": 348
},
{
"epoch": 0.8302111210228962,
"grad_norm": 0.4773065149784088,
"learning_rate": 7.980884109916368e-05,
"loss": 0.4437,
"step": 349
},
{
"epoch": 0.832589949449896,
"grad_norm": 0.5352240800857544,
"learning_rate": 7.974910394265234e-05,
"loss": 0.3954,
"step": 350
},
{
"epoch": 0.8349687778768956,
"grad_norm": 0.3983041048049927,
"learning_rate": 7.968936678614099e-05,
"loss": 0.3578,
"step": 351
},
{
"epoch": 0.8373476063038954,
"grad_norm": 0.4744306206703186,
"learning_rate": 7.962962962962964e-05,
"loss": 0.4262,
"step": 352
},
{
"epoch": 0.839726434730895,
"grad_norm": 0.4249997138977051,
"learning_rate": 7.956989247311829e-05,
"loss": 0.4078,
"step": 353
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.4846518337726593,
"learning_rate": 7.951015531660694e-05,
"loss": 0.4084,
"step": 354
},
{
"epoch": 0.8444840915848945,
"grad_norm": 0.5177565217018127,
"learning_rate": 7.945041816009559e-05,
"loss": 0.4306,
"step": 355
},
{
"epoch": 0.8468629200118941,
"grad_norm": 0.5126465559005737,
"learning_rate": 7.939068100358424e-05,
"loss": 0.4932,
"step": 356
},
{
"epoch": 0.8492417484388939,
"grad_norm": 0.45076677203178406,
"learning_rate": 7.933094384707289e-05,
"loss": 0.3486,
"step": 357
},
{
"epoch": 0.8516205768658935,
"grad_norm": 0.4830567538738251,
"learning_rate": 7.927120669056152e-05,
"loss": 0.4967,
"step": 358
},
{
"epoch": 0.8539994052928932,
"grad_norm": 0.436100572347641,
"learning_rate": 7.921146953405019e-05,
"loss": 0.4028,
"step": 359
},
{
"epoch": 0.856378233719893,
"grad_norm": 0.5462535619735718,
"learning_rate": 7.915173237753884e-05,
"loss": 0.3865,
"step": 360
},
{
"epoch": 0.8587570621468926,
"grad_norm": 0.5388931632041931,
"learning_rate": 7.909199522102749e-05,
"loss": 0.4424,
"step": 361
},
{
"epoch": 0.8611358905738924,
"grad_norm": 0.5704916715621948,
"learning_rate": 7.903225806451613e-05,
"loss": 0.5744,
"step": 362
},
{
"epoch": 0.863514719000892,
"grad_norm": 0.442672461271286,
"learning_rate": 7.897252090800479e-05,
"loss": 0.4299,
"step": 363
},
{
"epoch": 0.8658935474278918,
"grad_norm": 0.4462336599826813,
"learning_rate": 7.891278375149344e-05,
"loss": 0.4498,
"step": 364
},
{
"epoch": 0.8682723758548915,
"grad_norm": 0.5641991496086121,
"learning_rate": 7.885304659498209e-05,
"loss": 0.3614,
"step": 365
},
{
"epoch": 0.8706512042818911,
"grad_norm": 0.47875943779945374,
"learning_rate": 7.879330943847073e-05,
"loss": 0.4333,
"step": 366
},
{
"epoch": 0.8730300327088909,
"grad_norm": 0.4294171929359436,
"learning_rate": 7.873357228195938e-05,
"loss": 0.4294,
"step": 367
},
{
"epoch": 0.8754088611358906,
"grad_norm": 0.499260812997818,
"learning_rate": 7.867383512544804e-05,
"loss": 0.4785,
"step": 368
},
{
"epoch": 0.8777876895628903,
"grad_norm": 0.4474776089191437,
"learning_rate": 7.861409796893669e-05,
"loss": 0.3835,
"step": 369
},
{
"epoch": 0.88016651798989,
"grad_norm": 0.4485014081001282,
"learning_rate": 7.855436081242533e-05,
"loss": 0.4452,
"step": 370
},
{
"epoch": 0.8825453464168896,
"grad_norm": 0.45007187128067017,
"learning_rate": 7.849462365591398e-05,
"loss": 0.4115,
"step": 371
},
{
"epoch": 0.8849241748438894,
"grad_norm": 0.5280534029006958,
"learning_rate": 7.843488649940263e-05,
"loss": 0.4891,
"step": 372
},
{
"epoch": 0.8873030032708891,
"grad_norm": 0.45218995213508606,
"learning_rate": 7.837514934289129e-05,
"loss": 0.4952,
"step": 373
},
{
"epoch": 0.8896818316978888,
"grad_norm": 0.4186769425868988,
"learning_rate": 7.831541218637993e-05,
"loss": 0.3688,
"step": 374
},
{
"epoch": 0.8920606601248885,
"grad_norm": 0.45719078183174133,
"learning_rate": 7.825567502986858e-05,
"loss": 0.3713,
"step": 375
},
{
"epoch": 0.8944394885518882,
"grad_norm": 0.5285611748695374,
"learning_rate": 7.819593787335723e-05,
"loss": 0.4358,
"step": 376
},
{
"epoch": 0.8968183169788879,
"grad_norm": 0.4786120653152466,
"learning_rate": 7.81362007168459e-05,
"loss": 0.3769,
"step": 377
},
{
"epoch": 0.8991971454058876,
"grad_norm": 0.42901334166526794,
"learning_rate": 7.807646356033453e-05,
"loss": 0.3876,
"step": 378
},
{
"epoch": 0.9015759738328873,
"grad_norm": 0.5145654082298279,
"learning_rate": 7.801672640382318e-05,
"loss": 0.4675,
"step": 379
},
{
"epoch": 0.903954802259887,
"grad_norm": 0.49353912472724915,
"learning_rate": 7.795698924731183e-05,
"loss": 0.4615,
"step": 380
},
{
"epoch": 0.9063336306868867,
"grad_norm": 0.5090721845626831,
"learning_rate": 7.789725209080048e-05,
"loss": 0.4628,
"step": 381
},
{
"epoch": 0.9087124591138864,
"grad_norm": 0.4594820737838745,
"learning_rate": 7.783751493428913e-05,
"loss": 0.4605,
"step": 382
},
{
"epoch": 0.9110912875408861,
"grad_norm": 0.5224221348762512,
"learning_rate": 7.777777777777778e-05,
"loss": 0.3948,
"step": 383
},
{
"epoch": 0.9134701159678859,
"grad_norm": 0.4966912567615509,
"learning_rate": 7.771804062126643e-05,
"loss": 0.4646,
"step": 384
},
{
"epoch": 0.9158489443948855,
"grad_norm": 0.41586506366729736,
"learning_rate": 7.765830346475508e-05,
"loss": 0.3713,
"step": 385
},
{
"epoch": 0.9182277728218852,
"grad_norm": 0.44495782256126404,
"learning_rate": 7.759856630824373e-05,
"loss": 0.4071,
"step": 386
},
{
"epoch": 0.9206066012488849,
"grad_norm": 0.4939152002334595,
"learning_rate": 7.753882915173238e-05,
"loss": 0.5035,
"step": 387
},
{
"epoch": 0.9229854296758846,
"grad_norm": 0.5020651817321777,
"learning_rate": 7.747909199522103e-05,
"loss": 0.458,
"step": 388
},
{
"epoch": 0.9253642581028844,
"grad_norm": 0.5235796570777893,
"learning_rate": 7.741935483870968e-05,
"loss": 0.4411,
"step": 389
},
{
"epoch": 0.927743086529884,
"grad_norm": 0.47786203026771545,
"learning_rate": 7.735961768219832e-05,
"loss": 0.4452,
"step": 390
},
{
"epoch": 0.9301219149568838,
"grad_norm": 0.5545435547828674,
"learning_rate": 7.729988052568698e-05,
"loss": 0.4522,
"step": 391
},
{
"epoch": 0.9325007433838834,
"grad_norm": 0.46032214164733887,
"learning_rate": 7.724014336917563e-05,
"loss": 0.3926,
"step": 392
},
{
"epoch": 0.9348795718108831,
"grad_norm": 0.49607938528060913,
"learning_rate": 7.718040621266428e-05,
"loss": 0.4277,
"step": 393
},
{
"epoch": 0.9372584002378829,
"grad_norm": 0.558600664138794,
"learning_rate": 7.712066905615292e-05,
"loss": 0.4564,
"step": 394
},
{
"epoch": 0.9396372286648825,
"grad_norm": 0.47056880593299866,
"learning_rate": 7.706093189964157e-05,
"loss": 0.3932,
"step": 395
},
{
"epoch": 0.9420160570918823,
"grad_norm": 0.4679563343524933,
"learning_rate": 7.700119474313024e-05,
"loss": 0.3767,
"step": 396
},
{
"epoch": 0.944394885518882,
"grad_norm": 0.5144210457801819,
"learning_rate": 7.694145758661889e-05,
"loss": 0.463,
"step": 397
},
{
"epoch": 0.9467737139458816,
"grad_norm": 0.538809597492218,
"learning_rate": 7.688172043010752e-05,
"loss": 0.422,
"step": 398
},
{
"epoch": 0.9491525423728814,
"grad_norm": 0.4757307767868042,
"learning_rate": 7.682198327359617e-05,
"loss": 0.3726,
"step": 399
},
{
"epoch": 0.951531370799881,
"grad_norm": 0.5315593481063843,
"learning_rate": 7.676224611708484e-05,
"loss": 0.4213,
"step": 400
},
{
"epoch": 0.951531370799881,
"eval_loss": 0.40764564275741577,
"eval_runtime": 24.5873,
"eval_samples_per_second": 30.422,
"eval_steps_per_second": 15.211,
"step": 400
},
{
"epoch": 0.9539101992268808,
"grad_norm": 0.5194830298423767,
"learning_rate": 7.670250896057349e-05,
"loss": 0.4463,
"step": 401
},
{
"epoch": 0.9562890276538805,
"grad_norm": 0.49560976028442383,
"learning_rate": 7.664277180406212e-05,
"loss": 0.3966,
"step": 402
},
{
"epoch": 0.9586678560808801,
"grad_norm": 0.5115331411361694,
"learning_rate": 7.658303464755077e-05,
"loss": 0.4229,
"step": 403
},
{
"epoch": 0.9610466845078799,
"grad_norm": 0.4300745725631714,
"learning_rate": 7.652329749103942e-05,
"loss": 0.4448,
"step": 404
},
{
"epoch": 0.9634255129348795,
"grad_norm": 0.4714706242084503,
"learning_rate": 7.646356033452809e-05,
"loss": 0.5054,
"step": 405
},
{
"epoch": 0.9658043413618793,
"grad_norm": 0.4393133521080017,
"learning_rate": 7.640382317801672e-05,
"loss": 0.3574,
"step": 406
},
{
"epoch": 0.968183169788879,
"grad_norm": 0.47342050075531006,
"learning_rate": 7.634408602150538e-05,
"loss": 0.3789,
"step": 407
},
{
"epoch": 0.9705619982158786,
"grad_norm": 0.4490165114402771,
"learning_rate": 7.628434886499403e-05,
"loss": 0.386,
"step": 408
},
{
"epoch": 0.9729408266428784,
"grad_norm": 0.3915969133377075,
"learning_rate": 7.622461170848269e-05,
"loss": 0.3605,
"step": 409
},
{
"epoch": 0.975319655069878,
"grad_norm": 0.4768807888031006,
"learning_rate": 7.616487455197133e-05,
"loss": 0.4476,
"step": 410
},
{
"epoch": 0.9776984834968778,
"grad_norm": 0.5957807302474976,
"learning_rate": 7.610513739545998e-05,
"loss": 0.3811,
"step": 411
},
{
"epoch": 0.9800773119238775,
"grad_norm": 0.4522106647491455,
"learning_rate": 7.604540023894863e-05,
"loss": 0.3792,
"step": 412
},
{
"epoch": 0.9824561403508771,
"grad_norm": 0.49110835790634155,
"learning_rate": 7.598566308243728e-05,
"loss": 0.4261,
"step": 413
},
{
"epoch": 0.9848349687778769,
"grad_norm": 0.49582499265670776,
"learning_rate": 7.592592592592593e-05,
"loss": 0.3845,
"step": 414
},
{
"epoch": 0.9872137972048766,
"grad_norm": 0.4370538890361786,
"learning_rate": 7.586618876941458e-05,
"loss": 0.36,
"step": 415
},
{
"epoch": 0.9895926256318763,
"grad_norm": 0.473349928855896,
"learning_rate": 7.580645161290323e-05,
"loss": 0.419,
"step": 416
},
{
"epoch": 0.991971454058876,
"grad_norm": 0.5078774094581604,
"learning_rate": 7.574671445639188e-05,
"loss": 0.4279,
"step": 417
},
{
"epoch": 0.9943502824858758,
"grad_norm": 0.4728812873363495,
"learning_rate": 7.568697729988053e-05,
"loss": 0.4926,
"step": 418
},
{
"epoch": 0.9967291109128754,
"grad_norm": 0.5215616822242737,
"learning_rate": 7.562724014336918e-05,
"loss": 0.4064,
"step": 419
},
{
"epoch": 0.9991079393398751,
"grad_norm": 0.5137555003166199,
"learning_rate": 7.556750298685783e-05,
"loss": 0.4205,
"step": 420
},
{
"epoch": 1.0,
"grad_norm": 0.7743244171142578,
"learning_rate": 7.550776583034648e-05,
"loss": 0.3431,
"step": 421
},
{
"epoch": 1.0023788284269997,
"grad_norm": 0.46033695340156555,
"learning_rate": 7.544802867383513e-05,
"loss": 0.3297,
"step": 422
},
{
"epoch": 1.0047576568539993,
"grad_norm": 0.423931747674942,
"learning_rate": 7.538829151732378e-05,
"loss": 0.377,
"step": 423
},
{
"epoch": 1.0071364852809992,
"grad_norm": 0.45330631732940674,
"learning_rate": 7.532855436081243e-05,
"loss": 0.4413,
"step": 424
},
{
"epoch": 1.0095153137079989,
"grad_norm": 0.4233931601047516,
"learning_rate": 7.526881720430108e-05,
"loss": 0.3698,
"step": 425
},
{
"epoch": 1.0118941421349985,
"grad_norm": 0.44395965337753296,
"learning_rate": 7.520908004778973e-05,
"loss": 0.3871,
"step": 426
},
{
"epoch": 1.0142729705619982,
"grad_norm": 0.47254008054733276,
"learning_rate": 7.514934289127837e-05,
"loss": 0.3216,
"step": 427
},
{
"epoch": 1.0166517989889978,
"grad_norm": 0.4359164237976074,
"learning_rate": 7.508960573476703e-05,
"loss": 0.3463,
"step": 428
},
{
"epoch": 1.0190306274159977,
"grad_norm": 0.4071188271045685,
"learning_rate": 7.502986857825568e-05,
"loss": 0.3649,
"step": 429
},
{
"epoch": 1.0214094558429974,
"grad_norm": 0.4662891924381256,
"learning_rate": 7.497013142174433e-05,
"loss": 0.3709,
"step": 430
},
{
"epoch": 1.023788284269997,
"grad_norm": 0.4861668348312378,
"learning_rate": 7.491039426523297e-05,
"loss": 0.3461,
"step": 431
},
{
"epoch": 1.0261671126969967,
"grad_norm": 0.44614556431770325,
"learning_rate": 7.485065710872163e-05,
"loss": 0.348,
"step": 432
},
{
"epoch": 1.0285459411239963,
"grad_norm": 0.5915560722351074,
"learning_rate": 7.479091995221028e-05,
"loss": 0.4335,
"step": 433
},
{
"epoch": 1.0309247695509962,
"grad_norm": 0.5173319578170776,
"learning_rate": 7.473118279569893e-05,
"loss": 0.4246,
"step": 434
},
{
"epoch": 1.0333035979779959,
"grad_norm": 0.5578414797782898,
"learning_rate": 7.467144563918757e-05,
"loss": 0.45,
"step": 435
},
{
"epoch": 1.0356824264049955,
"grad_norm": 0.4671054184436798,
"learning_rate": 7.461170848267622e-05,
"loss": 0.4,
"step": 436
},
{
"epoch": 1.0380612548319952,
"grad_norm": 0.5529332756996155,
"learning_rate": 7.455197132616488e-05,
"loss": 0.3572,
"step": 437
},
{
"epoch": 1.0404400832589948,
"grad_norm": 0.39636898040771484,
"learning_rate": 7.449223416965353e-05,
"loss": 0.3505,
"step": 438
},
{
"epoch": 1.0428189116859947,
"grad_norm": 0.48718351125717163,
"learning_rate": 7.443249701314217e-05,
"loss": 0.3856,
"step": 439
},
{
"epoch": 1.0451977401129944,
"grad_norm": 0.45662057399749756,
"learning_rate": 7.437275985663082e-05,
"loss": 0.3257,
"step": 440
},
{
"epoch": 1.047576568539994,
"grad_norm": 0.5234172344207764,
"learning_rate": 7.431302270011949e-05,
"loss": 0.3933,
"step": 441
},
{
"epoch": 1.0499553969669937,
"grad_norm": 0.46789664030075073,
"learning_rate": 7.425328554360814e-05,
"loss": 0.3381,
"step": 442
},
{
"epoch": 1.0523342253939933,
"grad_norm": 0.4628084897994995,
"learning_rate": 7.419354838709677e-05,
"loss": 0.3924,
"step": 443
},
{
"epoch": 1.0547130538209932,
"grad_norm": 0.44630610942840576,
"learning_rate": 7.413381123058542e-05,
"loss": 0.3711,
"step": 444
},
{
"epoch": 1.0570918822479929,
"grad_norm": 0.49540749192237854,
"learning_rate": 7.407407407407407e-05,
"loss": 0.3633,
"step": 445
},
{
"epoch": 1.0594707106749925,
"grad_norm": 0.4955281913280487,
"learning_rate": 7.401433691756274e-05,
"loss": 0.3842,
"step": 446
},
{
"epoch": 1.0618495391019922,
"grad_norm": 0.512876570224762,
"learning_rate": 7.395459976105137e-05,
"loss": 0.3781,
"step": 447
},
{
"epoch": 1.064228367528992,
"grad_norm": 0.45539894700050354,
"learning_rate": 7.389486260454002e-05,
"loss": 0.3227,
"step": 448
},
{
"epoch": 1.0666071959559917,
"grad_norm": 0.48431605100631714,
"learning_rate": 7.383512544802867e-05,
"loss": 0.3943,
"step": 449
},
{
"epoch": 1.0689860243829914,
"grad_norm": 0.4580828547477722,
"learning_rate": 7.377538829151732e-05,
"loss": 0.3791,
"step": 450
},
{
"epoch": 1.071364852809991,
"grad_norm": 0.48635613918304443,
"learning_rate": 7.371565113500597e-05,
"loss": 0.2822,
"step": 451
},
{
"epoch": 1.0737436812369907,
"grad_norm": 0.5143231153488159,
"learning_rate": 7.365591397849463e-05,
"loss": 0.3912,
"step": 452
},
{
"epoch": 1.0761225096639904,
"grad_norm": 0.44039368629455566,
"learning_rate": 7.359617682198328e-05,
"loss": 0.2943,
"step": 453
},
{
"epoch": 1.0785013380909902,
"grad_norm": 0.5627120137214661,
"learning_rate": 7.353643966547193e-05,
"loss": 0.4064,
"step": 454
},
{
"epoch": 1.08088016651799,
"grad_norm": 0.539448618888855,
"learning_rate": 7.347670250896058e-05,
"loss": 0.432,
"step": 455
},
{
"epoch": 1.0832589949449896,
"grad_norm": 0.5089407563209534,
"learning_rate": 7.341696535244923e-05,
"loss": 0.3737,
"step": 456
},
{
"epoch": 1.0856378233719892,
"grad_norm": 0.5352144837379456,
"learning_rate": 7.335722819593788e-05,
"loss": 0.4279,
"step": 457
},
{
"epoch": 1.088016651798989,
"grad_norm": 0.49480384588241577,
"learning_rate": 7.329749103942653e-05,
"loss": 0.3251,
"step": 458
},
{
"epoch": 1.0903954802259888,
"grad_norm": 0.49400433897972107,
"learning_rate": 7.323775388291518e-05,
"loss": 0.321,
"step": 459
},
{
"epoch": 1.0927743086529884,
"grad_norm": 0.4779919981956482,
"learning_rate": 7.317801672640383e-05,
"loss": 0.3446,
"step": 460
},
{
"epoch": 1.095153137079988,
"grad_norm": 0.5801271796226501,
"learning_rate": 7.311827956989248e-05,
"loss": 0.4476,
"step": 461
},
{
"epoch": 1.0975319655069877,
"grad_norm": 0.49812352657318115,
"learning_rate": 7.305854241338113e-05,
"loss": 0.3112,
"step": 462
},
{
"epoch": 1.0999107939339876,
"grad_norm": 0.47404006123542786,
"learning_rate": 7.299880525686978e-05,
"loss": 0.3562,
"step": 463
},
{
"epoch": 1.1022896223609873,
"grad_norm": 0.5289527177810669,
"learning_rate": 7.293906810035843e-05,
"loss": 0.3986,
"step": 464
},
{
"epoch": 1.104668450787987,
"grad_norm": 0.5155789256095886,
"learning_rate": 7.287933094384708e-05,
"loss": 0.3943,
"step": 465
},
{
"epoch": 1.1070472792149866,
"grad_norm": 0.5496542453765869,
"learning_rate": 7.281959378733573e-05,
"loss": 0.398,
"step": 466
},
{
"epoch": 1.1094261076419862,
"grad_norm": 0.47129660844802856,
"learning_rate": 7.275985663082438e-05,
"loss": 0.3039,
"step": 467
},
{
"epoch": 1.1118049360689861,
"grad_norm": 0.4683108627796173,
"learning_rate": 7.270011947431302e-05,
"loss": 0.3282,
"step": 468
},
{
"epoch": 1.1141837644959858,
"grad_norm": 0.49074095487594604,
"learning_rate": 7.264038231780168e-05,
"loss": 0.3097,
"step": 469
},
{
"epoch": 1.1165625929229854,
"grad_norm": 0.47383546829223633,
"learning_rate": 7.258064516129033e-05,
"loss": 0.3536,
"step": 470
},
{
"epoch": 1.118941421349985,
"grad_norm": 0.5356380939483643,
"learning_rate": 7.252090800477898e-05,
"loss": 0.3153,
"step": 471
},
{
"epoch": 1.1213202497769847,
"grad_norm": 0.46198195219039917,
"learning_rate": 7.246117084826762e-05,
"loss": 0.3815,
"step": 472
},
{
"epoch": 1.1236990782039846,
"grad_norm": 0.6104227304458618,
"learning_rate": 7.240143369175627e-05,
"loss": 0.4299,
"step": 473
},
{
"epoch": 1.1260779066309843,
"grad_norm": 0.4720747172832489,
"learning_rate": 7.234169653524493e-05,
"loss": 0.3439,
"step": 474
},
{
"epoch": 1.128456735057984,
"grad_norm": 0.5005800724029541,
"learning_rate": 7.228195937873358e-05,
"loss": 0.35,
"step": 475
},
{
"epoch": 1.1308355634849836,
"grad_norm": 0.48051440715789795,
"learning_rate": 7.222222222222222e-05,
"loss": 0.3473,
"step": 476
},
{
"epoch": 1.1332143919119833,
"grad_norm": 0.5383554697036743,
"learning_rate": 7.216248506571087e-05,
"loss": 0.3363,
"step": 477
},
{
"epoch": 1.1355932203389831,
"grad_norm": 0.5020559430122375,
"learning_rate": 7.210274790919953e-05,
"loss": 0.3175,
"step": 478
},
{
"epoch": 1.1379720487659828,
"grad_norm": 0.523763120174408,
"learning_rate": 7.204301075268818e-05,
"loss": 0.3663,
"step": 479
},
{
"epoch": 1.1403508771929824,
"grad_norm": 0.45200997591018677,
"learning_rate": 7.198327359617682e-05,
"loss": 0.3154,
"step": 480
},
{
"epoch": 1.142729705619982,
"grad_norm": 0.47503340244293213,
"learning_rate": 7.192353643966547e-05,
"loss": 0.2779,
"step": 481
},
{
"epoch": 1.1451085340469818,
"grad_norm": 0.47074899077415466,
"learning_rate": 7.186379928315412e-05,
"loss": 0.3705,
"step": 482
},
{
"epoch": 1.1474873624739816,
"grad_norm": 0.6446363925933838,
"learning_rate": 7.180406212664278e-05,
"loss": 0.3343,
"step": 483
},
{
"epoch": 1.1498661909009813,
"grad_norm": 0.516174852848053,
"learning_rate": 7.174432497013142e-05,
"loss": 0.3437,
"step": 484
},
{
"epoch": 1.152245019327981,
"grad_norm": 0.4703758955001831,
"learning_rate": 7.168458781362007e-05,
"loss": 0.3604,
"step": 485
},
{
"epoch": 1.1546238477549806,
"grad_norm": 0.4764377176761627,
"learning_rate": 7.162485065710872e-05,
"loss": 0.3724,
"step": 486
},
{
"epoch": 1.1570026761819805,
"grad_norm": 0.5020294785499573,
"learning_rate": 7.156511350059739e-05,
"loss": 0.3681,
"step": 487
},
{
"epoch": 1.1593815046089802,
"grad_norm": 0.5461302399635315,
"learning_rate": 7.150537634408602e-05,
"loss": 0.3805,
"step": 488
},
{
"epoch": 1.1617603330359798,
"grad_norm": 0.5569891333580017,
"learning_rate": 7.144563918757467e-05,
"loss": 0.4203,
"step": 489
},
{
"epoch": 1.1641391614629795,
"grad_norm": 0.5478572845458984,
"learning_rate": 7.138590203106332e-05,
"loss": 0.4285,
"step": 490
},
{
"epoch": 1.1665179898899791,
"grad_norm": 0.5138799548149109,
"learning_rate": 7.132616487455197e-05,
"loss": 0.3738,
"step": 491
},
{
"epoch": 1.1688968183169788,
"grad_norm": 0.5300459265708923,
"learning_rate": 7.126642771804062e-05,
"loss": 0.411,
"step": 492
},
{
"epoch": 1.1712756467439787,
"grad_norm": 0.4664587378501892,
"learning_rate": 7.120669056152927e-05,
"loss": 0.3519,
"step": 493
},
{
"epoch": 1.1736544751709783,
"grad_norm": 0.4465605616569519,
"learning_rate": 7.114695340501792e-05,
"loss": 0.3515,
"step": 494
},
{
"epoch": 1.176033303597978,
"grad_norm": 0.48844248056411743,
"learning_rate": 7.108721624850657e-05,
"loss": 0.3656,
"step": 495
},
{
"epoch": 1.1784121320249776,
"grad_norm": 0.4901794195175171,
"learning_rate": 7.102747909199522e-05,
"loss": 0.3904,
"step": 496
},
{
"epoch": 1.1807909604519775,
"grad_norm": 0.5040355324745178,
"learning_rate": 7.096774193548388e-05,
"loss": 0.3242,
"step": 497
},
{
"epoch": 1.1831697888789772,
"grad_norm": 0.5615472793579102,
"learning_rate": 7.090800477897253e-05,
"loss": 0.3417,
"step": 498
},
{
"epoch": 1.1855486173059768,
"grad_norm": 0.4797894358634949,
"learning_rate": 7.084826762246118e-05,
"loss": 0.3423,
"step": 499
},
{
"epoch": 1.1879274457329765,
"grad_norm": 0.4823111891746521,
"learning_rate": 7.078853046594983e-05,
"loss": 0.35,
"step": 500
},
{
"epoch": 1.1879274457329765,
"eval_loss": 0.4021657109260559,
"eval_runtime": 24.5893,
"eval_samples_per_second": 30.42,
"eval_steps_per_second": 15.21,
"step": 500
},
{
"epoch": 1.1903062741599761,
"grad_norm": 0.5362781286239624,
"learning_rate": 7.072879330943848e-05,
"loss": 0.3355,
"step": 501
},
{
"epoch": 1.1926851025869758,
"grad_norm": 0.5271429419517517,
"learning_rate": 7.066905615292713e-05,
"loss": 0.3473,
"step": 502
},
{
"epoch": 1.1950639310139757,
"grad_norm": 0.48875752091407776,
"learning_rate": 7.060931899641578e-05,
"loss": 0.3527,
"step": 503
},
{
"epoch": 1.1974427594409753,
"grad_norm": 0.5725129842758179,
"learning_rate": 7.054958183990443e-05,
"loss": 0.3942,
"step": 504
},
{
"epoch": 1.199821587867975,
"grad_norm": 0.5125251412391663,
"learning_rate": 7.048984468339306e-05,
"loss": 0.3818,
"step": 505
},
{
"epoch": 1.2022004162949746,
"grad_norm": 0.5617442727088928,
"learning_rate": 7.043010752688173e-05,
"loss": 0.3741,
"step": 506
},
{
"epoch": 1.2045792447219745,
"grad_norm": 0.4992014467716217,
"learning_rate": 7.037037037037038e-05,
"loss": 0.368,
"step": 507
},
{
"epoch": 1.2069580731489742,
"grad_norm": 0.5008623600006104,
"learning_rate": 7.031063321385903e-05,
"loss": 0.3235,
"step": 508
},
{
"epoch": 1.2093369015759738,
"grad_norm": 0.49350398778915405,
"learning_rate": 7.025089605734767e-05,
"loss": 0.3355,
"step": 509
},
{
"epoch": 1.2117157300029735,
"grad_norm": 0.5325546860694885,
"learning_rate": 7.019115890083633e-05,
"loss": 0.4046,
"step": 510
},
{
"epoch": 1.2140945584299732,
"grad_norm": 0.49078550934791565,
"learning_rate": 7.013142174432498e-05,
"loss": 0.3521,
"step": 511
},
{
"epoch": 1.216473386856973,
"grad_norm": 0.5535020232200623,
"learning_rate": 7.007168458781363e-05,
"loss": 0.3914,
"step": 512
},
{
"epoch": 1.2188522152839727,
"grad_norm": 0.4585452079772949,
"learning_rate": 7.001194743130227e-05,
"loss": 0.2666,
"step": 513
},
{
"epoch": 1.2212310437109724,
"grad_norm": 0.6677319407463074,
"learning_rate": 6.995221027479092e-05,
"loss": 0.4613,
"step": 514
},
{
"epoch": 1.223609872137972,
"grad_norm": 0.5839059948921204,
"learning_rate": 6.989247311827958e-05,
"loss": 0.4185,
"step": 515
},
{
"epoch": 1.2259887005649717,
"grad_norm": 0.5040207505226135,
"learning_rate": 6.983273596176823e-05,
"loss": 0.3351,
"step": 516
},
{
"epoch": 1.2283675289919715,
"grad_norm": 0.5528902411460876,
"learning_rate": 6.977299880525687e-05,
"loss": 0.3561,
"step": 517
},
{
"epoch": 1.2307463574189712,
"grad_norm": 0.5583405494689941,
"learning_rate": 6.971326164874552e-05,
"loss": 0.3641,
"step": 518
},
{
"epoch": 1.2331251858459709,
"grad_norm": 0.5596750378608704,
"learning_rate": 6.965352449223417e-05,
"loss": 0.3536,
"step": 519
},
{
"epoch": 1.2355040142729705,
"grad_norm": 0.5579570531845093,
"learning_rate": 6.959378733572283e-05,
"loss": 0.4025,
"step": 520
},
{
"epoch": 1.2378828426999702,
"grad_norm": 0.5613878965377808,
"learning_rate": 6.953405017921147e-05,
"loss": 0.3825,
"step": 521
},
{
"epoch": 1.24026167112697,
"grad_norm": 0.5365767478942871,
"learning_rate": 6.947431302270012e-05,
"loss": 0.3801,
"step": 522
},
{
"epoch": 1.2426404995539697,
"grad_norm": 0.558989942073822,
"learning_rate": 6.941457586618877e-05,
"loss": 0.391,
"step": 523
},
{
"epoch": 1.2450193279809694,
"grad_norm": 0.5318464636802673,
"learning_rate": 6.935483870967743e-05,
"loss": 0.3548,
"step": 524
},
{
"epoch": 1.247398156407969,
"grad_norm": 0.5546141266822815,
"learning_rate": 6.929510155316607e-05,
"loss": 0.3988,
"step": 525
},
{
"epoch": 1.2497769848349687,
"grad_norm": 0.5842304229736328,
"learning_rate": 6.923536439665472e-05,
"loss": 0.4038,
"step": 526
},
{
"epoch": 1.2521558132619686,
"grad_norm": 0.5150060653686523,
"learning_rate": 6.917562724014337e-05,
"loss": 0.3793,
"step": 527
},
{
"epoch": 1.2545346416889682,
"grad_norm": 0.4285811185836792,
"learning_rate": 6.911589008363202e-05,
"loss": 0.3562,
"step": 528
},
{
"epoch": 1.2569134701159679,
"grad_norm": 0.4896494746208191,
"learning_rate": 6.905615292712067e-05,
"loss": 0.3246,
"step": 529
},
{
"epoch": 1.2592922985429675,
"grad_norm": 0.5333868861198425,
"learning_rate": 6.899641577060932e-05,
"loss": 0.4024,
"step": 530
},
{
"epoch": 1.2616711269699672,
"grad_norm": 0.5703678727149963,
"learning_rate": 6.893667861409797e-05,
"loss": 0.3583,
"step": 531
},
{
"epoch": 1.264049955396967,
"grad_norm": 0.4980472922325134,
"learning_rate": 6.887694145758662e-05,
"loss": 0.3631,
"step": 532
},
{
"epoch": 1.2664287838239667,
"grad_norm": 0.4485708475112915,
"learning_rate": 6.881720430107527e-05,
"loss": 0.2715,
"step": 533
},
{
"epoch": 1.2688076122509664,
"grad_norm": 0.454412579536438,
"learning_rate": 6.875746714456392e-05,
"loss": 0.2944,
"step": 534
},
{
"epoch": 1.271186440677966,
"grad_norm": 0.4816785156726837,
"learning_rate": 6.869772998805257e-05,
"loss": 0.4177,
"step": 535
},
{
"epoch": 1.273565269104966,
"grad_norm": 0.5463387370109558,
"learning_rate": 6.863799283154122e-05,
"loss": 0.3994,
"step": 536
},
{
"epoch": 1.2759440975319656,
"grad_norm": 0.5411667823791504,
"learning_rate": 6.857825567502987e-05,
"loss": 0.3623,
"step": 537
},
{
"epoch": 1.2783229259589652,
"grad_norm": 0.5007591247558594,
"learning_rate": 6.851851851851852e-05,
"loss": 0.3953,
"step": 538
},
{
"epoch": 1.280701754385965,
"grad_norm": 0.5197054147720337,
"learning_rate": 6.845878136200717e-05,
"loss": 0.3355,
"step": 539
},
{
"epoch": 1.2830805828129646,
"grad_norm": 0.48684370517730713,
"learning_rate": 6.839904420549582e-05,
"loss": 0.3826,
"step": 540
},
{
"epoch": 1.2854594112399642,
"grad_norm": 0.5456849932670593,
"learning_rate": 6.833930704898447e-05,
"loss": 0.3919,
"step": 541
},
{
"epoch": 1.287838239666964,
"grad_norm": 0.4965335428714752,
"learning_rate": 6.827956989247311e-05,
"loss": 0.3905,
"step": 542
},
{
"epoch": 1.2902170680939637,
"grad_norm": 0.4824836850166321,
"learning_rate": 6.821983273596178e-05,
"loss": 0.3212,
"step": 543
},
{
"epoch": 1.2925958965209634,
"grad_norm": 0.5223451256752014,
"learning_rate": 6.816009557945043e-05,
"loss": 0.3359,
"step": 544
},
{
"epoch": 1.294974724947963,
"grad_norm": 0.4838787317276001,
"learning_rate": 6.810035842293908e-05,
"loss": 0.2781,
"step": 545
},
{
"epoch": 1.297353553374963,
"grad_norm": 0.4827715754508972,
"learning_rate": 6.804062126642771e-05,
"loss": 0.3118,
"step": 546
},
{
"epoch": 1.2997323818019626,
"grad_norm": 0.49691149592399597,
"learning_rate": 6.798088410991638e-05,
"loss": 0.3527,
"step": 547
},
{
"epoch": 1.3021112102289623,
"grad_norm": 0.49155953526496887,
"learning_rate": 6.792114695340503e-05,
"loss": 0.3626,
"step": 548
},
{
"epoch": 1.304490038655962,
"grad_norm": 0.5283527374267578,
"learning_rate": 6.786140979689368e-05,
"loss": 0.3875,
"step": 549
},
{
"epoch": 1.3068688670829616,
"grad_norm": 0.5794263482093811,
"learning_rate": 6.780167264038231e-05,
"loss": 0.3357,
"step": 550
},
{
"epoch": 1.3092476955099612,
"grad_norm": 0.5505433082580566,
"learning_rate": 6.774193548387096e-05,
"loss": 0.4037,
"step": 551
},
{
"epoch": 1.311626523936961,
"grad_norm": 0.48500585556030273,
"learning_rate": 6.768219832735963e-05,
"loss": 0.3564,
"step": 552
},
{
"epoch": 1.3140053523639608,
"grad_norm": 0.5165260434150696,
"learning_rate": 6.762246117084828e-05,
"loss": 0.3726,
"step": 553
},
{
"epoch": 1.3163841807909604,
"grad_norm": 0.5240358710289001,
"learning_rate": 6.756272401433692e-05,
"loss": 0.3444,
"step": 554
},
{
"epoch": 1.31876300921796,
"grad_norm": 0.45539531111717224,
"learning_rate": 6.750298685782557e-05,
"loss": 0.314,
"step": 555
},
{
"epoch": 1.32114183764496,
"grad_norm": 0.48855656385421753,
"learning_rate": 6.744324970131423e-05,
"loss": 0.3942,
"step": 556
},
{
"epoch": 1.3235206660719596,
"grad_norm": 0.4930460751056671,
"learning_rate": 6.738351254480288e-05,
"loss": 0.3297,
"step": 557
},
{
"epoch": 1.3258994944989593,
"grad_norm": 0.4979415833950043,
"learning_rate": 6.732377538829152e-05,
"loss": 0.343,
"step": 558
},
{
"epoch": 1.328278322925959,
"grad_norm": 0.47355780005455017,
"learning_rate": 6.726403823178017e-05,
"loss": 0.3013,
"step": 559
},
{
"epoch": 1.3306571513529586,
"grad_norm": 0.5629305839538574,
"learning_rate": 6.720430107526882e-05,
"loss": 0.3732,
"step": 560
},
{
"epoch": 1.3330359797799582,
"grad_norm": 0.44539135694503784,
"learning_rate": 6.714456391875748e-05,
"loss": 0.3009,
"step": 561
},
{
"epoch": 1.3354148082069581,
"grad_norm": 0.5273521542549133,
"learning_rate": 6.708482676224612e-05,
"loss": 0.3454,
"step": 562
},
{
"epoch": 1.3377936366339578,
"grad_norm": 0.5609079003334045,
"learning_rate": 6.702508960573477e-05,
"loss": 0.3395,
"step": 563
},
{
"epoch": 1.3401724650609574,
"grad_norm": 0.4624035954475403,
"learning_rate": 6.696535244922342e-05,
"loss": 0.3244,
"step": 564
},
{
"epoch": 1.342551293487957,
"grad_norm": 0.4840734302997589,
"learning_rate": 6.690561529271207e-05,
"loss": 0.3705,
"step": 565
},
{
"epoch": 1.344930121914957,
"grad_norm": 0.5756794810295105,
"learning_rate": 6.684587813620072e-05,
"loss": 0.3406,
"step": 566
},
{
"epoch": 1.3473089503419566,
"grad_norm": 0.5743434429168701,
"learning_rate": 6.678614097968937e-05,
"loss": 0.4137,
"step": 567
},
{
"epoch": 1.3496877787689563,
"grad_norm": 0.5068850517272949,
"learning_rate": 6.672640382317802e-05,
"loss": 0.4139,
"step": 568
},
{
"epoch": 1.352066607195956,
"grad_norm": 0.5004804134368896,
"learning_rate": 6.666666666666667e-05,
"loss": 0.3534,
"step": 569
},
{
"epoch": 1.3544454356229556,
"grad_norm": 0.5731816291809082,
"learning_rate": 6.660692951015532e-05,
"loss": 0.3521,
"step": 570
},
{
"epoch": 1.3568242640499553,
"grad_norm": 0.5832188129425049,
"learning_rate": 6.654719235364397e-05,
"loss": 0.3541,
"step": 571
},
{
"epoch": 1.3592030924769551,
"grad_norm": 0.4894527792930603,
"learning_rate": 6.648745519713262e-05,
"loss": 0.36,
"step": 572
},
{
"epoch": 1.3615819209039548,
"grad_norm": 0.5579627752304077,
"learning_rate": 6.642771804062127e-05,
"loss": 0.4035,
"step": 573
},
{
"epoch": 1.3639607493309545,
"grad_norm": 0.5148259997367859,
"learning_rate": 6.636798088410992e-05,
"loss": 0.3376,
"step": 574
},
{
"epoch": 1.3663395777579543,
"grad_norm": 0.5262428522109985,
"learning_rate": 6.630824372759857e-05,
"loss": 0.375,
"step": 575
},
{
"epoch": 1.368718406184954,
"grad_norm": 0.4974241554737091,
"learning_rate": 6.624850657108722e-05,
"loss": 0.3895,
"step": 576
},
{
"epoch": 1.3710972346119537,
"grad_norm": 0.6021261215209961,
"learning_rate": 6.618876941457587e-05,
"loss": 0.4076,
"step": 577
},
{
"epoch": 1.3734760630389533,
"grad_norm": 0.5429256558418274,
"learning_rate": 6.612903225806452e-05,
"loss": 0.3665,
"step": 578
},
{
"epoch": 1.375854891465953,
"grad_norm": 0.5747050046920776,
"learning_rate": 6.606929510155317e-05,
"loss": 0.3845,
"step": 579
},
{
"epoch": 1.3782337198929526,
"grad_norm": 0.5203744173049927,
"learning_rate": 6.600955794504182e-05,
"loss": 0.2655,
"step": 580
},
{
"epoch": 1.3806125483199525,
"grad_norm": 0.5207931995391846,
"learning_rate": 6.594982078853047e-05,
"loss": 0.3998,
"step": 581
},
{
"epoch": 1.3829913767469522,
"grad_norm": 0.4678313732147217,
"learning_rate": 6.589008363201912e-05,
"loss": 0.2987,
"step": 582
},
{
"epoch": 1.3853702051739518,
"grad_norm": 0.5557482242584229,
"learning_rate": 6.583034647550776e-05,
"loss": 0.399,
"step": 583
},
{
"epoch": 1.3877490336009515,
"grad_norm": 0.5944557785987854,
"learning_rate": 6.577060931899642e-05,
"loss": 0.4344,
"step": 584
},
{
"epoch": 1.3901278620279514,
"grad_norm": 0.5242769718170166,
"learning_rate": 6.571087216248507e-05,
"loss": 0.3141,
"step": 585
},
{
"epoch": 1.392506690454951,
"grad_norm": 0.5812731385231018,
"learning_rate": 6.565113500597372e-05,
"loss": 0.3739,
"step": 586
},
{
"epoch": 1.3948855188819507,
"grad_norm": 0.5420759320259094,
"learning_rate": 6.559139784946236e-05,
"loss": 0.3644,
"step": 587
},
{
"epoch": 1.3972643473089503,
"grad_norm": 0.565941333770752,
"learning_rate": 6.553166069295101e-05,
"loss": 0.4276,
"step": 588
},
{
"epoch": 1.39964317573595,
"grad_norm": 0.502975344657898,
"learning_rate": 6.547192353643968e-05,
"loss": 0.3868,
"step": 589
},
{
"epoch": 1.4020220041629496,
"grad_norm": 0.5543567538261414,
"learning_rate": 6.541218637992833e-05,
"loss": 0.3978,
"step": 590
},
{
"epoch": 1.4044008325899495,
"grad_norm": 0.5115704536437988,
"learning_rate": 6.535244922341696e-05,
"loss": 0.3002,
"step": 591
},
{
"epoch": 1.4067796610169492,
"grad_norm": 0.50956130027771,
"learning_rate": 6.529271206690561e-05,
"loss": 0.3456,
"step": 592
},
{
"epoch": 1.4091584894439488,
"grad_norm": 0.6440345048904419,
"learning_rate": 6.523297491039428e-05,
"loss": 0.4266,
"step": 593
},
{
"epoch": 1.4115373178709485,
"grad_norm": 0.49989134073257446,
"learning_rate": 6.517323775388293e-05,
"loss": 0.364,
"step": 594
},
{
"epoch": 1.4139161462979484,
"grad_norm": 0.5295883417129517,
"learning_rate": 6.511350059737156e-05,
"loss": 0.3685,
"step": 595
},
{
"epoch": 1.416294974724948,
"grad_norm": 0.5092173218727112,
"learning_rate": 6.505376344086021e-05,
"loss": 0.3517,
"step": 596
},
{
"epoch": 1.4186738031519477,
"grad_norm": 0.43385863304138184,
"learning_rate": 6.499402628434886e-05,
"loss": 0.2672,
"step": 597
},
{
"epoch": 1.4210526315789473,
"grad_norm": 0.5401270985603333,
"learning_rate": 6.493428912783753e-05,
"loss": 0.3773,
"step": 598
},
{
"epoch": 1.423431460005947,
"grad_norm": 0.45737171173095703,
"learning_rate": 6.487455197132617e-05,
"loss": 0.2862,
"step": 599
},
{
"epoch": 1.4258102884329467,
"grad_norm": 0.5563622713088989,
"learning_rate": 6.481481481481482e-05,
"loss": 0.3362,
"step": 600
},
{
"epoch": 1.4258102884329467,
"eval_loss": 0.39686334133148193,
"eval_runtime": 24.7459,
"eval_samples_per_second": 30.227,
"eval_steps_per_second": 15.114,
"step": 600
},
{
"epoch": 1.4281891168599465,
"grad_norm": 0.5734651684761047,
"learning_rate": 6.475507765830347e-05,
"loss": 0.4083,
"step": 601
},
{
"epoch": 1.4305679452869462,
"grad_norm": 0.45734620094299316,
"learning_rate": 6.469534050179213e-05,
"loss": 0.3329,
"step": 602
},
{
"epoch": 1.4329467737139459,
"grad_norm": 0.49928852915763855,
"learning_rate": 6.463560334528077e-05,
"loss": 0.3785,
"step": 603
},
{
"epoch": 1.4353256021409455,
"grad_norm": 0.48517173528671265,
"learning_rate": 6.457586618876942e-05,
"loss": 0.3299,
"step": 604
},
{
"epoch": 1.4377044305679454,
"grad_norm": 0.4913112223148346,
"learning_rate": 6.451612903225807e-05,
"loss": 0.3249,
"step": 605
},
{
"epoch": 1.440083258994945,
"grad_norm": 0.5694173574447632,
"learning_rate": 6.445639187574672e-05,
"loss": 0.3494,
"step": 606
},
{
"epoch": 1.4424620874219447,
"grad_norm": 0.47303032875061035,
"learning_rate": 6.439665471923537e-05,
"loss": 0.347,
"step": 607
},
{
"epoch": 1.4448409158489444,
"grad_norm": 0.5557957291603088,
"learning_rate": 6.433691756272402e-05,
"loss": 0.4251,
"step": 608
},
{
"epoch": 1.447219744275944,
"grad_norm": 0.5228065848350525,
"learning_rate": 6.427718040621267e-05,
"loss": 0.3118,
"step": 609
},
{
"epoch": 1.4495985727029437,
"grad_norm": 0.5313312411308289,
"learning_rate": 6.421744324970132e-05,
"loss": 0.3826,
"step": 610
},
{
"epoch": 1.4519774011299436,
"grad_norm": 0.5596655607223511,
"learning_rate": 6.415770609318996e-05,
"loss": 0.4413,
"step": 611
},
{
"epoch": 1.4543562295569432,
"grad_norm": 0.575697124004364,
"learning_rate": 6.409796893667862e-05,
"loss": 0.4097,
"step": 612
},
{
"epoch": 1.4567350579839429,
"grad_norm": 0.4731719493865967,
"learning_rate": 6.403823178016727e-05,
"loss": 0.3443,
"step": 613
},
{
"epoch": 1.4591138864109425,
"grad_norm": 0.5273444056510925,
"learning_rate": 6.397849462365592e-05,
"loss": 0.3843,
"step": 614
},
{
"epoch": 1.4614927148379424,
"grad_norm": 0.5462388396263123,
"learning_rate": 6.391875746714456e-05,
"loss": 0.3972,
"step": 615
},
{
"epoch": 1.463871543264942,
"grad_norm": 0.45089593529701233,
"learning_rate": 6.385902031063322e-05,
"loss": 0.3287,
"step": 616
},
{
"epoch": 1.4662503716919417,
"grad_norm": 0.47337663173675537,
"learning_rate": 6.379928315412187e-05,
"loss": 0.3347,
"step": 617
},
{
"epoch": 1.4686292001189414,
"grad_norm": 0.5648345947265625,
"learning_rate": 6.373954599761052e-05,
"loss": 0.3703,
"step": 618
},
{
"epoch": 1.471008028545941,
"grad_norm": 0.5006834268569946,
"learning_rate": 6.367980884109916e-05,
"loss": 0.3699,
"step": 619
},
{
"epoch": 1.4733868569729407,
"grad_norm": 0.51920086145401,
"learning_rate": 6.362007168458781e-05,
"loss": 0.3799,
"step": 620
},
{
"epoch": 1.4757656853999406,
"grad_norm": 0.5321612358093262,
"learning_rate": 6.356033452807647e-05,
"loss": 0.3866,
"step": 621
},
{
"epoch": 1.4781445138269402,
"grad_norm": 0.5410431623458862,
"learning_rate": 6.350059737156512e-05,
"loss": 0.3777,
"step": 622
},
{
"epoch": 1.48052334225394,
"grad_norm": 0.5751019716262817,
"learning_rate": 6.344086021505376e-05,
"loss": 0.3585,
"step": 623
},
{
"epoch": 1.4829021706809398,
"grad_norm": 0.5092802047729492,
"learning_rate": 6.338112305854241e-05,
"loss": 0.3229,
"step": 624
},
{
"epoch": 1.4852809991079394,
"grad_norm": 0.5037229061126709,
"learning_rate": 6.332138590203107e-05,
"loss": 0.3243,
"step": 625
},
{
"epoch": 1.487659827534939,
"grad_norm": 0.5060740113258362,
"learning_rate": 6.326164874551972e-05,
"loss": 0.4023,
"step": 626
},
{
"epoch": 1.4900386559619387,
"grad_norm": 0.5434842705726624,
"learning_rate": 6.320191158900836e-05,
"loss": 0.3541,
"step": 627
},
{
"epoch": 1.4924174843889384,
"grad_norm": 0.5165389180183411,
"learning_rate": 6.314217443249701e-05,
"loss": 0.317,
"step": 628
},
{
"epoch": 1.494796312815938,
"grad_norm": 0.4614243805408478,
"learning_rate": 6.308243727598566e-05,
"loss": 0.3269,
"step": 629
},
{
"epoch": 1.497175141242938,
"grad_norm": 0.5254392027854919,
"learning_rate": 6.302270011947432e-05,
"loss": 0.3223,
"step": 630
},
{
"epoch": 1.4995539696699376,
"grad_norm": 0.54349285364151,
"learning_rate": 6.296296296296296e-05,
"loss": 0.3861,
"step": 631
},
{
"epoch": 1.5019327980969372,
"grad_norm": 0.6002068519592285,
"learning_rate": 6.290322580645161e-05,
"loss": 0.4794,
"step": 632
},
{
"epoch": 1.504311626523937,
"grad_norm": 0.5521604418754578,
"learning_rate": 6.284348864994026e-05,
"loss": 0.355,
"step": 633
},
{
"epoch": 1.5066904549509368,
"grad_norm": 0.5643691420555115,
"learning_rate": 6.278375149342891e-05,
"loss": 0.4274,
"step": 634
},
{
"epoch": 1.5090692833779364,
"grad_norm": 0.5313323736190796,
"learning_rate": 6.272401433691756e-05,
"loss": 0.3817,
"step": 635
},
{
"epoch": 1.511448111804936,
"grad_norm": 0.5157251358032227,
"learning_rate": 6.266427718040621e-05,
"loss": 0.3636,
"step": 636
},
{
"epoch": 1.5138269402319358,
"grad_norm": 0.6033576130867004,
"learning_rate": 6.260454002389486e-05,
"loss": 0.3274,
"step": 637
},
{
"epoch": 1.5162057686589354,
"grad_norm": 0.5338175296783447,
"learning_rate": 6.254480286738351e-05,
"loss": 0.4118,
"step": 638
},
{
"epoch": 1.518584597085935,
"grad_norm": 0.5195765495300293,
"learning_rate": 6.248506571087216e-05,
"loss": 0.3153,
"step": 639
},
{
"epoch": 1.5209634255129347,
"grad_norm": 0.4563392996788025,
"learning_rate": 6.242532855436081e-05,
"loss": 0.3512,
"step": 640
},
{
"epoch": 1.5233422539399346,
"grad_norm": 0.5181533694267273,
"learning_rate": 6.236559139784946e-05,
"loss": 0.3473,
"step": 641
},
{
"epoch": 1.5257210823669343,
"grad_norm": 0.5112624168395996,
"learning_rate": 6.230585424133811e-05,
"loss": 0.3215,
"step": 642
},
{
"epoch": 1.5280999107939341,
"grad_norm": 0.48247793316841125,
"learning_rate": 6.224611708482677e-05,
"loss": 0.3685,
"step": 643
},
{
"epoch": 1.5304787392209338,
"grad_norm": 0.4956831932067871,
"learning_rate": 6.218637992831542e-05,
"loss": 0.3542,
"step": 644
},
{
"epoch": 1.5328575676479335,
"grad_norm": 0.5187450051307678,
"learning_rate": 6.212664277180407e-05,
"loss": 0.3949,
"step": 645
},
{
"epoch": 1.5352363960749331,
"grad_norm": 0.4967552721500397,
"learning_rate": 6.206690561529272e-05,
"loss": 0.4084,
"step": 646
},
{
"epoch": 1.5376152245019328,
"grad_norm": 0.5258282423019409,
"learning_rate": 6.200716845878137e-05,
"loss": 0.3735,
"step": 647
},
{
"epoch": 1.5399940529289324,
"grad_norm": 0.42625510692596436,
"learning_rate": 6.194743130227002e-05,
"loss": 0.278,
"step": 648
},
{
"epoch": 1.542372881355932,
"grad_norm": 0.5355501770973206,
"learning_rate": 6.188769414575867e-05,
"loss": 0.3549,
"step": 649
},
{
"epoch": 1.5447517097829317,
"grad_norm": 0.48417362570762634,
"learning_rate": 6.182795698924732e-05,
"loss": 0.3222,
"step": 650
},
{
"epoch": 1.5471305382099316,
"grad_norm": 0.5076582431793213,
"learning_rate": 6.176821983273597e-05,
"loss": 0.3242,
"step": 651
},
{
"epoch": 1.5495093666369313,
"grad_norm": 0.49634814262390137,
"learning_rate": 6.17084826762246e-05,
"loss": 0.3095,
"step": 652
},
{
"epoch": 1.5518881950639312,
"grad_norm": 0.5837588310241699,
"learning_rate": 6.164874551971327e-05,
"loss": 0.4119,
"step": 653
},
{
"epoch": 1.5542670234909308,
"grad_norm": 0.5286645293235779,
"learning_rate": 6.158900836320192e-05,
"loss": 0.3891,
"step": 654
},
{
"epoch": 1.5566458519179305,
"grad_norm": 0.4566105306148529,
"learning_rate": 6.152927120669057e-05,
"loss": 0.2904,
"step": 655
},
{
"epoch": 1.5590246803449301,
"grad_norm": 0.6136950254440308,
"learning_rate": 6.14695340501792e-05,
"loss": 0.3836,
"step": 656
},
{
"epoch": 1.5614035087719298,
"grad_norm": 0.7156520485877991,
"learning_rate": 6.140979689366786e-05,
"loss": 0.2578,
"step": 657
},
{
"epoch": 1.5637823371989295,
"grad_norm": 0.5305624604225159,
"learning_rate": 6.135005973715652e-05,
"loss": 0.37,
"step": 658
},
{
"epoch": 1.566161165625929,
"grad_norm": 0.5822964310646057,
"learning_rate": 6.129032258064517e-05,
"loss": 0.4268,
"step": 659
},
{
"epoch": 1.568539994052929,
"grad_norm": 0.4895828664302826,
"learning_rate": 6.12305854241338e-05,
"loss": 0.317,
"step": 660
},
{
"epoch": 1.5709188224799286,
"grad_norm": 0.4806120693683624,
"learning_rate": 6.117084826762246e-05,
"loss": 0.3224,
"step": 661
},
{
"epoch": 1.5732976509069283,
"grad_norm": 0.5153700113296509,
"learning_rate": 6.111111111111112e-05,
"loss": 0.387,
"step": 662
},
{
"epoch": 1.5756764793339282,
"grad_norm": 0.4535806477069855,
"learning_rate": 6.105137395459977e-05,
"loss": 0.3369,
"step": 663
},
{
"epoch": 1.5780553077609278,
"grad_norm": 0.5814690589904785,
"learning_rate": 6.0991636798088415e-05,
"loss": 0.5091,
"step": 664
},
{
"epoch": 1.5804341361879275,
"grad_norm": 0.5779417157173157,
"learning_rate": 6.093189964157706e-05,
"loss": 0.4162,
"step": 665
},
{
"epoch": 1.5828129646149272,
"grad_norm": 0.5305196046829224,
"learning_rate": 6.087216248506571e-05,
"loss": 0.3734,
"step": 666
},
{
"epoch": 1.5851917930419268,
"grad_norm": 0.5372288823127747,
"learning_rate": 6.0812425328554366e-05,
"loss": 0.2981,
"step": 667
},
{
"epoch": 1.5875706214689265,
"grad_norm": 0.49461305141448975,
"learning_rate": 6.0752688172043016e-05,
"loss": 0.3412,
"step": 668
},
{
"epoch": 1.5899494498959261,
"grad_norm": 0.49554646015167236,
"learning_rate": 6.069295101553166e-05,
"loss": 0.3639,
"step": 669
},
{
"epoch": 1.592328278322926,
"grad_norm": 0.515537679195404,
"learning_rate": 6.063321385902031e-05,
"loss": 0.3132,
"step": 670
},
{
"epoch": 1.5947071067499257,
"grad_norm": 0.5863080024719238,
"learning_rate": 6.057347670250897e-05,
"loss": 0.3877,
"step": 671
},
{
"epoch": 1.5970859351769253,
"grad_norm": 0.5367560982704163,
"learning_rate": 6.051373954599762e-05,
"loss": 0.3591,
"step": 672
},
{
"epoch": 1.5994647636039252,
"grad_norm": 0.4917997419834137,
"learning_rate": 6.045400238948626e-05,
"loss": 0.3451,
"step": 673
},
{
"epoch": 1.6018435920309249,
"grad_norm": 0.539308488368988,
"learning_rate": 6.039426523297491e-05,
"loss": 0.3696,
"step": 674
},
{
"epoch": 1.6042224204579245,
"grad_norm": 0.6007647514343262,
"learning_rate": 6.033452807646356e-05,
"loss": 0.3717,
"step": 675
},
{
"epoch": 1.6066012488849242,
"grad_norm": 0.4990600347518921,
"learning_rate": 6.027479091995222e-05,
"loss": 0.2856,
"step": 676
},
{
"epoch": 1.6089800773119238,
"grad_norm": 0.5643115043640137,
"learning_rate": 6.021505376344086e-05,
"loss": 0.3852,
"step": 677
},
{
"epoch": 1.6113589057389235,
"grad_norm": 0.5239197611808777,
"learning_rate": 6.015531660692951e-05,
"loss": 0.3479,
"step": 678
},
{
"epoch": 1.6137377341659231,
"grad_norm": 0.576170802116394,
"learning_rate": 6.009557945041816e-05,
"loss": 0.3959,
"step": 679
},
{
"epoch": 1.616116562592923,
"grad_norm": 0.5128218531608582,
"learning_rate": 6.0035842293906806e-05,
"loss": 0.2702,
"step": 680
},
{
"epoch": 1.6184953910199227,
"grad_norm": 0.5164602994918823,
"learning_rate": 5.997610513739546e-05,
"loss": 0.3526,
"step": 681
},
{
"epoch": 1.6208742194469223,
"grad_norm": 0.6454161405563354,
"learning_rate": 5.991636798088411e-05,
"loss": 0.4639,
"step": 682
},
{
"epoch": 1.6232530478739222,
"grad_norm": 0.6234579086303711,
"learning_rate": 5.9856630824372764e-05,
"loss": 0.4439,
"step": 683
},
{
"epoch": 1.6256318763009219,
"grad_norm": 0.5979102849960327,
"learning_rate": 5.979689366786141e-05,
"loss": 0.3348,
"step": 684
},
{
"epoch": 1.6280107047279215,
"grad_norm": 0.4938049018383026,
"learning_rate": 5.9737156511350064e-05,
"loss": 0.3097,
"step": 685
},
{
"epoch": 1.6303895331549212,
"grad_norm": 0.5276683568954468,
"learning_rate": 5.9677419354838715e-05,
"loss": 0.3375,
"step": 686
},
{
"epoch": 1.6327683615819208,
"grad_norm": 0.5086409449577332,
"learning_rate": 5.9617682198327365e-05,
"loss": 0.3752,
"step": 687
},
{
"epoch": 1.6351471900089205,
"grad_norm": 0.4907076954841614,
"learning_rate": 5.955794504181601e-05,
"loss": 0.3211,
"step": 688
},
{
"epoch": 1.6375260184359202,
"grad_norm": 0.534393310546875,
"learning_rate": 5.949820788530466e-05,
"loss": 0.3701,
"step": 689
},
{
"epoch": 1.63990484686292,
"grad_norm": 0.5492992997169495,
"learning_rate": 5.9438470728793316e-05,
"loss": 0.4018,
"step": 690
},
{
"epoch": 1.6422836752899197,
"grad_norm": 0.5407986640930176,
"learning_rate": 5.9378733572281966e-05,
"loss": 0.2754,
"step": 691
},
{
"epoch": 1.6446625037169196,
"grad_norm": 0.5436089038848877,
"learning_rate": 5.931899641577061e-05,
"loss": 0.3546,
"step": 692
},
{
"epoch": 1.6470413321439192,
"grad_norm": 0.4981859028339386,
"learning_rate": 5.925925925925926e-05,
"loss": 0.3077,
"step": 693
},
{
"epoch": 1.649420160570919,
"grad_norm": 0.6169450283050537,
"learning_rate": 5.919952210274792e-05,
"loss": 0.4087,
"step": 694
},
{
"epoch": 1.6517989889979185,
"grad_norm": 0.4453374147415161,
"learning_rate": 5.913978494623657e-05,
"loss": 0.3179,
"step": 695
},
{
"epoch": 1.6541778174249182,
"grad_norm": 0.5288130044937134,
"learning_rate": 5.908004778972521e-05,
"loss": 0.3406,
"step": 696
},
{
"epoch": 1.6565566458519179,
"grad_norm": 0.4710347056388855,
"learning_rate": 5.902031063321386e-05,
"loss": 0.2985,
"step": 697
},
{
"epoch": 1.6589354742789175,
"grad_norm": 0.48624956607818604,
"learning_rate": 5.8960573476702505e-05,
"loss": 0.3804,
"step": 698
},
{
"epoch": 1.6613143027059172,
"grad_norm": 0.5085429549217224,
"learning_rate": 5.890083632019117e-05,
"loss": 0.3055,
"step": 699
},
{
"epoch": 1.663693131132917,
"grad_norm": 0.5178834199905396,
"learning_rate": 5.884109916367981e-05,
"loss": 0.485,
"step": 700
},
{
"epoch": 1.663693131132917,
"eval_loss": 0.38970109820365906,
"eval_runtime": 24.7392,
"eval_samples_per_second": 30.235,
"eval_steps_per_second": 15.118,
"step": 700
},
{
"epoch": 1.6660719595599167,
"grad_norm": 0.5501241683959961,
"learning_rate": 5.878136200716846e-05,
"loss": 0.3649,
"step": 701
},
{
"epoch": 1.6684507879869166,
"grad_norm": 0.485432893037796,
"learning_rate": 5.8721624850657106e-05,
"loss": 0.3161,
"step": 702
},
{
"epoch": 1.6708296164139163,
"grad_norm": 0.49101677536964417,
"learning_rate": 5.8661887694145756e-05,
"loss": 0.3169,
"step": 703
},
{
"epoch": 1.673208444840916,
"grad_norm": 0.5963920950889587,
"learning_rate": 5.860215053763441e-05,
"loss": 0.393,
"step": 704
},
{
"epoch": 1.6755872732679156,
"grad_norm": 0.49317649006843567,
"learning_rate": 5.8542413381123063e-05,
"loss": 0.2979,
"step": 705
},
{
"epoch": 1.6779661016949152,
"grad_norm": 0.5598815679550171,
"learning_rate": 5.848267622461171e-05,
"loss": 0.3924,
"step": 706
},
{
"epoch": 1.6803449301219149,
"grad_norm": 0.4852607250213623,
"learning_rate": 5.842293906810036e-05,
"loss": 0.278,
"step": 707
},
{
"epoch": 1.6827237585489145,
"grad_norm": 0.5489526987075806,
"learning_rate": 5.8363201911589014e-05,
"loss": 0.3864,
"step": 708
},
{
"epoch": 1.6851025869759142,
"grad_norm": 0.4885106086730957,
"learning_rate": 5.8303464755077665e-05,
"loss": 0.3562,
"step": 709
},
{
"epoch": 1.687481415402914,
"grad_norm": 0.47927844524383545,
"learning_rate": 5.824372759856631e-05,
"loss": 0.3441,
"step": 710
},
{
"epoch": 1.6898602438299137,
"grad_norm": 0.5159226655960083,
"learning_rate": 5.818399044205496e-05,
"loss": 0.3259,
"step": 711
},
{
"epoch": 1.6922390722569136,
"grad_norm": 0.5427981615066528,
"learning_rate": 5.812425328554361e-05,
"loss": 0.3629,
"step": 712
},
{
"epoch": 1.6946179006839133,
"grad_norm": 0.5536279678344727,
"learning_rate": 5.8064516129032266e-05,
"loss": 0.3578,
"step": 713
},
{
"epoch": 1.696996729110913,
"grad_norm": 0.5016468167304993,
"learning_rate": 5.800477897252091e-05,
"loss": 0.3633,
"step": 714
},
{
"epoch": 1.6993755575379126,
"grad_norm": 0.49730172753334045,
"learning_rate": 5.794504181600956e-05,
"loss": 0.3682,
"step": 715
},
{
"epoch": 1.7017543859649122,
"grad_norm": 0.5634946823120117,
"learning_rate": 5.788530465949821e-05,
"loss": 0.4343,
"step": 716
},
{
"epoch": 1.704133214391912,
"grad_norm": 0.6075212955474854,
"learning_rate": 5.782556750298687e-05,
"loss": 0.4506,
"step": 717
},
{
"epoch": 1.7065120428189116,
"grad_norm": 0.4885086119174957,
"learning_rate": 5.776583034647551e-05,
"loss": 0.3652,
"step": 718
},
{
"epoch": 1.7088908712459114,
"grad_norm": 0.5155897736549377,
"learning_rate": 5.770609318996416e-05,
"loss": 0.3198,
"step": 719
},
{
"epoch": 1.711269699672911,
"grad_norm": 0.5561105608940125,
"learning_rate": 5.764635603345281e-05,
"loss": 0.4273,
"step": 720
},
{
"epoch": 1.7136485280999108,
"grad_norm": 0.6169248223304749,
"learning_rate": 5.7586618876941455e-05,
"loss": 0.3704,
"step": 721
},
{
"epoch": 1.7160273565269106,
"grad_norm": 0.5170378684997559,
"learning_rate": 5.752688172043011e-05,
"loss": 0.4371,
"step": 722
},
{
"epoch": 1.7184061849539103,
"grad_norm": 0.4240604341030121,
"learning_rate": 5.746714456391876e-05,
"loss": 0.302,
"step": 723
},
{
"epoch": 1.72078501338091,
"grad_norm": 0.6027634739875793,
"learning_rate": 5.740740740740741e-05,
"loss": 0.3999,
"step": 724
},
{
"epoch": 1.7231638418079096,
"grad_norm": 0.4671403467655182,
"learning_rate": 5.7347670250896056e-05,
"loss": 0.3012,
"step": 725
},
{
"epoch": 1.7255426702349093,
"grad_norm": 0.4920845925807953,
"learning_rate": 5.7287933094384706e-05,
"loss": 0.3595,
"step": 726
},
{
"epoch": 1.727921498661909,
"grad_norm": 0.48627743124961853,
"learning_rate": 5.722819593787336e-05,
"loss": 0.3263,
"step": 727
},
{
"epoch": 1.7303003270889086,
"grad_norm": 0.4806533455848694,
"learning_rate": 5.7168458781362014e-05,
"loss": 0.302,
"step": 728
},
{
"epoch": 1.7326791555159085,
"grad_norm": 0.5916662812232971,
"learning_rate": 5.710872162485066e-05,
"loss": 0.4725,
"step": 729
},
{
"epoch": 1.735057983942908,
"grad_norm": 0.5074958801269531,
"learning_rate": 5.704898446833931e-05,
"loss": 0.365,
"step": 730
},
{
"epoch": 1.7374368123699078,
"grad_norm": 0.4481968283653259,
"learning_rate": 5.6989247311827965e-05,
"loss": 0.2967,
"step": 731
},
{
"epoch": 1.7398156407969076,
"grad_norm": 0.48121869564056396,
"learning_rate": 5.6929510155316615e-05,
"loss": 0.2601,
"step": 732
},
{
"epoch": 1.7421944692239073,
"grad_norm": 0.6281145811080933,
"learning_rate": 5.686977299880526e-05,
"loss": 0.3374,
"step": 733
},
{
"epoch": 1.744573297650907,
"grad_norm": 0.5491541624069214,
"learning_rate": 5.681003584229391e-05,
"loss": 0.3351,
"step": 734
},
{
"epoch": 1.7469521260779066,
"grad_norm": 0.5338577032089233,
"learning_rate": 5.675029868578255e-05,
"loss": 0.3048,
"step": 735
},
{
"epoch": 1.7493309545049063,
"grad_norm": 0.5534673929214478,
"learning_rate": 5.6690561529271216e-05,
"loss": 0.3444,
"step": 736
},
{
"epoch": 1.751709782931906,
"grad_norm": 0.5476568341255188,
"learning_rate": 5.663082437275986e-05,
"loss": 0.3367,
"step": 737
},
{
"epoch": 1.7540886113589056,
"grad_norm": 0.5127911567687988,
"learning_rate": 5.657108721624851e-05,
"loss": 0.4117,
"step": 738
},
{
"epoch": 1.7564674397859055,
"grad_norm": 0.5235200524330139,
"learning_rate": 5.6511350059737153e-05,
"loss": 0.2968,
"step": 739
},
{
"epoch": 1.7588462682129051,
"grad_norm": 0.5279961228370667,
"learning_rate": 5.645161290322582e-05,
"loss": 0.337,
"step": 740
},
{
"epoch": 1.761225096639905,
"grad_norm": 0.5380825400352478,
"learning_rate": 5.639187574671446e-05,
"loss": 0.3376,
"step": 741
},
{
"epoch": 1.7636039250669047,
"grad_norm": 0.5725474953651428,
"learning_rate": 5.633213859020311e-05,
"loss": 0.3843,
"step": 742
},
{
"epoch": 1.7659827534939043,
"grad_norm": 0.47840648889541626,
"learning_rate": 5.6272401433691755e-05,
"loss": 0.3816,
"step": 743
},
{
"epoch": 1.768361581920904,
"grad_norm": 0.5369259119033813,
"learning_rate": 5.6212664277180405e-05,
"loss": 0.3751,
"step": 744
},
{
"epoch": 1.7707404103479036,
"grad_norm": 0.5530030131340027,
"learning_rate": 5.615292712066906e-05,
"loss": 0.3387,
"step": 745
},
{
"epoch": 1.7731192387749033,
"grad_norm": 0.47207772731781006,
"learning_rate": 5.609318996415771e-05,
"loss": 0.3897,
"step": 746
},
{
"epoch": 1.775498067201903,
"grad_norm": 0.49299487471580505,
"learning_rate": 5.6033452807646356e-05,
"loss": 0.3318,
"step": 747
},
{
"epoch": 1.7778768956289026,
"grad_norm": 0.48968756198883057,
"learning_rate": 5.5973715651135006e-05,
"loss": 0.3949,
"step": 748
},
{
"epoch": 1.7802557240559025,
"grad_norm": 0.4778074026107788,
"learning_rate": 5.5913978494623656e-05,
"loss": 0.3891,
"step": 749
},
{
"epoch": 1.7826345524829021,
"grad_norm": 0.536257803440094,
"learning_rate": 5.5854241338112313e-05,
"loss": 0.3005,
"step": 750
},
{
"epoch": 1.785013380909902,
"grad_norm": 0.5621615648269653,
"learning_rate": 5.579450418160096e-05,
"loss": 0.384,
"step": 751
},
{
"epoch": 1.7873922093369017,
"grad_norm": 0.4926300644874573,
"learning_rate": 5.573476702508961e-05,
"loss": 0.3277,
"step": 752
},
{
"epoch": 1.7897710377639013,
"grad_norm": 0.49344128370285034,
"learning_rate": 5.567502986857826e-05,
"loss": 0.3386,
"step": 753
},
{
"epoch": 1.792149866190901,
"grad_norm": 0.4578976333141327,
"learning_rate": 5.5615292712066915e-05,
"loss": 0.3371,
"step": 754
},
{
"epoch": 1.7945286946179007,
"grad_norm": 0.5874441266059875,
"learning_rate": 5.555555555555556e-05,
"loss": 0.3814,
"step": 755
},
{
"epoch": 1.7969075230449003,
"grad_norm": 0.548207700252533,
"learning_rate": 5.549581839904421e-05,
"loss": 0.377,
"step": 756
},
{
"epoch": 1.7992863514719,
"grad_norm": 0.4734908938407898,
"learning_rate": 5.543608124253286e-05,
"loss": 0.3264,
"step": 757
},
{
"epoch": 1.8016651798988996,
"grad_norm": 0.5327095985412598,
"learning_rate": 5.53763440860215e-05,
"loss": 0.3742,
"step": 758
},
{
"epoch": 1.8040440083258995,
"grad_norm": 0.562744140625,
"learning_rate": 5.531660692951016e-05,
"loss": 0.3545,
"step": 759
},
{
"epoch": 1.8064228367528992,
"grad_norm": 0.47441789507865906,
"learning_rate": 5.525686977299881e-05,
"loss": 0.3274,
"step": 760
},
{
"epoch": 1.808801665179899,
"grad_norm": 0.5133342742919922,
"learning_rate": 5.519713261648746e-05,
"loss": 0.3693,
"step": 761
},
{
"epoch": 1.8111804936068987,
"grad_norm": 0.5697010159492493,
"learning_rate": 5.5137395459976104e-05,
"loss": 0.3936,
"step": 762
},
{
"epoch": 1.8135593220338984,
"grad_norm": 0.5646381974220276,
"learning_rate": 5.507765830346476e-05,
"loss": 0.4184,
"step": 763
},
{
"epoch": 1.815938150460898,
"grad_norm": 0.4727121889591217,
"learning_rate": 5.501792114695341e-05,
"loss": 0.3406,
"step": 764
},
{
"epoch": 1.8183169788878977,
"grad_norm": 0.5393305420875549,
"learning_rate": 5.495818399044206e-05,
"loss": 0.3324,
"step": 765
},
{
"epoch": 1.8206958073148973,
"grad_norm": 0.5477187037467957,
"learning_rate": 5.4898446833930705e-05,
"loss": 0.3,
"step": 766
},
{
"epoch": 1.823074635741897,
"grad_norm": 0.6073659062385559,
"learning_rate": 5.4838709677419355e-05,
"loss": 0.439,
"step": 767
},
{
"epoch": 1.8254534641688969,
"grad_norm": 0.5603262186050415,
"learning_rate": 5.477897252090801e-05,
"loss": 0.363,
"step": 768
},
{
"epoch": 1.8278322925958965,
"grad_norm": 0.507232129573822,
"learning_rate": 5.471923536439666e-05,
"loss": 0.307,
"step": 769
},
{
"epoch": 1.8302111210228962,
"grad_norm": 0.605401337146759,
"learning_rate": 5.4659498207885306e-05,
"loss": 0.3638,
"step": 770
},
{
"epoch": 1.832589949449896,
"grad_norm": 0.4511045515537262,
"learning_rate": 5.4599761051373956e-05,
"loss": 0.2924,
"step": 771
},
{
"epoch": 1.8349687778768957,
"grad_norm": 0.5773246884346008,
"learning_rate": 5.45400238948626e-05,
"loss": 0.3979,
"step": 772
},
{
"epoch": 1.8373476063038954,
"grad_norm": 0.5383531451225281,
"learning_rate": 5.4480286738351264e-05,
"loss": 0.3491,
"step": 773
},
{
"epoch": 1.839726434730895,
"grad_norm": 0.48941031098365784,
"learning_rate": 5.442054958183991e-05,
"loss": 0.3749,
"step": 774
},
{
"epoch": 1.8421052631578947,
"grad_norm": 0.5394012331962585,
"learning_rate": 5.436081242532856e-05,
"loss": 0.3267,
"step": 775
},
{
"epoch": 1.8444840915848943,
"grad_norm": 0.5717118382453918,
"learning_rate": 5.43010752688172e-05,
"loss": 0.3667,
"step": 776
},
{
"epoch": 1.846862920011894,
"grad_norm": 0.634171724319458,
"learning_rate": 5.4241338112305865e-05,
"loss": 0.4774,
"step": 777
},
{
"epoch": 1.8492417484388939,
"grad_norm": 0.5577214956283569,
"learning_rate": 5.418160095579451e-05,
"loss": 0.393,
"step": 778
},
{
"epoch": 1.8516205768658935,
"grad_norm": 0.6162117123603821,
"learning_rate": 5.412186379928316e-05,
"loss": 0.416,
"step": 779
},
{
"epoch": 1.8539994052928932,
"grad_norm": 0.48546504974365234,
"learning_rate": 5.40621266427718e-05,
"loss": 0.2887,
"step": 780
},
{
"epoch": 1.856378233719893,
"grad_norm": 0.5625360012054443,
"learning_rate": 5.400238948626045e-05,
"loss": 0.3789,
"step": 781
},
{
"epoch": 1.8587570621468927,
"grad_norm": 0.5299000144004822,
"learning_rate": 5.394265232974911e-05,
"loss": 0.3646,
"step": 782
},
{
"epoch": 1.8611358905738924,
"grad_norm": 0.5966357588768005,
"learning_rate": 5.388291517323776e-05,
"loss": 0.3386,
"step": 783
},
{
"epoch": 1.863514719000892,
"grad_norm": 0.520456075668335,
"learning_rate": 5.3823178016726403e-05,
"loss": 0.3239,
"step": 784
},
{
"epoch": 1.8658935474278917,
"grad_norm": 0.5554583668708801,
"learning_rate": 5.3763440860215054e-05,
"loss": 0.3616,
"step": 785
},
{
"epoch": 1.8682723758548914,
"grad_norm": 0.5244592428207397,
"learning_rate": 5.370370370370371e-05,
"loss": 0.3714,
"step": 786
},
{
"epoch": 1.870651204281891,
"grad_norm": 0.4749840497970581,
"learning_rate": 5.364396654719236e-05,
"loss": 0.3731,
"step": 787
},
{
"epoch": 1.873030032708891,
"grad_norm": 0.5398964881896973,
"learning_rate": 5.3584229390681005e-05,
"loss": 0.3281,
"step": 788
},
{
"epoch": 1.8754088611358906,
"grad_norm": 0.5688257217407227,
"learning_rate": 5.3524492234169655e-05,
"loss": 0.3677,
"step": 789
},
{
"epoch": 1.8777876895628904,
"grad_norm": 0.5520471930503845,
"learning_rate": 5.34647550776583e-05,
"loss": 0.3801,
"step": 790
},
{
"epoch": 1.88016651798989,
"grad_norm": 0.547389566898346,
"learning_rate": 5.340501792114696e-05,
"loss": 0.368,
"step": 791
},
{
"epoch": 1.8825453464168898,
"grad_norm": 0.47842028737068176,
"learning_rate": 5.3345280764635606e-05,
"loss": 0.3322,
"step": 792
},
{
"epoch": 1.8849241748438894,
"grad_norm": 0.5586636066436768,
"learning_rate": 5.3285543608124256e-05,
"loss": 0.3678,
"step": 793
},
{
"epoch": 1.887303003270889,
"grad_norm": 0.5233385562896729,
"learning_rate": 5.32258064516129e-05,
"loss": 0.3498,
"step": 794
},
{
"epoch": 1.8896818316978887,
"grad_norm": 0.4903348684310913,
"learning_rate": 5.316606929510155e-05,
"loss": 0.3427,
"step": 795
},
{
"epoch": 1.8920606601248884,
"grad_norm": 0.48647114634513855,
"learning_rate": 5.310633213859021e-05,
"loss": 0.2726,
"step": 796
},
{
"epoch": 1.894439488551888,
"grad_norm": 0.5279799699783325,
"learning_rate": 5.304659498207886e-05,
"loss": 0.3228,
"step": 797
},
{
"epoch": 1.896818316978888,
"grad_norm": 0.6157146692276001,
"learning_rate": 5.29868578255675e-05,
"loss": 0.3942,
"step": 798
},
{
"epoch": 1.8991971454058876,
"grad_norm": 0.5570871829986572,
"learning_rate": 5.292712066905615e-05,
"loss": 0.375,
"step": 799
},
{
"epoch": 1.9015759738328875,
"grad_norm": 0.48831409215927124,
"learning_rate": 5.286738351254481e-05,
"loss": 0.3498,
"step": 800
},
{
"epoch": 1.9015759738328875,
"eval_loss": 0.3837396204471588,
"eval_runtime": 24.7664,
"eval_samples_per_second": 30.202,
"eval_steps_per_second": 15.101,
"step": 800
},
{
"epoch": 1.9039548022598871,
"grad_norm": 0.5372702479362488,
"learning_rate": 5.280764635603346e-05,
"loss": 0.3304,
"step": 801
},
{
"epoch": 1.9063336306868868,
"grad_norm": 0.4852701425552368,
"learning_rate": 5.27479091995221e-05,
"loss": 0.3299,
"step": 802
},
{
"epoch": 1.9087124591138864,
"grad_norm": 0.614133894443512,
"learning_rate": 5.268817204301075e-05,
"loss": 0.3738,
"step": 803
},
{
"epoch": 1.911091287540886,
"grad_norm": 0.5791040658950806,
"learning_rate": 5.26284348864994e-05,
"loss": 0.3164,
"step": 804
},
{
"epoch": 1.9134701159678857,
"grad_norm": 0.542182207107544,
"learning_rate": 5.256869772998806e-05,
"loss": 0.3784,
"step": 805
},
{
"epoch": 1.9158489443948854,
"grad_norm": 0.5163738131523132,
"learning_rate": 5.25089605734767e-05,
"loss": 0.3684,
"step": 806
},
{
"epoch": 1.918227772821885,
"grad_norm": 0.5529613494873047,
"learning_rate": 5.2449223416965354e-05,
"loss": 0.3658,
"step": 807
},
{
"epoch": 1.920606601248885,
"grad_norm": 0.545074462890625,
"learning_rate": 5.2389486260454004e-05,
"loss": 0.3343,
"step": 808
},
{
"epoch": 1.9229854296758846,
"grad_norm": 0.5097286701202393,
"learning_rate": 5.232974910394266e-05,
"loss": 0.3189,
"step": 809
},
{
"epoch": 1.9253642581028845,
"grad_norm": 0.5587269067764282,
"learning_rate": 5.2270011947431304e-05,
"loss": 0.2856,
"step": 810
},
{
"epoch": 1.9277430865298841,
"grad_norm": 0.47120344638824463,
"learning_rate": 5.2210274790919955e-05,
"loss": 0.336,
"step": 811
},
{
"epoch": 1.9301219149568838,
"grad_norm": 0.5026476979255676,
"learning_rate": 5.2150537634408605e-05,
"loss": 0.3642,
"step": 812
},
{
"epoch": 1.9325007433838834,
"grad_norm": 0.4788722097873688,
"learning_rate": 5.209080047789725e-05,
"loss": 0.3609,
"step": 813
},
{
"epoch": 1.934879571810883,
"grad_norm": 0.5492738485336304,
"learning_rate": 5.2031063321385906e-05,
"loss": 0.3353,
"step": 814
},
{
"epoch": 1.9372584002378828,
"grad_norm": 0.5270171165466309,
"learning_rate": 5.1971326164874556e-05,
"loss": 0.2813,
"step": 815
},
{
"epoch": 1.9396372286648824,
"grad_norm": 0.5411754250526428,
"learning_rate": 5.1911589008363206e-05,
"loss": 0.3842,
"step": 816
},
{
"epoch": 1.9420160570918823,
"grad_norm": 0.5464585423469543,
"learning_rate": 5.185185185185185e-05,
"loss": 0.339,
"step": 817
},
{
"epoch": 1.944394885518882,
"grad_norm": 0.5153761506080627,
"learning_rate": 5.17921146953405e-05,
"loss": 0.3486,
"step": 818
},
{
"epoch": 1.9467737139458816,
"grad_norm": 0.5572924613952637,
"learning_rate": 5.173237753882916e-05,
"loss": 0.39,
"step": 819
},
{
"epoch": 1.9491525423728815,
"grad_norm": 0.5603399872779846,
"learning_rate": 5.167264038231781e-05,
"loss": 0.3799,
"step": 820
},
{
"epoch": 1.9515313707998811,
"grad_norm": 0.5274227857589722,
"learning_rate": 5.161290322580645e-05,
"loss": 0.3355,
"step": 821
},
{
"epoch": 1.9539101992268808,
"grad_norm": 0.5358468890190125,
"learning_rate": 5.15531660692951e-05,
"loss": 0.3252,
"step": 822
},
{
"epoch": 1.9562890276538805,
"grad_norm": 0.5450243353843689,
"learning_rate": 5.149342891278376e-05,
"loss": 0.3685,
"step": 823
},
{
"epoch": 1.9586678560808801,
"grad_norm": 0.5191539525985718,
"learning_rate": 5.143369175627241e-05,
"loss": 0.3151,
"step": 824
},
{
"epoch": 1.9610466845078798,
"grad_norm": 0.48101523518562317,
"learning_rate": 5.137395459976105e-05,
"loss": 0.3056,
"step": 825
},
{
"epoch": 1.9634255129348794,
"grad_norm": 0.4933248460292816,
"learning_rate": 5.13142174432497e-05,
"loss": 0.3303,
"step": 826
},
{
"epoch": 1.9658043413618793,
"grad_norm": 0.5369696617126465,
"learning_rate": 5.1254480286738346e-05,
"loss": 0.3708,
"step": 827
},
{
"epoch": 1.968183169788879,
"grad_norm": 0.4536949396133423,
"learning_rate": 5.119474313022701e-05,
"loss": 0.3057,
"step": 828
},
{
"epoch": 1.9705619982158786,
"grad_norm": 0.5654657483100891,
"learning_rate": 5.1135005973715653e-05,
"loss": 0.4042,
"step": 829
},
{
"epoch": 1.9729408266428785,
"grad_norm": 0.5114840865135193,
"learning_rate": 5.1075268817204304e-05,
"loss": 0.3863,
"step": 830
},
{
"epoch": 1.9753196550698782,
"grad_norm": 0.5588715672492981,
"learning_rate": 5.101553166069295e-05,
"loss": 0.3143,
"step": 831
},
{
"epoch": 1.9776984834968778,
"grad_norm": 0.5302734375,
"learning_rate": 5.095579450418161e-05,
"loss": 0.3683,
"step": 832
},
{
"epoch": 1.9800773119238775,
"grad_norm": 0.4852687120437622,
"learning_rate": 5.0896057347670255e-05,
"loss": 0.2965,
"step": 833
},
{
"epoch": 1.9824561403508771,
"grad_norm": 0.5499604940414429,
"learning_rate": 5.0836320191158905e-05,
"loss": 0.3342,
"step": 834
},
{
"epoch": 1.9848349687778768,
"grad_norm": 0.5646479725837708,
"learning_rate": 5.077658303464755e-05,
"loss": 0.3291,
"step": 835
},
{
"epoch": 1.9872137972048765,
"grad_norm": 0.4746275246143341,
"learning_rate": 5.07168458781362e-05,
"loss": 0.3243,
"step": 836
},
{
"epoch": 1.9895926256318763,
"grad_norm": 0.5071300864219666,
"learning_rate": 5.0657108721624856e-05,
"loss": 0.3593,
"step": 837
},
{
"epoch": 1.991971454058876,
"grad_norm": 0.4464685916900635,
"learning_rate": 5.0597371565113506e-05,
"loss": 0.3016,
"step": 838
},
{
"epoch": 1.9943502824858759,
"grad_norm": 0.5198728442192078,
"learning_rate": 5.053763440860215e-05,
"loss": 0.4076,
"step": 839
},
{
"epoch": 1.9967291109128755,
"grad_norm": 0.4641326367855072,
"learning_rate": 5.04778972520908e-05,
"loss": 0.3481,
"step": 840
},
{
"epoch": 1.9991079393398752,
"grad_norm": 0.5178696513175964,
"learning_rate": 5.041816009557945e-05,
"loss": 0.3941,
"step": 841
},
{
"epoch": 2.0,
"grad_norm": 0.9851695895195007,
"learning_rate": 5.035842293906811e-05,
"loss": 0.3565,
"step": 842
},
{
"epoch": 2.0023788284269997,
"grad_norm": 0.47530972957611084,
"learning_rate": 5.029868578255675e-05,
"loss": 0.319,
"step": 843
},
{
"epoch": 2.0047576568539993,
"grad_norm": 0.42570582032203674,
"learning_rate": 5.02389486260454e-05,
"loss": 0.2761,
"step": 844
},
{
"epoch": 2.007136485280999,
"grad_norm": 0.45585909485816956,
"learning_rate": 5.017921146953405e-05,
"loss": 0.3012,
"step": 845
},
{
"epoch": 2.0095153137079986,
"grad_norm": 0.4082046449184418,
"learning_rate": 5.011947431302271e-05,
"loss": 0.2773,
"step": 846
},
{
"epoch": 2.0118941421349987,
"grad_norm": 0.4512028992176056,
"learning_rate": 5.005973715651135e-05,
"loss": 0.32,
"step": 847
},
{
"epoch": 2.0142729705619984,
"grad_norm": 0.5121605396270752,
"learning_rate": 5e-05,
"loss": 0.3279,
"step": 848
},
{
"epoch": 2.016651798988998,
"grad_norm": 0.5313460230827332,
"learning_rate": 4.994026284348865e-05,
"loss": 0.338,
"step": 849
},
{
"epoch": 2.0190306274159977,
"grad_norm": 0.45323655009269714,
"learning_rate": 4.98805256869773e-05,
"loss": 0.2707,
"step": 850
},
{
"epoch": 2.0214094558429974,
"grad_norm": 0.5094829201698303,
"learning_rate": 4.982078853046595e-05,
"loss": 0.2984,
"step": 851
},
{
"epoch": 2.023788284269997,
"grad_norm": 0.4851239323616028,
"learning_rate": 4.9761051373954604e-05,
"loss": 0.2722,
"step": 852
},
{
"epoch": 2.0261671126969967,
"grad_norm": 0.4356750547885895,
"learning_rate": 4.9701314217443254e-05,
"loss": 0.2569,
"step": 853
},
{
"epoch": 2.0285459411239963,
"grad_norm": 0.5339999794960022,
"learning_rate": 4.9641577060931904e-05,
"loss": 0.306,
"step": 854
},
{
"epoch": 2.030924769550996,
"grad_norm": 0.6356980204582214,
"learning_rate": 4.9581839904420555e-05,
"loss": 0.3564,
"step": 855
},
{
"epoch": 2.0333035979779956,
"grad_norm": 0.4849913418292999,
"learning_rate": 4.95221027479092e-05,
"loss": 0.3277,
"step": 856
},
{
"epoch": 2.0356824264049957,
"grad_norm": 0.5072147250175476,
"learning_rate": 4.9462365591397855e-05,
"loss": 0.305,
"step": 857
},
{
"epoch": 2.0380612548319954,
"grad_norm": 0.5672339797019958,
"learning_rate": 4.94026284348865e-05,
"loss": 0.3457,
"step": 858
},
{
"epoch": 2.040440083258995,
"grad_norm": 0.5494199991226196,
"learning_rate": 4.9342891278375156e-05,
"loss": 0.3739,
"step": 859
},
{
"epoch": 2.0428189116859947,
"grad_norm": 0.5226100087165833,
"learning_rate": 4.92831541218638e-05,
"loss": 0.2661,
"step": 860
},
{
"epoch": 2.0451977401129944,
"grad_norm": 0.49554064869880676,
"learning_rate": 4.9223416965352456e-05,
"loss": 0.2994,
"step": 861
},
{
"epoch": 2.047576568539994,
"grad_norm": 0.5684688091278076,
"learning_rate": 4.91636798088411e-05,
"loss": 0.353,
"step": 862
},
{
"epoch": 2.0499553969669937,
"grad_norm": 0.5614392161369324,
"learning_rate": 4.910394265232976e-05,
"loss": 0.2898,
"step": 863
},
{
"epoch": 2.0523342253939933,
"grad_norm": 0.522982120513916,
"learning_rate": 4.90442054958184e-05,
"loss": 0.277,
"step": 864
},
{
"epoch": 2.054713053820993,
"grad_norm": 0.48135489225387573,
"learning_rate": 4.898446833930705e-05,
"loss": 0.2326,
"step": 865
},
{
"epoch": 2.0570918822479927,
"grad_norm": 0.505936324596405,
"learning_rate": 4.89247311827957e-05,
"loss": 0.2519,
"step": 866
},
{
"epoch": 2.0594707106749928,
"grad_norm": 0.5319898724555969,
"learning_rate": 4.886499402628435e-05,
"loss": 0.2914,
"step": 867
},
{
"epoch": 2.0618495391019924,
"grad_norm": 0.5627797842025757,
"learning_rate": 4.8805256869773e-05,
"loss": 0.2827,
"step": 868
},
{
"epoch": 2.064228367528992,
"grad_norm": 0.5255232453346252,
"learning_rate": 4.874551971326165e-05,
"loss": 0.2431,
"step": 869
},
{
"epoch": 2.0666071959559917,
"grad_norm": 0.58738774061203,
"learning_rate": 4.86857825567503e-05,
"loss": 0.2937,
"step": 870
},
{
"epoch": 2.0689860243829914,
"grad_norm": 0.4902366101741791,
"learning_rate": 4.862604540023895e-05,
"loss": 0.2908,
"step": 871
},
{
"epoch": 2.071364852809991,
"grad_norm": 0.5246136784553528,
"learning_rate": 4.8566308243727596e-05,
"loss": 0.2602,
"step": 872
},
{
"epoch": 2.0737436812369907,
"grad_norm": 0.555491030216217,
"learning_rate": 4.850657108721625e-05,
"loss": 0.2981,
"step": 873
},
{
"epoch": 2.0761225096639904,
"grad_norm": 0.5617752075195312,
"learning_rate": 4.84468339307049e-05,
"loss": 0.3404,
"step": 874
},
{
"epoch": 2.07850133809099,
"grad_norm": 0.5913593769073486,
"learning_rate": 4.8387096774193554e-05,
"loss": 0.3252,
"step": 875
},
{
"epoch": 2.0808801665179897,
"grad_norm": 0.5444798469543457,
"learning_rate": 4.83273596176822e-05,
"loss": 0.2884,
"step": 876
},
{
"epoch": 2.08325899494499,
"grad_norm": 0.5903862714767456,
"learning_rate": 4.8267622461170854e-05,
"loss": 0.2807,
"step": 877
},
{
"epoch": 2.0856378233719894,
"grad_norm": 0.49321287870407104,
"learning_rate": 4.82078853046595e-05,
"loss": 0.2533,
"step": 878
},
{
"epoch": 2.088016651798989,
"grad_norm": 0.5765815377235413,
"learning_rate": 4.814814814814815e-05,
"loss": 0.3025,
"step": 879
},
{
"epoch": 2.0903954802259888,
"grad_norm": 0.6189472079277039,
"learning_rate": 4.80884109916368e-05,
"loss": 0.3258,
"step": 880
},
{
"epoch": 2.0927743086529884,
"grad_norm": 0.48916128277778625,
"learning_rate": 4.802867383512545e-05,
"loss": 0.2493,
"step": 881
},
{
"epoch": 2.095153137079988,
"grad_norm": 0.5639986991882324,
"learning_rate": 4.79689366786141e-05,
"loss": 0.3068,
"step": 882
},
{
"epoch": 2.0975319655069877,
"grad_norm": 0.6248376965522766,
"learning_rate": 4.790919952210275e-05,
"loss": 0.2955,
"step": 883
},
{
"epoch": 2.0999107939339874,
"grad_norm": 0.5759831666946411,
"learning_rate": 4.78494623655914e-05,
"loss": 0.3161,
"step": 884
},
{
"epoch": 2.102289622360987,
"grad_norm": 0.5416770577430725,
"learning_rate": 4.778972520908005e-05,
"loss": 0.2909,
"step": 885
},
{
"epoch": 2.1046684507879867,
"grad_norm": 0.5953570604324341,
"learning_rate": 4.77299880525687e-05,
"loss": 0.3355,
"step": 886
},
{
"epoch": 2.107047279214987,
"grad_norm": 0.5626474022865295,
"learning_rate": 4.767025089605735e-05,
"loss": 0.2999,
"step": 887
},
{
"epoch": 2.1094261076419865,
"grad_norm": 0.535835325717926,
"learning_rate": 4.7610513739546e-05,
"loss": 0.2792,
"step": 888
},
{
"epoch": 2.111804936068986,
"grad_norm": 0.4889310598373413,
"learning_rate": 4.755077658303465e-05,
"loss": 0.2638,
"step": 889
},
{
"epoch": 2.1141837644959858,
"grad_norm": 0.5014443397521973,
"learning_rate": 4.74910394265233e-05,
"loss": 0.2865,
"step": 890
},
{
"epoch": 2.1165625929229854,
"grad_norm": 0.5222111344337463,
"learning_rate": 4.743130227001195e-05,
"loss": 0.2829,
"step": 891
},
{
"epoch": 2.118941421349985,
"grad_norm": 0.5849918723106384,
"learning_rate": 4.73715651135006e-05,
"loss": 0.3257,
"step": 892
},
{
"epoch": 2.1213202497769847,
"grad_norm": 0.516745924949646,
"learning_rate": 4.731182795698925e-05,
"loss": 0.3169,
"step": 893
},
{
"epoch": 2.1236990782039844,
"grad_norm": 0.5032657384872437,
"learning_rate": 4.72520908004779e-05,
"loss": 0.2685,
"step": 894
},
{
"epoch": 2.126077906630984,
"grad_norm": 0.5356237888336182,
"learning_rate": 4.7192353643966546e-05,
"loss": 0.2767,
"step": 895
},
{
"epoch": 2.128456735057984,
"grad_norm": 0.6084557771682739,
"learning_rate": 4.71326164874552e-05,
"loss": 0.2951,
"step": 896
},
{
"epoch": 2.130835563484984,
"grad_norm": 0.5304860472679138,
"learning_rate": 4.707287933094385e-05,
"loss": 0.2738,
"step": 897
},
{
"epoch": 2.1332143919119835,
"grad_norm": 0.5701313018798828,
"learning_rate": 4.7013142174432504e-05,
"loss": 0.3126,
"step": 898
},
{
"epoch": 2.135593220338983,
"grad_norm": 0.5780851244926453,
"learning_rate": 4.695340501792115e-05,
"loss": 0.2865,
"step": 899
},
{
"epoch": 2.137972048765983,
"grad_norm": 0.4513755440711975,
"learning_rate": 4.6893667861409805e-05,
"loss": 0.2668,
"step": 900
},
{
"epoch": 2.137972048765983,
"eval_loss": 0.39199650287628174,
"eval_runtime": 24.8995,
"eval_samples_per_second": 30.041,
"eval_steps_per_second": 15.02,
"step": 900
},
{
"epoch": 2.1403508771929824,
"grad_norm": 0.5768704414367676,
"learning_rate": 4.683393070489845e-05,
"loss": 0.256,
"step": 901
},
{
"epoch": 2.142729705619982,
"grad_norm": 0.5071364045143127,
"learning_rate": 4.67741935483871e-05,
"loss": 0.2681,
"step": 902
},
{
"epoch": 2.1451085340469818,
"grad_norm": 0.6088786721229553,
"learning_rate": 4.671445639187575e-05,
"loss": 0.2721,
"step": 903
},
{
"epoch": 2.1474873624739814,
"grad_norm": 0.5659234523773193,
"learning_rate": 4.66547192353644e-05,
"loss": 0.3302,
"step": 904
},
{
"epoch": 2.149866190900981,
"grad_norm": 0.5510467886924744,
"learning_rate": 4.659498207885305e-05,
"loss": 0.2804,
"step": 905
},
{
"epoch": 2.1522450193279807,
"grad_norm": 0.6317784190177917,
"learning_rate": 4.65352449223417e-05,
"loss": 0.4077,
"step": 906
},
{
"epoch": 2.154623847754981,
"grad_norm": 0.5438193082809448,
"learning_rate": 4.647550776583035e-05,
"loss": 0.349,
"step": 907
},
{
"epoch": 2.1570026761819805,
"grad_norm": 0.5480299592018127,
"learning_rate": 4.6415770609319e-05,
"loss": 0.2482,
"step": 908
},
{
"epoch": 2.15938150460898,
"grad_norm": 0.5093241333961487,
"learning_rate": 4.635603345280765e-05,
"loss": 0.285,
"step": 909
},
{
"epoch": 2.16176033303598,
"grad_norm": 0.600224494934082,
"learning_rate": 4.62962962962963e-05,
"loss": 0.2936,
"step": 910
},
{
"epoch": 2.1641391614629795,
"grad_norm": 0.5286124348640442,
"learning_rate": 4.6236559139784944e-05,
"loss": 0.3155,
"step": 911
},
{
"epoch": 2.166517989889979,
"grad_norm": 0.5486891865730286,
"learning_rate": 4.61768219832736e-05,
"loss": 0.2868,
"step": 912
},
{
"epoch": 2.168896818316979,
"grad_norm": 0.5705512166023254,
"learning_rate": 4.6117084826762245e-05,
"loss": 0.2534,
"step": 913
},
{
"epoch": 2.1712756467439784,
"grad_norm": 0.5283891558647156,
"learning_rate": 4.60573476702509e-05,
"loss": 0.2449,
"step": 914
},
{
"epoch": 2.173654475170978,
"grad_norm": 0.5561732053756714,
"learning_rate": 4.5997610513739546e-05,
"loss": 0.2469,
"step": 915
},
{
"epoch": 2.176033303597978,
"grad_norm": 0.5191594362258911,
"learning_rate": 4.59378733572282e-05,
"loss": 0.266,
"step": 916
},
{
"epoch": 2.178412132024978,
"grad_norm": 0.5965383648872375,
"learning_rate": 4.5878136200716846e-05,
"loss": 0.3015,
"step": 917
},
{
"epoch": 2.1807909604519775,
"grad_norm": 0.5929700136184692,
"learning_rate": 4.5818399044205496e-05,
"loss": 0.3173,
"step": 918
},
{
"epoch": 2.183169788878977,
"grad_norm": 0.5193952918052673,
"learning_rate": 4.575866188769415e-05,
"loss": 0.3002,
"step": 919
},
{
"epoch": 2.185548617305977,
"grad_norm": 0.5817952752113342,
"learning_rate": 4.56989247311828e-05,
"loss": 0.3217,
"step": 920
},
{
"epoch": 2.1879274457329765,
"grad_norm": 0.5975373983383179,
"learning_rate": 4.563918757467145e-05,
"loss": 0.3132,
"step": 921
},
{
"epoch": 2.190306274159976,
"grad_norm": 0.5984764695167542,
"learning_rate": 4.55794504181601e-05,
"loss": 0.2953,
"step": 922
},
{
"epoch": 2.192685102586976,
"grad_norm": 0.6337423324584961,
"learning_rate": 4.551971326164875e-05,
"loss": 0.2945,
"step": 923
},
{
"epoch": 2.1950639310139755,
"grad_norm": 0.6041463613510132,
"learning_rate": 4.54599761051374e-05,
"loss": 0.2643,
"step": 924
},
{
"epoch": 2.197442759440975,
"grad_norm": 0.5808432102203369,
"learning_rate": 4.540023894862604e-05,
"loss": 0.3021,
"step": 925
},
{
"epoch": 2.199821587867975,
"grad_norm": 0.5449970364570618,
"learning_rate": 4.53405017921147e-05,
"loss": 0.2452,
"step": 926
},
{
"epoch": 2.202200416294975,
"grad_norm": 0.5480767488479614,
"learning_rate": 4.528076463560334e-05,
"loss": 0.2762,
"step": 927
},
{
"epoch": 2.2045792447219745,
"grad_norm": 0.5530625581741333,
"learning_rate": 4.5221027479092e-05,
"loss": 0.3128,
"step": 928
},
{
"epoch": 2.206958073148974,
"grad_norm": 0.5787012577056885,
"learning_rate": 4.516129032258064e-05,
"loss": 0.2905,
"step": 929
},
{
"epoch": 2.209336901575974,
"grad_norm": 0.5901174545288086,
"learning_rate": 4.51015531660693e-05,
"loss": 0.2822,
"step": 930
},
{
"epoch": 2.2117157300029735,
"grad_norm": 0.571593165397644,
"learning_rate": 4.5041816009557944e-05,
"loss": 0.3344,
"step": 931
},
{
"epoch": 2.214094558429973,
"grad_norm": 0.6192464828491211,
"learning_rate": 4.49820788530466e-05,
"loss": 0.294,
"step": 932
},
{
"epoch": 2.216473386856973,
"grad_norm": 0.5928755402565002,
"learning_rate": 4.4922341696535244e-05,
"loss": 0.2952,
"step": 933
},
{
"epoch": 2.2188522152839725,
"grad_norm": 0.5460443496704102,
"learning_rate": 4.4862604540023894e-05,
"loss": 0.2638,
"step": 934
},
{
"epoch": 2.221231043710972,
"grad_norm": 0.47907713055610657,
"learning_rate": 4.4802867383512545e-05,
"loss": 0.2629,
"step": 935
},
{
"epoch": 2.2236098721379722,
"grad_norm": 0.552532434463501,
"learning_rate": 4.4743130227001195e-05,
"loss": 0.2394,
"step": 936
},
{
"epoch": 2.225988700564972,
"grad_norm": 0.6192559599876404,
"learning_rate": 4.4683393070489845e-05,
"loss": 0.2764,
"step": 937
},
{
"epoch": 2.2283675289919715,
"grad_norm": 0.6276203393936157,
"learning_rate": 4.4623655913978496e-05,
"loss": 0.3492,
"step": 938
},
{
"epoch": 2.230746357418971,
"grad_norm": 0.5791794061660767,
"learning_rate": 4.4563918757467146e-05,
"loss": 0.2908,
"step": 939
},
{
"epoch": 2.233125185845971,
"grad_norm": 0.5309539437294006,
"learning_rate": 4.4504181600955796e-05,
"loss": 0.2684,
"step": 940
},
{
"epoch": 2.2355040142729705,
"grad_norm": 0.5494111180305481,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.2727,
"step": 941
},
{
"epoch": 2.23788284269997,
"grad_norm": 0.5954378843307495,
"learning_rate": 4.43847072879331e-05,
"loss": 0.275,
"step": 942
},
{
"epoch": 2.24026167112697,
"grad_norm": 0.5295591950416565,
"learning_rate": 4.432497013142175e-05,
"loss": 0.259,
"step": 943
},
{
"epoch": 2.2426404995539695,
"grad_norm": 0.6501302123069763,
"learning_rate": 4.42652329749104e-05,
"loss": 0.3046,
"step": 944
},
{
"epoch": 2.2450193279809696,
"grad_norm": 0.6218990087509155,
"learning_rate": 4.420549581839905e-05,
"loss": 0.3367,
"step": 945
},
{
"epoch": 2.2473981564079692,
"grad_norm": 0.549957275390625,
"learning_rate": 4.41457586618877e-05,
"loss": 0.2916,
"step": 946
},
{
"epoch": 2.249776984834969,
"grad_norm": 0.6548086404800415,
"learning_rate": 4.408602150537635e-05,
"loss": 0.3352,
"step": 947
},
{
"epoch": 2.2521558132619686,
"grad_norm": 0.5784884095191956,
"learning_rate": 4.402628434886499e-05,
"loss": 0.3103,
"step": 948
},
{
"epoch": 2.254534641688968,
"grad_norm": 0.5839419364929199,
"learning_rate": 4.396654719235365e-05,
"loss": 0.3194,
"step": 949
},
{
"epoch": 2.256913470115968,
"grad_norm": 0.6073633432388306,
"learning_rate": 4.390681003584229e-05,
"loss": 0.3136,
"step": 950
},
{
"epoch": 2.2592922985429675,
"grad_norm": 0.5259355902671814,
"learning_rate": 4.384707287933095e-05,
"loss": 0.2504,
"step": 951
},
{
"epoch": 2.261671126969967,
"grad_norm": 0.5521760582923889,
"learning_rate": 4.378733572281959e-05,
"loss": 0.2783,
"step": 952
},
{
"epoch": 2.264049955396967,
"grad_norm": 0.6189229488372803,
"learning_rate": 4.372759856630825e-05,
"loss": 0.2586,
"step": 953
},
{
"epoch": 2.2664287838239665,
"grad_norm": 0.5720909833908081,
"learning_rate": 4.3667861409796894e-05,
"loss": 0.2539,
"step": 954
},
{
"epoch": 2.268807612250966,
"grad_norm": 0.5107256174087524,
"learning_rate": 4.360812425328555e-05,
"loss": 0.2295,
"step": 955
},
{
"epoch": 2.2711864406779663,
"grad_norm": 0.5512247681617737,
"learning_rate": 4.3548387096774194e-05,
"loss": 0.323,
"step": 956
},
{
"epoch": 2.273565269104966,
"grad_norm": 0.6190577745437622,
"learning_rate": 4.3488649940262845e-05,
"loss": 0.3358,
"step": 957
},
{
"epoch": 2.2759440975319656,
"grad_norm": 0.5246328711509705,
"learning_rate": 4.3428912783751495e-05,
"loss": 0.2112,
"step": 958
},
{
"epoch": 2.2783229259589652,
"grad_norm": 0.6078363656997681,
"learning_rate": 4.3369175627240145e-05,
"loss": 0.2555,
"step": 959
},
{
"epoch": 2.280701754385965,
"grad_norm": 0.6270473599433899,
"learning_rate": 4.3309438470728796e-05,
"loss": 0.292,
"step": 960
},
{
"epoch": 2.2830805828129646,
"grad_norm": 0.6397281885147095,
"learning_rate": 4.3249701314217446e-05,
"loss": 0.2761,
"step": 961
},
{
"epoch": 2.285459411239964,
"grad_norm": 0.6567078828811646,
"learning_rate": 4.3189964157706096e-05,
"loss": 0.2742,
"step": 962
},
{
"epoch": 2.287838239666964,
"grad_norm": 0.5602155327796936,
"learning_rate": 4.3130227001194746e-05,
"loss": 0.2316,
"step": 963
},
{
"epoch": 2.2902170680939635,
"grad_norm": 0.5947728157043457,
"learning_rate": 4.307048984468339e-05,
"loss": 0.3116,
"step": 964
},
{
"epoch": 2.2925958965209636,
"grad_norm": 0.5324766039848328,
"learning_rate": 4.301075268817205e-05,
"loss": 0.2906,
"step": 965
},
{
"epoch": 2.2949747249479633,
"grad_norm": 0.5670318603515625,
"learning_rate": 4.295101553166069e-05,
"loss": 0.3218,
"step": 966
},
{
"epoch": 2.297353553374963,
"grad_norm": 0.6235133409500122,
"learning_rate": 4.289127837514935e-05,
"loss": 0.2306,
"step": 967
},
{
"epoch": 2.2997323818019626,
"grad_norm": 0.6106896996498108,
"learning_rate": 4.283154121863799e-05,
"loss": 0.3038,
"step": 968
},
{
"epoch": 2.3021112102289623,
"grad_norm": 0.6239914298057556,
"learning_rate": 4.277180406212665e-05,
"loss": 0.289,
"step": 969
},
{
"epoch": 2.304490038655962,
"grad_norm": 0.588311493396759,
"learning_rate": 4.271206690561529e-05,
"loss": 0.301,
"step": 970
},
{
"epoch": 2.3068688670829616,
"grad_norm": 0.6421441435813904,
"learning_rate": 4.265232974910394e-05,
"loss": 0.3437,
"step": 971
},
{
"epoch": 2.3092476955099612,
"grad_norm": 0.6681007742881775,
"learning_rate": 4.259259259259259e-05,
"loss": 0.33,
"step": 972
},
{
"epoch": 2.311626523936961,
"grad_norm": 0.6258997321128845,
"learning_rate": 4.253285543608124e-05,
"loss": 0.297,
"step": 973
},
{
"epoch": 2.314005352363961,
"grad_norm": 0.572743833065033,
"learning_rate": 4.247311827956989e-05,
"loss": 0.2313,
"step": 974
},
{
"epoch": 2.3163841807909606,
"grad_norm": 0.5629462003707886,
"learning_rate": 4.241338112305854e-05,
"loss": 0.2776,
"step": 975
},
{
"epoch": 2.3187630092179603,
"grad_norm": 0.5810324549674988,
"learning_rate": 4.2353643966547194e-05,
"loss": 0.2514,
"step": 976
},
{
"epoch": 2.32114183764496,
"grad_norm": 0.6199133396148682,
"learning_rate": 4.2293906810035844e-05,
"loss": 0.3355,
"step": 977
},
{
"epoch": 2.3235206660719596,
"grad_norm": 0.4951048195362091,
"learning_rate": 4.2234169653524494e-05,
"loss": 0.2709,
"step": 978
},
{
"epoch": 2.3258994944989593,
"grad_norm": 0.5519804358482361,
"learning_rate": 4.2174432497013144e-05,
"loss": 0.263,
"step": 979
},
{
"epoch": 2.328278322925959,
"grad_norm": 0.5669978857040405,
"learning_rate": 4.2114695340501795e-05,
"loss": 0.2733,
"step": 980
},
{
"epoch": 2.3306571513529586,
"grad_norm": 0.5783933401107788,
"learning_rate": 4.2054958183990445e-05,
"loss": 0.3205,
"step": 981
},
{
"epoch": 2.3330359797799582,
"grad_norm": 0.5702626705169678,
"learning_rate": 4.1995221027479095e-05,
"loss": 0.2642,
"step": 982
},
{
"epoch": 2.335414808206958,
"grad_norm": 0.5818063020706177,
"learning_rate": 4.1935483870967746e-05,
"loss": 0.2586,
"step": 983
},
{
"epoch": 2.3377936366339576,
"grad_norm": 0.6229294538497925,
"learning_rate": 4.1875746714456396e-05,
"loss": 0.3086,
"step": 984
},
{
"epoch": 2.3401724650609577,
"grad_norm": 0.5085659027099609,
"learning_rate": 4.1816009557945046e-05,
"loss": 0.3015,
"step": 985
},
{
"epoch": 2.3425512934879573,
"grad_norm": 0.5718642473220825,
"learning_rate": 4.1756272401433697e-05,
"loss": 0.3043,
"step": 986
},
{
"epoch": 2.344930121914957,
"grad_norm": 0.6011469960212708,
"learning_rate": 4.169653524492234e-05,
"loss": 0.2431,
"step": 987
},
{
"epoch": 2.3473089503419566,
"grad_norm": 0.5666202306747437,
"learning_rate": 4.1636798088411e-05,
"loss": 0.2424,
"step": 988
},
{
"epoch": 2.3496877787689563,
"grad_norm": 0.4900258481502533,
"learning_rate": 4.157706093189964e-05,
"loss": 0.2791,
"step": 989
},
{
"epoch": 2.352066607195956,
"grad_norm": 0.5660254955291748,
"learning_rate": 4.15173237753883e-05,
"loss": 0.2993,
"step": 990
},
{
"epoch": 2.3544454356229556,
"grad_norm": 0.5348349213600159,
"learning_rate": 4.145758661887694e-05,
"loss": 0.289,
"step": 991
},
{
"epoch": 2.3568242640499553,
"grad_norm": 0.6012505292892456,
"learning_rate": 4.13978494623656e-05,
"loss": 0.3272,
"step": 992
},
{
"epoch": 2.359203092476955,
"grad_norm": 0.6564033627510071,
"learning_rate": 4.133811230585424e-05,
"loss": 0.3512,
"step": 993
},
{
"epoch": 2.361581920903955,
"grad_norm": 0.6032342314720154,
"learning_rate": 4.127837514934289e-05,
"loss": 0.2783,
"step": 994
},
{
"epoch": 2.3639607493309547,
"grad_norm": 0.5665048360824585,
"learning_rate": 4.121863799283154e-05,
"loss": 0.2502,
"step": 995
},
{
"epoch": 2.3663395777579543,
"grad_norm": 0.5682721138000488,
"learning_rate": 4.115890083632019e-05,
"loss": 0.2321,
"step": 996
},
{
"epoch": 2.368718406184954,
"grad_norm": 0.5423948168754578,
"learning_rate": 4.109916367980884e-05,
"loss": 0.303,
"step": 997
},
{
"epoch": 2.3710972346119537,
"grad_norm": 0.6970639824867249,
"learning_rate": 4.1039426523297493e-05,
"loss": 0.3652,
"step": 998
},
{
"epoch": 2.3734760630389533,
"grad_norm": 0.5611357688903809,
"learning_rate": 4.0979689366786144e-05,
"loss": 0.3083,
"step": 999
},
{
"epoch": 2.375854891465953,
"grad_norm": 0.5091648697853088,
"learning_rate": 4.0919952210274794e-05,
"loss": 0.2621,
"step": 1000
},
{
"epoch": 2.375854891465953,
"eval_loss": 0.39063429832458496,
"eval_runtime": 24.8071,
"eval_samples_per_second": 30.153,
"eval_steps_per_second": 15.076,
"step": 1000
},
{
"epoch": 2.3782337198929526,
"grad_norm": 0.657648503780365,
"learning_rate": 4.0860215053763444e-05,
"loss": 0.3406,
"step": 1001
},
{
"epoch": 2.3806125483199523,
"grad_norm": 0.5850951671600342,
"learning_rate": 4.0800477897252095e-05,
"loss": 0.2944,
"step": 1002
},
{
"epoch": 2.382991376746952,
"grad_norm": 0.5571763515472412,
"learning_rate": 4.074074074074074e-05,
"loss": 0.248,
"step": 1003
},
{
"epoch": 2.3853702051739516,
"grad_norm": 0.5550372004508972,
"learning_rate": 4.0681003584229395e-05,
"loss": 0.3049,
"step": 1004
},
{
"epoch": 2.3877490336009517,
"grad_norm": 0.5554943680763245,
"learning_rate": 4.062126642771804e-05,
"loss": 0.3124,
"step": 1005
},
{
"epoch": 2.3901278620279514,
"grad_norm": 0.5197229385375977,
"learning_rate": 4.0561529271206696e-05,
"loss": 0.2344,
"step": 1006
},
{
"epoch": 2.392506690454951,
"grad_norm": 0.5891127586364746,
"learning_rate": 4.050179211469534e-05,
"loss": 0.305,
"step": 1007
},
{
"epoch": 2.3948855188819507,
"grad_norm": 0.6232504844665527,
"learning_rate": 4.0442054958183996e-05,
"loss": 0.3513,
"step": 1008
},
{
"epoch": 2.3972643473089503,
"grad_norm": 0.7023825645446777,
"learning_rate": 4.038231780167264e-05,
"loss": 0.3333,
"step": 1009
},
{
"epoch": 2.39964317573595,
"grad_norm": 0.6072371006011963,
"learning_rate": 4.032258064516129e-05,
"loss": 0.2898,
"step": 1010
},
{
"epoch": 2.4020220041629496,
"grad_norm": 0.5343210101127625,
"learning_rate": 4.026284348864994e-05,
"loss": 0.2502,
"step": 1011
},
{
"epoch": 2.4044008325899493,
"grad_norm": 0.5760728120803833,
"learning_rate": 4.020310633213859e-05,
"loss": 0.2727,
"step": 1012
},
{
"epoch": 2.406779661016949,
"grad_norm": 0.5475410223007202,
"learning_rate": 4.014336917562724e-05,
"loss": 0.3089,
"step": 1013
},
{
"epoch": 2.409158489443949,
"grad_norm": 0.6408705115318298,
"learning_rate": 4.008363201911589e-05,
"loss": 0.2844,
"step": 1014
},
{
"epoch": 2.4115373178709487,
"grad_norm": 0.5707472562789917,
"learning_rate": 4.002389486260454e-05,
"loss": 0.2819,
"step": 1015
},
{
"epoch": 2.4139161462979484,
"grad_norm": 0.5893364548683167,
"learning_rate": 3.996415770609319e-05,
"loss": 0.3223,
"step": 1016
},
{
"epoch": 2.416294974724948,
"grad_norm": 0.5145408511161804,
"learning_rate": 3.990442054958184e-05,
"loss": 0.2778,
"step": 1017
},
{
"epoch": 2.4186738031519477,
"grad_norm": 0.6715821623802185,
"learning_rate": 3.984468339307049e-05,
"loss": 0.3036,
"step": 1018
},
{
"epoch": 2.4210526315789473,
"grad_norm": 0.5638325810432434,
"learning_rate": 3.978494623655914e-05,
"loss": 0.3206,
"step": 1019
},
{
"epoch": 2.423431460005947,
"grad_norm": 0.5965414643287659,
"learning_rate": 3.972520908004779e-05,
"loss": 0.2817,
"step": 1020
},
{
"epoch": 2.4258102884329467,
"grad_norm": 0.6358485221862793,
"learning_rate": 3.9665471923536444e-05,
"loss": 0.3195,
"step": 1021
},
{
"epoch": 2.4281891168599463,
"grad_norm": 0.5645793080329895,
"learning_rate": 3.9605734767025094e-05,
"loss": 0.2882,
"step": 1022
},
{
"epoch": 2.4305679452869464,
"grad_norm": 0.6534497737884521,
"learning_rate": 3.9545997610513744e-05,
"loss": 0.2937,
"step": 1023
},
{
"epoch": 2.432946773713946,
"grad_norm": 0.5793471932411194,
"learning_rate": 3.9486260454002395e-05,
"loss": 0.2921,
"step": 1024
},
{
"epoch": 2.4353256021409457,
"grad_norm": 0.6896522045135498,
"learning_rate": 3.9426523297491045e-05,
"loss": 0.2904,
"step": 1025
},
{
"epoch": 2.4377044305679454,
"grad_norm": 0.6361718773841858,
"learning_rate": 3.936678614097969e-05,
"loss": 0.3316,
"step": 1026
},
{
"epoch": 2.440083258994945,
"grad_norm": 0.5773645043373108,
"learning_rate": 3.9307048984468345e-05,
"loss": 0.3056,
"step": 1027
},
{
"epoch": 2.4424620874219447,
"grad_norm": 0.5956210494041443,
"learning_rate": 3.924731182795699e-05,
"loss": 0.2966,
"step": 1028
},
{
"epoch": 2.4448409158489444,
"grad_norm": 0.50902259349823,
"learning_rate": 3.9187574671445646e-05,
"loss": 0.2504,
"step": 1029
},
{
"epoch": 2.447219744275944,
"grad_norm": 0.6098092794418335,
"learning_rate": 3.912783751493429e-05,
"loss": 0.2785,
"step": 1030
},
{
"epoch": 2.4495985727029437,
"grad_norm": 0.6550672054290771,
"learning_rate": 3.906810035842295e-05,
"loss": 0.283,
"step": 1031
},
{
"epoch": 2.4519774011299433,
"grad_norm": 0.5612165927886963,
"learning_rate": 3.900836320191159e-05,
"loss": 0.2738,
"step": 1032
},
{
"epoch": 2.454356229556943,
"grad_norm": 0.5695220232009888,
"learning_rate": 3.894862604540024e-05,
"loss": 0.2887,
"step": 1033
},
{
"epoch": 2.456735057983943,
"grad_norm": 0.559490978717804,
"learning_rate": 3.888888888888889e-05,
"loss": 0.2697,
"step": 1034
},
{
"epoch": 2.4591138864109428,
"grad_norm": 0.5419275760650635,
"learning_rate": 3.882915173237754e-05,
"loss": 0.243,
"step": 1035
},
{
"epoch": 2.4614927148379424,
"grad_norm": 0.6212007999420166,
"learning_rate": 3.876941457586619e-05,
"loss": 0.2965,
"step": 1036
},
{
"epoch": 2.463871543264942,
"grad_norm": 0.5144377946853638,
"learning_rate": 3.870967741935484e-05,
"loss": 0.2576,
"step": 1037
},
{
"epoch": 2.4662503716919417,
"grad_norm": 0.6120803952217102,
"learning_rate": 3.864994026284349e-05,
"loss": 0.3826,
"step": 1038
},
{
"epoch": 2.4686292001189414,
"grad_norm": 0.6212862133979797,
"learning_rate": 3.859020310633214e-05,
"loss": 0.3232,
"step": 1039
},
{
"epoch": 2.471008028545941,
"grad_norm": 0.6324489116668701,
"learning_rate": 3.8530465949820786e-05,
"loss": 0.3372,
"step": 1040
},
{
"epoch": 2.4733868569729407,
"grad_norm": 0.5289970636367798,
"learning_rate": 3.847072879330944e-05,
"loss": 0.3208,
"step": 1041
},
{
"epoch": 2.4757656853999404,
"grad_norm": 0.5548078417778015,
"learning_rate": 3.8410991636798086e-05,
"loss": 0.2724,
"step": 1042
},
{
"epoch": 2.4781445138269405,
"grad_norm": 0.5034798383712769,
"learning_rate": 3.8351254480286743e-05,
"loss": 0.2471,
"step": 1043
},
{
"epoch": 2.48052334225394,
"grad_norm": 0.5730745196342468,
"learning_rate": 3.829151732377539e-05,
"loss": 0.3225,
"step": 1044
},
{
"epoch": 2.4829021706809398,
"grad_norm": 0.580128014087677,
"learning_rate": 3.8231780167264044e-05,
"loss": 0.2591,
"step": 1045
},
{
"epoch": 2.4852809991079394,
"grad_norm": 0.5486919283866882,
"learning_rate": 3.817204301075269e-05,
"loss": 0.2867,
"step": 1046
},
{
"epoch": 2.487659827534939,
"grad_norm": 0.5794557332992554,
"learning_rate": 3.8112305854241345e-05,
"loss": 0.3313,
"step": 1047
},
{
"epoch": 2.4900386559619387,
"grad_norm": 0.5111564993858337,
"learning_rate": 3.805256869772999e-05,
"loss": 0.2931,
"step": 1048
},
{
"epoch": 2.4924174843889384,
"grad_norm": 0.5828002095222473,
"learning_rate": 3.799283154121864e-05,
"loss": 0.2933,
"step": 1049
},
{
"epoch": 2.494796312815938,
"grad_norm": 0.6088399291038513,
"learning_rate": 3.793309438470729e-05,
"loss": 0.2784,
"step": 1050
},
{
"epoch": 2.4971751412429377,
"grad_norm": 0.5388132929801941,
"learning_rate": 3.787335722819594e-05,
"loss": 0.263,
"step": 1051
},
{
"epoch": 2.4995539696699374,
"grad_norm": 0.6311586499214172,
"learning_rate": 3.781362007168459e-05,
"loss": 0.2742,
"step": 1052
},
{
"epoch": 2.501932798096937,
"grad_norm": 0.6956512331962585,
"learning_rate": 3.775388291517324e-05,
"loss": 0.2726,
"step": 1053
},
{
"epoch": 2.504311626523937,
"grad_norm": 0.5791674852371216,
"learning_rate": 3.769414575866189e-05,
"loss": 0.2594,
"step": 1054
},
{
"epoch": 2.506690454950937,
"grad_norm": 0.6453083157539368,
"learning_rate": 3.763440860215054e-05,
"loss": 0.299,
"step": 1055
},
{
"epoch": 2.5090692833779364,
"grad_norm": 0.6255761384963989,
"learning_rate": 3.7574671445639184e-05,
"loss": 0.284,
"step": 1056
},
{
"epoch": 2.511448111804936,
"grad_norm": 0.5033700466156006,
"learning_rate": 3.751493428912784e-05,
"loss": 0.2419,
"step": 1057
},
{
"epoch": 2.5138269402319358,
"grad_norm": 0.6552553772926331,
"learning_rate": 3.7455197132616484e-05,
"loss": 0.326,
"step": 1058
},
{
"epoch": 2.5162057686589354,
"grad_norm": 0.5841801762580872,
"learning_rate": 3.739545997610514e-05,
"loss": 0.2795,
"step": 1059
},
{
"epoch": 2.518584597085935,
"grad_norm": 0.7399122714996338,
"learning_rate": 3.7335722819593785e-05,
"loss": 0.3375,
"step": 1060
},
{
"epoch": 2.5209634255129347,
"grad_norm": 0.5951675176620483,
"learning_rate": 3.727598566308244e-05,
"loss": 0.2637,
"step": 1061
},
{
"epoch": 2.5233422539399344,
"grad_norm": 0.5338478088378906,
"learning_rate": 3.7216248506571086e-05,
"loss": 0.3215,
"step": 1062
},
{
"epoch": 2.5257210823669345,
"grad_norm": 0.6295543909072876,
"learning_rate": 3.715651135005974e-05,
"loss": 0.337,
"step": 1063
},
{
"epoch": 2.528099910793934,
"grad_norm": 0.5548101663589478,
"learning_rate": 3.7096774193548386e-05,
"loss": 0.2745,
"step": 1064
},
{
"epoch": 2.530478739220934,
"grad_norm": 0.5178088545799255,
"learning_rate": 3.7037037037037037e-05,
"loss": 0.23,
"step": 1065
},
{
"epoch": 2.5328575676479335,
"grad_norm": 0.5105859041213989,
"learning_rate": 3.697729988052569e-05,
"loss": 0.1886,
"step": 1066
},
{
"epoch": 2.535236396074933,
"grad_norm": 0.5834839344024658,
"learning_rate": 3.691756272401434e-05,
"loss": 0.2866,
"step": 1067
},
{
"epoch": 2.5376152245019328,
"grad_norm": 0.6643683314323425,
"learning_rate": 3.685782556750299e-05,
"loss": 0.3365,
"step": 1068
},
{
"epoch": 2.5399940529289324,
"grad_norm": 0.5959658622741699,
"learning_rate": 3.679808841099164e-05,
"loss": 0.3177,
"step": 1069
},
{
"epoch": 2.542372881355932,
"grad_norm": 0.587335467338562,
"learning_rate": 3.673835125448029e-05,
"loss": 0.3069,
"step": 1070
},
{
"epoch": 2.5447517097829317,
"grad_norm": 0.6357367038726807,
"learning_rate": 3.667861409796894e-05,
"loss": 0.3166,
"step": 1071
},
{
"epoch": 2.547130538209932,
"grad_norm": 0.5437172055244446,
"learning_rate": 3.661887694145759e-05,
"loss": 0.2837,
"step": 1072
},
{
"epoch": 2.549509366636931,
"grad_norm": 0.5983290076255798,
"learning_rate": 3.655913978494624e-05,
"loss": 0.2586,
"step": 1073
},
{
"epoch": 2.551888195063931,
"grad_norm": 0.6598303914070129,
"learning_rate": 3.649940262843489e-05,
"loss": 0.2916,
"step": 1074
},
{
"epoch": 2.554267023490931,
"grad_norm": 0.5528435707092285,
"learning_rate": 3.643966547192354e-05,
"loss": 0.2843,
"step": 1075
},
{
"epoch": 2.5566458519179305,
"grad_norm": 0.7062620520591736,
"learning_rate": 3.637992831541219e-05,
"loss": 0.3117,
"step": 1076
},
{
"epoch": 2.55902468034493,
"grad_norm": 0.5995641946792603,
"learning_rate": 3.632019115890084e-05,
"loss": 0.3298,
"step": 1077
},
{
"epoch": 2.56140350877193,
"grad_norm": 0.6305801272392273,
"learning_rate": 3.626045400238949e-05,
"loss": 0.3426,
"step": 1078
},
{
"epoch": 2.5637823371989295,
"grad_norm": 0.6942247748374939,
"learning_rate": 3.6200716845878134e-05,
"loss": 0.2996,
"step": 1079
},
{
"epoch": 2.566161165625929,
"grad_norm": 0.6617063879966736,
"learning_rate": 3.614097968936679e-05,
"loss": 0.2998,
"step": 1080
},
{
"epoch": 2.568539994052929,
"grad_norm": 0.5509942173957825,
"learning_rate": 3.6081242532855435e-05,
"loss": 0.2671,
"step": 1081
},
{
"epoch": 2.5709188224799284,
"grad_norm": 0.6745384931564331,
"learning_rate": 3.602150537634409e-05,
"loss": 0.2895,
"step": 1082
},
{
"epoch": 2.5732976509069285,
"grad_norm": 0.6246810555458069,
"learning_rate": 3.5961768219832735e-05,
"loss": 0.2544,
"step": 1083
},
{
"epoch": 2.575676479333928,
"grad_norm": 0.6448361873626709,
"learning_rate": 3.590203106332139e-05,
"loss": 0.3294,
"step": 1084
},
{
"epoch": 2.578055307760928,
"grad_norm": 0.6272534728050232,
"learning_rate": 3.5842293906810036e-05,
"loss": 0.2618,
"step": 1085
},
{
"epoch": 2.5804341361879275,
"grad_norm": 0.6175225377082825,
"learning_rate": 3.578255675029869e-05,
"loss": 0.3305,
"step": 1086
},
{
"epoch": 2.582812964614927,
"grad_norm": 0.5742083191871643,
"learning_rate": 3.5722819593787336e-05,
"loss": 0.2782,
"step": 1087
},
{
"epoch": 2.585191793041927,
"grad_norm": 0.6171672344207764,
"learning_rate": 3.566308243727599e-05,
"loss": 0.2625,
"step": 1088
},
{
"epoch": 2.5875706214689265,
"grad_norm": 0.5434128642082214,
"learning_rate": 3.560334528076464e-05,
"loss": 0.2562,
"step": 1089
},
{
"epoch": 2.589949449895926,
"grad_norm": 0.6696468591690063,
"learning_rate": 3.554360812425329e-05,
"loss": 0.3718,
"step": 1090
},
{
"epoch": 2.592328278322926,
"grad_norm": 0.574690043926239,
"learning_rate": 3.548387096774194e-05,
"loss": 0.2481,
"step": 1091
},
{
"epoch": 2.594707106749926,
"grad_norm": 0.606299638748169,
"learning_rate": 3.542413381123059e-05,
"loss": 0.2644,
"step": 1092
},
{
"epoch": 2.597085935176925,
"grad_norm": 0.6124521493911743,
"learning_rate": 3.536439665471924e-05,
"loss": 0.2608,
"step": 1093
},
{
"epoch": 2.599464763603925,
"grad_norm": 0.5545802712440491,
"learning_rate": 3.530465949820789e-05,
"loss": 0.2714,
"step": 1094
},
{
"epoch": 2.601843592030925,
"grad_norm": 0.6126631498336792,
"learning_rate": 3.524492234169653e-05,
"loss": 0.2823,
"step": 1095
},
{
"epoch": 2.6042224204579245,
"grad_norm": 0.5954961180686951,
"learning_rate": 3.518518518518519e-05,
"loss": 0.2937,
"step": 1096
},
{
"epoch": 2.606601248884924,
"grad_norm": 0.616807222366333,
"learning_rate": 3.512544802867383e-05,
"loss": 0.3133,
"step": 1097
},
{
"epoch": 2.608980077311924,
"grad_norm": 0.6112419962882996,
"learning_rate": 3.506571087216249e-05,
"loss": 0.2772,
"step": 1098
},
{
"epoch": 2.6113589057389235,
"grad_norm": 0.552832841873169,
"learning_rate": 3.500597371565113e-05,
"loss": 0.2726,
"step": 1099
},
{
"epoch": 2.613737734165923,
"grad_norm": 0.5582488775253296,
"learning_rate": 3.494623655913979e-05,
"loss": 0.2947,
"step": 1100
},
{
"epoch": 2.613737734165923,
"eval_loss": 0.38853010535240173,
"eval_runtime": 24.8956,
"eval_samples_per_second": 30.045,
"eval_steps_per_second": 15.023,
"step": 1100
},
{
"epoch": 2.6161165625929232,
"grad_norm": 0.5219835638999939,
"learning_rate": 3.4886499402628434e-05,
"loss": 0.3032,
"step": 1101
},
{
"epoch": 2.6184953910199225,
"grad_norm": 0.612126350402832,
"learning_rate": 3.4826762246117084e-05,
"loss": 0.3414,
"step": 1102
},
{
"epoch": 2.6208742194469226,
"grad_norm": 0.5982509255409241,
"learning_rate": 3.4767025089605734e-05,
"loss": 0.2909,
"step": 1103
},
{
"epoch": 2.623253047873922,
"grad_norm": 0.6476831436157227,
"learning_rate": 3.4707287933094385e-05,
"loss": 0.3255,
"step": 1104
},
{
"epoch": 2.625631876300922,
"grad_norm": 0.5892149806022644,
"learning_rate": 3.4647550776583035e-05,
"loss": 0.2788,
"step": 1105
},
{
"epoch": 2.6280107047279215,
"grad_norm": 0.5823227763175964,
"learning_rate": 3.4587813620071685e-05,
"loss": 0.2953,
"step": 1106
},
{
"epoch": 2.630389533154921,
"grad_norm": 0.5910363793373108,
"learning_rate": 3.4528076463560336e-05,
"loss": 0.2619,
"step": 1107
},
{
"epoch": 2.632768361581921,
"grad_norm": 0.6204510927200317,
"learning_rate": 3.4468339307048986e-05,
"loss": 0.2771,
"step": 1108
},
{
"epoch": 2.6351471900089205,
"grad_norm": 0.5862094759941101,
"learning_rate": 3.4408602150537636e-05,
"loss": 0.2577,
"step": 1109
},
{
"epoch": 2.63752601843592,
"grad_norm": 0.5883625149726868,
"learning_rate": 3.4348864994026287e-05,
"loss": 0.2714,
"step": 1110
},
{
"epoch": 2.63990484686292,
"grad_norm": 0.594275951385498,
"learning_rate": 3.428912783751494e-05,
"loss": 0.2588,
"step": 1111
},
{
"epoch": 2.64228367528992,
"grad_norm": 0.628243088722229,
"learning_rate": 3.422939068100359e-05,
"loss": 0.2898,
"step": 1112
},
{
"epoch": 2.6446625037169196,
"grad_norm": 0.597488284111023,
"learning_rate": 3.416965352449224e-05,
"loss": 0.3002,
"step": 1113
},
{
"epoch": 2.6470413321439192,
"grad_norm": 0.5316476225852966,
"learning_rate": 3.410991636798089e-05,
"loss": 0.2652,
"step": 1114
},
{
"epoch": 2.649420160570919,
"grad_norm": 0.4937030076980591,
"learning_rate": 3.405017921146954e-05,
"loss": 0.2695,
"step": 1115
},
{
"epoch": 2.6517989889979185,
"grad_norm": 0.5909802913665771,
"learning_rate": 3.399044205495819e-05,
"loss": 0.2897,
"step": 1116
},
{
"epoch": 2.654177817424918,
"grad_norm": 0.6558974981307983,
"learning_rate": 3.393070489844684e-05,
"loss": 0.2713,
"step": 1117
},
{
"epoch": 2.656556645851918,
"grad_norm": 0.5037103295326233,
"learning_rate": 3.387096774193548e-05,
"loss": 0.2507,
"step": 1118
},
{
"epoch": 2.6589354742789175,
"grad_norm": 0.5422642827033997,
"learning_rate": 3.381123058542414e-05,
"loss": 0.2866,
"step": 1119
},
{
"epoch": 2.661314302705917,
"grad_norm": 0.6684660315513611,
"learning_rate": 3.375149342891278e-05,
"loss": 0.2693,
"step": 1120
},
{
"epoch": 2.6636931311329173,
"grad_norm": 0.6448227167129517,
"learning_rate": 3.369175627240144e-05,
"loss": 0.3011,
"step": 1121
},
{
"epoch": 2.6660719595599165,
"grad_norm": 0.5330891609191895,
"learning_rate": 3.3632019115890083e-05,
"loss": 0.2492,
"step": 1122
},
{
"epoch": 2.6684507879869166,
"grad_norm": 0.6081724166870117,
"learning_rate": 3.357228195937874e-05,
"loss": 0.2665,
"step": 1123
},
{
"epoch": 2.6708296164139163,
"grad_norm": 0.6162919402122498,
"learning_rate": 3.3512544802867384e-05,
"loss": 0.3346,
"step": 1124
},
{
"epoch": 2.673208444840916,
"grad_norm": 0.681792676448822,
"learning_rate": 3.3452807646356034e-05,
"loss": 0.3543,
"step": 1125
},
{
"epoch": 2.6755872732679156,
"grad_norm": 0.675923228263855,
"learning_rate": 3.3393070489844685e-05,
"loss": 0.2943,
"step": 1126
},
{
"epoch": 2.6779661016949152,
"grad_norm": 0.6438702940940857,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.2654,
"step": 1127
},
{
"epoch": 2.680344930121915,
"grad_norm": 0.5402166247367859,
"learning_rate": 3.3273596176821985e-05,
"loss": 0.2676,
"step": 1128
},
{
"epoch": 2.6827237585489145,
"grad_norm": 0.5861081480979919,
"learning_rate": 3.3213859020310636e-05,
"loss": 0.2892,
"step": 1129
},
{
"epoch": 2.685102586975914,
"grad_norm": 0.6178301572799683,
"learning_rate": 3.3154121863799286e-05,
"loss": 0.3191,
"step": 1130
},
{
"epoch": 2.687481415402914,
"grad_norm": 0.6531718373298645,
"learning_rate": 3.3094384707287936e-05,
"loss": 0.3261,
"step": 1131
},
{
"epoch": 2.689860243829914,
"grad_norm": 0.6978683471679688,
"learning_rate": 3.3034647550776586e-05,
"loss": 0.2942,
"step": 1132
},
{
"epoch": 2.6922390722569136,
"grad_norm": 0.571123480796814,
"learning_rate": 3.297491039426524e-05,
"loss": 0.2324,
"step": 1133
},
{
"epoch": 2.6946179006839133,
"grad_norm": 0.6293614506721497,
"learning_rate": 3.291517323775388e-05,
"loss": 0.3065,
"step": 1134
},
{
"epoch": 2.696996729110913,
"grad_norm": 0.6265407800674438,
"learning_rate": 3.285543608124254e-05,
"loss": 0.2506,
"step": 1135
},
{
"epoch": 2.6993755575379126,
"grad_norm": 0.5328508019447327,
"learning_rate": 3.279569892473118e-05,
"loss": 0.2653,
"step": 1136
},
{
"epoch": 2.7017543859649122,
"grad_norm": 0.6570659875869751,
"learning_rate": 3.273596176821984e-05,
"loss": 0.2438,
"step": 1137
},
{
"epoch": 2.704133214391912,
"grad_norm": 0.7575013637542725,
"learning_rate": 3.267622461170848e-05,
"loss": 0.4259,
"step": 1138
},
{
"epoch": 2.7065120428189116,
"grad_norm": 0.6472057700157166,
"learning_rate": 3.261648745519714e-05,
"loss": 0.2812,
"step": 1139
},
{
"epoch": 2.708890871245911,
"grad_norm": 0.5538692474365234,
"learning_rate": 3.255675029868578e-05,
"loss": 0.2403,
"step": 1140
},
{
"epoch": 2.7112696996729113,
"grad_norm": 0.5290868282318115,
"learning_rate": 3.249701314217443e-05,
"loss": 0.248,
"step": 1141
},
{
"epoch": 2.7136485280999105,
"grad_norm": 0.6288734078407288,
"learning_rate": 3.243727598566308e-05,
"loss": 0.3678,
"step": 1142
},
{
"epoch": 2.7160273565269106,
"grad_norm": 0.6795669794082642,
"learning_rate": 3.237753882915173e-05,
"loss": 0.2997,
"step": 1143
},
{
"epoch": 2.7184061849539103,
"grad_norm": 0.6042711138725281,
"learning_rate": 3.231780167264038e-05,
"loss": 0.2916,
"step": 1144
},
{
"epoch": 2.72078501338091,
"grad_norm": 0.6086244583129883,
"learning_rate": 3.2258064516129034e-05,
"loss": 0.2755,
"step": 1145
},
{
"epoch": 2.7231638418079096,
"grad_norm": 0.5196691751480103,
"learning_rate": 3.2198327359617684e-05,
"loss": 0.2972,
"step": 1146
},
{
"epoch": 2.7255426702349093,
"grad_norm": 0.5514973998069763,
"learning_rate": 3.2138590203106334e-05,
"loss": 0.2221,
"step": 1147
},
{
"epoch": 2.727921498661909,
"grad_norm": 0.5679486989974976,
"learning_rate": 3.207885304659498e-05,
"loss": 0.3141,
"step": 1148
},
{
"epoch": 2.7303003270889086,
"grad_norm": 0.6501855254173279,
"learning_rate": 3.2019115890083635e-05,
"loss": 0.3131,
"step": 1149
},
{
"epoch": 2.7326791555159087,
"grad_norm": 0.5475935935974121,
"learning_rate": 3.195937873357228e-05,
"loss": 0.2932,
"step": 1150
},
{
"epoch": 2.735057983942908,
"grad_norm": 0.6490268707275391,
"learning_rate": 3.1899641577060935e-05,
"loss": 0.2759,
"step": 1151
},
{
"epoch": 2.737436812369908,
"grad_norm": 0.6104749441146851,
"learning_rate": 3.183990442054958e-05,
"loss": 0.3175,
"step": 1152
},
{
"epoch": 2.7398156407969076,
"grad_norm": 0.581358790397644,
"learning_rate": 3.1780167264038236e-05,
"loss": 0.3054,
"step": 1153
},
{
"epoch": 2.7421944692239073,
"grad_norm": 0.5382530093193054,
"learning_rate": 3.172043010752688e-05,
"loss": 0.2848,
"step": 1154
},
{
"epoch": 2.744573297650907,
"grad_norm": 0.6220831871032715,
"learning_rate": 3.1660692951015537e-05,
"loss": 0.3404,
"step": 1155
},
{
"epoch": 2.7469521260779066,
"grad_norm": 0.6008325815200806,
"learning_rate": 3.160095579450418e-05,
"loss": 0.2543,
"step": 1156
},
{
"epoch": 2.7493309545049063,
"grad_norm": 0.5742363929748535,
"learning_rate": 3.154121863799283e-05,
"loss": 0.259,
"step": 1157
},
{
"epoch": 2.751709782931906,
"grad_norm": 0.5634133219718933,
"learning_rate": 3.148148148148148e-05,
"loss": 0.2698,
"step": 1158
},
{
"epoch": 2.7540886113589056,
"grad_norm": 0.5804802775382996,
"learning_rate": 3.142174432497013e-05,
"loss": 0.2744,
"step": 1159
},
{
"epoch": 2.7564674397859052,
"grad_norm": 0.6175990700721741,
"learning_rate": 3.136200716845878e-05,
"loss": 0.2503,
"step": 1160
},
{
"epoch": 2.7588462682129054,
"grad_norm": 0.6470534801483154,
"learning_rate": 3.130227001194743e-05,
"loss": 0.3464,
"step": 1161
},
{
"epoch": 2.761225096639905,
"grad_norm": 0.6418605446815491,
"learning_rate": 3.124253285543608e-05,
"loss": 0.2886,
"step": 1162
},
{
"epoch": 2.7636039250669047,
"grad_norm": 0.8086894750595093,
"learning_rate": 3.118279569892473e-05,
"loss": 0.2267,
"step": 1163
},
{
"epoch": 2.7659827534939043,
"grad_norm": 0.616875410079956,
"learning_rate": 3.112305854241338e-05,
"loss": 0.3042,
"step": 1164
},
{
"epoch": 2.768361581920904,
"grad_norm": 0.5028004050254822,
"learning_rate": 3.106332138590203e-05,
"loss": 0.2081,
"step": 1165
},
{
"epoch": 2.7707404103479036,
"grad_norm": 0.5773189067840576,
"learning_rate": 3.100358422939068e-05,
"loss": 0.3195,
"step": 1166
},
{
"epoch": 2.7731192387749033,
"grad_norm": 0.7181592583656311,
"learning_rate": 3.0943847072879333e-05,
"loss": 0.3569,
"step": 1167
},
{
"epoch": 2.775498067201903,
"grad_norm": 0.5970394611358643,
"learning_rate": 3.0884109916367984e-05,
"loss": 0.264,
"step": 1168
},
{
"epoch": 2.7778768956289026,
"grad_norm": 1.4490431547164917,
"learning_rate": 3.0824372759856634e-05,
"loss": 0.276,
"step": 1169
},
{
"epoch": 2.7802557240559027,
"grad_norm": 0.6196287870407104,
"learning_rate": 3.0764635603345284e-05,
"loss": 0.296,
"step": 1170
},
{
"epoch": 2.782634552482902,
"grad_norm": 0.6437617540359497,
"learning_rate": 3.070489844683393e-05,
"loss": 0.268,
"step": 1171
},
{
"epoch": 2.785013380909902,
"grad_norm": 0.6326783895492554,
"learning_rate": 3.0645161290322585e-05,
"loss": 0.329,
"step": 1172
},
{
"epoch": 2.7873922093369017,
"grad_norm": 0.5962085127830505,
"learning_rate": 3.058542413381123e-05,
"loss": 0.2598,
"step": 1173
},
{
"epoch": 2.7897710377639013,
"grad_norm": 0.7013174891471863,
"learning_rate": 3.0525686977299886e-05,
"loss": 0.308,
"step": 1174
},
{
"epoch": 2.792149866190901,
"grad_norm": 0.6110320687294006,
"learning_rate": 3.046594982078853e-05,
"loss": 0.2844,
"step": 1175
},
{
"epoch": 2.7945286946179007,
"grad_norm": 0.5971323251724243,
"learning_rate": 3.0406212664277183e-05,
"loss": 0.2795,
"step": 1176
},
{
"epoch": 2.7969075230449003,
"grad_norm": 0.6299296021461487,
"learning_rate": 3.034647550776583e-05,
"loss": 0.2621,
"step": 1177
},
{
"epoch": 2.7992863514719,
"grad_norm": 0.6195304989814758,
"learning_rate": 3.0286738351254483e-05,
"loss": 0.3349,
"step": 1178
},
{
"epoch": 2.8016651798988996,
"grad_norm": 0.49884721636772156,
"learning_rate": 3.022700119474313e-05,
"loss": 0.2469,
"step": 1179
},
{
"epoch": 2.8040440083258993,
"grad_norm": 0.5852887034416199,
"learning_rate": 3.016726403823178e-05,
"loss": 0.2885,
"step": 1180
},
{
"epoch": 2.8064228367528994,
"grad_norm": 0.6772944331169128,
"learning_rate": 3.010752688172043e-05,
"loss": 0.283,
"step": 1181
},
{
"epoch": 2.808801665179899,
"grad_norm": 0.5862092971801758,
"learning_rate": 3.004778972520908e-05,
"loss": 0.3337,
"step": 1182
},
{
"epoch": 2.8111804936068987,
"grad_norm": 0.5878643989562988,
"learning_rate": 2.998805256869773e-05,
"loss": 0.2948,
"step": 1183
},
{
"epoch": 2.8135593220338984,
"grad_norm": 0.5235293507575989,
"learning_rate": 2.9928315412186382e-05,
"loss": 0.2513,
"step": 1184
},
{
"epoch": 2.815938150460898,
"grad_norm": 0.5556120276451111,
"learning_rate": 2.9868578255675032e-05,
"loss": 0.2719,
"step": 1185
},
{
"epoch": 2.8183169788878977,
"grad_norm": 0.6422498822212219,
"learning_rate": 2.9808841099163682e-05,
"loss": 0.3302,
"step": 1186
},
{
"epoch": 2.8206958073148973,
"grad_norm": 0.6159522533416748,
"learning_rate": 2.974910394265233e-05,
"loss": 0.3088,
"step": 1187
},
{
"epoch": 2.823074635741897,
"grad_norm": 0.5432312488555908,
"learning_rate": 2.9689366786140983e-05,
"loss": 0.2808,
"step": 1188
},
{
"epoch": 2.8254534641688966,
"grad_norm": 0.6293365359306335,
"learning_rate": 2.962962962962963e-05,
"loss": 0.2902,
"step": 1189
},
{
"epoch": 2.8278322925958967,
"grad_norm": 0.6163249015808105,
"learning_rate": 2.9569892473118284e-05,
"loss": 0.2708,
"step": 1190
},
{
"epoch": 2.830211121022896,
"grad_norm": 0.5814666748046875,
"learning_rate": 2.951015531660693e-05,
"loss": 0.2855,
"step": 1191
},
{
"epoch": 2.832589949449896,
"grad_norm": 0.5678106546401978,
"learning_rate": 2.9450418160095584e-05,
"loss": 0.2909,
"step": 1192
},
{
"epoch": 2.8349687778768957,
"grad_norm": 0.5888578295707703,
"learning_rate": 2.939068100358423e-05,
"loss": 0.2653,
"step": 1193
},
{
"epoch": 2.8373476063038954,
"grad_norm": 0.5649316310882568,
"learning_rate": 2.9330943847072878e-05,
"loss": 0.3173,
"step": 1194
},
{
"epoch": 2.839726434730895,
"grad_norm": 0.6372131109237671,
"learning_rate": 2.9271206690561532e-05,
"loss": 0.3098,
"step": 1195
},
{
"epoch": 2.8421052631578947,
"grad_norm": 0.6272296905517578,
"learning_rate": 2.921146953405018e-05,
"loss": 0.3157,
"step": 1196
},
{
"epoch": 2.8444840915848943,
"grad_norm": 0.6271634101867676,
"learning_rate": 2.9151732377538832e-05,
"loss": 0.3352,
"step": 1197
},
{
"epoch": 2.846862920011894,
"grad_norm": 0.5737547874450684,
"learning_rate": 2.909199522102748e-05,
"loss": 0.2748,
"step": 1198
},
{
"epoch": 2.849241748438894,
"grad_norm": 0.6345760226249695,
"learning_rate": 2.9032258064516133e-05,
"loss": 0.2769,
"step": 1199
},
{
"epoch": 2.8516205768658933,
"grad_norm": 0.5594867467880249,
"learning_rate": 2.897252090800478e-05,
"loss": 0.2573,
"step": 1200
},
{
"epoch": 2.8516205768658933,
"eval_loss": 0.3864609897136688,
"eval_runtime": 24.8837,
"eval_samples_per_second": 30.06,
"eval_steps_per_second": 15.03,
"step": 1200
},
{
"epoch": 2.8539994052928934,
"grad_norm": 0.5716099143028259,
"learning_rate": 2.8912783751493434e-05,
"loss": 0.2803,
"step": 1201
},
{
"epoch": 2.856378233719893,
"grad_norm": 0.7210864424705505,
"learning_rate": 2.885304659498208e-05,
"loss": 0.2867,
"step": 1202
},
{
"epoch": 2.8587570621468927,
"grad_norm": 0.7296307682991028,
"learning_rate": 2.8793309438470727e-05,
"loss": 0.3317,
"step": 1203
},
{
"epoch": 2.8611358905738924,
"grad_norm": 0.7070258259773254,
"learning_rate": 2.873357228195938e-05,
"loss": 0.3031,
"step": 1204
},
{
"epoch": 2.863514719000892,
"grad_norm": 0.6060933470726013,
"learning_rate": 2.8673835125448028e-05,
"loss": 0.2816,
"step": 1205
},
{
"epoch": 2.8658935474278917,
"grad_norm": 0.6421394944190979,
"learning_rate": 2.861409796893668e-05,
"loss": 0.2717,
"step": 1206
},
{
"epoch": 2.8682723758548914,
"grad_norm": 0.6352380514144897,
"learning_rate": 2.855436081242533e-05,
"loss": 0.3115,
"step": 1207
},
{
"epoch": 2.870651204281891,
"grad_norm": 0.6790797114372253,
"learning_rate": 2.8494623655913982e-05,
"loss": 0.3528,
"step": 1208
},
{
"epoch": 2.8730300327088907,
"grad_norm": 0.5601657629013062,
"learning_rate": 2.843488649940263e-05,
"loss": 0.2254,
"step": 1209
},
{
"epoch": 2.875408861135891,
"grad_norm": 0.5759854912757874,
"learning_rate": 2.8375149342891276e-05,
"loss": 0.284,
"step": 1210
},
{
"epoch": 2.8777876895628904,
"grad_norm": 0.6258363127708435,
"learning_rate": 2.831541218637993e-05,
"loss": 0.28,
"step": 1211
},
{
"epoch": 2.88016651798989,
"grad_norm": 0.7181396484375,
"learning_rate": 2.8255675029868577e-05,
"loss": 0.3192,
"step": 1212
},
{
"epoch": 2.8825453464168898,
"grad_norm": 0.6534887552261353,
"learning_rate": 2.819593787335723e-05,
"loss": 0.3195,
"step": 1213
},
{
"epoch": 2.8849241748438894,
"grad_norm": 0.6765838265419006,
"learning_rate": 2.8136200716845877e-05,
"loss": 0.3031,
"step": 1214
},
{
"epoch": 2.887303003270889,
"grad_norm": 0.6807898879051208,
"learning_rate": 2.807646356033453e-05,
"loss": 0.315,
"step": 1215
},
{
"epoch": 2.8896818316978887,
"grad_norm": 0.6026751399040222,
"learning_rate": 2.8016726403823178e-05,
"loss": 0.2811,
"step": 1216
},
{
"epoch": 2.8920606601248884,
"grad_norm": 0.5754644870758057,
"learning_rate": 2.7956989247311828e-05,
"loss": 0.2772,
"step": 1217
},
{
"epoch": 2.894439488551888,
"grad_norm": 0.5280768871307373,
"learning_rate": 2.789725209080048e-05,
"loss": 0.2345,
"step": 1218
},
{
"epoch": 2.896818316978888,
"grad_norm": 0.6319217681884766,
"learning_rate": 2.783751493428913e-05,
"loss": 0.2649,
"step": 1219
},
{
"epoch": 2.8991971454058874,
"grad_norm": 0.5606786608695984,
"learning_rate": 2.777777777777778e-05,
"loss": 0.2557,
"step": 1220
},
{
"epoch": 2.9015759738328875,
"grad_norm": 0.6417257785797119,
"learning_rate": 2.771804062126643e-05,
"loss": 0.3148,
"step": 1221
},
{
"epoch": 2.903954802259887,
"grad_norm": 0.5431168079376221,
"learning_rate": 2.765830346475508e-05,
"loss": 0.2775,
"step": 1222
},
{
"epoch": 2.9063336306868868,
"grad_norm": 0.6295545697212219,
"learning_rate": 2.759856630824373e-05,
"loss": 0.3051,
"step": 1223
},
{
"epoch": 2.9087124591138864,
"grad_norm": 0.6698517203330994,
"learning_rate": 2.753882915173238e-05,
"loss": 0.3033,
"step": 1224
},
{
"epoch": 2.911091287540886,
"grad_norm": 0.5729504227638245,
"learning_rate": 2.747909199522103e-05,
"loss": 0.26,
"step": 1225
},
{
"epoch": 2.9134701159678857,
"grad_norm": 0.5817504525184631,
"learning_rate": 2.7419354838709678e-05,
"loss": 0.2457,
"step": 1226
},
{
"epoch": 2.9158489443948854,
"grad_norm": 0.6074779629707336,
"learning_rate": 2.735961768219833e-05,
"loss": 0.2623,
"step": 1227
},
{
"epoch": 2.918227772821885,
"grad_norm": 0.7250639796257019,
"learning_rate": 2.7299880525686978e-05,
"loss": 0.3272,
"step": 1228
},
{
"epoch": 2.9206066012488847,
"grad_norm": 0.6558791399002075,
"learning_rate": 2.7240143369175632e-05,
"loss": 0.2699,
"step": 1229
},
{
"epoch": 2.922985429675885,
"grad_norm": 0.5835295915603638,
"learning_rate": 2.718040621266428e-05,
"loss": 0.3002,
"step": 1230
},
{
"epoch": 2.9253642581028845,
"grad_norm": 0.6902837157249451,
"learning_rate": 2.7120669056152932e-05,
"loss": 0.3135,
"step": 1231
},
{
"epoch": 2.927743086529884,
"grad_norm": 0.5926578640937805,
"learning_rate": 2.706093189964158e-05,
"loss": 0.2616,
"step": 1232
},
{
"epoch": 2.930121914956884,
"grad_norm": 0.5405444502830505,
"learning_rate": 2.7001194743130226e-05,
"loss": 0.2587,
"step": 1233
},
{
"epoch": 2.9325007433838834,
"grad_norm": 0.606576144695282,
"learning_rate": 2.694145758661888e-05,
"loss": 0.2717,
"step": 1234
},
{
"epoch": 2.934879571810883,
"grad_norm": 0.6612190008163452,
"learning_rate": 2.6881720430107527e-05,
"loss": 0.3005,
"step": 1235
},
{
"epoch": 2.9372584002378828,
"grad_norm": 0.6425749063491821,
"learning_rate": 2.682198327359618e-05,
"loss": 0.2606,
"step": 1236
},
{
"epoch": 2.9396372286648824,
"grad_norm": 0.6714048385620117,
"learning_rate": 2.6762246117084827e-05,
"loss": 0.3116,
"step": 1237
},
{
"epoch": 2.942016057091882,
"grad_norm": 0.6480368971824646,
"learning_rate": 2.670250896057348e-05,
"loss": 0.3088,
"step": 1238
},
{
"epoch": 2.944394885518882,
"grad_norm": 0.6665281653404236,
"learning_rate": 2.6642771804062128e-05,
"loss": 0.3281,
"step": 1239
},
{
"epoch": 2.9467737139458814,
"grad_norm": 0.5490178465843201,
"learning_rate": 2.6583034647550775e-05,
"loss": 0.2708,
"step": 1240
},
{
"epoch": 2.9491525423728815,
"grad_norm": 0.6380129456520081,
"learning_rate": 2.652329749103943e-05,
"loss": 0.2738,
"step": 1241
},
{
"epoch": 2.951531370799881,
"grad_norm": 0.6085153818130493,
"learning_rate": 2.6463560334528076e-05,
"loss": 0.2426,
"step": 1242
},
{
"epoch": 2.953910199226881,
"grad_norm": 0.6035470366477966,
"learning_rate": 2.640382317801673e-05,
"loss": 0.3003,
"step": 1243
},
{
"epoch": 2.9562890276538805,
"grad_norm": 0.6204206943511963,
"learning_rate": 2.6344086021505376e-05,
"loss": 0.2981,
"step": 1244
},
{
"epoch": 2.95866785608088,
"grad_norm": 0.629393458366394,
"learning_rate": 2.628434886499403e-05,
"loss": 0.3259,
"step": 1245
},
{
"epoch": 2.96104668450788,
"grad_norm": 0.6644812226295471,
"learning_rate": 2.6224611708482677e-05,
"loss": 0.2825,
"step": 1246
},
{
"epoch": 2.9634255129348794,
"grad_norm": 0.6230280995368958,
"learning_rate": 2.616487455197133e-05,
"loss": 0.2443,
"step": 1247
},
{
"epoch": 2.9658043413618795,
"grad_norm": 0.6109925508499146,
"learning_rate": 2.6105137395459977e-05,
"loss": 0.3038,
"step": 1248
},
{
"epoch": 2.9681831697888788,
"grad_norm": 0.5434057712554932,
"learning_rate": 2.6045400238948624e-05,
"loss": 0.2835,
"step": 1249
},
{
"epoch": 2.970561998215879,
"grad_norm": 0.6708266735076904,
"learning_rate": 2.5985663082437278e-05,
"loss": 0.3569,
"step": 1250
},
{
"epoch": 2.9729408266428785,
"grad_norm": 0.614422619342804,
"learning_rate": 2.5925925925925925e-05,
"loss": 0.3171,
"step": 1251
},
{
"epoch": 2.975319655069878,
"grad_norm": 0.6892760992050171,
"learning_rate": 2.586618876941458e-05,
"loss": 0.2787,
"step": 1252
},
{
"epoch": 2.977698483496878,
"grad_norm": 0.6347784399986267,
"learning_rate": 2.5806451612903226e-05,
"loss": 0.2649,
"step": 1253
},
{
"epoch": 2.9800773119238775,
"grad_norm": 0.5815712809562683,
"learning_rate": 2.574671445639188e-05,
"loss": 0.332,
"step": 1254
},
{
"epoch": 2.982456140350877,
"grad_norm": 0.57394939661026,
"learning_rate": 2.5686977299880526e-05,
"loss": 0.2549,
"step": 1255
},
{
"epoch": 2.984834968777877,
"grad_norm": 0.5753270387649536,
"learning_rate": 2.5627240143369173e-05,
"loss": 0.291,
"step": 1256
},
{
"epoch": 2.9872137972048765,
"grad_norm": 0.7134138941764832,
"learning_rate": 2.5567502986857827e-05,
"loss": 0.3167,
"step": 1257
},
{
"epoch": 2.989592625631876,
"grad_norm": 0.5441657900810242,
"learning_rate": 2.5507765830346474e-05,
"loss": 0.2621,
"step": 1258
},
{
"epoch": 2.991971454058876,
"grad_norm": 0.5922890305519104,
"learning_rate": 2.5448028673835127e-05,
"loss": 0.2679,
"step": 1259
},
{
"epoch": 2.994350282485876,
"grad_norm": 0.559668779373169,
"learning_rate": 2.5388291517323774e-05,
"loss": 0.3293,
"step": 1260
},
{
"epoch": 2.9967291109128755,
"grad_norm": 0.5969208478927612,
"learning_rate": 2.5328554360812428e-05,
"loss": 0.2749,
"step": 1261
},
{
"epoch": 2.999107939339875,
"grad_norm": 0.609047532081604,
"learning_rate": 2.5268817204301075e-05,
"loss": 0.3221,
"step": 1262
},
{
"epoch": 3.0,
"grad_norm": 1.1574978828430176,
"learning_rate": 2.5209080047789725e-05,
"loss": 0.2991,
"step": 1263
},
{
"epoch": 3.0023788284269997,
"grad_norm": 0.5617911219596863,
"learning_rate": 2.5149342891278375e-05,
"loss": 0.2266,
"step": 1264
},
{
"epoch": 3.0047576568539993,
"grad_norm": 0.519255518913269,
"learning_rate": 2.5089605734767026e-05,
"loss": 0.2256,
"step": 1265
},
{
"epoch": 3.007136485280999,
"grad_norm": 0.5122116208076477,
"learning_rate": 2.5029868578255676e-05,
"loss": 0.2438,
"step": 1266
},
{
"epoch": 3.0095153137079986,
"grad_norm": 0.5582380890846252,
"learning_rate": 2.4970131421744326e-05,
"loss": 0.284,
"step": 1267
},
{
"epoch": 3.0118941421349987,
"grad_norm": 0.5494282841682434,
"learning_rate": 2.4910394265232977e-05,
"loss": 0.2118,
"step": 1268
},
{
"epoch": 3.0142729705619984,
"grad_norm": 0.585095226764679,
"learning_rate": 2.4850657108721627e-05,
"loss": 0.2572,
"step": 1269
},
{
"epoch": 3.016651798988998,
"grad_norm": 0.5811973810195923,
"learning_rate": 2.4790919952210277e-05,
"loss": 0.2532,
"step": 1270
},
{
"epoch": 3.0190306274159977,
"grad_norm": 0.5602511763572693,
"learning_rate": 2.4731182795698928e-05,
"loss": 0.2175,
"step": 1271
},
{
"epoch": 3.0214094558429974,
"grad_norm": 0.5386038422584534,
"learning_rate": 2.4671445639187578e-05,
"loss": 0.1973,
"step": 1272
},
{
"epoch": 3.023788284269997,
"grad_norm": 0.5802236795425415,
"learning_rate": 2.4611708482676228e-05,
"loss": 0.2655,
"step": 1273
},
{
"epoch": 3.0261671126969967,
"grad_norm": 0.652818500995636,
"learning_rate": 2.455197132616488e-05,
"loss": 0.2907,
"step": 1274
},
{
"epoch": 3.0285459411239963,
"grad_norm": 0.6712412238121033,
"learning_rate": 2.4492234169653525e-05,
"loss": 0.2186,
"step": 1275
},
{
"epoch": 3.030924769550996,
"grad_norm": 0.6123949885368347,
"learning_rate": 2.4432497013142176e-05,
"loss": 0.2088,
"step": 1276
},
{
"epoch": 3.0333035979779956,
"grad_norm": 0.6561902761459351,
"learning_rate": 2.4372759856630826e-05,
"loss": 0.279,
"step": 1277
},
{
"epoch": 3.0356824264049957,
"grad_norm": 0.6276388168334961,
"learning_rate": 2.4313022700119476e-05,
"loss": 0.271,
"step": 1278
},
{
"epoch": 3.0380612548319954,
"grad_norm": 0.6087429523468018,
"learning_rate": 2.4253285543608127e-05,
"loss": 0.2306,
"step": 1279
},
{
"epoch": 3.040440083258995,
"grad_norm": 0.5728775858879089,
"learning_rate": 2.4193548387096777e-05,
"loss": 0.2348,
"step": 1280
},
{
"epoch": 3.0428189116859947,
"grad_norm": 0.5878280401229858,
"learning_rate": 2.4133811230585427e-05,
"loss": 0.2411,
"step": 1281
},
{
"epoch": 3.0451977401129944,
"grad_norm": 0.5655471086502075,
"learning_rate": 2.4074074074074074e-05,
"loss": 0.2077,
"step": 1282
},
{
"epoch": 3.047576568539994,
"grad_norm": 0.5882824659347534,
"learning_rate": 2.4014336917562724e-05,
"loss": 0.227,
"step": 1283
},
{
"epoch": 3.0499553969669937,
"grad_norm": 0.6607369184494019,
"learning_rate": 2.3954599761051375e-05,
"loss": 0.2578,
"step": 1284
},
{
"epoch": 3.0523342253939933,
"grad_norm": 0.5448257327079773,
"learning_rate": 2.3894862604540025e-05,
"loss": 0.2349,
"step": 1285
},
{
"epoch": 3.054713053820993,
"grad_norm": 0.6125568747520447,
"learning_rate": 2.3835125448028675e-05,
"loss": 0.2324,
"step": 1286
},
{
"epoch": 3.0570918822479927,
"grad_norm": 0.6578051447868347,
"learning_rate": 2.3775388291517326e-05,
"loss": 0.2301,
"step": 1287
},
{
"epoch": 3.0594707106749928,
"grad_norm": 0.6022703051567078,
"learning_rate": 2.3715651135005976e-05,
"loss": 0.2335,
"step": 1288
},
{
"epoch": 3.0618495391019924,
"grad_norm": 0.6820448637008667,
"learning_rate": 2.3655913978494626e-05,
"loss": 0.2426,
"step": 1289
},
{
"epoch": 3.064228367528992,
"grad_norm": 0.6197579503059387,
"learning_rate": 2.3596176821983273e-05,
"loss": 0.2528,
"step": 1290
},
{
"epoch": 3.0666071959559917,
"grad_norm": 0.6325532793998718,
"learning_rate": 2.3536439665471923e-05,
"loss": 0.2398,
"step": 1291
},
{
"epoch": 3.0689860243829914,
"grad_norm": 0.6155019402503967,
"learning_rate": 2.3476702508960574e-05,
"loss": 0.2767,
"step": 1292
},
{
"epoch": 3.071364852809991,
"grad_norm": 0.6121396422386169,
"learning_rate": 2.3416965352449224e-05,
"loss": 0.2391,
"step": 1293
},
{
"epoch": 3.0737436812369907,
"grad_norm": 0.553560733795166,
"learning_rate": 2.3357228195937874e-05,
"loss": 0.2009,
"step": 1294
},
{
"epoch": 3.0761225096639904,
"grad_norm": 0.5787554383277893,
"learning_rate": 2.3297491039426525e-05,
"loss": 0.2359,
"step": 1295
},
{
"epoch": 3.07850133809099,
"grad_norm": 0.6845440864562988,
"learning_rate": 2.3237753882915175e-05,
"loss": 0.2573,
"step": 1296
},
{
"epoch": 3.0808801665179897,
"grad_norm": 0.6331266760826111,
"learning_rate": 2.3178016726403825e-05,
"loss": 0.2295,
"step": 1297
},
{
"epoch": 3.08325899494499,
"grad_norm": 0.640466034412384,
"learning_rate": 2.3118279569892472e-05,
"loss": 0.2456,
"step": 1298
},
{
"epoch": 3.0856378233719894,
"grad_norm": 0.7017742991447449,
"learning_rate": 2.3058542413381122e-05,
"loss": 0.2793,
"step": 1299
},
{
"epoch": 3.088016651798989,
"grad_norm": 0.7358404397964478,
"learning_rate": 2.2998805256869773e-05,
"loss": 0.277,
"step": 1300
},
{
"epoch": 3.088016651798989,
"eval_loss": 0.4049249291419983,
"eval_runtime": 24.7093,
"eval_samples_per_second": 30.272,
"eval_steps_per_second": 15.136,
"step": 1300
},
{
"epoch": 3.0903954802259888,
"grad_norm": 0.64457106590271,
"learning_rate": 2.2939068100358423e-05,
"loss": 0.2707,
"step": 1301
},
{
"epoch": 3.0927743086529884,
"grad_norm": 0.672550618648529,
"learning_rate": 2.2879330943847073e-05,
"loss": 0.248,
"step": 1302
},
{
"epoch": 3.095153137079988,
"grad_norm": 0.6365009546279907,
"learning_rate": 2.2819593787335724e-05,
"loss": 0.2387,
"step": 1303
},
{
"epoch": 3.0975319655069877,
"grad_norm": 0.6442080736160278,
"learning_rate": 2.2759856630824374e-05,
"loss": 0.2216,
"step": 1304
},
{
"epoch": 3.0999107939339874,
"grad_norm": 0.6259413361549377,
"learning_rate": 2.270011947431302e-05,
"loss": 0.2541,
"step": 1305
},
{
"epoch": 3.102289622360987,
"grad_norm": 0.6365742683410645,
"learning_rate": 2.264038231780167e-05,
"loss": 0.2415,
"step": 1306
},
{
"epoch": 3.1046684507879867,
"grad_norm": 0.5824887156486511,
"learning_rate": 2.258064516129032e-05,
"loss": 0.2529,
"step": 1307
},
{
"epoch": 3.107047279214987,
"grad_norm": 0.6794803142547607,
"learning_rate": 2.2520908004778972e-05,
"loss": 0.23,
"step": 1308
},
{
"epoch": 3.1094261076419865,
"grad_norm": 0.6019396185874939,
"learning_rate": 2.2461170848267622e-05,
"loss": 0.2162,
"step": 1309
},
{
"epoch": 3.111804936068986,
"grad_norm": 0.5950125455856323,
"learning_rate": 2.2401433691756272e-05,
"loss": 0.29,
"step": 1310
},
{
"epoch": 3.1141837644959858,
"grad_norm": 0.6780076026916504,
"learning_rate": 2.2341696535244923e-05,
"loss": 0.2402,
"step": 1311
},
{
"epoch": 3.1165625929229854,
"grad_norm": 0.6014128923416138,
"learning_rate": 2.2281959378733573e-05,
"loss": 0.2104,
"step": 1312
},
{
"epoch": 3.118941421349985,
"grad_norm": 0.6915101408958435,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.3026,
"step": 1313
},
{
"epoch": 3.1213202497769847,
"grad_norm": 0.6975738406181335,
"learning_rate": 2.2162485065710874e-05,
"loss": 0.2436,
"step": 1314
},
{
"epoch": 3.1236990782039844,
"grad_norm": 0.7148664593696594,
"learning_rate": 2.2102747909199524e-05,
"loss": 0.2506,
"step": 1315
},
{
"epoch": 3.126077906630984,
"grad_norm": 0.6675700545310974,
"learning_rate": 2.2043010752688174e-05,
"loss": 0.2308,
"step": 1316
},
{
"epoch": 3.128456735057984,
"grad_norm": 0.6789043545722961,
"learning_rate": 2.1983273596176824e-05,
"loss": 0.2353,
"step": 1317
},
{
"epoch": 3.130835563484984,
"grad_norm": 0.6762030124664307,
"learning_rate": 2.1923536439665475e-05,
"loss": 0.2479,
"step": 1318
},
{
"epoch": 3.1332143919119835,
"grad_norm": 0.6441430449485779,
"learning_rate": 2.1863799283154125e-05,
"loss": 0.1835,
"step": 1319
},
{
"epoch": 3.135593220338983,
"grad_norm": 0.6008920073509216,
"learning_rate": 2.1804062126642775e-05,
"loss": 0.2179,
"step": 1320
},
{
"epoch": 3.137972048765983,
"grad_norm": 0.7051548361778259,
"learning_rate": 2.1744324970131422e-05,
"loss": 0.2748,
"step": 1321
},
{
"epoch": 3.1403508771929824,
"grad_norm": 0.6684269905090332,
"learning_rate": 2.1684587813620073e-05,
"loss": 0.2622,
"step": 1322
},
{
"epoch": 3.142729705619982,
"grad_norm": 0.5970059633255005,
"learning_rate": 2.1624850657108723e-05,
"loss": 0.2305,
"step": 1323
},
{
"epoch": 3.1451085340469818,
"grad_norm": 0.6092391014099121,
"learning_rate": 2.1565113500597373e-05,
"loss": 0.2169,
"step": 1324
},
{
"epoch": 3.1474873624739814,
"grad_norm": 0.7716182470321655,
"learning_rate": 2.1505376344086024e-05,
"loss": 0.2244,
"step": 1325
},
{
"epoch": 3.149866190900981,
"grad_norm": 0.7600685954093933,
"learning_rate": 2.1445639187574674e-05,
"loss": 0.2807,
"step": 1326
},
{
"epoch": 3.1522450193279807,
"grad_norm": 0.6617460250854492,
"learning_rate": 2.1385902031063324e-05,
"loss": 0.2643,
"step": 1327
},
{
"epoch": 3.154623847754981,
"grad_norm": 0.6389066576957703,
"learning_rate": 2.132616487455197e-05,
"loss": 0.2674,
"step": 1328
},
{
"epoch": 3.1570026761819805,
"grad_norm": 0.6522324085235596,
"learning_rate": 2.126642771804062e-05,
"loss": 0.2083,
"step": 1329
},
{
"epoch": 3.15938150460898,
"grad_norm": 0.7513405084609985,
"learning_rate": 2.120669056152927e-05,
"loss": 0.2785,
"step": 1330
},
{
"epoch": 3.16176033303598,
"grad_norm": 0.5705693364143372,
"learning_rate": 2.1146953405017922e-05,
"loss": 0.1822,
"step": 1331
},
{
"epoch": 3.1641391614629795,
"grad_norm": 0.6877608299255371,
"learning_rate": 2.1087216248506572e-05,
"loss": 0.2754,
"step": 1332
},
{
"epoch": 3.166517989889979,
"grad_norm": 0.6934994459152222,
"learning_rate": 2.1027479091995223e-05,
"loss": 0.1944,
"step": 1333
},
{
"epoch": 3.168896818316979,
"grad_norm": 0.7194043397903442,
"learning_rate": 2.0967741935483873e-05,
"loss": 0.2657,
"step": 1334
},
{
"epoch": 3.1712756467439784,
"grad_norm": 0.6782123446464539,
"learning_rate": 2.0908004778972523e-05,
"loss": 0.2531,
"step": 1335
},
{
"epoch": 3.173654475170978,
"grad_norm": 0.7624220848083496,
"learning_rate": 2.084826762246117e-05,
"loss": 0.2082,
"step": 1336
},
{
"epoch": 3.176033303597978,
"grad_norm": 0.6336691975593567,
"learning_rate": 2.078853046594982e-05,
"loss": 0.2304,
"step": 1337
},
{
"epoch": 3.178412132024978,
"grad_norm": 0.6183249950408936,
"learning_rate": 2.072879330943847e-05,
"loss": 0.1725,
"step": 1338
},
{
"epoch": 3.1807909604519775,
"grad_norm": 0.6695713400840759,
"learning_rate": 2.066905615292712e-05,
"loss": 0.1807,
"step": 1339
},
{
"epoch": 3.183169788878977,
"grad_norm": 0.5882018208503723,
"learning_rate": 2.060931899641577e-05,
"loss": 0.2072,
"step": 1340
},
{
"epoch": 3.185548617305977,
"grad_norm": 0.6536471843719482,
"learning_rate": 2.054958183990442e-05,
"loss": 0.2962,
"step": 1341
},
{
"epoch": 3.1879274457329765,
"grad_norm": 0.6349655985832214,
"learning_rate": 2.0489844683393072e-05,
"loss": 0.2309,
"step": 1342
},
{
"epoch": 3.190306274159976,
"grad_norm": 0.6827989816665649,
"learning_rate": 2.0430107526881722e-05,
"loss": 0.2236,
"step": 1343
},
{
"epoch": 3.192685102586976,
"grad_norm": 0.6630003452301025,
"learning_rate": 2.037037037037037e-05,
"loss": 0.2739,
"step": 1344
},
{
"epoch": 3.1950639310139755,
"grad_norm": 0.5332040190696716,
"learning_rate": 2.031063321385902e-05,
"loss": 0.1947,
"step": 1345
},
{
"epoch": 3.197442759440975,
"grad_norm": 0.624686062335968,
"learning_rate": 2.025089605734767e-05,
"loss": 0.2403,
"step": 1346
},
{
"epoch": 3.199821587867975,
"grad_norm": 0.6674289703369141,
"learning_rate": 2.019115890083632e-05,
"loss": 0.2448,
"step": 1347
},
{
"epoch": 3.202200416294975,
"grad_norm": 0.6246338486671448,
"learning_rate": 2.013142174432497e-05,
"loss": 0.2025,
"step": 1348
},
{
"epoch": 3.2045792447219745,
"grad_norm": 0.6760092377662659,
"learning_rate": 2.007168458781362e-05,
"loss": 0.2186,
"step": 1349
},
{
"epoch": 3.206958073148974,
"grad_norm": 0.6240797638893127,
"learning_rate": 2.001194743130227e-05,
"loss": 0.1938,
"step": 1350
},
{
"epoch": 3.209336901575974,
"grad_norm": 0.5926909446716309,
"learning_rate": 1.995221027479092e-05,
"loss": 0.2389,
"step": 1351
},
{
"epoch": 3.2117157300029735,
"grad_norm": 0.623314619064331,
"learning_rate": 1.989247311827957e-05,
"loss": 0.2479,
"step": 1352
},
{
"epoch": 3.214094558429973,
"grad_norm": 0.6462867259979248,
"learning_rate": 1.9832735961768222e-05,
"loss": 0.2363,
"step": 1353
},
{
"epoch": 3.216473386856973,
"grad_norm": 0.6551673412322998,
"learning_rate": 1.9772998805256872e-05,
"loss": 0.2214,
"step": 1354
},
{
"epoch": 3.2188522152839725,
"grad_norm": 0.6646662354469299,
"learning_rate": 1.9713261648745522e-05,
"loss": 0.1996,
"step": 1355
},
{
"epoch": 3.221231043710972,
"grad_norm": 0.6474018096923828,
"learning_rate": 1.9653524492234173e-05,
"loss": 0.2457,
"step": 1356
},
{
"epoch": 3.2236098721379722,
"grad_norm": 0.6855640411376953,
"learning_rate": 1.9593787335722823e-05,
"loss": 0.271,
"step": 1357
},
{
"epoch": 3.225988700564972,
"grad_norm": 0.6802095174789429,
"learning_rate": 1.9534050179211473e-05,
"loss": 0.2271,
"step": 1358
},
{
"epoch": 3.2283675289919715,
"grad_norm": 0.6579050421714783,
"learning_rate": 1.947431302270012e-05,
"loss": 0.2601,
"step": 1359
},
{
"epoch": 3.230746357418971,
"grad_norm": 0.6538249850273132,
"learning_rate": 1.941457586618877e-05,
"loss": 0.257,
"step": 1360
},
{
"epoch": 3.233125185845971,
"grad_norm": 0.6673462986946106,
"learning_rate": 1.935483870967742e-05,
"loss": 0.2185,
"step": 1361
},
{
"epoch": 3.2355040142729705,
"grad_norm": 0.7256568074226379,
"learning_rate": 1.929510155316607e-05,
"loss": 0.2904,
"step": 1362
},
{
"epoch": 3.23788284269997,
"grad_norm": 0.5458927750587463,
"learning_rate": 1.923536439665472e-05,
"loss": 0.1592,
"step": 1363
},
{
"epoch": 3.24026167112697,
"grad_norm": 0.6696286201477051,
"learning_rate": 1.9175627240143372e-05,
"loss": 0.2131,
"step": 1364
},
{
"epoch": 3.2426404995539695,
"grad_norm": 0.7205179333686829,
"learning_rate": 1.9115890083632022e-05,
"loss": 0.2046,
"step": 1365
},
{
"epoch": 3.2450193279809696,
"grad_norm": 0.7840339541435242,
"learning_rate": 1.9056152927120672e-05,
"loss": 0.2388,
"step": 1366
},
{
"epoch": 3.2473981564079692,
"grad_norm": 0.7273231744766235,
"learning_rate": 1.899641577060932e-05,
"loss": 0.2431,
"step": 1367
},
{
"epoch": 3.249776984834969,
"grad_norm": 0.7814990282058716,
"learning_rate": 1.893667861409797e-05,
"loss": 0.2464,
"step": 1368
},
{
"epoch": 3.2521558132619686,
"grad_norm": 0.8068521618843079,
"learning_rate": 1.887694145758662e-05,
"loss": 0.299,
"step": 1369
},
{
"epoch": 3.254534641688968,
"grad_norm": 0.6146328449249268,
"learning_rate": 1.881720430107527e-05,
"loss": 0.2077,
"step": 1370
},
{
"epoch": 3.256913470115968,
"grad_norm": 0.6214416027069092,
"learning_rate": 1.875746714456392e-05,
"loss": 0.243,
"step": 1371
},
{
"epoch": 3.2592922985429675,
"grad_norm": 0.6775934100151062,
"learning_rate": 1.869772998805257e-05,
"loss": 0.2181,
"step": 1372
},
{
"epoch": 3.261671126969967,
"grad_norm": 0.6707413792610168,
"learning_rate": 1.863799283154122e-05,
"loss": 0.2385,
"step": 1373
},
{
"epoch": 3.264049955396967,
"grad_norm": 0.6954108476638794,
"learning_rate": 1.857825567502987e-05,
"loss": 0.2367,
"step": 1374
},
{
"epoch": 3.2664287838239665,
"grad_norm": 0.683694064617157,
"learning_rate": 1.8518518518518518e-05,
"loss": 0.2761,
"step": 1375
},
{
"epoch": 3.268807612250966,
"grad_norm": 0.6562153697013855,
"learning_rate": 1.845878136200717e-05,
"loss": 0.1777,
"step": 1376
},
{
"epoch": 3.2711864406779663,
"grad_norm": 0.6518459916114807,
"learning_rate": 1.839904420549582e-05,
"loss": 0.228,
"step": 1377
},
{
"epoch": 3.273565269104966,
"grad_norm": 0.5777814984321594,
"learning_rate": 1.833930704898447e-05,
"loss": 0.22,
"step": 1378
},
{
"epoch": 3.2759440975319656,
"grad_norm": 0.6501063108444214,
"learning_rate": 1.827956989247312e-05,
"loss": 0.2363,
"step": 1379
},
{
"epoch": 3.2783229259589652,
"grad_norm": 0.6595446467399597,
"learning_rate": 1.821983273596177e-05,
"loss": 0.2106,
"step": 1380
},
{
"epoch": 3.280701754385965,
"grad_norm": 0.66554856300354,
"learning_rate": 1.816009557945042e-05,
"loss": 0.2224,
"step": 1381
},
{
"epoch": 3.2830805828129646,
"grad_norm": 0.6982068419456482,
"learning_rate": 1.8100358422939067e-05,
"loss": 0.2215,
"step": 1382
},
{
"epoch": 3.285459411239964,
"grad_norm": 0.7318804860115051,
"learning_rate": 1.8040621266427717e-05,
"loss": 0.2905,
"step": 1383
},
{
"epoch": 3.287838239666964,
"grad_norm": 0.685772716999054,
"learning_rate": 1.7980884109916368e-05,
"loss": 0.2241,
"step": 1384
},
{
"epoch": 3.2902170680939635,
"grad_norm": 0.662263035774231,
"learning_rate": 1.7921146953405018e-05,
"loss": 0.2333,
"step": 1385
},
{
"epoch": 3.2925958965209636,
"grad_norm": 0.6051913499832153,
"learning_rate": 1.7861409796893668e-05,
"loss": 0.2162,
"step": 1386
},
{
"epoch": 3.2949747249479633,
"grad_norm": 0.6377174258232117,
"learning_rate": 1.780167264038232e-05,
"loss": 0.1745,
"step": 1387
},
{
"epoch": 3.297353553374963,
"grad_norm": 0.6460472345352173,
"learning_rate": 1.774193548387097e-05,
"loss": 0.2591,
"step": 1388
},
{
"epoch": 3.2997323818019626,
"grad_norm": 0.5293865203857422,
"learning_rate": 1.768219832735962e-05,
"loss": 0.188,
"step": 1389
},
{
"epoch": 3.3021112102289623,
"grad_norm": 0.9448232650756836,
"learning_rate": 1.7622461170848266e-05,
"loss": 0.2767,
"step": 1390
},
{
"epoch": 3.304490038655962,
"grad_norm": 0.6465947031974792,
"learning_rate": 1.7562724014336916e-05,
"loss": 0.2408,
"step": 1391
},
{
"epoch": 3.3068688670829616,
"grad_norm": 0.7603315711021423,
"learning_rate": 1.7502986857825567e-05,
"loss": 0.2325,
"step": 1392
},
{
"epoch": 3.3092476955099612,
"grad_norm": 0.6722696423530579,
"learning_rate": 1.7443249701314217e-05,
"loss": 0.2089,
"step": 1393
},
{
"epoch": 3.311626523936961,
"grad_norm": 0.7081143856048584,
"learning_rate": 1.7383512544802867e-05,
"loss": 0.2684,
"step": 1394
},
{
"epoch": 3.314005352363961,
"grad_norm": 0.6449976563453674,
"learning_rate": 1.7323775388291518e-05,
"loss": 0.2041,
"step": 1395
},
{
"epoch": 3.3163841807909606,
"grad_norm": 0.8436914682388306,
"learning_rate": 1.7264038231780168e-05,
"loss": 0.2326,
"step": 1396
},
{
"epoch": 3.3187630092179603,
"grad_norm": 0.6485365033149719,
"learning_rate": 1.7204301075268818e-05,
"loss": 0.2081,
"step": 1397
},
{
"epoch": 3.32114183764496,
"grad_norm": 0.6128183603286743,
"learning_rate": 1.714456391875747e-05,
"loss": 0.1998,
"step": 1398
},
{
"epoch": 3.3235206660719596,
"grad_norm": 0.6225998401641846,
"learning_rate": 1.708482676224612e-05,
"loss": 0.1919,
"step": 1399
},
{
"epoch": 3.3258994944989593,
"grad_norm": 0.7354225516319275,
"learning_rate": 1.702508960573477e-05,
"loss": 0.2243,
"step": 1400
},
{
"epoch": 3.3258994944989593,
"eval_loss": 0.40811267495155334,
"eval_runtime": 24.7259,
"eval_samples_per_second": 30.252,
"eval_steps_per_second": 15.126,
"step": 1400
},
{
"epoch": 3.328278322925959,
"grad_norm": 0.693684458732605,
"learning_rate": 1.696535244922342e-05,
"loss": 0.1834,
"step": 1401
},
{
"epoch": 3.3306571513529586,
"grad_norm": 0.7312796711921692,
"learning_rate": 1.690561529271207e-05,
"loss": 0.2561,
"step": 1402
},
{
"epoch": 3.3330359797799582,
"grad_norm": 0.6952280402183533,
"learning_rate": 1.684587813620072e-05,
"loss": 0.2586,
"step": 1403
},
{
"epoch": 3.335414808206958,
"grad_norm": 0.6589756011962891,
"learning_rate": 1.678614097968937e-05,
"loss": 0.2643,
"step": 1404
},
{
"epoch": 3.3377936366339576,
"grad_norm": 0.5879402756690979,
"learning_rate": 1.6726403823178017e-05,
"loss": 0.2092,
"step": 1405
},
{
"epoch": 3.3401724650609577,
"grad_norm": 0.5845806002616882,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.2077,
"step": 1406
},
{
"epoch": 3.3425512934879573,
"grad_norm": 0.7089442014694214,
"learning_rate": 1.6606929510155318e-05,
"loss": 0.2417,
"step": 1407
},
{
"epoch": 3.344930121914957,
"grad_norm": 0.6234202980995178,
"learning_rate": 1.6547192353643968e-05,
"loss": 0.1982,
"step": 1408
},
{
"epoch": 3.3473089503419566,
"grad_norm": 0.6180989742279053,
"learning_rate": 1.648745519713262e-05,
"loss": 0.1913,
"step": 1409
},
{
"epoch": 3.3496877787689563,
"grad_norm": 0.6704964637756348,
"learning_rate": 1.642771804062127e-05,
"loss": 0.2388,
"step": 1410
},
{
"epoch": 3.352066607195956,
"grad_norm": 0.6707442998886108,
"learning_rate": 1.636798088410992e-05,
"loss": 0.2897,
"step": 1411
},
{
"epoch": 3.3544454356229556,
"grad_norm": 0.7604225873947144,
"learning_rate": 1.630824372759857e-05,
"loss": 0.2933,
"step": 1412
},
{
"epoch": 3.3568242640499553,
"grad_norm": 0.7985626459121704,
"learning_rate": 1.6248506571087216e-05,
"loss": 0.2905,
"step": 1413
},
{
"epoch": 3.359203092476955,
"grad_norm": 0.6078615188598633,
"learning_rate": 1.6188769414575866e-05,
"loss": 0.2217,
"step": 1414
},
{
"epoch": 3.361581920903955,
"grad_norm": 0.8050974011421204,
"learning_rate": 1.6129032258064517e-05,
"loss": 0.2299,
"step": 1415
},
{
"epoch": 3.3639607493309547,
"grad_norm": 0.675726056098938,
"learning_rate": 1.6069295101553167e-05,
"loss": 0.1939,
"step": 1416
},
{
"epoch": 3.3663395777579543,
"grad_norm": 0.6330052614212036,
"learning_rate": 1.6009557945041817e-05,
"loss": 0.2316,
"step": 1417
},
{
"epoch": 3.368718406184954,
"grad_norm": 0.7237552404403687,
"learning_rate": 1.5949820788530468e-05,
"loss": 0.2821,
"step": 1418
},
{
"epoch": 3.3710972346119537,
"grad_norm": 0.6474528312683105,
"learning_rate": 1.5890083632019118e-05,
"loss": 0.2136,
"step": 1419
},
{
"epoch": 3.3734760630389533,
"grad_norm": 0.7798577547073364,
"learning_rate": 1.5830346475507768e-05,
"loss": 0.223,
"step": 1420
},
{
"epoch": 3.375854891465953,
"grad_norm": 0.7392546534538269,
"learning_rate": 1.5770609318996415e-05,
"loss": 0.2513,
"step": 1421
},
{
"epoch": 3.3782337198929526,
"grad_norm": 0.7369937896728516,
"learning_rate": 1.5710872162485066e-05,
"loss": 0.2672,
"step": 1422
},
{
"epoch": 3.3806125483199523,
"grad_norm": 0.7227432131767273,
"learning_rate": 1.5651135005973716e-05,
"loss": 0.2551,
"step": 1423
},
{
"epoch": 3.382991376746952,
"grad_norm": 0.6788824200630188,
"learning_rate": 1.5591397849462366e-05,
"loss": 0.2378,
"step": 1424
},
{
"epoch": 3.3853702051739516,
"grad_norm": 0.6660415530204773,
"learning_rate": 1.5531660692951016e-05,
"loss": 0.22,
"step": 1425
},
{
"epoch": 3.3877490336009517,
"grad_norm": 0.6559244394302368,
"learning_rate": 1.5471923536439667e-05,
"loss": 0.2282,
"step": 1426
},
{
"epoch": 3.3901278620279514,
"grad_norm": 0.7439149618148804,
"learning_rate": 1.5412186379928317e-05,
"loss": 0.251,
"step": 1427
},
{
"epoch": 3.392506690454951,
"grad_norm": 0.6336953639984131,
"learning_rate": 1.5352449223416964e-05,
"loss": 0.2226,
"step": 1428
},
{
"epoch": 3.3948855188819507,
"grad_norm": 0.6222031712532043,
"learning_rate": 1.5292712066905614e-05,
"loss": 0.2445,
"step": 1429
},
{
"epoch": 3.3972643473089503,
"grad_norm": 0.6259122490882874,
"learning_rate": 1.5232974910394265e-05,
"loss": 0.2376,
"step": 1430
},
{
"epoch": 3.39964317573595,
"grad_norm": 0.6915052533149719,
"learning_rate": 1.5173237753882915e-05,
"loss": 0.2365,
"step": 1431
},
{
"epoch": 3.4020220041629496,
"grad_norm": 0.7592293620109558,
"learning_rate": 1.5113500597371565e-05,
"loss": 0.2395,
"step": 1432
},
{
"epoch": 3.4044008325899493,
"grad_norm": 0.7728424668312073,
"learning_rate": 1.5053763440860215e-05,
"loss": 0.2445,
"step": 1433
},
{
"epoch": 3.406779661016949,
"grad_norm": 0.602087140083313,
"learning_rate": 1.4994026284348866e-05,
"loss": 0.2054,
"step": 1434
},
{
"epoch": 3.409158489443949,
"grad_norm": 0.5934260487556458,
"learning_rate": 1.4934289127837516e-05,
"loss": 0.178,
"step": 1435
},
{
"epoch": 3.4115373178709487,
"grad_norm": 0.6058094501495361,
"learning_rate": 1.4874551971326165e-05,
"loss": 0.1932,
"step": 1436
},
{
"epoch": 3.4139161462979484,
"grad_norm": 0.6478481888771057,
"learning_rate": 1.4814814814814815e-05,
"loss": 0.2139,
"step": 1437
},
{
"epoch": 3.416294974724948,
"grad_norm": 0.7313652634620667,
"learning_rate": 1.4755077658303465e-05,
"loss": 0.2884,
"step": 1438
},
{
"epoch": 3.4186738031519477,
"grad_norm": 0.6436812281608582,
"learning_rate": 1.4695340501792116e-05,
"loss": 0.2411,
"step": 1439
},
{
"epoch": 3.4210526315789473,
"grad_norm": 0.5855724215507507,
"learning_rate": 1.4635603345280766e-05,
"loss": 0.2268,
"step": 1440
},
{
"epoch": 3.423431460005947,
"grad_norm": 0.6639002561569214,
"learning_rate": 1.4575866188769416e-05,
"loss": 0.2157,
"step": 1441
},
{
"epoch": 3.4258102884329467,
"grad_norm": 0.6343371272087097,
"learning_rate": 1.4516129032258066e-05,
"loss": 0.2036,
"step": 1442
},
{
"epoch": 3.4281891168599463,
"grad_norm": 0.6803449988365173,
"learning_rate": 1.4456391875746717e-05,
"loss": 0.2474,
"step": 1443
},
{
"epoch": 3.4305679452869464,
"grad_norm": 0.5939714908599854,
"learning_rate": 1.4396654719235364e-05,
"loss": 0.1785,
"step": 1444
},
{
"epoch": 3.432946773713946,
"grad_norm": 0.7304908037185669,
"learning_rate": 1.4336917562724014e-05,
"loss": 0.2588,
"step": 1445
},
{
"epoch": 3.4353256021409457,
"grad_norm": 0.5911548733711243,
"learning_rate": 1.4277180406212664e-05,
"loss": 0.2105,
"step": 1446
},
{
"epoch": 3.4377044305679454,
"grad_norm": 0.6603512167930603,
"learning_rate": 1.4217443249701315e-05,
"loss": 0.2324,
"step": 1447
},
{
"epoch": 3.440083258994945,
"grad_norm": 0.618499755859375,
"learning_rate": 1.4157706093189965e-05,
"loss": 0.221,
"step": 1448
},
{
"epoch": 3.4424620874219447,
"grad_norm": 0.7231235504150391,
"learning_rate": 1.4097968936678615e-05,
"loss": 0.2625,
"step": 1449
},
{
"epoch": 3.4448409158489444,
"grad_norm": 0.6488828659057617,
"learning_rate": 1.4038231780167265e-05,
"loss": 0.2161,
"step": 1450
},
{
"epoch": 3.447219744275944,
"grad_norm": 0.695941150188446,
"learning_rate": 1.3978494623655914e-05,
"loss": 0.1978,
"step": 1451
},
{
"epoch": 3.4495985727029437,
"grad_norm": 0.6400462985038757,
"learning_rate": 1.3918757467144564e-05,
"loss": 0.2576,
"step": 1452
},
{
"epoch": 3.4519774011299433,
"grad_norm": 0.6457123160362244,
"learning_rate": 1.3859020310633215e-05,
"loss": 0.2296,
"step": 1453
},
{
"epoch": 3.454356229556943,
"grad_norm": 0.6414039731025696,
"learning_rate": 1.3799283154121865e-05,
"loss": 0.2452,
"step": 1454
},
{
"epoch": 3.456735057983943,
"grad_norm": 0.7506712675094604,
"learning_rate": 1.3739545997610515e-05,
"loss": 0.2473,
"step": 1455
},
{
"epoch": 3.4591138864109428,
"grad_norm": 0.7017701864242554,
"learning_rate": 1.3679808841099166e-05,
"loss": 0.2143,
"step": 1456
},
{
"epoch": 3.4614927148379424,
"grad_norm": 0.6323108077049255,
"learning_rate": 1.3620071684587816e-05,
"loss": 0.2232,
"step": 1457
},
{
"epoch": 3.463871543264942,
"grad_norm": 0.6644439101219177,
"learning_rate": 1.3560334528076466e-05,
"loss": 0.2429,
"step": 1458
},
{
"epoch": 3.4662503716919417,
"grad_norm": 0.7907066345214844,
"learning_rate": 1.3500597371565113e-05,
"loss": 0.2537,
"step": 1459
},
{
"epoch": 3.4686292001189414,
"grad_norm": 0.6679920554161072,
"learning_rate": 1.3440860215053763e-05,
"loss": 0.1961,
"step": 1460
},
{
"epoch": 3.471008028545941,
"grad_norm": 0.7501968145370483,
"learning_rate": 1.3381123058542414e-05,
"loss": 0.2933,
"step": 1461
},
{
"epoch": 3.4733868569729407,
"grad_norm": 0.6941911578178406,
"learning_rate": 1.3321385902031064e-05,
"loss": 0.2156,
"step": 1462
},
{
"epoch": 3.4757656853999404,
"grad_norm": 0.6447787284851074,
"learning_rate": 1.3261648745519714e-05,
"loss": 0.1829,
"step": 1463
},
{
"epoch": 3.4781445138269405,
"grad_norm": 0.6218757629394531,
"learning_rate": 1.3201911589008365e-05,
"loss": 0.2292,
"step": 1464
},
{
"epoch": 3.48052334225394,
"grad_norm": 0.7498683929443359,
"learning_rate": 1.3142174432497015e-05,
"loss": 0.2756,
"step": 1465
},
{
"epoch": 3.4829021706809398,
"grad_norm": 0.6936948299407959,
"learning_rate": 1.3082437275985665e-05,
"loss": 0.2607,
"step": 1466
},
{
"epoch": 3.4852809991079394,
"grad_norm": 0.6592556238174438,
"learning_rate": 1.3022700119474312e-05,
"loss": 0.1836,
"step": 1467
},
{
"epoch": 3.487659827534939,
"grad_norm": 0.6678502559661865,
"learning_rate": 1.2962962962962962e-05,
"loss": 0.249,
"step": 1468
},
{
"epoch": 3.4900386559619387,
"grad_norm": 0.5917351841926575,
"learning_rate": 1.2903225806451613e-05,
"loss": 0.2522,
"step": 1469
},
{
"epoch": 3.4924174843889384,
"grad_norm": 0.7131730914115906,
"learning_rate": 1.2843488649940263e-05,
"loss": 0.2573,
"step": 1470
},
{
"epoch": 3.494796312815938,
"grad_norm": 0.6857194900512695,
"learning_rate": 1.2783751493428913e-05,
"loss": 0.2162,
"step": 1471
},
{
"epoch": 3.4971751412429377,
"grad_norm": 0.5957016348838806,
"learning_rate": 1.2724014336917564e-05,
"loss": 0.1861,
"step": 1472
},
{
"epoch": 3.4995539696699374,
"grad_norm": 0.6489928364753723,
"learning_rate": 1.2664277180406214e-05,
"loss": 0.2385,
"step": 1473
},
{
"epoch": 3.501932798096937,
"grad_norm": 0.6470348834991455,
"learning_rate": 1.2604540023894863e-05,
"loss": 0.261,
"step": 1474
},
{
"epoch": 3.504311626523937,
"grad_norm": 0.6595022678375244,
"learning_rate": 1.2544802867383513e-05,
"loss": 0.2107,
"step": 1475
},
{
"epoch": 3.506690454950937,
"grad_norm": 0.6955150961875916,
"learning_rate": 1.2485065710872163e-05,
"loss": 0.2775,
"step": 1476
},
{
"epoch": 3.5090692833779364,
"grad_norm": 0.5981859564781189,
"learning_rate": 1.2425328554360813e-05,
"loss": 0.2169,
"step": 1477
},
{
"epoch": 3.511448111804936,
"grad_norm": 0.7145429849624634,
"learning_rate": 1.2365591397849464e-05,
"loss": 0.236,
"step": 1478
},
{
"epoch": 3.5138269402319358,
"grad_norm": 0.7278533577919006,
"learning_rate": 1.2305854241338114e-05,
"loss": 0.2556,
"step": 1479
},
{
"epoch": 3.5162057686589354,
"grad_norm": 0.6920650005340576,
"learning_rate": 1.2246117084826763e-05,
"loss": 0.2613,
"step": 1480
},
{
"epoch": 3.518584597085935,
"grad_norm": 0.6695207357406616,
"learning_rate": 1.2186379928315413e-05,
"loss": 0.2451,
"step": 1481
},
{
"epoch": 3.5209634255129347,
"grad_norm": 0.736623227596283,
"learning_rate": 1.2126642771804063e-05,
"loss": 0.2641,
"step": 1482
},
{
"epoch": 3.5233422539399344,
"grad_norm": 0.6805233955383301,
"learning_rate": 1.2066905615292714e-05,
"loss": 0.2185,
"step": 1483
},
{
"epoch": 3.5257210823669345,
"grad_norm": 0.6742261052131653,
"learning_rate": 1.2007168458781362e-05,
"loss": 0.2401,
"step": 1484
},
{
"epoch": 3.528099910793934,
"grad_norm": 0.5891537666320801,
"learning_rate": 1.1947431302270013e-05,
"loss": 0.2007,
"step": 1485
},
{
"epoch": 3.530478739220934,
"grad_norm": 0.7856806516647339,
"learning_rate": 1.1887694145758663e-05,
"loss": 0.2935,
"step": 1486
},
{
"epoch": 3.5328575676479335,
"grad_norm": 0.657417356967926,
"learning_rate": 1.1827956989247313e-05,
"loss": 0.2079,
"step": 1487
},
{
"epoch": 3.535236396074933,
"grad_norm": 0.817315936088562,
"learning_rate": 1.1768219832735962e-05,
"loss": 0.2897,
"step": 1488
},
{
"epoch": 3.5376152245019328,
"grad_norm": 0.67595374584198,
"learning_rate": 1.1708482676224612e-05,
"loss": 0.2737,
"step": 1489
},
{
"epoch": 3.5399940529289324,
"grad_norm": 0.6928833723068237,
"learning_rate": 1.1648745519713262e-05,
"loss": 0.2437,
"step": 1490
},
{
"epoch": 3.542372881355932,
"grad_norm": 0.6645117402076721,
"learning_rate": 1.1589008363201913e-05,
"loss": 0.1768,
"step": 1491
},
{
"epoch": 3.5447517097829317,
"grad_norm": 0.7368005514144897,
"learning_rate": 1.1529271206690561e-05,
"loss": 0.2562,
"step": 1492
},
{
"epoch": 3.547130538209932,
"grad_norm": 0.6753305792808533,
"learning_rate": 1.1469534050179212e-05,
"loss": 0.2068,
"step": 1493
},
{
"epoch": 3.549509366636931,
"grad_norm": 0.6689797043800354,
"learning_rate": 1.1409796893667862e-05,
"loss": 0.2141,
"step": 1494
},
{
"epoch": 3.551888195063931,
"grad_norm": 0.7002167701721191,
"learning_rate": 1.135005973715651e-05,
"loss": 0.2314,
"step": 1495
},
{
"epoch": 3.554267023490931,
"grad_norm": 0.6405853629112244,
"learning_rate": 1.129032258064516e-05,
"loss": 0.2289,
"step": 1496
},
{
"epoch": 3.5566458519179305,
"grad_norm": 0.7196563482284546,
"learning_rate": 1.1230585424133811e-05,
"loss": 0.279,
"step": 1497
},
{
"epoch": 3.55902468034493,
"grad_norm": 0.7163687348365784,
"learning_rate": 1.1170848267622461e-05,
"loss": 0.2054,
"step": 1498
},
{
"epoch": 3.56140350877193,
"grad_norm": 0.6668791174888611,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.2417,
"step": 1499
},
{
"epoch": 3.5637823371989295,
"grad_norm": 0.8013843894004822,
"learning_rate": 1.1051373954599762e-05,
"loss": 0.2523,
"step": 1500
},
{
"epoch": 3.5637823371989295,
"eval_loss": 0.4071366488933563,
"eval_runtime": 24.9101,
"eval_samples_per_second": 30.028,
"eval_steps_per_second": 15.014,
"step": 1500
},
{
"epoch": 3.566161165625929,
"grad_norm": 0.649554431438446,
"learning_rate": 1.0991636798088412e-05,
"loss": 0.262,
"step": 1501
},
{
"epoch": 3.568539994052929,
"grad_norm": 0.6857299208641052,
"learning_rate": 1.0931899641577063e-05,
"loss": 0.2615,
"step": 1502
},
{
"epoch": 3.5709188224799284,
"grad_norm": 0.7175583839416504,
"learning_rate": 1.0872162485065711e-05,
"loss": 0.258,
"step": 1503
},
{
"epoch": 3.5732976509069285,
"grad_norm": 0.6463188529014587,
"learning_rate": 1.0812425328554361e-05,
"loss": 0.2259,
"step": 1504
},
{
"epoch": 3.575676479333928,
"grad_norm": 0.7195451855659485,
"learning_rate": 1.0752688172043012e-05,
"loss": 0.2458,
"step": 1505
},
{
"epoch": 3.578055307760928,
"grad_norm": 0.7340347766876221,
"learning_rate": 1.0692951015531662e-05,
"loss": 0.2478,
"step": 1506
},
{
"epoch": 3.5804341361879275,
"grad_norm": 0.7194491028785706,
"learning_rate": 1.063321385902031e-05,
"loss": 0.2369,
"step": 1507
},
{
"epoch": 3.582812964614927,
"grad_norm": 0.7110004425048828,
"learning_rate": 1.0573476702508961e-05,
"loss": 0.257,
"step": 1508
},
{
"epoch": 3.585191793041927,
"grad_norm": 0.6778460144996643,
"learning_rate": 1.0513739545997611e-05,
"loss": 0.2128,
"step": 1509
},
{
"epoch": 3.5875706214689265,
"grad_norm": 0.7083544135093689,
"learning_rate": 1.0454002389486262e-05,
"loss": 0.2743,
"step": 1510
},
{
"epoch": 3.589949449895926,
"grad_norm": 0.7141408324241638,
"learning_rate": 1.039426523297491e-05,
"loss": 0.2752,
"step": 1511
},
{
"epoch": 3.592328278322926,
"grad_norm": 0.6944218873977661,
"learning_rate": 1.033452807646356e-05,
"loss": 0.2627,
"step": 1512
},
{
"epoch": 3.594707106749926,
"grad_norm": 0.5565508604049683,
"learning_rate": 1.027479091995221e-05,
"loss": 0.1928,
"step": 1513
},
{
"epoch": 3.597085935176925,
"grad_norm": 0.5949556231498718,
"learning_rate": 1.0215053763440861e-05,
"loss": 0.2007,
"step": 1514
},
{
"epoch": 3.599464763603925,
"grad_norm": 0.6212142109870911,
"learning_rate": 1.015531660692951e-05,
"loss": 0.1798,
"step": 1515
},
{
"epoch": 3.601843592030925,
"grad_norm": 0.7298859357833862,
"learning_rate": 1.009557945041816e-05,
"loss": 0.2466,
"step": 1516
},
{
"epoch": 3.6042224204579245,
"grad_norm": 0.8149348497390747,
"learning_rate": 1.003584229390681e-05,
"loss": 0.3083,
"step": 1517
},
{
"epoch": 3.606601248884924,
"grad_norm": 0.7248669862747192,
"learning_rate": 9.97610513739546e-06,
"loss": 0.2124,
"step": 1518
},
{
"epoch": 3.608980077311924,
"grad_norm": 0.6633000373840332,
"learning_rate": 9.916367980884111e-06,
"loss": 0.2021,
"step": 1519
},
{
"epoch": 3.6113589057389235,
"grad_norm": 0.5681532621383667,
"learning_rate": 9.856630824372761e-06,
"loss": 0.1449,
"step": 1520
},
{
"epoch": 3.613737734165923,
"grad_norm": 0.6119810938835144,
"learning_rate": 9.796893667861412e-06,
"loss": 0.2125,
"step": 1521
},
{
"epoch": 3.6161165625929232,
"grad_norm": 0.6357595920562744,
"learning_rate": 9.73715651135006e-06,
"loss": 0.2047,
"step": 1522
},
{
"epoch": 3.6184953910199225,
"grad_norm": 0.6470745205879211,
"learning_rate": 9.67741935483871e-06,
"loss": 0.2453,
"step": 1523
},
{
"epoch": 3.6208742194469226,
"grad_norm": 0.6681517958641052,
"learning_rate": 9.61768219832736e-06,
"loss": 0.213,
"step": 1524
},
{
"epoch": 3.623253047873922,
"grad_norm": 0.6562544107437134,
"learning_rate": 9.557945041816011e-06,
"loss": 0.2493,
"step": 1525
},
{
"epoch": 3.625631876300922,
"grad_norm": 0.7458012700080872,
"learning_rate": 9.49820788530466e-06,
"loss": 0.2309,
"step": 1526
},
{
"epoch": 3.6280107047279215,
"grad_norm": 0.6401541233062744,
"learning_rate": 9.43847072879331e-06,
"loss": 0.2141,
"step": 1527
},
{
"epoch": 3.630389533154921,
"grad_norm": 0.6830606460571289,
"learning_rate": 9.37873357228196e-06,
"loss": 0.2194,
"step": 1528
},
{
"epoch": 3.632768361581921,
"grad_norm": 0.613207995891571,
"learning_rate": 9.31899641577061e-06,
"loss": 0.2095,
"step": 1529
},
{
"epoch": 3.6351471900089205,
"grad_norm": 0.7548460960388184,
"learning_rate": 9.259259259259259e-06,
"loss": 0.2218,
"step": 1530
},
{
"epoch": 3.63752601843592,
"grad_norm": 0.6122269034385681,
"learning_rate": 9.19952210274791e-06,
"loss": 0.2229,
"step": 1531
},
{
"epoch": 3.63990484686292,
"grad_norm": 0.763201117515564,
"learning_rate": 9.13978494623656e-06,
"loss": 0.2915,
"step": 1532
},
{
"epoch": 3.64228367528992,
"grad_norm": 0.5817426443099976,
"learning_rate": 9.08004778972521e-06,
"loss": 0.2133,
"step": 1533
},
{
"epoch": 3.6446625037169196,
"grad_norm": 0.6793233752250671,
"learning_rate": 9.020310633213859e-06,
"loss": 0.1959,
"step": 1534
},
{
"epoch": 3.6470413321439192,
"grad_norm": 0.7488880157470703,
"learning_rate": 8.960573476702509e-06,
"loss": 0.2454,
"step": 1535
},
{
"epoch": 3.649420160570919,
"grad_norm": 0.6657071709632874,
"learning_rate": 8.90083632019116e-06,
"loss": 0.2157,
"step": 1536
},
{
"epoch": 3.6517989889979185,
"grad_norm": 0.6894775629043579,
"learning_rate": 8.84109916367981e-06,
"loss": 0.2498,
"step": 1537
},
{
"epoch": 3.654177817424918,
"grad_norm": 0.6401564478874207,
"learning_rate": 8.781362007168458e-06,
"loss": 0.1939,
"step": 1538
},
{
"epoch": 3.656556645851918,
"grad_norm": 0.7901315689086914,
"learning_rate": 8.721624850657108e-06,
"loss": 0.2725,
"step": 1539
},
{
"epoch": 3.6589354742789175,
"grad_norm": 0.6278257369995117,
"learning_rate": 8.661887694145759e-06,
"loss": 0.2293,
"step": 1540
},
{
"epoch": 3.661314302705917,
"grad_norm": 0.6456495523452759,
"learning_rate": 8.602150537634409e-06,
"loss": 0.2433,
"step": 1541
},
{
"epoch": 3.6636931311329173,
"grad_norm": 0.6201629042625427,
"learning_rate": 8.54241338112306e-06,
"loss": 0.1798,
"step": 1542
},
{
"epoch": 3.6660719595599165,
"grad_norm": 0.7407602071762085,
"learning_rate": 8.48267622461171e-06,
"loss": 0.2428,
"step": 1543
},
{
"epoch": 3.6684507879869166,
"grad_norm": 0.7235687375068665,
"learning_rate": 8.42293906810036e-06,
"loss": 0.2421,
"step": 1544
},
{
"epoch": 3.6708296164139163,
"grad_norm": 0.6602762341499329,
"learning_rate": 8.363201911589009e-06,
"loss": 0.2513,
"step": 1545
},
{
"epoch": 3.673208444840916,
"grad_norm": 0.7141433954238892,
"learning_rate": 8.303464755077659e-06,
"loss": 0.2656,
"step": 1546
},
{
"epoch": 3.6755872732679156,
"grad_norm": 0.6279122233390808,
"learning_rate": 8.24372759856631e-06,
"loss": 0.1955,
"step": 1547
},
{
"epoch": 3.6779661016949152,
"grad_norm": 0.6232267618179321,
"learning_rate": 8.18399044205496e-06,
"loss": 0.1934,
"step": 1548
},
{
"epoch": 3.680344930121915,
"grad_norm": 0.7122899889945984,
"learning_rate": 8.124253285543608e-06,
"loss": 0.2608,
"step": 1549
},
{
"epoch": 3.6827237585489145,
"grad_norm": 0.6872605085372925,
"learning_rate": 8.064516129032258e-06,
"loss": 0.255,
"step": 1550
},
{
"epoch": 3.685102586975914,
"grad_norm": 0.7177041172981262,
"learning_rate": 8.004778972520909e-06,
"loss": 0.308,
"step": 1551
},
{
"epoch": 3.687481415402914,
"grad_norm": 0.6372491121292114,
"learning_rate": 7.945041816009559e-06,
"loss": 0.194,
"step": 1552
},
{
"epoch": 3.689860243829914,
"grad_norm": 0.6712515354156494,
"learning_rate": 7.885304659498208e-06,
"loss": 0.211,
"step": 1553
},
{
"epoch": 3.6922390722569136,
"grad_norm": 0.6724277138710022,
"learning_rate": 7.825567502986858e-06,
"loss": 0.22,
"step": 1554
},
{
"epoch": 3.6946179006839133,
"grad_norm": 0.729597806930542,
"learning_rate": 7.765830346475508e-06,
"loss": 0.2722,
"step": 1555
},
{
"epoch": 3.696996729110913,
"grad_norm": 0.7727295160293579,
"learning_rate": 7.706093189964159e-06,
"loss": 0.2225,
"step": 1556
},
{
"epoch": 3.6993755575379126,
"grad_norm": 0.8393397927284241,
"learning_rate": 7.646356033452807e-06,
"loss": 0.253,
"step": 1557
},
{
"epoch": 3.7017543859649122,
"grad_norm": 0.5970509052276611,
"learning_rate": 7.586618876941457e-06,
"loss": 0.2052,
"step": 1558
},
{
"epoch": 3.704133214391912,
"grad_norm": 0.6462686061859131,
"learning_rate": 7.526881720430108e-06,
"loss": 0.2158,
"step": 1559
},
{
"epoch": 3.7065120428189116,
"grad_norm": 0.6514200568199158,
"learning_rate": 7.467144563918758e-06,
"loss": 0.2331,
"step": 1560
},
{
"epoch": 3.708890871245911,
"grad_norm": 0.588154137134552,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.2044,
"step": 1561
},
{
"epoch": 3.7112696996729113,
"grad_norm": 0.7149415016174316,
"learning_rate": 7.347670250896058e-06,
"loss": 0.2486,
"step": 1562
},
{
"epoch": 3.7136485280999105,
"grad_norm": 0.6742112636566162,
"learning_rate": 7.287933094384708e-06,
"loss": 0.2379,
"step": 1563
},
{
"epoch": 3.7160273565269106,
"grad_norm": 0.6923094987869263,
"learning_rate": 7.228195937873358e-06,
"loss": 0.2474,
"step": 1564
},
{
"epoch": 3.7184061849539103,
"grad_norm": 0.7106072902679443,
"learning_rate": 7.168458781362007e-06,
"loss": 0.2541,
"step": 1565
},
{
"epoch": 3.72078501338091,
"grad_norm": 0.6844406127929688,
"learning_rate": 7.108721624850657e-06,
"loss": 0.2154,
"step": 1566
},
{
"epoch": 3.7231638418079096,
"grad_norm": 0.7566594481468201,
"learning_rate": 7.048984468339308e-06,
"loss": 0.2432,
"step": 1567
},
{
"epoch": 3.7255426702349093,
"grad_norm": 0.6934136152267456,
"learning_rate": 6.989247311827957e-06,
"loss": 0.1982,
"step": 1568
},
{
"epoch": 3.727921498661909,
"grad_norm": 0.6883266568183899,
"learning_rate": 6.929510155316607e-06,
"loss": 0.2299,
"step": 1569
},
{
"epoch": 3.7303003270889086,
"grad_norm": 0.6047712564468384,
"learning_rate": 6.869772998805258e-06,
"loss": 0.193,
"step": 1570
},
{
"epoch": 3.7326791555159087,
"grad_norm": 0.7365937232971191,
"learning_rate": 6.810035842293908e-06,
"loss": 0.224,
"step": 1571
},
{
"epoch": 3.735057983942908,
"grad_norm": 0.7228516936302185,
"learning_rate": 6.7502986857825566e-06,
"loss": 0.2644,
"step": 1572
},
{
"epoch": 3.737436812369908,
"grad_norm": 0.6381697058677673,
"learning_rate": 6.690561529271207e-06,
"loss": 0.1966,
"step": 1573
},
{
"epoch": 3.7398156407969076,
"grad_norm": 0.7666082978248596,
"learning_rate": 6.630824372759857e-06,
"loss": 0.2725,
"step": 1574
},
{
"epoch": 3.7421944692239073,
"grad_norm": 0.6610901951789856,
"learning_rate": 6.5710872162485075e-06,
"loss": 0.2398,
"step": 1575
},
{
"epoch": 3.744573297650907,
"grad_norm": 0.7147103548049927,
"learning_rate": 6.511350059737156e-06,
"loss": 0.246,
"step": 1576
},
{
"epoch": 3.7469521260779066,
"grad_norm": 0.681191086769104,
"learning_rate": 6.451612903225806e-06,
"loss": 0.2185,
"step": 1577
},
{
"epoch": 3.7493309545049063,
"grad_norm": 0.6059114336967468,
"learning_rate": 6.391875746714457e-06,
"loss": 0.2127,
"step": 1578
},
{
"epoch": 3.751709782931906,
"grad_norm": 0.7046500444412231,
"learning_rate": 6.332138590203107e-06,
"loss": 0.226,
"step": 1579
},
{
"epoch": 3.7540886113589056,
"grad_norm": 0.6811462640762329,
"learning_rate": 6.2724014336917564e-06,
"loss": 0.2474,
"step": 1580
},
{
"epoch": 3.7564674397859052,
"grad_norm": 0.7344135642051697,
"learning_rate": 6.212664277180407e-06,
"loss": 0.2569,
"step": 1581
},
{
"epoch": 3.7588462682129054,
"grad_norm": 0.7511917352676392,
"learning_rate": 6.152927120669057e-06,
"loss": 0.2848,
"step": 1582
},
{
"epoch": 3.761225096639905,
"grad_norm": 0.7010191679000854,
"learning_rate": 6.0931899641577065e-06,
"loss": 0.2017,
"step": 1583
},
{
"epoch": 3.7636039250669047,
"grad_norm": 0.722025990486145,
"learning_rate": 6.033452807646357e-06,
"loss": 0.2473,
"step": 1584
},
{
"epoch": 3.7659827534939043,
"grad_norm": 0.6399304866790771,
"learning_rate": 5.973715651135006e-06,
"loss": 0.2219,
"step": 1585
},
{
"epoch": 3.768361581920904,
"grad_norm": 0.6593719720840454,
"learning_rate": 5.9139784946236566e-06,
"loss": 0.2105,
"step": 1586
},
{
"epoch": 3.7707404103479036,
"grad_norm": 0.6997963786125183,
"learning_rate": 5.854241338112306e-06,
"loss": 0.2317,
"step": 1587
},
{
"epoch": 3.7731192387749033,
"grad_norm": 0.7119143009185791,
"learning_rate": 5.794504181600956e-06,
"loss": 0.2567,
"step": 1588
},
{
"epoch": 3.775498067201903,
"grad_norm": 0.6408453583717346,
"learning_rate": 5.734767025089606e-06,
"loss": 0.2252,
"step": 1589
},
{
"epoch": 3.7778768956289026,
"grad_norm": 0.6919389963150024,
"learning_rate": 5.675029868578255e-06,
"loss": 0.1793,
"step": 1590
},
{
"epoch": 3.7802557240559027,
"grad_norm": 0.6644212007522583,
"learning_rate": 5.6152927120669055e-06,
"loss": 0.2008,
"step": 1591
},
{
"epoch": 3.782634552482902,
"grad_norm": 0.7227513194084167,
"learning_rate": 5.555555555555556e-06,
"loss": 0.2518,
"step": 1592
},
{
"epoch": 3.785013380909902,
"grad_norm": 0.7298300266265869,
"learning_rate": 5.495818399044206e-06,
"loss": 0.2362,
"step": 1593
},
{
"epoch": 3.7873922093369017,
"grad_norm": 0.5880789160728455,
"learning_rate": 5.436081242532856e-06,
"loss": 0.1614,
"step": 1594
},
{
"epoch": 3.7897710377639013,
"grad_norm": 0.7107828259468079,
"learning_rate": 5.376344086021506e-06,
"loss": 0.2027,
"step": 1595
},
{
"epoch": 3.792149866190901,
"grad_norm": 0.7066324949264526,
"learning_rate": 5.316606929510155e-06,
"loss": 0.2406,
"step": 1596
},
{
"epoch": 3.7945286946179007,
"grad_norm": 0.7864248156547546,
"learning_rate": 5.256869772998806e-06,
"loss": 0.3063,
"step": 1597
},
{
"epoch": 3.7969075230449003,
"grad_norm": 0.8586356043815613,
"learning_rate": 5.197132616487455e-06,
"loss": 0.2634,
"step": 1598
},
{
"epoch": 3.7992863514719,
"grad_norm": 0.6787355542182922,
"learning_rate": 5.137395459976105e-06,
"loss": 0.2439,
"step": 1599
},
{
"epoch": 3.8016651798988996,
"grad_norm": 0.6626994013786316,
"learning_rate": 5.077658303464755e-06,
"loss": 0.2148,
"step": 1600
},
{
"epoch": 3.8016651798988996,
"eval_loss": 0.406727135181427,
"eval_runtime": 24.8753,
"eval_samples_per_second": 30.07,
"eval_steps_per_second": 15.035,
"step": 1600
},
{
"epoch": 3.8040440083258993,
"grad_norm": 0.7175625562667847,
"learning_rate": 5.017921146953405e-06,
"loss": 0.2611,
"step": 1601
},
{
"epoch": 3.8064228367528994,
"grad_norm": 0.6542512774467468,
"learning_rate": 4.9581839904420555e-06,
"loss": 0.2489,
"step": 1602
},
{
"epoch": 3.808801665179899,
"grad_norm": 0.5721225738525391,
"learning_rate": 4.898446833930706e-06,
"loss": 0.1609,
"step": 1603
},
{
"epoch": 3.8111804936068987,
"grad_norm": 0.6456505060195923,
"learning_rate": 4.838709677419355e-06,
"loss": 0.2108,
"step": 1604
},
{
"epoch": 3.8135593220338984,
"grad_norm": 0.7169181108474731,
"learning_rate": 4.7789725209080055e-06,
"loss": 0.2584,
"step": 1605
},
{
"epoch": 3.815938150460898,
"grad_norm": 0.7455881237983704,
"learning_rate": 4.719235364396655e-06,
"loss": 0.2769,
"step": 1606
},
{
"epoch": 3.8183169788878977,
"grad_norm": 0.706899881362915,
"learning_rate": 4.659498207885305e-06,
"loss": 0.188,
"step": 1607
},
{
"epoch": 3.8206958073148973,
"grad_norm": 0.7342420220375061,
"learning_rate": 4.599761051373955e-06,
"loss": 0.2373,
"step": 1608
},
{
"epoch": 3.823074635741897,
"grad_norm": 0.6335705518722534,
"learning_rate": 4.540023894862605e-06,
"loss": 0.2299,
"step": 1609
},
{
"epoch": 3.8254534641688966,
"grad_norm": 0.7097893953323364,
"learning_rate": 4.4802867383512545e-06,
"loss": 0.321,
"step": 1610
},
{
"epoch": 3.8278322925958967,
"grad_norm": 0.7030773758888245,
"learning_rate": 4.420549581839905e-06,
"loss": 0.2354,
"step": 1611
},
{
"epoch": 3.830211121022896,
"grad_norm": 0.7927830815315247,
"learning_rate": 4.360812425328554e-06,
"loss": 0.2692,
"step": 1612
},
{
"epoch": 3.832589949449896,
"grad_norm": 0.7883430123329163,
"learning_rate": 4.3010752688172045e-06,
"loss": 0.2383,
"step": 1613
},
{
"epoch": 3.8349687778768957,
"grad_norm": 0.777301549911499,
"learning_rate": 4.241338112305855e-06,
"loss": 0.2841,
"step": 1614
},
{
"epoch": 3.8373476063038954,
"grad_norm": 0.7094290852546692,
"learning_rate": 4.181600955794504e-06,
"loss": 0.2444,
"step": 1615
},
{
"epoch": 3.839726434730895,
"grad_norm": 0.6646528244018555,
"learning_rate": 4.121863799283155e-06,
"loss": 0.2186,
"step": 1616
},
{
"epoch": 3.8421052631578947,
"grad_norm": 0.6202448010444641,
"learning_rate": 4.062126642771804e-06,
"loss": 0.1955,
"step": 1617
},
{
"epoch": 3.8444840915848943,
"grad_norm": 0.7290095090866089,
"learning_rate": 4.002389486260454e-06,
"loss": 0.2387,
"step": 1618
},
{
"epoch": 3.846862920011894,
"grad_norm": 0.832423746585846,
"learning_rate": 3.942652329749104e-06,
"loss": 0.2889,
"step": 1619
},
{
"epoch": 3.849241748438894,
"grad_norm": 0.7755414247512817,
"learning_rate": 3.882915173237754e-06,
"loss": 0.1783,
"step": 1620
},
{
"epoch": 3.8516205768658933,
"grad_norm": 0.6566904187202454,
"learning_rate": 3.8231780167264036e-06,
"loss": 0.2044,
"step": 1621
},
{
"epoch": 3.8539994052928934,
"grad_norm": 0.707445502281189,
"learning_rate": 3.763440860215054e-06,
"loss": 0.3036,
"step": 1622
},
{
"epoch": 3.856378233719893,
"grad_norm": 0.6451879143714905,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.3102,
"step": 1623
},
{
"epoch": 3.8587570621468927,
"grad_norm": 0.6781940460205078,
"learning_rate": 3.643966547192354e-06,
"loss": 0.196,
"step": 1624
},
{
"epoch": 3.8611358905738924,
"grad_norm": 0.6479291319847107,
"learning_rate": 3.5842293906810035e-06,
"loss": 0.2299,
"step": 1625
},
{
"epoch": 3.863514719000892,
"grad_norm": 0.7208328247070312,
"learning_rate": 3.524492234169654e-06,
"loss": 0.2393,
"step": 1626
},
{
"epoch": 3.8658935474278917,
"grad_norm": 0.6611060500144958,
"learning_rate": 3.4647550776583037e-06,
"loss": 0.2117,
"step": 1627
},
{
"epoch": 3.8682723758548914,
"grad_norm": 0.6570304036140442,
"learning_rate": 3.405017921146954e-06,
"loss": 0.2343,
"step": 1628
},
{
"epoch": 3.870651204281891,
"grad_norm": 0.7514824271202087,
"learning_rate": 3.3452807646356034e-06,
"loss": 0.238,
"step": 1629
},
{
"epoch": 3.8730300327088907,
"grad_norm": 0.6288111209869385,
"learning_rate": 3.2855436081242537e-06,
"loss": 0.2154,
"step": 1630
},
{
"epoch": 3.875408861135891,
"grad_norm": 0.7032327651977539,
"learning_rate": 3.225806451612903e-06,
"loss": 0.2279,
"step": 1631
},
{
"epoch": 3.8777876895628904,
"grad_norm": 0.6228746175765991,
"learning_rate": 3.1660692951015535e-06,
"loss": 0.1931,
"step": 1632
},
{
"epoch": 3.88016651798989,
"grad_norm": 0.6107305884361267,
"learning_rate": 3.1063321385902034e-06,
"loss": 0.2251,
"step": 1633
},
{
"epoch": 3.8825453464168898,
"grad_norm": 0.6516736149787903,
"learning_rate": 3.0465949820788532e-06,
"loss": 0.2714,
"step": 1634
},
{
"epoch": 3.8849241748438894,
"grad_norm": 0.7341564893722534,
"learning_rate": 2.986857825567503e-06,
"loss": 0.241,
"step": 1635
},
{
"epoch": 3.887303003270889,
"grad_norm": 0.6014671325683594,
"learning_rate": 2.927120669056153e-06,
"loss": 0.1807,
"step": 1636
},
{
"epoch": 3.8896818316978887,
"grad_norm": 0.7346065044403076,
"learning_rate": 2.867383512544803e-06,
"loss": 0.2421,
"step": 1637
},
{
"epoch": 3.8920606601248884,
"grad_norm": 0.7137173414230347,
"learning_rate": 2.8076463560334528e-06,
"loss": 0.2606,
"step": 1638
},
{
"epoch": 3.894439488551888,
"grad_norm": 0.7425440549850464,
"learning_rate": 2.747909199522103e-06,
"loss": 0.218,
"step": 1639
},
{
"epoch": 3.896818316978888,
"grad_norm": 0.5857921838760376,
"learning_rate": 2.688172043010753e-06,
"loss": 0.1838,
"step": 1640
},
{
"epoch": 3.8991971454058874,
"grad_norm": 0.5919234156608582,
"learning_rate": 2.628434886499403e-06,
"loss": 0.1848,
"step": 1641
},
{
"epoch": 3.9015759738328875,
"grad_norm": 0.7037178874015808,
"learning_rate": 2.5686977299880527e-06,
"loss": 0.2417,
"step": 1642
},
{
"epoch": 3.903954802259887,
"grad_norm": 0.7201714515686035,
"learning_rate": 2.5089605734767026e-06,
"loss": 0.2424,
"step": 1643
},
{
"epoch": 3.9063336306868868,
"grad_norm": 0.7508910298347473,
"learning_rate": 2.449223416965353e-06,
"loss": 0.2837,
"step": 1644
},
{
"epoch": 3.9087124591138864,
"grad_norm": 0.7130612730979919,
"learning_rate": 2.3894862604540028e-06,
"loss": 0.2364,
"step": 1645
},
{
"epoch": 3.911091287540886,
"grad_norm": 0.6658805012702942,
"learning_rate": 2.3297491039426526e-06,
"loss": 0.215,
"step": 1646
},
{
"epoch": 3.9134701159678857,
"grad_norm": 0.6496844291687012,
"learning_rate": 2.2700119474313025e-06,
"loss": 0.2213,
"step": 1647
},
{
"epoch": 3.9158489443948854,
"grad_norm": 0.6499704122543335,
"learning_rate": 2.2102747909199524e-06,
"loss": 0.2403,
"step": 1648
},
{
"epoch": 3.918227772821885,
"grad_norm": 0.7895413637161255,
"learning_rate": 2.1505376344086023e-06,
"loss": 0.2743,
"step": 1649
},
{
"epoch": 3.9206066012488847,
"grad_norm": 0.6984461545944214,
"learning_rate": 2.090800477897252e-06,
"loss": 0.2492,
"step": 1650
},
{
"epoch": 3.922985429675885,
"grad_norm": 0.6958913207054138,
"learning_rate": 2.031063321385902e-06,
"loss": 0.223,
"step": 1651
},
{
"epoch": 3.9253642581028845,
"grad_norm": 0.8210883736610413,
"learning_rate": 1.971326164874552e-06,
"loss": 0.3268,
"step": 1652
},
{
"epoch": 3.927743086529884,
"grad_norm": 0.7338096499443054,
"learning_rate": 1.9115890083632018e-06,
"loss": 0.2821,
"step": 1653
},
{
"epoch": 3.930121914956884,
"grad_norm": 0.7852990031242371,
"learning_rate": 1.8518518518518519e-06,
"loss": 0.2819,
"step": 1654
},
{
"epoch": 3.9325007433838834,
"grad_norm": 0.587360680103302,
"learning_rate": 1.7921146953405017e-06,
"loss": 0.219,
"step": 1655
},
{
"epoch": 3.934879571810883,
"grad_norm": 0.5991771221160889,
"learning_rate": 1.7323775388291518e-06,
"loss": 0.1996,
"step": 1656
},
{
"epoch": 3.9372584002378828,
"grad_norm": 0.7518739104270935,
"learning_rate": 1.6726403823178017e-06,
"loss": 0.2585,
"step": 1657
},
{
"epoch": 3.9396372286648824,
"grad_norm": 0.725003719329834,
"learning_rate": 1.6129032258064516e-06,
"loss": 0.2435,
"step": 1658
},
{
"epoch": 3.942016057091882,
"grad_norm": 0.5956133604049683,
"learning_rate": 1.5531660692951017e-06,
"loss": 0.2052,
"step": 1659
},
{
"epoch": 3.944394885518882,
"grad_norm": 0.5697284936904907,
"learning_rate": 1.4934289127837516e-06,
"loss": 0.1942,
"step": 1660
},
{
"epoch": 3.9467737139458814,
"grad_norm": 0.6322106122970581,
"learning_rate": 1.4336917562724014e-06,
"loss": 0.1886,
"step": 1661
},
{
"epoch": 3.9491525423728815,
"grad_norm": 0.6705097556114197,
"learning_rate": 1.3739545997610515e-06,
"loss": 0.2222,
"step": 1662
},
{
"epoch": 3.951531370799881,
"grad_norm": 0.8009798526763916,
"learning_rate": 1.3142174432497014e-06,
"loss": 0.2858,
"step": 1663
},
{
"epoch": 3.953910199226881,
"grad_norm": 0.5885463356971741,
"learning_rate": 1.2544802867383513e-06,
"loss": 0.1702,
"step": 1664
},
{
"epoch": 3.9562890276538805,
"grad_norm": 0.7516363263130188,
"learning_rate": 1.1947431302270014e-06,
"loss": 0.2719,
"step": 1665
},
{
"epoch": 3.95866785608088,
"grad_norm": 0.6109394431114197,
"learning_rate": 1.1350059737156513e-06,
"loss": 0.2404,
"step": 1666
},
{
"epoch": 3.96104668450788,
"grad_norm": 0.6410830020904541,
"learning_rate": 1.0752688172043011e-06,
"loss": 0.2229,
"step": 1667
},
{
"epoch": 3.9634255129348794,
"grad_norm": 0.6514939665794373,
"learning_rate": 1.015531660692951e-06,
"loss": 0.2404,
"step": 1668
},
{
"epoch": 3.9658043413618795,
"grad_norm": 0.7575217485427856,
"learning_rate": 9.557945041816009e-07,
"loss": 0.22,
"step": 1669
},
{
"epoch": 3.9681831697888788,
"grad_norm": 0.675889790058136,
"learning_rate": 8.960573476702509e-07,
"loss": 0.2421,
"step": 1670
},
{
"epoch": 3.970561998215879,
"grad_norm": 0.7055429816246033,
"learning_rate": 8.363201911589009e-07,
"loss": 0.2476,
"step": 1671
},
{
"epoch": 3.9729408266428785,
"grad_norm": 0.7240319848060608,
"learning_rate": 7.765830346475508e-07,
"loss": 0.2222,
"step": 1672
},
{
"epoch": 3.975319655069878,
"grad_norm": 0.7141379714012146,
"learning_rate": 7.168458781362007e-07,
"loss": 0.1832,
"step": 1673
},
{
"epoch": 3.977698483496878,
"grad_norm": 0.6299167275428772,
"learning_rate": 6.571087216248507e-07,
"loss": 0.2301,
"step": 1674
},
{
"epoch": 3.9800773119238775,
"grad_norm": 0.7430551052093506,
"learning_rate": 5.973715651135007e-07,
"loss": 0.2718,
"step": 1675
},
{
"epoch": 3.982456140350877,
"grad_norm": 0.6470373272895813,
"learning_rate": 5.376344086021506e-07,
"loss": 0.2408,
"step": 1676
},
{
"epoch": 3.984834968777877,
"grad_norm": 0.6226888298988342,
"learning_rate": 4.778972520908004e-07,
"loss": 0.2267,
"step": 1677
},
{
"epoch": 3.9872137972048765,
"grad_norm": 0.6659932732582092,
"learning_rate": 4.1816009557945043e-07,
"loss": 0.2575,
"step": 1678
},
{
"epoch": 3.989592625631876,
"grad_norm": 0.6825204491615295,
"learning_rate": 3.5842293906810036e-07,
"loss": 0.2001,
"step": 1679
},
{
"epoch": 3.991971454058876,
"grad_norm": 0.6531214118003845,
"learning_rate": 2.9868578255675034e-07,
"loss": 0.2078,
"step": 1680
},
{
"epoch": 3.994350282485876,
"grad_norm": 0.674887478351593,
"learning_rate": 2.389486260454002e-07,
"loss": 0.2757,
"step": 1681
},
{
"epoch": 3.9967291109128755,
"grad_norm": 0.704774796962738,
"learning_rate": 1.7921146953405018e-07,
"loss": 0.2122,
"step": 1682
},
{
"epoch": 3.999107939339875,
"grad_norm": 0.6378912329673767,
"learning_rate": 1.194743130227001e-07,
"loss": 0.2269,
"step": 1683
},
{
"epoch": 4.0,
"grad_norm": 1.035918951034546,
"learning_rate": 5.973715651135006e-08,
"loss": 0.2651,
"step": 1684
}
],
"logging_steps": 1,
"max_steps": 1684,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.855004092650957e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}