{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.16478536705940514,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 8.239268352970256e-05,
"grad_norm": 370.92846474567483,
"learning_rate": 0.0,
"loss": 1.9502,
"step": 1
},
{
"epoch": 0.00016478536705940512,
"grad_norm": 478.9745480395951,
"learning_rate": 1.3717421124828532e-08,
"loss": 1.7099,
"step": 2
},
{
"epoch": 0.0002471780505891077,
"grad_norm": 816.010339911805,
"learning_rate": 2.7434842249657065e-08,
"loss": 1.9407,
"step": 3
},
{
"epoch": 0.00032957073411881023,
"grad_norm": 472.8469994501748,
"learning_rate": 4.1152263374485605e-08,
"loss": 2.0365,
"step": 4
},
{
"epoch": 0.00041196341764851283,
"grad_norm": 578.3746838755396,
"learning_rate": 5.486968449931413e-08,
"loss": 1.9464,
"step": 5
},
{
"epoch": 0.0004943561011782154,
"grad_norm": 404.71316071877214,
"learning_rate": 6.858710562414266e-08,
"loss": 1.6153,
"step": 6
},
{
"epoch": 0.000576748784707918,
"grad_norm": 336.6982918558558,
"learning_rate": 8.230452674897121e-08,
"loss": 1.5113,
"step": 7
},
{
"epoch": 0.0006591414682376205,
"grad_norm": 341.52944273145397,
"learning_rate": 9.602194787379974e-08,
"loss": 1.8882,
"step": 8
},
{
"epoch": 0.0007415341517673231,
"grad_norm": 391.4084771497659,
"learning_rate": 1.0973936899862826e-07,
"loss": 2.0696,
"step": 9
},
{
"epoch": 0.0008239268352970257,
"grad_norm": 422.9036219570121,
"learning_rate": 1.234567901234568e-07,
"loss": 2.0605,
"step": 10
},
{
"epoch": 0.0009063195188267282,
"grad_norm": 797.3739771528897,
"learning_rate": 1.3717421124828532e-07,
"loss": 2.4847,
"step": 11
},
{
"epoch": 0.0009887122023564308,
"grad_norm": 375.3289213197193,
"learning_rate": 1.5089163237311387e-07,
"loss": 1.9973,
"step": 12
},
{
"epoch": 0.0010711048858861334,
"grad_norm": 392.38417270652195,
"learning_rate": 1.6460905349794242e-07,
"loss": 1.8486,
"step": 13
},
{
"epoch": 0.001153497569415836,
"grad_norm": 299.91546496064956,
"learning_rate": 1.7832647462277092e-07,
"loss": 2.0903,
"step": 14
},
{
"epoch": 0.0012358902529455383,
"grad_norm": 323.60559376717487,
"learning_rate": 1.9204389574759947e-07,
"loss": 2.199,
"step": 15
},
{
"epoch": 0.001318282936475241,
"grad_norm": 448.30929677447216,
"learning_rate": 2.05761316872428e-07,
"loss": 1.9866,
"step": 16
},
{
"epoch": 0.0014006756200049435,
"grad_norm": 496.01105968376754,
"learning_rate": 2.1947873799725652e-07,
"loss": 1.7145,
"step": 17
},
{
"epoch": 0.0014830683035346461,
"grad_norm": 254.87258601806334,
"learning_rate": 2.3319615912208507e-07,
"loss": 1.4035,
"step": 18
},
{
"epoch": 0.0015654609870643487,
"grad_norm": 196.08575158272188,
"learning_rate": 2.469135802469136e-07,
"loss": 1.5703,
"step": 19
},
{
"epoch": 0.0016478536705940513,
"grad_norm": 181.93540410857392,
"learning_rate": 2.606310013717421e-07,
"loss": 1.2346,
"step": 20
},
{
"epoch": 0.0017302463541237537,
"grad_norm": 292.3097054026038,
"learning_rate": 2.7434842249657064e-07,
"loss": 1.6449,
"step": 21
},
{
"epoch": 0.0018126390376534563,
"grad_norm": 189.7490456066442,
"learning_rate": 2.880658436213992e-07,
"loss": 1.3064,
"step": 22
},
{
"epoch": 0.001895031721183159,
"grad_norm": 154.94758342589185,
"learning_rate": 3.0178326474622774e-07,
"loss": 0.8725,
"step": 23
},
{
"epoch": 0.0019774244047128615,
"grad_norm": 175.30016139724518,
"learning_rate": 3.1550068587105627e-07,
"loss": 1.0209,
"step": 24
},
{
"epoch": 0.002059817088242564,
"grad_norm": 138.98861988367486,
"learning_rate": 3.2921810699588484e-07,
"loss": 0.819,
"step": 25
},
{
"epoch": 0.0021422097717722667,
"grad_norm": 171.4439871730333,
"learning_rate": 3.4293552812071337e-07,
"loss": 0.9231,
"step": 26
},
{
"epoch": 0.0022246024553019693,
"grad_norm": 198.66126017668074,
"learning_rate": 3.5665294924554184e-07,
"loss": 1.1098,
"step": 27
},
{
"epoch": 0.002306995138831672,
"grad_norm": 128.99561291431132,
"learning_rate": 3.7037037037037036e-07,
"loss": 0.6259,
"step": 28
},
{
"epoch": 0.0023893878223613745,
"grad_norm": 139.7562027108659,
"learning_rate": 3.8408779149519894e-07,
"loss": 0.9026,
"step": 29
},
{
"epoch": 0.0024717805058910767,
"grad_norm": 8493.779228229192,
"learning_rate": 3.9780521262002746e-07,
"loss": 7.6096,
"step": 30
},
{
"epoch": 0.0025541731894207793,
"grad_norm": 96.52109090080035,
"learning_rate": 4.11522633744856e-07,
"loss": 0.9848,
"step": 31
},
{
"epoch": 0.002636565872950482,
"grad_norm": 46.277401372799,
"learning_rate": 4.252400548696845e-07,
"loss": 0.7812,
"step": 32
},
{
"epoch": 0.0027189585564801845,
"grad_norm": 90.33996022715957,
"learning_rate": 4.3895747599451304e-07,
"loss": 0.9017,
"step": 33
},
{
"epoch": 0.002801351240009887,
"grad_norm": 34.651039695496756,
"learning_rate": 4.526748971193416e-07,
"loss": 0.5284,
"step": 34
},
{
"epoch": 0.0028837439235395897,
"grad_norm": 57.438589709256526,
"learning_rate": 4.6639231824417014e-07,
"loss": 0.6959,
"step": 35
},
{
"epoch": 0.0029661366070692923,
"grad_norm": 88.03798435445572,
"learning_rate": 4.801097393689986e-07,
"loss": 0.6558,
"step": 36
},
{
"epoch": 0.003048529290598995,
"grad_norm": 168.93972734994549,
"learning_rate": 4.938271604938272e-07,
"loss": 1.0815,
"step": 37
},
{
"epoch": 0.0031309219741286975,
"grad_norm": 46.38654936308973,
"learning_rate": 5.075445816186558e-07,
"loss": 0.6026,
"step": 38
},
{
"epoch": 0.0032133146576584,
"grad_norm": 44.94814475982433,
"learning_rate": 5.212620027434842e-07,
"loss": 0.7385,
"step": 39
},
{
"epoch": 0.0032957073411881027,
"grad_norm": 165.7034467276052,
"learning_rate": 5.349794238683128e-07,
"loss": 0.6596,
"step": 40
},
{
"epoch": 0.0033781000247178053,
"grad_norm": 97.98006118196572,
"learning_rate": 5.486968449931413e-07,
"loss": 1.02,
"step": 41
},
{
"epoch": 0.0034604927082475074,
"grad_norm": 69.93402365846087,
"learning_rate": 5.624142661179699e-07,
"loss": 0.674,
"step": 42
},
{
"epoch": 0.00354288539177721,
"grad_norm": 1163.1480162660114,
"learning_rate": 5.761316872427984e-07,
"loss": 3.2917,
"step": 43
},
{
"epoch": 0.0036252780753069126,
"grad_norm": 74.37426736775474,
"learning_rate": 5.898491083676269e-07,
"loss": 0.7122,
"step": 44
},
{
"epoch": 0.0037076707588366152,
"grad_norm": 91.55829350532069,
"learning_rate": 6.035665294924555e-07,
"loss": 0.8284,
"step": 45
},
{
"epoch": 0.003790063442366318,
"grad_norm": 46.12057854425198,
"learning_rate": 6.17283950617284e-07,
"loss": 0.8129,
"step": 46
},
{
"epoch": 0.0038724561258960204,
"grad_norm": 65.2084871860374,
"learning_rate": 6.310013717421125e-07,
"loss": 0.7338,
"step": 47
},
{
"epoch": 0.003954848809425723,
"grad_norm": 76.89157130197114,
"learning_rate": 6.44718792866941e-07,
"loss": 0.3108,
"step": 48
},
{
"epoch": 0.004037241492955425,
"grad_norm": 73.53979766200762,
"learning_rate": 6.584362139917697e-07,
"loss": 0.508,
"step": 49
},
{
"epoch": 0.004119634176485128,
"grad_norm": 94.68664560631554,
"learning_rate": 6.721536351165982e-07,
"loss": 0.9084,
"step": 50
},
{
"epoch": 0.00420202686001483,
"grad_norm": 146.08418676025838,
"learning_rate": 6.858710562414267e-07,
"loss": 0.8076,
"step": 51
},
{
"epoch": 0.004284419543544533,
"grad_norm": 98.79171062611543,
"learning_rate": 6.995884773662552e-07,
"loss": 0.5388,
"step": 52
},
{
"epoch": 0.004366812227074236,
"grad_norm": 63.9354584989466,
"learning_rate": 7.133058984910837e-07,
"loss": 0.6614,
"step": 53
},
{
"epoch": 0.004449204910603939,
"grad_norm": 28.560826747239517,
"learning_rate": 7.270233196159123e-07,
"loss": 0.221,
"step": 54
},
{
"epoch": 0.004531597594133641,
"grad_norm": 119.28245305633594,
"learning_rate": 7.407407407407407e-07,
"loss": 0.7162,
"step": 55
},
{
"epoch": 0.004613990277663344,
"grad_norm": 106.52974721356492,
"learning_rate": 7.544581618655693e-07,
"loss": 0.7543,
"step": 56
},
{
"epoch": 0.004696382961193046,
"grad_norm": 216.2323272199254,
"learning_rate": 7.681755829903979e-07,
"loss": 0.612,
"step": 57
},
{
"epoch": 0.004778775644722749,
"grad_norm": 81.76961781882962,
"learning_rate": 7.818930041152265e-07,
"loss": 0.9122,
"step": 58
},
{
"epoch": 0.004861168328252451,
"grad_norm": 81.51218859422868,
"learning_rate": 7.956104252400549e-07,
"loss": 0.4165,
"step": 59
},
{
"epoch": 0.004943561011782153,
"grad_norm": 45.85355666847451,
"learning_rate": 8.093278463648835e-07,
"loss": 0.3937,
"step": 60
},
{
"epoch": 0.005025953695311856,
"grad_norm": 113.09646005752293,
"learning_rate": 8.23045267489712e-07,
"loss": 0.7733,
"step": 61
},
{
"epoch": 0.0051083463788415585,
"grad_norm": 42.137339872436065,
"learning_rate": 8.367626886145406e-07,
"loss": 0.5397,
"step": 62
},
{
"epoch": 0.005190739062371262,
"grad_norm": 99.72516559451445,
"learning_rate": 8.50480109739369e-07,
"loss": 0.7653,
"step": 63
},
{
"epoch": 0.005273131745900964,
"grad_norm": 37.868334658532945,
"learning_rate": 8.641975308641976e-07,
"loss": 0.4084,
"step": 64
},
{
"epoch": 0.005355524429430667,
"grad_norm": 253.35802935100432,
"learning_rate": 8.779149519890261e-07,
"loss": 0.6051,
"step": 65
},
{
"epoch": 0.005437917112960369,
"grad_norm": 29.821485219757093,
"learning_rate": 8.916323731138548e-07,
"loss": 0.4946,
"step": 66
},
{
"epoch": 0.005520309796490072,
"grad_norm": 26.807855345732474,
"learning_rate": 9.053497942386832e-07,
"loss": 0.3619,
"step": 67
},
{
"epoch": 0.005602702480019774,
"grad_norm": 45.035127254975706,
"learning_rate": 9.190672153635118e-07,
"loss": 0.5831,
"step": 68
},
{
"epoch": 0.005685095163549477,
"grad_norm": 44.226686731133306,
"learning_rate": 9.327846364883403e-07,
"loss": 0.6431,
"step": 69
},
{
"epoch": 0.005767487847079179,
"grad_norm": 60.33898555308833,
"learning_rate": 9.465020576131687e-07,
"loss": 0.4899,
"step": 70
},
{
"epoch": 0.005849880530608882,
"grad_norm": 35.25027498428163,
"learning_rate": 9.602194787379972e-07,
"loss": 0.3887,
"step": 71
},
{
"epoch": 0.0059322732141385845,
"grad_norm": 73.25174042894214,
"learning_rate": 9.73936899862826e-07,
"loss": 0.6252,
"step": 72
},
{
"epoch": 0.006014665897668287,
"grad_norm": 52.590662239348354,
"learning_rate": 9.876543209876544e-07,
"loss": 0.3872,
"step": 73
},
{
"epoch": 0.00609705858119799,
"grad_norm": 29.726473681600194,
"learning_rate": 1.001371742112483e-06,
"loss": 0.4349,
"step": 74
},
{
"epoch": 0.006179451264727692,
"grad_norm": 26.072142658169017,
"learning_rate": 1.0150891632373115e-06,
"loss": 0.6492,
"step": 75
},
{
"epoch": 0.006261843948257395,
"grad_norm": 38.68492891617437,
"learning_rate": 1.02880658436214e-06,
"loss": 0.8022,
"step": 76
},
{
"epoch": 0.006344236631787097,
"grad_norm": 120.40708886210712,
"learning_rate": 1.0425240054869685e-06,
"loss": 0.5611,
"step": 77
},
{
"epoch": 0.0064266293153168,
"grad_norm": 20.221000748278993,
"learning_rate": 1.0562414266117972e-06,
"loss": 0.2969,
"step": 78
},
{
"epoch": 0.006509021998846502,
"grad_norm": 137.6805973085389,
"learning_rate": 1.0699588477366256e-06,
"loss": 0.648,
"step": 79
},
{
"epoch": 0.006591414682376205,
"grad_norm": 40.1096379523084,
"learning_rate": 1.083676268861454e-06,
"loss": 0.8103,
"step": 80
},
{
"epoch": 0.0066738073659059075,
"grad_norm": 32.096473511201374,
"learning_rate": 1.0973936899862826e-06,
"loss": 0.6186,
"step": 81
},
{
"epoch": 0.0067562000494356105,
"grad_norm": 24.075343816904766,
"learning_rate": 1.111111111111111e-06,
"loss": 0.3877,
"step": 82
},
{
"epoch": 0.006838592732965313,
"grad_norm": 48.42109801664082,
"learning_rate": 1.1248285322359397e-06,
"loss": 0.447,
"step": 83
},
{
"epoch": 0.006920985416495015,
"grad_norm": 42.65831233770232,
"learning_rate": 1.1385459533607684e-06,
"loss": 0.7162,
"step": 84
},
{
"epoch": 0.007003378100024718,
"grad_norm": 71.20273416415172,
"learning_rate": 1.1522633744855969e-06,
"loss": 0.6573,
"step": 85
},
{
"epoch": 0.00708577078355442,
"grad_norm": 70.73981135151499,
"learning_rate": 1.1659807956104253e-06,
"loss": 0.3774,
"step": 86
},
{
"epoch": 0.007168163467084123,
"grad_norm": 17.02358862648308,
"learning_rate": 1.1796982167352538e-06,
"loss": 0.4372,
"step": 87
},
{
"epoch": 0.007250556150613825,
"grad_norm": 38.56110621340388,
"learning_rate": 1.1934156378600823e-06,
"loss": 0.5007,
"step": 88
},
{
"epoch": 0.007332948834143528,
"grad_norm": 21.689880371993823,
"learning_rate": 1.207133058984911e-06,
"loss": 0.4363,
"step": 89
},
{
"epoch": 0.0074153415176732304,
"grad_norm": 53.876169409804625,
"learning_rate": 1.2208504801097394e-06,
"loss": 0.4091,
"step": 90
},
{
"epoch": 0.0074977342012029335,
"grad_norm": 48.147837297588566,
"learning_rate": 1.234567901234568e-06,
"loss": 0.6979,
"step": 91
},
{
"epoch": 0.007580126884732636,
"grad_norm": 20.467449188390766,
"learning_rate": 1.2482853223593966e-06,
"loss": 0.4081,
"step": 92
},
{
"epoch": 0.007662519568262339,
"grad_norm": 23.825819702066855,
"learning_rate": 1.262002743484225e-06,
"loss": 0.5095,
"step": 93
},
{
"epoch": 0.007744912251792041,
"grad_norm": 49.54875914048349,
"learning_rate": 1.2757201646090535e-06,
"loss": 0.8153,
"step": 94
},
{
"epoch": 0.007827304935321744,
"grad_norm": 36.71859670716872,
"learning_rate": 1.289437585733882e-06,
"loss": 0.4975,
"step": 95
},
{
"epoch": 0.007909697618851446,
"grad_norm": 52.89761869922755,
"learning_rate": 1.3031550068587107e-06,
"loss": 0.6777,
"step": 96
},
{
"epoch": 0.007992090302381148,
"grad_norm": 262.046184232095,
"learning_rate": 1.3168724279835394e-06,
"loss": 0.5125,
"step": 97
},
{
"epoch": 0.00807448298591085,
"grad_norm": 23.8518705316023,
"learning_rate": 1.3305898491083676e-06,
"loss": 0.5802,
"step": 98
},
{
"epoch": 0.008156875669440554,
"grad_norm": 24.43774608417277,
"learning_rate": 1.3443072702331963e-06,
"loss": 0.4466,
"step": 99
},
{
"epoch": 0.008239268352970256,
"grad_norm": 27.243336976835526,
"learning_rate": 1.3580246913580248e-06,
"loss": 0.606,
"step": 100
},
{
"epoch": 0.008321661036499959,
"grad_norm": 18.838152665368614,
"learning_rate": 1.3717421124828535e-06,
"loss": 0.4605,
"step": 101
},
{
"epoch": 0.00840405372002966,
"grad_norm": 26.949888572345216,
"learning_rate": 1.3854595336076817e-06,
"loss": 0.4532,
"step": 102
},
{
"epoch": 0.008486446403559365,
"grad_norm": 21.572594872057856,
"learning_rate": 1.3991769547325104e-06,
"loss": 0.4991,
"step": 103
},
{
"epoch": 0.008568839087089067,
"grad_norm": 28.33027763947139,
"learning_rate": 1.412894375857339e-06,
"loss": 0.5669,
"step": 104
},
{
"epoch": 0.008651231770618769,
"grad_norm": 31.09867407487906,
"learning_rate": 1.4266117969821674e-06,
"loss": 0.5158,
"step": 105
},
{
"epoch": 0.008733624454148471,
"grad_norm": 47.502117851757255,
"learning_rate": 1.440329218106996e-06,
"loss": 0.6078,
"step": 106
},
{
"epoch": 0.008816017137678173,
"grad_norm": 30.0902294117928,
"learning_rate": 1.4540466392318245e-06,
"loss": 0.6867,
"step": 107
},
{
"epoch": 0.008898409821207877,
"grad_norm": 16.049485540251304,
"learning_rate": 1.4677640603566532e-06,
"loss": 0.5267,
"step": 108
},
{
"epoch": 0.00898080250473758,
"grad_norm": 30.186256751846674,
"learning_rate": 1.4814814814814815e-06,
"loss": 0.6437,
"step": 109
},
{
"epoch": 0.009063195188267282,
"grad_norm": 23.921754142654017,
"learning_rate": 1.4951989026063101e-06,
"loss": 0.7187,
"step": 110
},
{
"epoch": 0.009145587871796984,
"grad_norm": 40.13689702977842,
"learning_rate": 1.5089163237311386e-06,
"loss": 0.4591,
"step": 111
},
{
"epoch": 0.009227980555326688,
"grad_norm": 21.792212279571824,
"learning_rate": 1.5226337448559673e-06,
"loss": 0.4377,
"step": 112
},
{
"epoch": 0.00931037323885639,
"grad_norm": 12.609083806149128,
"learning_rate": 1.5363511659807958e-06,
"loss": 0.4975,
"step": 113
},
{
"epoch": 0.009392765922386092,
"grad_norm": 19.801853097766696,
"learning_rate": 1.5500685871056242e-06,
"loss": 0.4519,
"step": 114
},
{
"epoch": 0.009475158605915794,
"grad_norm": 44.527628785852514,
"learning_rate": 1.563786008230453e-06,
"loss": 0.5393,
"step": 115
},
{
"epoch": 0.009557551289445498,
"grad_norm": 17.968320630306675,
"learning_rate": 1.5775034293552812e-06,
"loss": 0.6014,
"step": 116
},
{
"epoch": 0.0096399439729752,
"grad_norm": 23.423995548663576,
"learning_rate": 1.5912208504801099e-06,
"loss": 0.4331,
"step": 117
},
{
"epoch": 0.009722336656504902,
"grad_norm": 18.98686296805731,
"learning_rate": 1.6049382716049383e-06,
"loss": 0.5621,
"step": 118
},
{
"epoch": 0.009804729340034605,
"grad_norm": 13.635326129289362,
"learning_rate": 1.618655692729767e-06,
"loss": 0.2893,
"step": 119
},
{
"epoch": 0.009887122023564307,
"grad_norm": 29.502202435441244,
"learning_rate": 1.6323731138545953e-06,
"loss": 0.5988,
"step": 120
},
{
"epoch": 0.00996951470709401,
"grad_norm": 26.759044629536252,
"learning_rate": 1.646090534979424e-06,
"loss": 0.6966,
"step": 121
},
{
"epoch": 0.010051907390623713,
"grad_norm": 16.944673727591262,
"learning_rate": 1.6598079561042526e-06,
"loss": 0.6288,
"step": 122
},
{
"epoch": 0.010134300074153415,
"grad_norm": 22.18252955446083,
"learning_rate": 1.6735253772290811e-06,
"loss": 0.6527,
"step": 123
},
{
"epoch": 0.010216692757683117,
"grad_norm": 14.663608441939818,
"learning_rate": 1.6872427983539098e-06,
"loss": 0.4992,
"step": 124
},
{
"epoch": 0.010299085441212821,
"grad_norm": 27.846664256554586,
"learning_rate": 1.700960219478738e-06,
"loss": 0.6578,
"step": 125
},
{
"epoch": 0.010381478124742523,
"grad_norm": 48.120411539456136,
"learning_rate": 1.7146776406035667e-06,
"loss": 0.7731,
"step": 126
},
{
"epoch": 0.010463870808272225,
"grad_norm": 29.505384191045792,
"learning_rate": 1.7283950617283952e-06,
"loss": 0.4631,
"step": 127
},
{
"epoch": 0.010546263491801927,
"grad_norm": 27.267562026668486,
"learning_rate": 1.7421124828532237e-06,
"loss": 0.7196,
"step": 128
},
{
"epoch": 0.010628656175331631,
"grad_norm": 16.00289092345597,
"learning_rate": 1.7558299039780521e-06,
"loss": 0.5238,
"step": 129
},
{
"epoch": 0.010711048858861334,
"grad_norm": 20.034041777867913,
"learning_rate": 1.7695473251028808e-06,
"loss": 0.48,
"step": 130
},
{
"epoch": 0.010793441542391036,
"grad_norm": 16.125317675567455,
"learning_rate": 1.7832647462277095e-06,
"loss": 0.6135,
"step": 131
},
{
"epoch": 0.010875834225920738,
"grad_norm": 198.72635885269693,
"learning_rate": 1.7969821673525378e-06,
"loss": 1.8359,
"step": 132
},
{
"epoch": 0.01095822690945044,
"grad_norm": 11.49829810544229,
"learning_rate": 1.8106995884773665e-06,
"loss": 0.4496,
"step": 133
},
{
"epoch": 0.011040619592980144,
"grad_norm": 54.65603884199396,
"learning_rate": 1.824417009602195e-06,
"loss": 0.6657,
"step": 134
},
{
"epoch": 0.011123012276509846,
"grad_norm": 23.069821716903398,
"learning_rate": 1.8381344307270236e-06,
"loss": 0.5426,
"step": 135
},
{
"epoch": 0.011205404960039548,
"grad_norm": 13.204812144916009,
"learning_rate": 1.8518518518518519e-06,
"loss": 0.612,
"step": 136
},
{
"epoch": 0.01128779764356925,
"grad_norm": 13.956836795334933,
"learning_rate": 1.8655692729766806e-06,
"loss": 0.532,
"step": 137
},
{
"epoch": 0.011370190327098954,
"grad_norm": 42.68872796386726,
"learning_rate": 1.879286694101509e-06,
"loss": 0.7006,
"step": 138
},
{
"epoch": 0.011452583010628656,
"grad_norm": 16.612308273214413,
"learning_rate": 1.8930041152263375e-06,
"loss": 0.5123,
"step": 139
},
{
"epoch": 0.011534975694158359,
"grad_norm": 18.144654907032912,
"learning_rate": 1.9067215363511662e-06,
"loss": 0.3141,
"step": 140
},
{
"epoch": 0.01161736837768806,
"grad_norm": 16.814344046499077,
"learning_rate": 1.9204389574759944e-06,
"loss": 0.6542,
"step": 141
},
{
"epoch": 0.011699761061217765,
"grad_norm": 21.160095993478766,
"learning_rate": 1.9341563786008233e-06,
"loss": 0.5819,
"step": 142
},
{
"epoch": 0.011782153744747467,
"grad_norm": 21.297656919271585,
"learning_rate": 1.947873799725652e-06,
"loss": 0.5107,
"step": 143
},
{
"epoch": 0.011864546428277169,
"grad_norm": 15.566851005374614,
"learning_rate": 1.9615912208504803e-06,
"loss": 0.6187,
"step": 144
},
{
"epoch": 0.011946939111806871,
"grad_norm": 16.02129799647006,
"learning_rate": 1.9753086419753087e-06,
"loss": 0.4715,
"step": 145
},
{
"epoch": 0.012029331795336573,
"grad_norm": 11.717994264174337,
"learning_rate": 1.9890260631001372e-06,
"loss": 0.4021,
"step": 146
},
{
"epoch": 0.012111724478866277,
"grad_norm": 21.22813881358679,
"learning_rate": 2.002743484224966e-06,
"loss": 0.2599,
"step": 147
},
{
"epoch": 0.01219411716239598,
"grad_norm": 15.200100122381537,
"learning_rate": 2.0164609053497946e-06,
"loss": 0.45,
"step": 148
},
{
"epoch": 0.012276509845925682,
"grad_norm": 24.89750355075059,
"learning_rate": 2.030178326474623e-06,
"loss": 0.772,
"step": 149
},
{
"epoch": 0.012358902529455384,
"grad_norm": 16.304820858469412,
"learning_rate": 2.0438957475994515e-06,
"loss": 0.446,
"step": 150
},
{
"epoch": 0.012441295212985088,
"grad_norm": 20.608928374910505,
"learning_rate": 2.05761316872428e-06,
"loss": 0.5507,
"step": 151
},
{
"epoch": 0.01252368789651479,
"grad_norm": 10.483108607114513,
"learning_rate": 2.0713305898491085e-06,
"loss": 0.4834,
"step": 152
},
{
"epoch": 0.012606080580044492,
"grad_norm": 12.697803561984879,
"learning_rate": 2.085048010973937e-06,
"loss": 0.5253,
"step": 153
},
{
"epoch": 0.012688473263574194,
"grad_norm": 24.461540625272452,
"learning_rate": 2.0987654320987654e-06,
"loss": 0.6982,
"step": 154
},
{
"epoch": 0.012770865947103896,
"grad_norm": 17.323695037238057,
"learning_rate": 2.1124828532235943e-06,
"loss": 0.6608,
"step": 155
},
{
"epoch": 0.0128532586306336,
"grad_norm": 19.1728467069908,
"learning_rate": 2.1262002743484228e-06,
"loss": 0.6158,
"step": 156
},
{
"epoch": 0.012935651314163302,
"grad_norm": 14.335840726971144,
"learning_rate": 2.1399176954732512e-06,
"loss": 0.6844,
"step": 157
},
{
"epoch": 0.013018043997693005,
"grad_norm": 20.095242492343232,
"learning_rate": 2.1536351165980797e-06,
"loss": 0.497,
"step": 158
},
{
"epoch": 0.013100436681222707,
"grad_norm": 10.114501664370549,
"learning_rate": 2.167352537722908e-06,
"loss": 0.5262,
"step": 159
},
{
"epoch": 0.01318282936475241,
"grad_norm": 13.305214604549445,
"learning_rate": 2.1810699588477367e-06,
"loss": 0.5619,
"step": 160
},
{
"epoch": 0.013265222048282113,
"grad_norm": 19.721782800895156,
"learning_rate": 2.194787379972565e-06,
"loss": 0.5357,
"step": 161
},
{
"epoch": 0.013347614731811815,
"grad_norm": 19.7228102937409,
"learning_rate": 2.208504801097394e-06,
"loss": 0.4225,
"step": 162
},
{
"epoch": 0.013430007415341517,
"grad_norm": 240.83778830697852,
"learning_rate": 2.222222222222222e-06,
"loss": 2.4384,
"step": 163
},
{
"epoch": 0.013512400098871221,
"grad_norm": 11.380285250812992,
"learning_rate": 2.235939643347051e-06,
"loss": 0.6533,
"step": 164
},
{
"epoch": 0.013594792782400923,
"grad_norm": 9.94152540099469,
"learning_rate": 2.2496570644718794e-06,
"loss": 0.5497,
"step": 165
},
{
"epoch": 0.013677185465930625,
"grad_norm": 12.090836450223387,
"learning_rate": 2.263374485596708e-06,
"loss": 0.4756,
"step": 166
},
{
"epoch": 0.013759578149460328,
"grad_norm": 14.813308219199923,
"learning_rate": 2.277091906721537e-06,
"loss": 0.5355,
"step": 167
},
{
"epoch": 0.01384197083299003,
"grad_norm": 13.192872206591804,
"learning_rate": 2.290809327846365e-06,
"loss": 0.5875,
"step": 168
},
{
"epoch": 0.013924363516519734,
"grad_norm": 16.210695640291387,
"learning_rate": 2.3045267489711937e-06,
"loss": 0.5253,
"step": 169
},
{
"epoch": 0.014006756200049436,
"grad_norm": 12.039792190252744,
"learning_rate": 2.3182441700960222e-06,
"loss": 0.4517,
"step": 170
},
{
"epoch": 0.014089148883579138,
"grad_norm": 23.04062666474093,
"learning_rate": 2.3319615912208507e-06,
"loss": 0.4083,
"step": 171
},
{
"epoch": 0.01417154156710884,
"grad_norm": 19.979153089914988,
"learning_rate": 2.345679012345679e-06,
"loss": 0.6887,
"step": 172
},
{
"epoch": 0.014253934250638544,
"grad_norm": 21.895537557735427,
"learning_rate": 2.3593964334705076e-06,
"loss": 0.8094,
"step": 173
},
{
"epoch": 0.014336326934168246,
"grad_norm": 31.47401830070671,
"learning_rate": 2.3731138545953365e-06,
"loss": 0.7431,
"step": 174
},
{
"epoch": 0.014418719617697948,
"grad_norm": 12.750465460746202,
"learning_rate": 2.3868312757201646e-06,
"loss": 0.6583,
"step": 175
},
{
"epoch": 0.01450111230122765,
"grad_norm": 13.307184351874149,
"learning_rate": 2.4005486968449935e-06,
"loss": 0.6077,
"step": 176
},
{
"epoch": 0.014583504984757354,
"grad_norm": 10.435374769314452,
"learning_rate": 2.414266117969822e-06,
"loss": 0.5739,
"step": 177
},
{
"epoch": 0.014665897668287057,
"grad_norm": 15.566819292000186,
"learning_rate": 2.4279835390946504e-06,
"loss": 0.644,
"step": 178
},
{
"epoch": 0.014748290351816759,
"grad_norm": 12.814513858300232,
"learning_rate": 2.441700960219479e-06,
"loss": 0.5858,
"step": 179
},
{
"epoch": 0.014830683035346461,
"grad_norm": 12.12622273494356,
"learning_rate": 2.4554183813443074e-06,
"loss": 0.5202,
"step": 180
},
{
"epoch": 0.014913075718876163,
"grad_norm": 16.96998648395035,
"learning_rate": 2.469135802469136e-06,
"loss": 0.3457,
"step": 181
},
{
"epoch": 0.014995468402405867,
"grad_norm": 13.91986946254961,
"learning_rate": 2.4828532235939647e-06,
"loss": 0.5681,
"step": 182
},
{
"epoch": 0.015077861085935569,
"grad_norm": 12.486810040618805,
"learning_rate": 2.496570644718793e-06,
"loss": 0.5125,
"step": 183
},
{
"epoch": 0.015160253769465271,
"grad_norm": 10.303008103171251,
"learning_rate": 2.5102880658436217e-06,
"loss": 0.6385,
"step": 184
},
{
"epoch": 0.015242646452994973,
"grad_norm": 13.183010022460554,
"learning_rate": 2.52400548696845e-06,
"loss": 0.3552,
"step": 185
},
{
"epoch": 0.015325039136524677,
"grad_norm": 10.107898578134508,
"learning_rate": 2.5377229080932786e-06,
"loss": 0.4341,
"step": 186
},
{
"epoch": 0.01540743182005438,
"grad_norm": 8.570843302612268,
"learning_rate": 2.551440329218107e-06,
"loss": 0.4343,
"step": 187
},
{
"epoch": 0.015489824503584082,
"grad_norm": 17.3196847847868,
"learning_rate": 2.565157750342936e-06,
"loss": 0.6971,
"step": 188
},
{
"epoch": 0.015572217187113784,
"grad_norm": 11.86766768913693,
"learning_rate": 2.578875171467764e-06,
"loss": 0.5436,
"step": 189
},
{
"epoch": 0.015654609870643488,
"grad_norm": 10.49550664029216,
"learning_rate": 2.5925925925925925e-06,
"loss": 0.342,
"step": 190
},
{
"epoch": 0.01573700255417319,
"grad_norm": 9.038437970250417,
"learning_rate": 2.6063100137174214e-06,
"loss": 0.3151,
"step": 191
},
{
"epoch": 0.015819395237702892,
"grad_norm": 15.678199292955869,
"learning_rate": 2.62002743484225e-06,
"loss": 0.7268,
"step": 192
},
{
"epoch": 0.015901787921232594,
"grad_norm": 13.800404804526247,
"learning_rate": 2.6337448559670788e-06,
"loss": 0.5118,
"step": 193
},
{
"epoch": 0.015984180604762296,
"grad_norm": 74.10559559217063,
"learning_rate": 2.647462277091907e-06,
"loss": 0.7444,
"step": 194
},
{
"epoch": 0.016066573288292,
"grad_norm": 12.20315952893777,
"learning_rate": 2.6611796982167353e-06,
"loss": 0.4277,
"step": 195
},
{
"epoch": 0.0161489659718217,
"grad_norm": 10.05719320789487,
"learning_rate": 2.674897119341564e-06,
"loss": 0.4664,
"step": 196
},
{
"epoch": 0.016231358655351406,
"grad_norm": 42.082856786319546,
"learning_rate": 2.6886145404663926e-06,
"loss": 0.3969,
"step": 197
},
{
"epoch": 0.01631375133888111,
"grad_norm": 15.787631693690875,
"learning_rate": 2.7023319615912207e-06,
"loss": 0.7307,
"step": 198
},
{
"epoch": 0.01639614402241081,
"grad_norm": 8.901740684680457,
"learning_rate": 2.7160493827160496e-06,
"loss": 0.5109,
"step": 199
},
{
"epoch": 0.016478536705940513,
"grad_norm": 28.934834071007202,
"learning_rate": 2.729766803840878e-06,
"loss": 0.4942,
"step": 200
},
{
"epoch": 0.016560929389470215,
"grad_norm": 18.793354020178867,
"learning_rate": 2.743484224965707e-06,
"loss": 0.5592,
"step": 201
},
{
"epoch": 0.016643322072999917,
"grad_norm": 13.60338783501572,
"learning_rate": 2.7572016460905354e-06,
"loss": 0.6025,
"step": 202
},
{
"epoch": 0.01672571475652962,
"grad_norm": 8.038968073425716,
"learning_rate": 2.7709190672153635e-06,
"loss": 0.5211,
"step": 203
},
{
"epoch": 0.01680810744005932,
"grad_norm": 11.559001618222288,
"learning_rate": 2.7846364883401924e-06,
"loss": 0.5185,
"step": 204
},
{
"epoch": 0.016890500123589024,
"grad_norm": 10.70606495183075,
"learning_rate": 2.798353909465021e-06,
"loss": 0.5378,
"step": 205
},
{
"epoch": 0.01697289280711873,
"grad_norm": 15.724659491801045,
"learning_rate": 2.8120713305898493e-06,
"loss": 0.3996,
"step": 206
},
{
"epoch": 0.01705528549064843,
"grad_norm": 15.632077558092512,
"learning_rate": 2.825788751714678e-06,
"loss": 0.5294,
"step": 207
},
{
"epoch": 0.017137678174178134,
"grad_norm": 15.35567010238041,
"learning_rate": 2.8395061728395062e-06,
"loss": 0.5789,
"step": 208
},
{
"epoch": 0.017220070857707836,
"grad_norm": 12.247079248152177,
"learning_rate": 2.8532235939643347e-06,
"loss": 0.4783,
"step": 209
},
{
"epoch": 0.017302463541237538,
"grad_norm": 13.787412538148317,
"learning_rate": 2.8669410150891636e-06,
"loss": 0.6358,
"step": 210
},
{
"epoch": 0.01738485622476724,
"grad_norm": 10.388866874954653,
"learning_rate": 2.880658436213992e-06,
"loss": 0.4617,
"step": 211
},
{
"epoch": 0.017467248908296942,
"grad_norm": 10.149440548768066,
"learning_rate": 2.89437585733882e-06,
"loss": 0.372,
"step": 212
},
{
"epoch": 0.017549641591826644,
"grad_norm": 12.782054026030952,
"learning_rate": 2.908093278463649e-06,
"loss": 0.5502,
"step": 213
},
{
"epoch": 0.017632034275356347,
"grad_norm": 8.980692409189274,
"learning_rate": 2.9218106995884775e-06,
"loss": 0.5311,
"step": 214
},
{
"epoch": 0.017714426958886052,
"grad_norm": 12.126638458623237,
"learning_rate": 2.9355281207133064e-06,
"loss": 0.6599,
"step": 215
},
{
"epoch": 0.017796819642415754,
"grad_norm": 10.503433095750024,
"learning_rate": 2.949245541838135e-06,
"loss": 0.4327,
"step": 216
},
{
"epoch": 0.017879212325945457,
"grad_norm": 12.219841823090144,
"learning_rate": 2.962962962962963e-06,
"loss": 0.7044,
"step": 217
},
{
"epoch": 0.01796160500947516,
"grad_norm": 18.87320464359166,
"learning_rate": 2.976680384087792e-06,
"loss": 0.7467,
"step": 218
},
{
"epoch": 0.01804399769300486,
"grad_norm": 289.359254982659,
"learning_rate": 2.9903978052126203e-06,
"loss": 3.0217,
"step": 219
},
{
"epoch": 0.018126390376534563,
"grad_norm": 7.733672679042532,
"learning_rate": 3.004115226337449e-06,
"loss": 0.4293,
"step": 220
},
{
"epoch": 0.018208783060064265,
"grad_norm": 16.61269730251294,
"learning_rate": 3.0178326474622772e-06,
"loss": 0.6614,
"step": 221
},
{
"epoch": 0.018291175743593967,
"grad_norm": 8.31112554516155,
"learning_rate": 3.0315500685871057e-06,
"loss": 0.5271,
"step": 222
},
{
"epoch": 0.018373568427123673,
"grad_norm": 13.664445288630535,
"learning_rate": 3.0452674897119346e-06,
"loss": 0.7018,
"step": 223
},
{
"epoch": 0.018455961110653375,
"grad_norm": 10.005927544238816,
"learning_rate": 3.058984910836763e-06,
"loss": 0.5524,
"step": 224
},
{
"epoch": 0.018538353794183077,
"grad_norm": 15.446861208215383,
"learning_rate": 3.0727023319615915e-06,
"loss": 0.6341,
"step": 225
},
{
"epoch": 0.01862074647771278,
"grad_norm": 16.079846485759564,
"learning_rate": 3.08641975308642e-06,
"loss": 0.8794,
"step": 226
},
{
"epoch": 0.018703139161242482,
"grad_norm": 10.175892407022696,
"learning_rate": 3.1001371742112485e-06,
"loss": 0.6755,
"step": 227
},
{
"epoch": 0.018785531844772184,
"grad_norm": 56.127455072454026,
"learning_rate": 3.113854595336077e-06,
"loss": 0.3321,
"step": 228
},
{
"epoch": 0.018867924528301886,
"grad_norm": 12.930244631445957,
"learning_rate": 3.127572016460906e-06,
"loss": 0.4804,
"step": 229
},
{
"epoch": 0.018950317211831588,
"grad_norm": 12.529583269551953,
"learning_rate": 3.141289437585734e-06,
"loss": 0.5312,
"step": 230
},
{
"epoch": 0.01903270989536129,
"grad_norm": 9.78335819090374,
"learning_rate": 3.1550068587105624e-06,
"loss": 0.6044,
"step": 231
},
{
"epoch": 0.019115102578890996,
"grad_norm": 9.981952585751747,
"learning_rate": 3.1687242798353912e-06,
"loss": 0.7049,
"step": 232
},
{
"epoch": 0.019197495262420698,
"grad_norm": 17.065859536580135,
"learning_rate": 3.1824417009602197e-06,
"loss": 0.6509,
"step": 233
},
{
"epoch": 0.0192798879459504,
"grad_norm": 10.93465261939953,
"learning_rate": 3.1961591220850486e-06,
"loss": 0.7744,
"step": 234
},
{
"epoch": 0.019362280629480103,
"grad_norm": 9.705094089624701,
"learning_rate": 3.2098765432098767e-06,
"loss": 0.6289,
"step": 235
},
{
"epoch": 0.019444673313009805,
"grad_norm": 11.03377155515954,
"learning_rate": 3.223593964334705e-06,
"loss": 0.4788,
"step": 236
},
{
"epoch": 0.019527065996539507,
"grad_norm": 9.129123781076657,
"learning_rate": 3.237311385459534e-06,
"loss": 0.5489,
"step": 237
},
{
"epoch": 0.01960945868006921,
"grad_norm": 8.697937237915472,
"learning_rate": 3.2510288065843625e-06,
"loss": 0.5861,
"step": 238
},
{
"epoch": 0.01969185136359891,
"grad_norm": 8.870677568511018,
"learning_rate": 3.2647462277091905e-06,
"loss": 0.5946,
"step": 239
},
{
"epoch": 0.019774244047128613,
"grad_norm": 23.189212971761012,
"learning_rate": 3.2784636488340194e-06,
"loss": 0.4848,
"step": 240
},
{
"epoch": 0.01985663673065832,
"grad_norm": 8.803471237780029,
"learning_rate": 3.292181069958848e-06,
"loss": 0.3981,
"step": 241
},
{
"epoch": 0.01993902941418802,
"grad_norm": 9.276823579497725,
"learning_rate": 3.305898491083677e-06,
"loss": 0.3854,
"step": 242
},
{
"epoch": 0.020021422097717723,
"grad_norm": 15.048560056515383,
"learning_rate": 3.3196159122085053e-06,
"loss": 0.737,
"step": 243
},
{
"epoch": 0.020103814781247425,
"grad_norm": 8.848106416589038,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.2963,
"step": 244
},
{
"epoch": 0.020186207464777128,
"grad_norm": 7.313536463258056,
"learning_rate": 3.3470507544581622e-06,
"loss": 0.2768,
"step": 245
},
{
"epoch": 0.02026860014830683,
"grad_norm": 12.219856537027805,
"learning_rate": 3.3607681755829907e-06,
"loss": 0.4306,
"step": 246
},
{
"epoch": 0.020350992831836532,
"grad_norm": 10.00453954068384,
"learning_rate": 3.3744855967078196e-06,
"loss": 0.2573,
"step": 247
},
{
"epoch": 0.020433385515366234,
"grad_norm": 16.88134336345359,
"learning_rate": 3.3882030178326476e-06,
"loss": 0.2629,
"step": 248
},
{
"epoch": 0.020515778198895936,
"grad_norm": 12.88958428233626,
"learning_rate": 3.401920438957476e-06,
"loss": 0.4504,
"step": 249
},
{
"epoch": 0.020598170882425642,
"grad_norm": 19.916078497234878,
"learning_rate": 3.415637860082305e-06,
"loss": 0.7189,
"step": 250
},
{
"epoch": 0.020680563565955344,
"grad_norm": 12.379454068967135,
"learning_rate": 3.4293552812071335e-06,
"loss": 0.5047,
"step": 251
},
{
"epoch": 0.020762956249485046,
"grad_norm": 7.363299478552216,
"learning_rate": 3.443072702331962e-06,
"loss": 0.308,
"step": 252
},
{
"epoch": 0.02084534893301475,
"grad_norm": 14.9221134616295,
"learning_rate": 3.4567901234567904e-06,
"loss": 0.5358,
"step": 253
},
{
"epoch": 0.02092774161654445,
"grad_norm": 75.3026629510051,
"learning_rate": 3.470507544581619e-06,
"loss": 1.3456,
"step": 254
},
{
"epoch": 0.021010134300074153,
"grad_norm": 13.144224277254176,
"learning_rate": 3.4842249657064474e-06,
"loss": 0.7256,
"step": 255
},
{
"epoch": 0.021092526983603855,
"grad_norm": 8.29902926659797,
"learning_rate": 3.4979423868312762e-06,
"loss": 0.479,
"step": 256
},
{
"epoch": 0.021174919667133557,
"grad_norm": 36.142615394611894,
"learning_rate": 3.5116598079561043e-06,
"loss": 0.5905,
"step": 257
},
{
"epoch": 0.021257312350663263,
"grad_norm": 18.894771750217856,
"learning_rate": 3.5253772290809328e-06,
"loss": 0.5372,
"step": 258
},
{
"epoch": 0.021339705034192965,
"grad_norm": 8.876306670885448,
"learning_rate": 3.5390946502057617e-06,
"loss": 0.507,
"step": 259
},
{
"epoch": 0.021422097717722667,
"grad_norm": 12.322148718207554,
"learning_rate": 3.55281207133059e-06,
"loss": 0.5313,
"step": 260
},
{
"epoch": 0.02150449040125237,
"grad_norm": 9.429328982690008,
"learning_rate": 3.566529492455419e-06,
"loss": 0.6338,
"step": 261
},
{
"epoch": 0.02158688308478207,
"grad_norm": 8.439579932933407,
"learning_rate": 3.580246913580247e-06,
"loss": 0.64,
"step": 262
},
{
"epoch": 0.021669275768311774,
"grad_norm": 7.027927606341166,
"learning_rate": 3.5939643347050755e-06,
"loss": 0.694,
"step": 263
},
{
"epoch": 0.021751668451841476,
"grad_norm": 18.461422860766792,
"learning_rate": 3.6076817558299044e-06,
"loss": 0.8644,
"step": 264
},
{
"epoch": 0.021834061135371178,
"grad_norm": 7.312928184572379,
"learning_rate": 3.621399176954733e-06,
"loss": 0.5441,
"step": 265
},
{
"epoch": 0.02191645381890088,
"grad_norm": 10.896094086625412,
"learning_rate": 3.635116598079561e-06,
"loss": 0.3631,
"step": 266
},
{
"epoch": 0.021998846502430586,
"grad_norm": 12.46612913960122,
"learning_rate": 3.64883401920439e-06,
"loss": 0.6946,
"step": 267
},
{
"epoch": 0.022081239185960288,
"grad_norm": 8.87385816825834,
"learning_rate": 3.6625514403292183e-06,
"loss": 0.6441,
"step": 268
},
{
"epoch": 0.02216363186948999,
"grad_norm": 7.66230536481842,
"learning_rate": 3.6762688614540472e-06,
"loss": 0.4107,
"step": 269
},
{
"epoch": 0.022246024553019692,
"grad_norm": 20.729872546438557,
"learning_rate": 3.6899862825788757e-06,
"loss": 0.575,
"step": 270
},
{
"epoch": 0.022328417236549394,
"grad_norm": 8.579634165930928,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.2851,
"step": 271
},
{
"epoch": 0.022410809920079097,
"grad_norm": 8.32603360510797,
"learning_rate": 3.7174211248285326e-06,
"loss": 0.549,
"step": 272
},
{
"epoch": 0.0224932026036088,
"grad_norm": 7.119710330257647,
"learning_rate": 3.731138545953361e-06,
"loss": 0.6911,
"step": 273
},
{
"epoch": 0.0225755952871385,
"grad_norm": 12.334011335320461,
"learning_rate": 3.7448559670781896e-06,
"loss": 0.2933,
"step": 274
},
{
"epoch": 0.022657987970668203,
"grad_norm": 8.819773878214544,
"learning_rate": 3.758573388203018e-06,
"loss": 0.5093,
"step": 275
},
{
"epoch": 0.02274038065419791,
"grad_norm": 7.853520271881538,
"learning_rate": 3.7722908093278465e-06,
"loss": 0.4793,
"step": 276
},
{
"epoch": 0.02282277333772761,
"grad_norm": 12.938012573178968,
"learning_rate": 3.786008230452675e-06,
"loss": 0.7915,
"step": 277
},
{
"epoch": 0.022905166021257313,
"grad_norm": 7.351505350233861,
"learning_rate": 3.799725651577504e-06,
"loss": 0.4733,
"step": 278
},
{
"epoch": 0.022987558704787015,
"grad_norm": 7.3673534060195855,
"learning_rate": 3.8134430727023324e-06,
"loss": 0.41,
"step": 279
},
{
"epoch": 0.023069951388316717,
"grad_norm": 9.289270184893226,
"learning_rate": 3.827160493827161e-06,
"loss": 0.4778,
"step": 280
},
{
"epoch": 0.02315234407184642,
"grad_norm": 6.28612952057349,
"learning_rate": 3.840877914951989e-06,
"loss": 0.406,
"step": 281
},
{
"epoch": 0.02323473675537612,
"grad_norm": 5.946903311816241,
"learning_rate": 3.854595336076818e-06,
"loss": 0.4755,
"step": 282
},
{
"epoch": 0.023317129438905824,
"grad_norm": 6.834284548380493,
"learning_rate": 3.868312757201647e-06,
"loss": 0.5116,
"step": 283
},
{
"epoch": 0.02339952212243553,
"grad_norm": 11.774766376537002,
"learning_rate": 3.882030178326475e-06,
"loss": 0.3654,
"step": 284
},
{
"epoch": 0.02348191480596523,
"grad_norm": 10.79007061340619,
"learning_rate": 3.895747599451304e-06,
"loss": 0.3865,
"step": 285
},
{
"epoch": 0.023564307489494934,
"grad_norm": 8.26845323051975,
"learning_rate": 3.909465020576132e-06,
"loss": 0.6248,
"step": 286
},
{
"epoch": 0.023646700173024636,
"grad_norm": 49.81368555681276,
"learning_rate": 3.9231824417009605e-06,
"loss": 0.6136,
"step": 287
},
{
"epoch": 0.023729092856554338,
"grad_norm": 13.035734882105107,
"learning_rate": 3.9368998628257894e-06,
"loss": 0.7522,
"step": 288
},
{
"epoch": 0.02381148554008404,
"grad_norm": 10.878498977499254,
"learning_rate": 3.9506172839506175e-06,
"loss": 0.6219,
"step": 289
},
{
"epoch": 0.023893878223613742,
"grad_norm": 11.982100652311225,
"learning_rate": 3.964334705075446e-06,
"loss": 0.6203,
"step": 290
},
{
"epoch": 0.023976270907143445,
"grad_norm": 9.985093354905656,
"learning_rate": 3.9780521262002744e-06,
"loss": 0.6091,
"step": 291
},
{
"epoch": 0.024058663590673147,
"grad_norm": 10.79524941182704,
"learning_rate": 3.991769547325103e-06,
"loss": 0.511,
"step": 292
},
{
"epoch": 0.024141056274202852,
"grad_norm": 8.306861975711865,
"learning_rate": 4.005486968449932e-06,
"loss": 0.6458,
"step": 293
},
{
"epoch": 0.024223448957732555,
"grad_norm": 8.445514735869802,
"learning_rate": 4.01920438957476e-06,
"loss": 0.4246,
"step": 294
},
{
"epoch": 0.024305841641262257,
"grad_norm": 10.946877499366206,
"learning_rate": 4.032921810699589e-06,
"loss": 0.4455,
"step": 295
},
{
"epoch": 0.02438823432479196,
"grad_norm": 9.95334826152276,
"learning_rate": 4.046639231824417e-06,
"loss": 0.7388,
"step": 296
},
{
"epoch": 0.02447062700832166,
"grad_norm": 6.182291262554031,
"learning_rate": 4.060356652949246e-06,
"loss": 0.255,
"step": 297
},
{
"epoch": 0.024553019691851363,
"grad_norm": 6.230437793654582,
"learning_rate": 4.074074074074074e-06,
"loss": 0.317,
"step": 298
},
{
"epoch": 0.024635412375381065,
"grad_norm": 37.925863944916884,
"learning_rate": 4.087791495198903e-06,
"loss": 0.6706,
"step": 299
},
{
"epoch": 0.024717805058910768,
"grad_norm": 9.275615694704983,
"learning_rate": 4.101508916323731e-06,
"loss": 0.7037,
"step": 300
},
{
"epoch": 0.02480019774244047,
"grad_norm": 8.011699796447031,
"learning_rate": 4.11522633744856e-06,
"loss": 0.5876,
"step": 301
},
{
"epoch": 0.024882590425970175,
"grad_norm": 6.150375331005549,
"learning_rate": 4.128943758573389e-06,
"loss": 0.5513,
"step": 302
},
{
"epoch": 0.024964983109499878,
"grad_norm": 9.66910213632119,
"learning_rate": 4.142661179698217e-06,
"loss": 0.7654,
"step": 303
},
{
"epoch": 0.02504737579302958,
"grad_norm": 7.492282251093323,
"learning_rate": 4.156378600823046e-06,
"loss": 0.5467,
"step": 304
},
{
"epoch": 0.025129768476559282,
"grad_norm": 8.84736509671235,
"learning_rate": 4.170096021947874e-06,
"loss": 0.6802,
"step": 305
},
{
"epoch": 0.025212161160088984,
"grad_norm": 11.147679298984809,
"learning_rate": 4.183813443072703e-06,
"loss": 0.3183,
"step": 306
},
{
"epoch": 0.025294553843618686,
"grad_norm": 6.052642841781604,
"learning_rate": 4.197530864197531e-06,
"loss": 0.2843,
"step": 307
},
{
"epoch": 0.02537694652714839,
"grad_norm": 5.853639145291225,
"learning_rate": 4.21124828532236e-06,
"loss": 0.6171,
"step": 308
},
{
"epoch": 0.02545933921067809,
"grad_norm": 7.664697260789782,
"learning_rate": 4.224965706447189e-06,
"loss": 0.5526,
"step": 309
},
{
"epoch": 0.025541731894207793,
"grad_norm": 8.309328255797821,
"learning_rate": 4.238683127572017e-06,
"loss": 0.3675,
"step": 310
},
{
"epoch": 0.0256241245777375,
"grad_norm": 6.048028087236039,
"learning_rate": 4.2524005486968456e-06,
"loss": 0.3439,
"step": 311
},
{
"epoch": 0.0257065172612672,
"grad_norm": 7.684783842069482,
"learning_rate": 4.266117969821674e-06,
"loss": 0.671,
"step": 312
},
{
"epoch": 0.025788909944796903,
"grad_norm": 14.534992720713337,
"learning_rate": 4.2798353909465025e-06,
"loss": 0.7617,
"step": 313
},
{
"epoch": 0.025871302628326605,
"grad_norm": 8.615671182896788,
"learning_rate": 4.293552812071331e-06,
"loss": 0.6877,
"step": 314
},
{
"epoch": 0.025953695311856307,
"grad_norm": 7.575816304061312,
"learning_rate": 4.3072702331961594e-06,
"loss": 0.6431,
"step": 315
},
{
"epoch": 0.02603608799538601,
"grad_norm": 8.768345528296296,
"learning_rate": 4.3209876543209875e-06,
"loss": 0.4367,
"step": 316
},
{
"epoch": 0.02611848067891571,
"grad_norm": 7.718681777042332,
"learning_rate": 4.334705075445816e-06,
"loss": 0.3518,
"step": 317
},
{
"epoch": 0.026200873362445413,
"grad_norm": 15.692592743928309,
"learning_rate": 4.348422496570645e-06,
"loss": 0.8163,
"step": 318
},
{
"epoch": 0.02628326604597512,
"grad_norm": 8.46510288305704,
"learning_rate": 4.362139917695473e-06,
"loss": 0.6321,
"step": 319
},
{
"epoch": 0.02636565872950482,
"grad_norm": 6.317038560782752,
"learning_rate": 4.375857338820302e-06,
"loss": 0.4448,
"step": 320
},
{
"epoch": 0.026448051413034523,
"grad_norm": 17.22722159022544,
"learning_rate": 4.38957475994513e-06,
"loss": 0.6319,
"step": 321
},
{
"epoch": 0.026530444096564226,
"grad_norm": 35.34200236087575,
"learning_rate": 4.403292181069959e-06,
"loss": 0.6193,
"step": 322
},
{
"epoch": 0.026612836780093928,
"grad_norm": 11.559769027360641,
"learning_rate": 4.417009602194788e-06,
"loss": 0.6904,
"step": 323
},
{
"epoch": 0.02669522946362363,
"grad_norm": 7.249712253160824,
"learning_rate": 4.430727023319616e-06,
"loss": 0.4605,
"step": 324
},
{
"epoch": 0.026777622147153332,
"grad_norm": 5.9808851831275,
"learning_rate": 4.444444444444444e-06,
"loss": 0.3127,
"step": 325
},
{
"epoch": 0.026860014830683034,
"grad_norm": 6.4015006508429995,
"learning_rate": 4.458161865569273e-06,
"loss": 0.6494,
"step": 326
},
{
"epoch": 0.026942407514212736,
"grad_norm": 7.388398333528457,
"learning_rate": 4.471879286694102e-06,
"loss": 0.4854,
"step": 327
},
{
"epoch": 0.027024800197742442,
"grad_norm": 5.459089110209384,
"learning_rate": 4.485596707818931e-06,
"loss": 0.463,
"step": 328
},
{
"epoch": 0.027107192881272144,
"grad_norm": 7.377223891634756,
"learning_rate": 4.499314128943759e-06,
"loss": 0.6919,
"step": 329
},
{
"epoch": 0.027189585564801846,
"grad_norm": 6.3768264626554805,
"learning_rate": 4.513031550068587e-06,
"loss": 0.6361,
"step": 330
},
{
"epoch": 0.02727197824833155,
"grad_norm": 6.99781725288093,
"learning_rate": 4.526748971193416e-06,
"loss": 0.6743,
"step": 331
},
{
"epoch": 0.02735437093186125,
"grad_norm": 5.9726845052369075,
"learning_rate": 4.540466392318245e-06,
"loss": 0.3525,
"step": 332
},
{
"epoch": 0.027436763615390953,
"grad_norm": 4.715507305833575,
"learning_rate": 4.554183813443074e-06,
"loss": 0.365,
"step": 333
},
{
"epoch": 0.027519156298920655,
"grad_norm": 5.786778668319323,
"learning_rate": 4.567901234567902e-06,
"loss": 0.3823,
"step": 334
},
{
"epoch": 0.027601548982450357,
"grad_norm": 7.779154962035555,
"learning_rate": 4.58161865569273e-06,
"loss": 0.5933,
"step": 335
},
{
"epoch": 0.02768394166598006,
"grad_norm": 12.355416669442642,
"learning_rate": 4.595336076817559e-06,
"loss": 0.6039,
"step": 336
},
{
"epoch": 0.027766334349509765,
"grad_norm": 5.078241648282684,
"learning_rate": 4.6090534979423875e-06,
"loss": 0.3775,
"step": 337
},
{
"epoch": 0.027848727033039467,
"grad_norm": 5.863453886765012,
"learning_rate": 4.622770919067216e-06,
"loss": 0.2638,
"step": 338
},
{
"epoch": 0.02793111971656917,
"grad_norm": 12.580103906062597,
"learning_rate": 4.6364883401920444e-06,
"loss": 0.7607,
"step": 339
},
{
"epoch": 0.02801351240009887,
"grad_norm": 6.6784963620823525,
"learning_rate": 4.6502057613168725e-06,
"loss": 0.4435,
"step": 340
},
{
"epoch": 0.028095905083628574,
"grad_norm": 7.038657857694269,
"learning_rate": 4.663923182441701e-06,
"loss": 0.6857,
"step": 341
},
{
"epoch": 0.028178297767158276,
"grad_norm": 14.379372920193825,
"learning_rate": 4.67764060356653e-06,
"loss": 0.8087,
"step": 342
},
{
"epoch": 0.028260690450687978,
"grad_norm": 9.837898915303215,
"learning_rate": 4.691358024691358e-06,
"loss": 0.5434,
"step": 343
},
{
"epoch": 0.02834308313421768,
"grad_norm": 6.687023299218438,
"learning_rate": 4.705075445816187e-06,
"loss": 0.3655,
"step": 344
},
{
"epoch": 0.028425475817747382,
"grad_norm": 5.726375583817776,
"learning_rate": 4.718792866941015e-06,
"loss": 0.5816,
"step": 345
},
{
"epoch": 0.028507868501277088,
"grad_norm": 6.8852392247210945,
"learning_rate": 4.732510288065844e-06,
"loss": 0.5028,
"step": 346
},
{
"epoch": 0.02859026118480679,
"grad_norm": 5.798984025320741,
"learning_rate": 4.746227709190673e-06,
"loss": 0.5701,
"step": 347
},
{
"epoch": 0.028672653868336492,
"grad_norm": 10.24662425737303,
"learning_rate": 4.759945130315501e-06,
"loss": 0.6865,
"step": 348
},
{
"epoch": 0.028755046551866194,
"grad_norm": 6.089865434146081,
"learning_rate": 4.773662551440329e-06,
"loss": 0.6887,
"step": 349
},
{
"epoch": 0.028837439235395897,
"grad_norm": 6.365563959115913,
"learning_rate": 4.787379972565158e-06,
"loss": 0.6076,
"step": 350
},
{
"epoch": 0.0289198319189256,
"grad_norm": 5.916203223471868,
"learning_rate": 4.801097393689987e-06,
"loss": 0.6597,
"step": 351
},
{
"epoch": 0.0290022246024553,
"grad_norm": 4.599031509093365,
"learning_rate": 4.814814814814815e-06,
"loss": 0.395,
"step": 352
},
{
"epoch": 0.029084617285985003,
"grad_norm": 6.6869153491637485,
"learning_rate": 4.828532235939644e-06,
"loss": 0.548,
"step": 353
},
{
"epoch": 0.02916700996951471,
"grad_norm": 4.333465973974785,
"learning_rate": 4.842249657064472e-06,
"loss": 0.5945,
"step": 354
},
{
"epoch": 0.02924940265304441,
"grad_norm": 6.808396103175679,
"learning_rate": 4.855967078189301e-06,
"loss": 0.5964,
"step": 355
},
{
"epoch": 0.029331795336574113,
"grad_norm": 9.755100603471288,
"learning_rate": 4.86968449931413e-06,
"loss": 0.6287,
"step": 356
},
{
"epoch": 0.029414188020103815,
"grad_norm": 5.80905261329336,
"learning_rate": 4.883401920438958e-06,
"loss": 0.4366,
"step": 357
},
{
"epoch": 0.029496580703633517,
"grad_norm": 5.5904604000702545,
"learning_rate": 4.897119341563787e-06,
"loss": 0.5093,
"step": 358
},
{
"epoch": 0.02957897338716322,
"grad_norm": 7.090237952729793,
"learning_rate": 4.910836762688615e-06,
"loss": 0.4749,
"step": 359
},
{
"epoch": 0.029661366070692922,
"grad_norm": 6.9638534563545695,
"learning_rate": 4.924554183813444e-06,
"loss": 0.6429,
"step": 360
},
{
"epoch": 0.029743758754222624,
"grad_norm": 4.026899519732204,
"learning_rate": 4.938271604938272e-06,
"loss": 0.4878,
"step": 361
},
{
"epoch": 0.029826151437752326,
"grad_norm": 7.3582470565677065,
"learning_rate": 4.9519890260631005e-06,
"loss": 0.5546,
"step": 362
},
{
"epoch": 0.029908544121282032,
"grad_norm": 5.356724068201691,
"learning_rate": 4.9657064471879294e-06,
"loss": 0.345,
"step": 363
},
{
"epoch": 0.029990936804811734,
"grad_norm": 5.364214773411196,
"learning_rate": 4.9794238683127575e-06,
"loss": 0.3246,
"step": 364
},
{
"epoch": 0.030073329488341436,
"grad_norm": 8.332851292756342,
"learning_rate": 4.993141289437586e-06,
"loss": 0.6733,
"step": 365
},
{
"epoch": 0.030155722171871138,
"grad_norm": 9.765581947528334,
"learning_rate": 5.0068587105624144e-06,
"loss": 0.4704,
"step": 366
},
{
"epoch": 0.03023811485540084,
"grad_norm": 5.498161157247211,
"learning_rate": 5.020576131687243e-06,
"loss": 0.2827,
"step": 367
},
{
"epoch": 0.030320507538930543,
"grad_norm": 43.68184948115611,
"learning_rate": 5.034293552812071e-06,
"loss": 0.3449,
"step": 368
},
{
"epoch": 0.030402900222460245,
"grad_norm": 6.219439174212411,
"learning_rate": 5.0480109739369e-06,
"loss": 0.3369,
"step": 369
},
{
"epoch": 0.030485292905989947,
"grad_norm": 5.267942442082339,
"learning_rate": 5.061728395061729e-06,
"loss": 0.4036,
"step": 370
},
{
"epoch": 0.03056768558951965,
"grad_norm": 4.7582676290183,
"learning_rate": 5.075445816186557e-06,
"loss": 0.2824,
"step": 371
},
{
"epoch": 0.030650078273049355,
"grad_norm": 8.46553998751592,
"learning_rate": 5.089163237311386e-06,
"loss": 0.5826,
"step": 372
},
{
"epoch": 0.030732470956579057,
"grad_norm": 9.33378948895317,
"learning_rate": 5.102880658436214e-06,
"loss": 0.6385,
"step": 373
},
{
"epoch": 0.03081486364010876,
"grad_norm": 10.627769537470304,
"learning_rate": 5.116598079561042e-06,
"loss": 0.6659,
"step": 374
},
{
"epoch": 0.03089725632363846,
"grad_norm": 6.414932231735032,
"learning_rate": 5.130315500685872e-06,
"loss": 0.4209,
"step": 375
},
{
"epoch": 0.030979649007168163,
"grad_norm": 5.200756588061871,
"learning_rate": 5.1440329218107e-06,
"loss": 0.3344,
"step": 376
},
{
"epoch": 0.031062041690697866,
"grad_norm": 8.066096499546177,
"learning_rate": 5.157750342935528e-06,
"loss": 0.5315,
"step": 377
},
{
"epoch": 0.031144434374227568,
"grad_norm": 7.5104265708571125,
"learning_rate": 5.171467764060357e-06,
"loss": 0.5785,
"step": 378
},
{
"epoch": 0.03122682705775727,
"grad_norm": 7.326972182415438,
"learning_rate": 5.185185185185185e-06,
"loss": 0.6173,
"step": 379
},
{
"epoch": 0.031309219741286975,
"grad_norm": 7.064196146503117,
"learning_rate": 5.198902606310015e-06,
"loss": 0.4073,
"step": 380
},
{
"epoch": 0.031391612424816674,
"grad_norm": 10.55313726796645,
"learning_rate": 5.212620027434843e-06,
"loss": 0.5923,
"step": 381
},
{
"epoch": 0.03147400510834638,
"grad_norm": 9.355498638304265,
"learning_rate": 5.226337448559671e-06,
"loss": 0.3621,
"step": 382
},
{
"epoch": 0.03155639779187608,
"grad_norm": 6.985263971534283,
"learning_rate": 5.2400548696845e-06,
"loss": 0.5704,
"step": 383
},
{
"epoch": 0.031638790475405784,
"grad_norm": 12.680293995262826,
"learning_rate": 5.253772290809328e-06,
"loss": 0.6467,
"step": 384
},
{
"epoch": 0.03172118315893549,
"grad_norm": 8.65674983418551,
"learning_rate": 5.2674897119341575e-06,
"loss": 0.5878,
"step": 385
},
{
"epoch": 0.03180357584246519,
"grad_norm": 6.341472071022504,
"learning_rate": 5.2812071330589856e-06,
"loss": 0.4152,
"step": 386
},
{
"epoch": 0.031885968525994894,
"grad_norm": 6.869168720162001,
"learning_rate": 5.294924554183814e-06,
"loss": 0.4097,
"step": 387
},
{
"epoch": 0.03196836120952459,
"grad_norm": 14.190249862283896,
"learning_rate": 5.3086419753086425e-06,
"loss": 0.8292,
"step": 388
},
{
"epoch": 0.0320507538930543,
"grad_norm": 8.275823508063906,
"learning_rate": 5.3223593964334705e-06,
"loss": 0.5986,
"step": 389
},
{
"epoch": 0.032133146576584,
"grad_norm": 6.358393504534048,
"learning_rate": 5.3360768175583e-06,
"loss": 0.4381,
"step": 390
},
{
"epoch": 0.0322155392601137,
"grad_norm": 7.516552473352774,
"learning_rate": 5.349794238683128e-06,
"loss": 0.6122,
"step": 391
},
{
"epoch": 0.0322979319436434,
"grad_norm": 8.856630051592028,
"learning_rate": 5.363511659807956e-06,
"loss": 0.5578,
"step": 392
},
{
"epoch": 0.03238032462717311,
"grad_norm": 6.892129730394783,
"learning_rate": 5.377229080932785e-06,
"loss": 0.3553,
"step": 393
},
{
"epoch": 0.03246271731070281,
"grad_norm": 21.05820599044793,
"learning_rate": 5.390946502057613e-06,
"loss": 0.7038,
"step": 394
},
{
"epoch": 0.03254510999423251,
"grad_norm": 8.434869352368931,
"learning_rate": 5.404663923182441e-06,
"loss": 0.6087,
"step": 395
},
{
"epoch": 0.03262750267776222,
"grad_norm": 7.561351421213893,
"learning_rate": 5.418381344307271e-06,
"loss": 0.5764,
"step": 396
},
{
"epoch": 0.032709895361291916,
"grad_norm": 8.211243703767535,
"learning_rate": 5.432098765432099e-06,
"loss": 0.4147,
"step": 397
},
{
"epoch": 0.03279228804482162,
"grad_norm": 6.985876038283628,
"learning_rate": 5.445816186556928e-06,
"loss": 0.6712,
"step": 398
},
{
"epoch": 0.03287468072835132,
"grad_norm": 7.796492055840742,
"learning_rate": 5.459533607681756e-06,
"loss": 0.5813,
"step": 399
},
{
"epoch": 0.032957073411881026,
"grad_norm": 103.54807755066446,
"learning_rate": 5.473251028806584e-06,
"loss": 2.5632,
"step": 400
},
{
"epoch": 0.033039466095410724,
"grad_norm": 5.515831392953944,
"learning_rate": 5.486968449931414e-06,
"loss": 0.4869,
"step": 401
},
{
"epoch": 0.03312185877894043,
"grad_norm": 7.030021195326967,
"learning_rate": 5.500685871056242e-06,
"loss": 0.5261,
"step": 402
},
{
"epoch": 0.033204251462470136,
"grad_norm": 19.87615017638583,
"learning_rate": 5.514403292181071e-06,
"loss": 0.6917,
"step": 403
},
{
"epoch": 0.033286644145999834,
"grad_norm": 7.081294911924975,
"learning_rate": 5.528120713305899e-06,
"loss": 0.6978,
"step": 404
},
{
"epoch": 0.03336903682952954,
"grad_norm": 11.979085035070433,
"learning_rate": 5.541838134430727e-06,
"loss": 0.8872,
"step": 405
},
{
"epoch": 0.03345142951305924,
"grad_norm": 8.195578186353957,
"learning_rate": 5.555555555555557e-06,
"loss": 0.6859,
"step": 406
},
{
"epoch": 0.033533822196588944,
"grad_norm": 8.47406281800443,
"learning_rate": 5.569272976680385e-06,
"loss": 0.8004,
"step": 407
},
{
"epoch": 0.03361621488011864,
"grad_norm": 7.986033367143951,
"learning_rate": 5.582990397805214e-06,
"loss": 0.6459,
"step": 408
},
{
"epoch": 0.03369860756364835,
"grad_norm": 56.85009393490188,
"learning_rate": 5.596707818930042e-06,
"loss": 0.4587,
"step": 409
},
{
"epoch": 0.03378100024717805,
"grad_norm": 7.34821171599197,
"learning_rate": 5.61042524005487e-06,
"loss": 0.3975,
"step": 410
},
{
"epoch": 0.03386339293070775,
"grad_norm": 10.977592364670041,
"learning_rate": 5.624142661179699e-06,
"loss": 0.6514,
"step": 411
},
{
"epoch": 0.03394578561423746,
"grad_norm": 6.281256114995981,
"learning_rate": 5.6378600823045275e-06,
"loss": 0.5191,
"step": 412
},
{
"epoch": 0.03402817829776716,
"grad_norm": 8.676725239026284,
"learning_rate": 5.651577503429356e-06,
"loss": 0.6749,
"step": 413
},
{
"epoch": 0.03411057098129686,
"grad_norm": 5.271388347433627,
"learning_rate": 5.6652949245541844e-06,
"loss": 0.3394,
"step": 414
},
{
"epoch": 0.03419296366482656,
"grad_norm": 8.62394479818176,
"learning_rate": 5.6790123456790125e-06,
"loss": 0.6579,
"step": 415
},
{
"epoch": 0.03427535634835627,
"grad_norm": 8.725171620314873,
"learning_rate": 5.692729766803841e-06,
"loss": 0.5777,
"step": 416
},
{
"epoch": 0.034357749031885966,
"grad_norm": 103.08679101276228,
"learning_rate": 5.7064471879286694e-06,
"loss": 1.5575,
"step": 417
},
{
"epoch": 0.03444014171541567,
"grad_norm": 5.761947609743853,
"learning_rate": 5.720164609053498e-06,
"loss": 0.5843,
"step": 418
},
{
"epoch": 0.03452253439894537,
"grad_norm": 7.898907683412111,
"learning_rate": 5.733882030178327e-06,
"loss": 0.4965,
"step": 419
},
{
"epoch": 0.034604927082475076,
"grad_norm": 8.409149079491211,
"learning_rate": 5.747599451303155e-06,
"loss": 0.5187,
"step": 420
},
{
"epoch": 0.03468731976600478,
"grad_norm": 7.164102449901402,
"learning_rate": 5.761316872427984e-06,
"loss": 0.6237,
"step": 421
},
{
"epoch": 0.03476971244953448,
"grad_norm": 6.78383472471162,
"learning_rate": 5.775034293552812e-06,
"loss": 0.47,
"step": 422
},
{
"epoch": 0.034852105133064186,
"grad_norm": 8.352679629190035,
"learning_rate": 5.78875171467764e-06,
"loss": 0.6486,
"step": 423
},
{
"epoch": 0.034934497816593885,
"grad_norm": 10.944499686428724,
"learning_rate": 5.80246913580247e-06,
"loss": 0.6787,
"step": 424
},
{
"epoch": 0.03501689050012359,
"grad_norm": 7.169250883656542,
"learning_rate": 5.816186556927298e-06,
"loss": 0.6341,
"step": 425
},
{
"epoch": 0.03509928318365329,
"grad_norm": 8.444427076931605,
"learning_rate": 5.829903978052127e-06,
"loss": 0.3875,
"step": 426
},
{
"epoch": 0.035181675867182995,
"grad_norm": 5.943310209215223,
"learning_rate": 5.843621399176955e-06,
"loss": 0.5543,
"step": 427
},
{
"epoch": 0.03526406855071269,
"grad_norm": 9.00551355023434,
"learning_rate": 5.857338820301783e-06,
"loss": 0.4304,
"step": 428
},
{
"epoch": 0.0353464612342424,
"grad_norm": 5.450933353978881,
"learning_rate": 5.871056241426613e-06,
"loss": 0.439,
"step": 429
},
{
"epoch": 0.035428853917772105,
"grad_norm": 5.508996916628157,
"learning_rate": 5.884773662551441e-06,
"loss": 0.3802,
"step": 430
},
{
"epoch": 0.0355112466013018,
"grad_norm": 5.247685983735031,
"learning_rate": 5.89849108367627e-06,
"loss": 0.2684,
"step": 431
},
{
"epoch": 0.03559363928483151,
"grad_norm": 5.397506286271877,
"learning_rate": 5.912208504801098e-06,
"loss": 0.341,
"step": 432
},
{
"epoch": 0.03567603196836121,
"grad_norm": 5.27131135952353,
"learning_rate": 5.925925925925926e-06,
"loss": 0.2659,
"step": 433
},
{
"epoch": 0.03575842465189091,
"grad_norm": 480.0659176486544,
"learning_rate": 5.9396433470507556e-06,
"loss": 0.638,
"step": 434
},
{
"epoch": 0.03584081733542061,
"grad_norm": 13.678674149772142,
"learning_rate": 5.953360768175584e-06,
"loss": 0.7309,
"step": 435
},
{
"epoch": 0.03592321001895032,
"grad_norm": 10.780807692308517,
"learning_rate": 5.967078189300412e-06,
"loss": 0.7032,
"step": 436
},
{
"epoch": 0.03600560270248002,
"grad_norm": 13.02329032201945,
"learning_rate": 5.9807956104252405e-06,
"loss": 0.5507,
"step": 437
},
{
"epoch": 0.03608799538600972,
"grad_norm": 6.048832451175678,
"learning_rate": 5.994513031550069e-06,
"loss": 0.4034,
"step": 438
},
{
"epoch": 0.03617038806953943,
"grad_norm": 6.96057463234594,
"learning_rate": 6.008230452674898e-06,
"loss": 0.4898,
"step": 439
},
{
"epoch": 0.036252780753069126,
"grad_norm": 9.398283406999298,
"learning_rate": 6.021947873799726e-06,
"loss": 0.645,
"step": 440
},
{
"epoch": 0.03633517343659883,
"grad_norm": 8.87917346131629,
"learning_rate": 6.0356652949245544e-06,
"loss": 0.6331,
"step": 441
},
{
"epoch": 0.03641756612012853,
"grad_norm": 7.58770542825193,
"learning_rate": 6.049382716049383e-06,
"loss": 0.496,
"step": 442
},
{
"epoch": 0.036499958803658236,
"grad_norm": 7.319287387022868,
"learning_rate": 6.063100137174211e-06,
"loss": 0.4107,
"step": 443
},
{
"epoch": 0.036582351487187935,
"grad_norm": 6.047688078633024,
"learning_rate": 6.076817558299041e-06,
"loss": 0.5706,
"step": 444
},
{
"epoch": 0.03666474417071764,
"grad_norm": 4.016360698698718,
"learning_rate": 6.090534979423869e-06,
"loss": 0.4581,
"step": 445
},
{
"epoch": 0.036747136854247346,
"grad_norm": 4.965075908435688,
"learning_rate": 6.104252400548697e-06,
"loss": 0.4266,
"step": 446
},
{
"epoch": 0.036829529537777045,
"grad_norm": 6.216596173925147,
"learning_rate": 6.117969821673526e-06,
"loss": 0.4522,
"step": 447
},
{
"epoch": 0.03691192222130675,
"grad_norm": 4.842662632695187,
"learning_rate": 6.131687242798354e-06,
"loss": 0.4335,
"step": 448
},
{
"epoch": 0.03699431490483645,
"grad_norm": 4.9752196790209515,
"learning_rate": 6.145404663923183e-06,
"loss": 0.557,
"step": 449
},
{
"epoch": 0.037076707588366155,
"grad_norm": 5.735323380928238,
"learning_rate": 6.159122085048012e-06,
"loss": 0.4763,
"step": 450
},
{
"epoch": 0.037159100271895854,
"grad_norm": 5.77466848724865,
"learning_rate": 6.17283950617284e-06,
"loss": 0.4934,
"step": 451
},
{
"epoch": 0.03724149295542556,
"grad_norm": 9.112092371493077,
"learning_rate": 6.186556927297669e-06,
"loss": 0.7463,
"step": 452
},
{
"epoch": 0.03732388563895526,
"grad_norm": 5.7102176201346,
"learning_rate": 6.200274348422497e-06,
"loss": 0.5117,
"step": 453
},
{
"epoch": 0.037406278322484963,
"grad_norm": 78.79906885419953,
"learning_rate": 6.213991769547325e-06,
"loss": 0.4034,
"step": 454
},
{
"epoch": 0.03748867100601467,
"grad_norm": 6.538900045540373,
"learning_rate": 6.227709190672154e-06,
"loss": 0.5889,
"step": 455
},
{
"epoch": 0.03757106368954437,
"grad_norm": 6.303771338902271,
"learning_rate": 6.241426611796983e-06,
"loss": 0.4705,
"step": 456
},
{
"epoch": 0.03765345637307407,
"grad_norm": 6.123584679010139,
"learning_rate": 6.255144032921812e-06,
"loss": 0.4935,
"step": 457
},
{
"epoch": 0.03773584905660377,
"grad_norm": 6.632670832269488,
"learning_rate": 6.26886145404664e-06,
"loss": 0.4198,
"step": 458
},
{
"epoch": 0.03781824174013348,
"grad_norm": 5.546218761188075,
"learning_rate": 6.282578875171468e-06,
"loss": 0.4775,
"step": 459
},
{
"epoch": 0.037900634423663176,
"grad_norm": 7.410628500383756,
"learning_rate": 6.296296296296297e-06,
"loss": 0.5848,
"step": 460
},
{
"epoch": 0.03798302710719288,
"grad_norm": 13.29983237450669,
"learning_rate": 6.310013717421125e-06,
"loss": 0.5264,
"step": 461
},
{
"epoch": 0.03806541979072258,
"grad_norm": 6.248701868467927,
"learning_rate": 6.3237311385459544e-06,
"loss": 0.4628,
"step": 462
},
{
"epoch": 0.038147812474252286,
"grad_norm": 5.555557826856918,
"learning_rate": 6.3374485596707825e-06,
"loss": 0.4553,
"step": 463
},
{
"epoch": 0.03823020515778199,
"grad_norm": 9.184422285800041,
"learning_rate": 6.3511659807956105e-06,
"loss": 0.8573,
"step": 464
},
{
"epoch": 0.03831259784131169,
"grad_norm": 5.747050476310712,
"learning_rate": 6.3648834019204394e-06,
"loss": 0.482,
"step": 465
},
{
"epoch": 0.038394990524841396,
"grad_norm": 4.763610826797223,
"learning_rate": 6.3786008230452675e-06,
"loss": 0.5323,
"step": 466
},
{
"epoch": 0.038477383208371095,
"grad_norm": 5.592179783867718,
"learning_rate": 6.392318244170097e-06,
"loss": 0.4239,
"step": 467
},
{
"epoch": 0.0385597758919008,
"grad_norm": 6.874653123972646,
"learning_rate": 6.406035665294925e-06,
"loss": 0.6832,
"step": 468
},
{
"epoch": 0.0386421685754305,
"grad_norm": 8.90284342707074,
"learning_rate": 6.419753086419753e-06,
"loss": 0.7136,
"step": 469
},
{
"epoch": 0.038724561258960205,
"grad_norm": 4.321479415948776,
"learning_rate": 6.433470507544582e-06,
"loss": 0.3613,
"step": 470
},
{
"epoch": 0.038806953942489904,
"grad_norm": 6.212842061197888,
"learning_rate": 6.44718792866941e-06,
"loss": 0.5118,
"step": 471
},
{
"epoch": 0.03888934662601961,
"grad_norm": 6.620819776714046,
"learning_rate": 6.460905349794238e-06,
"loss": 0.6616,
"step": 472
},
{
"epoch": 0.038971739309549315,
"grad_norm": 4.727885284155705,
"learning_rate": 6.474622770919068e-06,
"loss": 0.5001,
"step": 473
},
{
"epoch": 0.039054131993079014,
"grad_norm": 5.762977515077748,
"learning_rate": 6.488340192043896e-06,
"loss": 0.496,
"step": 474
},
{
"epoch": 0.03913652467660872,
"grad_norm": 4.548105976567005,
"learning_rate": 6.502057613168725e-06,
"loss": 0.4174,
"step": 475
},
{
"epoch": 0.03921891736013842,
"grad_norm": 10.758680715013131,
"learning_rate": 6.515775034293553e-06,
"loss": 1.0219,
"step": 476
},
{
"epoch": 0.039301310043668124,
"grad_norm": 4.833629523221548,
"learning_rate": 6.529492455418381e-06,
"loss": 0.4714,
"step": 477
},
{
"epoch": 0.03938370272719782,
"grad_norm": 6.168003173970242,
"learning_rate": 6.543209876543211e-06,
"loss": 0.5518,
"step": 478
},
{
"epoch": 0.03946609541072753,
"grad_norm": 6.781568533057639,
"learning_rate": 6.556927297668039e-06,
"loss": 0.6373,
"step": 479
},
{
"epoch": 0.03954848809425723,
"grad_norm": 7.807648679748847,
"learning_rate": 6.570644718792868e-06,
"loss": 0.6692,
"step": 480
},
{
"epoch": 0.03963088077778693,
"grad_norm": 5.261036851137123,
"learning_rate": 6.584362139917696e-06,
"loss": 0.4972,
"step": 481
},
{
"epoch": 0.03971327346131664,
"grad_norm": 8.82287432796905,
"learning_rate": 6.598079561042524e-06,
"loss": 0.5782,
"step": 482
},
{
"epoch": 0.03979566614484634,
"grad_norm": 4.74653534153213,
"learning_rate": 6.611796982167354e-06,
"loss": 0.4713,
"step": 483
},
{
"epoch": 0.03987805882837604,
"grad_norm": 6.115095781407729,
"learning_rate": 6.625514403292182e-06,
"loss": 0.4716,
"step": 484
},
{
"epoch": 0.03996045151190574,
"grad_norm": 9.137058349523514,
"learning_rate": 6.6392318244170106e-06,
"loss": 0.5758,
"step": 485
},
{
"epoch": 0.04004284419543545,
"grad_norm": 6.208461928327539,
"learning_rate": 6.652949245541839e-06,
"loss": 0.6577,
"step": 486
},
{
"epoch": 0.040125236878965145,
"grad_norm": 7.110169283978004,
"learning_rate": 6.666666666666667e-06,
"loss": 0.5648,
"step": 487
},
{
"epoch": 0.04020762956249485,
"grad_norm": 5.955335908180957,
"learning_rate": 6.680384087791496e-06,
"loss": 0.6219,
"step": 488
},
{
"epoch": 0.04029002224602455,
"grad_norm": 11.624360168848643,
"learning_rate": 6.6941015089163244e-06,
"loss": 0.5879,
"step": 489
},
{
"epoch": 0.040372414929554255,
"grad_norm": 6.045762234406257,
"learning_rate": 6.707818930041153e-06,
"loss": 0.5048,
"step": 490
},
{
"epoch": 0.04045480761308396,
"grad_norm": 7.274352306633563,
"learning_rate": 6.721536351165981e-06,
"loss": 0.7685,
"step": 491
},
{
"epoch": 0.04053720029661366,
"grad_norm": 5.5877312296120465,
"learning_rate": 6.7352537722908094e-06,
"loss": 0.4779,
"step": 492
},
{
"epoch": 0.040619592980143365,
"grad_norm": 190.3033861867491,
"learning_rate": 6.748971193415639e-06,
"loss": 2.688,
"step": 493
},
{
"epoch": 0.040701985663673064,
"grad_norm": 10.422219949359063,
"learning_rate": 6.762688614540467e-06,
"loss": 0.6075,
"step": 494
},
{
"epoch": 0.04078437834720277,
"grad_norm": 5.781767462518796,
"learning_rate": 6.776406035665295e-06,
"loss": 0.4923,
"step": 495
},
{
"epoch": 0.04086677103073247,
"grad_norm": 7.096850817273703,
"learning_rate": 6.790123456790124e-06,
"loss": 0.3914,
"step": 496
},
{
"epoch": 0.040949163714262174,
"grad_norm": 6.715044383416897,
"learning_rate": 6.803840877914952e-06,
"loss": 0.6559,
"step": 497
},
{
"epoch": 0.04103155639779187,
"grad_norm": 5.869148124670727,
"learning_rate": 6.817558299039781e-06,
"loss": 0.6119,
"step": 498
},
{
"epoch": 0.04111394908132158,
"grad_norm": 6.817431946634836,
"learning_rate": 6.83127572016461e-06,
"loss": 0.7011,
"step": 499
},
{
"epoch": 0.041196341764851284,
"grad_norm": 6.726271933368034,
"learning_rate": 6.844993141289438e-06,
"loss": 0.5462,
"step": 500
},
{
"epoch": 0.04127873444838098,
"grad_norm": 5.340136233900829,
"learning_rate": 6.858710562414267e-06,
"loss": 0.4988,
"step": 501
},
{
"epoch": 0.04136112713191069,
"grad_norm": 6.910491082536719,
"learning_rate": 6.872427983539095e-06,
"loss": 0.3951,
"step": 502
},
{
"epoch": 0.04144351981544039,
"grad_norm": 5.788747410824949,
"learning_rate": 6.886145404663924e-06,
"loss": 0.5783,
"step": 503
},
{
"epoch": 0.04152591249897009,
"grad_norm": 4.802638214101094,
"learning_rate": 6.899862825788752e-06,
"loss": 0.2877,
"step": 504
},
{
"epoch": 0.04160830518249979,
"grad_norm": 4.484566887630588,
"learning_rate": 6.913580246913581e-06,
"loss": 0.3811,
"step": 505
},
{
"epoch": 0.0416906978660295,
"grad_norm": 6.051035690893021,
"learning_rate": 6.92729766803841e-06,
"loss": 0.4715,
"step": 506
},
{
"epoch": 0.0417730905495592,
"grad_norm": 6.986842742202583,
"learning_rate": 6.941015089163238e-06,
"loss": 0.3989,
"step": 507
},
{
"epoch": 0.0418554832330889,
"grad_norm": 3.822985062741508,
"learning_rate": 6.954732510288067e-06,
"loss": 0.3269,
"step": 508
},
{
"epoch": 0.04193787591661861,
"grad_norm": 6.271803341046997,
"learning_rate": 6.968449931412895e-06,
"loss": 0.5455,
"step": 509
},
{
"epoch": 0.042020268600148306,
"grad_norm": 4.324558170988178,
"learning_rate": 6.982167352537723e-06,
"loss": 0.3514,
"step": 510
},
{
"epoch": 0.04210266128367801,
"grad_norm": 7.0738043807784265,
"learning_rate": 6.9958847736625525e-06,
"loss": 0.4582,
"step": 511
},
{
"epoch": 0.04218505396720771,
"grad_norm": 4.303336598762236,
"learning_rate": 7.0096021947873805e-06,
"loss": 0.2196,
"step": 512
},
{
"epoch": 0.042267446650737416,
"grad_norm": 6.729727246828568,
"learning_rate": 7.023319615912209e-06,
"loss": 0.5618,
"step": 513
},
{
"epoch": 0.042349839334267114,
"grad_norm": 5.868167847006668,
"learning_rate": 7.0370370370370375e-06,
"loss": 0.5361,
"step": 514
},
{
"epoch": 0.04243223201779682,
"grad_norm": 8.605825638540875,
"learning_rate": 7.0507544581618655e-06,
"loss": 0.6892,
"step": 515
},
{
"epoch": 0.042514624701326525,
"grad_norm": 5.431792569281863,
"learning_rate": 7.064471879286695e-06,
"loss": 0.4384,
"step": 516
},
{
"epoch": 0.042597017384856224,
"grad_norm": 6.984146114234522,
"learning_rate": 7.078189300411523e-06,
"loss": 0.483,
"step": 517
},
{
"epoch": 0.04267941006838593,
"grad_norm": 9.3422311974361,
"learning_rate": 7.091906721536351e-06,
"loss": 0.5842,
"step": 518
},
{
"epoch": 0.04276180275191563,
"grad_norm": 5.732842769025313,
"learning_rate": 7.10562414266118e-06,
"loss": 0.3972,
"step": 519
},
{
"epoch": 0.042844195435445334,
"grad_norm": 8.65862913312267,
"learning_rate": 7.119341563786008e-06,
"loss": 0.6456,
"step": 520
},
{
"epoch": 0.04292658811897503,
"grad_norm": 6.864043755425524,
"learning_rate": 7.133058984910838e-06,
"loss": 0.5415,
"step": 521
},
{
"epoch": 0.04300898080250474,
"grad_norm": 7.980923425776543,
"learning_rate": 7.146776406035666e-06,
"loss": 0.6034,
"step": 522
},
{
"epoch": 0.04309137348603444,
"grad_norm": 6.747289314662945,
"learning_rate": 7.160493827160494e-06,
"loss": 0.5615,
"step": 523
},
{
"epoch": 0.04317376616956414,
"grad_norm": 5.139243463936733,
"learning_rate": 7.174211248285323e-06,
"loss": 0.4397,
"step": 524
},
{
"epoch": 0.04325615885309385,
"grad_norm": 4.121695404994115,
"learning_rate": 7.187928669410151e-06,
"loss": 0.576,
"step": 525
},
{
"epoch": 0.04333855153662355,
"grad_norm": 4.650257020504757,
"learning_rate": 7.201646090534981e-06,
"loss": 0.3738,
"step": 526
},
{
"epoch": 0.04342094422015325,
"grad_norm": 5.611100034997747,
"learning_rate": 7.215363511659809e-06,
"loss": 0.5172,
"step": 527
},
{
"epoch": 0.04350333690368295,
"grad_norm": 7.677372417584333,
"learning_rate": 7.229080932784637e-06,
"loss": 0.6671,
"step": 528
},
{
"epoch": 0.04358572958721266,
"grad_norm": 6.6136776618758875,
"learning_rate": 7.242798353909466e-06,
"loss": 0.7559,
"step": 529
},
{
"epoch": 0.043668122270742356,
"grad_norm": 4.792630608864752,
"learning_rate": 7.256515775034294e-06,
"loss": 0.3489,
"step": 530
},
{
"epoch": 0.04375051495427206,
"grad_norm": 6.35062782260829,
"learning_rate": 7.270233196159122e-06,
"loss": 0.6693,
"step": 531
},
{
"epoch": 0.04383290763780176,
"grad_norm": 5.605832169231131,
"learning_rate": 7.283950617283952e-06,
"loss": 0.6839,
"step": 532
},
{
"epoch": 0.043915300321331466,
"grad_norm": 7.340777524703994,
"learning_rate": 7.29766803840878e-06,
"loss": 0.4743,
"step": 533
},
{
"epoch": 0.04399769300486117,
"grad_norm": 5.736671549282368,
"learning_rate": 7.311385459533609e-06,
"loss": 0.5146,
"step": 534
},
{
"epoch": 0.04408008568839087,
"grad_norm": 5.290429644156163,
"learning_rate": 7.325102880658437e-06,
"loss": 0.4368,
"step": 535
},
{
"epoch": 0.044162478371920576,
"grad_norm": 5.60282614307985,
"learning_rate": 7.338820301783265e-06,
"loss": 0.5957,
"step": 536
},
{
"epoch": 0.044244871055450274,
"grad_norm": 5.166334947065374,
"learning_rate": 7.3525377229080944e-06,
"loss": 0.3518,
"step": 537
},
{
"epoch": 0.04432726373897998,
"grad_norm": 4.03828573750434,
"learning_rate": 7.3662551440329225e-06,
"loss": 0.4423,
"step": 538
},
{
"epoch": 0.04440965642250968,
"grad_norm": 5.668301363403015,
"learning_rate": 7.379972565157751e-06,
"loss": 0.5681,
"step": 539
},
{
"epoch": 0.044492049106039384,
"grad_norm": 5.359958652089056,
"learning_rate": 7.3936899862825794e-06,
"loss": 0.2408,
"step": 540
},
{
"epoch": 0.04457444178956908,
"grad_norm": 7.013014929960838,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.5804,
"step": 541
},
{
"epoch": 0.04465683447309879,
"grad_norm": 4.355083270145565,
"learning_rate": 7.421124828532237e-06,
"loss": 0.3215,
"step": 542
},
{
"epoch": 0.044739227156628494,
"grad_norm": 4.950584901024228,
"learning_rate": 7.434842249657065e-06,
"loss": 0.4059,
"step": 543
},
{
"epoch": 0.04482161984015819,
"grad_norm": 7.881681417974008,
"learning_rate": 7.448559670781894e-06,
"loss": 0.6569,
"step": 544
},
{
"epoch": 0.0449040125236879,
"grad_norm": 8.797495038662335,
"learning_rate": 7.462277091906722e-06,
"loss": 0.6289,
"step": 545
},
{
"epoch": 0.0449864052072176,
"grad_norm": 5.34524571582464,
"learning_rate": 7.47599451303155e-06,
"loss": 0.4069,
"step": 546
},
{
"epoch": 0.0450687978907473,
"grad_norm": 6.158525652435086,
"learning_rate": 7.489711934156379e-06,
"loss": 0.2462,
"step": 547
},
{
"epoch": 0.045151190574277,
"grad_norm": 4.744502082646233,
"learning_rate": 7.503429355281208e-06,
"loss": 0.3248,
"step": 548
},
{
"epoch": 0.04523358325780671,
"grad_norm": 5.917171401303492,
"learning_rate": 7.517146776406036e-06,
"loss": 0.5406,
"step": 549
},
{
"epoch": 0.045315975941336406,
"grad_norm": 12.458906212002898,
"learning_rate": 7.530864197530865e-06,
"loss": 0.7253,
"step": 550
},
{
"epoch": 0.04539836862486611,
"grad_norm": 7.531914989608172,
"learning_rate": 7.544581618655693e-06,
"loss": 0.5998,
"step": 551
},
{
"epoch": 0.04548076130839582,
"grad_norm": 4.0280860261800004,
"learning_rate": 7.558299039780522e-06,
"loss": 0.4643,
"step": 552
},
{
"epoch": 0.045563153991925516,
"grad_norm": 6.015033091452991,
"learning_rate": 7.57201646090535e-06,
"loss": 0.4256,
"step": 553
},
{
"epoch": 0.04564554667545522,
"grad_norm": 7.13456394416836,
"learning_rate": 7.585733882030179e-06,
"loss": 0.7066,
"step": 554
},
{
"epoch": 0.04572793935898492,
"grad_norm": 9.287192614752263,
"learning_rate": 7.599451303155008e-06,
"loss": 0.7301,
"step": 555
},
{
"epoch": 0.045810332042514626,
"grad_norm": 6.284874505694774,
"learning_rate": 7.613168724279836e-06,
"loss": 0.433,
"step": 556
},
{
"epoch": 0.045892724726044325,
"grad_norm": 5.947339381004487,
"learning_rate": 7.626886145404665e-06,
"loss": 0.4314,
"step": 557
},
{
"epoch": 0.04597511740957403,
"grad_norm": 5.263524060407473,
"learning_rate": 7.640603566529494e-06,
"loss": 0.4686,
"step": 558
},
{
"epoch": 0.04605751009310373,
"grad_norm": 6.4191613503614775,
"learning_rate": 7.654320987654322e-06,
"loss": 0.3771,
"step": 559
},
{
"epoch": 0.046139902776633435,
"grad_norm": 4.786426740476273,
"learning_rate": 7.66803840877915e-06,
"loss": 0.5297,
"step": 560
},
{
"epoch": 0.04622229546016314,
"grad_norm": 5.969988084382956,
"learning_rate": 7.681755829903978e-06,
"loss": 0.538,
"step": 561
},
{
"epoch": 0.04630468814369284,
"grad_norm": 7.308885572221812,
"learning_rate": 7.695473251028807e-06,
"loss": 0.5234,
"step": 562
},
{
"epoch": 0.046387080827222545,
"grad_norm": 5.97558760032448,
"learning_rate": 7.709190672153636e-06,
"loss": 0.4623,
"step": 563
},
{
"epoch": 0.04646947351075224,
"grad_norm": 6.703370454236969,
"learning_rate": 7.722908093278464e-06,
"loss": 0.4321,
"step": 564
},
{
"epoch": 0.04655186619428195,
"grad_norm": 5.449290012281466,
"learning_rate": 7.736625514403293e-06,
"loss": 0.4356,
"step": 565
},
{
"epoch": 0.04663425887781165,
"grad_norm": 7.6918159276005245,
"learning_rate": 7.750342935528121e-06,
"loss": 0.7377,
"step": 566
},
{
"epoch": 0.04671665156134135,
"grad_norm": 6.70070388013919,
"learning_rate": 7.76406035665295e-06,
"loss": 0.4832,
"step": 567
},
{
"epoch": 0.04679904424487106,
"grad_norm": 7.054794661079072,
"learning_rate": 7.77777777777778e-06,
"loss": 0.5,
"step": 568
},
{
"epoch": 0.04688143692840076,
"grad_norm": 6.480167958787211,
"learning_rate": 7.791495198902607e-06,
"loss": 0.6175,
"step": 569
},
{
"epoch": 0.04696382961193046,
"grad_norm": 7.506915605641678,
"learning_rate": 7.805212620027435e-06,
"loss": 0.6497,
"step": 570
},
{
"epoch": 0.04704622229546016,
"grad_norm": 5.494983006094659,
"learning_rate": 7.818930041152263e-06,
"loss": 0.4892,
"step": 571
},
{
"epoch": 0.04712861497898987,
"grad_norm": 4.3290727230186175,
"learning_rate": 7.832647462277091e-06,
"loss": 0.3603,
"step": 572
},
{
"epoch": 0.047211007662519566,
"grad_norm": 5.4053345835693545,
"learning_rate": 7.846364883401921e-06,
"loss": 0.4964,
"step": 573
},
{
"epoch": 0.04729340034604927,
"grad_norm": 6.8723464860852275,
"learning_rate": 7.860082304526749e-06,
"loss": 0.5672,
"step": 574
},
{
"epoch": 0.04737579302957897,
"grad_norm": 6.656820969576108,
"learning_rate": 7.873799725651579e-06,
"loss": 0.645,
"step": 575
},
{
"epoch": 0.047458185713108676,
"grad_norm": 5.514831433681132,
"learning_rate": 7.887517146776407e-06,
"loss": 0.4784,
"step": 576
},
{
"epoch": 0.04754057839663838,
"grad_norm": 4.686188017061002,
"learning_rate": 7.901234567901235e-06,
"loss": 0.3503,
"step": 577
},
{
"epoch": 0.04762297108016808,
"grad_norm": 4.387746808622081,
"learning_rate": 7.914951989026065e-06,
"loss": 0.4085,
"step": 578
},
{
"epoch": 0.047705363763697786,
"grad_norm": 5.842284742709113,
"learning_rate": 7.928669410150893e-06,
"loss": 0.6309,
"step": 579
},
{
"epoch": 0.047787756447227485,
"grad_norm": 3.9959223903308883,
"learning_rate": 7.94238683127572e-06,
"loss": 0.4597,
"step": 580
},
{
"epoch": 0.04787014913075719,
"grad_norm": 5.369023494162627,
"learning_rate": 7.956104252400549e-06,
"loss": 0.3751,
"step": 581
},
{
"epoch": 0.04795254181428689,
"grad_norm": 5.2283340567591345,
"learning_rate": 7.969821673525377e-06,
"loss": 0.5132,
"step": 582
},
{
"epoch": 0.048034934497816595,
"grad_norm": 5.349903681875905,
"learning_rate": 7.983539094650207e-06,
"loss": 0.3902,
"step": 583
},
{
"epoch": 0.048117327181346294,
"grad_norm": 6.025785140359736,
"learning_rate": 7.997256515775035e-06,
"loss": 0.4025,
"step": 584
},
{
"epoch": 0.048199719864876,
"grad_norm": 11.461654268608273,
"learning_rate": 8.010973936899864e-06,
"loss": 0.679,
"step": 585
},
{
"epoch": 0.048282112548405705,
"grad_norm": 6.778197127568214,
"learning_rate": 8.024691358024692e-06,
"loss": 0.6373,
"step": 586
},
{
"epoch": 0.048364505231935404,
"grad_norm": 6.682544812430659,
"learning_rate": 8.03840877914952e-06,
"loss": 0.6251,
"step": 587
},
{
"epoch": 0.04844689791546511,
"grad_norm": 4.574480748574406,
"learning_rate": 8.052126200274349e-06,
"loss": 0.3578,
"step": 588
},
{
"epoch": 0.04852929059899481,
"grad_norm": 5.884448857284855,
"learning_rate": 8.065843621399178e-06,
"loss": 0.4917,
"step": 589
},
{
"epoch": 0.048611683282524513,
"grad_norm": 7.305232846822941,
"learning_rate": 8.079561042524006e-06,
"loss": 0.606,
"step": 590
},
{
"epoch": 0.04869407596605421,
"grad_norm": 5.674679345815404,
"learning_rate": 8.093278463648834e-06,
"loss": 0.6098,
"step": 591
},
{
"epoch": 0.04877646864958392,
"grad_norm": 5.8357949188415805,
"learning_rate": 8.106995884773662e-06,
"loss": 0.4771,
"step": 592
},
{
"epoch": 0.048858861333113617,
"grad_norm": 6.271507937692957,
"learning_rate": 8.120713305898492e-06,
"loss": 0.4724,
"step": 593
},
{
"epoch": 0.04894125401664332,
"grad_norm": 6.878289052557951,
"learning_rate": 8.13443072702332e-06,
"loss": 0.6355,
"step": 594
},
{
"epoch": 0.04902364670017303,
"grad_norm": 6.77176191369003,
"learning_rate": 8.148148148148148e-06,
"loss": 0.5007,
"step": 595
},
{
"epoch": 0.049106039383702726,
"grad_norm": 5.259636242937289,
"learning_rate": 8.161865569272978e-06,
"loss": 0.4991,
"step": 596
},
{
"epoch": 0.04918843206723243,
"grad_norm": 6.83053596050915,
"learning_rate": 8.175582990397806e-06,
"loss": 0.4805,
"step": 597
},
{
"epoch": 0.04927082475076213,
"grad_norm": 8.169736862097166,
"learning_rate": 8.189300411522634e-06,
"loss": 0.5456,
"step": 598
},
{
"epoch": 0.049353217434291836,
"grad_norm": 7.689359450945521,
"learning_rate": 8.203017832647462e-06,
"loss": 0.6017,
"step": 599
},
{
"epoch": 0.049435610117821535,
"grad_norm": 5.228139769258261,
"learning_rate": 8.21673525377229e-06,
"loss": 0.5305,
"step": 600
},
{
"epoch": 0.04951800280135124,
"grad_norm": 8.65116366410673,
"learning_rate": 8.23045267489712e-06,
"loss": 0.5437,
"step": 601
},
{
"epoch": 0.04960039548488094,
"grad_norm": 11.17746374283472,
"learning_rate": 8.244170096021948e-06,
"loss": 0.7125,
"step": 602
},
{
"epoch": 0.049682788168410645,
"grad_norm": 7.301491118538198,
"learning_rate": 8.257887517146778e-06,
"loss": 0.5577,
"step": 603
},
{
"epoch": 0.04976518085194035,
"grad_norm": 4.970573337029868,
"learning_rate": 8.271604938271606e-06,
"loss": 0.5968,
"step": 604
},
{
"epoch": 0.04984757353547005,
"grad_norm": 5.587652792417023,
"learning_rate": 8.285322359396434e-06,
"loss": 0.6005,
"step": 605
},
{
"epoch": 0.049929966218999755,
"grad_norm": 5.549916285590465,
"learning_rate": 8.299039780521264e-06,
"loss": 0.3548,
"step": 606
},
{
"epoch": 0.050012358902529454,
"grad_norm": 8.10309244938579,
"learning_rate": 8.312757201646092e-06,
"loss": 0.5691,
"step": 607
},
{
"epoch": 0.05009475158605916,
"grad_norm": 5.8031246153733935,
"learning_rate": 8.32647462277092e-06,
"loss": 0.5858,
"step": 608
},
{
"epoch": 0.05017714426958886,
"grad_norm": 4.633845595880233,
"learning_rate": 8.340192043895748e-06,
"loss": 0.6119,
"step": 609
},
{
"epoch": 0.050259536953118564,
"grad_norm": 4.538100964584221,
"learning_rate": 8.353909465020576e-06,
"loss": 0.4402,
"step": 610
},
{
"epoch": 0.05034192963664826,
"grad_norm": 5.323060646938032,
"learning_rate": 8.367626886145406e-06,
"loss": 0.634,
"step": 611
},
{
"epoch": 0.05042432232017797,
"grad_norm": 8.019191719629505,
"learning_rate": 8.381344307270234e-06,
"loss": 0.5162,
"step": 612
},
{
"epoch": 0.050506715003707674,
"grad_norm": 8.490592176269905,
"learning_rate": 8.395061728395062e-06,
"loss": 0.7862,
"step": 613
},
{
"epoch": 0.05058910768723737,
"grad_norm": 6.715022235925982,
"learning_rate": 8.408779149519891e-06,
"loss": 0.693,
"step": 614
},
{
"epoch": 0.05067150037076708,
"grad_norm": 6.662593981470133,
"learning_rate": 8.42249657064472e-06,
"loss": 0.7293,
"step": 615
},
{
"epoch": 0.05075389305429678,
"grad_norm": 6.601584476220066,
"learning_rate": 8.43621399176955e-06,
"loss": 0.8338,
"step": 616
},
{
"epoch": 0.05083628573782648,
"grad_norm": 4.392372074940804,
"learning_rate": 8.449931412894377e-06,
"loss": 0.362,
"step": 617
},
{
"epoch": 0.05091867842135618,
"grad_norm": 67.48725205124786,
"learning_rate": 8.463648834019205e-06,
"loss": 2.8128,
"step": 618
},
{
"epoch": 0.05100107110488589,
"grad_norm": 7.506512019819455,
"learning_rate": 8.477366255144033e-06,
"loss": 0.5441,
"step": 619
},
{
"epoch": 0.051083463788415585,
"grad_norm": 6.880593438312231,
"learning_rate": 8.491083676268861e-06,
"loss": 0.5519,
"step": 620
},
{
"epoch": 0.05116585647194529,
"grad_norm": 5.888795257341883,
"learning_rate": 8.504801097393691e-06,
"loss": 0.5516,
"step": 621
},
{
"epoch": 0.051248249155475,
"grad_norm": 6.252602124665069,
"learning_rate": 8.518518518518519e-06,
"loss": 0.6957,
"step": 622
},
{
"epoch": 0.051330641839004695,
"grad_norm": 5.4090002095589975,
"learning_rate": 8.532235939643347e-06,
"loss": 0.5359,
"step": 623
},
{
"epoch": 0.0514130345225344,
"grad_norm": 10.890995083855032,
"learning_rate": 8.545953360768177e-06,
"loss": 0.709,
"step": 624
},
{
"epoch": 0.0514954272060641,
"grad_norm": 5.34598899270713,
"learning_rate": 8.559670781893005e-06,
"loss": 0.7751,
"step": 625
},
{
"epoch": 0.051577819889593805,
"grad_norm": 4.45040160733867,
"learning_rate": 8.573388203017833e-06,
"loss": 0.2535,
"step": 626
},
{
"epoch": 0.051660212573123504,
"grad_norm": 5.585882232730492,
"learning_rate": 8.587105624142663e-06,
"loss": 0.6672,
"step": 627
},
{
"epoch": 0.05174260525665321,
"grad_norm": 5.864022504893711,
"learning_rate": 8.60082304526749e-06,
"loss": 0.7359,
"step": 628
},
{
"epoch": 0.05182499794018291,
"grad_norm": 4.476550092270306,
"learning_rate": 8.614540466392319e-06,
"loss": 0.4212,
"step": 629
},
{
"epoch": 0.051907390623712614,
"grad_norm": 4.761925495673636,
"learning_rate": 8.628257887517147e-06,
"loss": 0.5568,
"step": 630
},
{
"epoch": 0.05198978330724232,
"grad_norm": 4.598522719894157,
"learning_rate": 8.641975308641975e-06,
"loss": 0.2614,
"step": 631
},
{
"epoch": 0.05207217599077202,
"grad_norm": 5.5021749646336175,
"learning_rate": 8.655692729766805e-06,
"loss": 0.6382,
"step": 632
},
{
"epoch": 0.052154568674301724,
"grad_norm": 6.517012701844157,
"learning_rate": 8.669410150891633e-06,
"loss": 0.5804,
"step": 633
},
{
"epoch": 0.05223696135783142,
"grad_norm": 6.805425239578879,
"learning_rate": 8.683127572016463e-06,
"loss": 0.5015,
"step": 634
},
{
"epoch": 0.05231935404136113,
"grad_norm": 4.6738708514525715,
"learning_rate": 8.69684499314129e-06,
"loss": 0.3992,
"step": 635
},
{
"epoch": 0.05240174672489083,
"grad_norm": 8.234844782748597,
"learning_rate": 8.710562414266119e-06,
"loss": 0.7507,
"step": 636
},
{
"epoch": 0.05248413940842053,
"grad_norm": 6.698687047110895,
"learning_rate": 8.724279835390947e-06,
"loss": 0.6197,
"step": 637
},
{
"epoch": 0.05256653209195024,
"grad_norm": 4.1168902182520615,
"learning_rate": 8.737997256515776e-06,
"loss": 0.3694,
"step": 638
},
{
"epoch": 0.05264892477547994,
"grad_norm": 4.351008788296417,
"learning_rate": 8.751714677640604e-06,
"loss": 0.4905,
"step": 639
},
{
"epoch": 0.05273131745900964,
"grad_norm": 9.457012453724198,
"learning_rate": 8.765432098765432e-06,
"loss": 0.7262,
"step": 640
},
{
"epoch": 0.05281371014253934,
"grad_norm": 4.28519533402721,
"learning_rate": 8.77914951989026e-06,
"loss": 0.4703,
"step": 641
},
{
"epoch": 0.05289610282606905,
"grad_norm": 7.396862648357201,
"learning_rate": 8.79286694101509e-06,
"loss": 0.4252,
"step": 642
},
{
"epoch": 0.052978495509598746,
"grad_norm": 4.898822144574726,
"learning_rate": 8.806584362139918e-06,
"loss": 0.4963,
"step": 643
},
{
"epoch": 0.05306088819312845,
"grad_norm": 5.754512361115338,
"learning_rate": 8.820301783264746e-06,
"loss": 0.7162,
"step": 644
},
{
"epoch": 0.05314328087665815,
"grad_norm": 3.6053519506068605,
"learning_rate": 8.834019204389576e-06,
"loss": 0.2067,
"step": 645
},
{
"epoch": 0.053225673560187856,
"grad_norm": 13.554388572711437,
"learning_rate": 8.847736625514404e-06,
"loss": 0.7916,
"step": 646
},
{
"epoch": 0.05330806624371756,
"grad_norm": 9.04741748435677,
"learning_rate": 8.861454046639232e-06,
"loss": 0.6137,
"step": 647
},
{
"epoch": 0.05339045892724726,
"grad_norm": 5.876495893543201,
"learning_rate": 8.87517146776406e-06,
"loss": 0.5187,
"step": 648
},
{
"epoch": 0.053472851610776966,
"grad_norm": 16.50292992475323,
"learning_rate": 8.888888888888888e-06,
"loss": 0.5344,
"step": 649
},
{
"epoch": 0.053555244294306664,
"grad_norm": 16.27884445947484,
"learning_rate": 8.902606310013718e-06,
"loss": 0.8425,
"step": 650
},
{
"epoch": 0.05363763697783637,
"grad_norm": 6.05812949734961,
"learning_rate": 8.916323731138546e-06,
"loss": 0.5581,
"step": 651
},
{
"epoch": 0.05372002966136607,
"grad_norm": 4.631703021154219,
"learning_rate": 8.930041152263376e-06,
"loss": 0.6053,
"step": 652
},
{
"epoch": 0.053802422344895774,
"grad_norm": 5.0840641963520925,
"learning_rate": 8.943758573388204e-06,
"loss": 0.4591,
"step": 653
},
{
"epoch": 0.05388481502842547,
"grad_norm": 11.145274102530228,
"learning_rate": 8.957475994513032e-06,
"loss": 0.4096,
"step": 654
},
{
"epoch": 0.05396720771195518,
"grad_norm": 5.150949637450351,
"learning_rate": 8.971193415637862e-06,
"loss": 0.6682,
"step": 655
},
{
"epoch": 0.054049600395484884,
"grad_norm": 6.053088680153872,
"learning_rate": 8.98491083676269e-06,
"loss": 0.6298,
"step": 656
},
{
"epoch": 0.05413199307901458,
"grad_norm": 4.576638977362141,
"learning_rate": 8.998628257887518e-06,
"loss": 0.5385,
"step": 657
},
{
"epoch": 0.05421438576254429,
"grad_norm": 5.279899070079792,
"learning_rate": 9.012345679012346e-06,
"loss": 0.5517,
"step": 658
},
{
"epoch": 0.05429677844607399,
"grad_norm": 9.816689344331193,
"learning_rate": 9.026063100137174e-06,
"loss": 0.528,
"step": 659
},
{
"epoch": 0.05437917112960369,
"grad_norm": 5.344268401980664,
"learning_rate": 9.039780521262004e-06,
"loss": 0.6586,
"step": 660
},
{
"epoch": 0.05446156381313339,
"grad_norm": 4.907761624647467,
"learning_rate": 9.053497942386832e-06,
"loss": 0.416,
"step": 661
},
{
"epoch": 0.0545439564966631,
"grad_norm": 8.816994387823925,
"learning_rate": 9.067215363511661e-06,
"loss": 0.8245,
"step": 662
},
{
"epoch": 0.054626349180192796,
"grad_norm": 6.742675433781916,
"learning_rate": 9.08093278463649e-06,
"loss": 0.4017,
"step": 663
},
{
"epoch": 0.0547087418637225,
"grad_norm": 5.140173007369312,
"learning_rate": 9.094650205761317e-06,
"loss": 0.5631,
"step": 664
},
{
"epoch": 0.05479113454725221,
"grad_norm": 6.004556037601133,
"learning_rate": 9.108367626886147e-06,
"loss": 0.5043,
"step": 665
},
{
"epoch": 0.054873527230781906,
"grad_norm": 11.766169776927814,
"learning_rate": 9.122085048010975e-06,
"loss": 0.5823,
"step": 666
},
{
"epoch": 0.05495591991431161,
"grad_norm": 3.9038912525359484,
"learning_rate": 9.135802469135803e-06,
"loss": 0.4185,
"step": 667
},
{
"epoch": 0.05503831259784131,
"grad_norm": 5.764415526037728,
"learning_rate": 9.149519890260631e-06,
"loss": 0.7135,
"step": 668
},
{
"epoch": 0.055120705281371016,
"grad_norm": 5.200914706950143,
"learning_rate": 9.16323731138546e-06,
"loss": 0.5353,
"step": 669
},
{
"epoch": 0.055203097964900714,
"grad_norm": 5.012014991774245,
"learning_rate": 9.17695473251029e-06,
"loss": 0.372,
"step": 670
},
{
"epoch": 0.05528549064843042,
"grad_norm": 2.8951621516844677,
"learning_rate": 9.190672153635117e-06,
"loss": 0.2485,
"step": 671
},
{
"epoch": 0.05536788333196012,
"grad_norm": 5.756434032002608,
"learning_rate": 9.204389574759945e-06,
"loss": 0.5129,
"step": 672
},
{
"epoch": 0.055450276015489824,
"grad_norm": 5.5513950603318785,
"learning_rate": 9.218106995884775e-06,
"loss": 0.5475,
"step": 673
},
{
"epoch": 0.05553266869901953,
"grad_norm": 7.155824300287789,
"learning_rate": 9.231824417009603e-06,
"loss": 0.6587,
"step": 674
},
{
"epoch": 0.05561506138254923,
"grad_norm": 6.693719659190916,
"learning_rate": 9.245541838134433e-06,
"loss": 0.6653,
"step": 675
},
{
"epoch": 0.055697454066078934,
"grad_norm": 5.899028184857873,
"learning_rate": 9.25925925925926e-06,
"loss": 0.5912,
"step": 676
},
{
"epoch": 0.05577984674960863,
"grad_norm": 4.996123098753804,
"learning_rate": 9.272976680384089e-06,
"loss": 0.4981,
"step": 677
},
{
"epoch": 0.05586223943313834,
"grad_norm": 7.59709581355784,
"learning_rate": 9.286694101508917e-06,
"loss": 0.6263,
"step": 678
},
{
"epoch": 0.05594463211666804,
"grad_norm": 7.995633663002308,
"learning_rate": 9.300411522633745e-06,
"loss": 0.6424,
"step": 679
},
{
"epoch": 0.05602702480019774,
"grad_norm": 4.669826110736232,
"learning_rate": 9.314128943758575e-06,
"loss": 0.4041,
"step": 680
},
{
"epoch": 0.05610941748372744,
"grad_norm": 6.343393787810919,
"learning_rate": 9.327846364883403e-06,
"loss": 0.7479,
"step": 681
},
{
"epoch": 0.05619181016725715,
"grad_norm": 96.70000934708833,
"learning_rate": 9.34156378600823e-06,
"loss": 2.6674,
"step": 682
},
{
"epoch": 0.05627420285078685,
"grad_norm": 5.214445104499895,
"learning_rate": 9.35528120713306e-06,
"loss": 0.3759,
"step": 683
},
{
"epoch": 0.05635659553431655,
"grad_norm": 6.426486685956931,
"learning_rate": 9.368998628257889e-06,
"loss": 0.7181,
"step": 684
},
{
"epoch": 0.05643898821784626,
"grad_norm": 7.141014135224707,
"learning_rate": 9.382716049382717e-06,
"loss": 0.7562,
"step": 685
},
{
"epoch": 0.056521380901375956,
"grad_norm": 5.988619932398916,
"learning_rate": 9.396433470507545e-06,
"loss": 0.3709,
"step": 686
},
{
"epoch": 0.05660377358490566,
"grad_norm": 5.891136898704754,
"learning_rate": 9.410150891632374e-06,
"loss": 0.6162,
"step": 687
},
{
"epoch": 0.05668616626843536,
"grad_norm": 6.170436173120623,
"learning_rate": 9.423868312757202e-06,
"loss": 0.576,
"step": 688
},
{
"epoch": 0.056768558951965066,
"grad_norm": 6.151013921299717,
"learning_rate": 9.43758573388203e-06,
"loss": 0.478,
"step": 689
},
{
"epoch": 0.056850951635494765,
"grad_norm": 6.117432660032868,
"learning_rate": 9.451303155006859e-06,
"loss": 0.5562,
"step": 690
},
{
"epoch": 0.05693334431902447,
"grad_norm": 6.9566546187232206,
"learning_rate": 9.465020576131688e-06,
"loss": 0.7103,
"step": 691
},
{
"epoch": 0.057015737002554176,
"grad_norm": 7.5413525683464435,
"learning_rate": 9.478737997256516e-06,
"loss": 0.5024,
"step": 692
},
{
"epoch": 0.057098129686083875,
"grad_norm": 6.783803405617549,
"learning_rate": 9.492455418381346e-06,
"loss": 0.6286,
"step": 693
},
{
"epoch": 0.05718052236961358,
"grad_norm": 7.745792551245552,
"learning_rate": 9.506172839506174e-06,
"loss": 0.5794,
"step": 694
},
{
"epoch": 0.05726291505314328,
"grad_norm": 5.774054351127429,
"learning_rate": 9.519890260631002e-06,
"loss": 0.5072,
"step": 695
},
{
"epoch": 0.057345307736672985,
"grad_norm": 5.0098435672277555,
"learning_rate": 9.53360768175583e-06,
"loss": 0.5214,
"step": 696
},
{
"epoch": 0.05742770042020268,
"grad_norm": 6.134234294504796,
"learning_rate": 9.547325102880658e-06,
"loss": 0.6675,
"step": 697
},
{
"epoch": 0.05751009310373239,
"grad_norm": 8.689201856152978,
"learning_rate": 9.561042524005488e-06,
"loss": 0.5794,
"step": 698
},
{
"epoch": 0.05759248578726209,
"grad_norm": 5.8119206456550145,
"learning_rate": 9.574759945130316e-06,
"loss": 0.3022,
"step": 699
},
{
"epoch": 0.05767487847079179,
"grad_norm": 32.92612650154318,
"learning_rate": 9.588477366255144e-06,
"loss": 0.527,
"step": 700
},
{
"epoch": 0.0577572711543215,
"grad_norm": 8.182146639732006,
"learning_rate": 9.602194787379974e-06,
"loss": 0.6942,
"step": 701
},
{
"epoch": 0.0578396638378512,
"grad_norm": 4.748298256564357,
"learning_rate": 9.615912208504802e-06,
"loss": 0.3809,
"step": 702
},
{
"epoch": 0.0579220565213809,
"grad_norm": 7.767690253299567,
"learning_rate": 9.62962962962963e-06,
"loss": 0.7925,
"step": 703
},
{
"epoch": 0.0580044492049106,
"grad_norm": 6.152146994551039,
"learning_rate": 9.64334705075446e-06,
"loss": 0.6161,
"step": 704
},
{
"epoch": 0.05808684188844031,
"grad_norm": 5.059103423212747,
"learning_rate": 9.657064471879288e-06,
"loss": 0.32,
"step": 705
},
{
"epoch": 0.058169234571970006,
"grad_norm": 5.104441529492062,
"learning_rate": 9.670781893004116e-06,
"loss": 0.4551,
"step": 706
},
{
"epoch": 0.05825162725549971,
"grad_norm": 10.994238478560392,
"learning_rate": 9.684499314128944e-06,
"loss": 0.7538,
"step": 707
},
{
"epoch": 0.05833401993902942,
"grad_norm": 4.968918212244643,
"learning_rate": 9.698216735253772e-06,
"loss": 0.5932,
"step": 708
},
{
"epoch": 0.058416412622559116,
"grad_norm": 6.423172055805545,
"learning_rate": 9.711934156378602e-06,
"loss": 0.7786,
"step": 709
},
{
"epoch": 0.05849880530608882,
"grad_norm": 3.880934215636923,
"learning_rate": 9.72565157750343e-06,
"loss": 0.4232,
"step": 710
},
{
"epoch": 0.05858119798961852,
"grad_norm": 8.269601261803912,
"learning_rate": 9.73936899862826e-06,
"loss": 0.6481,
"step": 711
},
{
"epoch": 0.058663590673148226,
"grad_norm": 3.8961399197860658,
"learning_rate": 9.753086419753087e-06,
"loss": 0.3662,
"step": 712
},
{
"epoch": 0.058745983356677925,
"grad_norm": 6.851544898008151,
"learning_rate": 9.766803840877916e-06,
"loss": 0.6113,
"step": 713
},
{
"epoch": 0.05882837604020763,
"grad_norm": 4.163553582824758,
"learning_rate": 9.780521262002745e-06,
"loss": 0.4059,
"step": 714
},
{
"epoch": 0.05891076872373733,
"grad_norm": 8.045649533095332,
"learning_rate": 9.794238683127573e-06,
"loss": 0.5178,
"step": 715
},
{
"epoch": 0.058993161407267035,
"grad_norm": 8.105313201818435,
"learning_rate": 9.807956104252401e-06,
"loss": 0.6752,
"step": 716
},
{
"epoch": 0.05907555409079674,
"grad_norm": 7.225672961161458,
"learning_rate": 9.82167352537723e-06,
"loss": 0.557,
"step": 717
},
{
"epoch": 0.05915794677432644,
"grad_norm": 4.4835661046768776,
"learning_rate": 9.835390946502057e-06,
"loss": 0.4336,
"step": 718
},
{
"epoch": 0.059240339457856145,
"grad_norm": 4.824952188625617,
"learning_rate": 9.849108367626887e-06,
"loss": 0.4902,
"step": 719
},
{
"epoch": 0.059322732141385844,
"grad_norm": 4.503287721058772,
"learning_rate": 9.862825788751715e-06,
"loss": 0.4704,
"step": 720
},
{
"epoch": 0.05940512482491555,
"grad_norm": 3.7716661123547413,
"learning_rate": 9.876543209876543e-06,
"loss": 0.2055,
"step": 721
},
{
"epoch": 0.05948751750844525,
"grad_norm": 5.833818295505862,
"learning_rate": 9.890260631001373e-06,
"loss": 0.3876,
"step": 722
},
{
"epoch": 0.059569910191974954,
"grad_norm": 8.755791086817371,
"learning_rate": 9.903978052126201e-06,
"loss": 0.687,
"step": 723
},
{
"epoch": 0.05965230287550465,
"grad_norm": 6.667121450588804,
"learning_rate": 9.91769547325103e-06,
"loss": 0.4723,
"step": 724
},
{
"epoch": 0.05973469555903436,
"grad_norm": 4.706421774706928,
"learning_rate": 9.931412894375859e-06,
"loss": 0.5538,
"step": 725
},
{
"epoch": 0.059817088242564063,
"grad_norm": 10.070112083827578,
"learning_rate": 9.945130315500687e-06,
"loss": 0.8218,
"step": 726
},
{
"epoch": 0.05989948092609376,
"grad_norm": 5.252163320718281,
"learning_rate": 9.958847736625515e-06,
"loss": 0.4948,
"step": 727
},
{
"epoch": 0.05998187360962347,
"grad_norm": 9.412813237828644,
"learning_rate": 9.972565157750343e-06,
"loss": 0.81,
"step": 728
},
{
"epoch": 0.060064266293153167,
"grad_norm": 4.587877285973218,
"learning_rate": 9.986282578875173e-06,
"loss": 0.4304,
"step": 729
},
{
"epoch": 0.06014665897668287,
"grad_norm": 4.75570214128782,
"learning_rate": 1e-05,
"loss": 0.4285,
"step": 730
},
{
"epoch": 0.06022905166021257,
"grad_norm": 4.53172116436025,
"learning_rate": 9.999999955491562e-06,
"loss": 0.4013,
"step": 731
},
{
"epoch": 0.060311444343742276,
"grad_norm": 4.01036872409321,
"learning_rate": 9.999999821966245e-06,
"loss": 0.4827,
"step": 732
},
{
"epoch": 0.060393837027271975,
"grad_norm": 5.750887446275034,
"learning_rate": 9.999999599424054e-06,
"loss": 0.554,
"step": 733
},
{
"epoch": 0.06047622971080168,
"grad_norm": 11.992520749521988,
"learning_rate": 9.99999928786499e-06,
"loss": 0.5623,
"step": 734
},
{
"epoch": 0.060558622394331386,
"grad_norm": 8.169748148911982,
"learning_rate": 9.999998887289063e-06,
"loss": 0.5052,
"step": 735
},
{
"epoch": 0.060641015077861085,
"grad_norm": 4.563084100494972,
"learning_rate": 9.999998397696277e-06,
"loss": 0.2568,
"step": 736
},
{
"epoch": 0.06072340776139079,
"grad_norm": 5.198391042090528,
"learning_rate": 9.999997819086641e-06,
"loss": 0.4664,
"step": 737
},
{
"epoch": 0.06080580044492049,
"grad_norm": 5.384798403619332,
"learning_rate": 9.999997151460166e-06,
"loss": 0.4522,
"step": 738
},
{
"epoch": 0.060888193128450195,
"grad_norm": 4.41206122981705,
"learning_rate": 9.999996394816863e-06,
"loss": 0.36,
"step": 739
},
{
"epoch": 0.060970585811979894,
"grad_norm": 11.803173169655384,
"learning_rate": 9.999995549156746e-06,
"loss": 0.4347,
"step": 740
},
{
"epoch": 0.0610529784955096,
"grad_norm": 7.305968155022703,
"learning_rate": 9.999994614479829e-06,
"loss": 0.6298,
"step": 741
},
{
"epoch": 0.0611353711790393,
"grad_norm": 6.627646463372147,
"learning_rate": 9.999993590786133e-06,
"loss": 0.4627,
"step": 742
},
{
"epoch": 0.061217763862569004,
"grad_norm": 7.961247066335053,
"learning_rate": 9.999992478075669e-06,
"loss": 0.7048,
"step": 743
},
{
"epoch": 0.06130015654609871,
"grad_norm": 11.004530222939419,
"learning_rate": 9.999991276348463e-06,
"loss": 0.3497,
"step": 744
},
{
"epoch": 0.06138254922962841,
"grad_norm": 7.087534685277768,
"learning_rate": 9.999989985604533e-06,
"loss": 0.7451,
"step": 745
},
{
"epoch": 0.061464941913158114,
"grad_norm": 4.0530498589819155,
"learning_rate": 9.999988605843905e-06,
"loss": 0.3691,
"step": 746
},
{
"epoch": 0.06154733459668781,
"grad_norm": 3.7550933352714186,
"learning_rate": 9.9999871370666e-06,
"loss": 0.2569,
"step": 747
},
{
"epoch": 0.06162972728021752,
"grad_norm": 5.5935186429304,
"learning_rate": 9.999985579272646e-06,
"loss": 0.687,
"step": 748
},
{
"epoch": 0.06171211996374722,
"grad_norm": 5.254856304496337,
"learning_rate": 9.99998393246207e-06,
"loss": 0.4639,
"step": 749
},
{
"epoch": 0.06179451264727692,
"grad_norm": 7.5048343045294965,
"learning_rate": 9.999982196634904e-06,
"loss": 0.7888,
"step": 750
},
{
"epoch": 0.06187690533080662,
"grad_norm": 6.480623845749662,
"learning_rate": 9.999980371791175e-06,
"loss": 0.7256,
"step": 751
},
{
"epoch": 0.06195929801433633,
"grad_norm": 3.675842921010652,
"learning_rate": 9.999978457930918e-06,
"loss": 0.4443,
"step": 752
},
{
"epoch": 0.06204169069786603,
"grad_norm": 16.26561159606985,
"learning_rate": 9.999976455054165e-06,
"loss": 0.7932,
"step": 753
},
{
"epoch": 0.06212408338139573,
"grad_norm": 5.493234318965289,
"learning_rate": 9.999974363160954e-06,
"loss": 0.3184,
"step": 754
},
{
"epoch": 0.06220647606492544,
"grad_norm": 5.1041491695509436,
"learning_rate": 9.999972182251323e-06,
"loss": 0.6043,
"step": 755
},
{
"epoch": 0.062288868748455135,
"grad_norm": 4.851912554069924,
"learning_rate": 9.999969912325307e-06,
"loss": 0.5401,
"step": 756
},
{
"epoch": 0.06237126143198484,
"grad_norm": 5.751575373973642,
"learning_rate": 9.999967553382947e-06,
"loss": 0.641,
"step": 757
},
{
"epoch": 0.06245365411551454,
"grad_norm": 9.081514003066161,
"learning_rate": 9.999965105424289e-06,
"loss": 0.6134,
"step": 758
},
{
"epoch": 0.06253604679904425,
"grad_norm": 4.811807174517819,
"learning_rate": 9.999962568449374e-06,
"loss": 0.4439,
"step": 759
},
{
"epoch": 0.06261843948257395,
"grad_norm": 4.62379971606797,
"learning_rate": 9.999959942458246e-06,
"loss": 0.6629,
"step": 760
},
{
"epoch": 0.06270083216610366,
"grad_norm": 11.03505749827588,
"learning_rate": 9.999957227450953e-06,
"loss": 0.6224,
"step": 761
},
{
"epoch": 0.06278322484963335,
"grad_norm": 25.986656518689404,
"learning_rate": 9.999954423427545e-06,
"loss": 0.3417,
"step": 762
},
{
"epoch": 0.06286561753316305,
"grad_norm": 7.11476454390553,
"learning_rate": 9.99995153038807e-06,
"loss": 0.6036,
"step": 763
},
{
"epoch": 0.06294801021669276,
"grad_norm": 7.766009203845259,
"learning_rate": 9.999948548332579e-06,
"loss": 0.5608,
"step": 764
},
{
"epoch": 0.06303040290022247,
"grad_norm": 5.920541614447556,
"learning_rate": 9.999945477261124e-06,
"loss": 0.5298,
"step": 765
},
{
"epoch": 0.06311279558375216,
"grad_norm": 12.67177199319788,
"learning_rate": 9.999942317173764e-06,
"loss": 0.7621,
"step": 766
},
{
"epoch": 0.06319518826728186,
"grad_norm": 14.925189851609852,
"learning_rate": 9.999939068070552e-06,
"loss": 0.6965,
"step": 767
},
{
"epoch": 0.06327758095081157,
"grad_norm": 3.6293587522698427,
"learning_rate": 9.999935729951547e-06,
"loss": 0.4481,
"step": 768
},
{
"epoch": 0.06335997363434127,
"grad_norm": 4.466378171099753,
"learning_rate": 9.999932302816808e-06,
"loss": 0.5852,
"step": 769
},
{
"epoch": 0.06344236631787098,
"grad_norm": 5.132249154356962,
"learning_rate": 9.999928786666395e-06,
"loss": 0.3901,
"step": 770
},
{
"epoch": 0.06352475900140067,
"grad_norm": 4.746744082094216,
"learning_rate": 9.999925181500372e-06,
"loss": 0.4565,
"step": 771
},
{
"epoch": 0.06360715168493038,
"grad_norm": 5.8813150292186505,
"learning_rate": 9.999921487318805e-06,
"loss": 0.3263,
"step": 772
},
{
"epoch": 0.06368954436846008,
"grad_norm": 4.410894729084581,
"learning_rate": 9.999917704121756e-06,
"loss": 0.345,
"step": 773
},
{
"epoch": 0.06377193705198979,
"grad_norm": 6.405832396274801,
"learning_rate": 9.999913831909292e-06,
"loss": 0.8081,
"step": 774
},
{
"epoch": 0.06385432973551948,
"grad_norm": 6.920285535512553,
"learning_rate": 9.999909870681486e-06,
"loss": 0.6784,
"step": 775
},
{
"epoch": 0.06393672241904919,
"grad_norm": 5.579593067307145,
"learning_rate": 9.999905820438407e-06,
"loss": 0.578,
"step": 776
},
{
"epoch": 0.06401911510257889,
"grad_norm": 6.074551579414587,
"learning_rate": 9.999901681180123e-06,
"loss": 0.5795,
"step": 777
},
{
"epoch": 0.0641015077861086,
"grad_norm": 7.013231380533223,
"learning_rate": 9.999897452906715e-06,
"loss": 0.437,
"step": 778
},
{
"epoch": 0.0641839004696383,
"grad_norm": 5.4113592934672985,
"learning_rate": 9.999893135618255e-06,
"loss": 0.5025,
"step": 779
},
{
"epoch": 0.064266293153168,
"grad_norm": 6.6071206609748865,
"learning_rate": 9.999888729314817e-06,
"loss": 0.7329,
"step": 780
},
{
"epoch": 0.0643486858366977,
"grad_norm": 5.440902339138829,
"learning_rate": 9.999884233996482e-06,
"loss": 0.4127,
"step": 781
},
{
"epoch": 0.0644310785202274,
"grad_norm": 4.994301771597056,
"learning_rate": 9.999879649663332e-06,
"loss": 0.5911,
"step": 782
},
{
"epoch": 0.06451347120375711,
"grad_norm": 6.0543541357228055,
"learning_rate": 9.999874976315443e-06,
"loss": 0.5825,
"step": 783
},
{
"epoch": 0.0645958638872868,
"grad_norm": 4.3097161136235655,
"learning_rate": 9.999870213952904e-06,
"loss": 0.5132,
"step": 784
},
{
"epoch": 0.06467825657081651,
"grad_norm": 6.888363932085567,
"learning_rate": 9.999865362575799e-06,
"loss": 0.7543,
"step": 785
},
{
"epoch": 0.06476064925434621,
"grad_norm": 4.724388546371243,
"learning_rate": 9.999860422184209e-06,
"loss": 0.5942,
"step": 786
},
{
"epoch": 0.06484304193787592,
"grad_norm": 7.4935192383230955,
"learning_rate": 9.999855392778228e-06,
"loss": 0.4375,
"step": 787
},
{
"epoch": 0.06492543462140563,
"grad_norm": 6.498332631896668,
"learning_rate": 9.999850274357943e-06,
"loss": 0.6782,
"step": 788
},
{
"epoch": 0.06500782730493532,
"grad_norm": 5.187935409845064,
"learning_rate": 9.999845066923445e-06,
"loss": 0.5646,
"step": 789
},
{
"epoch": 0.06509021998846502,
"grad_norm": 5.9252659829991075,
"learning_rate": 9.999839770474827e-06,
"loss": 0.4834,
"step": 790
},
{
"epoch": 0.06517261267199473,
"grad_norm": 4.848728708034422,
"learning_rate": 9.999834385012184e-06,
"loss": 0.4574,
"step": 791
},
{
"epoch": 0.06525500535552443,
"grad_norm": 4.1703778115278665,
"learning_rate": 9.999828910535612e-06,
"loss": 0.5278,
"step": 792
},
{
"epoch": 0.06533739803905413,
"grad_norm": 5.464196351317177,
"learning_rate": 9.999823347045206e-06,
"loss": 0.481,
"step": 793
},
{
"epoch": 0.06541979072258383,
"grad_norm": 9.661214620692148,
"learning_rate": 9.999817694541067e-06,
"loss": 0.7433,
"step": 794
},
{
"epoch": 0.06550218340611354,
"grad_norm": 6.375475870240334,
"learning_rate": 9.999811953023297e-06,
"loss": 0.6541,
"step": 795
},
{
"epoch": 0.06558457608964324,
"grad_norm": 5.1053662811437865,
"learning_rate": 9.999806122491998e-06,
"loss": 0.4034,
"step": 796
},
{
"epoch": 0.06566696877317295,
"grad_norm": 4.706843481268382,
"learning_rate": 9.99980020294727e-06,
"loss": 0.5359,
"step": 797
},
{
"epoch": 0.06574936145670264,
"grad_norm": 5.942544872541858,
"learning_rate": 9.99979419438922e-06,
"loss": 0.5096,
"step": 798
},
{
"epoch": 0.06583175414023235,
"grad_norm": 7.443525959047362,
"learning_rate": 9.999788096817957e-06,
"loss": 0.3826,
"step": 799
},
{
"epoch": 0.06591414682376205,
"grad_norm": 5.603804250629685,
"learning_rate": 9.999781910233589e-06,
"loss": 0.5671,
"step": 800
},
{
"epoch": 0.06599653950729176,
"grad_norm": 4.5063553175865705,
"learning_rate": 9.999775634636226e-06,
"loss": 0.4009,
"step": 801
},
{
"epoch": 0.06607893219082145,
"grad_norm": 8.076520038006072,
"learning_rate": 9.999769270025978e-06,
"loss": 0.8698,
"step": 802
},
{
"epoch": 0.06616132487435115,
"grad_norm": 5.500455378105405,
"learning_rate": 9.99976281640296e-06,
"loss": 0.4873,
"step": 803
},
{
"epoch": 0.06624371755788086,
"grad_norm": 3.348900655594007,
"learning_rate": 9.999756273767288e-06,
"loss": 0.4912,
"step": 804
},
{
"epoch": 0.06632611024141057,
"grad_norm": 4.699563891054634,
"learning_rate": 9.999749642119075e-06,
"loss": 0.4622,
"step": 805
},
{
"epoch": 0.06640850292494027,
"grad_norm": 6.752287413544824,
"learning_rate": 9.99974292145844e-06,
"loss": 0.7243,
"step": 806
},
{
"epoch": 0.06649089560846996,
"grad_norm": 7.99053258176144,
"learning_rate": 9.999736111785507e-06,
"loss": 0.6806,
"step": 807
},
{
"epoch": 0.06657328829199967,
"grad_norm": 4.006803923880306,
"learning_rate": 9.99972921310039e-06,
"loss": 0.3127,
"step": 808
},
{
"epoch": 0.06665568097552937,
"grad_norm": 5.684928104174344,
"learning_rate": 9.99972222540322e-06,
"loss": 0.3958,
"step": 809
},
{
"epoch": 0.06673807365905908,
"grad_norm": 6.2655870002623715,
"learning_rate": 9.999715148694114e-06,
"loss": 0.4125,
"step": 810
},
{
"epoch": 0.06682046634258877,
"grad_norm": 4.858773634051298,
"learning_rate": 9.999707982973203e-06,
"loss": 0.4663,
"step": 811
},
{
"epoch": 0.06690285902611848,
"grad_norm": 4.4468583578233645,
"learning_rate": 9.999700728240612e-06,
"loss": 0.4221,
"step": 812
},
{
"epoch": 0.06698525170964818,
"grad_norm": 5.30420324889732,
"learning_rate": 9.999693384496469e-06,
"loss": 0.6381,
"step": 813
},
{
"epoch": 0.06706764439317789,
"grad_norm": 5.03373160052799,
"learning_rate": 9.99968595174091e-06,
"loss": 0.5375,
"step": 814
},
{
"epoch": 0.0671500370767076,
"grad_norm": 4.342016897255091,
"learning_rate": 9.999678429974063e-06,
"loss": 0.5741,
"step": 815
},
{
"epoch": 0.06723242976023729,
"grad_norm": 4.020245998757659,
"learning_rate": 9.999670819196061e-06,
"loss": 0.5157,
"step": 816
},
{
"epoch": 0.06731482244376699,
"grad_norm": 5.037402626653137,
"learning_rate": 9.999663119407043e-06,
"loss": 0.5872,
"step": 817
},
{
"epoch": 0.0673972151272967,
"grad_norm": 5.351119050045274,
"learning_rate": 9.999655330607143e-06,
"loss": 0.4749,
"step": 818
},
{
"epoch": 0.0674796078108264,
"grad_norm": 3.9429223358828294,
"learning_rate": 9.999647452796502e-06,
"loss": 0.4117,
"step": 819
},
{
"epoch": 0.0675620004943561,
"grad_norm": 5.59405888819271,
"learning_rate": 9.99963948597526e-06,
"loss": 0.314,
"step": 820
},
{
"epoch": 0.0676443931778858,
"grad_norm": 4.687434371571609,
"learning_rate": 9.999631430143558e-06,
"loss": 0.5861,
"step": 821
},
{
"epoch": 0.0677267858614155,
"grad_norm": 4.995276130163719,
"learning_rate": 9.999623285301538e-06,
"loss": 0.4674,
"step": 822
},
{
"epoch": 0.06780917854494521,
"grad_norm": 4.533079712780735,
"learning_rate": 9.999615051449348e-06,
"loss": 0.5473,
"step": 823
},
{
"epoch": 0.06789157122847492,
"grad_norm": 4.406552313322744,
"learning_rate": 9.999606728587134e-06,
"loss": 0.6765,
"step": 824
},
{
"epoch": 0.06797396391200461,
"grad_norm": 5.688948880327145,
"learning_rate": 9.999598316715043e-06,
"loss": 0.5709,
"step": 825
},
{
"epoch": 0.06805635659553431,
"grad_norm": 4.327277968958632,
"learning_rate": 9.999589815833224e-06,
"loss": 0.3639,
"step": 826
},
{
"epoch": 0.06813874927906402,
"grad_norm": 5.27747946166832,
"learning_rate": 9.999581225941829e-06,
"loss": 0.3616,
"step": 827
},
{
"epoch": 0.06822114196259373,
"grad_norm": 4.249209100815019,
"learning_rate": 9.999572547041013e-06,
"loss": 0.4393,
"step": 828
},
{
"epoch": 0.06830353464612342,
"grad_norm": 7.1321873495194525,
"learning_rate": 9.999563779130928e-06,
"loss": 0.7852,
"step": 829
},
{
"epoch": 0.06838592732965312,
"grad_norm": 5.422129659924177,
"learning_rate": 9.999554922211732e-06,
"loss": 0.4847,
"step": 830
},
{
"epoch": 0.06846832001318283,
"grad_norm": 5.212554077950715,
"learning_rate": 9.99954597628358e-06,
"loss": 0.5653,
"step": 831
},
{
"epoch": 0.06855071269671253,
"grad_norm": 4.785517487664035,
"learning_rate": 9.999536941346635e-06,
"loss": 0.3497,
"step": 832
},
{
"epoch": 0.06863310538024224,
"grad_norm": 3.9488684791068165,
"learning_rate": 9.999527817401053e-06,
"loss": 0.4563,
"step": 833
},
{
"epoch": 0.06871549806377193,
"grad_norm": 5.63061411017133,
"learning_rate": 9.999518604447003e-06,
"loss": 0.5806,
"step": 834
},
{
"epoch": 0.06879789074730164,
"grad_norm": 4.343401510593881,
"learning_rate": 9.999509302484642e-06,
"loss": 0.5283,
"step": 835
},
{
"epoch": 0.06888028343083134,
"grad_norm": 4.23058174413733,
"learning_rate": 9.99949991151414e-06,
"loss": 0.558,
"step": 836
},
{
"epoch": 0.06896267611436105,
"grad_norm": 5.669493224488694,
"learning_rate": 9.999490431535664e-06,
"loss": 0.5318,
"step": 837
},
{
"epoch": 0.06904506879789074,
"grad_norm": 4.17773496395434,
"learning_rate": 9.999480862549383e-06,
"loss": 0.531,
"step": 838
},
{
"epoch": 0.06912746148142045,
"grad_norm": 5.0993501898509574,
"learning_rate": 9.999471204555464e-06,
"loss": 0.5367,
"step": 839
},
{
"epoch": 0.06920985416495015,
"grad_norm": 6.258867241760913,
"learning_rate": 9.99946145755408e-06,
"loss": 0.625,
"step": 840
},
{
"epoch": 0.06929224684847986,
"grad_norm": 6.14568981769412,
"learning_rate": 9.999451621545408e-06,
"loss": 0.6203,
"step": 841
},
{
"epoch": 0.06937463953200956,
"grad_norm": 4.50524135844046,
"learning_rate": 9.99944169652962e-06,
"loss": 0.4648,
"step": 842
},
{
"epoch": 0.06945703221553925,
"grad_norm": 4.759545744558735,
"learning_rate": 9.999431682506893e-06,
"loss": 0.3331,
"step": 843
},
{
"epoch": 0.06953942489906896,
"grad_norm": 4.161271349934926,
"learning_rate": 9.999421579477406e-06,
"loss": 0.3352,
"step": 844
},
{
"epoch": 0.06962181758259867,
"grad_norm": 4.557594062709752,
"learning_rate": 9.99941138744134e-06,
"loss": 0.6043,
"step": 845
},
{
"epoch": 0.06970421026612837,
"grad_norm": 5.655003710422895,
"learning_rate": 9.999401106398874e-06,
"loss": 0.5817,
"step": 846
},
{
"epoch": 0.06978660294965806,
"grad_norm": 4.925240291130296,
"learning_rate": 9.999390736350192e-06,
"loss": 0.5053,
"step": 847
},
{
"epoch": 0.06986899563318777,
"grad_norm": 5.566654097661635,
"learning_rate": 9.99938027729548e-06,
"loss": 0.5705,
"step": 848
},
{
"epoch": 0.06995138831671747,
"grad_norm": 4.473152490217241,
"learning_rate": 9.999369729234923e-06,
"loss": 0.3942,
"step": 849
},
{
"epoch": 0.07003378100024718,
"grad_norm": 3.0533580021318665,
"learning_rate": 9.999359092168707e-06,
"loss": 0.3085,
"step": 850
},
{
"epoch": 0.07011617368377689,
"grad_norm": 3.8516009324085747,
"learning_rate": 9.999348366097024e-06,
"loss": 0.3975,
"step": 851
},
{
"epoch": 0.07019856636730658,
"grad_norm": 5.43210019067299,
"learning_rate": 9.999337551020062e-06,
"loss": 0.5565,
"step": 852
},
{
"epoch": 0.07028095905083628,
"grad_norm": 5.9263640825101405,
"learning_rate": 9.999326646938019e-06,
"loss": 0.486,
"step": 853
},
{
"epoch": 0.07036335173436599,
"grad_norm": 4.914603335651239,
"learning_rate": 9.999315653851085e-06,
"loss": 0.5884,
"step": 854
},
{
"epoch": 0.0704457444178957,
"grad_norm": 4.624934035035607,
"learning_rate": 9.999304571759456e-06,
"loss": 0.446,
"step": 855
},
{
"epoch": 0.07052813710142539,
"grad_norm": 4.290804101316048,
"learning_rate": 9.99929340066333e-06,
"loss": 0.501,
"step": 856
},
{
"epoch": 0.07061052978495509,
"grad_norm": 9.34261485450524,
"learning_rate": 9.999282140562905e-06,
"loss": 0.7741,
"step": 857
},
{
"epoch": 0.0706929224684848,
"grad_norm": 4.590850263399005,
"learning_rate": 9.999270791458383e-06,
"loss": 0.4548,
"step": 858
},
{
"epoch": 0.0707753151520145,
"grad_norm": 4.7695609921791355,
"learning_rate": 9.999259353349964e-06,
"loss": 0.53,
"step": 859
},
{
"epoch": 0.07085770783554421,
"grad_norm": 3.904977095403514,
"learning_rate": 9.999247826237854e-06,
"loss": 0.3604,
"step": 860
},
{
"epoch": 0.0709401005190739,
"grad_norm": 8.682031296595586,
"learning_rate": 9.999236210122256e-06,
"loss": 0.78,
"step": 861
},
{
"epoch": 0.0710224932026036,
"grad_norm": 4.5193801163329494,
"learning_rate": 9.999224505003379e-06,
"loss": 0.5151,
"step": 862
},
{
"epoch": 0.07110488588613331,
"grad_norm": 6.453604475727589,
"learning_rate": 9.999212710881429e-06,
"loss": 0.7898,
"step": 863
},
{
"epoch": 0.07118727856966302,
"grad_norm": 5.0929895898718165,
"learning_rate": 9.99920082775662e-06,
"loss": 0.3436,
"step": 864
},
{
"epoch": 0.07126967125319271,
"grad_norm": 3.977322671420692,
"learning_rate": 9.999188855629159e-06,
"loss": 0.6677,
"step": 865
},
{
"epoch": 0.07135206393672242,
"grad_norm": 4.011549270668664,
"learning_rate": 9.99917679449926e-06,
"loss": 0.5224,
"step": 866
},
{
"epoch": 0.07143445662025212,
"grad_norm": 4.242067648981852,
"learning_rate": 9.999164644367139e-06,
"loss": 0.6407,
"step": 867
},
{
"epoch": 0.07151684930378183,
"grad_norm": 4.6287285335817225,
"learning_rate": 9.999152405233013e-06,
"loss": 0.4401,
"step": 868
},
{
"epoch": 0.07159924198731153,
"grad_norm": 7.160675102549129,
"learning_rate": 9.999140077097096e-06,
"loss": 0.6419,
"step": 869
},
{
"epoch": 0.07168163467084122,
"grad_norm": 5.074238907861252,
"learning_rate": 9.999127659959613e-06,
"loss": 0.6444,
"step": 870
},
{
"epoch": 0.07176402735437093,
"grad_norm": 3.6382794617465253,
"learning_rate": 9.999115153820782e-06,
"loss": 0.4528,
"step": 871
},
{
"epoch": 0.07184642003790064,
"grad_norm": 4.599057975224609,
"learning_rate": 9.999102558680827e-06,
"loss": 0.531,
"step": 872
},
{
"epoch": 0.07192881272143034,
"grad_norm": 4.5872619129769046,
"learning_rate": 9.999089874539968e-06,
"loss": 0.4827,
"step": 873
},
{
"epoch": 0.07201120540496005,
"grad_norm": 3.689648865108007,
"learning_rate": 9.999077101398437e-06,
"loss": 0.492,
"step": 874
},
{
"epoch": 0.07209359808848974,
"grad_norm": 5.8761872625125715,
"learning_rate": 9.999064239256459e-06,
"loss": 0.543,
"step": 875
},
{
"epoch": 0.07217599077201944,
"grad_norm": 5.870430873318824,
"learning_rate": 9.99905128811426e-06,
"loss": 0.5407,
"step": 876
},
{
"epoch": 0.07225838345554915,
"grad_norm": 6.386341937119981,
"learning_rate": 9.999038247972076e-06,
"loss": 0.6123,
"step": 877
},
{
"epoch": 0.07234077613907886,
"grad_norm": 5.280967236144987,
"learning_rate": 9.999025118830134e-06,
"loss": 0.4707,
"step": 878
},
{
"epoch": 0.07242316882260855,
"grad_norm": 5.724340686807843,
"learning_rate": 9.999011900688672e-06,
"loss": 0.6817,
"step": 879
},
{
"epoch": 0.07250556150613825,
"grad_norm": 5.777864365455918,
"learning_rate": 9.998998593547923e-06,
"loss": 0.7443,
"step": 880
},
{
"epoch": 0.07258795418966796,
"grad_norm": 4.481030640410297,
"learning_rate": 9.998985197408122e-06,
"loss": 0.4052,
"step": 881
},
{
"epoch": 0.07267034687319766,
"grad_norm": 5.7539373865432895,
"learning_rate": 9.998971712269512e-06,
"loss": 0.6414,
"step": 882
},
{
"epoch": 0.07275273955672737,
"grad_norm": 5.074248646089831,
"learning_rate": 9.99895813813233e-06,
"loss": 0.6889,
"step": 883
},
{
"epoch": 0.07283513224025706,
"grad_norm": 6.08389615807529,
"learning_rate": 9.998944474996817e-06,
"loss": 0.6358,
"step": 884
},
{
"epoch": 0.07291752492378677,
"grad_norm": 7.007593747793593,
"learning_rate": 9.99893072286322e-06,
"loss": 0.723,
"step": 885
},
{
"epoch": 0.07299991760731647,
"grad_norm": 5.295613649313166,
"learning_rate": 9.998916881731781e-06,
"loss": 0.5226,
"step": 886
},
{
"epoch": 0.07308231029084618,
"grad_norm": 5.1499745635077225,
"learning_rate": 9.998902951602746e-06,
"loss": 0.6138,
"step": 887
},
{
"epoch": 0.07316470297437587,
"grad_norm": 4.404865550062876,
"learning_rate": 9.998888932476365e-06,
"loss": 0.4733,
"step": 888
},
{
"epoch": 0.07324709565790558,
"grad_norm": 5.914838603616755,
"learning_rate": 9.998874824352887e-06,
"loss": 0.5345,
"step": 889
},
{
"epoch": 0.07332948834143528,
"grad_norm": 9.339300610542013,
"learning_rate": 9.99886062723256e-06,
"loss": 0.7452,
"step": 890
},
{
"epoch": 0.07341188102496499,
"grad_norm": 5.575845113308426,
"learning_rate": 9.998846341115642e-06,
"loss": 0.5513,
"step": 891
},
{
"epoch": 0.07349427370849469,
"grad_norm": 4.403803637718023,
"learning_rate": 9.998831966002385e-06,
"loss": 0.4927,
"step": 892
},
{
"epoch": 0.07357666639202438,
"grad_norm": 4.716127059524325,
"learning_rate": 9.998817501893044e-06,
"loss": 0.5894,
"step": 893
},
{
"epoch": 0.07365905907555409,
"grad_norm": 4.162456660043013,
"learning_rate": 9.998802948787878e-06,
"loss": 0.5348,
"step": 894
},
{
"epoch": 0.0737414517590838,
"grad_norm": 7.865128926097237,
"learning_rate": 9.998788306687144e-06,
"loss": 0.7982,
"step": 895
},
{
"epoch": 0.0738238444426135,
"grad_norm": 5.062814936226218,
"learning_rate": 9.998773575591105e-06,
"loss": 0.5444,
"step": 896
},
{
"epoch": 0.07390623712614319,
"grad_norm": 5.0766827309307345,
"learning_rate": 9.998758755500022e-06,
"loss": 0.5396,
"step": 897
},
{
"epoch": 0.0739886298096729,
"grad_norm": 5.588620720256569,
"learning_rate": 9.998743846414158e-06,
"loss": 0.607,
"step": 898
},
{
"epoch": 0.0740710224932026,
"grad_norm": 4.508870984621243,
"learning_rate": 9.998728848333781e-06,
"loss": 0.5817,
"step": 899
},
{
"epoch": 0.07415341517673231,
"grad_norm": 6.255880304807757,
"learning_rate": 9.998713761259157e-06,
"loss": 0.624,
"step": 900
},
{
"epoch": 0.07423580786026202,
"grad_norm": 6.448010541448679,
"learning_rate": 9.998698585190554e-06,
"loss": 0.592,
"step": 901
},
{
"epoch": 0.07431820054379171,
"grad_norm": 6.581360057122206,
"learning_rate": 9.998683320128242e-06,
"loss": 0.7091,
"step": 902
},
{
"epoch": 0.07440059322732141,
"grad_norm": 7.178574221854368,
"learning_rate": 9.998667966072492e-06,
"loss": 0.514,
"step": 903
},
{
"epoch": 0.07448298591085112,
"grad_norm": 5.640474989894486,
"learning_rate": 9.998652523023582e-06,
"loss": 0.5192,
"step": 904
},
{
"epoch": 0.07456537859438082,
"grad_norm": 4.859242002152594,
"learning_rate": 9.99863699098178e-06,
"loss": 0.393,
"step": 905
},
{
"epoch": 0.07464777127791052,
"grad_norm": 4.7363551715969905,
"learning_rate": 9.998621369947368e-06,
"loss": 0.5423,
"step": 906
},
{
"epoch": 0.07473016396144022,
"grad_norm": 7.11873781156602,
"learning_rate": 9.998605659920621e-06,
"loss": 0.6214,
"step": 907
},
{
"epoch": 0.07481255664496993,
"grad_norm": 4.642272372196515,
"learning_rate": 9.99858986090182e-06,
"loss": 0.3049,
"step": 908
},
{
"epoch": 0.07489494932849963,
"grad_norm": 4.975074076132109,
"learning_rate": 9.998573972891246e-06,
"loss": 0.5958,
"step": 909
},
{
"epoch": 0.07497734201202934,
"grad_norm": 6.247010066207867,
"learning_rate": 9.998557995889183e-06,
"loss": 0.5501,
"step": 910
},
{
"epoch": 0.07505973469555903,
"grad_norm": 6.758308077386494,
"learning_rate": 9.998541929895912e-06,
"loss": 0.4744,
"step": 911
},
{
"epoch": 0.07514212737908874,
"grad_norm": 6.321946117889499,
"learning_rate": 9.998525774911723e-06,
"loss": 0.4651,
"step": 912
},
{
"epoch": 0.07522452006261844,
"grad_norm": 5.910428930462289,
"learning_rate": 9.998509530936901e-06,
"loss": 0.5662,
"step": 913
},
{
"epoch": 0.07530691274614815,
"grad_norm": 4.4469553646271525,
"learning_rate": 9.998493197971737e-06,
"loss": 0.529,
"step": 914
},
{
"epoch": 0.07538930542967784,
"grad_norm": 6.494702959527984,
"learning_rate": 9.998476776016521e-06,
"loss": 0.5877,
"step": 915
},
{
"epoch": 0.07547169811320754,
"grad_norm": 7.669316446508771,
"learning_rate": 9.998460265071546e-06,
"loss": 0.6812,
"step": 916
},
{
"epoch": 0.07555409079673725,
"grad_norm": 14.38156671027216,
"learning_rate": 9.998443665137104e-06,
"loss": 0.8116,
"step": 917
},
{
"epoch": 0.07563648348026696,
"grad_norm": 8.288458207815854,
"learning_rate": 9.998426976213493e-06,
"loss": 0.656,
"step": 918
},
{
"epoch": 0.07571887616379666,
"grad_norm": 3.703380187983116,
"learning_rate": 9.998410198301007e-06,
"loss": 0.3688,
"step": 919
},
{
"epoch": 0.07580126884732635,
"grad_norm": 5.446454718623625,
"learning_rate": 9.99839333139995e-06,
"loss": 0.5218,
"step": 920
},
{
"epoch": 0.07588366153085606,
"grad_norm": 5.7339819465370825,
"learning_rate": 9.998376375510617e-06,
"loss": 0.4874,
"step": 921
},
{
"epoch": 0.07596605421438576,
"grad_norm": 5.321192426624907,
"learning_rate": 9.99835933063331e-06,
"loss": 0.3845,
"step": 922
},
{
"epoch": 0.07604844689791547,
"grad_norm": 5.797682110550929,
"learning_rate": 9.998342196768337e-06,
"loss": 0.4487,
"step": 923
},
{
"epoch": 0.07613083958144516,
"grad_norm": 6.324776101604922,
"learning_rate": 9.998324973915999e-06,
"loss": 0.5774,
"step": 924
},
{
"epoch": 0.07621323226497487,
"grad_norm": 6.886486065521574,
"learning_rate": 9.998307662076604e-06,
"loss": 0.5918,
"step": 925
},
{
"epoch": 0.07629562494850457,
"grad_norm": 4.598957573831131,
"learning_rate": 9.998290261250461e-06,
"loss": 0.5424,
"step": 926
},
{
"epoch": 0.07637801763203428,
"grad_norm": 5.054899919494791,
"learning_rate": 9.998272771437878e-06,
"loss": 0.3453,
"step": 927
},
{
"epoch": 0.07646041031556398,
"grad_norm": 4.915345392459135,
"learning_rate": 9.998255192639167e-06,
"loss": 0.5505,
"step": 928
},
{
"epoch": 0.07654280299909368,
"grad_norm": 13.147418222703912,
"learning_rate": 9.998237524854643e-06,
"loss": 0.7975,
"step": 929
},
{
"epoch": 0.07662519568262338,
"grad_norm": 4.249042164087028,
"learning_rate": 9.998219768084619e-06,
"loss": 0.5132,
"step": 930
},
{
"epoch": 0.07670758836615309,
"grad_norm": 4.6864159906629865,
"learning_rate": 9.998201922329409e-06,
"loss": 0.5093,
"step": 931
},
{
"epoch": 0.07678998104968279,
"grad_norm": 4.704532568350368,
"learning_rate": 9.998183987589332e-06,
"loss": 0.5385,
"step": 932
},
{
"epoch": 0.07687237373321248,
"grad_norm": 4.250326737192859,
"learning_rate": 9.99816596386471e-06,
"loss": 0.5081,
"step": 933
},
{
"epoch": 0.07695476641674219,
"grad_norm": 3.2609943143186393,
"learning_rate": 9.998147851155862e-06,
"loss": 0.3386,
"step": 934
},
{
"epoch": 0.0770371591002719,
"grad_norm": 4.686119380610933,
"learning_rate": 9.998129649463108e-06,
"loss": 0.4959,
"step": 935
},
{
"epoch": 0.0771195517838016,
"grad_norm": 8.093847783734134,
"learning_rate": 9.998111358786777e-06,
"loss": 0.6091,
"step": 936
},
{
"epoch": 0.07720194446733131,
"grad_norm": 4.128988601766982,
"learning_rate": 9.998092979127191e-06,
"loss": 0.3564,
"step": 937
},
{
"epoch": 0.077284337150861,
"grad_norm": 12.520048545236683,
"learning_rate": 9.998074510484679e-06,
"loss": 0.8451,
"step": 938
},
{
"epoch": 0.0773667298343907,
"grad_norm": 5.0581596409425025,
"learning_rate": 9.998055952859567e-06,
"loss": 0.4862,
"step": 939
},
{
"epoch": 0.07744912251792041,
"grad_norm": 6.315311219118197,
"learning_rate": 9.998037306252188e-06,
"loss": 0.5742,
"step": 940
},
{
"epoch": 0.07753151520145012,
"grad_norm": 5.819386724118808,
"learning_rate": 9.998018570662875e-06,
"loss": 0.6873,
"step": 941
},
{
"epoch": 0.07761390788497981,
"grad_norm": 6.606119564610502,
"learning_rate": 9.99799974609196e-06,
"loss": 0.6462,
"step": 942
},
{
"epoch": 0.07769630056850951,
"grad_norm": 4.9064741570302965,
"learning_rate": 9.997980832539775e-06,
"loss": 0.4563,
"step": 943
},
{
"epoch": 0.07777869325203922,
"grad_norm": 6.403026987978093,
"learning_rate": 9.997961830006663e-06,
"loss": 0.5919,
"step": 944
},
{
"epoch": 0.07786108593556892,
"grad_norm": 3.995002622383316,
"learning_rate": 9.997942738492959e-06,
"loss": 0.6035,
"step": 945
},
{
"epoch": 0.07794347861909863,
"grad_norm": 5.979508594311581,
"learning_rate": 9.997923557999001e-06,
"loss": 0.6438,
"step": 946
},
{
"epoch": 0.07802587130262832,
"grad_norm": 12.403971641606825,
"learning_rate": 9.997904288525133e-06,
"loss": 0.7055,
"step": 947
},
{
"epoch": 0.07810826398615803,
"grad_norm": 5.065356925074259,
"learning_rate": 9.997884930071698e-06,
"loss": 0.489,
"step": 948
},
{
"epoch": 0.07819065666968773,
"grad_norm": 6.739162552025105,
"learning_rate": 9.99786548263904e-06,
"loss": 0.6978,
"step": 949
},
{
"epoch": 0.07827304935321744,
"grad_norm": 5.869680200446658,
"learning_rate": 9.997845946227506e-06,
"loss": 0.6373,
"step": 950
},
{
"epoch": 0.07835544203674713,
"grad_norm": 5.498434870964948,
"learning_rate": 9.997826320837445e-06,
"loss": 0.5772,
"step": 951
},
{
"epoch": 0.07843783472027684,
"grad_norm": 6.304272494085451,
"learning_rate": 9.997806606469201e-06,
"loss": 0.3892,
"step": 952
},
{
"epoch": 0.07852022740380654,
"grad_norm": 6.170055796070485,
"learning_rate": 9.997786803123131e-06,
"loss": 0.5676,
"step": 953
},
{
"epoch": 0.07860262008733625,
"grad_norm": 4.355001196596673,
"learning_rate": 9.997766910799585e-06,
"loss": 0.3932,
"step": 954
},
{
"epoch": 0.07868501277086595,
"grad_norm": 4.587353573095941,
"learning_rate": 9.997746929498915e-06,
"loss": 0.4951,
"step": 955
},
{
"epoch": 0.07876740545439564,
"grad_norm": 4.7560661272306115,
"learning_rate": 9.997726859221482e-06,
"loss": 0.3379,
"step": 956
},
{
"epoch": 0.07884979813792535,
"grad_norm": 3.875204186498522,
"learning_rate": 9.997706699967638e-06,
"loss": 0.4937,
"step": 957
},
{
"epoch": 0.07893219082145506,
"grad_norm": 4.5436308343446665,
"learning_rate": 9.997686451737745e-06,
"loss": 0.4664,
"step": 958
},
{
"epoch": 0.07901458350498476,
"grad_norm": 5.085715886421841,
"learning_rate": 9.997666114532166e-06,
"loss": 0.3532,
"step": 959
},
{
"epoch": 0.07909697618851445,
"grad_norm": 5.736037074111639,
"learning_rate": 9.997645688351256e-06,
"loss": 0.6229,
"step": 960
},
{
"epoch": 0.07917936887204416,
"grad_norm": 4.6641539689143885,
"learning_rate": 9.997625173195384e-06,
"loss": 0.5927,
"step": 961
},
{
"epoch": 0.07926176155557386,
"grad_norm": 6.986142276021053,
"learning_rate": 9.997604569064913e-06,
"loss": 0.6011,
"step": 962
},
{
"epoch": 0.07934415423910357,
"grad_norm": 4.936390533586499,
"learning_rate": 9.99758387596021e-06,
"loss": 0.6004,
"step": 963
},
{
"epoch": 0.07942654692263328,
"grad_norm": 25.174262625674295,
"learning_rate": 9.997563093881647e-06,
"loss": 1.0481,
"step": 964
},
{
"epoch": 0.07950893960616297,
"grad_norm": 5.911910212113453,
"learning_rate": 9.997542222829588e-06,
"loss": 0.5712,
"step": 965
},
{
"epoch": 0.07959133228969267,
"grad_norm": 5.683154041671146,
"learning_rate": 9.997521262804408e-06,
"loss": 0.5152,
"step": 966
},
{
"epoch": 0.07967372497322238,
"grad_norm": 6.247591970007456,
"learning_rate": 9.997500213806481e-06,
"loss": 0.589,
"step": 967
},
{
"epoch": 0.07975611765675208,
"grad_norm": 6.037728527699037,
"learning_rate": 9.997479075836179e-06,
"loss": 0.4865,
"step": 968
},
{
"epoch": 0.07983851034028178,
"grad_norm": 4.848668306571336,
"learning_rate": 9.997457848893881e-06,
"loss": 0.5364,
"step": 969
},
{
"epoch": 0.07992090302381148,
"grad_norm": 5.738086820942736,
"learning_rate": 9.997436532979963e-06,
"loss": 0.4475,
"step": 970
},
{
"epoch": 0.08000329570734119,
"grad_norm": 7.7003958590754245,
"learning_rate": 9.997415128094805e-06,
"loss": 0.6297,
"step": 971
},
{
"epoch": 0.0800856883908709,
"grad_norm": 4.6301976482339375,
"learning_rate": 9.997393634238789e-06,
"loss": 0.4354,
"step": 972
},
{
"epoch": 0.0801680810744006,
"grad_norm": 5.051471456165753,
"learning_rate": 9.997372051412296e-06,
"loss": 0.5246,
"step": 973
},
{
"epoch": 0.08025047375793029,
"grad_norm": 6.840214215762055,
"learning_rate": 9.997350379615712e-06,
"loss": 0.6289,
"step": 974
},
{
"epoch": 0.08033286644146,
"grad_norm": 5.025951954709692,
"learning_rate": 9.997328618849422e-06,
"loss": 0.6347,
"step": 975
},
{
"epoch": 0.0804152591249897,
"grad_norm": 4.175068481847607,
"learning_rate": 9.997306769113812e-06,
"loss": 0.4474,
"step": 976
},
{
"epoch": 0.08049765180851941,
"grad_norm": 6.49606794332559,
"learning_rate": 9.997284830409275e-06,
"loss": 0.7058,
"step": 977
},
{
"epoch": 0.0805800444920491,
"grad_norm": 4.355993261479888,
"learning_rate": 9.997262802736197e-06,
"loss": 0.3175,
"step": 978
},
{
"epoch": 0.0806624371755788,
"grad_norm": 4.173278705106478,
"learning_rate": 9.997240686094974e-06,
"loss": 0.4082,
"step": 979
},
{
"epoch": 0.08074482985910851,
"grad_norm": 3.6798094983336638,
"learning_rate": 9.997218480485994e-06,
"loss": 0.266,
"step": 980
},
{
"epoch": 0.08082722254263822,
"grad_norm": 6.875883284171418,
"learning_rate": 9.997196185909662e-06,
"loss": 0.397,
"step": 981
},
{
"epoch": 0.08090961522616792,
"grad_norm": 4.7547519593139125,
"learning_rate": 9.997173802366365e-06,
"loss": 0.6101,
"step": 982
},
{
"epoch": 0.08099200790969761,
"grad_norm": 7.068828139356332,
"learning_rate": 9.997151329856508e-06,
"loss": 0.753,
"step": 983
},
{
"epoch": 0.08107440059322732,
"grad_norm": 5.32212084183463,
"learning_rate": 9.997128768380486e-06,
"loss": 0.5187,
"step": 984
},
{
"epoch": 0.08115679327675702,
"grad_norm": 4.63635990769238,
"learning_rate": 9.997106117938704e-06,
"loss": 0.5448,
"step": 985
},
{
"epoch": 0.08123918596028673,
"grad_norm": 5.156911960898855,
"learning_rate": 9.997083378531567e-06,
"loss": 0.6237,
"step": 986
},
{
"epoch": 0.08132157864381642,
"grad_norm": 66.02610969540054,
"learning_rate": 9.997060550159477e-06,
"loss": 2.7918,
"step": 987
},
{
"epoch": 0.08140397132734613,
"grad_norm": 4.63314347325949,
"learning_rate": 9.997037632822839e-06,
"loss": 0.4784,
"step": 988
},
{
"epoch": 0.08148636401087583,
"grad_norm": 17.125886475254017,
"learning_rate": 9.997014626522064e-06,
"loss": 0.5182,
"step": 989
},
{
"epoch": 0.08156875669440554,
"grad_norm": 5.112815655421804,
"learning_rate": 9.99699153125756e-06,
"loss": 0.5992,
"step": 990
},
{
"epoch": 0.08165114937793524,
"grad_norm": 4.634165444142863,
"learning_rate": 9.996968347029739e-06,
"loss": 0.5552,
"step": 991
},
{
"epoch": 0.08173354206146494,
"grad_norm": 4.500448623996411,
"learning_rate": 9.996945073839015e-06,
"loss": 0.5293,
"step": 992
},
{
"epoch": 0.08181593474499464,
"grad_norm": 4.985902758629872,
"learning_rate": 9.996921711685798e-06,
"loss": 0.5077,
"step": 993
},
{
"epoch": 0.08189832742852435,
"grad_norm": 11.160741895344973,
"learning_rate": 9.99689826057051e-06,
"loss": 0.6087,
"step": 994
},
{
"epoch": 0.08198072011205405,
"grad_norm": 5.767479903001831,
"learning_rate": 9.996874720493563e-06,
"loss": 0.5006,
"step": 995
},
{
"epoch": 0.08206311279558375,
"grad_norm": 4.604269350433775,
"learning_rate": 9.996851091455379e-06,
"loss": 0.4231,
"step": 996
},
{
"epoch": 0.08214550547911345,
"grad_norm": 6.964228664965358,
"learning_rate": 9.996827373456379e-06,
"loss": 0.7993,
"step": 997
},
{
"epoch": 0.08222789816264316,
"grad_norm": 4.503845133301578,
"learning_rate": 9.996803566496982e-06,
"loss": 0.574,
"step": 998
},
{
"epoch": 0.08231029084617286,
"grad_norm": 5.102706241374167,
"learning_rate": 9.996779670577615e-06,
"loss": 0.523,
"step": 999
},
{
"epoch": 0.08239268352970257,
"grad_norm": 4.718633665300077,
"learning_rate": 9.996755685698703e-06,
"loss": 0.4039,
"step": 1000
},
{
"epoch": 0.08247507621323226,
"grad_norm": 31.817192653927222,
"learning_rate": 9.996731611860674e-06,
"loss": 0.4298,
"step": 1001
},
{
"epoch": 0.08255746889676197,
"grad_norm": 21.27663857241567,
"learning_rate": 9.996707449063952e-06,
"loss": 0.2222,
"step": 1002
},
{
"epoch": 0.08263986158029167,
"grad_norm": 4.401888936601498,
"learning_rate": 9.996683197308973e-06,
"loss": 0.4995,
"step": 1003
},
{
"epoch": 0.08272225426382138,
"grad_norm": 5.508952355584309,
"learning_rate": 9.996658856596165e-06,
"loss": 0.5681,
"step": 1004
},
{
"epoch": 0.08280464694735108,
"grad_norm": 4.992828446636793,
"learning_rate": 9.996634426925962e-06,
"loss": 0.4845,
"step": 1005
},
{
"epoch": 0.08288703963088077,
"grad_norm": 7.4045853788823015,
"learning_rate": 9.9966099082988e-06,
"loss": 0.6107,
"step": 1006
},
{
"epoch": 0.08296943231441048,
"grad_norm": 7.359292979226814,
"learning_rate": 9.996585300715117e-06,
"loss": 0.7944,
"step": 1007
},
{
"epoch": 0.08305182499794019,
"grad_norm": 5.680106009666031,
"learning_rate": 9.996560604175344e-06,
"loss": 0.4504,
"step": 1008
},
{
"epoch": 0.08313421768146989,
"grad_norm": 4.403376579772996,
"learning_rate": 9.99653581867993e-06,
"loss": 0.5481,
"step": 1009
},
{
"epoch": 0.08321661036499958,
"grad_norm": 10.223711349213156,
"learning_rate": 9.99651094422931e-06,
"loss": 0.9755,
"step": 1010
},
{
"epoch": 0.08329900304852929,
"grad_norm": 4.683680775421051,
"learning_rate": 9.99648598082393e-06,
"loss": 0.6508,
"step": 1011
},
{
"epoch": 0.083381395732059,
"grad_norm": 6.840856586449906,
"learning_rate": 9.99646092846423e-06,
"loss": 0.6494,
"step": 1012
},
{
"epoch": 0.0834637884155887,
"grad_norm": 5.564536440133124,
"learning_rate": 9.996435787150663e-06,
"loss": 0.6494,
"step": 1013
},
{
"epoch": 0.0835461810991184,
"grad_norm": 8.55630832138283,
"learning_rate": 9.996410556883672e-06,
"loss": 0.5978,
"step": 1014
},
{
"epoch": 0.0836285737826481,
"grad_norm": 4.505682298137676,
"learning_rate": 9.996385237663706e-06,
"loss": 0.4981,
"step": 1015
},
{
"epoch": 0.0837109664661778,
"grad_norm": 6.026491102637283,
"learning_rate": 9.996359829491218e-06,
"loss": 0.6929,
"step": 1016
},
{
"epoch": 0.08379335914970751,
"grad_norm": 5.23031637253995,
"learning_rate": 9.996334332366658e-06,
"loss": 0.468,
"step": 1017
},
{
"epoch": 0.08387575183323721,
"grad_norm": 6.02650829355012,
"learning_rate": 9.996308746290482e-06,
"loss": 0.6166,
"step": 1018
},
{
"epoch": 0.0839581445167669,
"grad_norm": 4.1819439593996535,
"learning_rate": 9.996283071263145e-06,
"loss": 0.4417,
"step": 1019
},
{
"epoch": 0.08404053720029661,
"grad_norm": 5.489497484777263,
"learning_rate": 9.996257307285102e-06,
"loss": 0.441,
"step": 1020
},
{
"epoch": 0.08412292988382632,
"grad_norm": 6.1845291497461545,
"learning_rate": 9.996231454356814e-06,
"loss": 0.8055,
"step": 1021
},
{
"epoch": 0.08420532256735602,
"grad_norm": 6.335171357274994,
"learning_rate": 9.996205512478741e-06,
"loss": 0.7177,
"step": 1022
},
{
"epoch": 0.08428771525088573,
"grad_norm": 5.77099800958352,
"learning_rate": 9.996179481651345e-06,
"loss": 0.6201,
"step": 1023
},
{
"epoch": 0.08437010793441542,
"grad_norm": 5.277909161458072,
"learning_rate": 9.996153361875086e-06,
"loss": 0.5087,
"step": 1024
},
{
"epoch": 0.08445250061794513,
"grad_norm": 4.995606457759668,
"learning_rate": 9.996127153150436e-06,
"loss": 0.4032,
"step": 1025
},
{
"epoch": 0.08453489330147483,
"grad_norm": 4.431456907318335,
"learning_rate": 9.996100855477856e-06,
"loss": 0.3881,
"step": 1026
},
{
"epoch": 0.08461728598500454,
"grad_norm": 3.511167845084898,
"learning_rate": 9.996074468857815e-06,
"loss": 0.4317,
"step": 1027
},
{
"epoch": 0.08469967866853423,
"grad_norm": 4.355083337852244,
"learning_rate": 9.996047993290784e-06,
"loss": 0.481,
"step": 1028
},
{
"epoch": 0.08478207135206393,
"grad_norm": 4.7477919241040425,
"learning_rate": 9.996021428777234e-06,
"loss": 0.4123,
"step": 1029
},
{
"epoch": 0.08486446403559364,
"grad_norm": 4.845625383441968,
"learning_rate": 9.99599477531764e-06,
"loss": 0.7018,
"step": 1030
},
{
"epoch": 0.08494685671912335,
"grad_norm": 5.263537268769247,
"learning_rate": 9.995968032912471e-06,
"loss": 0.5584,
"step": 1031
},
{
"epoch": 0.08502924940265305,
"grad_norm": 4.316936086223206,
"learning_rate": 9.995941201562207e-06,
"loss": 0.5342,
"step": 1032
},
{
"epoch": 0.08511164208618274,
"grad_norm": 4.862616503432274,
"learning_rate": 9.995914281267326e-06,
"loss": 0.5874,
"step": 1033
},
{
"epoch": 0.08519403476971245,
"grad_norm": 6.347242300001709,
"learning_rate": 9.995887272028307e-06,
"loss": 0.6603,
"step": 1034
},
{
"epoch": 0.08527642745324215,
"grad_norm": 4.525346388262154,
"learning_rate": 9.995860173845629e-06,
"loss": 0.441,
"step": 1035
},
{
"epoch": 0.08535882013677186,
"grad_norm": 4.606242184878409,
"learning_rate": 9.995832986719776e-06,
"loss": 0.5658,
"step": 1036
},
{
"epoch": 0.08544121282030155,
"grad_norm": 5.55541041688934,
"learning_rate": 9.995805710651233e-06,
"loss": 0.5883,
"step": 1037
},
{
"epoch": 0.08552360550383126,
"grad_norm": 4.805999702498609,
"learning_rate": 9.995778345640481e-06,
"loss": 0.5197,
"step": 1038
},
{
"epoch": 0.08560599818736096,
"grad_norm": 5.6071103942756535,
"learning_rate": 9.995750891688013e-06,
"loss": 0.4935,
"step": 1039
},
{
"epoch": 0.08568839087089067,
"grad_norm": 50.57617337717754,
"learning_rate": 9.995723348794315e-06,
"loss": 2.4806,
"step": 1040
},
{
"epoch": 0.08577078355442037,
"grad_norm": 6.25280098712178,
"learning_rate": 9.995695716959877e-06,
"loss": 0.6218,
"step": 1041
},
{
"epoch": 0.08585317623795007,
"grad_norm": 3.9373446441524025,
"learning_rate": 9.995667996185193e-06,
"loss": 0.5496,
"step": 1042
},
{
"epoch": 0.08593556892147977,
"grad_norm": 4.343696783721304,
"learning_rate": 9.995640186470755e-06,
"loss": 0.4876,
"step": 1043
},
{
"epoch": 0.08601796160500948,
"grad_norm": 5.0506015991461,
"learning_rate": 9.995612287817056e-06,
"loss": 0.5382,
"step": 1044
},
{
"epoch": 0.08610035428853918,
"grad_norm": 3.3472528865228015,
"learning_rate": 9.995584300224597e-06,
"loss": 0.4219,
"step": 1045
},
{
"epoch": 0.08618274697206887,
"grad_norm": 3.6590082207213666,
"learning_rate": 9.995556223693874e-06,
"loss": 0.4519,
"step": 1046
},
{
"epoch": 0.08626513965559858,
"grad_norm": 3.3110168095564414,
"learning_rate": 9.995528058225386e-06,
"loss": 0.3475,
"step": 1047
},
{
"epoch": 0.08634753233912829,
"grad_norm": 5.863164197441982,
"learning_rate": 9.995499803819637e-06,
"loss": 0.6212,
"step": 1048
},
{
"epoch": 0.08642992502265799,
"grad_norm": 5.518382693746084,
"learning_rate": 9.995471460477127e-06,
"loss": 0.5021,
"step": 1049
},
{
"epoch": 0.0865123177061877,
"grad_norm": 6.110381483524074,
"learning_rate": 9.995443028198362e-06,
"loss": 0.4432,
"step": 1050
},
{
"epoch": 0.08659471038971739,
"grad_norm": 4.892003499667656,
"learning_rate": 9.99541450698385e-06,
"loss": 0.3957,
"step": 1051
},
{
"epoch": 0.0866771030732471,
"grad_norm": 7.104164940402655,
"learning_rate": 9.995385896834095e-06,
"loss": 0.676,
"step": 1052
},
{
"epoch": 0.0867594957567768,
"grad_norm": 11.570101337811126,
"learning_rate": 9.995357197749611e-06,
"loss": 0.5474,
"step": 1053
},
{
"epoch": 0.0868418884403065,
"grad_norm": 4.624106631340867,
"learning_rate": 9.995328409730905e-06,
"loss": 0.506,
"step": 1054
},
{
"epoch": 0.0869242811238362,
"grad_norm": 5.743273453010476,
"learning_rate": 9.99529953277849e-06,
"loss": 0.3698,
"step": 1055
},
{
"epoch": 0.0870066738073659,
"grad_norm": 8.029930627600796,
"learning_rate": 9.995270566892884e-06,
"loss": 0.5471,
"step": 1056
},
{
"epoch": 0.08708906649089561,
"grad_norm": 5.194894530300615,
"learning_rate": 9.995241512074596e-06,
"loss": 0.6335,
"step": 1057
},
{
"epoch": 0.08717145917442531,
"grad_norm": 6.84466633274184,
"learning_rate": 9.995212368324147e-06,
"loss": 0.5793,
"step": 1058
},
{
"epoch": 0.08725385185795502,
"grad_norm": 7.268033072915504,
"learning_rate": 9.99518313564206e-06,
"loss": 0.653,
"step": 1059
},
{
"epoch": 0.08733624454148471,
"grad_norm": 4.021612447179571,
"learning_rate": 9.995153814028846e-06,
"loss": 0.4125,
"step": 1060
},
{
"epoch": 0.08741863722501442,
"grad_norm": 5.826262293101434,
"learning_rate": 9.995124403485036e-06,
"loss": 0.5812,
"step": 1061
},
{
"epoch": 0.08750102990854412,
"grad_norm": 4.557140038560946,
"learning_rate": 9.995094904011148e-06,
"loss": 0.3531,
"step": 1062
},
{
"epoch": 0.08758342259207383,
"grad_norm": 7.238981062698408,
"learning_rate": 9.99506531560771e-06,
"loss": 0.7622,
"step": 1063
},
{
"epoch": 0.08766581527560352,
"grad_norm": 4.21469929038344,
"learning_rate": 9.995035638275248e-06,
"loss": 0.6258,
"step": 1064
},
{
"epoch": 0.08774820795913323,
"grad_norm": 3.6655616059927345,
"learning_rate": 9.995005872014289e-06,
"loss": 0.3423,
"step": 1065
},
{
"epoch": 0.08783060064266293,
"grad_norm": 5.412977135346199,
"learning_rate": 9.994976016825367e-06,
"loss": 0.6841,
"step": 1066
},
{
"epoch": 0.08791299332619264,
"grad_norm": 5.379025435070761,
"learning_rate": 9.994946072709007e-06,
"loss": 0.6847,
"step": 1067
},
{
"epoch": 0.08799538600972234,
"grad_norm": 4.472227140989464,
"learning_rate": 9.994916039665748e-06,
"loss": 0.4647,
"step": 1068
},
{
"epoch": 0.08807777869325203,
"grad_norm": 5.060338129572461,
"learning_rate": 9.994885917696122e-06,
"loss": 0.6175,
"step": 1069
},
{
"epoch": 0.08816017137678174,
"grad_norm": 4.715322598305957,
"learning_rate": 9.994855706800666e-06,
"loss": 0.4338,
"step": 1070
},
{
"epoch": 0.08824256406031145,
"grad_norm": 4.281526810708517,
"learning_rate": 9.994825406979918e-06,
"loss": 0.5457,
"step": 1071
},
{
"epoch": 0.08832495674384115,
"grad_norm": 3.4716224269919476,
"learning_rate": 9.994795018234416e-06,
"loss": 0.4955,
"step": 1072
},
{
"epoch": 0.08840734942737084,
"grad_norm": 5.027771756957362,
"learning_rate": 9.994764540564702e-06,
"loss": 0.6585,
"step": 1073
},
{
"epoch": 0.08848974211090055,
"grad_norm": 4.724781093913095,
"learning_rate": 9.99473397397132e-06,
"loss": 0.4618,
"step": 1074
},
{
"epoch": 0.08857213479443025,
"grad_norm": 6.9846657667334915,
"learning_rate": 9.99470331845481e-06,
"loss": 0.5641,
"step": 1075
},
{
"epoch": 0.08865452747795996,
"grad_norm": 5.484701110827303,
"learning_rate": 9.994672574015724e-06,
"loss": 0.542,
"step": 1076
},
{
"epoch": 0.08873692016148967,
"grad_norm": 7.037433703723267,
"learning_rate": 9.994641740654604e-06,
"loss": 0.4367,
"step": 1077
},
{
"epoch": 0.08881931284501936,
"grad_norm": 5.776582968652833,
"learning_rate": 9.994610818372002e-06,
"loss": 0.5423,
"step": 1078
},
{
"epoch": 0.08890170552854906,
"grad_norm": 6.745245173679915,
"learning_rate": 9.994579807168468e-06,
"loss": 0.698,
"step": 1079
},
{
"epoch": 0.08898409821207877,
"grad_norm": 5.473254655410174,
"learning_rate": 9.994548707044551e-06,
"loss": 0.4812,
"step": 1080
},
{
"epoch": 0.08906649089560847,
"grad_norm": 4.628085367767044,
"learning_rate": 9.994517518000809e-06,
"loss": 0.4693,
"step": 1081
},
{
"epoch": 0.08914888357913817,
"grad_norm": 7.525465774535276,
"learning_rate": 9.994486240037794e-06,
"loss": 0.6911,
"step": 1082
},
{
"epoch": 0.08923127626266787,
"grad_norm": 5.893129420637143,
"learning_rate": 9.994454873156068e-06,
"loss": 0.6289,
"step": 1083
},
{
"epoch": 0.08931366894619758,
"grad_norm": 5.50149802924285,
"learning_rate": 9.994423417356183e-06,
"loss": 0.6096,
"step": 1084
},
{
"epoch": 0.08939606162972728,
"grad_norm": 4.773359114172165,
"learning_rate": 9.994391872638702e-06,
"loss": 0.4555,
"step": 1085
},
{
"epoch": 0.08947845431325699,
"grad_norm": 7.212055616353917,
"learning_rate": 9.994360239004186e-06,
"loss": 0.7443,
"step": 1086
},
{
"epoch": 0.08956084699678668,
"grad_norm": 7.876765760130183,
"learning_rate": 9.9943285164532e-06,
"loss": 0.7077,
"step": 1087
},
{
"epoch": 0.08964323968031639,
"grad_norm": 7.317645647763081,
"learning_rate": 9.994296704986306e-06,
"loss": 0.6041,
"step": 1088
},
{
"epoch": 0.08972563236384609,
"grad_norm": 4.530023086536175,
"learning_rate": 9.994264804604073e-06,
"loss": 0.4931,
"step": 1089
},
{
"epoch": 0.0898080250473758,
"grad_norm": 4.2526214869784065,
"learning_rate": 9.994232815307065e-06,
"loss": 0.2995,
"step": 1090
},
{
"epoch": 0.08989041773090549,
"grad_norm": 3.505242547342838,
"learning_rate": 9.994200737095857e-06,
"loss": 0.4473,
"step": 1091
},
{
"epoch": 0.0899728104144352,
"grad_norm": 4.429638177463622,
"learning_rate": 9.994168569971017e-06,
"loss": 0.5841,
"step": 1092
},
{
"epoch": 0.0900552030979649,
"grad_norm": 5.82955819981545,
"learning_rate": 9.994136313933117e-06,
"loss": 0.4789,
"step": 1093
},
{
"epoch": 0.0901375957814946,
"grad_norm": 13.672647133825611,
"learning_rate": 9.994103968982733e-06,
"loss": 0.8772,
"step": 1094
},
{
"epoch": 0.09021998846502431,
"grad_norm": 4.710965238730624,
"learning_rate": 9.994071535120439e-06,
"loss": 0.4686,
"step": 1095
},
{
"epoch": 0.090302381148554,
"grad_norm": 3.9920741559548367,
"learning_rate": 9.994039012346814e-06,
"loss": 0.4907,
"step": 1096
},
{
"epoch": 0.09038477383208371,
"grad_norm": 5.7078667377661825,
"learning_rate": 9.994006400662436e-06,
"loss": 0.617,
"step": 1097
},
{
"epoch": 0.09046716651561341,
"grad_norm": 3.8186640801772125,
"learning_rate": 9.993973700067888e-06,
"loss": 0.4375,
"step": 1098
},
{
"epoch": 0.09054955919914312,
"grad_norm": 5.873776628166568,
"learning_rate": 9.99394091056375e-06,
"loss": 0.5642,
"step": 1099
},
{
"epoch": 0.09063195188267281,
"grad_norm": 6.100201545489032,
"learning_rate": 9.993908032150604e-06,
"loss": 0.603,
"step": 1100
},
{
"epoch": 0.09071434456620252,
"grad_norm": 5.704701398966367,
"learning_rate": 9.99387506482904e-06,
"loss": 0.4836,
"step": 1101
},
{
"epoch": 0.09079673724973222,
"grad_norm": 3.5476812408346934,
"learning_rate": 9.99384200859964e-06,
"loss": 0.2808,
"step": 1102
},
{
"epoch": 0.09087912993326193,
"grad_norm": 4.151988355820425,
"learning_rate": 9.993808863462995e-06,
"loss": 0.381,
"step": 1103
},
{
"epoch": 0.09096152261679163,
"grad_norm": 4.313112079524609,
"learning_rate": 9.993775629419696e-06,
"loss": 0.3598,
"step": 1104
},
{
"epoch": 0.09104391530032133,
"grad_norm": 3.5113027125878085,
"learning_rate": 9.993742306470332e-06,
"loss": 0.2947,
"step": 1105
},
{
"epoch": 0.09112630798385103,
"grad_norm": 5.254248735680192,
"learning_rate": 9.993708894615502e-06,
"loss": 0.3881,
"step": 1106
},
{
"epoch": 0.09120870066738074,
"grad_norm": 7.426494059694848,
"learning_rate": 9.993675393855793e-06,
"loss": 0.812,
"step": 1107
},
{
"epoch": 0.09129109335091044,
"grad_norm": 8.860170716007438,
"learning_rate": 9.993641804191805e-06,
"loss": 0.7974,
"step": 1108
},
{
"epoch": 0.09137348603444014,
"grad_norm": 8.101723704520365,
"learning_rate": 9.99360812562414e-06,
"loss": 0.5309,
"step": 1109
},
{
"epoch": 0.09145587871796984,
"grad_norm": 42.341845463398435,
"learning_rate": 9.99357435815339e-06,
"loss": 1.8562,
"step": 1110
},
{
"epoch": 0.09153827140149955,
"grad_norm": 6.81287903015811,
"learning_rate": 9.993540501780161e-06,
"loss": 0.6941,
"step": 1111
},
{
"epoch": 0.09162066408502925,
"grad_norm": 5.635578780831226,
"learning_rate": 9.993506556505054e-06,
"loss": 0.4578,
"step": 1112
},
{
"epoch": 0.09170305676855896,
"grad_norm": 5.211604855452364,
"learning_rate": 9.993472522328676e-06,
"loss": 0.6778,
"step": 1113
},
{
"epoch": 0.09178544945208865,
"grad_norm": 5.1777549097773665,
"learning_rate": 9.99343839925163e-06,
"loss": 0.5578,
"step": 1114
},
{
"epoch": 0.09186784213561835,
"grad_norm": 4.842543684963219,
"learning_rate": 9.993404187274522e-06,
"loss": 0.5595,
"step": 1115
},
{
"epoch": 0.09195023481914806,
"grad_norm": 6.42302616275195,
"learning_rate": 9.993369886397967e-06,
"loss": 0.7556,
"step": 1116
},
{
"epoch": 0.09203262750267777,
"grad_norm": 6.211592177133203,
"learning_rate": 9.99333549662257e-06,
"loss": 0.5456,
"step": 1117
},
{
"epoch": 0.09211502018620746,
"grad_norm": 4.630213204114295,
"learning_rate": 9.993301017948946e-06,
"loss": 0.4993,
"step": 1118
},
{
"epoch": 0.09219741286973716,
"grad_norm": 7.009132877346605,
"learning_rate": 9.99326645037771e-06,
"loss": 0.8535,
"step": 1119
},
{
"epoch": 0.09227980555326687,
"grad_norm": 4.964465276218794,
"learning_rate": 9.993231793909474e-06,
"loss": 0.4111,
"step": 1120
},
{
"epoch": 0.09236219823679657,
"grad_norm": 5.696307168046908,
"learning_rate": 9.993197048544857e-06,
"loss": 0.5841,
"step": 1121
},
{
"epoch": 0.09244459092032628,
"grad_norm": 5.527989894111306,
"learning_rate": 9.993162214284478e-06,
"loss": 0.5463,
"step": 1122
},
{
"epoch": 0.09252698360385597,
"grad_norm": 5.171245280928823,
"learning_rate": 9.993127291128956e-06,
"loss": 0.6916,
"step": 1123
},
{
"epoch": 0.09260937628738568,
"grad_norm": 5.320160130999334,
"learning_rate": 9.993092279078914e-06,
"loss": 0.406,
"step": 1124
},
{
"epoch": 0.09269176897091538,
"grad_norm": 5.163510317928108,
"learning_rate": 9.993057178134973e-06,
"loss": 0.6965,
"step": 1125
},
{
"epoch": 0.09277416165444509,
"grad_norm": 6.282384783932995,
"learning_rate": 9.99302198829776e-06,
"loss": 0.6963,
"step": 1126
},
{
"epoch": 0.09285655433797478,
"grad_norm": 6.815231221580075,
"learning_rate": 9.992986709567902e-06,
"loss": 0.6793,
"step": 1127
},
{
"epoch": 0.09293894702150449,
"grad_norm": 4.643382322672108,
"learning_rate": 9.992951341946025e-06,
"loss": 0.3584,
"step": 1128
},
{
"epoch": 0.09302133970503419,
"grad_norm": 17.845535518840695,
"learning_rate": 9.992915885432759e-06,
"loss": 0.781,
"step": 1129
},
{
"epoch": 0.0931037323885639,
"grad_norm": 4.085175770281815,
"learning_rate": 9.992880340028736e-06,
"loss": 0.2735,
"step": 1130
},
{
"epoch": 0.0931861250720936,
"grad_norm": 4.864690311245262,
"learning_rate": 9.992844705734591e-06,
"loss": 0.547,
"step": 1131
},
{
"epoch": 0.0932685177556233,
"grad_norm": 5.03732878733345,
"learning_rate": 9.992808982550955e-06,
"loss": 0.5577,
"step": 1132
},
{
"epoch": 0.093350910439153,
"grad_norm": 8.201592365589455,
"learning_rate": 9.992773170478465e-06,
"loss": 0.7697,
"step": 1133
},
{
"epoch": 0.0934333031226827,
"grad_norm": 5.9794222226843114,
"learning_rate": 9.992737269517759e-06,
"loss": 0.6587,
"step": 1134
},
{
"epoch": 0.09351569580621241,
"grad_norm": 4.465989480795388,
"learning_rate": 9.992701279669477e-06,
"loss": 0.5631,
"step": 1135
},
{
"epoch": 0.09359808848974212,
"grad_norm": 4.85574702436748,
"learning_rate": 9.992665200934258e-06,
"loss": 0.4923,
"step": 1136
},
{
"epoch": 0.09368048117327181,
"grad_norm": 5.903537966151732,
"learning_rate": 9.992629033312744e-06,
"loss": 0.5924,
"step": 1137
},
{
"epoch": 0.09376287385680152,
"grad_norm": 5.9804211363312065,
"learning_rate": 9.99259277680558e-06,
"loss": 0.6413,
"step": 1138
},
{
"epoch": 0.09384526654033122,
"grad_norm": 6.216811540759033,
"learning_rate": 9.992556431413412e-06,
"loss": 0.4857,
"step": 1139
},
{
"epoch": 0.09392765922386093,
"grad_norm": 3.992746573683693,
"learning_rate": 9.992519997136887e-06,
"loss": 0.5609,
"step": 1140
},
{
"epoch": 0.09401005190739062,
"grad_norm": 6.133646477579303,
"learning_rate": 9.992483473976652e-06,
"loss": 0.7192,
"step": 1141
},
{
"epoch": 0.09409244459092032,
"grad_norm": 3.996697571383684,
"learning_rate": 9.992446861933358e-06,
"loss": 0.5403,
"step": 1142
},
{
"epoch": 0.09417483727445003,
"grad_norm": 4.876746898177986,
"learning_rate": 9.992410161007658e-06,
"loss": 0.6047,
"step": 1143
},
{
"epoch": 0.09425722995797974,
"grad_norm": 7.113186432470744,
"learning_rate": 9.992373371200206e-06,
"loss": 0.6164,
"step": 1144
},
{
"epoch": 0.09433962264150944,
"grad_norm": 4.5979462744336494,
"learning_rate": 9.992336492511653e-06,
"loss": 0.5623,
"step": 1145
},
{
"epoch": 0.09442201532503913,
"grad_norm": 5.0662774129679935,
"learning_rate": 9.992299524942658e-06,
"loss": 0.5222,
"step": 1146
},
{
"epoch": 0.09450440800856884,
"grad_norm": 6.317547083033332,
"learning_rate": 9.992262468493883e-06,
"loss": 0.7313,
"step": 1147
},
{
"epoch": 0.09458680069209854,
"grad_norm": 6.083162361809943,
"learning_rate": 9.99222532316598e-06,
"loss": 0.7805,
"step": 1148
},
{
"epoch": 0.09466919337562825,
"grad_norm": 6.645090805437031,
"learning_rate": 9.992188088959616e-06,
"loss": 0.5836,
"step": 1149
},
{
"epoch": 0.09475158605915794,
"grad_norm": 4.61742230222751,
"learning_rate": 9.992150765875452e-06,
"loss": 0.3845,
"step": 1150
},
{
"epoch": 0.09483397874268765,
"grad_norm": 4.9888454907973,
"learning_rate": 9.992113353914153e-06,
"loss": 0.5926,
"step": 1151
},
{
"epoch": 0.09491637142621735,
"grad_norm": 4.584756235692825,
"learning_rate": 9.992075853076385e-06,
"loss": 0.3355,
"step": 1152
},
{
"epoch": 0.09499876410974706,
"grad_norm": 3.6417513821813583,
"learning_rate": 9.992038263362815e-06,
"loss": 0.4314,
"step": 1153
},
{
"epoch": 0.09508115679327676,
"grad_norm": 4.554586660083454,
"learning_rate": 9.992000584774113e-06,
"loss": 0.5483,
"step": 1154
},
{
"epoch": 0.09516354947680646,
"grad_norm": 4.795571278019529,
"learning_rate": 9.991962817310947e-06,
"loss": 0.7088,
"step": 1155
},
{
"epoch": 0.09524594216033616,
"grad_norm": 5.551121573962564,
"learning_rate": 9.991924960973995e-06,
"loss": 0.6027,
"step": 1156
},
{
"epoch": 0.09532833484386587,
"grad_norm": 5.793039171212162,
"learning_rate": 9.991887015763926e-06,
"loss": 0.5796,
"step": 1157
},
{
"epoch": 0.09541072752739557,
"grad_norm": 3.914653681105744,
"learning_rate": 9.991848981681417e-06,
"loss": 0.5456,
"step": 1158
},
{
"epoch": 0.09549312021092526,
"grad_norm": 4.822015558280715,
"learning_rate": 9.991810858727147e-06,
"loss": 0.4228,
"step": 1159
},
{
"epoch": 0.09557551289445497,
"grad_norm": 6.430460772568001,
"learning_rate": 9.991772646901793e-06,
"loss": 0.511,
"step": 1160
},
{
"epoch": 0.09565790557798468,
"grad_norm": 5.31544358134059,
"learning_rate": 9.991734346206034e-06,
"loss": 0.4908,
"step": 1161
},
{
"epoch": 0.09574029826151438,
"grad_norm": 4.579164539846181,
"learning_rate": 9.991695956640555e-06,
"loss": 0.5216,
"step": 1162
},
{
"epoch": 0.09582269094504409,
"grad_norm": 10.980434026023534,
"learning_rate": 9.991657478206037e-06,
"loss": 0.4225,
"step": 1163
},
{
"epoch": 0.09590508362857378,
"grad_norm": 11.232153491350044,
"learning_rate": 9.991618910903165e-06,
"loss": 0.4346,
"step": 1164
},
{
"epoch": 0.09598747631210348,
"grad_norm": 5.016670131834788,
"learning_rate": 9.99158025473263e-06,
"loss": 0.4156,
"step": 1165
},
{
"epoch": 0.09606986899563319,
"grad_norm": 5.823029951418563,
"learning_rate": 9.991541509695113e-06,
"loss": 0.6179,
"step": 1166
},
{
"epoch": 0.0961522616791629,
"grad_norm": 6.14466976320633,
"learning_rate": 9.991502675791308e-06,
"loss": 0.5943,
"step": 1167
},
{
"epoch": 0.09623465436269259,
"grad_norm": 5.663257166183292,
"learning_rate": 9.991463753021907e-06,
"loss": 0.4774,
"step": 1168
},
{
"epoch": 0.09631704704622229,
"grad_norm": 4.613753681467516,
"learning_rate": 9.991424741387601e-06,
"loss": 0.3702,
"step": 1169
},
{
"epoch": 0.096399439729752,
"grad_norm": 6.32248078526071,
"learning_rate": 9.991385640889087e-06,
"loss": 0.548,
"step": 1170
},
{
"epoch": 0.0964818324132817,
"grad_norm": 6.541355663315621,
"learning_rate": 9.991346451527058e-06,
"loss": 0.6273,
"step": 1171
},
{
"epoch": 0.09656422509681141,
"grad_norm": 5.283297075822956,
"learning_rate": 9.991307173302212e-06,
"loss": 0.3891,
"step": 1172
},
{
"epoch": 0.0966466177803411,
"grad_norm": 4.233220917693166,
"learning_rate": 9.991267806215251e-06,
"loss": 0.3118,
"step": 1173
},
{
"epoch": 0.09672901046387081,
"grad_norm": 4.413307234382051,
"learning_rate": 9.991228350266875e-06,
"loss": 0.5013,
"step": 1174
},
{
"epoch": 0.09681140314740051,
"grad_norm": 3.56821873368059,
"learning_rate": 9.991188805457784e-06,
"loss": 0.4205,
"step": 1175
},
{
"epoch": 0.09689379583093022,
"grad_norm": 6.493304664428958,
"learning_rate": 9.991149171788686e-06,
"loss": 0.6007,
"step": 1176
},
{
"epoch": 0.09697618851445991,
"grad_norm": 4.839910460452555,
"learning_rate": 9.991109449260283e-06,
"loss": 0.511,
"step": 1177
},
{
"epoch": 0.09705858119798962,
"grad_norm": 6.988660129796937,
"learning_rate": 9.991069637873282e-06,
"loss": 0.8373,
"step": 1178
},
{
"epoch": 0.09714097388151932,
"grad_norm": 7.174369580906187,
"learning_rate": 9.991029737628397e-06,
"loss": 0.4762,
"step": 1179
},
{
"epoch": 0.09722336656504903,
"grad_norm": 5.530363323174134,
"learning_rate": 9.990989748526334e-06,
"loss": 0.4079,
"step": 1180
},
{
"epoch": 0.09730575924857873,
"grad_norm": 7.764029045054272,
"learning_rate": 9.990949670567804e-06,
"loss": 0.5609,
"step": 1181
},
{
"epoch": 0.09738815193210842,
"grad_norm": 4.6082841469746025,
"learning_rate": 9.990909503753524e-06,
"loss": 0.5465,
"step": 1182
},
{
"epoch": 0.09747054461563813,
"grad_norm": 4.287818673511596,
"learning_rate": 9.990869248084205e-06,
"loss": 0.3848,
"step": 1183
},
{
"epoch": 0.09755293729916784,
"grad_norm": 5.323729474167395,
"learning_rate": 9.990828903560568e-06,
"loss": 0.5052,
"step": 1184
},
{
"epoch": 0.09763532998269754,
"grad_norm": 3.5424616789602856,
"learning_rate": 9.990788470183328e-06,
"loss": 0.2952,
"step": 1185
},
{
"epoch": 0.09771772266622723,
"grad_norm": 3.9137775832429584,
"learning_rate": 9.990747947953207e-06,
"loss": 0.2791,
"step": 1186
},
{
"epoch": 0.09780011534975694,
"grad_norm": 5.032302762992251,
"learning_rate": 9.990707336870925e-06,
"loss": 0.3739,
"step": 1187
},
{
"epoch": 0.09788250803328664,
"grad_norm": 5.581163892106261,
"learning_rate": 9.990666636937207e-06,
"loss": 0.5531,
"step": 1188
},
{
"epoch": 0.09796490071681635,
"grad_norm": 6.884687269301154,
"learning_rate": 9.990625848152775e-06,
"loss": 0.7531,
"step": 1189
},
{
"epoch": 0.09804729340034606,
"grad_norm": 5.374496441514754,
"learning_rate": 9.990584970518355e-06,
"loss": 0.6825,
"step": 1190
},
{
"epoch": 0.09812968608387575,
"grad_norm": 4.491076319002902,
"learning_rate": 9.99054400403468e-06,
"loss": 0.5471,
"step": 1191
},
{
"epoch": 0.09821207876740545,
"grad_norm": 4.568092241493959,
"learning_rate": 9.990502948702472e-06,
"loss": 0.2779,
"step": 1192
},
{
"epoch": 0.09829447145093516,
"grad_norm": 5.707272721328885,
"learning_rate": 9.990461804522466e-06,
"loss": 0.6366,
"step": 1193
},
{
"epoch": 0.09837686413446486,
"grad_norm": 4.826130217944562,
"learning_rate": 9.990420571495394e-06,
"loss": 0.605,
"step": 1194
},
{
"epoch": 0.09845925681799456,
"grad_norm": 4.33472383368196,
"learning_rate": 9.990379249621991e-06,
"loss": 0.6158,
"step": 1195
},
{
"epoch": 0.09854164950152426,
"grad_norm": 4.997331262544171,
"learning_rate": 9.990337838902992e-06,
"loss": 0.5247,
"step": 1196
},
{
"epoch": 0.09862404218505397,
"grad_norm": 3.544784718142565,
"learning_rate": 9.990296339339131e-06,
"loss": 0.5761,
"step": 1197
},
{
"epoch": 0.09870643486858367,
"grad_norm": 4.904002550116714,
"learning_rate": 9.990254750931153e-06,
"loss": 0.4465,
"step": 1198
},
{
"epoch": 0.09878882755211338,
"grad_norm": 4.097219050092533,
"learning_rate": 9.990213073679793e-06,
"loss": 0.5315,
"step": 1199
},
{
"epoch": 0.09887122023564307,
"grad_norm": 5.885850480185024,
"learning_rate": 9.990171307585797e-06,
"loss": 0.4493,
"step": 1200
},
{
"epoch": 0.09895361291917278,
"grad_norm": 4.33215452297953,
"learning_rate": 9.990129452649906e-06,
"loss": 0.4882,
"step": 1201
},
{
"epoch": 0.09903600560270248,
"grad_norm": 4.268587794277847,
"learning_rate": 9.990087508872865e-06,
"loss": 0.444,
"step": 1202
},
{
"epoch": 0.09911839828623219,
"grad_norm": 42.319887791095226,
"learning_rate": 9.990045476255422e-06,
"loss": 1.8771,
"step": 1203
},
{
"epoch": 0.09920079096976188,
"grad_norm": 40.893472301948556,
"learning_rate": 9.990003354798326e-06,
"loss": 1.546,
"step": 1204
},
{
"epoch": 0.09928318365329158,
"grad_norm": 4.301738471538322,
"learning_rate": 9.989961144502324e-06,
"loss": 0.6113,
"step": 1205
},
{
"epoch": 0.09936557633682129,
"grad_norm": 4.904331603406959,
"learning_rate": 9.98991884536817e-06,
"loss": 0.504,
"step": 1206
},
{
"epoch": 0.099447969020351,
"grad_norm": 4.841374601844491,
"learning_rate": 9.989876457396616e-06,
"loss": 0.6375,
"step": 1207
},
{
"epoch": 0.0995303617038807,
"grad_norm": 3.6457744187919996,
"learning_rate": 9.989833980588419e-06,
"loss": 0.4475,
"step": 1208
},
{
"epoch": 0.0996127543874104,
"grad_norm": 6.177090683739293,
"learning_rate": 9.989791414944332e-06,
"loss": 0.4527,
"step": 1209
},
{
"epoch": 0.0996951470709401,
"grad_norm": 6.948933775306833,
"learning_rate": 9.989748760465114e-06,
"loss": 0.4229,
"step": 1210
},
{
"epoch": 0.0997775397544698,
"grad_norm": 4.081918060719312,
"learning_rate": 9.989706017151526e-06,
"loss": 0.4226,
"step": 1211
},
{
"epoch": 0.09985993243799951,
"grad_norm": 5.241837641447958,
"learning_rate": 9.989663185004326e-06,
"loss": 0.6111,
"step": 1212
},
{
"epoch": 0.0999423251215292,
"grad_norm": 4.874711378457223,
"learning_rate": 9.989620264024278e-06,
"loss": 0.5264,
"step": 1213
},
{
"epoch": 0.10002471780505891,
"grad_norm": 6.576308785350509,
"learning_rate": 9.989577254212147e-06,
"loss": 0.7179,
"step": 1214
},
{
"epoch": 0.10010711048858861,
"grad_norm": 5.81751849230087,
"learning_rate": 9.989534155568696e-06,
"loss": 0.4763,
"step": 1215
},
{
"epoch": 0.10018950317211832,
"grad_norm": 5.144482743322481,
"learning_rate": 9.989490968094695e-06,
"loss": 0.5334,
"step": 1216
},
{
"epoch": 0.10027189585564802,
"grad_norm": 4.558820107084972,
"learning_rate": 9.989447691790912e-06,
"loss": 0.4786,
"step": 1217
},
{
"epoch": 0.10035428853917772,
"grad_norm": 5.042904719669194,
"learning_rate": 9.98940432665812e-06,
"loss": 0.351,
"step": 1218
},
{
"epoch": 0.10043668122270742,
"grad_norm": 7.9762498299247095,
"learning_rate": 9.989360872697085e-06,
"loss": 0.0755,
"step": 1219
},
{
"epoch": 0.10051907390623713,
"grad_norm": 5.90945797643557,
"learning_rate": 9.989317329908585e-06,
"loss": 0.5389,
"step": 1220
},
{
"epoch": 0.10060146658976683,
"grad_norm": 6.595876901546739,
"learning_rate": 9.989273698293396e-06,
"loss": 0.5458,
"step": 1221
},
{
"epoch": 0.10068385927329652,
"grad_norm": 3.5037946342073076,
"learning_rate": 9.989229977852292e-06,
"loss": 0.3967,
"step": 1222
},
{
"epoch": 0.10076625195682623,
"grad_norm": 4.8746671742155145,
"learning_rate": 9.989186168586054e-06,
"loss": 0.536,
"step": 1223
},
{
"epoch": 0.10084864464035594,
"grad_norm": 6.885826091957547,
"learning_rate": 9.989142270495458e-06,
"loss": 0.7177,
"step": 1224
},
{
"epoch": 0.10093103732388564,
"grad_norm": 4.581417536969941,
"learning_rate": 9.98909828358129e-06,
"loss": 0.4632,
"step": 1225
},
{
"epoch": 0.10101343000741535,
"grad_norm": 6.322372039008234,
"learning_rate": 9.989054207844331e-06,
"loss": 0.6098,
"step": 1226
},
{
"epoch": 0.10109582269094504,
"grad_norm": 5.273630448320928,
"learning_rate": 9.989010043285365e-06,
"loss": 0.5149,
"step": 1227
},
{
"epoch": 0.10117821537447474,
"grad_norm": 7.791366040723516,
"learning_rate": 9.988965789905179e-06,
"loss": 0.693,
"step": 1228
},
{
"epoch": 0.10126060805800445,
"grad_norm": 9.738396758859928,
"learning_rate": 9.988921447704563e-06,
"loss": 0.5488,
"step": 1229
},
{
"epoch": 0.10134300074153416,
"grad_norm": 5.146859630461013,
"learning_rate": 9.988877016684302e-06,
"loss": 0.5047,
"step": 1230
},
{
"epoch": 0.10142539342506385,
"grad_norm": 5.257519730963288,
"learning_rate": 9.98883249684519e-06,
"loss": 0.6279,
"step": 1231
},
{
"epoch": 0.10150778610859355,
"grad_norm": 6.374293874468573,
"learning_rate": 9.988787888188021e-06,
"loss": 0.5565,
"step": 1232
},
{
"epoch": 0.10159017879212326,
"grad_norm": 3.579113688809698,
"learning_rate": 9.988743190713585e-06,
"loss": 0.3567,
"step": 1233
},
{
"epoch": 0.10167257147565296,
"grad_norm": 5.160088719699968,
"learning_rate": 9.988698404422682e-06,
"loss": 0.512,
"step": 1234
},
{
"epoch": 0.10175496415918267,
"grad_norm": 6.862175012633274,
"learning_rate": 9.988653529316106e-06,
"loss": 0.4836,
"step": 1235
},
{
"epoch": 0.10183735684271236,
"grad_norm": 7.94291959200385,
"learning_rate": 9.988608565394658e-06,
"loss": 0.6196,
"step": 1236
},
{
"epoch": 0.10191974952624207,
"grad_norm": 8.071509431185923,
"learning_rate": 9.988563512659137e-06,
"loss": 0.7937,
"step": 1237
},
{
"epoch": 0.10200214220977177,
"grad_norm": 4.085112773207024,
"learning_rate": 9.988518371110346e-06,
"loss": 0.5843,
"step": 1238
},
{
"epoch": 0.10208453489330148,
"grad_norm": 3.4369756535780036,
"learning_rate": 9.988473140749089e-06,
"loss": 0.4593,
"step": 1239
},
{
"epoch": 0.10216692757683117,
"grad_norm": 4.233546401366111,
"learning_rate": 9.98842782157617e-06,
"loss": 0.1937,
"step": 1240
},
{
"epoch": 0.10224932026036088,
"grad_norm": 5.0020721675763165,
"learning_rate": 9.988382413592398e-06,
"loss": 0.3163,
"step": 1241
},
{
"epoch": 0.10233171294389058,
"grad_norm": 6.741423995217005,
"learning_rate": 9.98833691679858e-06,
"loss": 0.4598,
"step": 1242
},
{
"epoch": 0.10241410562742029,
"grad_norm": 5.601574362819235,
"learning_rate": 9.988291331195525e-06,
"loss": 0.468,
"step": 1243
},
{
"epoch": 0.10249649831095,
"grad_norm": 5.251186632818674,
"learning_rate": 9.988245656784045e-06,
"loss": 0.4222,
"step": 1244
},
{
"epoch": 0.10257889099447969,
"grad_norm": 8.378822533378939,
"learning_rate": 9.988199893564956e-06,
"loss": 0.8973,
"step": 1245
},
{
"epoch": 0.10266128367800939,
"grad_norm": 5.961577135188696,
"learning_rate": 9.98815404153907e-06,
"loss": 0.5762,
"step": 1246
},
{
"epoch": 0.1027436763615391,
"grad_norm": 9.099142670765637,
"learning_rate": 9.988108100707203e-06,
"loss": 0.7662,
"step": 1247
},
{
"epoch": 0.1028260690450688,
"grad_norm": 6.092979434479812,
"learning_rate": 9.988062071070174e-06,
"loss": 0.6146,
"step": 1248
},
{
"epoch": 0.1029084617285985,
"grad_norm": 5.159988340065593,
"learning_rate": 9.988015952628802e-06,
"loss": 0.4235,
"step": 1249
},
{
"epoch": 0.1029908544121282,
"grad_norm": 5.5843012218596995,
"learning_rate": 9.987969745383908e-06,
"loss": 0.6002,
"step": 1250
},
{
"epoch": 0.1030732470956579,
"grad_norm": 5.194975705721119,
"learning_rate": 9.987923449336316e-06,
"loss": 0.3804,
"step": 1251
},
{
"epoch": 0.10315563977918761,
"grad_norm": 4.495661995779881,
"learning_rate": 9.98787706448685e-06,
"loss": 0.5055,
"step": 1252
},
{
"epoch": 0.10323803246271732,
"grad_norm": 4.23602600968263,
"learning_rate": 9.987830590836335e-06,
"loss": 0.5776,
"step": 1253
},
{
"epoch": 0.10332042514624701,
"grad_norm": 6.237589314146654,
"learning_rate": 9.987784028385596e-06,
"loss": 0.4792,
"step": 1254
},
{
"epoch": 0.10340281782977671,
"grad_norm": 7.598718878609416,
"learning_rate": 9.987737377135464e-06,
"loss": 0.8099,
"step": 1255
},
{
"epoch": 0.10348521051330642,
"grad_norm": 3.2818516957200283,
"learning_rate": 9.987690637086772e-06,
"loss": 0.3107,
"step": 1256
},
{
"epoch": 0.10356760319683612,
"grad_norm": 5.155044529125087,
"learning_rate": 9.987643808240351e-06,
"loss": 0.4354,
"step": 1257
},
{
"epoch": 0.10364999588036582,
"grad_norm": 4.263048262690633,
"learning_rate": 9.98759689059703e-06,
"loss": 0.3961,
"step": 1258
},
{
"epoch": 0.10373238856389552,
"grad_norm": 5.420585016670734,
"learning_rate": 9.987549884157652e-06,
"loss": 0.5856,
"step": 1259
},
{
"epoch": 0.10381478124742523,
"grad_norm": 5.035305806630145,
"learning_rate": 9.987502788923047e-06,
"loss": 0.5991,
"step": 1260
},
{
"epoch": 0.10389717393095493,
"grad_norm": 5.210172313300331,
"learning_rate": 9.987455604894059e-06,
"loss": 0.6802,
"step": 1261
},
{
"epoch": 0.10397956661448464,
"grad_norm": 5.208839152242212,
"learning_rate": 9.987408332071522e-06,
"loss": 0.5894,
"step": 1262
},
{
"epoch": 0.10406195929801433,
"grad_norm": 5.007424953872612,
"learning_rate": 9.987360970456284e-06,
"loss": 0.6866,
"step": 1263
},
{
"epoch": 0.10414435198154404,
"grad_norm": 5.492166942733279,
"learning_rate": 9.987313520049184e-06,
"loss": 0.5856,
"step": 1264
},
{
"epoch": 0.10422674466507374,
"grad_norm": 44.58199918343172,
"learning_rate": 9.987265980851069e-06,
"loss": 1.9599,
"step": 1265
},
{
"epoch": 0.10430913734860345,
"grad_norm": 4.5829858248139175,
"learning_rate": 9.987218352862781e-06,
"loss": 0.5187,
"step": 1266
},
{
"epoch": 0.10439153003213314,
"grad_norm": 5.025959968933363,
"learning_rate": 9.987170636085175e-06,
"loss": 0.5232,
"step": 1267
},
{
"epoch": 0.10447392271566285,
"grad_norm": 33.25337475336521,
"learning_rate": 9.987122830519096e-06,
"loss": 0.675,
"step": 1268
},
{
"epoch": 0.10455631539919255,
"grad_norm": 6.627024708916473,
"learning_rate": 9.987074936165394e-06,
"loss": 0.6327,
"step": 1269
},
{
"epoch": 0.10463870808272226,
"grad_norm": 14.391260890255582,
"learning_rate": 9.987026953024927e-06,
"loss": 0.171,
"step": 1270
},
{
"epoch": 0.10472110076625196,
"grad_norm": 4.702108978609021,
"learning_rate": 9.986978881098543e-06,
"loss": 0.3207,
"step": 1271
},
{
"epoch": 0.10480349344978165,
"grad_norm": 6.034579770929771,
"learning_rate": 9.986930720387103e-06,
"loss": 0.4834,
"step": 1272
},
{
"epoch": 0.10488588613331136,
"grad_norm": 5.096157625239223,
"learning_rate": 9.986882470891458e-06,
"loss": 0.3464,
"step": 1273
},
{
"epoch": 0.10496827881684107,
"grad_norm": 6.817695716429012,
"learning_rate": 9.986834132612475e-06,
"loss": 0.6021,
"step": 1274
},
{
"epoch": 0.10505067150037077,
"grad_norm": 5.953244971338033,
"learning_rate": 9.98678570555101e-06,
"loss": 0.682,
"step": 1275
},
{
"epoch": 0.10513306418390048,
"grad_norm": 5.0509725079955965,
"learning_rate": 9.986737189707924e-06,
"loss": 0.6976,
"step": 1276
},
{
"epoch": 0.10521545686743017,
"grad_norm": 5.45215491215228,
"learning_rate": 9.986688585084086e-06,
"loss": 0.5298,
"step": 1277
},
{
"epoch": 0.10529784955095987,
"grad_norm": 4.412292451733061,
"learning_rate": 9.986639891680356e-06,
"loss": 0.3362,
"step": 1278
},
{
"epoch": 0.10538024223448958,
"grad_norm": 5.221866379996899,
"learning_rate": 9.986591109497601e-06,
"loss": 0.5397,
"step": 1279
},
{
"epoch": 0.10546263491801929,
"grad_norm": 5.480429366404604,
"learning_rate": 9.986542238536694e-06,
"loss": 0.4179,
"step": 1280
},
{
"epoch": 0.10554502760154898,
"grad_norm": 4.6929338481975,
"learning_rate": 9.986493278798502e-06,
"loss": 0.3414,
"step": 1281
},
{
"epoch": 0.10562742028507868,
"grad_norm": 5.128214656502921,
"learning_rate": 9.986444230283896e-06,
"loss": 0.3893,
"step": 1282
},
{
"epoch": 0.10570981296860839,
"grad_norm": 6.237929917953249,
"learning_rate": 9.986395092993751e-06,
"loss": 0.479,
"step": 1283
},
{
"epoch": 0.1057922056521381,
"grad_norm": 7.2745793133813015,
"learning_rate": 9.98634586692894e-06,
"loss": 0.6739,
"step": 1284
},
{
"epoch": 0.1058745983356678,
"grad_norm": 4.95514436276389,
"learning_rate": 9.986296552090343e-06,
"loss": 0.3535,
"step": 1285
},
{
"epoch": 0.10595699101919749,
"grad_norm": 6.709823023172423,
"learning_rate": 9.986247148478834e-06,
"loss": 0.4273,
"step": 1286
},
{
"epoch": 0.1060393837027272,
"grad_norm": 7.894794011841938,
"learning_rate": 9.986197656095293e-06,
"loss": 0.5231,
"step": 1287
},
{
"epoch": 0.1061217763862569,
"grad_norm": 6.771758510510305,
"learning_rate": 9.986148074940602e-06,
"loss": 0.6098,
"step": 1288
},
{
"epoch": 0.10620416906978661,
"grad_norm": 8.620140427216912,
"learning_rate": 9.986098405015646e-06,
"loss": 0.6816,
"step": 1289
},
{
"epoch": 0.1062865617533163,
"grad_norm": 7.649934951424651,
"learning_rate": 9.986048646321306e-06,
"loss": 0.6417,
"step": 1290
},
{
"epoch": 0.106368954436846,
"grad_norm": 5.89217219407464,
"learning_rate": 9.98599879885847e-06,
"loss": 0.436,
"step": 1291
},
{
"epoch": 0.10645134712037571,
"grad_norm": 4.562164944360262,
"learning_rate": 9.985948862628023e-06,
"loss": 0.4035,
"step": 1292
},
{
"epoch": 0.10653373980390542,
"grad_norm": 10.224434742049766,
"learning_rate": 9.985898837630856e-06,
"loss": 0.7638,
"step": 1293
},
{
"epoch": 0.10661613248743512,
"grad_norm": 6.688388151927518,
"learning_rate": 9.98584872386786e-06,
"loss": 0.6856,
"step": 1294
},
{
"epoch": 0.10669852517096481,
"grad_norm": 5.548007921469972,
"learning_rate": 9.985798521339924e-06,
"loss": 0.4693,
"step": 1295
},
{
"epoch": 0.10678091785449452,
"grad_norm": 6.389072481221684,
"learning_rate": 9.985748230047944e-06,
"loss": 0.7325,
"step": 1296
},
{
"epoch": 0.10686331053802423,
"grad_norm": 4.857740386437905,
"learning_rate": 9.985697849992818e-06,
"loss": 0.4256,
"step": 1297
},
{
"epoch": 0.10694570322155393,
"grad_norm": 4.244141671261363,
"learning_rate": 9.98564738117544e-06,
"loss": 0.4349,
"step": 1298
},
{
"epoch": 0.10702809590508362,
"grad_norm": 8.506166832118577,
"learning_rate": 9.985596823596708e-06,
"loss": 0.6764,
"step": 1299
},
{
"epoch": 0.10711048858861333,
"grad_norm": 35.49665995961184,
"learning_rate": 9.985546177257523e-06,
"loss": 0.8473,
"step": 1300
},
{
"epoch": 0.10719288127214303,
"grad_norm": 4.860038932251913,
"learning_rate": 9.985495442158785e-06,
"loss": 0.4825,
"step": 1301
},
{
"epoch": 0.10727527395567274,
"grad_norm": 6.230406301466884,
"learning_rate": 9.985444618301401e-06,
"loss": 0.65,
"step": 1302
},
{
"epoch": 0.10735766663920245,
"grad_norm": 4.218360186395634,
"learning_rate": 9.985393705686274e-06,
"loss": 0.3347,
"step": 1303
},
{
"epoch": 0.10744005932273214,
"grad_norm": 14.47184287987901,
"learning_rate": 9.985342704314308e-06,
"loss": 1.0207,
"step": 1304
},
{
"epoch": 0.10752245200626184,
"grad_norm": 6.850987039774206,
"learning_rate": 9.985291614186417e-06,
"loss": 0.7262,
"step": 1305
},
{
"epoch": 0.10760484468979155,
"grad_norm": 5.9020975240354385,
"learning_rate": 9.985240435303505e-06,
"loss": 0.4397,
"step": 1306
},
{
"epoch": 0.10768723737332125,
"grad_norm": 4.756886284647011,
"learning_rate": 9.985189167666484e-06,
"loss": 0.3688,
"step": 1307
},
{
"epoch": 0.10776963005685095,
"grad_norm": 5.157047719389633,
"learning_rate": 9.985137811276268e-06,
"loss": 0.6059,
"step": 1308
},
{
"epoch": 0.10785202274038065,
"grad_norm": 6.27032589224842,
"learning_rate": 9.985086366133771e-06,
"loss": 0.7094,
"step": 1309
},
{
"epoch": 0.10793441542391036,
"grad_norm": 4.952356354117285,
"learning_rate": 9.985034832239908e-06,
"loss": 0.4373,
"step": 1310
},
{
"epoch": 0.10801680810744006,
"grad_norm": 6.469389577291621,
"learning_rate": 9.984983209595598e-06,
"loss": 0.5434,
"step": 1311
},
{
"epoch": 0.10809920079096977,
"grad_norm": 4.993310174827445,
"learning_rate": 9.98493149820176e-06,
"loss": 0.5393,
"step": 1312
},
{
"epoch": 0.10818159347449946,
"grad_norm": 3.6178325149897046,
"learning_rate": 9.984879698059314e-06,
"loss": 0.2839,
"step": 1313
},
{
"epoch": 0.10826398615802917,
"grad_norm": 4.710143431767484,
"learning_rate": 9.98482780916918e-06,
"loss": 0.5721,
"step": 1314
},
{
"epoch": 0.10834637884155887,
"grad_norm": 7.01427162272288,
"learning_rate": 9.984775831532288e-06,
"loss": 0.6726,
"step": 1315
},
{
"epoch": 0.10842877152508858,
"grad_norm": 6.672994509695413,
"learning_rate": 9.984723765149555e-06,
"loss": 0.6024,
"step": 1316
},
{
"epoch": 0.10851116420861827,
"grad_norm": 5.677032485840668,
"learning_rate": 9.984671610021916e-06,
"loss": 0.6153,
"step": 1317
},
{
"epoch": 0.10859355689214797,
"grad_norm": 5.458697212060859,
"learning_rate": 9.984619366150294e-06,
"loss": 0.599,
"step": 1318
},
{
"epoch": 0.10867594957567768,
"grad_norm": 4.616675469079038,
"learning_rate": 9.98456703353562e-06,
"loss": 0.5492,
"step": 1319
},
{
"epoch": 0.10875834225920739,
"grad_norm": 6.386675749076506,
"learning_rate": 9.98451461217883e-06,
"loss": 0.6014,
"step": 1320
},
{
"epoch": 0.10884073494273709,
"grad_norm": 7.024628115810297,
"learning_rate": 9.984462102080852e-06,
"loss": 0.778,
"step": 1321
},
{
"epoch": 0.10892312762626678,
"grad_norm": 5.033824012329933,
"learning_rate": 9.984409503242623e-06,
"loss": 0.4687,
"step": 1322
},
{
"epoch": 0.10900552030979649,
"grad_norm": 4.962648224329656,
"learning_rate": 9.98435681566508e-06,
"loss": 0.5488,
"step": 1323
},
{
"epoch": 0.1090879129933262,
"grad_norm": 7.920473586250384,
"learning_rate": 9.984304039349159e-06,
"loss": 0.6991,
"step": 1324
},
{
"epoch": 0.1091703056768559,
"grad_norm": 5.738981101778399,
"learning_rate": 9.9842511742958e-06,
"loss": 0.5916,
"step": 1325
},
{
"epoch": 0.10925269836038559,
"grad_norm": 4.679325132639655,
"learning_rate": 9.984198220505947e-06,
"loss": 0.5934,
"step": 1326
},
{
"epoch": 0.1093350910439153,
"grad_norm": 3.9674401675344346,
"learning_rate": 9.984145177980541e-06,
"loss": 0.5008,
"step": 1327
},
{
"epoch": 0.109417483727445,
"grad_norm": 5.381770819469537,
"learning_rate": 9.984092046720526e-06,
"loss": 0.4177,
"step": 1328
},
{
"epoch": 0.10949987641097471,
"grad_norm": 5.770846813479589,
"learning_rate": 9.984038826726847e-06,
"loss": 0.4167,
"step": 1329
},
{
"epoch": 0.10958226909450441,
"grad_norm": 6.865983869702629,
"learning_rate": 9.983985518000455e-06,
"loss": 0.7061,
"step": 1330
},
{
"epoch": 0.1096646617780341,
"grad_norm": 3.9275472528501925,
"learning_rate": 9.983932120542294e-06,
"loss": 0.2918,
"step": 1331
},
{
"epoch": 0.10974705446156381,
"grad_norm": 6.07045204922866,
"learning_rate": 9.983878634353317e-06,
"loss": 0.6954,
"step": 1332
},
{
"epoch": 0.10982944714509352,
"grad_norm": 4.141327747866078,
"learning_rate": 9.983825059434478e-06,
"loss": 0.3842,
"step": 1333
},
{
"epoch": 0.10991183982862322,
"grad_norm": 5.786533535647971,
"learning_rate": 9.98377139578673e-06,
"loss": 0.5725,
"step": 1334
},
{
"epoch": 0.10999423251215291,
"grad_norm": 6.8133544098549415,
"learning_rate": 9.983717643411027e-06,
"loss": 0.7385,
"step": 1335
},
{
"epoch": 0.11007662519568262,
"grad_norm": 6.85720202285886,
"learning_rate": 9.983663802308326e-06,
"loss": 0.5718,
"step": 1336
},
{
"epoch": 0.11015901787921233,
"grad_norm": 6.136362509792475,
"learning_rate": 9.983609872479587e-06,
"loss": 0.3897,
"step": 1337
},
{
"epoch": 0.11024141056274203,
"grad_norm": 4.797859015857666,
"learning_rate": 9.98355585392577e-06,
"loss": 0.6325,
"step": 1338
},
{
"epoch": 0.11032380324627174,
"grad_norm": 8.316767655871656,
"learning_rate": 9.983501746647835e-06,
"loss": 0.6866,
"step": 1339
},
{
"epoch": 0.11040619592980143,
"grad_norm": 9.033213998462069,
"learning_rate": 9.983447550646748e-06,
"loss": 0.6542,
"step": 1340
},
{
"epoch": 0.11048858861333113,
"grad_norm": 33.85428238105279,
"learning_rate": 9.98339326592347e-06,
"loss": 0.9791,
"step": 1341
},
{
"epoch": 0.11057098129686084,
"grad_norm": 6.407367839369825,
"learning_rate": 9.98333889247897e-06,
"loss": 0.6029,
"step": 1342
},
{
"epoch": 0.11065337398039055,
"grad_norm": 20.144367498349798,
"learning_rate": 9.983284430314217e-06,
"loss": 0.5827,
"step": 1343
},
{
"epoch": 0.11073576666392024,
"grad_norm": 4.6534351475822655,
"learning_rate": 9.98322987943018e-06,
"loss": 0.4664,
"step": 1344
},
{
"epoch": 0.11081815934744994,
"grad_norm": 8.15101971157244,
"learning_rate": 9.983175239827829e-06,
"loss": 0.7332,
"step": 1345
},
{
"epoch": 0.11090055203097965,
"grad_norm": 4.753437093306661,
"learning_rate": 9.983120511508136e-06,
"loss": 0.5571,
"step": 1346
},
{
"epoch": 0.11098294471450935,
"grad_norm": 11.681875120750966,
"learning_rate": 9.983065694472078e-06,
"loss": 0.4647,
"step": 1347
},
{
"epoch": 0.11106533739803906,
"grad_norm": 12.117460717638025,
"learning_rate": 9.983010788720629e-06,
"loss": 0.538,
"step": 1348
},
{
"epoch": 0.11114773008156875,
"grad_norm": 6.829274575800984,
"learning_rate": 9.982955794254768e-06,
"loss": 0.7028,
"step": 1349
},
{
"epoch": 0.11123012276509846,
"grad_norm": 13.602827030669436,
"learning_rate": 9.982900711075473e-06,
"loss": 0.7064,
"step": 1350
},
{
"epoch": 0.11131251544862816,
"grad_norm": 5.8175552794909775,
"learning_rate": 9.982845539183724e-06,
"loss": 0.6018,
"step": 1351
},
{
"epoch": 0.11139490813215787,
"grad_norm": 8.31044623036391,
"learning_rate": 9.982790278580505e-06,
"loss": 0.3879,
"step": 1352
},
{
"epoch": 0.11147730081568756,
"grad_norm": 6.099537144897278,
"learning_rate": 9.982734929266799e-06,
"loss": 0.5985,
"step": 1353
},
{
"epoch": 0.11155969349921727,
"grad_norm": 5.0871649102753675,
"learning_rate": 9.98267949124359e-06,
"loss": 0.6366,
"step": 1354
},
{
"epoch": 0.11164208618274697,
"grad_norm": 7.594378062700149,
"learning_rate": 9.982623964511868e-06,
"loss": 0.3574,
"step": 1355
},
{
"epoch": 0.11172447886627668,
"grad_norm": 6.421114608398679,
"learning_rate": 9.982568349072619e-06,
"loss": 0.424,
"step": 1356
},
{
"epoch": 0.11180687154980638,
"grad_norm": 4.43829490618891,
"learning_rate": 9.982512644926835e-06,
"loss": 0.5444,
"step": 1357
},
{
"epoch": 0.11188926423333607,
"grad_norm": 5.484026046951222,
"learning_rate": 9.982456852075505e-06,
"loss": 0.4623,
"step": 1358
},
{
"epoch": 0.11197165691686578,
"grad_norm": 4.551122319765745,
"learning_rate": 9.982400970519625e-06,
"loss": 0.482,
"step": 1359
},
{
"epoch": 0.11205404960039549,
"grad_norm": 5.123980967275811,
"learning_rate": 9.982345000260189e-06,
"loss": 0.6261,
"step": 1360
},
{
"epoch": 0.11213644228392519,
"grad_norm": 7.0859474884143845,
"learning_rate": 9.982288941298193e-06,
"loss": 0.482,
"step": 1361
},
{
"epoch": 0.11221883496745488,
"grad_norm": 5.062990740013323,
"learning_rate": 9.982232793634637e-06,
"loss": 0.4078,
"step": 1362
},
{
"epoch": 0.11230122765098459,
"grad_norm": 6.563773730859556,
"learning_rate": 9.982176557270518e-06,
"loss": 0.6887,
"step": 1363
},
{
"epoch": 0.1123836203345143,
"grad_norm": 3.9679727896989507,
"learning_rate": 9.982120232206837e-06,
"loss": 0.5059,
"step": 1364
},
{
"epoch": 0.112466013018044,
"grad_norm": 4.109093720360083,
"learning_rate": 9.9820638184446e-06,
"loss": 0.3438,
"step": 1365
},
{
"epoch": 0.1125484057015737,
"grad_norm": 5.911123344422388,
"learning_rate": 9.98200731598481e-06,
"loss": 0.5155,
"step": 1366
},
{
"epoch": 0.1126307983851034,
"grad_norm": 3.88318872578385,
"learning_rate": 9.98195072482847e-06,
"loss": 0.5121,
"step": 1367
},
{
"epoch": 0.1127131910686331,
"grad_norm": 5.233996486233231,
"learning_rate": 9.98189404497659e-06,
"loss": 0.6168,
"step": 1368
},
{
"epoch": 0.11279558375216281,
"grad_norm": 4.602934758003588,
"learning_rate": 9.981837276430181e-06,
"loss": 0.3514,
"step": 1369
},
{
"epoch": 0.11287797643569251,
"grad_norm": 6.096124850039538,
"learning_rate": 9.98178041919025e-06,
"loss": 0.7239,
"step": 1370
},
{
"epoch": 0.1129603691192222,
"grad_norm": 3.74675189974131,
"learning_rate": 9.981723473257812e-06,
"loss": 0.2741,
"step": 1371
},
{
"epoch": 0.11304276180275191,
"grad_norm": 4.56350112320027,
"learning_rate": 9.981666438633877e-06,
"loss": 0.4282,
"step": 1372
},
{
"epoch": 0.11312515448628162,
"grad_norm": 14.508417238617053,
"learning_rate": 9.981609315319467e-06,
"loss": 0.6197,
"step": 1373
},
{
"epoch": 0.11320754716981132,
"grad_norm": 3.0941627955941367,
"learning_rate": 9.981552103315593e-06,
"loss": 0.2163,
"step": 1374
},
{
"epoch": 0.11328993985334103,
"grad_norm": 4.946676977341611,
"learning_rate": 9.981494802623275e-06,
"loss": 0.387,
"step": 1375
},
{
"epoch": 0.11337233253687072,
"grad_norm": 7.823751960171278,
"learning_rate": 9.981437413243535e-06,
"loss": 0.6005,
"step": 1376
},
{
"epoch": 0.11345472522040043,
"grad_norm": 4.525532982557422,
"learning_rate": 9.981379935177393e-06,
"loss": 0.5959,
"step": 1377
},
{
"epoch": 0.11353711790393013,
"grad_norm": 6.635732248640508,
"learning_rate": 9.981322368425873e-06,
"loss": 0.5028,
"step": 1378
},
{
"epoch": 0.11361951058745984,
"grad_norm": 7.813709567215677,
"learning_rate": 9.98126471299e-06,
"loss": 0.5495,
"step": 1379
},
{
"epoch": 0.11370190327098953,
"grad_norm": 4.630427176502277,
"learning_rate": 9.981206968870798e-06,
"loss": 0.5631,
"step": 1380
},
{
"epoch": 0.11378429595451924,
"grad_norm": 33.40325879735037,
"learning_rate": 9.9811491360693e-06,
"loss": 1.1373,
"step": 1381
},
{
"epoch": 0.11386668863804894,
"grad_norm": 4.6566891571505575,
"learning_rate": 9.981091214586533e-06,
"loss": 0.4544,
"step": 1382
},
{
"epoch": 0.11394908132157865,
"grad_norm": 5.069079604968983,
"learning_rate": 9.981033204423526e-06,
"loss": 0.5782,
"step": 1383
},
{
"epoch": 0.11403147400510835,
"grad_norm": 4.434254165048741,
"learning_rate": 9.980975105581315e-06,
"loss": 0.5051,
"step": 1384
},
{
"epoch": 0.11411386668863804,
"grad_norm": 7.74689606386784,
"learning_rate": 9.980916918060932e-06,
"loss": 0.5908,
"step": 1385
},
{
"epoch": 0.11419625937216775,
"grad_norm": 4.577069084019305,
"learning_rate": 9.980858641863415e-06,
"loss": 0.5266,
"step": 1386
},
{
"epoch": 0.11427865205569745,
"grad_norm": 3.7822421197101046,
"learning_rate": 9.980800276989802e-06,
"loss": 0.5155,
"step": 1387
},
{
"epoch": 0.11436104473922716,
"grad_norm": 6.354305643899875,
"learning_rate": 9.98074182344113e-06,
"loss": 0.5797,
"step": 1388
},
{
"epoch": 0.11444343742275685,
"grad_norm": 5.1062797154471085,
"learning_rate": 9.980683281218438e-06,
"loss": 0.3497,
"step": 1389
},
{
"epoch": 0.11452583010628656,
"grad_norm": 4.339769494165447,
"learning_rate": 9.980624650322772e-06,
"loss": 0.5299,
"step": 1390
},
{
"epoch": 0.11460822278981626,
"grad_norm": 7.573051401034229,
"learning_rate": 9.980565930755174e-06,
"loss": 0.681,
"step": 1391
},
{
"epoch": 0.11469061547334597,
"grad_norm": 14.775312547328694,
"learning_rate": 9.980507122516692e-06,
"loss": 0.7312,
"step": 1392
},
{
"epoch": 0.11477300815687567,
"grad_norm": 4.248998579788246,
"learning_rate": 9.980448225608369e-06,
"loss": 0.5819,
"step": 1393
},
{
"epoch": 0.11485540084040537,
"grad_norm": 4.820974256874798,
"learning_rate": 9.980389240031256e-06,
"loss": 0.3988,
"step": 1394
},
{
"epoch": 0.11493779352393507,
"grad_norm": 5.3611932668737134,
"learning_rate": 9.980330165786403e-06,
"loss": 0.553,
"step": 1395
},
{
"epoch": 0.11502018620746478,
"grad_norm": 4.803109353772144,
"learning_rate": 9.98027100287486e-06,
"loss": 0.4254,
"step": 1396
},
{
"epoch": 0.11510257889099448,
"grad_norm": 6.166043944368681,
"learning_rate": 9.980211751297682e-06,
"loss": 0.6435,
"step": 1397
},
{
"epoch": 0.11518497157452418,
"grad_norm": 6.321449333735212,
"learning_rate": 9.980152411055923e-06,
"loss": 0.5901,
"step": 1398
},
{
"epoch": 0.11526736425805388,
"grad_norm": 3.9058938395757736,
"learning_rate": 9.980092982150641e-06,
"loss": 0.4481,
"step": 1399
},
{
"epoch": 0.11534975694158359,
"grad_norm": 7.060663282013449,
"learning_rate": 9.980033464582892e-06,
"loss": 0.7435,
"step": 1400
},
{
"epoch": 0.11543214962511329,
"grad_norm": 5.044696423020422,
"learning_rate": 9.979973858353738e-06,
"loss": 0.4583,
"step": 1401
},
{
"epoch": 0.115514542308643,
"grad_norm": 4.010481581383983,
"learning_rate": 9.979914163464237e-06,
"loss": 0.4307,
"step": 1402
},
{
"epoch": 0.11559693499217269,
"grad_norm": 4.715002567821283,
"learning_rate": 9.979854379915454e-06,
"loss": 0.4067,
"step": 1403
},
{
"epoch": 0.1156793276757024,
"grad_norm": 5.095282676387148,
"learning_rate": 9.979794507708453e-06,
"loss": 0.5319,
"step": 1404
},
{
"epoch": 0.1157617203592321,
"grad_norm": 5.434641667200585,
"learning_rate": 9.979734546844301e-06,
"loss": 0.5371,
"step": 1405
},
{
"epoch": 0.1158441130427618,
"grad_norm": 5.864807571595335,
"learning_rate": 9.979674497324063e-06,
"loss": 0.7502,
"step": 1406
},
{
"epoch": 0.11592650572629151,
"grad_norm": 3.931855774749118,
"learning_rate": 9.979614359148809e-06,
"loss": 0.4857,
"step": 1407
},
{
"epoch": 0.1160088984098212,
"grad_norm": 4.415958936160234,
"learning_rate": 9.97955413231961e-06,
"loss": 0.4143,
"step": 1408
},
{
"epoch": 0.11609129109335091,
"grad_norm": 5.477681605181166,
"learning_rate": 9.97949381683754e-06,
"loss": 0.5853,
"step": 1409
},
{
"epoch": 0.11617368377688062,
"grad_norm": 5.394465919333391,
"learning_rate": 9.97943341270367e-06,
"loss": 0.6864,
"step": 1410
},
{
"epoch": 0.11625607646041032,
"grad_norm": 7.0540860642905825,
"learning_rate": 9.979372919919077e-06,
"loss": 0.5353,
"step": 1411
},
{
"epoch": 0.11633846914394001,
"grad_norm": 4.457918745669958,
"learning_rate": 9.979312338484837e-06,
"loss": 0.5332,
"step": 1412
},
{
"epoch": 0.11642086182746972,
"grad_norm": 4.318862068781239,
"learning_rate": 9.979251668402027e-06,
"loss": 0.5383,
"step": 1413
},
{
"epoch": 0.11650325451099942,
"grad_norm": 4.15910796680108,
"learning_rate": 9.979190909671732e-06,
"loss": 0.3852,
"step": 1414
},
{
"epoch": 0.11658564719452913,
"grad_norm": 4.409347363114283,
"learning_rate": 9.97913006229503e-06,
"loss": 0.5274,
"step": 1415
},
{
"epoch": 0.11666803987805884,
"grad_norm": 3.860140816734429,
"learning_rate": 9.979069126273006e-06,
"loss": 0.3016,
"step": 1416
},
{
"epoch": 0.11675043256158853,
"grad_norm": 4.05973495912906,
"learning_rate": 9.979008101606743e-06,
"loss": 0.5707,
"step": 1417
},
{
"epoch": 0.11683282524511823,
"grad_norm": 5.335313848877938,
"learning_rate": 9.978946988297329e-06,
"loss": 0.4813,
"step": 1418
},
{
"epoch": 0.11691521792864794,
"grad_norm": 5.075262879615677,
"learning_rate": 9.978885786345851e-06,
"loss": 0.463,
"step": 1419
},
{
"epoch": 0.11699761061217764,
"grad_norm": 5.215466147778947,
"learning_rate": 9.978824495753399e-06,
"loss": 0.5398,
"step": 1420
},
{
"epoch": 0.11708000329570734,
"grad_norm": 10.408226861855583,
"learning_rate": 9.978763116521065e-06,
"loss": 0.7895,
"step": 1421
},
{
"epoch": 0.11716239597923704,
"grad_norm": 3.486562350465299,
"learning_rate": 9.97870164864994e-06,
"loss": 0.5399,
"step": 1422
},
{
"epoch": 0.11724478866276675,
"grad_norm": 3.263975256020736,
"learning_rate": 9.97864009214112e-06,
"loss": 0.2916,
"step": 1423
},
{
"epoch": 0.11732718134629645,
"grad_norm": 3.687714872193806,
"learning_rate": 9.9785784469957e-06,
"loss": 0.304,
"step": 1424
},
{
"epoch": 0.11740957402982616,
"grad_norm": 5.509829107986992,
"learning_rate": 9.978516713214779e-06,
"loss": 0.5531,
"step": 1425
},
{
"epoch": 0.11749196671335585,
"grad_norm": 3.5921556435301474,
"learning_rate": 9.978454890799453e-06,
"loss": 0.3016,
"step": 1426
},
{
"epoch": 0.11757435939688556,
"grad_norm": 4.173814176850804,
"learning_rate": 9.978392979750825e-06,
"loss": 0.4396,
"step": 1427
},
{
"epoch": 0.11765675208041526,
"grad_norm": 33.23788817879558,
"learning_rate": 9.978330980069996e-06,
"loss": 1.1389,
"step": 1428
},
{
"epoch": 0.11773914476394497,
"grad_norm": 5.852780126730159,
"learning_rate": 9.978268891758072e-06,
"loss": 0.6514,
"step": 1429
},
{
"epoch": 0.11782153744747466,
"grad_norm": 4.939407725307317,
"learning_rate": 9.978206714816156e-06,
"loss": 0.7286,
"step": 1430
},
{
"epoch": 0.11790393013100436,
"grad_norm": 5.03926308808164,
"learning_rate": 9.978144449245357e-06,
"loss": 0.3388,
"step": 1431
},
{
"epoch": 0.11798632281453407,
"grad_norm": 6.049230382777763,
"learning_rate": 9.978082095046781e-06,
"loss": 0.4871,
"step": 1432
},
{
"epoch": 0.11806871549806378,
"grad_norm": 3.489939110522026,
"learning_rate": 9.978019652221543e-06,
"loss": 0.2893,
"step": 1433
},
{
"epoch": 0.11815110818159348,
"grad_norm": 5.028102433756064,
"learning_rate": 9.977957120770748e-06,
"loss": 0.5076,
"step": 1434
},
{
"epoch": 0.11823350086512317,
"grad_norm": 5.274453037216691,
"learning_rate": 9.977894500695512e-06,
"loss": 0.3798,
"step": 1435
},
{
"epoch": 0.11831589354865288,
"grad_norm": 4.751680688931771,
"learning_rate": 9.977831791996952e-06,
"loss": 0.4783,
"step": 1436
},
{
"epoch": 0.11839828623218258,
"grad_norm": 3.5749837950342274,
"learning_rate": 9.977768994676181e-06,
"loss": 0.3682,
"step": 1437
},
{
"epoch": 0.11848067891571229,
"grad_norm": 3.671226493661603,
"learning_rate": 9.97770610873432e-06,
"loss": 0.347,
"step": 1438
},
{
"epoch": 0.11856307159924198,
"grad_norm": 6.196939033251727,
"learning_rate": 9.977643134172487e-06,
"loss": 0.5274,
"step": 1439
},
{
"epoch": 0.11864546428277169,
"grad_norm": 5.2089347723562085,
"learning_rate": 9.977580070991804e-06,
"loss": 0.5175,
"step": 1440
},
{
"epoch": 0.11872785696630139,
"grad_norm": 6.334013108356391,
"learning_rate": 9.977516919193393e-06,
"loss": 0.6194,
"step": 1441
},
{
"epoch": 0.1188102496498311,
"grad_norm": 7.306839104821746,
"learning_rate": 9.977453678778379e-06,
"loss": 0.7709,
"step": 1442
},
{
"epoch": 0.1188926423333608,
"grad_norm": 4.519396435491379,
"learning_rate": 9.977390349747886e-06,
"loss": 0.4418,
"step": 1443
},
{
"epoch": 0.1189750350168905,
"grad_norm": 4.698209909244438,
"learning_rate": 9.977326932103044e-06,
"loss": 0.535,
"step": 1444
},
{
"epoch": 0.1190574277004202,
"grad_norm": 4.8932208776361446,
"learning_rate": 9.977263425844981e-06,
"loss": 0.5942,
"step": 1445
},
{
"epoch": 0.11913982038394991,
"grad_norm": 5.026667743592898,
"learning_rate": 9.977199830974826e-06,
"loss": 0.547,
"step": 1446
},
{
"epoch": 0.11922221306747961,
"grad_norm": 5.23253291298616,
"learning_rate": 9.977136147493715e-06,
"loss": 0.5333,
"step": 1447
},
{
"epoch": 0.1193046057510093,
"grad_norm": 4.719053095707587,
"learning_rate": 9.97707237540278e-06,
"loss": 0.5265,
"step": 1448
},
{
"epoch": 0.11938699843453901,
"grad_norm": 4.625540723780489,
"learning_rate": 9.977008514703153e-06,
"loss": 0.5827,
"step": 1449
},
{
"epoch": 0.11946939111806872,
"grad_norm": 3.856805995552182,
"learning_rate": 9.976944565395976e-06,
"loss": 0.6053,
"step": 1450
},
{
"epoch": 0.11955178380159842,
"grad_norm": 3.330853507113743,
"learning_rate": 9.976880527482385e-06,
"loss": 0.5254,
"step": 1451
},
{
"epoch": 0.11963417648512813,
"grad_norm": 4.459173017435613,
"learning_rate": 9.97681640096352e-06,
"loss": 0.4728,
"step": 1452
},
{
"epoch": 0.11971656916865782,
"grad_norm": 7.018979668301613,
"learning_rate": 9.976752185840524e-06,
"loss": 0.6592,
"step": 1453
},
{
"epoch": 0.11979896185218752,
"grad_norm": 4.695286668122802,
"learning_rate": 9.976687882114538e-06,
"loss": 0.2927,
"step": 1454
},
{
"epoch": 0.11988135453571723,
"grad_norm": 6.372078077902412,
"learning_rate": 9.976623489786708e-06,
"loss": 0.7203,
"step": 1455
},
{
"epoch": 0.11996374721924694,
"grad_norm": 13.058182473303905,
"learning_rate": 9.976559008858182e-06,
"loss": 0.8304,
"step": 1456
},
{
"epoch": 0.12004613990277663,
"grad_norm": 3.750428817875247,
"learning_rate": 9.976494439330106e-06,
"loss": 0.4544,
"step": 1457
},
{
"epoch": 0.12012853258630633,
"grad_norm": 5.714795325590294,
"learning_rate": 9.976429781203631e-06,
"loss": 0.5764,
"step": 1458
},
{
"epoch": 0.12021092526983604,
"grad_norm": 6.249017860839444,
"learning_rate": 9.976365034479907e-06,
"loss": 0.6097,
"step": 1459
},
{
"epoch": 0.12029331795336574,
"grad_norm": 4.366534697481048,
"learning_rate": 9.976300199160087e-06,
"loss": 0.5872,
"step": 1460
},
{
"epoch": 0.12037571063689545,
"grad_norm": 8.297730150263703,
"learning_rate": 9.976235275245325e-06,
"loss": 0.6445,
"step": 1461
},
{
"epoch": 0.12045810332042514,
"grad_norm": 3.0791550000816437,
"learning_rate": 9.976170262736777e-06,
"loss": 0.3265,
"step": 1462
},
{
"epoch": 0.12054049600395485,
"grad_norm": 4.587962090590767,
"learning_rate": 9.9761051616356e-06,
"loss": 0.4655,
"step": 1463
},
{
"epoch": 0.12062288868748455,
"grad_norm": 4.083352587701113,
"learning_rate": 9.976039971942955e-06,
"loss": 0.5081,
"step": 1464
},
{
"epoch": 0.12070528137101426,
"grad_norm": 19.36776574624167,
"learning_rate": 9.97597469366e-06,
"loss": 0.8676,
"step": 1465
},
{
"epoch": 0.12078767405454395,
"grad_norm": 13.102581672872773,
"learning_rate": 9.975909326787898e-06,
"loss": 0.6868,
"step": 1466
},
{
"epoch": 0.12087006673807366,
"grad_norm": 4.961145426331136,
"learning_rate": 9.975843871327815e-06,
"loss": 0.5178,
"step": 1467
},
{
"epoch": 0.12095245942160336,
"grad_norm": 3.5357157810853512,
"learning_rate": 9.975778327280914e-06,
"loss": 0.2661,
"step": 1468
},
{
"epoch": 0.12103485210513307,
"grad_norm": 3.0027106887596036,
"learning_rate": 9.97571269464836e-06,
"loss": 0.2429,
"step": 1469
},
{
"epoch": 0.12111724478866277,
"grad_norm": 4.596992981690704,
"learning_rate": 9.975646973431326e-06,
"loss": 0.4726,
"step": 1470
},
{
"epoch": 0.12119963747219246,
"grad_norm": 6.690419035435042,
"learning_rate": 9.975581163630981e-06,
"loss": 0.7149,
"step": 1471
},
{
"epoch": 0.12128203015572217,
"grad_norm": 5.306103849360317,
"learning_rate": 9.975515265248493e-06,
"loss": 0.5719,
"step": 1472
},
{
"epoch": 0.12136442283925188,
"grad_norm": 5.095282694288974,
"learning_rate": 9.975449278285038e-06,
"loss": 0.5793,
"step": 1473
},
{
"epoch": 0.12144681552278158,
"grad_norm": 5.007439354366816,
"learning_rate": 9.975383202741793e-06,
"loss": 0.5795,
"step": 1474
},
{
"epoch": 0.12152920820631127,
"grad_norm": 4.86506910122601,
"learning_rate": 9.97531703861993e-06,
"loss": 0.4545,
"step": 1475
},
{
"epoch": 0.12161160088984098,
"grad_norm": 12.362020895190154,
"learning_rate": 9.975250785920629e-06,
"loss": 0.6358,
"step": 1476
},
{
"epoch": 0.12169399357337068,
"grad_norm": 5.088274293351666,
"learning_rate": 9.97518444464507e-06,
"loss": 0.4832,
"step": 1477
},
{
"epoch": 0.12177638625690039,
"grad_norm": 4.852830296274398,
"learning_rate": 9.975118014794431e-06,
"loss": 0.4588,
"step": 1478
},
{
"epoch": 0.1218587789404301,
"grad_norm": 5.076993779434122,
"learning_rate": 9.975051496369899e-06,
"loss": 0.5489,
"step": 1479
},
{
"epoch": 0.12194117162395979,
"grad_norm": 4.492297833507457,
"learning_rate": 9.974984889372658e-06,
"loss": 0.3727,
"step": 1480
},
{
"epoch": 0.1220235643074895,
"grad_norm": 4.969531949366711,
"learning_rate": 9.97491819380389e-06,
"loss": 0.6192,
"step": 1481
},
{
"epoch": 0.1221059569910192,
"grad_norm": 5.5171550464058985,
"learning_rate": 9.974851409664786e-06,
"loss": 0.5168,
"step": 1482
},
{
"epoch": 0.1221883496745489,
"grad_norm": 4.549692286167438,
"learning_rate": 9.974784536956533e-06,
"loss": 0.5461,
"step": 1483
},
{
"epoch": 0.1222707423580786,
"grad_norm": 4.638601246264793,
"learning_rate": 9.974717575680321e-06,
"loss": 0.5586,
"step": 1484
},
{
"epoch": 0.1223531350416083,
"grad_norm": 3.9934881662408617,
"learning_rate": 9.974650525837345e-06,
"loss": 0.4364,
"step": 1485
},
{
"epoch": 0.12243552772513801,
"grad_norm": 8.227489553944196,
"learning_rate": 9.974583387428797e-06,
"loss": 0.8104,
"step": 1486
},
{
"epoch": 0.12251792040866771,
"grad_norm": 6.644787549148226,
"learning_rate": 9.974516160455872e-06,
"loss": 0.4537,
"step": 1487
},
{
"epoch": 0.12260031309219742,
"grad_norm": 4.325870512007751,
"learning_rate": 9.974448844919766e-06,
"loss": 0.4874,
"step": 1488
},
{
"epoch": 0.12268270577572711,
"grad_norm": 6.081726074936586,
"learning_rate": 9.97438144082168e-06,
"loss": 0.3875,
"step": 1489
},
{
"epoch": 0.12276509845925682,
"grad_norm": 4.548854894101285,
"learning_rate": 9.974313948162812e-06,
"loss": 0.5696,
"step": 1490
},
{
"epoch": 0.12284749114278652,
"grad_norm": 9.255791845151686,
"learning_rate": 9.974246366944364e-06,
"loss": 0.9999,
"step": 1491
},
{
"epoch": 0.12292988382631623,
"grad_norm": 6.508244913389245,
"learning_rate": 9.97417869716754e-06,
"loss": 0.6256,
"step": 1492
},
{
"epoch": 0.12301227650984592,
"grad_norm": 2.9004728725960067,
"learning_rate": 9.974110938833545e-06,
"loss": 0.2222,
"step": 1493
},
{
"epoch": 0.12309466919337562,
"grad_norm": 6.840818959886781,
"learning_rate": 9.974043091943584e-06,
"loss": 0.6488,
"step": 1494
},
{
"epoch": 0.12317706187690533,
"grad_norm": 4.564889329777813,
"learning_rate": 9.973975156498866e-06,
"loss": 0.4834,
"step": 1495
},
{
"epoch": 0.12325945456043504,
"grad_norm": 6.124262779948301,
"learning_rate": 9.973907132500597e-06,
"loss": 0.6345,
"step": 1496
},
{
"epoch": 0.12334184724396474,
"grad_norm": 4.143149551754576,
"learning_rate": 9.973839019949994e-06,
"loss": 0.5449,
"step": 1497
},
{
"epoch": 0.12342423992749443,
"grad_norm": 4.692433215382461,
"learning_rate": 9.973770818848265e-06,
"loss": 0.381,
"step": 1498
},
{
"epoch": 0.12350663261102414,
"grad_norm": 4.558902674629272,
"learning_rate": 9.973702529196627e-06,
"loss": 0.4342,
"step": 1499
},
{
"epoch": 0.12358902529455384,
"grad_norm": 4.544800628848942,
"learning_rate": 9.973634150996291e-06,
"loss": 0.3499,
"step": 1500
},
{
"epoch": 0.12367141797808355,
"grad_norm": 5.059581470168767,
"learning_rate": 9.973565684248483e-06,
"loss": 0.5135,
"step": 1501
},
{
"epoch": 0.12375381066161324,
"grad_norm": 4.258353140119297,
"learning_rate": 9.973497128954414e-06,
"loss": 0.269,
"step": 1502
},
{
"epoch": 0.12383620334514295,
"grad_norm": 7.132848659419424,
"learning_rate": 9.973428485115308e-06,
"loss": 0.6726,
"step": 1503
},
{
"epoch": 0.12391859602867265,
"grad_norm": 3.5983862118163277,
"learning_rate": 9.973359752732386e-06,
"loss": 0.4669,
"step": 1504
},
{
"epoch": 0.12400098871220236,
"grad_norm": 3.448183524414553,
"learning_rate": 9.973290931806874e-06,
"loss": 0.1703,
"step": 1505
},
{
"epoch": 0.12408338139573206,
"grad_norm": 3.0366884309895794,
"learning_rate": 9.973222022339992e-06,
"loss": 0.2643,
"step": 1506
},
{
"epoch": 0.12416577407926176,
"grad_norm": 4.8670408538461745,
"learning_rate": 9.973153024332974e-06,
"loss": 0.2684,
"step": 1507
},
{
"epoch": 0.12424816676279146,
"grad_norm": 5.9026783007837205,
"learning_rate": 9.973083937787042e-06,
"loss": 0.5869,
"step": 1508
},
{
"epoch": 0.12433055944632117,
"grad_norm": 6.025970687624214,
"learning_rate": 9.973014762703429e-06,
"loss": 0.4191,
"step": 1509
},
{
"epoch": 0.12441295212985087,
"grad_norm": 6.51904193753321,
"learning_rate": 9.972945499083366e-06,
"loss": 0.7139,
"step": 1510
},
{
"epoch": 0.12449534481338057,
"grad_norm": 6.684992245083261,
"learning_rate": 9.972876146928088e-06,
"loss": 0.6404,
"step": 1511
},
{
"epoch": 0.12457773749691027,
"grad_norm": 5.001971574274414,
"learning_rate": 9.972806706238826e-06,
"loss": 0.4946,
"step": 1512
},
{
"epoch": 0.12466013018043998,
"grad_norm": 6.7337019014227355,
"learning_rate": 9.97273717701682e-06,
"loss": 0.5832,
"step": 1513
},
{
"epoch": 0.12474252286396968,
"grad_norm": 5.220718061812807,
"learning_rate": 9.972667559263305e-06,
"loss": 0.5482,
"step": 1514
},
{
"epoch": 0.12482491554749939,
"grad_norm": 5.150466794347032,
"learning_rate": 9.97259785297952e-06,
"loss": 0.6143,
"step": 1515
},
{
"epoch": 0.12490730823102908,
"grad_norm": 6.537060800514755,
"learning_rate": 9.972528058166711e-06,
"loss": 0.6394,
"step": 1516
},
{
"epoch": 0.12498970091455879,
"grad_norm": 4.955150662710675,
"learning_rate": 9.972458174826115e-06,
"loss": 0.5091,
"step": 1517
},
{
"epoch": 0.1250720935980885,
"grad_norm": 4.842000273038944,
"learning_rate": 9.972388202958977e-06,
"loss": 0.4601,
"step": 1518
},
{
"epoch": 0.12515448628161818,
"grad_norm": 4.184733000664414,
"learning_rate": 9.972318142566547e-06,
"loss": 0.4797,
"step": 1519
},
{
"epoch": 0.1252368789651479,
"grad_norm": 5.585543182543029,
"learning_rate": 9.972247993650067e-06,
"loss": 0.6223,
"step": 1520
},
{
"epoch": 0.1253192716486776,
"grad_norm": 5.879739754393376,
"learning_rate": 9.97217775621079e-06,
"loss": 0.628,
"step": 1521
},
{
"epoch": 0.1254016643322073,
"grad_norm": 11.002403812555526,
"learning_rate": 9.972107430249963e-06,
"loss": 0.553,
"step": 1522
},
{
"epoch": 0.125484057015737,
"grad_norm": 6.203040642309966,
"learning_rate": 9.972037015768841e-06,
"loss": 0.7279,
"step": 1523
},
{
"epoch": 0.1255664496992667,
"grad_norm": 7.577934526871955,
"learning_rate": 9.971966512768677e-06,
"loss": 0.538,
"step": 1524
},
{
"epoch": 0.12564884238279642,
"grad_norm": 4.019605674039673,
"learning_rate": 9.971895921250723e-06,
"loss": 0.5441,
"step": 1525
},
{
"epoch": 0.1257312350663261,
"grad_norm": 5.764658658433788,
"learning_rate": 9.97182524121624e-06,
"loss": 0.6654,
"step": 1526
},
{
"epoch": 0.1258136277498558,
"grad_norm": 4.945527867426451,
"learning_rate": 9.971754472666484e-06,
"loss": 0.514,
"step": 1527
},
{
"epoch": 0.12589602043338552,
"grad_norm": 6.0441278677454005,
"learning_rate": 9.971683615602716e-06,
"loss": 0.4181,
"step": 1528
},
{
"epoch": 0.1259784131169152,
"grad_norm": 14.25711273437809,
"learning_rate": 9.971612670026196e-06,
"loss": 0.6254,
"step": 1529
},
{
"epoch": 0.12606080580044493,
"grad_norm": 4.700273765258455,
"learning_rate": 9.97154163593819e-06,
"loss": 0.6229,
"step": 1530
},
{
"epoch": 0.12614319848397462,
"grad_norm": 3.3864821654511945,
"learning_rate": 9.97147051333996e-06,
"loss": 0.399,
"step": 1531
},
{
"epoch": 0.12622559116750431,
"grad_norm": 5.12783193960563,
"learning_rate": 9.971399302232772e-06,
"loss": 0.5427,
"step": 1532
},
{
"epoch": 0.12630798385103403,
"grad_norm": 4.931238238343592,
"learning_rate": 9.971328002617895e-06,
"loss": 0.2851,
"step": 1533
},
{
"epoch": 0.12639037653456373,
"grad_norm": 5.285172145335731,
"learning_rate": 9.971256614496598e-06,
"loss": 0.5647,
"step": 1534
},
{
"epoch": 0.12647276921809344,
"grad_norm": 4.744357889613329,
"learning_rate": 9.971185137870155e-06,
"loss": 0.5237,
"step": 1535
},
{
"epoch": 0.12655516190162314,
"grad_norm": 8.652911164552046,
"learning_rate": 9.971113572739832e-06,
"loss": 0.8313,
"step": 1536
},
{
"epoch": 0.12663755458515283,
"grad_norm": 5.869655307998774,
"learning_rate": 9.971041919106908e-06,
"loss": 0.5802,
"step": 1537
},
{
"epoch": 0.12671994726868255,
"grad_norm": 6.3300754330457,
"learning_rate": 9.970970176972658e-06,
"loss": 0.6885,
"step": 1538
},
{
"epoch": 0.12680233995221224,
"grad_norm": 7.1758070025576055,
"learning_rate": 9.970898346338358e-06,
"loss": 0.6389,
"step": 1539
},
{
"epoch": 0.12688473263574196,
"grad_norm": 12.961913535688998,
"learning_rate": 9.970826427205287e-06,
"loss": 0.6393,
"step": 1540
},
{
"epoch": 0.12696712531927165,
"grad_norm": 5.401262177099866,
"learning_rate": 9.970754419574728e-06,
"loss": 0.5455,
"step": 1541
},
{
"epoch": 0.12704951800280134,
"grad_norm": 3.3340002399513,
"learning_rate": 9.970682323447959e-06,
"loss": 0.2345,
"step": 1542
},
{
"epoch": 0.12713191068633106,
"grad_norm": 8.894631262296059,
"learning_rate": 9.970610138826267e-06,
"loss": 0.7767,
"step": 1543
},
{
"epoch": 0.12721430336986075,
"grad_norm": 4.952958216270172,
"learning_rate": 9.970537865710934e-06,
"loss": 0.2786,
"step": 1544
},
{
"epoch": 0.12729669605339045,
"grad_norm": 2.6515561410684194,
"learning_rate": 9.970465504103249e-06,
"loss": 0.2038,
"step": 1545
},
{
"epoch": 0.12737908873692017,
"grad_norm": 5.298892635444798,
"learning_rate": 9.9703930540045e-06,
"loss": 0.4559,
"step": 1546
},
{
"epoch": 0.12746148142044986,
"grad_norm": 4.197621921991316,
"learning_rate": 9.970320515415974e-06,
"loss": 0.4502,
"step": 1547
},
{
"epoch": 0.12754387410397958,
"grad_norm": 4.800228256482724,
"learning_rate": 9.970247888338966e-06,
"loss": 0.5957,
"step": 1548
},
{
"epoch": 0.12762626678750927,
"grad_norm": 4.815816035104436,
"learning_rate": 9.970175172774768e-06,
"loss": 0.5874,
"step": 1549
},
{
"epoch": 0.12770865947103896,
"grad_norm": 6.545724686124551,
"learning_rate": 9.970102368724675e-06,
"loss": 0.5925,
"step": 1550
},
{
"epoch": 0.12779105215456868,
"grad_norm": 5.121538774261143,
"learning_rate": 9.970029476189984e-06,
"loss": 0.5063,
"step": 1551
},
{
"epoch": 0.12787344483809837,
"grad_norm": 5.7759067014268135,
"learning_rate": 9.969956495171989e-06,
"loss": 0.6866,
"step": 1552
},
{
"epoch": 0.1279558375216281,
"grad_norm": 6.588677492190531,
"learning_rate": 9.96988342567199e-06,
"loss": 0.7341,
"step": 1553
},
{
"epoch": 0.12803823020515778,
"grad_norm": 6.843692520636973,
"learning_rate": 9.969810267691293e-06,
"loss": 0.7034,
"step": 1554
},
{
"epoch": 0.12812062288868747,
"grad_norm": 5.436070981898065,
"learning_rate": 9.969737021231196e-06,
"loss": 0.4046,
"step": 1555
},
{
"epoch": 0.1282030155722172,
"grad_norm": 15.426453647795194,
"learning_rate": 9.969663686293003e-06,
"loss": 0.8425,
"step": 1556
},
{
"epoch": 0.12828540825574689,
"grad_norm": 6.468281649010783,
"learning_rate": 9.969590262878021e-06,
"loss": 0.5969,
"step": 1557
},
{
"epoch": 0.1283678009392766,
"grad_norm": 8.361371730974474,
"learning_rate": 9.969516750987558e-06,
"loss": 0.6787,
"step": 1558
},
{
"epoch": 0.1284501936228063,
"grad_norm": 5.3302943640173925,
"learning_rate": 9.969443150622921e-06,
"loss": 0.5459,
"step": 1559
},
{
"epoch": 0.128532586306336,
"grad_norm": 4.633429138093913,
"learning_rate": 9.96936946178542e-06,
"loss": 0.649,
"step": 1560
},
{
"epoch": 0.1286149789898657,
"grad_norm": 8.13100570319315,
"learning_rate": 9.96929568447637e-06,
"loss": 0.7892,
"step": 1561
},
{
"epoch": 0.1286973716733954,
"grad_norm": 5.039540157664388,
"learning_rate": 9.96922181869708e-06,
"loss": 0.5216,
"step": 1562
},
{
"epoch": 0.1287797643569251,
"grad_norm": 3.6431682961361633,
"learning_rate": 9.969147864448867e-06,
"loss": 0.2595,
"step": 1563
},
{
"epoch": 0.1288621570404548,
"grad_norm": 5.4887751624743375,
"learning_rate": 9.96907382173305e-06,
"loss": 0.5876,
"step": 1564
},
{
"epoch": 0.1289445497239845,
"grad_norm": 4.129305542648151,
"learning_rate": 9.968999690550945e-06,
"loss": 0.567,
"step": 1565
},
{
"epoch": 0.12902694240751422,
"grad_norm": 4.6202359784395695,
"learning_rate": 9.96892547090387e-06,
"loss": 0.3541,
"step": 1566
},
{
"epoch": 0.12910933509104391,
"grad_norm": 5.530337251130461,
"learning_rate": 9.968851162793149e-06,
"loss": 0.4995,
"step": 1567
},
{
"epoch": 0.1291917277745736,
"grad_norm": 6.098832238315356,
"learning_rate": 9.968776766220105e-06,
"loss": 0.6462,
"step": 1568
},
{
"epoch": 0.12927412045810333,
"grad_norm": 36.110655337933196,
"learning_rate": 9.968702281186062e-06,
"loss": 1.8144,
"step": 1569
},
{
"epoch": 0.12935651314163302,
"grad_norm": 3.73141968324195,
"learning_rate": 9.968627707692345e-06,
"loss": 0.4626,
"step": 1570
},
{
"epoch": 0.12943890582516274,
"grad_norm": 5.556066968723632,
"learning_rate": 9.968553045740283e-06,
"loss": 0.4893,
"step": 1571
},
{
"epoch": 0.12952129850869243,
"grad_norm": 3.5534727840085303,
"learning_rate": 9.968478295331206e-06,
"loss": 0.3499,
"step": 1572
},
{
"epoch": 0.12960369119222212,
"grad_norm": 5.954589534858549,
"learning_rate": 9.96840345646644e-06,
"loss": 0.7069,
"step": 1573
},
{
"epoch": 0.12968608387575184,
"grad_norm": 5.504329435904227,
"learning_rate": 9.968328529147324e-06,
"loss": 0.5345,
"step": 1574
},
{
"epoch": 0.12976847655928153,
"grad_norm": 4.780089942398109,
"learning_rate": 9.968253513375187e-06,
"loss": 0.6211,
"step": 1575
},
{
"epoch": 0.12985086924281125,
"grad_norm": 3.6400688935977183,
"learning_rate": 9.968178409151368e-06,
"loss": 0.4675,
"step": 1576
},
{
"epoch": 0.12993326192634094,
"grad_norm": 5.161844362296579,
"learning_rate": 9.968103216477203e-06,
"loss": 0.5463,
"step": 1577
},
{
"epoch": 0.13001565460987063,
"grad_norm": 6.01338401708055,
"learning_rate": 9.968027935354029e-06,
"loss": 0.5191,
"step": 1578
},
{
"epoch": 0.13009804729340035,
"grad_norm": 4.717446966537538,
"learning_rate": 9.967952565783188e-06,
"loss": 0.5651,
"step": 1579
},
{
"epoch": 0.13018043997693005,
"grad_norm": 5.2385252642930125,
"learning_rate": 9.96787710776602e-06,
"loss": 0.5444,
"step": 1580
},
{
"epoch": 0.13026283266045974,
"grad_norm": 4.091715325329443,
"learning_rate": 9.967801561303871e-06,
"loss": 0.4193,
"step": 1581
},
{
"epoch": 0.13034522534398946,
"grad_norm": 7.045107352433902,
"learning_rate": 9.967725926398086e-06,
"loss": 0.4062,
"step": 1582
},
{
"epoch": 0.13042761802751915,
"grad_norm": 4.770986545998961,
"learning_rate": 9.967650203050007e-06,
"loss": 0.5442,
"step": 1583
},
{
"epoch": 0.13051001071104887,
"grad_norm": 4.436604203751775,
"learning_rate": 9.967574391260988e-06,
"loss": 0.4965,
"step": 1584
},
{
"epoch": 0.13059240339457856,
"grad_norm": 5.81892153360883,
"learning_rate": 9.967498491032376e-06,
"loss": 0.5432,
"step": 1585
},
{
"epoch": 0.13067479607810825,
"grad_norm": 6.625865132029466,
"learning_rate": 9.967422502365523e-06,
"loss": 0.6075,
"step": 1586
},
{
"epoch": 0.13075718876163797,
"grad_norm": 5.357243836445354,
"learning_rate": 9.96734642526178e-06,
"loss": 0.5635,
"step": 1587
},
{
"epoch": 0.13083958144516766,
"grad_norm": 7.170092927283846,
"learning_rate": 9.9672702597225e-06,
"loss": 0.5262,
"step": 1588
},
{
"epoch": 0.13092197412869738,
"grad_norm": 5.58870075455088,
"learning_rate": 9.967194005749045e-06,
"loss": 0.5163,
"step": 1589
},
{
"epoch": 0.13100436681222707,
"grad_norm": 3.3901516521405113,
"learning_rate": 9.96711766334277e-06,
"loss": 0.3337,
"step": 1590
},
{
"epoch": 0.13108675949575677,
"grad_norm": 6.177430383120361,
"learning_rate": 9.967041232505032e-06,
"loss": 0.6221,
"step": 1591
},
{
"epoch": 0.13116915217928649,
"grad_norm": 3.409414315356522,
"learning_rate": 9.966964713237193e-06,
"loss": 0.4258,
"step": 1592
},
{
"epoch": 0.13125154486281618,
"grad_norm": 5.796187847217588,
"learning_rate": 9.966888105540615e-06,
"loss": 0.6415,
"step": 1593
},
{
"epoch": 0.1313339375463459,
"grad_norm": 4.297779577006258,
"learning_rate": 9.966811409416664e-06,
"loss": 0.353,
"step": 1594
},
{
"epoch": 0.1314163302298756,
"grad_norm": 8.019855525742495,
"learning_rate": 9.966734624866702e-06,
"loss": 0.6986,
"step": 1595
},
{
"epoch": 0.13149872291340528,
"grad_norm": 4.454829131276674,
"learning_rate": 9.966657751892099e-06,
"loss": 0.44,
"step": 1596
},
{
"epoch": 0.131581115596935,
"grad_norm": 5.627067380885521,
"learning_rate": 9.966580790494222e-06,
"loss": 0.5673,
"step": 1597
},
{
"epoch": 0.1316635082804647,
"grad_norm": 16.272684626405372,
"learning_rate": 9.96650374067444e-06,
"loss": 0.6624,
"step": 1598
},
{
"epoch": 0.13174590096399438,
"grad_norm": 4.793323279968309,
"learning_rate": 9.966426602434128e-06,
"loss": 0.543,
"step": 1599
},
{
"epoch": 0.1318282936475241,
"grad_norm": 5.433792865346555,
"learning_rate": 9.966349375774658e-06,
"loss": 0.5756,
"step": 1600
},
{
"epoch": 0.1319106863310538,
"grad_norm": 6.05169655830317,
"learning_rate": 9.966272060697403e-06,
"loss": 0.5257,
"step": 1601
},
{
"epoch": 0.13199307901458351,
"grad_norm": 28.334925324741096,
"learning_rate": 9.966194657203743e-06,
"loss": 0.7121,
"step": 1602
},
{
"epoch": 0.1320754716981132,
"grad_norm": 4.439485716182711,
"learning_rate": 9.966117165295053e-06,
"loss": 0.4213,
"step": 1603
},
{
"epoch": 0.1321578643816429,
"grad_norm": 4.634956482246697,
"learning_rate": 9.966039584972713e-06,
"loss": 0.5792,
"step": 1604
},
{
"epoch": 0.13224025706517262,
"grad_norm": 8.01022976032216,
"learning_rate": 9.965961916238105e-06,
"loss": 0.8657,
"step": 1605
},
{
"epoch": 0.1323226497487023,
"grad_norm": 4.263056023595695,
"learning_rate": 9.965884159092613e-06,
"loss": 0.5201,
"step": 1606
},
{
"epoch": 0.13240504243223203,
"grad_norm": 5.493539454273528,
"learning_rate": 9.965806313537618e-06,
"loss": 0.551,
"step": 1607
},
{
"epoch": 0.13248743511576172,
"grad_norm": 5.696550440916956,
"learning_rate": 9.965728379574508e-06,
"loss": 0.6679,
"step": 1608
},
{
"epoch": 0.1325698277992914,
"grad_norm": 5.757476810730032,
"learning_rate": 9.965650357204673e-06,
"loss": 0.347,
"step": 1609
},
{
"epoch": 0.13265222048282113,
"grad_norm": 3.9736532348314686,
"learning_rate": 9.965572246429498e-06,
"loss": 0.4657,
"step": 1610
},
{
"epoch": 0.13273461316635082,
"grad_norm": 5.014191491777652,
"learning_rate": 9.965494047250374e-06,
"loss": 0.5738,
"step": 1611
},
{
"epoch": 0.13281700584988054,
"grad_norm": 4.378594249950214,
"learning_rate": 9.965415759668696e-06,
"loss": 0.5816,
"step": 1612
},
{
"epoch": 0.13289939853341023,
"grad_norm": 4.370547607797723,
"learning_rate": 9.965337383685854e-06,
"loss": 0.4593,
"step": 1613
},
{
"epoch": 0.13298179121693993,
"grad_norm": 2.9688769844150844,
"learning_rate": 9.965258919303246e-06,
"loss": 0.4406,
"step": 1614
},
{
"epoch": 0.13306418390046965,
"grad_norm": 5.547068937775387,
"learning_rate": 9.965180366522269e-06,
"loss": 0.6537,
"step": 1615
},
{
"epoch": 0.13314657658399934,
"grad_norm": 6.145359120990056,
"learning_rate": 9.96510172534432e-06,
"loss": 0.5867,
"step": 1616
},
{
"epoch": 0.13322896926752906,
"grad_norm": 5.312644770705092,
"learning_rate": 9.9650229957708e-06,
"loss": 0.6586,
"step": 1617
},
{
"epoch": 0.13331136195105875,
"grad_norm": 5.562860466575434,
"learning_rate": 9.96494417780311e-06,
"loss": 0.5398,
"step": 1618
},
{
"epoch": 0.13339375463458844,
"grad_norm": 4.158707385172828,
"learning_rate": 9.964865271442656e-06,
"loss": 0.3144,
"step": 1619
},
{
"epoch": 0.13347614731811816,
"grad_norm": 4.431346560539185,
"learning_rate": 9.964786276690839e-06,
"loss": 0.4856,
"step": 1620
},
{
"epoch": 0.13355854000164785,
"grad_norm": 5.449051878207818,
"learning_rate": 9.964707193549069e-06,
"loss": 0.5363,
"step": 1621
},
{
"epoch": 0.13364093268517754,
"grad_norm": 6.425628126296071,
"learning_rate": 9.964628022018748e-06,
"loss": 0.7224,
"step": 1622
},
{
"epoch": 0.13372332536870726,
"grad_norm": 7.144634256191189,
"learning_rate": 9.964548762101293e-06,
"loss": 0.7207,
"step": 1623
},
{
"epoch": 0.13380571805223695,
"grad_norm": 5.800859450448251,
"learning_rate": 9.96446941379811e-06,
"loss": 0.6978,
"step": 1624
},
{
"epoch": 0.13388811073576667,
"grad_norm": 5.499598864277338,
"learning_rate": 9.964389977110613e-06,
"loss": 0.4624,
"step": 1625
},
{
"epoch": 0.13397050341929637,
"grad_norm": 4.9609394948407255,
"learning_rate": 9.964310452040216e-06,
"loss": 0.555,
"step": 1626
},
{
"epoch": 0.13405289610282606,
"grad_norm": 5.42629026019687,
"learning_rate": 9.964230838588336e-06,
"loss": 0.4247,
"step": 1627
},
{
"epoch": 0.13413528878635578,
"grad_norm": 4.866848300258573,
"learning_rate": 9.964151136756391e-06,
"loss": 0.5655,
"step": 1628
},
{
"epoch": 0.13421768146988547,
"grad_norm": 4.615561791061624,
"learning_rate": 9.964071346545796e-06,
"loss": 0.58,
"step": 1629
},
{
"epoch": 0.1343000741534152,
"grad_norm": 6.71642950132474,
"learning_rate": 9.963991467957977e-06,
"loss": 0.7631,
"step": 1630
},
{
"epoch": 0.13438246683694488,
"grad_norm": 4.315299600438297,
"learning_rate": 9.963911500994352e-06,
"loss": 0.5401,
"step": 1631
},
{
"epoch": 0.13446485952047457,
"grad_norm": 4.648644316335041,
"learning_rate": 9.963831445656345e-06,
"loss": 0.5922,
"step": 1632
},
{
"epoch": 0.1345472522040043,
"grad_norm": 3.5272454240753266,
"learning_rate": 9.96375130194538e-06,
"loss": 0.4294,
"step": 1633
},
{
"epoch": 0.13462964488753398,
"grad_norm": 4.157793884858299,
"learning_rate": 9.963671069862891e-06,
"loss": 0.3727,
"step": 1634
},
{
"epoch": 0.1347120375710637,
"grad_norm": 5.325914533957815,
"learning_rate": 9.9635907494103e-06,
"loss": 0.5745,
"step": 1635
},
{
"epoch": 0.1347944302545934,
"grad_norm": 5.069398707440809,
"learning_rate": 9.963510340589037e-06,
"loss": 0.706,
"step": 1636
},
{
"epoch": 0.1348768229381231,
"grad_norm": 28.812450237360544,
"learning_rate": 9.963429843400536e-06,
"loss": 0.7662,
"step": 1637
},
{
"epoch": 0.1349592156216528,
"grad_norm": 3.5444323522446624,
"learning_rate": 9.963349257846227e-06,
"loss": 0.524,
"step": 1638
},
{
"epoch": 0.1350416083051825,
"grad_norm": 3.5539689124915856,
"learning_rate": 9.963268583927549e-06,
"loss": 0.2982,
"step": 1639
},
{
"epoch": 0.1351240009887122,
"grad_norm": 5.938623235390135,
"learning_rate": 9.963187821645934e-06,
"loss": 0.7121,
"step": 1640
},
{
"epoch": 0.1352063936722419,
"grad_norm": 6.0354335604105565,
"learning_rate": 9.963106971002825e-06,
"loss": 0.4654,
"step": 1641
},
{
"epoch": 0.1352887863557716,
"grad_norm": 4.70397412329452,
"learning_rate": 9.963026031999657e-06,
"loss": 0.4274,
"step": 1642
},
{
"epoch": 0.13537117903930132,
"grad_norm": 10.030510740341603,
"learning_rate": 9.96294500463787e-06,
"loss": 0.946,
"step": 1643
},
{
"epoch": 0.135453571722831,
"grad_norm": 3.6329870853318207,
"learning_rate": 9.96286388891891e-06,
"loss": 0.3884,
"step": 1644
},
{
"epoch": 0.1355359644063607,
"grad_norm": 21.36243576092974,
"learning_rate": 9.962782684844222e-06,
"loss": 0.352,
"step": 1645
},
{
"epoch": 0.13561835708989042,
"grad_norm": 4.951295627734071,
"learning_rate": 9.962701392415248e-06,
"loss": 0.4897,
"step": 1646
},
{
"epoch": 0.13570074977342012,
"grad_norm": 8.046321604277672,
"learning_rate": 9.962620011633437e-06,
"loss": 0.535,
"step": 1647
},
{
"epoch": 0.13578314245694983,
"grad_norm": 6.575825142302053,
"learning_rate": 9.962538542500237e-06,
"loss": 0.309,
"step": 1648
},
{
"epoch": 0.13586553514047953,
"grad_norm": 6.94532275093602,
"learning_rate": 9.9624569850171e-06,
"loss": 0.6461,
"step": 1649
},
{
"epoch": 0.13594792782400922,
"grad_norm": 4.7357288915786855,
"learning_rate": 9.962375339185477e-06,
"loss": 0.4357,
"step": 1650
},
{
"epoch": 0.13603032050753894,
"grad_norm": 6.597949073770168,
"learning_rate": 9.962293605006824e-06,
"loss": 0.6975,
"step": 1651
},
{
"epoch": 0.13611271319106863,
"grad_norm": 4.15014398746701,
"learning_rate": 9.962211782482592e-06,
"loss": 0.3552,
"step": 1652
},
{
"epoch": 0.13619510587459835,
"grad_norm": 4.809601849964069,
"learning_rate": 9.962129871614238e-06,
"loss": 0.6046,
"step": 1653
},
{
"epoch": 0.13627749855812804,
"grad_norm": 4.127491331508017,
"learning_rate": 9.962047872403225e-06,
"loss": 0.4958,
"step": 1654
},
{
"epoch": 0.13635989124165773,
"grad_norm": 4.711346309289919,
"learning_rate": 9.961965784851008e-06,
"loss": 0.5391,
"step": 1655
},
{
"epoch": 0.13644228392518745,
"grad_norm": 5.386352678608556,
"learning_rate": 9.96188360895905e-06,
"loss": 0.5528,
"step": 1656
},
{
"epoch": 0.13652467660871714,
"grad_norm": 3.7171000317960674,
"learning_rate": 9.961801344728814e-06,
"loss": 0.3146,
"step": 1657
},
{
"epoch": 0.13660706929224684,
"grad_norm": 8.89069958620607,
"learning_rate": 9.961718992161766e-06,
"loss": 0.7587,
"step": 1658
},
{
"epoch": 0.13668946197577655,
"grad_norm": 4.697622743571729,
"learning_rate": 9.961636551259372e-06,
"loss": 0.5835,
"step": 1659
},
{
"epoch": 0.13677185465930625,
"grad_norm": 4.830696241299008,
"learning_rate": 9.961554022023096e-06,
"loss": 0.4971,
"step": 1660
},
{
"epoch": 0.13685424734283597,
"grad_norm": 6.104707702622647,
"learning_rate": 9.961471404454412e-06,
"loss": 0.7071,
"step": 1661
},
{
"epoch": 0.13693664002636566,
"grad_norm": 4.34172790300022,
"learning_rate": 9.961388698554788e-06,
"loss": 0.5556,
"step": 1662
},
{
"epoch": 0.13701903270989535,
"grad_norm": 4.298900363667948,
"learning_rate": 9.961305904325698e-06,
"loss": 0.5294,
"step": 1663
},
{
"epoch": 0.13710142539342507,
"grad_norm": 4.954490101460579,
"learning_rate": 9.961223021768616e-06,
"loss": 0.6465,
"step": 1664
},
{
"epoch": 0.13718381807695476,
"grad_norm": 3.726228481637675,
"learning_rate": 9.961140050885014e-06,
"loss": 0.631,
"step": 1665
},
{
"epoch": 0.13726621076048448,
"grad_norm": 4.358895727873771,
"learning_rate": 9.961056991676374e-06,
"loss": 0.3122,
"step": 1666
},
{
"epoch": 0.13734860344401417,
"grad_norm": 3.7045601167381945,
"learning_rate": 9.960973844144173e-06,
"loss": 0.4447,
"step": 1667
},
{
"epoch": 0.13743099612754386,
"grad_norm": 4.571967455353021,
"learning_rate": 9.960890608289892e-06,
"loss": 0.4863,
"step": 1668
},
{
"epoch": 0.13751338881107358,
"grad_norm": 5.0825669590129205,
"learning_rate": 9.96080728411501e-06,
"loss": 0.6621,
"step": 1669
},
{
"epoch": 0.13759578149460328,
"grad_norm": 5.845767088881154,
"learning_rate": 9.960723871621015e-06,
"loss": 0.7493,
"step": 1670
},
{
"epoch": 0.137678174178133,
"grad_norm": 13.232880223960649,
"learning_rate": 9.960640370809386e-06,
"loss": 0.8361,
"step": 1671
},
{
"epoch": 0.1377605668616627,
"grad_norm": 3.393345594707139,
"learning_rate": 9.960556781681617e-06,
"loss": 0.4943,
"step": 1672
},
{
"epoch": 0.13784295954519238,
"grad_norm": 4.815367039541761,
"learning_rate": 9.960473104239188e-06,
"loss": 0.5859,
"step": 1673
},
{
"epoch": 0.1379253522287221,
"grad_norm": 25.934815306172943,
"learning_rate": 9.960389338483595e-06,
"loss": 0.5897,
"step": 1674
},
{
"epoch": 0.1380077449122518,
"grad_norm": 24.48503833988488,
"learning_rate": 9.960305484416329e-06,
"loss": 0.4866,
"step": 1675
},
{
"epoch": 0.13809013759578148,
"grad_norm": 4.687506824918209,
"learning_rate": 9.96022154203888e-06,
"loss": 0.6256,
"step": 1676
},
{
"epoch": 0.1381725302793112,
"grad_norm": 4.850750355427269,
"learning_rate": 9.960137511352743e-06,
"loss": 0.6337,
"step": 1677
},
{
"epoch": 0.1382549229628409,
"grad_norm": 3.8668299030247524,
"learning_rate": 9.960053392359415e-06,
"loss": 0.469,
"step": 1678
},
{
"epoch": 0.1383373156463706,
"grad_norm": 6.853909013979112,
"learning_rate": 9.959969185060393e-06,
"loss": 0.6324,
"step": 1679
},
{
"epoch": 0.1384197083299003,
"grad_norm": 6.736126149335651,
"learning_rate": 9.959884889457176e-06,
"loss": 0.562,
"step": 1680
},
{
"epoch": 0.13850210101343,
"grad_norm": 3.2727359868331862,
"learning_rate": 9.959800505551266e-06,
"loss": 0.3456,
"step": 1681
},
{
"epoch": 0.13858449369695972,
"grad_norm": 5.234071812547031,
"learning_rate": 9.959716033344164e-06,
"loss": 0.5409,
"step": 1682
},
{
"epoch": 0.1386668863804894,
"grad_norm": 6.251936957368742,
"learning_rate": 9.959631472837376e-06,
"loss": 0.5655,
"step": 1683
},
{
"epoch": 0.13874927906401913,
"grad_norm": 5.288184313366442,
"learning_rate": 9.959546824032404e-06,
"loss": 0.5368,
"step": 1684
},
{
"epoch": 0.13883167174754882,
"grad_norm": 6.679955478940586,
"learning_rate": 9.959462086930757e-06,
"loss": 0.6028,
"step": 1685
},
{
"epoch": 0.1389140644310785,
"grad_norm": 5.000778668824838,
"learning_rate": 9.959377261533945e-06,
"loss": 0.5867,
"step": 1686
},
{
"epoch": 0.13899645711460823,
"grad_norm": 5.120889487517192,
"learning_rate": 9.959292347843476e-06,
"loss": 0.5128,
"step": 1687
},
{
"epoch": 0.13907884979813792,
"grad_norm": 6.017331185338668,
"learning_rate": 9.959207345860863e-06,
"loss": 0.8164,
"step": 1688
},
{
"epoch": 0.13916124248166764,
"grad_norm": 4.242760533926133,
"learning_rate": 9.959122255587617e-06,
"loss": 0.4745,
"step": 1689
},
{
"epoch": 0.13924363516519733,
"grad_norm": 6.532137559532413,
"learning_rate": 9.959037077025256e-06,
"loss": 0.6932,
"step": 1690
},
{
"epoch": 0.13932602784872702,
"grad_norm": 5.970476933345833,
"learning_rate": 9.958951810175294e-06,
"loss": 0.5707,
"step": 1691
},
{
"epoch": 0.13940842053225674,
"grad_norm": 4.783279541756054,
"learning_rate": 9.958866455039253e-06,
"loss": 0.4375,
"step": 1692
},
{
"epoch": 0.13949081321578644,
"grad_norm": 5.1671883616236025,
"learning_rate": 9.958781011618648e-06,
"loss": 0.5305,
"step": 1693
},
{
"epoch": 0.13957320589931613,
"grad_norm": 4.396863523163892,
"learning_rate": 9.958695479915002e-06,
"loss": 0.4693,
"step": 1694
},
{
"epoch": 0.13965559858284585,
"grad_norm": 6.13192250177938,
"learning_rate": 9.958609859929836e-06,
"loss": 0.7002,
"step": 1695
},
{
"epoch": 0.13973799126637554,
"grad_norm": 4.437325147612999,
"learning_rate": 9.958524151664677e-06,
"loss": 0.3917,
"step": 1696
},
{
"epoch": 0.13982038394990526,
"grad_norm": 5.070019626853925,
"learning_rate": 9.958438355121052e-06,
"loss": 0.4742,
"step": 1697
},
{
"epoch": 0.13990277663343495,
"grad_norm": 6.083396673742178,
"learning_rate": 9.958352470300485e-06,
"loss": 0.621,
"step": 1698
},
{
"epoch": 0.13998516931696464,
"grad_norm": 6.384051434280243,
"learning_rate": 9.958266497204506e-06,
"loss": 0.5026,
"step": 1699
},
{
"epoch": 0.14006756200049436,
"grad_norm": 4.010479307928655,
"learning_rate": 9.958180435834646e-06,
"loss": 0.4158,
"step": 1700
},
{
"epoch": 0.14014995468402405,
"grad_norm": 4.719623857149445,
"learning_rate": 9.958094286192437e-06,
"loss": 0.4985,
"step": 1701
},
{
"epoch": 0.14023234736755377,
"grad_norm": 5.426958394750245,
"learning_rate": 9.958008048279413e-06,
"loss": 0.5531,
"step": 1702
},
{
"epoch": 0.14031474005108346,
"grad_norm": 5.08856362765558,
"learning_rate": 9.95792172209711e-06,
"loss": 0.3943,
"step": 1703
},
{
"epoch": 0.14039713273461316,
"grad_norm": 5.638213422297256,
"learning_rate": 9.957835307647063e-06,
"loss": 0.6932,
"step": 1704
},
{
"epoch": 0.14047952541814288,
"grad_norm": 6.63185919475462,
"learning_rate": 9.957748804930813e-06,
"loss": 0.6148,
"step": 1705
},
{
"epoch": 0.14056191810167257,
"grad_norm": 7.754221383064333,
"learning_rate": 9.9576622139499e-06,
"loss": 0.7738,
"step": 1706
},
{
"epoch": 0.1406443107852023,
"grad_norm": 5.15021907426077,
"learning_rate": 9.957575534705861e-06,
"loss": 0.3595,
"step": 1707
},
{
"epoch": 0.14072670346873198,
"grad_norm": 7.00492373956156,
"learning_rate": 9.957488767200246e-06,
"loss": 0.6592,
"step": 1708
},
{
"epoch": 0.14080909615226167,
"grad_norm": 6.8725037736438885,
"learning_rate": 9.957401911434594e-06,
"loss": 0.6738,
"step": 1709
},
{
"epoch": 0.1408914888357914,
"grad_norm": 5.474394546552511,
"learning_rate": 9.957314967410455e-06,
"loss": 0.4472,
"step": 1710
},
{
"epoch": 0.14097388151932108,
"grad_norm": 6.395917661111817,
"learning_rate": 9.957227935129374e-06,
"loss": 0.781,
"step": 1711
},
{
"epoch": 0.14105627420285077,
"grad_norm": 5.013972013359134,
"learning_rate": 9.957140814592901e-06,
"loss": 0.5114,
"step": 1712
},
{
"epoch": 0.1411386668863805,
"grad_norm": 5.229649358585399,
"learning_rate": 9.95705360580259e-06,
"loss": 0.7029,
"step": 1713
},
{
"epoch": 0.14122105956991018,
"grad_norm": 4.092719529026001,
"learning_rate": 9.956966308759993e-06,
"loss": 0.3894,
"step": 1714
},
{
"epoch": 0.1413034522534399,
"grad_norm": 5.2989880902352136,
"learning_rate": 9.95687892346666e-06,
"loss": 0.6119,
"step": 1715
},
{
"epoch": 0.1413858449369696,
"grad_norm": 3.6082549788649962,
"learning_rate": 9.95679144992415e-06,
"loss": 0.2587,
"step": 1716
},
{
"epoch": 0.1414682376204993,
"grad_norm": 4.328630549185103,
"learning_rate": 9.95670388813402e-06,
"loss": 0.4679,
"step": 1717
},
{
"epoch": 0.141550630304029,
"grad_norm": 6.666267680234917,
"learning_rate": 9.95661623809783e-06,
"loss": 0.6764,
"step": 1718
},
{
"epoch": 0.1416330229875587,
"grad_norm": 6.030588485073613,
"learning_rate": 9.956528499817137e-06,
"loss": 0.5958,
"step": 1719
},
{
"epoch": 0.14171541567108842,
"grad_norm": 3.697734271168265,
"learning_rate": 9.956440673293508e-06,
"loss": 0.3724,
"step": 1720
},
{
"epoch": 0.1417978083546181,
"grad_norm": 5.078205684000341,
"learning_rate": 9.956352758528501e-06,
"loss": 0.4152,
"step": 1721
},
{
"epoch": 0.1418802010381478,
"grad_norm": 5.998477976751115,
"learning_rate": 9.956264755523687e-06,
"loss": 0.4393,
"step": 1722
},
{
"epoch": 0.14196259372167752,
"grad_norm": 4.1617349318422105,
"learning_rate": 9.956176664280628e-06,
"loss": 0.5035,
"step": 1723
},
{
"epoch": 0.1420449864052072,
"grad_norm": 5.193159222144419,
"learning_rate": 9.956088484800895e-06,
"loss": 0.5345,
"step": 1724
},
{
"epoch": 0.14212737908873693,
"grad_norm": 4.085571835755461,
"learning_rate": 9.956000217086055e-06,
"loss": 0.5145,
"step": 1725
},
{
"epoch": 0.14220977177226662,
"grad_norm": 8.724102472644716,
"learning_rate": 9.955911861137683e-06,
"loss": 0.7727,
"step": 1726
},
{
"epoch": 0.14229216445579632,
"grad_norm": 5.455750786296661,
"learning_rate": 9.95582341695735e-06,
"loss": 0.519,
"step": 1727
},
{
"epoch": 0.14237455713932604,
"grad_norm": 7.3612630042618,
"learning_rate": 9.955734884546632e-06,
"loss": 0.9181,
"step": 1728
},
{
"epoch": 0.14245694982285573,
"grad_norm": 6.860122301504194,
"learning_rate": 9.955646263907103e-06,
"loss": 0.7269,
"step": 1729
},
{
"epoch": 0.14253934250638542,
"grad_norm": 5.246035437947141,
"learning_rate": 9.955557555040344e-06,
"loss": 0.6018,
"step": 1730
},
{
"epoch": 0.14262173518991514,
"grad_norm": 4.620747795169099,
"learning_rate": 9.95546875794793e-06,
"loss": 0.4273,
"step": 1731
},
{
"epoch": 0.14270412787344483,
"grad_norm": 4.879100737888411,
"learning_rate": 9.955379872631447e-06,
"loss": 0.4053,
"step": 1732
},
{
"epoch": 0.14278652055697455,
"grad_norm": 5.1781106148154965,
"learning_rate": 9.955290899092473e-06,
"loss": 0.5273,
"step": 1733
},
{
"epoch": 0.14286891324050424,
"grad_norm": 5.28098812734887,
"learning_rate": 9.955201837332592e-06,
"loss": 0.4492,
"step": 1734
},
{
"epoch": 0.14295130592403393,
"grad_norm": 4.588098614679162,
"learning_rate": 9.955112687353395e-06,
"loss": 0.6444,
"step": 1735
},
{
"epoch": 0.14303369860756365,
"grad_norm": 6.807890987436337,
"learning_rate": 9.955023449156464e-06,
"loss": 0.8301,
"step": 1736
},
{
"epoch": 0.14311609129109334,
"grad_norm": 6.19317980765747,
"learning_rate": 9.95493412274339e-06,
"loss": 0.644,
"step": 1737
},
{
"epoch": 0.14319848397462306,
"grad_norm": 4.804293638065614,
"learning_rate": 9.954844708115761e-06,
"loss": 0.3949,
"step": 1738
},
{
"epoch": 0.14328087665815276,
"grad_norm": 5.446810889833297,
"learning_rate": 9.95475520527517e-06,
"loss": 0.4823,
"step": 1739
},
{
"epoch": 0.14336326934168245,
"grad_norm": 5.587157787333849,
"learning_rate": 9.954665614223212e-06,
"loss": 0.3342,
"step": 1740
},
{
"epoch": 0.14344566202521217,
"grad_norm": 6.671984779337379,
"learning_rate": 9.954575934961482e-06,
"loss": 0.4438,
"step": 1741
},
{
"epoch": 0.14352805470874186,
"grad_norm": 6.720067281421165,
"learning_rate": 9.954486167491574e-06,
"loss": 0.6546,
"step": 1742
},
{
"epoch": 0.14361044739227158,
"grad_norm": 4.310942822266015,
"learning_rate": 9.954396311815088e-06,
"loss": 0.3683,
"step": 1743
},
{
"epoch": 0.14369284007580127,
"grad_norm": 6.662276529673106,
"learning_rate": 9.954306367933623e-06,
"loss": 0.7405,
"step": 1744
},
{
"epoch": 0.14377523275933096,
"grad_norm": 5.668799637679965,
"learning_rate": 9.954216335848781e-06,
"loss": 0.5108,
"step": 1745
},
{
"epoch": 0.14385762544286068,
"grad_norm": 11.70486674091479,
"learning_rate": 9.954126215562165e-06,
"loss": 0.5593,
"step": 1746
},
{
"epoch": 0.14394001812639037,
"grad_norm": 4.7414430921442285,
"learning_rate": 9.954036007075378e-06,
"loss": 0.6503,
"step": 1747
},
{
"epoch": 0.1440224108099201,
"grad_norm": 5.383455910246553,
"learning_rate": 9.953945710390029e-06,
"loss": 0.534,
"step": 1748
},
{
"epoch": 0.14410480349344978,
"grad_norm": 5.207132564825904,
"learning_rate": 9.953855325507723e-06,
"loss": 0.6014,
"step": 1749
},
{
"epoch": 0.14418719617697948,
"grad_norm": 4.522652655624019,
"learning_rate": 9.95376485243007e-06,
"loss": 0.561,
"step": 1750
},
{
"epoch": 0.1442695888605092,
"grad_norm": 3.575322962748732,
"learning_rate": 9.95367429115868e-06,
"loss": 0.3608,
"step": 1751
},
{
"epoch": 0.1443519815440389,
"grad_norm": 3.799989212310609,
"learning_rate": 9.953583641695163e-06,
"loss": 0.5892,
"step": 1752
},
{
"epoch": 0.14443437422756858,
"grad_norm": 4.154410303606598,
"learning_rate": 9.95349290404114e-06,
"loss": 0.6148,
"step": 1753
},
{
"epoch": 0.1445167669110983,
"grad_norm": 4.526476783100888,
"learning_rate": 9.95340207819822e-06,
"loss": 0.5512,
"step": 1754
},
{
"epoch": 0.144599159594628,
"grad_norm": 5.470982965204849,
"learning_rate": 9.953311164168023e-06,
"loss": 0.6535,
"step": 1755
},
{
"epoch": 0.1446815522781577,
"grad_norm": 6.153122788881867,
"learning_rate": 9.953220161952165e-06,
"loss": 0.5768,
"step": 1756
},
{
"epoch": 0.1447639449616874,
"grad_norm": 3.888887680481176,
"learning_rate": 9.95312907155227e-06,
"loss": 0.3174,
"step": 1757
},
{
"epoch": 0.1448463376452171,
"grad_norm": 5.669629872152485,
"learning_rate": 9.953037892969957e-06,
"loss": 0.6727,
"step": 1758
},
{
"epoch": 0.1449287303287468,
"grad_norm": 35.771339221273166,
"learning_rate": 9.952946626206848e-06,
"loss": 1.7314,
"step": 1759
},
{
"epoch": 0.1450111230122765,
"grad_norm": 4.3289004927992725,
"learning_rate": 9.952855271264573e-06,
"loss": 0.5573,
"step": 1760
},
{
"epoch": 0.14509351569580622,
"grad_norm": 3.799289594497197,
"learning_rate": 9.952763828144752e-06,
"loss": 0.3963,
"step": 1761
},
{
"epoch": 0.14517590837933592,
"grad_norm": 5.288650375462546,
"learning_rate": 9.952672296849017e-06,
"loss": 0.475,
"step": 1762
},
{
"epoch": 0.1452583010628656,
"grad_norm": 3.9972186574876734,
"learning_rate": 9.952580677378998e-06,
"loss": 0.5127,
"step": 1763
},
{
"epoch": 0.14534069374639533,
"grad_norm": 4.3332751493193475,
"learning_rate": 9.952488969736324e-06,
"loss": 0.5247,
"step": 1764
},
{
"epoch": 0.14542308642992502,
"grad_norm": 4.195892689128413,
"learning_rate": 9.952397173922629e-06,
"loss": 0.3199,
"step": 1765
},
{
"epoch": 0.14550547911345474,
"grad_norm": 5.722985806247223,
"learning_rate": 9.952305289939545e-06,
"loss": 0.6056,
"step": 1766
},
{
"epoch": 0.14558787179698443,
"grad_norm": 6.368312651719165,
"learning_rate": 9.952213317788713e-06,
"loss": 0.5713,
"step": 1767
},
{
"epoch": 0.14567026448051412,
"grad_norm": 4.5910410814721585,
"learning_rate": 9.952121257471765e-06,
"loss": 0.5231,
"step": 1768
},
{
"epoch": 0.14575265716404384,
"grad_norm": 4.149070439095053,
"learning_rate": 9.952029108990341e-06,
"loss": 0.4691,
"step": 1769
},
{
"epoch": 0.14583504984757353,
"grad_norm": 6.698020175906564,
"learning_rate": 9.951936872346084e-06,
"loss": 0.6816,
"step": 1770
},
{
"epoch": 0.14591744253110323,
"grad_norm": 4.448335960350891,
"learning_rate": 9.951844547540634e-06,
"loss": 0.5778,
"step": 1771
},
{
"epoch": 0.14599983521463294,
"grad_norm": 5.697992048620652,
"learning_rate": 9.951752134575636e-06,
"loss": 0.5035,
"step": 1772
},
{
"epoch": 0.14608222789816264,
"grad_norm": 6.626396310224672,
"learning_rate": 9.951659633452735e-06,
"loss": 0.6606,
"step": 1773
},
{
"epoch": 0.14616462058169236,
"grad_norm": 4.046239182291733,
"learning_rate": 9.951567044173577e-06,
"loss": 0.3849,
"step": 1774
},
{
"epoch": 0.14624701326522205,
"grad_norm": 3.688760782701429,
"learning_rate": 9.951474366739811e-06,
"loss": 0.4589,
"step": 1775
},
{
"epoch": 0.14632940594875174,
"grad_norm": 4.790126428067029,
"learning_rate": 9.951381601153087e-06,
"loss": 0.6115,
"step": 1776
},
{
"epoch": 0.14641179863228146,
"grad_norm": 6.205649281121326,
"learning_rate": 9.951288747415055e-06,
"loss": 0.5969,
"step": 1777
},
{
"epoch": 0.14649419131581115,
"grad_norm": 3.3187533632736397,
"learning_rate": 9.95119580552737e-06,
"loss": 0.5073,
"step": 1778
},
{
"epoch": 0.14657658399934087,
"grad_norm": 3.258398469094614,
"learning_rate": 9.95110277549169e-06,
"loss": 0.2559,
"step": 1779
},
{
"epoch": 0.14665897668287056,
"grad_norm": 4.3933466019017375,
"learning_rate": 9.951009657309664e-06,
"loss": 0.4237,
"step": 1780
},
{
"epoch": 0.14674136936640025,
"grad_norm": 5.039755273960905,
"learning_rate": 9.950916450982954e-06,
"loss": 0.5285,
"step": 1781
},
{
"epoch": 0.14682376204992997,
"grad_norm": 5.052820158208282,
"learning_rate": 9.95082315651322e-06,
"loss": 0.5216,
"step": 1782
},
{
"epoch": 0.14690615473345967,
"grad_norm": 7.221433965049104,
"learning_rate": 9.950729773902119e-06,
"loss": 0.7875,
"step": 1783
},
{
"epoch": 0.14698854741698938,
"grad_norm": 4.022936489503606,
"learning_rate": 9.950636303151318e-06,
"loss": 0.3333,
"step": 1784
},
{
"epoch": 0.14707094010051908,
"grad_norm": 4.202488842137441,
"learning_rate": 9.950542744262478e-06,
"loss": 0.3781,
"step": 1785
},
{
"epoch": 0.14715333278404877,
"grad_norm": 7.63848412977035,
"learning_rate": 9.950449097237268e-06,
"loss": 0.6791,
"step": 1786
},
{
"epoch": 0.1472357254675785,
"grad_norm": 4.718688788366161,
"learning_rate": 9.950355362077351e-06,
"loss": 0.5764,
"step": 1787
},
{
"epoch": 0.14731811815110818,
"grad_norm": 7.447234767381768,
"learning_rate": 9.950261538784399e-06,
"loss": 0.4469,
"step": 1788
},
{
"epoch": 0.14740051083463787,
"grad_norm": 5.095202612388618,
"learning_rate": 9.950167627360078e-06,
"loss": 0.6372,
"step": 1789
},
{
"epoch": 0.1474829035181676,
"grad_norm": 7.204391060200845,
"learning_rate": 9.950073627806068e-06,
"loss": 0.4491,
"step": 1790
},
{
"epoch": 0.14756529620169728,
"grad_norm": 5.269735640638179,
"learning_rate": 9.949979540124036e-06,
"loss": 0.7009,
"step": 1791
},
{
"epoch": 0.147647688885227,
"grad_norm": 5.921433725601925,
"learning_rate": 9.949885364315659e-06,
"loss": 0.3743,
"step": 1792
},
{
"epoch": 0.1477300815687567,
"grad_norm": 4.473723831399909,
"learning_rate": 9.949791100382613e-06,
"loss": 0.5765,
"step": 1793
},
{
"epoch": 0.14781247425228639,
"grad_norm": 4.15202235170927,
"learning_rate": 9.949696748326576e-06,
"loss": 0.4384,
"step": 1794
},
{
"epoch": 0.1478948669358161,
"grad_norm": 5.1563131436767335,
"learning_rate": 9.94960230814923e-06,
"loss": 0.4914,
"step": 1795
},
{
"epoch": 0.1479772596193458,
"grad_norm": 7.2266024173875,
"learning_rate": 9.949507779852255e-06,
"loss": 0.8423,
"step": 1796
},
{
"epoch": 0.14805965230287552,
"grad_norm": 3.6509897775167817,
"learning_rate": 9.949413163437334e-06,
"loss": 0.2087,
"step": 1797
},
{
"epoch": 0.1481420449864052,
"grad_norm": 4.574456787530481,
"learning_rate": 9.94931845890615e-06,
"loss": 0.4738,
"step": 1798
},
{
"epoch": 0.1482244376699349,
"grad_norm": 5.648889619706296,
"learning_rate": 9.949223666260391e-06,
"loss": 0.5997,
"step": 1799
},
{
"epoch": 0.14830683035346462,
"grad_norm": 5.202785968386703,
"learning_rate": 9.949128785501744e-06,
"loss": 0.3894,
"step": 1800
},
{
"epoch": 0.1483892230369943,
"grad_norm": 10.997911457737233,
"learning_rate": 9.949033816631897e-06,
"loss": 0.6417,
"step": 1801
},
{
"epoch": 0.14847161572052403,
"grad_norm": 4.6037373374277015,
"learning_rate": 9.948938759652545e-06,
"loss": 0.4054,
"step": 1802
},
{
"epoch": 0.14855400840405372,
"grad_norm": 6.562807171790634,
"learning_rate": 9.948843614565373e-06,
"loss": 0.5643,
"step": 1803
},
{
"epoch": 0.14863640108758341,
"grad_norm": 4.45858174996238,
"learning_rate": 9.948748381372081e-06,
"loss": 0.4779,
"step": 1804
},
{
"epoch": 0.14871879377111313,
"grad_norm": 5.1171416882597915,
"learning_rate": 9.948653060074365e-06,
"loss": 0.6325,
"step": 1805
},
{
"epoch": 0.14880118645464283,
"grad_norm": 5.1648545519748765,
"learning_rate": 9.948557650673917e-06,
"loss": 0.6289,
"step": 1806
},
{
"epoch": 0.14888357913817252,
"grad_norm": 3.3249976860584125,
"learning_rate": 9.94846215317244e-06,
"loss": 0.5099,
"step": 1807
},
{
"epoch": 0.14896597182170224,
"grad_norm": 4.210265252795672,
"learning_rate": 9.94836656757163e-06,
"loss": 0.3661,
"step": 1808
},
{
"epoch": 0.14904836450523193,
"grad_norm": 5.856159591100659,
"learning_rate": 9.948270893873194e-06,
"loss": 0.6683,
"step": 1809
},
{
"epoch": 0.14913075718876165,
"grad_norm": 5.665378850590255,
"learning_rate": 9.94817513207883e-06,
"loss": 0.4274,
"step": 1810
},
{
"epoch": 0.14921314987229134,
"grad_norm": 3.2139105645087644,
"learning_rate": 9.948079282190246e-06,
"loss": 0.3374,
"step": 1811
},
{
"epoch": 0.14929554255582103,
"grad_norm": 4.41472261330769,
"learning_rate": 9.947983344209149e-06,
"loss": 0.5235,
"step": 1812
},
{
"epoch": 0.14937793523935075,
"grad_norm": 5.0108760558021235,
"learning_rate": 9.947887318137246e-06,
"loss": 0.4894,
"step": 1813
},
{
"epoch": 0.14946032792288044,
"grad_norm": 4.076125168503218,
"learning_rate": 9.947791203976246e-06,
"loss": 0.486,
"step": 1814
},
{
"epoch": 0.14954272060641016,
"grad_norm": 4.9262153367813175,
"learning_rate": 9.94769500172786e-06,
"loss": 0.4339,
"step": 1815
},
{
"epoch": 0.14962511328993985,
"grad_norm": 4.823936819797354,
"learning_rate": 9.947598711393803e-06,
"loss": 0.5129,
"step": 1816
},
{
"epoch": 0.14970750597346955,
"grad_norm": 7.422686319765741,
"learning_rate": 9.947502332975785e-06,
"loss": 0.724,
"step": 1817
},
{
"epoch": 0.14978989865699927,
"grad_norm": 5.069308352660441,
"learning_rate": 9.947405866475526e-06,
"loss": 0.3606,
"step": 1818
},
{
"epoch": 0.14987229134052896,
"grad_norm": 4.817079842052196,
"learning_rate": 9.947309311894741e-06,
"loss": 0.6129,
"step": 1819
},
{
"epoch": 0.14995468402405868,
"grad_norm": 4.409288579092636,
"learning_rate": 9.947212669235151e-06,
"loss": 0.3029,
"step": 1820
},
{
"epoch": 0.15003707670758837,
"grad_norm": 4.088287504533524,
"learning_rate": 9.947115938498475e-06,
"loss": 0.3747,
"step": 1821
},
{
"epoch": 0.15011946939111806,
"grad_norm": 4.526831599614096,
"learning_rate": 9.947019119686437e-06,
"loss": 0.5938,
"step": 1822
},
{
"epoch": 0.15020186207464778,
"grad_norm": 5.180661766219478,
"learning_rate": 9.946922212800758e-06,
"loss": 0.5274,
"step": 1823
},
{
"epoch": 0.15028425475817747,
"grad_norm": 4.075075934628573,
"learning_rate": 9.946825217843165e-06,
"loss": 0.5151,
"step": 1824
},
{
"epoch": 0.15036664744170716,
"grad_norm": 4.371753027926906,
"learning_rate": 9.946728134815384e-06,
"loss": 0.3841,
"step": 1825
},
{
"epoch": 0.15044904012523688,
"grad_norm": 7.004245095318552,
"learning_rate": 9.946630963719143e-06,
"loss": 0.7213,
"step": 1826
},
{
"epoch": 0.15053143280876657,
"grad_norm": 4.695580060254283,
"learning_rate": 9.946533704556174e-06,
"loss": 0.4254,
"step": 1827
},
{
"epoch": 0.1506138254922963,
"grad_norm": 6.379558882024517,
"learning_rate": 9.946436357328208e-06,
"loss": 0.4716,
"step": 1828
},
{
"epoch": 0.15069621817582599,
"grad_norm": 30.181991180317187,
"learning_rate": 9.946338922036977e-06,
"loss": 1.3403,
"step": 1829
},
{
"epoch": 0.15077861085935568,
"grad_norm": 4.760372710692016,
"learning_rate": 9.946241398684216e-06,
"loss": 0.6688,
"step": 1830
},
{
"epoch": 0.1508610035428854,
"grad_norm": 29.77514513346913,
"learning_rate": 9.94614378727166e-06,
"loss": 1.0769,
"step": 1831
},
{
"epoch": 0.1509433962264151,
"grad_norm": 5.446400107754182,
"learning_rate": 9.946046087801052e-06,
"loss": 0.4928,
"step": 1832
},
{
"epoch": 0.1510257889099448,
"grad_norm": 4.18231130411727,
"learning_rate": 9.945948300274124e-06,
"loss": 0.5164,
"step": 1833
},
{
"epoch": 0.1511081815934745,
"grad_norm": 9.784576516645163,
"learning_rate": 9.945850424692622e-06,
"loss": 0.7336,
"step": 1834
},
{
"epoch": 0.1511905742770042,
"grad_norm": 4.280452136926324,
"learning_rate": 9.945752461058286e-06,
"loss": 0.5356,
"step": 1835
},
{
"epoch": 0.1512729669605339,
"grad_norm": 4.514588928723341,
"learning_rate": 9.945654409372861e-06,
"loss": 0.3138,
"step": 1836
},
{
"epoch": 0.1513553596440636,
"grad_norm": 4.977510975280522,
"learning_rate": 9.945556269638095e-06,
"loss": 0.6125,
"step": 1837
},
{
"epoch": 0.15143775232759332,
"grad_norm": 3.7210768337208595,
"learning_rate": 9.945458041855732e-06,
"loss": 0.6259,
"step": 1838
},
{
"epoch": 0.15152014501112301,
"grad_norm": 3.836298613785647,
"learning_rate": 9.94535972602752e-06,
"loss": 0.4432,
"step": 1839
},
{
"epoch": 0.1516025376946527,
"grad_norm": 8.066856187714102,
"learning_rate": 9.945261322155213e-06,
"loss": 0.8116,
"step": 1840
},
{
"epoch": 0.15168493037818243,
"grad_norm": 4.468142883297208,
"learning_rate": 9.94516283024056e-06,
"loss": 0.4872,
"step": 1841
},
{
"epoch": 0.15176732306171212,
"grad_norm": 5.892106263712281,
"learning_rate": 9.945064250285318e-06,
"loss": 0.7393,
"step": 1842
},
{
"epoch": 0.1518497157452418,
"grad_norm": 5.748773653446861,
"learning_rate": 9.944965582291236e-06,
"loss": 0.491,
"step": 1843
},
{
"epoch": 0.15193210842877153,
"grad_norm": 6.267061350789159,
"learning_rate": 9.944866826260076e-06,
"loss": 0.5588,
"step": 1844
},
{
"epoch": 0.15201450111230122,
"grad_norm": 5.56231366013553,
"learning_rate": 9.944767982193595e-06,
"loss": 0.392,
"step": 1845
},
{
"epoch": 0.15209689379583094,
"grad_norm": 5.383200889814545,
"learning_rate": 9.944669050093552e-06,
"loss": 0.6692,
"step": 1846
},
{
"epoch": 0.15217928647936063,
"grad_norm": 6.870277257184773,
"learning_rate": 9.944570029961706e-06,
"loss": 0.7725,
"step": 1847
},
{
"epoch": 0.15226167916289032,
"grad_norm": 4.559517814413792,
"learning_rate": 9.944470921799825e-06,
"loss": 0.6481,
"step": 1848
},
{
"epoch": 0.15234407184642004,
"grad_norm": 4.74898634616392,
"learning_rate": 9.944371725609671e-06,
"loss": 0.5648,
"step": 1849
},
{
"epoch": 0.15242646452994973,
"grad_norm": 4.860513869955498,
"learning_rate": 9.944272441393008e-06,
"loss": 0.6931,
"step": 1850
},
{
"epoch": 0.15250885721347945,
"grad_norm": 4.3722520697227205,
"learning_rate": 9.944173069151609e-06,
"loss": 0.3393,
"step": 1851
},
{
"epoch": 0.15259124989700915,
"grad_norm": 5.0443736184423065,
"learning_rate": 9.944073608887235e-06,
"loss": 0.5772,
"step": 1852
},
{
"epoch": 0.15267364258053884,
"grad_norm": 8.549939972371906,
"learning_rate": 9.943974060601664e-06,
"loss": 0.5043,
"step": 1853
},
{
"epoch": 0.15275603526406856,
"grad_norm": 4.757709984298188,
"learning_rate": 9.943874424296666e-06,
"loss": 0.6423,
"step": 1854
},
{
"epoch": 0.15283842794759825,
"grad_norm": 5.4002386456791305,
"learning_rate": 9.943774699974014e-06,
"loss": 0.5686,
"step": 1855
},
{
"epoch": 0.15292082063112797,
"grad_norm": 4.215127806823176,
"learning_rate": 9.943674887635483e-06,
"loss": 0.5367,
"step": 1856
},
{
"epoch": 0.15300321331465766,
"grad_norm": 4.274941922810144,
"learning_rate": 9.943574987282853e-06,
"loss": 0.5136,
"step": 1857
},
{
"epoch": 0.15308560599818735,
"grad_norm": 5.007470901292181,
"learning_rate": 9.943474998917899e-06,
"loss": 0.4348,
"step": 1858
},
{
"epoch": 0.15316799868171707,
"grad_norm": 4.406761750552184,
"learning_rate": 9.943374922542403e-06,
"loss": 0.5084,
"step": 1859
},
{
"epoch": 0.15325039136524676,
"grad_norm": 5.414163225734359,
"learning_rate": 9.943274758158146e-06,
"loss": 0.545,
"step": 1860
},
{
"epoch": 0.15333278404877645,
"grad_norm": 4.477228911899572,
"learning_rate": 9.943174505766912e-06,
"loss": 0.5127,
"step": 1861
},
{
"epoch": 0.15341517673230617,
"grad_norm": 5.4286296996844134,
"learning_rate": 9.943074165370486e-06,
"loss": 0.5424,
"step": 1862
},
{
"epoch": 0.15349756941583587,
"grad_norm": 7.8227006152958625,
"learning_rate": 9.94297373697065e-06,
"loss": 0.6181,
"step": 1863
},
{
"epoch": 0.15357996209936559,
"grad_norm": 4.58280474696728,
"learning_rate": 9.942873220569201e-06,
"loss": 0.5705,
"step": 1864
},
{
"epoch": 0.15366235478289528,
"grad_norm": 5.000809086425357,
"learning_rate": 9.942772616167921e-06,
"loss": 0.5671,
"step": 1865
},
{
"epoch": 0.15374474746642497,
"grad_norm": 4.052269078249802,
"learning_rate": 9.942671923768604e-06,
"loss": 0.5368,
"step": 1866
},
{
"epoch": 0.1538271401499547,
"grad_norm": 4.354386031164391,
"learning_rate": 9.942571143373041e-06,
"loss": 0.4506,
"step": 1867
},
{
"epoch": 0.15390953283348438,
"grad_norm": 5.323874342321902,
"learning_rate": 9.942470274983029e-06,
"loss": 0.5499,
"step": 1868
},
{
"epoch": 0.1539919255170141,
"grad_norm": 4.166290067353925,
"learning_rate": 9.94236931860036e-06,
"loss": 0.57,
"step": 1869
},
{
"epoch": 0.1540743182005438,
"grad_norm": 4.663093189419887,
"learning_rate": 9.942268274226836e-06,
"loss": 0.6149,
"step": 1870
},
{
"epoch": 0.15415671088407348,
"grad_norm": 5.011905916587221,
"learning_rate": 9.942167141864252e-06,
"loss": 0.5146,
"step": 1871
},
{
"epoch": 0.1542391035676032,
"grad_norm": 3.684085210483375,
"learning_rate": 9.94206592151441e-06,
"loss": 0.471,
"step": 1872
},
{
"epoch": 0.1543214962511329,
"grad_norm": 4.529894364792616,
"learning_rate": 9.941964613179113e-06,
"loss": 0.6402,
"step": 1873
},
{
"epoch": 0.15440388893466261,
"grad_norm": 5.379119541477112,
"learning_rate": 9.941863216860161e-06,
"loss": 0.4688,
"step": 1874
},
{
"epoch": 0.1544862816181923,
"grad_norm": 4.711252577285658,
"learning_rate": 9.941761732559365e-06,
"loss": 0.4731,
"step": 1875
},
{
"epoch": 0.154568674301722,
"grad_norm": 4.627102248023416,
"learning_rate": 9.941660160278526e-06,
"loss": 0.5882,
"step": 1876
},
{
"epoch": 0.15465106698525172,
"grad_norm": 4.784269516867435,
"learning_rate": 9.941558500019458e-06,
"loss": 0.5784,
"step": 1877
},
{
"epoch": 0.1547334596687814,
"grad_norm": 3.964863589286543,
"learning_rate": 9.941456751783965e-06,
"loss": 0.4781,
"step": 1878
},
{
"epoch": 0.15481585235231113,
"grad_norm": 3.7942935218449954,
"learning_rate": 9.941354915573863e-06,
"loss": 0.3864,
"step": 1879
},
{
"epoch": 0.15489824503584082,
"grad_norm": 3.7341181608097385,
"learning_rate": 9.941252991390961e-06,
"loss": 0.4249,
"step": 1880
},
{
"epoch": 0.1549806377193705,
"grad_norm": 5.551866143123865,
"learning_rate": 9.941150979237078e-06,
"loss": 0.5836,
"step": 1881
},
{
"epoch": 0.15506303040290023,
"grad_norm": 9.280745802972426,
"learning_rate": 9.941048879114025e-06,
"loss": 0.7968,
"step": 1882
},
{
"epoch": 0.15514542308642992,
"grad_norm": 4.957929728130742,
"learning_rate": 9.940946691023625e-06,
"loss": 0.4156,
"step": 1883
},
{
"epoch": 0.15522781576995962,
"grad_norm": 5.612906720964241,
"learning_rate": 9.940844414967697e-06,
"loss": 0.5885,
"step": 1884
},
{
"epoch": 0.15531020845348933,
"grad_norm": 4.208810615796374,
"learning_rate": 9.940742050948057e-06,
"loss": 0.3961,
"step": 1885
},
{
"epoch": 0.15539260113701903,
"grad_norm": 23.92972768953157,
"learning_rate": 9.94063959896653e-06,
"loss": 0.6231,
"step": 1886
},
{
"epoch": 0.15547499382054875,
"grad_norm": 5.76934083591682,
"learning_rate": 9.940537059024942e-06,
"loss": 0.4364,
"step": 1887
},
{
"epoch": 0.15555738650407844,
"grad_norm": 5.707451138870683,
"learning_rate": 9.940434431125117e-06,
"loss": 0.5047,
"step": 1888
},
{
"epoch": 0.15563977918760813,
"grad_norm": 5.776218724756825,
"learning_rate": 9.940331715268883e-06,
"loss": 0.5968,
"step": 1889
},
{
"epoch": 0.15572217187113785,
"grad_norm": 4.954848575236138,
"learning_rate": 9.940228911458065e-06,
"loss": 0.5645,
"step": 1890
},
{
"epoch": 0.15580456455466754,
"grad_norm": 4.99108420622021,
"learning_rate": 9.940126019694498e-06,
"loss": 0.6215,
"step": 1891
},
{
"epoch": 0.15588695723819726,
"grad_norm": 4.922687738818404,
"learning_rate": 9.940023039980012e-06,
"loss": 0.4969,
"step": 1892
},
{
"epoch": 0.15596934992172695,
"grad_norm": 6.857403736754451,
"learning_rate": 9.939919972316437e-06,
"loss": 0.7445,
"step": 1893
},
{
"epoch": 0.15605174260525664,
"grad_norm": 5.0582162606668115,
"learning_rate": 9.939816816705615e-06,
"loss": 0.4787,
"step": 1894
},
{
"epoch": 0.15613413528878636,
"grad_norm": 4.756494625315935,
"learning_rate": 9.939713573149377e-06,
"loss": 0.6097,
"step": 1895
},
{
"epoch": 0.15621652797231605,
"grad_norm": 6.419763789825375,
"learning_rate": 9.939610241649561e-06,
"loss": 0.4783,
"step": 1896
},
{
"epoch": 0.15629892065584577,
"grad_norm": 4.5978040753369935,
"learning_rate": 9.93950682220801e-06,
"loss": 0.4438,
"step": 1897
},
{
"epoch": 0.15638131333937547,
"grad_norm": 4.462028232430049,
"learning_rate": 9.939403314826563e-06,
"loss": 0.3942,
"step": 1898
},
{
"epoch": 0.15646370602290516,
"grad_norm": 5.337997568156949,
"learning_rate": 9.939299719507065e-06,
"loss": 0.5616,
"step": 1899
},
{
"epoch": 0.15654609870643488,
"grad_norm": 3.60824355878785,
"learning_rate": 9.939196036251357e-06,
"loss": 0.4209,
"step": 1900
},
{
"epoch": 0.15662849138996457,
"grad_norm": 3.9143893571730146,
"learning_rate": 9.939092265061288e-06,
"loss": 0.3278,
"step": 1901
},
{
"epoch": 0.15671088407349426,
"grad_norm": 4.439793896511223,
"learning_rate": 9.938988405938703e-06,
"loss": 0.5322,
"step": 1902
},
{
"epoch": 0.15679327675702398,
"grad_norm": 4.36845403354661,
"learning_rate": 9.938884458885454e-06,
"loss": 0.5716,
"step": 1903
},
{
"epoch": 0.15687566944055367,
"grad_norm": 6.284830972083645,
"learning_rate": 9.938780423903387e-06,
"loss": 0.6454,
"step": 1904
},
{
"epoch": 0.1569580621240834,
"grad_norm": 6.495990019493336,
"learning_rate": 9.938676300994358e-06,
"loss": 0.5363,
"step": 1905
},
{
"epoch": 0.15704045480761308,
"grad_norm": 5.215129230889581,
"learning_rate": 9.938572090160222e-06,
"loss": 0.3534,
"step": 1906
},
{
"epoch": 0.15712284749114278,
"grad_norm": 5.0984361516420735,
"learning_rate": 9.938467791402828e-06,
"loss": 0.3469,
"step": 1907
},
{
"epoch": 0.1572052401746725,
"grad_norm": 35.6236332120135,
"learning_rate": 9.938363404724038e-06,
"loss": 1.4843,
"step": 1908
},
{
"epoch": 0.1572876328582022,
"grad_norm": 7.458138808494354,
"learning_rate": 9.93825893012571e-06,
"loss": 0.6091,
"step": 1909
},
{
"epoch": 0.1573700255417319,
"grad_norm": 6.398287378682458,
"learning_rate": 9.938154367609705e-06,
"loss": 0.5134,
"step": 1910
},
{
"epoch": 0.1574524182252616,
"grad_norm": 3.5343424335631433,
"learning_rate": 9.93804971717788e-06,
"loss": 0.286,
"step": 1911
},
{
"epoch": 0.1575348109087913,
"grad_norm": 4.504459000128934,
"learning_rate": 9.937944978832103e-06,
"loss": 0.4624,
"step": 1912
},
{
"epoch": 0.157617203592321,
"grad_norm": 7.108577599603217,
"learning_rate": 9.937840152574235e-06,
"loss": 0.7483,
"step": 1913
},
{
"epoch": 0.1576995962758507,
"grad_norm": 3.6174046857082898,
"learning_rate": 9.937735238406146e-06,
"loss": 0.4525,
"step": 1914
},
{
"epoch": 0.15778198895938042,
"grad_norm": 6.389978345222535,
"learning_rate": 9.9376302363297e-06,
"loss": 0.7896,
"step": 1915
},
{
"epoch": 0.1578643816429101,
"grad_norm": 6.548196848276821,
"learning_rate": 9.937525146346767e-06,
"loss": 0.6367,
"step": 1916
},
{
"epoch": 0.1579467743264398,
"grad_norm": 3.655411742497471,
"learning_rate": 9.937419968459221e-06,
"loss": 0.5116,
"step": 1917
},
{
"epoch": 0.15802916700996952,
"grad_norm": 4.929959679569643,
"learning_rate": 9.937314702668933e-06,
"loss": 0.4611,
"step": 1918
},
{
"epoch": 0.15811155969349922,
"grad_norm": 6.432318019871717,
"learning_rate": 9.937209348977776e-06,
"loss": 0.5361,
"step": 1919
},
{
"epoch": 0.1581939523770289,
"grad_norm": 4.031201714847671,
"learning_rate": 9.937103907387626e-06,
"loss": 0.3384,
"step": 1920
},
{
"epoch": 0.15827634506055863,
"grad_norm": 3.9206730902636253,
"learning_rate": 9.936998377900362e-06,
"loss": 0.5143,
"step": 1921
},
{
"epoch": 0.15835873774408832,
"grad_norm": 5.579814626803759,
"learning_rate": 9.93689276051786e-06,
"loss": 0.6045,
"step": 1922
},
{
"epoch": 0.15844113042761804,
"grad_norm": 3.917683732272094,
"learning_rate": 9.936787055242002e-06,
"loss": 0.2376,
"step": 1923
},
{
"epoch": 0.15852352311114773,
"grad_norm": 4.78423472401811,
"learning_rate": 9.93668126207467e-06,
"loss": 0.4524,
"step": 1924
},
{
"epoch": 0.15860591579467742,
"grad_norm": 3.2708331638186716,
"learning_rate": 9.936575381017746e-06,
"loss": 0.3629,
"step": 1925
},
{
"epoch": 0.15868830847820714,
"grad_norm": 23.158576456528266,
"learning_rate": 9.936469412073117e-06,
"loss": 0.4387,
"step": 1926
},
{
"epoch": 0.15877070116173683,
"grad_norm": 5.907761415620391,
"learning_rate": 9.936363355242668e-06,
"loss": 0.5724,
"step": 1927
},
{
"epoch": 0.15885309384526655,
"grad_norm": 4.421376470888521,
"learning_rate": 9.93625721052829e-06,
"loss": 0.6202,
"step": 1928
},
{
"epoch": 0.15893548652879624,
"grad_norm": 5.122603282277841,
"learning_rate": 9.936150977931869e-06,
"loss": 0.4856,
"step": 1929
},
{
"epoch": 0.15901787921232594,
"grad_norm": 5.697392483109681,
"learning_rate": 9.936044657455298e-06,
"loss": 0.7097,
"step": 1930
},
{
"epoch": 0.15910027189585565,
"grad_norm": 8.995800779777136,
"learning_rate": 9.93593824910047e-06,
"loss": 0.7263,
"step": 1931
},
{
"epoch": 0.15918266457938535,
"grad_norm": 3.015815073457665,
"learning_rate": 9.935831752869278e-06,
"loss": 0.4475,
"step": 1932
},
{
"epoch": 0.15926505726291507,
"grad_norm": 5.3090549745473075,
"learning_rate": 9.93572516876362e-06,
"loss": 0.5545,
"step": 1933
},
{
"epoch": 0.15934744994644476,
"grad_norm": 12.995406107077027,
"learning_rate": 9.935618496785396e-06,
"loss": 0.1749,
"step": 1934
},
{
"epoch": 0.15942984262997445,
"grad_norm": 4.825072935595757,
"learning_rate": 9.935511736936498e-06,
"loss": 0.4887,
"step": 1935
},
{
"epoch": 0.15951223531350417,
"grad_norm": 4.746919626773541,
"learning_rate": 9.935404889218831e-06,
"loss": 0.5112,
"step": 1936
},
{
"epoch": 0.15959462799703386,
"grad_norm": 6.689116278696351,
"learning_rate": 9.935297953634298e-06,
"loss": 0.7103,
"step": 1937
},
{
"epoch": 0.15967702068056355,
"grad_norm": 6.502278304742261,
"learning_rate": 9.935190930184802e-06,
"loss": 0.7145,
"step": 1938
},
{
"epoch": 0.15975941336409327,
"grad_norm": 5.451164256483429,
"learning_rate": 9.935083818872247e-06,
"loss": 0.4737,
"step": 1939
},
{
"epoch": 0.15984180604762296,
"grad_norm": 3.0051587922433067,
"learning_rate": 9.93497661969854e-06,
"loss": 0.2742,
"step": 1940
},
{
"epoch": 0.15992419873115268,
"grad_norm": 5.427591565725511,
"learning_rate": 9.934869332665592e-06,
"loss": 0.5207,
"step": 1941
},
{
"epoch": 0.16000659141468238,
"grad_norm": 5.4367848489833115,
"learning_rate": 9.934761957775312e-06,
"loss": 0.5983,
"step": 1942
},
{
"epoch": 0.16008898409821207,
"grad_norm": 7.154739577060193,
"learning_rate": 9.93465449502961e-06,
"loss": 0.6409,
"step": 1943
},
{
"epoch": 0.1601713767817418,
"grad_norm": 5.038406869203767,
"learning_rate": 9.934546944430402e-06,
"loss": 0.4853,
"step": 1944
},
{
"epoch": 0.16025376946527148,
"grad_norm": 5.650856613574145,
"learning_rate": 9.934439305979598e-06,
"loss": 0.6453,
"step": 1945
},
{
"epoch": 0.1603361621488012,
"grad_norm": 15.099928104720231,
"learning_rate": 9.934331579679119e-06,
"loss": 0.7712,
"step": 1946
},
{
"epoch": 0.1604185548323309,
"grad_norm": 5.331849451324917,
"learning_rate": 9.934223765530883e-06,
"loss": 0.5346,
"step": 1947
},
{
"epoch": 0.16050094751586058,
"grad_norm": 6.11974330303117,
"learning_rate": 9.934115863536806e-06,
"loss": 0.6636,
"step": 1948
},
{
"epoch": 0.1605833401993903,
"grad_norm": 5.259099308839429,
"learning_rate": 9.934007873698813e-06,
"loss": 0.4581,
"step": 1949
},
{
"epoch": 0.16066573288292,
"grad_norm": 4.763348066034409,
"learning_rate": 9.933899796018821e-06,
"loss": 0.7946,
"step": 1950
},
{
"epoch": 0.1607481255664497,
"grad_norm": 4.328616169486818,
"learning_rate": 9.933791630498761e-06,
"loss": 0.5788,
"step": 1951
},
{
"epoch": 0.1608305182499794,
"grad_norm": 5.067172957788821,
"learning_rate": 9.933683377140552e-06,
"loss": 0.6345,
"step": 1952
},
{
"epoch": 0.1609129109335091,
"grad_norm": 4.119522578831503,
"learning_rate": 9.933575035946128e-06,
"loss": 0.4333,
"step": 1953
},
{
"epoch": 0.16099530361703882,
"grad_norm": 4.532665725421111,
"learning_rate": 9.933466606917412e-06,
"loss": 0.4846,
"step": 1954
},
{
"epoch": 0.1610776963005685,
"grad_norm": 6.677989132375096,
"learning_rate": 9.933358090056337e-06,
"loss": 0.6531,
"step": 1955
},
{
"epoch": 0.1611600889840982,
"grad_norm": 5.737777079771404,
"learning_rate": 9.933249485364836e-06,
"loss": 0.551,
"step": 1956
},
{
"epoch": 0.16124248166762792,
"grad_norm": 5.303362303054575,
"learning_rate": 9.93314079284484e-06,
"loss": 0.5989,
"step": 1957
},
{
"epoch": 0.1613248743511576,
"grad_norm": 4.539301099456485,
"learning_rate": 9.933032012498287e-06,
"loss": 0.5385,
"step": 1958
},
{
"epoch": 0.16140726703468733,
"grad_norm": 7.619380498183769,
"learning_rate": 9.932923144327112e-06,
"loss": 0.5917,
"step": 1959
},
{
"epoch": 0.16148965971821702,
"grad_norm": 6.2711200766201225,
"learning_rate": 9.932814188333252e-06,
"loss": 0.6814,
"step": 1960
},
{
"epoch": 0.1615720524017467,
"grad_norm": 4.925280195591163,
"learning_rate": 9.932705144518648e-06,
"loss": 0.3257,
"step": 1961
},
{
"epoch": 0.16165444508527643,
"grad_norm": 4.900465005853736,
"learning_rate": 9.932596012885243e-06,
"loss": 0.6121,
"step": 1962
},
{
"epoch": 0.16173683776880612,
"grad_norm": 5.597180202998693,
"learning_rate": 9.932486793434976e-06,
"loss": 0.4417,
"step": 1963
},
{
"epoch": 0.16181923045233584,
"grad_norm": 3.786036286281869,
"learning_rate": 9.932377486169795e-06,
"loss": 0.4473,
"step": 1964
},
{
"epoch": 0.16190162313586554,
"grad_norm": 5.333467663504172,
"learning_rate": 9.932268091091647e-06,
"loss": 0.5273,
"step": 1965
},
{
"epoch": 0.16198401581939523,
"grad_norm": 3.114851131702418,
"learning_rate": 9.932158608202473e-06,
"loss": 0.2613,
"step": 1966
},
{
"epoch": 0.16206640850292495,
"grad_norm": 4.933213429654218,
"learning_rate": 9.932049037504228e-06,
"loss": 0.5545,
"step": 1967
},
{
"epoch": 0.16214880118645464,
"grad_norm": 5.732455463753953,
"learning_rate": 9.931939378998862e-06,
"loss": 0.5004,
"step": 1968
},
{
"epoch": 0.16223119386998436,
"grad_norm": 3.3159988830987936,
"learning_rate": 9.931829632688327e-06,
"loss": 0.2362,
"step": 1969
},
{
"epoch": 0.16231358655351405,
"grad_norm": 6.801154604066474,
"learning_rate": 9.931719798574577e-06,
"loss": 0.725,
"step": 1970
},
{
"epoch": 0.16239597923704374,
"grad_norm": 28.52359146325827,
"learning_rate": 9.931609876659567e-06,
"loss": 0.8996,
"step": 1971
},
{
"epoch": 0.16247837192057346,
"grad_norm": 7.673010563827642,
"learning_rate": 9.931499866945254e-06,
"loss": 0.6011,
"step": 1972
},
{
"epoch": 0.16256076460410315,
"grad_norm": 2.9164586276644355,
"learning_rate": 9.931389769433595e-06,
"loss": 0.257,
"step": 1973
},
{
"epoch": 0.16264315728763284,
"grad_norm": 3.4481637153537736,
"learning_rate": 9.931279584126552e-06,
"loss": 0.2657,
"step": 1974
},
{
"epoch": 0.16272554997116256,
"grad_norm": 5.503134114237322,
"learning_rate": 9.931169311026086e-06,
"loss": 0.478,
"step": 1975
},
{
"epoch": 0.16280794265469226,
"grad_norm": 7.31936675050305,
"learning_rate": 9.93105895013416e-06,
"loss": 0.7626,
"step": 1976
},
{
"epoch": 0.16289033533822198,
"grad_norm": 3.325018882378549,
"learning_rate": 9.930948501452739e-06,
"loss": 0.2196,
"step": 1977
},
{
"epoch": 0.16297272802175167,
"grad_norm": 4.015246823982774,
"learning_rate": 9.93083796498379e-06,
"loss": 0.3479,
"step": 1978
},
{
"epoch": 0.16305512070528136,
"grad_norm": 4.097019784111488,
"learning_rate": 9.930727340729283e-06,
"loss": 0.3443,
"step": 1979
},
{
"epoch": 0.16313751338881108,
"grad_norm": 4.548498573960237,
"learning_rate": 9.930616628691182e-06,
"loss": 0.4211,
"step": 1980
},
{
"epoch": 0.16321990607234077,
"grad_norm": 6.9065001345198045,
"learning_rate": 9.930505828871461e-06,
"loss": 0.6502,
"step": 1981
},
{
"epoch": 0.1633022987558705,
"grad_norm": 4.716154264543286,
"learning_rate": 9.930394941272094e-06,
"loss": 0.6323,
"step": 1982
},
{
"epoch": 0.16338469143940018,
"grad_norm": 5.198255317143385,
"learning_rate": 9.930283965895054e-06,
"loss": 0.5158,
"step": 1983
},
{
"epoch": 0.16346708412292987,
"grad_norm": 3.8635971124237267,
"learning_rate": 9.930172902742316e-06,
"loss": 0.393,
"step": 1984
},
{
"epoch": 0.1635494768064596,
"grad_norm": 4.73938319426752,
"learning_rate": 9.930061751815858e-06,
"loss": 0.53,
"step": 1985
},
{
"epoch": 0.16363186948998928,
"grad_norm": 5.053290616839663,
"learning_rate": 9.929950513117658e-06,
"loss": 0.6883,
"step": 1986
},
{
"epoch": 0.163714262173519,
"grad_norm": 4.256070865568776,
"learning_rate": 9.929839186649698e-06,
"loss": 0.4755,
"step": 1987
},
{
"epoch": 0.1637966548570487,
"grad_norm": 5.726661518698401,
"learning_rate": 9.929727772413959e-06,
"loss": 0.6225,
"step": 1988
},
{
"epoch": 0.1638790475405784,
"grad_norm": 6.136396113547857,
"learning_rate": 9.929616270412425e-06,
"loss": 0.5515,
"step": 1989
},
{
"epoch": 0.1639614402241081,
"grad_norm": 5.803352132593352,
"learning_rate": 9.92950468064708e-06,
"loss": 0.6151,
"step": 1990
},
{
"epoch": 0.1640438329076378,
"grad_norm": 5.500294699026293,
"learning_rate": 9.929393003119911e-06,
"loss": 0.645,
"step": 1991
},
{
"epoch": 0.1641262255911675,
"grad_norm": 5.126741967427504,
"learning_rate": 9.929281237832909e-06,
"loss": 0.4252,
"step": 1992
},
{
"epoch": 0.1642086182746972,
"grad_norm": 5.715611583303511,
"learning_rate": 9.92916938478806e-06,
"loss": 0.6065,
"step": 1993
},
{
"epoch": 0.1642910109582269,
"grad_norm": 4.5030944875150505,
"learning_rate": 9.929057443987356e-06,
"loss": 0.4517,
"step": 1994
},
{
"epoch": 0.16437340364175662,
"grad_norm": 4.036448430622849,
"learning_rate": 9.928945415432792e-06,
"loss": 0.3826,
"step": 1995
},
{
"epoch": 0.1644557963252863,
"grad_norm": 4.77432743749195,
"learning_rate": 9.92883329912636e-06,
"loss": 0.4729,
"step": 1996
},
{
"epoch": 0.164538189008816,
"grad_norm": 4.24213889175515,
"learning_rate": 9.92872109507006e-06,
"loss": 0.6217,
"step": 1997
},
{
"epoch": 0.16462058169234572,
"grad_norm": 6.054404874550711,
"learning_rate": 9.928608803265884e-06,
"loss": 0.5828,
"step": 1998
},
{
"epoch": 0.16470297437587542,
"grad_norm": 5.412374428435805,
"learning_rate": 9.928496423715835e-06,
"loss": 0.6126,
"step": 1999
},
{
"epoch": 0.16478536705940514,
"grad_norm": 6.090978296501382,
"learning_rate": 9.928383956421914e-06,
"loss": 0.6151,
"step": 2000
}
],
"logging_steps": 1.0,
"max_steps": 24274,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7648832073984.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}